diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 1f368e2afcbd..61773efbc29a 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -433,6 +433,12 @@ "FluxKontextAutoBlocks", "FluxKontextModularPipeline", "FluxModularPipeline", + "HeliosAutoBlocks", + "HeliosModularPipeline", + "HeliosPyramidAutoBlocks", + "HeliosPyramidDistilledAutoBlocks", + "HeliosPyramidDistilledModularPipeline", + "HeliosPyramidModularPipeline", "QwenImageAutoBlocks", "QwenImageEditAutoBlocks", "QwenImageEditModularPipeline", @@ -1186,6 +1192,12 @@ FluxKontextAutoBlocks, FluxKontextModularPipeline, FluxModularPipeline, + HeliosAutoBlocks, + HeliosModularPipeline, + HeliosPyramidAutoBlocks, + HeliosPyramidDistilledAutoBlocks, + HeliosPyramidDistilledModularPipeline, + HeliosPyramidModularPipeline, QwenImageAutoBlocks, QwenImageEditAutoBlocks, QwenImageEditModularPipeline, diff --git a/src/diffusers/modular_pipelines/__init__.py b/src/diffusers/modular_pipelines/__init__.py index c9bebd8644f7..fd9bd691ca87 100644 --- a/src/diffusers/modular_pipelines/__init__.py +++ b/src/diffusers/modular_pipelines/__init__.py @@ -56,6 +56,14 @@ "WanImage2VideoModularPipeline", "Wan22Image2VideoModularPipeline", ] + _import_structure["helios"] = [ + "HeliosAutoBlocks", + "HeliosModularPipeline", + "HeliosPyramidAutoBlocks", + "HeliosPyramidDistilledAutoBlocks", + "HeliosPyramidDistilledModularPipeline", + "HeliosPyramidModularPipeline", + ] _import_structure["flux"] = [ "FluxAutoBlocks", "FluxModularPipeline", @@ -103,6 +111,14 @@ Flux2KleinModularPipeline, Flux2ModularPipeline, ) + from .helios import ( + HeliosAutoBlocks, + HeliosModularPipeline, + HeliosPyramidAutoBlocks, + HeliosPyramidDistilledAutoBlocks, + HeliosPyramidDistilledModularPipeline, + HeliosPyramidModularPipeline, + ) from .modular_pipeline import ( AutoPipelineBlocks, BlockState, diff --git a/src/diffusers/modular_pipelines/helios/__init__.py b/src/diffusers/modular_pipelines/helios/__init__.py new 
file mode 100644 index 000000000000..26551399a3e8 --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/__init__.py @@ -0,0 +1,59 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modular_blocks_helios"] = ["HeliosAutoBlocks"] + _import_structure["modular_blocks_helios_pyramid"] = ["HeliosPyramidAutoBlocks"] + _import_structure["modular_blocks_helios_pyramid_distilled"] = ["HeliosPyramidDistilledAutoBlocks"] + _import_structure["modular_pipeline"] = [ + "HeliosModularPipeline", + "HeliosPyramidDistilledModularPipeline", + "HeliosPyramidModularPipeline", + ] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .modular_blocks_helios import HeliosAutoBlocks + from .modular_blocks_helios_pyramid import HeliosPyramidAutoBlocks + from .modular_blocks_helios_pyramid_distilled import HeliosPyramidDistilledAutoBlocks + from .modular_pipeline import ( + HeliosModularPipeline, + HeliosPyramidDistilledModularPipeline, + HeliosPyramidModularPipeline, + ) +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) 
diff --git a/src/diffusers/modular_pipelines/helios/before_denoise.py b/src/diffusers/modular_pipelines/helios/before_denoise.py new file mode 100644 index 000000000000..6d317fa737f4 --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/before_denoise.py @@ -0,0 +1,836 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch + +from ...models import HeliosTransformer3DModel +from ...schedulers import HeliosScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import HeliosModularPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift +def calculate_shift( + image_seq_len, + base_seq_len: int = 256, + max_seq_len: int = 4096, + base_shift: float = 0.5, + max_shift: float = 1.15, +): + m = (max_shift - base_shift) / (max_seq_len - base_seq_len) + b = base_shift - m * base_seq_len + mu = image_seq_len * m + b + return mu + + +class HeliosTextInputStep(ModularPipelineBlocks): + model_name = "helios" + + @property + def description(self) -> str: + return ( + "Input processing step that:\n" + " 1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n" + " 2. 
Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_videos_per_prompt`\n\n" + "All input tensors are expected to have either batch_size=1 or match the batch_size\n" + "of prompt_embeds. The tensors will be duplicated across the batch dimension to\n" + "have a final batch_size of batch_size * num_videos_per_prompt." + ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam( + "num_videos_per_prompt", + default=1, + type_hint=int, + description="Number of videos to generate per prompt.", + ), + InputParam.template("prompt_embeds"), + InputParam.template("negative_prompt_embeds"), + ] + + @property + def intermediate_outputs(self) -> list[str]: + return [ + OutputParam( + "batch_size", + type_hint=int, + description="Number of prompts, the final batch size of model inputs should be batch_size * num_videos_per_prompt", + ), + OutputParam( + "dtype", + type_hint=torch.dtype, + description="Data type of model tensor inputs (determined by `prompt_embeds.dtype`)", + ), + ] + + def check_inputs(self, components, block_state): + if block_state.prompt_embeds is not None and block_state.negative_prompt_embeds is not None: + if block_state.prompt_embeds.shape != block_state.negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {block_state.prompt_embeds.shape} != `negative_prompt_embeds`" + f" {block_state.negative_prompt_embeds.shape}." 
+ ) + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + self.check_inputs(components, block_state) + + block_state.batch_size = block_state.prompt_embeds.shape[0] + block_state.dtype = block_state.prompt_embeds.dtype + + _, seq_len, _ = block_state.prompt_embeds.shape + block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, block_state.num_videos_per_prompt, 1) + block_state.prompt_embeds = block_state.prompt_embeds.view( + block_state.batch_size * block_state.num_videos_per_prompt, seq_len, -1 + ) + + if block_state.negative_prompt_embeds is not None: + _, seq_len, _ = block_state.negative_prompt_embeds.shape + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat( + 1, block_state.num_videos_per_prompt, 1 + ) + block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view( + block_state.batch_size * block_state.num_videos_per_prompt, seq_len, -1 + ) + + self.set_block_state(state, block_state) + + return components, state + + +# Copied from diffusers.modular_pipelines.wan.before_denoise.repeat_tensor_to_batch_size +def repeat_tensor_to_batch_size( + input_name: str, + input_tensor: torch.Tensor, + batch_size: int, + num_videos_per_prompt: int = 1, +) -> torch.Tensor: + """Repeat tensor elements to match the final batch size. + + This function expands a tensor's batch dimension to match the final batch size (batch_size * num_videos_per_prompt) + by repeating each element along dimension 0. + + The input tensor must have batch size 1 or batch_size. The function will: + - If batch size is 1: repeat each element (batch_size * num_videos_per_prompt) times + - If batch size equals batch_size: repeat each element num_videos_per_prompt times + + Args: + input_name (str): Name of the input tensor (used for error messages) + input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size. 
+ batch_size (int): The base batch size (number of prompts) + num_videos_per_prompt (int, optional): Number of videos to generate per prompt. Defaults to 1. + + Returns: + torch.Tensor: The repeated tensor with final batch size (batch_size * num_videos_per_prompt) + + Raises: + ValueError: If input_tensor is not a torch.Tensor or has invalid batch size + + Examples: + tensor = torch.tensor([[1, 2, 3]]) # shape: [1, 3] repeated = repeat_tensor_to_batch_size("image", tensor, + batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape: + [4, 3] + + tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) # shape: [2, 3] repeated = repeat_tensor_to_batch_size("image", + tensor, batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]]) + - shape: [4, 3] + """ + # make sure input is a tensor + if not isinstance(input_tensor, torch.Tensor): + raise ValueError(f"`{input_name}` must be a tensor") + + # make sure input tensor e.g. image_latents has batch size 1 or batch_size same as prompts + if input_tensor.shape[0] == 1: + repeat_by = batch_size * num_videos_per_prompt + elif input_tensor.shape[0] == batch_size: + repeat_by = num_videos_per_prompt + else: + raise ValueError( + f"`{input_name}` must have have batch size 1 or {batch_size}, but got {input_tensor.shape[0]}" + ) + + # expand the tensor to match the batch_size * num_videos_per_prompt + input_tensor = input_tensor.repeat_interleave(repeat_by, dim=0) + + return input_tensor + + +# Copied from diffusers.modular_pipelines.wan.before_denoise.calculate_dimension_from_latents +def calculate_dimension_from_latents( + latents: torch.Tensor, vae_scale_factor_temporal: int, vae_scale_factor_spatial: int +) -> tuple[int, int]: + """Calculate image dimensions from latent tensor dimensions. 
+ + This function converts latent temporal and spatial dimensions to image temporal and spatial dimensions by + multiplying the latent num_frames/height/width by the VAE scale factor. + + Args: + latents (torch.Tensor): The latent tensor. Must have 4 or 5 dimensions. + Expected shapes: [batch, channels, height, width] or [batch, channels, frames, height, width] + vae_scale_factor_temporal (int): The scale factor used by the VAE to compress temporal dimension. + Typically 4 for most VAEs (video is 4x larger than latents in temporal dimension) + vae_scale_factor_spatial (int): The scale factor used by the VAE to compress spatial dimension. + Typically 8 for most VAEs (image is 8x larger than latents in each dimension) + + Returns: + tuple[int, int]: The calculated image dimensions as (height, width) + + Raises: + ValueError: If latents tensor doesn't have 4 or 5 dimensions + + """ + if latents.ndim != 5: + raise ValueError(f"latents must have 5 dimensions, but got {latents.ndim}") + + _, _, num_latent_frames, latent_height, latent_width = latents.shape + + num_frames = (num_latent_frames - 1) * vae_scale_factor_temporal + 1 + height = latent_height * vae_scale_factor_spatial + width = latent_width * vae_scale_factor_spatial + + return num_frames, height, width + + +class HeliosAdditionalInputsStep(ModularPipelineBlocks): + """Configurable step that standardizes inputs for the denoising step. + + This step handles: + 1. For encoded image latents: Computes height/width from latents and expands batch size + 2. 
For additional_batch_inputs: Expands batch dimensions to match final batch size + """ + + model_name = "helios" + + def __init__( + self, + image_latent_inputs: list[InputParam] | None = None, + additional_batch_inputs: list[InputParam] | None = None, + ): + if image_latent_inputs is None: + image_latent_inputs = [InputParam.template("image_latents")] + if additional_batch_inputs is None: + additional_batch_inputs = [] + + if not isinstance(image_latent_inputs, list): + raise ValueError(f"image_latent_inputs must be a list, but got {type(image_latent_inputs)}") + else: + for input_param in image_latent_inputs: + if not isinstance(input_param, InputParam): + raise ValueError(f"image_latent_inputs must be a list of InputParam, but got {type(input_param)}") + + if not isinstance(additional_batch_inputs, list): + raise ValueError(f"additional_batch_inputs must be a list, but got {type(additional_batch_inputs)}") + else: + for input_param in additional_batch_inputs: + if not isinstance(input_param, InputParam): + raise ValueError( + f"additional_batch_inputs must be a list of InputParam, but got {type(input_param)}" + ) + + self._image_latent_inputs = image_latent_inputs + self._additional_batch_inputs = additional_batch_inputs + super().__init__() + + @property + def description(self) -> str: + summary_section = ( + "Input processing step that:\n" + " 1. For image latent inputs: Computes height/width from latents and expands batch size\n" + " 2. 
For additional batch inputs: Expands batch dimensions to match final batch size" + ) + + inputs_info = "" + if self._image_latent_inputs or self._additional_batch_inputs: + inputs_info = "\n\nConfigured inputs:" + if self._image_latent_inputs: + inputs_info += f"\n - Image latent inputs: {[p.name for p in self._image_latent_inputs]}" + if self._additional_batch_inputs: + inputs_info += f"\n - Additional batch inputs: {[p.name for p in self._additional_batch_inputs]}" + + placement_section = "\n\nThis block should be placed after the encoder steps and the text input step." + + return summary_section + inputs_info + placement_section + + @property + def inputs(self) -> list[InputParam]: + inputs = [ + InputParam(name="num_videos_per_prompt", default=1), + InputParam(name="batch_size", required=True), + ] + inputs += self._image_latent_inputs + self._additional_batch_inputs + + return inputs + + @property + def intermediate_outputs(self) -> list[OutputParam]: + outputs = [ + OutputParam("height", type_hint=int), + OutputParam("width", type_hint=int), + ] + + for input_param in self._image_latent_inputs: + outputs.append(OutputParam(input_param.name, type_hint=torch.Tensor)) + + for input_param in self._additional_batch_inputs: + outputs.append(OutputParam(input_param.name, type_hint=torch.Tensor)) + + return outputs + + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + for input_param in self._image_latent_inputs: + image_latent_tensor = getattr(block_state, input_param.name) + if image_latent_tensor is None: + continue + + # Calculate height/width from latents + _, height, width = calculate_dimension_from_latents( + image_latent_tensor, components.vae_scale_factor_temporal, components.vae_scale_factor_spatial + ) + block_state.height = height + block_state.width = width + + # Expand batch size + image_latent_tensor = repeat_tensor_to_batch_size( + input_name=input_param.name, 
+ input_tensor=image_latent_tensor, + num_videos_per_prompt=block_state.num_videos_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, input_param.name, image_latent_tensor) + + for input_param in self._additional_batch_inputs: + input_tensor = getattr(block_state, input_param.name) + if input_tensor is None: + continue + + input_tensor = repeat_tensor_to_batch_size( + input_name=input_param.name, + input_tensor=input_tensor, + num_videos_per_prompt=block_state.num_videos_per_prompt, + batch_size=block_state.batch_size, + ) + + setattr(block_state, input_param.name, input_tensor) + + self.set_block_state(state, block_state) + return components, state + + +class HeliosAddNoiseToImageLatentsStep(ModularPipelineBlocks): + """Adds noise to image_latents and fake_image_latents for I2V conditioning. + + Applies single-sigma noise to image_latents (using image_noise_sigma range) and single-sigma noise to + fake_image_latents (using video_noise_sigma range). + """ + + model_name = "helios" + + @property + def description(self) -> str: + return ( + "Adds noise to image_latents and fake_image_latents for I2V conditioning. " + "Uses random sigma from configured ranges for each." 
+ ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image_latents"), + InputParam( + "fake_image_latents", + required=True, + type_hint=torch.Tensor, + description="Fake image latents used as history seed for I2V generation.", + ), + InputParam( + "image_noise_sigma_min", + default=0.111, + type_hint=float, + description="Minimum sigma for image latent noise.", + ), + InputParam( + "image_noise_sigma_max", + default=0.135, + type_hint=float, + description="Maximum sigma for image latent noise.", + ), + InputParam( + "video_noise_sigma_min", + default=0.111, + type_hint=float, + description="Minimum sigma for video/fake-image latent noise.", + ), + InputParam( + "video_noise_sigma_max", + default=0.135, + type_hint=float, + description="Maximum sigma for video/fake-image latent noise.", + ), + InputParam.template("generator"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("image_latents"), + OutputParam("fake_image_latents", type_hint=torch.Tensor, description="Noisy fake image latents"), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + image_latents = block_state.image_latents + fake_image_latents = block_state.fake_image_latents + + # Add noise to image_latents + image_noise_sigma = ( + torch.rand(1, device=device, generator=block_state.generator) + * (block_state.image_noise_sigma_max - block_state.image_noise_sigma_min) + + block_state.image_noise_sigma_min + ) + image_latents = ( + image_noise_sigma * randn_tensor(image_latents.shape, generator=block_state.generator, device=device) + + (1 - image_noise_sigma) * image_latents + ) + + # Add noise to fake_image_latents + fake_image_noise_sigma = ( + torch.rand(1, device=device, generator=block_state.generator) + * (block_state.video_noise_sigma_max - 
block_state.video_noise_sigma_min) + + block_state.video_noise_sigma_min + ) + fake_image_latents = ( + fake_image_noise_sigma + * randn_tensor(fake_image_latents.shape, generator=block_state.generator, device=device) + + (1 - fake_image_noise_sigma) * fake_image_latents + ) + + block_state.image_latents = image_latents.to(device=device, dtype=torch.float32) + block_state.fake_image_latents = fake_image_latents.to(device=device, dtype=torch.float32) + + self.set_block_state(state, block_state) + return components, state + + +class HeliosAddNoiseToVideoLatentsStep(ModularPipelineBlocks): + """Adds noise to image_latents and video_latents for V2V conditioning. + + Applies single-sigma noise to image_latents (using image_noise_sigma range) and per-frame noise to video_latents in + chunks (using video_noise_sigma range). + """ + + model_name = "helios" + + @property + def description(self) -> str: + return ( + "Adds noise to image_latents and video_latents for V2V conditioning. " + "Uses single-sigma noise for image_latents and per-frame noise for video chunks." 
+ ) + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image_latents"), + InputParam( + "video_latents", + required=True, + type_hint=torch.Tensor, + description="Encoded video latents for V2V generation.", + ), + InputParam( + "num_latent_frames_per_chunk", + default=9, + type_hint=int, + description="Number of latent frames per temporal chunk.", + ), + InputParam( + "image_noise_sigma_min", + default=0.111, + type_hint=float, + description="Minimum sigma for image latent noise.", + ), + InputParam( + "image_noise_sigma_max", + default=0.135, + type_hint=float, + description="Maximum sigma for image latent noise.", + ), + InputParam( + "video_noise_sigma_min", + default=0.111, + type_hint=float, + description="Minimum sigma for video latent noise.", + ), + InputParam( + "video_noise_sigma_max", + default=0.135, + type_hint=float, + description="Maximum sigma for video latent noise.", + ), + InputParam.template("generator"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("image_latents"), + OutputParam("video_latents", type_hint=torch.Tensor, description="Noisy video latents"), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + device = components._execution_device + image_latents = block_state.image_latents + video_latents = block_state.video_latents + num_latent_frames_per_chunk = block_state.num_latent_frames_per_chunk + + # Add noise to first frame (single sigma) + image_noise_sigma = ( + torch.rand(1, device=device, generator=block_state.generator) + * (block_state.image_noise_sigma_max - block_state.image_noise_sigma_min) + + block_state.image_noise_sigma_min + ) + image_latents = ( + image_noise_sigma * randn_tensor(image_latents.shape, generator=block_state.generator, device=device) + + (1 - image_noise_sigma) * image_latents + ) + + # Add 
per-frame noise to video chunks + noisy_latents_chunks = [] + num_latent_chunks = video_latents.shape[2] // num_latent_frames_per_chunk + for i in range(num_latent_chunks): + chunk_start = i * num_latent_frames_per_chunk + chunk_end = chunk_start + num_latent_frames_per_chunk + latent_chunk = video_latents[:, :, chunk_start:chunk_end, :, :] + + chunk_frames = latent_chunk.shape[2] + frame_sigmas = ( + torch.rand(chunk_frames, device=device, generator=block_state.generator) + * (block_state.video_noise_sigma_max - block_state.video_noise_sigma_min) + + block_state.video_noise_sigma_min + ) + frame_sigmas = frame_sigmas.view(1, 1, chunk_frames, 1, 1) + + noisy_chunk = ( + frame_sigmas * randn_tensor(latent_chunk.shape, generator=block_state.generator, device=device) + + (1 - frame_sigmas) * latent_chunk + ) + noisy_latents_chunks.append(noisy_chunk) + video_latents = torch.cat(noisy_latents_chunks, dim=2) + + block_state.image_latents = image_latents.to(device=device, dtype=torch.float32) + block_state.video_latents = video_latents.to(device=device, dtype=torch.float32) + + self.set_block_state(state, block_state) + return components, state + + +class HeliosPrepareHistoryStep(ModularPipelineBlocks): + """Prepares chunk/history indices and initializes history state for the chunk loop.""" + + model_name = "helios" + + @property + def description(self) -> str: + return ( + "Prepares the chunk loop by computing latent dimensions, number of chunks, " + "history indices, and initializing history state (history_latents, image_latents, latent_chunks)." + ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("transformer", HeliosTransformer3DModel), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("height", default=384), + InputParam.template("width", default=640), + InputParam( + "num_frames", default=132, type_hint=int, description="Total number of video frames to generate." 
+ ), + InputParam("batch_size", required=True, type_hint=int), + InputParam( + "num_latent_frames_per_chunk", + default=9, + type_hint=int, + description="Number of latent frames per temporal chunk.", + ), + InputParam( + "history_sizes", + default=[16, 2, 1], + type_hint=list, + description="Sizes of long/mid/short history buffers for temporal context.", + ), + InputParam( + "keep_first_frame", + default=True, + type_hint=bool, + description="Whether to keep the first frame as a prefix in history.", + ), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("num_latent_chunk", type_hint=int, description="Number of temporal chunks"), + OutputParam("latent_shape", type_hint=tuple, description="Shape of latent tensor per chunk"), + OutputParam("history_sizes", type_hint=list, description="Adjusted history sizes (sorted, descending)"), + OutputParam("indices_hidden_states", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("indices_latents_history_short", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("indices_latents_history_mid", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("indices_latents_history_long", type_hint=torch.Tensor, kwargs_type="denoiser_input_fields"), + OutputParam("history_latents", type_hint=torch.Tensor, description="Initialized zero history latents"), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + batch_size = block_state.batch_size + device = components._execution_device + + block_state.num_frames = max(block_state.num_frames, 1) + history_sizes = sorted(block_state.history_sizes, reverse=True) + + num_channels_latents = components.num_channels_latents + h_latent = block_state.height // components.vae_scale_factor_spatial + w_latent = block_state.width // components.vae_scale_factor_spatial + + # 
Compute number of chunks + block_state.window_num_frames = ( + block_state.num_latent_frames_per_chunk - 1 + ) * components.vae_scale_factor_temporal + 1 + block_state.num_latent_chunk = max( + 1, (block_state.num_frames + block_state.window_num_frames - 1) // block_state.window_num_frames + ) + + # Modify history_sizes for non-keep_first_frame (matching pipeline behavior) + if not block_state.keep_first_frame: + history_sizes = history_sizes.copy() + history_sizes[-1] = history_sizes[-1] + 1 + + # Compute indices ONCE (same structure for all chunks) + if block_state.keep_first_frame: + indices = torch.arange(0, sum([1, *history_sizes, block_state.num_latent_frames_per_chunk])) + ( + indices_prefix, + indices_latents_history_long, + indices_latents_history_mid, + indices_latents_history_1x, + indices_hidden_states, + ) = indices.split([1, *history_sizes, block_state.num_latent_frames_per_chunk], dim=0) + indices_latents_history_short = torch.cat([indices_prefix, indices_latents_history_1x], dim=0) + else: + indices = torch.arange(0, sum([*history_sizes, block_state.num_latent_frames_per_chunk])) + ( + indices_latents_history_long, + indices_latents_history_mid, + indices_latents_history_short, + indices_hidden_states, + ) = indices.split([*history_sizes, block_state.num_latent_frames_per_chunk], dim=0) + + # Latent shape per chunk + block_state.latent_shape = ( + batch_size, + num_channels_latents, + block_state.num_latent_frames_per_chunk, + h_latent, + w_latent, + ) + + # Set outputs + block_state.history_sizes = history_sizes + block_state.indices_hidden_states = indices_hidden_states.unsqueeze(0) + block_state.indices_latents_history_short = indices_latents_history_short.unsqueeze(0) + block_state.indices_latents_history_mid = indices_latents_history_mid.unsqueeze(0) + block_state.indices_latents_history_long = indices_latents_history_long.unsqueeze(0) + block_state.history_latents = torch.zeros( + batch_size, + num_channels_latents, + sum(history_sizes), + 
h_latent, + w_latent, + device=device, + dtype=torch.float32, + ) + + self.set_block_state(state, block_state) + + return components, state + + +class HeliosI2VSeedHistoryStep(ModularPipelineBlocks): + """Seeds history_latents with fake_image_latents for I2V pipelines. + + This small additive step runs after HeliosPrepareHistoryStep and appends fake_image_latents to the initialized + history_latents tensor. + """ + + model_name = "helios" + + @property + def description(self) -> str: + return "I2V history seeding: appends fake_image_latents to history_latents." + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("history_latents", required=True, type_hint=torch.Tensor), + InputParam("fake_image_latents", required=True, type_hint=torch.Tensor), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "history_latents", type_hint=torch.Tensor, description="History latents seeded with fake_image_latents" + ), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + block_state.history_latents = torch.cat([block_state.history_latents, block_state.fake_image_latents], dim=2) + + self.set_block_state(state, block_state) + return components, state + + +class HeliosV2VSeedHistoryStep(ModularPipelineBlocks): + """Seeds history_latents with video_latents for V2V pipelines. + + This step runs after HeliosPrepareHistoryStep and replaces the tail of history_latents with video_latents. If the + video has fewer frames than the history, the beginning of history is preserved. + """ + + model_name = "helios" + + @property + def description(self) -> str: + return "V2V history seeding: replaces the tail of history_latents with video_latents." 
+ + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("history_latents", required=True, type_hint=torch.Tensor), + InputParam("video_latents", required=True, type_hint=torch.Tensor), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam( + "history_latents", type_hint=torch.Tensor, description="History latents seeded with video_latents" + ), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + history_latents = block_state.history_latents + video_latents = block_state.video_latents + + history_frames = history_latents.shape[2] + video_frames = video_latents.shape[2] + if video_frames < history_frames: + keep_frames = history_frames - video_frames + history_latents = torch.cat([history_latents[:, :, :keep_frames, :, :], video_latents], dim=2) + else: + history_latents = video_latents + + block_state.history_latents = history_latents + + self.set_block_state(state, block_state) + return components, state + + +class HeliosSetTimestepsStep(ModularPipelineBlocks): + """Computes scheduler parameters (mu, sigmas) for the chunk loop.""" + + model_name = "helios" + + @property + def description(self) -> str: + return "Computes scheduler shift parameter (mu) and default sigmas for the Helios chunk loop." 
+ + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("transformer", HeliosTransformer3DModel), + ComponentSpec("scheduler", HeliosScheduler), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("latent_shape", required=True, type_hint=tuple), + InputParam.template("num_inference_steps"), + InputParam.template("sigmas"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam("mu", type_hint=float, description="Scheduler shift parameter"), + OutputParam("sigmas", type_hint=list, description="Sigma schedule for diffusion"), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + patch_size = components.transformer.config.patch_size + latent_shape = block_state.latent_shape + image_seq_len = (latent_shape[-1] * latent_shape[-2] * latent_shape[-3]) // ( + patch_size[0] * patch_size[1] * patch_size[2] + ) + + if block_state.sigmas is None: + block_state.sigmas = np.linspace(0.999, 0.0, block_state.num_inference_steps + 1)[:-1] + + block_state.mu = calculate_shift( + image_seq_len, + components.scheduler.config.get("base_image_seq_len", 256), + components.scheduler.config.get("max_image_seq_len", 4096), + components.scheduler.config.get("base_shift", 0.5), + components.scheduler.config.get("max_shift", 1.15), + ) + + self.set_block_state(state, block_state) + + return components, state diff --git a/src/diffusers/modular_pipelines/helios/decoders.py b/src/diffusers/modular_pipelines/helios/decoders.py new file mode 100644 index 000000000000..f08ddedfd15a --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/decoders.py @@ -0,0 +1,110 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class HeliosDecodeStep(ModularPipelineBlocks):
    """Decode all chunk latents with VAE, trim frames, and postprocess into final video output."""

    model_name = "helios"

    @property
    def description(self) -> str:
        return (
            "Decodes all chunk latents with the VAE, concatenates them, "
            "trims to the target frame count, and postprocesses into the final video output."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKLWan),
            ComponentSpec(
                "video_processor",
                VideoProcessor,
                config=FrozenDict({"vae_scale_factor": 8}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        # NOTE(review): num_frames is declared required but __call__ trims via
        # vae_scale_factor_temporal instead of reading it — confirm it is still needed.
        return [
            InputParam(
                "latent_chunks", required=True, type_hint=list, description="List of per-chunk denoised latent tensors"
            ),
            InputParam("num_frames", required=True, type_hint=int, description="The target number of output frames"),
            InputParam.template("output_type", default="np"),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                "videos",
                type_hint=list[list[PIL.Image.Image]] | list[torch.Tensor] | list[np.ndarray],
                description="The generated videos, can be a PIL.Image.Image, torch.Tensor or a numpy array",
            ),
        ]

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)
        vae = components.vae

        # Per-channel de-normalization statistics broadcast over (B, C, T, H, W).
        # latents_std holds the *reciprocal* std, so dividing by it multiplies by std.
        stat_shape = (1, vae.config.z_dim, 1, 1, 1)
        latents_mean = torch.tensor(vae.config.latents_mean).view(stat_shape).to(vae.device, vae.dtype)
        latents_std = 1.0 / torch.tensor(vae.config.latents_std).view(stat_shape).to(vae.device, vae.dtype)

        # Decode each chunk independently, then stitch along the temporal axis.
        decoded_chunks = []
        for chunk_latents in block_state.latent_chunks:
            denormalized = chunk_latents.to(vae.dtype) / latents_std + latents_mean
            decoded_chunks.append(vae.decode(denormalized, return_dict=False)[0])
        history_video = torch.cat(decoded_chunks, dim=2)

        # Snap the decoded length down to a whole number of temporal VAE strides plus the leading frame.
        total_frames = history_video.size(2)
        stride = components.vae_scale_factor_temporal
        keep_frames = (total_frames - 1) // stride * stride + 1
        history_video = history_video[:, :, :keep_frames]

        block_state.videos = components.video_processor.postprocess_video(
            history_video, output_type=block_state.output_type
        )

        self.set_block_state(state, block_state)

        return components, state
def sample_block_noise(
    batch_size,
    channel,
    num_frames,
    height,
    width,
    gamma,
    patch_size=(1, 2, 2),
    device=None,
    generator=None,
):
    """Generate spatially-correlated block noise for pyramid upsampling correction.

    Uses a multivariate normal distribution with covariance based on `gamma` to produce noise with block structure,
    matching the upsampling artifacts that need correction.
    """
    # NOTE: A generator must be provided to ensure correct and reproducible results.
    # Creating a default generator here is a fallback only — without a fixed seed,
    # the output will be non-deterministic and may produce incorrect results in CP context.
    if isinstance(generator, list):
        generator = generator[0]
    elif generator is None:
        generator = torch.Generator(device=device)

    _, patch_h, patch_w = patch_size
    block_size = patch_h * patch_w
    blocks_h = height // patch_h
    blocks_w = width // patch_w

    # Covariance: unit variance on the diagonal, -gamma correlation off-diagonal,
    # plus a tiny diagonal jitter to keep the matrix positive-definite.
    eye = torch.eye(block_size, device=device)
    cov = eye * (1 + gamma) - torch.ones(block_size, block_size, device=device) * gamma
    cov = (cov + eye * 1e-8).float()  # Upcast to fp32 for numerical stability — cholesky is unreliable in fp16/bf16.

    chol = torch.linalg.cholesky(cov)
    num_blocks = batch_size * channel * num_frames * blocks_h * blocks_w

    # Draw white noise per block, then correlate within each block via the Cholesky factor.
    white = torch.randn(num_blocks, block_size, device=generator.device, generator=generator).to(device)
    correlated = white @ chol.T

    # Re-assemble blocks into the full (B, C, T, H, W) layout.
    correlated = correlated.view(batch_size, channel, num_frames, blocks_h, blocks_w, patch_h, patch_w)
    return correlated.permute(0, 1, 2, 3, 5, 4, 6).reshape(batch_size, channel, num_frames, height, width)
class HeliosChunkHistorySliceStep(ModularPipelineBlocks):
    """Slices history latents into short/mid/long for a T2V chunk.

    At k==0 with no image_latents, creates a zero prefix. Otherwise uses image_latents (either provided or captured
    from first chunk by HeliosChunkUpdateStep).
    """

    model_name = "helios"

    @property
    def description(self) -> str:
        return (
            "T2V history slice: splits history into long/mid/short. At k==0 with no image_latents, "
            "creates a zero prefix; otherwise uses image_latents as prefix for short history."
        )

    @property
    def inputs(self) -> list[InputParam]:
        # NOTE(review): __call__ also reads block_state.image_latents, which is not declared
        # here; HeliosChunkLoopWrapper defaults it to None — confirm the omission is intended.
        return [
            InputParam(
                "keep_first_frame",
                default=True,
                type_hint=bool,
                description="Whether to keep the first frame as a prefix in history.",
            ),
            InputParam(
                "history_sizes",
                required=True,
                type_hint=list,
                description="Sizes of long/mid/short history buffers for temporal context.",
            ),
            InputParam(
                "history_latents",
                required=True,
                type_hint=torch.Tensor,
                description="Accumulated history latents from previous chunks.",
            ),
            InputParam("latent_shape", required=True, type_hint=tuple),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        # The history slices set below are loop-internal state, not declared outputs.
        return []

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        keep_first_frame = block_state.keep_first_frame
        history_sizes = block_state.history_sizes
        image_latents = block_state.image_latents
        device = components._execution_device

        batch_size, num_channels_latents, _, h_latent, w_latent = block_state.latent_shape

        if keep_first_frame:
            # Take the trailing sum(history_sizes) frames and split them into the three
            # buffers; the short buffer gets a one-frame prefix prepended below.
            latents_history_long, latents_history_mid, latents_history_1x = block_state.history_latents[
                :, :, -sum(history_sizes) :
            ].split(history_sizes, dim=2)
            if image_latents is None and k == 0:
                # First chunk with no conditioning image: use an all-zero frame as the prefix.
                latents_prefix = torch.zeros(
                    batch_size,
                    num_channels_latents,
                    1,
                    h_latent,
                    w_latent,
                    device=device,
                    dtype=torch.float32,
                )
            else:
                latents_prefix = image_latents
            latents_history_short = torch.cat([latents_prefix, latents_history_1x], dim=2)
        else:
            latents_history_long, latents_history_mid, latents_history_short = block_state.history_latents[
                :, :, -sum(history_sizes) :
            ].split(history_sizes, dim=2)

        block_state.latents_history_short = latents_history_short
        block_state.latents_history_mid = latents_history_mid
        block_state.latents_history_long = latents_history_long

        return components, block_state


class HeliosI2VChunkHistorySliceStep(ModularPipelineBlocks):
    """Slices history latents into short/mid/long for an I2V chunk.

    Always uses image_latents as prefix (assumes history pre-seeded with fake_image_latents).
    """

    model_name = "helios"

    @property
    def description(self) -> str:
        return (
            "I2V history slice: splits pre-seeded history into long/mid/short, "
            "always using image_latents as prefix for short history."
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam(
                "keep_first_frame",
                default=True,
                type_hint=bool,
                description="Whether to keep the first frame as a prefix in history.",
            ),
            InputParam(
                "history_sizes",
                required=True,
                type_hint=list,
                description="Sizes of long/mid/short history buffers for temporal context.",
            ),
            InputParam(
                "history_latents",
                required=True,
                type_hint=torch.Tensor,
                description="Accumulated history latents from previous chunks.",
            ),
            InputParam(
                "image_latents",
                required=True,
                type_hint=torch.Tensor,
                description="First-frame latents used as prefix for short history.",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        # The history slices set below are loop-internal state, not declared outputs.
        return []

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        keep_first_frame = block_state.keep_first_frame
        history_sizes = block_state.history_sizes
        image_latents = block_state.image_latents

        if keep_first_frame:
            # Unlike the T2V variant, the conditioning image is always available,
            # so it is always used as the one-frame short-history prefix.
            latents_history_long, latents_history_mid, latents_history_1x = block_state.history_latents[
                :, :, -sum(history_sizes) :
            ].split(history_sizes, dim=2)
            latents_history_short = torch.cat([image_latents, latents_history_1x], dim=2)
        else:
            latents_history_long, latents_history_mid, latents_history_short = block_state.history_latents[
                :, :, -sum(history_sizes) :
            ].split(history_sizes, dim=2)

        block_state.latents_history_short = latents_history_short
        block_state.latents_history_mid = latents_history_mid
        block_state.latents_history_long = latents_history_long

        return components, block_state


class HeliosChunkNoiseGenStep(ModularPipelineBlocks):
    """Generates noise latents for a chunk using randn_tensor."""

    model_name = "helios"

    @property
    def description(self) -> str:
        return "Generates random noise latents at full resolution for a single chunk."

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latent_shape", required=True, type_hint=tuple),
            InputParam.template("generator"),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        device = components._execution_device
        # Fresh fp32 Gaussian noise each chunk; the cast to the transformer dtype
        # happens later inside the denoise inner block.
        block_state.latents = randn_tensor(
            block_state.latent_shape, generator=block_state.generator, device=device, dtype=torch.float32
        )
        return components, block_state
class HeliosPyramidChunkNoiseGenStep(ModularPipelineBlocks):
    """Generates noise latents and downsamples to smallest pyramid level."""

    model_name = "helios-pyramid"

    @property
    def description(self) -> str:
        return (
            "Generates random noise at full resolution, then downsamples to the smallest "
            "pyramid level via bilinear interpolation."
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latent_shape", required=True, type_hint=tuple),
            InputParam(
                "pyramid_num_inference_steps_list",
                default=[10, 10, 10],
                type_hint=list,
                description="Number of denoising steps per pyramid stage.",
            ),
            InputParam.template("generator"),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        device = components._execution_device
        b, c, t, full_h, full_w = block_state.latent_shape

        full_res_noise = randn_tensor(
            block_state.latent_shape, generator=block_state.generator, device=device, dtype=torch.float32
        )

        # Fold time into the batch dim so 2D bilinear interpolation runs per frame.
        frames_2d = full_res_noise.permute(0, 2, 1, 3, 4).reshape(b * t, c, full_h, full_w)
        h, w = full_h, full_w
        # One 2x downsample per pyramid level below full resolution.
        num_downsamples = len(block_state.pyramid_num_inference_steps_list) - 1
        for _ in range(num_downsamples):
            h, w = h // 2, w // 2
            # The *2 roughly compensates the variance reduction from bilinear averaging.
            frames_2d = F.interpolate(frames_2d, size=(h, w), mode="bilinear") * 2
        block_state.latents = frames_2d.reshape(b, t, c, h, w).permute(0, 2, 1, 3, 4)

        return components, block_state


class HeliosChunkSchedulerResetStep(ModularPipelineBlocks):
    """Resets the scheduler with timesteps for a single chunk."""

    model_name = "helios"

    @property
    def description(self) -> str:
        return "Resets the scheduler with the correct timesteps and shift parameter (mu) for this chunk."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("scheduler", HeliosScheduler),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("mu", required=True, type_hint=float),
            InputParam.template("sigmas", required=True),
            InputParam.template("num_inference_steps"),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        # Every chunk restarts denoising from pure noise, so the schedule is rebuilt each iteration.
        scheduler = components.scheduler
        scheduler.set_timesteps(
            block_state.num_inference_steps,
            device=components._execution_device,
            sigmas=block_state.sigmas,
            mu=block_state.mu,
        )
        block_state.timesteps = scheduler.timesteps

        return components, block_state
class HeliosChunkDenoiseInner(ModularPipelineBlocks):
    """Inner timestep loop for denoising a single chunk, using guider for guidance."""

    model_name = "helios"

    @property
    def description(self) -> str:
        return (
            "Inner denoising loop that iterates over timesteps for a single chunk. "
            "Uses the guider to manage conditional/unconditional forward passes with cache_context, "
            "applies guidance, and runs scheduler step."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("transformer", HeliosTransformer3DModel),
            ComponentSpec("scheduler", HeliosScheduler),
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 5.0}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("latents"),
            InputParam.template("timesteps"),
            InputParam("prompt_embeds", type_hint=torch.Tensor),
            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
            InputParam.template("denoiser_input_fields"),
            InputParam.template("num_inference_steps"),
            InputParam.template("attention_kwargs"),
            InputParam.template("generator"),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        latents = block_state.latents
        timesteps = block_state.timesteps
        num_inference_steps = block_state.num_inference_steps

        transformer_dtype = components.transformer.dtype
        # Warmup steps only affect progress-bar cadence below, not the math.
        num_warmup_steps = len(timesteps) - num_inference_steps * components.scheduler.order

        # Guider inputs: only encoder_hidden_states differs between cond/uncond
        guider_inputs = {
            "encoder_hidden_states": (block_state.prompt_embeds, block_state.negative_prompt_embeds),
        }

        # Build shared kwargs from denoiser_input_fields (excludes guider-managed ones);
        # only fields the transformer's forward() actually accepts are forwarded.
        transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys())
        shared_kwargs = {}
        for field_name, field_value in block_state.denoiser_input_fields.items():
            if field_name in transformer_args and field_name not in guider_inputs:
                shared_kwargs[field_name] = field_value

        # Add loop-internal history latents with dtype casting
        shared_kwargs["latents_history_short"] = block_state.latents_history_short.to(transformer_dtype)
        shared_kwargs["latents_history_mid"] = block_state.latents_history_mid.to(transformer_dtype)
        shared_kwargs["latents_history_long"] = block_state.latents_history_long.to(transformer_dtype)
        shared_kwargs["attention_kwargs"] = block_state.attention_kwargs

        with tqdm(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                timestep = t.expand(latents.shape[0]).to(torch.int64)
                latent_model_input = latents.to(transformer_dtype)

                components.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
                guider_state = components.guider.prepare_inputs(guider_inputs)

                # One forward pass per guidance branch (cond/uncond).
                for guider_state_batch in guider_state:
                    components.guider.prepare_models(components.transformer)
                    # NOTE(review): the comprehension variable `k` shadows the chunk-index
                    # parameter `k` in name only (comprehensions have their own scope, so
                    # the outer `k` is untouched); the pyramid sibling uses `kk` — consider
                    # renaming for consistency.
                    cond_kwargs = {k: getattr(guider_state_batch, k) for k in guider_inputs.keys()}

                    context_name = getattr(guider_state_batch, components.guider._identifier_key)
                    with components.transformer.cache_context(context_name):
                        guider_state_batch.noise_pred = components.transformer(
                            hidden_states=latent_model_input,
                            timestep=timestep,
                            return_dict=False,
                            **cond_kwargs,
                            **shared_kwargs,
                        )[0]
                    components.guider.cleanup_models(components.transformer)

                # Combine branch predictions into the guided noise prediction.
                noise_pred = components.guider(guider_state)[0]

                # Scheduler step
                latents = components.scheduler.step(
                    noise_pred,
                    t,
                    latents,
                    generator=block_state.generator,
                    return_dict=False,
                )[0]

                if i == len(timesteps) - 1 or (
                    (i + 1) > num_warmup_steps and (i + 1) % components.scheduler.order == 0
                ):
                    progress_bar.update()

        block_state.latents = latents
        return components, block_state
class HeliosPyramidChunkDenoiseInner(ModularPipelineBlocks):
    """Nested pyramid stage loop with inner timestep denoising.

    For each pyramid stage (small -> full resolution):
    1. Upsample latents + block noise correction (stages > 0)
    2. Compute mu from current resolution, set scheduler timesteps
    3. Run timestep denoising loop (same logic as HeliosChunkDenoiseInner)
    """

    model_name = "helios-pyramid"

    @property
    def description(self) -> str:
        return (
            "Pyramid denoising inner block: loops over pyramid stages from smallest to full resolution. "
            "Each stage upsamples latents (with block noise correction), recomputes scheduler parameters, "
            "and runs the timestep denoising loop."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("transformer", HeliosTransformer3DModel),
            ComponentSpec("scheduler", HeliosScheduler),
            ComponentSpec(
                "guider",
                ClassifierFreeZeroStarGuidance,
                config=FrozenDict({"guidance_scale": 5.0, "zero_init_steps": 2}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("latents"),
            InputParam("prompt_embeds", type_hint=torch.Tensor),
            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
            InputParam.template("denoiser_input_fields"),
            InputParam(
                "pyramid_num_inference_steps_list",
                default=[10, 10, 10],
                type_hint=list,
                description="Number of denoising steps per pyramid stage.",
            ),
            InputParam.template("attention_kwargs"),
            InputParam.template("generator"),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        device = components._execution_device
        transformer_dtype = components.transformer.dtype
        latents = block_state.latents
        pyramid_num_stages = len(block_state.pyramid_num_inference_steps_list)

        # Guider inputs: only encoder_hidden_states differs between cond/uncond
        guider_inputs = {
            "encoder_hidden_states": (block_state.prompt_embeds, block_state.negative_prompt_embeds),
        }

        # Build shared kwargs from denoiser_input_fields (excludes guider-managed ones);
        # only fields the transformer's forward() actually accepts are forwarded.
        transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys())
        shared_kwargs = {}
        for field_name, field_value in block_state.denoiser_input_fields.items():
            if field_name in transformer_args and field_name not in guider_inputs:
                shared_kwargs[field_name] = field_value

        # Add loop-internal history latents with dtype casting
        shared_kwargs["latents_history_short"] = block_state.latents_history_short.to(transformer_dtype)
        shared_kwargs["latents_history_mid"] = block_state.latents_history_mid.to(transformer_dtype)
        shared_kwargs["latents_history_long"] = block_state.latents_history_long.to(transformer_dtype)
        shared_kwargs["attention_kwargs"] = block_state.attention_kwargs

        # Save original zero_init_steps if the guider supports it (e.g. ClassifierFreeZeroStarGuidance).
        # Helios only applies zero init in pyramid stage 0 (lowest resolution), so we disable it
        # for subsequent stages by temporarily setting zero_init_steps=0.
        orig_zero_init_steps = getattr(components.guider, "zero_init_steps", None)

        for i_s in range(pyramid_num_stages):
            # --- Stage setup ---

            # Disable zero init for stages > 0 (only stage 0 should have zero init)
            if orig_zero_init_steps is not None and i_s > 0:
                components.guider.zero_init_steps = 0

            # a. Compute mu from current resolution (before upsample, matching standard pipeline)
            patch_size = components.transformer.config.patch_size
            image_seq_len = (latents.shape[-1] * latents.shape[-2] * latents.shape[-3]) // (
                patch_size[0] * patch_size[1] * patch_size[2]
            )
            mu = calculate_shift(
                image_seq_len,
                components.scheduler.config.get("base_image_seq_len", 256),
                components.scheduler.config.get("max_image_seq_len", 4096),
                components.scheduler.config.get("base_shift", 0.5),
                components.scheduler.config.get("max_shift", 1.15),
            )

            # b. Set scheduler timesteps for this stage
            num_inference_steps = block_state.pyramid_num_inference_steps_list[i_s]
            components.scheduler.set_timesteps(
                num_inference_steps,
                i_s,
                device=device,
                mu=mu,
            )
            timesteps = components.scheduler.timesteps

            # c. Upsample + block noise correction for stages > 0
            if i_s > 0:
                batch_size, num_channels_latents, num_frames, current_h, current_w = latents.shape
                new_h = current_h * 2
                new_w = current_w * 2

                # Fold time into the batch dim for 2D nearest-neighbor upsampling.
                latents = latents.permute(0, 2, 1, 3, 4).reshape(
                    batch_size * num_frames, num_channels_latents, current_h, current_w
                )
                latents = F.interpolate(latents, size=(new_h, new_w), mode="nearest")
                latents = latents.reshape(batch_size, num_frames, num_channels_latents, new_h, new_w).permute(
                    0, 2, 1, 3, 4
                )

                # Block noise correction
                ori_sigma = 1 - components.scheduler.ori_start_sigmas[i_s]
                gamma = components.scheduler.config.gamma
                alpha = 1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)
                beta = alpha * (1 - ori_sigma) / math.sqrt(gamma)

                batch_size, num_channels_latents, num_frames, h, w = latents.shape
                noise = sample_block_noise(
                    batch_size,
                    num_channels_latents,
                    num_frames,
                    h,
                    w,
                    gamma,
                    patch_size,
                    device=device,
                    generator=block_state.generator,
                )
                # NOTE(review): noise is cast to the transformer dtype while latents stay fp32;
                # the mix `alpha * latents + beta * noise` promotes back to fp32 — confirm the
                # downcast of the correction noise is intended.
                noise = noise.to(dtype=transformer_dtype)
                latents = alpha * latents + beta * noise

            # --- Timestep denoising loop ---
            # Warmup steps only affect progress-bar cadence below, not the math.
            num_warmup_steps = len(timesteps) - num_inference_steps * components.scheduler.order

            with tqdm(total=num_inference_steps) as progress_bar:
                for i, t in enumerate(timesteps):
                    timestep = t.expand(latents.shape[0]).to(torch.int64)
                    latent_model_input = latents.to(transformer_dtype)

                    components.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
                    guider_state = components.guider.prepare_inputs(guider_inputs)

                    # One forward pass per guidance branch (cond/uncond).
                    for guider_state_batch in guider_state:
                        components.guider.prepare_models(components.transformer)
                        # `kk` avoids shadowing the chunk-index parameter `k`.
                        cond_kwargs = {kk: getattr(guider_state_batch, kk) for kk in guider_inputs.keys()}

                        context_name = getattr(guider_state_batch, components.guider._identifier_key)
                        with components.transformer.cache_context(context_name):
                            guider_state_batch.noise_pred = components.transformer(
                                hidden_states=latent_model_input,
                                timestep=timestep,
                                return_dict=False,
                                **cond_kwargs,
                                **shared_kwargs,
                            )[0]
                        components.guider.cleanup_models(components.transformer)

                    # Combine branch predictions into the guided noise prediction.
                    noise_pred = components.guider(guider_state)[0]

                    # Scheduler step
                    latents = components.scheduler.step(
                        noise_pred,
                        t,
                        latents,
                        generator=block_state.generator,
                        return_dict=False,
                    )[0]

                    if i == len(timesteps) - 1 or (
                        (i + 1) > num_warmup_steps and (i + 1) % components.scheduler.order == 0
                    ):
                        progress_bar.update()

        # Restore original zero_init_steps
        if orig_zero_init_steps is not None:
            components.guider.zero_init_steps = orig_zero_init_steps

        block_state.latents = latents
        return components, block_state
class HeliosChunkUpdateStep(ModularPipelineBlocks):
    """Updates chunk collection and history after denoising a single chunk."""

    model_name = "helios"

    @property
    def description(self) -> str:
        return (
            "Post-denoising update step: appends the denoised latents to the chunk list, "
            "captures image_latents from the first chunk if needed, and extends history_latents."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return []

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("latents", type_hint=torch.Tensor),
            InputParam("history_latents", type_hint=torch.Tensor),
            InputParam("keep_first_frame", default=True, type_hint=bool),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
        denoised = block_state.latents

        # Collect this chunk's denoised latents for final decoding.
        block_state.latent_chunks.append(denoised)

        # On the very first chunk (T2V case), capture the generated first frame so later
        # chunks can use it as the short-history prefix.
        is_first_chunk = k == 0
        if block_state.keep_first_frame and is_first_chunk and block_state.image_latents is None:
            block_state.image_latents = denoised[:, :, 0:1, :, :]

        # Grow the rolling history along the temporal axis.
        block_state.history_latents = torch.cat([block_state.history_latents, denoised], dim=2)

        return components, block_state


# ========================================
# Chunk Loop Wrapper
# ========================================


class HeliosChunkLoopWrapper(LoopSequentialPipelineBlocks):
    """Outer chunk loop that iterates over temporal chunks.

    History indices, scheduler params, and history state are prepared by HeliosPrepareHistoryStep and
    HeliosSetTimestepsStep before this block runs. Sub-blocks handle per-chunk preparation, denoising, and history
    updates.
    """

    model_name = "helios"

    @property
    def description(self) -> str:
        return (
            "Pipeline block that iterates over temporal chunks for progressive video generation. "
            "At each chunk iteration, it runs sub-blocks for preparation, denoising, and history updates."
        )

    @property
    def loop_inputs(self) -> list[InputParam]:
        return [
            InputParam("num_latent_chunk", required=True, type_hint=int),
        ]

    @property
    def loop_intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors"),
        ]

    @torch.no_grad()
    def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        # Fresh chunk list per run; image_latents defaults to None so T2V runs
        # (which have no conditioning image) can capture it from the first chunk.
        block_state.latent_chunks = []
        if not hasattr(block_state, "image_latents"):
            block_state.image_latents = None

        for chunk_index in range(block_state.num_latent_chunk):
            components, block_state = self.loop_step(components, block_state, k=chunk_index)

        self.set_block_state(state, block_state)

        return components, state


# ========================================
# Composed Chunk Denoise Steps
# ========================================


class HeliosChunkDenoiseStep(HeliosChunkLoopWrapper):
    """T2V chunk-based denoising: history slice -> noise gen -> scheduler reset -> denoise -> update."""

    # Per-chunk sub-block order; block_names line up one-to-one with block_classes.
    block_classes = [
        HeliosChunkHistorySliceStep,
        HeliosChunkNoiseGenStep,
        HeliosChunkSchedulerResetStep,
        HeliosChunkDenoiseInner,
        HeliosChunkUpdateStep,
    ]
    block_names = ["history_slice", "noise_gen", "scheduler_reset", "denoise_inner", "update_chunk"]

    @property
    def description(self) -> str:
        return (
            "T2V chunk denoise step that iterates over temporal chunks.\n"
            "At each chunk: history_slice -> noise_gen -> scheduler_reset -> denoise_inner -> update_chunk."
        )
class HeliosI2VChunkDenoiseStep(HeliosChunkLoopWrapper):
    """I2V chunk-based denoising: I2V history slice -> noise gen -> scheduler reset -> denoise -> update."""

    # Same loop skeleton as the T2V variant, but the history-slice sub-block always
    # uses image_latents as the short-history prefix.
    block_classes = [
        HeliosI2VChunkHistorySliceStep,
        HeliosChunkNoiseGenStep,
        HeliosChunkSchedulerResetStep,
        HeliosChunkDenoiseInner,
        HeliosChunkUpdateStep,
    ]
    block_names = ["history_slice", "noise_gen", "scheduler_reset", "denoise_inner", "update_chunk"]

    @property
    def description(self) -> str:
        return (
            "I2V chunk denoise step that iterates over temporal chunks.\n"
            "At each chunk: history_slice (I2V) -> noise_gen -> scheduler_reset -> denoise_inner -> update_chunk."
        )
+ ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("transformer", HeliosTransformer3DModel), + ComponentSpec("scheduler", HeliosScheduler), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 1.0}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("latents"), + InputParam("prompt_embeds", type_hint=torch.Tensor), + InputParam("negative_prompt_embeds", type_hint=torch.Tensor), + InputParam.template("denoiser_input_fields"), + InputParam( + "pyramid_num_inference_steps_list", + default=[2, 2, 2], + type_hint=list, + description="Number of denoising steps per pyramid stage.", + ), + InputParam( + "is_amplify_first_chunk", + default=True, + type_hint=bool, + description="Whether to double the first chunk's timesteps via the scheduler for amplified generation.", + ), + InputParam.template("attention_kwargs"), + InputParam.template("generator"), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int): + device = components._execution_device + transformer_dtype = components.transformer.dtype + latents = block_state.latents + pyramid_num_stages = len(block_state.pyramid_num_inference_steps_list) + is_first_chunk = k == 0 + + # Track start points for DMD scheduler + start_point_list = [latents] + + # Guider inputs: only encoder_hidden_states differs between cond/uncond + guider_inputs = { + "encoder_hidden_states": (block_state.prompt_embeds, block_state.negative_prompt_embeds), + } + + # Build shared kwargs from denoiser_input_fields (excludes guider-managed ones) + transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys()) + shared_kwargs = {} + for field_name, field_value in block_state.denoiser_input_fields.items(): + if field_name in transformer_args and field_name not in guider_inputs: + 
shared_kwargs[field_name] = field_value + + # Add loop-internal history latents with dtype casting + shared_kwargs["latents_history_short"] = block_state.latents_history_short.to(transformer_dtype) + shared_kwargs["latents_history_mid"] = block_state.latents_history_mid.to(transformer_dtype) + shared_kwargs["latents_history_long"] = block_state.latents_history_long.to(transformer_dtype) + shared_kwargs["attention_kwargs"] = block_state.attention_kwargs + + for i_s in range(pyramid_num_stages): + # --- Stage setup --- + patch_size = components.transformer.config.patch_size + + # a. Compute mu from current resolution (before upsample, matching standard pipeline) + image_seq_len = (latents.shape[-1] * latents.shape[-2] * latents.shape[-3]) // ( + patch_size[0] * patch_size[1] * patch_size[2] + ) + mu = calculate_shift( + image_seq_len, + components.scheduler.config.get("base_image_seq_len", 256), + components.scheduler.config.get("max_image_seq_len", 4096), + components.scheduler.config.get("base_shift", 0.5), + components.scheduler.config.get("max_shift", 1.15), + ) + + # b. Set scheduler timesteps for this stage (with DMD amplification) + num_inference_steps = block_state.pyramid_num_inference_steps_list[i_s] + components.scheduler.set_timesteps( + num_inference_steps, + i_s, + device=device, + mu=mu, + is_amplify_first_chunk=block_state.is_amplify_first_chunk and is_first_chunk, + ) + timesteps = components.scheduler.timesteps + + # c. 
Upsample + block noise correction for stages > 0 + if i_s > 0: + batch_size, num_channels_latents, num_frames, current_h, current_w = latents.shape + new_h = current_h * 2 + new_w = current_w * 2 + + latents = latents.permute(0, 2, 1, 3, 4).reshape( + batch_size * num_frames, num_channels_latents, current_h, current_w + ) + latents = F.interpolate(latents, size=(new_h, new_w), mode="nearest") + latents = latents.reshape(batch_size, num_frames, num_channels_latents, new_h, new_w).permute( + 0, 2, 1, 3, 4 + ) + + # Block noise correction + ori_sigma = 1 - components.scheduler.ori_start_sigmas[i_s] + gamma = components.scheduler.config.gamma + alpha = 1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma) + beta = alpha * (1 - ori_sigma) / math.sqrt(gamma) + + batch_size, num_channels_latents, num_frames, h, w = latents.shape + noise = sample_block_noise( + batch_size, + num_channels_latents, + num_frames, + h, + w, + gamma, + patch_size, + device=device, + generator=block_state.generator, + ) + noise = noise.to(dtype=transformer_dtype) + latents = alpha * latents + beta * noise + + start_point_list.append(latents) + + # --- Timestep denoising loop --- + num_warmup_steps = len(timesteps) - num_inference_steps * components.scheduler.order + + with tqdm(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + timestep = t.expand(latents.shape[0]).to(torch.int64) + latent_model_input = latents.to(transformer_dtype) + + components.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t) + guider_state = components.guider.prepare_inputs(guider_inputs) + + for guider_state_batch in guider_state: + components.guider.prepare_models(components.transformer) + cond_kwargs = {k: getattr(guider_state_batch, k) for k in guider_inputs.keys()} + + context_name = getattr(guider_state_batch, components.guider._identifier_key) + with components.transformer.cache_context(context_name): + guider_state_batch.noise_pred = 
components.transformer( + hidden_states=latent_model_input, + timestep=timestep, + return_dict=False, + **cond_kwargs, + **shared_kwargs, + )[0] + components.guider.cleanup_models(components.transformer) + + noise_pred = components.guider(guider_state)[0] + + # Scheduler step with DMD args + latents = components.scheduler.step( + noise_pred, + t, + latents, + generator=block_state.generator, + return_dict=False, + cur_sampling_step=i, + dmd_noisy_tensor=start_point_list[i_s], + dmd_sigmas=components.scheduler.sigmas, + dmd_timesteps=components.scheduler.timesteps, + all_timesteps=timesteps, + )[0] + + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % components.scheduler.order == 0 + ): + progress_bar.update() + + block_state.latents = latents + return components, block_state + + +class HeliosPyramidChunkDenoiseStep(HeliosChunkLoopWrapper): + """T2V pyramid chunk denoising: history slice -> pyramid noise gen -> pyramid denoise inner -> update.""" + + block_classes = [ + HeliosChunkHistorySliceStep, + HeliosPyramidChunkNoiseGenStep, + HeliosPyramidChunkDenoiseInner, + HeliosChunkUpdateStep, + ] + block_names = ["history_slice", "noise_gen", "denoise_inner", "update_chunk"] + + @property + def description(self) -> str: + return ( + "T2V pyramid chunk denoise step that iterates over temporal chunks.\n" + "At each chunk: history_slice -> noise_gen (pyramid) -> denoise_inner (pyramid stages) -> update_chunk.\n" + "Denoising starts at the smallest resolution and progressively upsamples." 
+ ) + + +class HeliosPyramidI2VChunkDenoiseStep(HeliosChunkLoopWrapper): + """I2V pyramid chunk denoising: I2V history slice -> pyramid noise gen -> pyramid denoise inner -> update.""" + + block_classes = [ + HeliosI2VChunkHistorySliceStep, + HeliosPyramidChunkNoiseGenStep, + HeliosPyramidChunkDenoiseInner, + HeliosChunkUpdateStep, + ] + block_names = ["history_slice", "noise_gen", "denoise_inner", "update_chunk"] + + @property + def description(self) -> str: + return ( + "I2V pyramid chunk denoise step that iterates over temporal chunks.\n" + "At each chunk: history_slice (I2V) -> noise_gen (pyramid) -> denoise_inner (pyramid stages) -> update_chunk.\n" + "Denoising starts at the smallest resolution and progressively upsamples." + ) + + +class HeliosPyramidDistilledChunkDenoiseStep(HeliosChunkLoopWrapper): + """T2V distilled pyramid chunk denoising with DMD scheduler and no CFG.""" + + block_classes = [ + HeliosChunkHistorySliceStep, + HeliosPyramidChunkNoiseGenStep, + HeliosPyramidDistilledChunkDenoiseInner, + HeliosChunkUpdateStep, + ] + block_names = ["history_slice", "noise_gen", "denoise_inner", "update_chunk"] + + @property + def description(self) -> str: + return ( + "T2V distilled pyramid chunk denoise step with DMD scheduler.\n" + "At each chunk: history_slice -> noise_gen (pyramid) -> denoise_inner (distilled/DMD) -> update_chunk." 
+ ) + + +class HeliosPyramidDistilledI2VChunkDenoiseStep(HeliosChunkLoopWrapper): + """I2V distilled pyramid chunk denoising with DMD scheduler and no CFG.""" + + block_classes = [ + HeliosI2VChunkHistorySliceStep, + HeliosPyramidChunkNoiseGenStep, + HeliosPyramidDistilledChunkDenoiseInner, + HeliosChunkUpdateStep, + ] + block_names = ["history_slice", "noise_gen", "denoise_inner", "update_chunk"] + + @property + def description(self) -> str: + return ( + "I2V distilled pyramid chunk denoise step with DMD scheduler.\n" + "At each chunk: history_slice (I2V) -> noise_gen (pyramid) -> denoise_inner (distilled/DMD) -> update_chunk." + ) diff --git a/src/diffusers/modular_pipelines/helios/encoders.py b/src/diffusers/modular_pipelines/helios/encoders.py new file mode 100644 index 000000000000..4671fbd12c96 --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/encoders.py @@ -0,0 +1,392 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import html + +import regex as re +import torch +from transformers import AutoTokenizer, UMT5EncoderModel + +from ...configuration_utils import FrozenDict +from ...guiders import ClassifierFreeGuidance +from ...models import AutoencoderKLWan +from ...utils import is_ftfy_available, logging +from ...video_processor import VideoProcessor +from ..modular_pipeline import ModularPipelineBlocks, PipelineState +from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam +from .modular_pipeline import HeliosModularPipeline + + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + + +def prompt_clean(text): + text = whitespace_clean(basic_clean(text)) + return text + + +def get_t5_prompt_embeds( + text_encoder: UMT5EncoderModel, + tokenizer: AutoTokenizer, + prompt: str | list[str], + max_sequence_length: int, + device: torch.device, + dtype: torch.dtype | None = None, +): + """Encode text prompts into T5 embeddings for Helios. + + Args: + text_encoder: The T5 text encoder model. + tokenizer: The tokenizer for the text encoder. + prompt: The prompt or prompts to encode. + max_sequence_length: Maximum sequence length for tokenization. + device: Device to place tensors on. + dtype: Optional dtype override. Defaults to `text_encoder.dtype`. + + Returns: + A tuple of `(prompt_embeds, attention_mask)` where `prompt_embeds` is the encoded text embeddings and + `attention_mask` is a boolean mask. 
+ """ + dtype = dtype or text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt = [prompt_clean(u) for u in prompt] + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_attention_mask=True, + return_tensors="pt", + ) + text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask + seq_lens = mask.gt(0).sum(dim=1).long() + + prompt_embeds = text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)] + prompt_embeds = torch.stack( + [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0 + ) + + return prompt_embeds, text_inputs.attention_mask.bool() + + +class HeliosTextEncoderStep(ModularPipelineBlocks): + model_name = "helios" + + @property + def description(self) -> str: + return "Text Encoder step that generates text embeddings to guide the video generation" + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("text_encoder", UMT5EncoderModel), + ComponentSpec("tokenizer", AutoTokenizer), + ComponentSpec( + "guider", + ClassifierFreeGuidance, + config=FrozenDict({"guidance_scale": 5.0}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("prompt"), + InputParam.template("negative_prompt"), + InputParam.template("max_sequence_length"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("prompt_embeds"), + OutputParam.template("negative_prompt_embeds"), + ] + + @staticmethod + def check_inputs(prompt, negative_prompt): + if prompt is not None and not isinstance(prompt, (str, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but 
is {type(prompt)}") + + if negative_prompt is not None and not isinstance(negative_prompt, (str, list)): + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + if prompt is not None and negative_prompt is not None: + prompt_list = [prompt] if isinstance(prompt, str) else prompt + neg_list = [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + if type(prompt_list) is not type(neg_list): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + if len(prompt_list) != len(neg_list): + raise ValueError( + f"`negative_prompt` has batch size {len(neg_list)}, but `prompt` has batch size" + f" {len(prompt_list)}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + prompt = block_state.prompt + negative_prompt = block_state.negative_prompt + max_sequence_length = block_state.max_sequence_length + device = components._execution_device + + self.check_inputs(prompt, negative_prompt) + + # Encode prompt + block_state.prompt_embeds, _ = get_t5_prompt_embeds( + text_encoder=components.text_encoder, + tokenizer=components.tokenizer, + prompt=prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + # Encode negative prompt + block_state.negative_prompt_embeds = None + if components.requires_unconditional_embeds: + negative_prompt = negative_prompt or "" + if isinstance(prompt, list) and isinstance(negative_prompt, str): + negative_prompt = len(prompt) * [negative_prompt] + + block_state.negative_prompt_embeds, _ = get_t5_prompt_embeds( + text_encoder=components.text_encoder, + tokenizer=components.tokenizer, + prompt=negative_prompt, + max_sequence_length=max_sequence_length, + device=device, + ) + + 
self.set_block_state(state, block_state) + return components, state + + +class HeliosImageVaeEncoderStep(ModularPipelineBlocks): + """Encodes an input image into VAE latent space for image-to-video generation.""" + + model_name = "helios" + + @property + def description(self) -> str: + return ( + "Image Encoder step that encodes an input image into VAE latent space, " + "producing image_latents (first frame prefix) and fake_image_latents (history seed) " + "for image-to-video generation." + ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLWan), + ComponentSpec( + "video_processor", + VideoProcessor, + config=FrozenDict({"vae_scale_factor": 8}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam.template("image"), + InputParam.template("height", default=384), + InputParam.template("width", default=640), + InputParam( + "num_latent_frames_per_chunk", + default=9, + type_hint=int, + description="Number of latent frames per temporal chunk.", + ), + InputParam.template("generator"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("image_latents"), + OutputParam( + "fake_image_latents", type_hint=torch.Tensor, description="Fake image latents for history seeding" + ), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + vae = components.vae + device = components._execution_device + + latents_mean = ( + torch.tensor(vae.config.latents_mean).view(1, vae.config.z_dim, 1, 1, 1).to(vae.device, vae.dtype) + ) + latents_std = 1.0 / torch.tensor(vae.config.latents_std).view(1, vae.config.z_dim, 1, 1, 1).to( + vae.device, vae.dtype + ) + + # Preprocess image to 4D tensor (B, C, H, W) + image = components.video_processor.preprocess( + block_state.image, 
height=block_state.height, width=block_state.width + ) + image_5d = image.unsqueeze(2).to(device=device, dtype=vae.dtype) # (B, C, 1, H, W) + + # Encode image to get image_latents + image_latents = vae.encode(image_5d).latent_dist.sample(generator=block_state.generator) + image_latents = (image_latents - latents_mean) * latents_std + + # Encode fake video to get fake_image_latents + min_frames = (block_state.num_latent_frames_per_chunk - 1) * components.vae_scale_factor_temporal + 1 + fake_video = image_5d.repeat(1, 1, min_frames, 1, 1) # (B, C, min_frames, H, W) + fake_latents_full = vae.encode(fake_video).latent_dist.sample(generator=block_state.generator) + fake_latents_full = (fake_latents_full - latents_mean) * latents_std + fake_image_latents = fake_latents_full[:, :, -1:, :, :] + + block_state.image_latents = image_latents.to(device=device, dtype=torch.float32) + block_state.fake_image_latents = fake_image_latents.to(device=device, dtype=torch.float32) + + self.set_block_state(state, block_state) + return components, state + + +class HeliosVideoVaeEncoderStep(ModularPipelineBlocks): + """Encodes an input video into VAE latent space for video-to-video generation. + + Produces `image_latents` (first frame) and `video_latents` (remaining frames encoded in chunks). + """ + + model_name = "helios" + + @property + def description(self) -> str: + return ( + "Video Encoder step that encodes an input video into VAE latent space, " + "producing image_latents (first frame) and video_latents (chunked video frames) " + "for video-to-video generation." 
+ ) + + @property + def expected_components(self) -> list[ComponentSpec]: + return [ + ComponentSpec("vae", AutoencoderKLWan), + ComponentSpec( + "video_processor", + VideoProcessor, + config=FrozenDict({"vae_scale_factor": 8}), + default_creation_method="from_config", + ), + ] + + @property + def inputs(self) -> list[InputParam]: + return [ + InputParam("video", required=True, description="Input video for video-to-video generation"), + InputParam.template("height", default=384), + InputParam.template("width", default=640), + InputParam( + "num_latent_frames_per_chunk", + default=9, + type_hint=int, + description="Number of latent frames per temporal chunk.", + ), + InputParam.template("generator"), + ] + + @property + def intermediate_outputs(self) -> list[OutputParam]: + return [ + OutputParam.template("image_latents"), + OutputParam("video_latents", type_hint=torch.Tensor, description="Encoded video latents (chunked)"), + ] + + @torch.no_grad() + def __call__(self, components: HeliosModularPipeline, state: PipelineState) -> PipelineState: + block_state = self.get_block_state(state) + + vae = components.vae + device = components._execution_device + num_latent_frames_per_chunk = block_state.num_latent_frames_per_chunk + + latents_mean = ( + torch.tensor(vae.config.latents_mean).view(1, vae.config.z_dim, 1, 1, 1).to(vae.device, vae.dtype) + ) + latents_std = 1.0 / torch.tensor(vae.config.latents_std).view(1, vae.config.z_dim, 1, 1, 1).to( + vae.device, vae.dtype + ) + + # Preprocess video + video = components.video_processor.preprocess_video( + block_state.video, height=block_state.height, width=block_state.width + ) + video = video.to(device=device, dtype=vae.dtype) + + # Encode video into latents + num_frames = video.shape[2] + min_frames = (num_latent_frames_per_chunk - 1) * 4 + 1 + num_chunks = num_frames // min_frames + if num_chunks == 0: + raise ValueError( + f"Video must have at least {min_frames} frames " + f"(got {num_frames} frames). 
" + f"Required: (num_latent_frames_per_chunk - 1) * 4 + 1 = ({num_latent_frames_per_chunk} - 1) * 4 + 1 = {min_frames}" + ) + total_valid_frames = num_chunks * min_frames + start_frame = num_frames - total_valid_frames + + # Encode first frame + first_frame = video[:, :, 0:1, :, :] + image_latents = vae.encode(first_frame).latent_dist.sample(generator=block_state.generator) + image_latents = (image_latents - latents_mean) * latents_std + + # Encode remaining frames in chunks + latents_chunks = [] + for i in range(num_chunks): + chunk_start = start_frame + i * min_frames + chunk_end = chunk_start + min_frames + video_chunk = video[:, :, chunk_start:chunk_end, :, :] + chunk_latents = vae.encode(video_chunk).latent_dist.sample(generator=block_state.generator) + chunk_latents = (chunk_latents - latents_mean) * latents_std + latents_chunks.append(chunk_latents) + video_latents = torch.cat(latents_chunks, dim=2) + + block_state.image_latents = image_latents.to(device=device, dtype=torch.float32) + block_state.video_latents = video_latents.to(device=device, dtype=torch.float32) + + self.set_block_state(state, block_state) + return components, state diff --git a/src/diffusers/modular_pipelines/helios/modular_blocks_helios.py b/src/diffusers/modular_pipelines/helios/modular_blocks_helios.py new file mode 100644 index 000000000000..e01d62966465 --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/modular_blocks_helios.py @@ -0,0 +1,542 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam +from .before_denoise import ( + HeliosAdditionalInputsStep, + HeliosAddNoiseToImageLatentsStep, + HeliosAddNoiseToVideoLatentsStep, + HeliosI2VSeedHistoryStep, + HeliosPrepareHistoryStep, + HeliosSetTimestepsStep, + HeliosTextInputStep, + HeliosV2VSeedHistoryStep, +) +from .decoders import HeliosDecodeStep +from .denoise import HeliosChunkDenoiseStep, HeliosI2VChunkDenoiseStep +from .encoders import HeliosImageVaeEncoderStep, HeliosTextEncoderStep, HeliosVideoVaeEncoderStep + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# ==================== +# 1. Vae Encoder +# ==================== + + +# auto_docstring +class HeliosAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Encoder step that encodes video or image inputs. This is an auto pipeline block. + - `HeliosVideoVaeEncoderStep` (video_encoder) is used when `video` is provided. + - `HeliosImageVaeEncoderStep` (image_encoder) is used when `image` is provided. + - If neither is provided, step will be skipped. + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + video (`None`, *optional*): + Input video for video-to-video generation + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image (`Image | list`, *optional*): + Reference image(s) for denoising. 
Can be a single image or list of images. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. + video_latents (`Tensor`): + Encoded video latents (chunked) + fake_image_latents (`Tensor`): + Fake image latents for history seeding + """ + + block_classes = [HeliosVideoVaeEncoderStep, HeliosImageVaeEncoderStep] + block_names = ["video_encoder", "image_encoder"] + block_trigger_inputs = ["video", "image"] + + @property + def description(self): + return ( + "Encoder step that encodes video or image inputs. This is an auto pipeline block.\n" + " - `HeliosVideoVaeEncoderStep` (video_encoder) is used when `video` is provided.\n" + " - `HeliosImageVaeEncoderStep` (image_encoder) is used when `image` is provided.\n" + " - If neither is provided, step will be skipped." + ) + + +# ==================== +# 2. DENOISE +# ==================== + + +# DENOISE (T2V) +# auto_docstring +class HeliosCoreDenoiseStep(SequentialPipelineBlocks): + """ + Denoise block that takes encoded conditions and runs the chunk-based denoising process. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. 
+ history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios" + block_classes = [ + HeliosTextInputStep, + HeliosPrepareHistoryStep, + HeliosSetTimestepsStep, + HeliosChunkDenoiseStep, + ] + block_names = ["input", "prepare_history", "set_timesteps", "chunk_denoise"] + + @property + def description(self): + return "Denoise block that takes encoded conditions and runs the chunk-based denoising process." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# DENOISE (I2V) +# auto_docstring +class HeliosI2VCoreDenoiseStep(SequentialPipelineBlocks): + """ + I2V denoise block that seeds history with image latents and uses I2V-aware chunk preparation. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. 
+ prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video/fake-image latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video/fake-image latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
+ attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios" + block_classes = [ + HeliosTextInputStep, + HeliosAdditionalInputsStep( + image_latent_inputs=[InputParam.template("image_latents")], + additional_batch_inputs=[ + InputParam( + "fake_image_latents", + type_hint=torch.Tensor, + description="Fake image latents used as history seed for I2V generation.", + ), + ], + ), + HeliosAddNoiseToImageLatentsStep, + HeliosPrepareHistoryStep, + HeliosI2VSeedHistoryStep, + HeliosSetTimestepsStep, + HeliosI2VChunkDenoiseStep, + ] + block_names = [ + "input", + "additional_inputs", + "add_noise_image", + "prepare_history", + "seed_history", + "set_timesteps", + "chunk_denoise", + ] + + @property + def description(self): + return "I2V denoise block that seeds history with image latents and uses I2V-aware chunk preparation." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# DENOISE (V2V) +# auto_docstring +class HeliosV2VCoreDenoiseStep(SequentialPipelineBlocks): + """ + V2V denoise block that seeds history with video latents and uses I2V-aware chunk preparation. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. 
+ video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. + sigmas (`list`, *optional*): + Custom sigmas for the denoising process. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios" + block_classes = [ + HeliosTextInputStep, + HeliosAdditionalInputsStep( + image_latent_inputs=[InputParam.template("image_latents")], + additional_batch_inputs=[ + InputParam( + "video_latents", type_hint=torch.Tensor, description="Encoded video latents for V2V generation." 
+ ), + ], + ), + HeliosAddNoiseToVideoLatentsStep, + HeliosPrepareHistoryStep, + HeliosV2VSeedHistoryStep, + HeliosSetTimestepsStep, + HeliosI2VChunkDenoiseStep, + ] + block_names = [ + "input", + "additional_inputs", + "add_noise_video", + "prepare_history", + "seed_history", + "set_timesteps", + "chunk_denoise", + ] + + @property + def description(self): + return "V2V denoise block that seeds history with video latents and uses I2V-aware chunk preparation." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# AUTO DENOISE +# auto_docstring +class HeliosAutoCoreDenoiseStep(ConditionalPipelineBlocks): + """ + Core denoise step that selects the appropriate denoising block. + - `HeliosV2VCoreDenoiseStep` (video2video) for video-to-video tasks. + - `HeliosI2VCoreDenoiseStep` (image2video) for image-to-video tasks. + - `HeliosCoreDenoiseStep` (text2video) for text-to-video tasks. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. 
+        image_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for image latent noise.
+        video_noise_sigma_min (`float`, *optional*, defaults to 0.111):
+            Minimum sigma for video latent noise.
+        video_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for video latent noise.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_frames (`int`, *optional*, defaults to 132):
+            Total number of video frames to generate.
+        history_sizes (`list`, *optional*, defaults to [16, 2, 1]):
+            Sizes of long/mid/short history buffers for temporal context.
+        keep_first_frame (`bool`, *optional*, defaults to True):
+            Whether to keep the first frame as a prefix in history.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`list`, *optional*):
+            Custom sigmas for the denoising process.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        timesteps (`Tensor`, *optional*):
+            Timesteps for the denoising process.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        attention_kwargs (`dict`, *optional*):
+            Additional kwargs for attention processors.
+        fake_image_latents (`Tensor`, *optional*):
+            Fake image latents used as history seed for I2V generation.
+        height (`int`, *optional*, defaults to 384):
+            The height in pixels of the generated image.
+        width (`int`, *optional*, defaults to 640):
+            The width in pixels of the generated image.
+ + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + block_classes = [HeliosV2VCoreDenoiseStep, HeliosI2VCoreDenoiseStep, HeliosCoreDenoiseStep] + block_names = ["video2video", "image2video", "text2video"] + block_trigger_inputs = ["video_latents", "fake_image_latents"] + default_block_name = "text2video" + + def select_block(self, video_latents=None, fake_image_latents=None): + if video_latents is not None: + return "video2video" + elif fake_image_latents is not None: + return "image2video" + return None + + @property + def description(self): + return ( + "Core denoise step that selects the appropriate denoising block.\n" + " - `HeliosV2VCoreDenoiseStep` (video2video) for video-to-video tasks.\n" + " - `HeliosI2VCoreDenoiseStep` (image2video) for image-to-video tasks.\n" + " - `HeliosCoreDenoiseStep` (text2video) for text-to-video tasks." + ) + + +AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", HeliosTextEncoderStep()), + ("vae_encoder", HeliosAutoVaeEncoderStep()), + ("denoise", HeliosAutoCoreDenoiseStep()), + ("decode", HeliosDecodeStep()), + ] +) + +# ==================== +# 3. Auto Blocks +# ==================== + + +# auto_docstring +class HeliosAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for text-to-video, image-to-video, and video-to-video tasks using Helios. + + Supported workflows: + - `text2video`: requires `prompt` + - `image2video`: requires `prompt`, `image` + - `video2video`: requires `prompt`, `video` + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`HeliosTransformer3DModel`) scheduler + (`HeliosScheduler`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. 
+        max_sequence_length (`int`, *optional*, defaults to 512):
+            Maximum sequence length for prompt encoding.
+        video (`None`, *optional*):
+            Input video for video-to-video generation
+        height (`int`, *optional*, defaults to 384):
+            The height in pixels of the generated image.
+        width (`int`, *optional*, defaults to 640):
+            The width in pixels of the generated image.
+        num_latent_frames_per_chunk (`int`, *optional*, defaults to 9):
+            Number of latent frames per temporal chunk.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        image (`Image | list`, *optional*):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        num_videos_per_prompt (`int`, *optional*, defaults to 1):
+            Number of videos to generate per prompt.
+        image_latents (`Tensor`, *optional*):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        video_latents (`Tensor`, *optional*):
+            Encoded video latents for V2V generation.
+        image_noise_sigma_min (`float`, *optional*, defaults to 0.111):
+            Minimum sigma for image latent noise.
+        image_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for image latent noise.
+        video_noise_sigma_min (`float`, *optional*, defaults to 0.111):
+            Minimum sigma for video latent noise.
+        video_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for video latent noise.
+        num_frames (`int`, *optional*, defaults to 132):
+            Total number of video frames to generate.
+        history_sizes (`list`, *optional*, defaults to [16, 2, 1]):
+            Sizes of long/mid/short history buffers for temporal context.
+        keep_first_frame (`bool`, *optional*, defaults to True):
+            Whether to keep the first frame as a prefix in history.
+        num_inference_steps (`int`, *optional*, defaults to 50):
+            The number of denoising steps.
+        sigmas (`list`, *optional*):
+            Custom sigmas for the denoising process.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+ timesteps (`Tensor`, *optional*): + Timesteps for the denoising process. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. + """ + + model_name = "helios" + + block_classes = AUTO_BLOCKS.values() + block_names = AUTO_BLOCKS.keys() + + _workflow_map = { + "text2video": {"prompt": True}, + "image2video": {"prompt": True, "image": True}, + "video2video": {"prompt": True, "video": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for text-to-video, image-to-video, and video-to-video tasks using Helios." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/helios/modular_blocks_helios_pyramid.py b/src/diffusers/modular_pipelines/helios/modular_blocks_helios_pyramid.py new file mode 100644 index 000000000000..14f6bf80c221 --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/modular_blocks_helios_pyramid.py @@ -0,0 +1,520 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import torch + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam +from .before_denoise import ( + HeliosAdditionalInputsStep, + HeliosAddNoiseToImageLatentsStep, + HeliosAddNoiseToVideoLatentsStep, + HeliosI2VSeedHistoryStep, + HeliosPrepareHistoryStep, + HeliosTextInputStep, + HeliosV2VSeedHistoryStep, +) +from .decoders import HeliosDecodeStep +from .denoise import HeliosPyramidChunkDenoiseStep, HeliosPyramidI2VChunkDenoiseStep +from .encoders import HeliosImageVaeEncoderStep, HeliosTextEncoderStep, HeliosVideoVaeEncoderStep + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# ==================== +# 1. Vae Encoder +# ==================== + + +# auto_docstring +class HeliosPyramidAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Encoder step that encodes video or image inputs. This is an auto pipeline block. + - `HeliosVideoVaeEncoderStep` (video_encoder) is used when `video` is provided. + - `HeliosImageVaeEncoderStep` (image_encoder) is used when `image` is provided. + - If neither is provided, step will be skipped. + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + video (`None`, *optional*): + Input video for video-to-video generation + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. 
+ video_latents (`Tensor`): + Encoded video latents (chunked) + fake_image_latents (`Tensor`): + Fake image latents for history seeding + """ + + block_classes = [HeliosVideoVaeEncoderStep, HeliosImageVaeEncoderStep] + block_names = ["video_encoder", "image_encoder"] + block_trigger_inputs = ["video", "image"] + + @property + def description(self): + return ( + "Encoder step that encodes video or image inputs. This is an auto pipeline block.\n" + " - `HeliosVideoVaeEncoderStep` (video_encoder) is used when `video` is provided.\n" + " - `HeliosImageVaeEncoderStep` (image_encoder) is used when `image` is provided.\n" + " - If neither is provided, step will be skipped." + ) + + +# ==================== +# 2. DENOISE +# ==================== + + +# DENOISE (T2V) +# auto_docstring +class HeliosPyramidCoreDenoiseStep(SequentialPipelineBlocks): + """ + T2V pyramid denoise block with progressive multi-resolution denoising. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider + (`ClassifierFreeZeroStarGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. 
+ keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios-pyramid" + block_classes = [ + HeliosTextInputStep, + HeliosPrepareHistoryStep, + HeliosPyramidChunkDenoiseStep, + ] + block_names = ["input", "prepare_history", "pyramid_chunk_denoise"] + + @property + def description(self): + return "T2V pyramid denoise block with progressive multi-resolution denoising." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# DENOISE (I2V) +# auto_docstring +class HeliosPyramidI2VCoreDenoiseStep(SequentialPipelineBlocks): + """ + I2V pyramid denoise block with progressive multi-resolution denoising. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider + (`ClassifierFreeZeroStarGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video/fake-image latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video/fake-image latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. 
+ + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios-pyramid" + block_classes = [ + HeliosTextInputStep, + HeliosAdditionalInputsStep( + image_latent_inputs=[InputParam.template("image_latents")], + additional_batch_inputs=[ + InputParam( + "fake_image_latents", + type_hint=torch.Tensor, + description="Fake image latents used as history seed for I2V generation.", + ), + ], + ), + HeliosAddNoiseToImageLatentsStep, + HeliosPrepareHistoryStep, + HeliosI2VSeedHistoryStep, + HeliosPyramidI2VChunkDenoiseStep, + ] + block_names = [ + "input", + "additional_inputs", + "add_noise_image", + "prepare_history", + "seed_history", + "pyramid_chunk_denoise", + ] + + @property + def description(self): + return "I2V pyramid denoise block with progressive multi-resolution denoising." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# DENOISE (V2V) +# auto_docstring +class HeliosPyramidV2VCoreDenoiseStep(SequentialPipelineBlocks): + """ + V2V pyramid denoise block with progressive multi-resolution denoising. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider + (`ClassifierFreeZeroStarGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. 
+ num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios-pyramid" + block_classes = [ + HeliosTextInputStep, + HeliosAdditionalInputsStep( + image_latent_inputs=[InputParam.template("image_latents")], + additional_batch_inputs=[ + InputParam( + "video_latents", type_hint=torch.Tensor, description="Encoded video latents for V2V generation." 
+ ), + ], + ), + HeliosAddNoiseToVideoLatentsStep, + HeliosPrepareHistoryStep, + HeliosV2VSeedHistoryStep, + HeliosPyramidI2VChunkDenoiseStep, + ] + block_names = [ + "input", + "additional_inputs", + "add_noise_video", + "prepare_history", + "seed_history", + "pyramid_chunk_denoise", + ] + + @property + def description(self): + return "V2V pyramid denoise block with progressive multi-resolution denoising." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# AUTO DENOISE +# auto_docstring +class HeliosPyramidAutoCoreDenoiseStep(ConditionalPipelineBlocks): + """ + Pyramid core denoise step that selects the appropriate denoising block. + - `HeliosPyramidV2VCoreDenoiseStep` (video2video) for video-to-video tasks. + - `HeliosPyramidI2VCoreDenoiseStep` (image2video) for image-to-video tasks. + - `HeliosPyramidCoreDenoiseStep` (text2video) for text-to-video tasks. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider + (`ClassifierFreeZeroStarGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. 
+        image_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for image latent noise.
+        video_noise_sigma_min (`float`, *optional*, defaults to 0.111):
+            Minimum sigma for video latent noise.
+        video_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for video latent noise.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        num_frames (`int`, *optional*, defaults to 132):
+            Total number of video frames to generate.
+        history_sizes (`list`, *optional*, defaults to [16, 2, 1]):
+            Sizes of long/mid/short history buffers for temporal context.
+        keep_first_frame (`bool`, *optional*, defaults to True):
+            Whether to keep the first frame as a prefix in history.
+        pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]):
+            Number of denoising steps per pyramid stage.
+        latents (`Tensor`, *optional*):
+            Pre-generated noisy latents for image generation.
+        **denoiser_input_fields (`None`, *optional*):
+            conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
+        attention_kwargs (`dict`, *optional*):
+            Additional kwargs for attention processors.
+        fake_image_latents (`Tensor`, *optional*):
+            Fake image latents used as history seed for I2V generation.
+        height (`int`, *optional*, defaults to 384):
+            The height in pixels of the generated image.
+        width (`int`, *optional*, defaults to 640):
+            The width in pixels of the generated image.
+ + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + block_classes = [HeliosPyramidV2VCoreDenoiseStep, HeliosPyramidI2VCoreDenoiseStep, HeliosPyramidCoreDenoiseStep] + block_names = ["video2video", "image2video", "text2video"] + block_trigger_inputs = ["video_latents", "fake_image_latents"] + default_block_name = "text2video" + + def select_block(self, video_latents=None, fake_image_latents=None): + if video_latents is not None: + return "video2video" + elif fake_image_latents is not None: + return "image2video" + return None + + @property + def description(self): + return ( + "Pyramid core denoise step that selects the appropriate denoising block.\n" + " - `HeliosPyramidV2VCoreDenoiseStep` (video2video) for video-to-video tasks.\n" + " - `HeliosPyramidI2VCoreDenoiseStep` (image2video) for image-to-video tasks.\n" + " - `HeliosPyramidCoreDenoiseStep` (text2video) for text-to-video tasks." + ) + + +# ==================== +# 3. Auto Blocks +# ==================== + +PYRAMID_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", HeliosTextEncoderStep()), + ("vae_encoder", HeliosPyramidAutoVaeEncoderStep()), + ("denoise", HeliosPyramidAutoCoreDenoiseStep()), + ("decode", HeliosDecodeStep()), + ] +) + + +# auto_docstring +class HeliosPyramidAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for pyramid progressive generation (T2V/I2V/V2V) using Helios. + + Supported workflows: + - `text2video`: requires `prompt` + - `image2video`: requires `prompt`, `image` + - `video2video`: requires `prompt`, `video` + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`HeliosTransformer3DModel`) scheduler + (`HeliosScheduler`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. 
+        negative_prompt (`str`, *optional*):
+            The prompt or prompts not to guide the image generation.
+        max_sequence_length (`int`, *optional*, defaults to 512):
+            Maximum sequence length for prompt encoding.
+        video (`None`, *optional*):
+            Input video for video-to-video generation
+        height (`int`, *optional*, defaults to 384):
+            The height in pixels of the generated image.
+        width (`int`, *optional*, defaults to 640):
+            The width in pixels of the generated image.
+        num_latent_frames_per_chunk (`int`, *optional*, defaults to 9):
+            Number of latent frames per temporal chunk.
+        generator (`Generator`, *optional*):
+            Torch generator for deterministic generation.
+        image (`Image | list`, *optional*):
+            Reference image(s) for denoising. Can be a single image or list of images.
+        num_videos_per_prompt (`int`, *optional*, defaults to 1):
+            Number of videos to generate per prompt.
+        image_latents (`Tensor`, *optional*):
+            image latents used to guide the image generation. Can be generated from vae_encoder step.
+        video_latents (`Tensor`, *optional*):
+            Encoded video latents for V2V generation.
+        image_noise_sigma_min (`float`, *optional*, defaults to 0.111):
+            Minimum sigma for image latent noise.
+        image_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for image latent noise.
+        video_noise_sigma_min (`float`, *optional*, defaults to 0.111):
+            Minimum sigma for video latent noise.
+        video_noise_sigma_max (`float`, *optional*, defaults to 0.135):
+            Maximum sigma for video latent noise.
+        num_frames (`int`, *optional*, defaults to 132):
+            Total number of video frames to generate.
+        history_sizes (`list`, *optional*, defaults to [16, 2, 1]):
+            Sizes of long/mid/short history buffers for temporal context.
+        keep_first_frame (`bool`, *optional*, defaults to True):
+            Whether to keep the first frame as a prefix in history.
+        pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]):
+            Number of denoising steps per pyramid stage.
+ latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. + """ + + model_name = "helios-pyramid" + + block_classes = PYRAMID_AUTO_BLOCKS.values() + block_names = PYRAMID_AUTO_BLOCKS.keys() + + _workflow_map = { + "text2video": {"prompt": True}, + "image2video": {"prompt": True, "image": True}, + "video2video": {"prompt": True, "video": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for pyramid progressive generation (T2V/I2V/V2V) using Helios." + + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/helios/modular_blocks_helios_pyramid_distilled.py b/src/diffusers/modular_pipelines/helios/modular_blocks_helios_pyramid_distilled.py new file mode 100644 index 000000000000..e9e37df5d00c --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/modular_blocks_helios_pyramid_distilled.py @@ -0,0 +1,530 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from ...utils import logging +from ..modular_pipeline import AutoPipelineBlocks, ConditionalPipelineBlocks, SequentialPipelineBlocks +from ..modular_pipeline_utils import InputParam, InsertableDict, OutputParam +from .before_denoise import ( + HeliosAdditionalInputsStep, + HeliosAddNoiseToImageLatentsStep, + HeliosAddNoiseToVideoLatentsStep, + HeliosI2VSeedHistoryStep, + HeliosPrepareHistoryStep, + HeliosTextInputStep, + HeliosV2VSeedHistoryStep, +) +from .decoders import HeliosDecodeStep +from .denoise import HeliosPyramidDistilledChunkDenoiseStep, HeliosPyramidDistilledI2VChunkDenoiseStep +from .encoders import HeliosImageVaeEncoderStep, HeliosTextEncoderStep, HeliosVideoVaeEncoderStep + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# ==================== +# 1. Vae Encoder +# ==================== + + +# auto_docstring +class HeliosPyramidDistilledAutoVaeEncoderStep(AutoPipelineBlocks): + """ + Encoder step for distilled pyramid pipeline. + - `HeliosVideoVaeEncoderStep` (video_encoder) is used when `video` is provided. + - `HeliosImageVaeEncoderStep` (image_encoder) is used when `image` is provided. + - If neither is provided, step will be skipped. + + Components: + vae (`AutoencoderKLWan`) video_processor (`VideoProcessor`) + + Inputs: + video (`None`, *optional*): + Input video for video-to-video generation + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. 
+ + Outputs: + image_latents (`Tensor`): + The latent representation of the input image. + video_latents (`Tensor`): + Encoded video latents (chunked) + fake_image_latents (`Tensor`): + Fake image latents for history seeding + """ + + block_classes = [HeliosVideoVaeEncoderStep, HeliosImageVaeEncoderStep] + block_names = ["video_encoder", "image_encoder"] + block_trigger_inputs = ["video", "image"] + + @property + def description(self): + return ( + "Encoder step for distilled pyramid pipeline.\n" + " - `HeliosVideoVaeEncoderStep` (video_encoder) is used when `video` is provided.\n" + " - `HeliosImageVaeEncoderStep` (image_encoder) is used when `image` is provided.\n" + " - If neither is provided, step will be skipped." + ) + + +# ==================== +# 2. DENOISE +# ==================== + + +# DENOISE (T2V) +# auto_docstring +class HeliosPyramidDistilledCoreDenoiseStep(SequentialPipelineBlocks): + """ + T2V distilled pyramid denoise block with DMD scheduler and no CFG. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. 
+ history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + is_amplify_first_chunk (`bool`, *optional*, defaults to True): + Whether to double the first chunk's timesteps via the scheduler for amplified generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios-pyramid" + block_classes = [ + HeliosTextInputStep, + HeliosPrepareHistoryStep, + HeliosPyramidDistilledChunkDenoiseStep, + ] + block_names = ["input", "prepare_history", "pyramid_chunk_denoise"] + + @property + def description(self): + return "T2V distilled pyramid denoise block with DMD scheduler and no CFG." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# DENOISE (I2V) +# auto_docstring +class HeliosPyramidDistilledI2VCoreDenoiseStep(SequentialPipelineBlocks): + """ + I2V distilled pyramid denoise block with DMD scheduler and no CFG. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. 
+ prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. + image_latents (`Tensor`): + image latents used to guide the image generation. Can be generated from vae_encoder step. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video/fake-image latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video/fake-image latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. 
+ is_amplify_first_chunk (`bool`, *optional*, defaults to True): + Whether to double the first chunk's timesteps via the scheduler for amplified generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios-pyramid" + block_classes = [ + HeliosTextInputStep, + HeliosAdditionalInputsStep( + image_latent_inputs=[InputParam.template("image_latents")], + additional_batch_inputs=[ + InputParam( + "fake_image_latents", + type_hint=torch.Tensor, + description="Fake image latents used as history seed for I2V generation.", + ), + ], + ), + HeliosAddNoiseToImageLatentsStep, + HeliosPrepareHistoryStep, + HeliosI2VSeedHistoryStep, + HeliosPyramidDistilledI2VChunkDenoiseStep, + ] + block_names = [ + "input", + "additional_inputs", + "add_noise_image", + "prepare_history", + "seed_history", + "pyramid_chunk_denoise", + ] + + @property + def description(self): + return "I2V distilled pyramid denoise block with DMD scheduler and no CFG." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# DENOISE (V2V) +# auto_docstring +class HeliosPyramidDistilledV2VCoreDenoiseStep(SequentialPipelineBlocks): + """ + V2V distilled pyramid denoise block with DMD scheduler and no CFG. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + history_sizes (`list`, *optional*, defaults to [16, 2, 1]): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + is_amplify_first_chunk (`bool`, *optional*, defaults to True): + Whether to double the first chunk's timesteps via the scheduler for amplified generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. 
+ + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + model_name = "helios-pyramid" + block_classes = [ + HeliosTextInputStep, + HeliosAdditionalInputsStep( + image_latent_inputs=[InputParam.template("image_latents")], + additional_batch_inputs=[ + InputParam( + "video_latents", type_hint=torch.Tensor, description="Encoded video latents for V2V generation." + ), + ], + ), + HeliosAddNoiseToVideoLatentsStep, + HeliosPrepareHistoryStep, + HeliosV2VSeedHistoryStep, + HeliosPyramidDistilledI2VChunkDenoiseStep, + ] + block_names = [ + "input", + "additional_inputs", + "add_noise_video", + "prepare_history", + "seed_history", + "pyramid_chunk_denoise", + ] + + @property + def description(self): + return "V2V distilled pyramid denoise block with DMD scheduler and no CFG." + + @property + def outputs(self): + return [OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors")] + + +# AUTO DENOISE +# auto_docstring +class HeliosPyramidDistilledAutoCoreDenoiseStep(ConditionalPipelineBlocks): + """ + Distilled pyramid core denoise step that selects the appropriate denoising block. + - `HeliosPyramidDistilledV2VCoreDenoiseStep` (video2video) for video-to-video tasks. + - `HeliosPyramidDistilledI2VCoreDenoiseStep` (image2video) for image-to-video tasks. + - `HeliosPyramidDistilledCoreDenoiseStep` (text2video) for text-to-video tasks. + + Components: + transformer (`HeliosTransformer3DModel`) scheduler (`HeliosScheduler`) guider (`ClassifierFreeGuidance`) + + Inputs: + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + prompt_embeds (`Tensor`): + text embeddings used to guide the image generation. Can be generated from text_encoder step. + negative_prompt_embeds (`Tensor`, *optional*): + negative text embeddings used to guide the image generation. Can be generated from text_encoder step. 
+ image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video latent noise. + video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video latent noise. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + history_sizes (`list`): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + is_amplify_first_chunk (`bool`, *optional*, defaults to True): + Whether to double the first chunk's timesteps via the scheduler for amplified generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. 
+ width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + + Outputs: + latent_chunks (`list`): + List of per-chunk denoised latent tensors + """ + + block_classes = [ + HeliosPyramidDistilledV2VCoreDenoiseStep, + HeliosPyramidDistilledI2VCoreDenoiseStep, + HeliosPyramidDistilledCoreDenoiseStep, + ] + block_names = ["video2video", "image2video", "text2video"] + block_trigger_inputs = ["video_latents", "fake_image_latents"] + default_block_name = "text2video" + + def select_block(self, video_latents=None, fake_image_latents=None): + if video_latents is not None: + return "video2video" + elif fake_image_latents is not None: + return "image2video" + return None + + @property + def description(self): + return ( + "Distilled pyramid core denoise step that selects the appropriate denoising block.\n" + " - `HeliosPyramidDistilledV2VCoreDenoiseStep` (video2video) for video-to-video tasks.\n" + " - `HeliosPyramidDistilledI2VCoreDenoiseStep` (image2video) for image-to-video tasks.\n" + " - `HeliosPyramidDistilledCoreDenoiseStep` (text2video) for text-to-video tasks." + ) + + +# ==================== +# 3. Auto Blocks +# ==================== + +DISTILLED_PYRAMID_AUTO_BLOCKS = InsertableDict( + [ + ("text_encoder", HeliosTextEncoderStep()), + ("vae_encoder", HeliosPyramidDistilledAutoVaeEncoderStep()), + ("denoise", HeliosPyramidDistilledAutoCoreDenoiseStep()), + ("decode", HeliosDecodeStep()), + ] +) + + +# auto_docstring +class HeliosPyramidDistilledAutoBlocks(SequentialPipelineBlocks): + """ + Auto Modular pipeline for distilled pyramid progressive generation (T2V/I2V/V2V) using Helios. 
+ + Supported workflows: + - `text2video`: requires `prompt` + - `image2video`: requires `prompt`, `image` + - `video2video`: requires `prompt`, `video` + + Components: + text_encoder (`UMT5EncoderModel`) tokenizer (`AutoTokenizer`) guider (`ClassifierFreeGuidance`) vae + (`AutoencoderKLWan`) video_processor (`VideoProcessor`) transformer (`HeliosTransformer3DModel`) scheduler + (`HeliosScheduler`) + + Inputs: + prompt (`str`): + The prompt or prompts to guide image generation. + negative_prompt (`str`, *optional*): + The prompt or prompts not to guide the image generation. + max_sequence_length (`int`, *optional*, defaults to 512): + Maximum sequence length for prompt encoding. + video (`None`, *optional*): + Input video for video-to-video generation + height (`int`, *optional*, defaults to 384): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 640): + The width in pixels of the generated image. + num_latent_frames_per_chunk (`int`, *optional*, defaults to 9): + Number of latent frames per temporal chunk. + generator (`Generator`, *optional*): + Torch generator for deterministic generation. + image (`Image | list`, *optional*): + Reference image(s) for denoising. Can be a single image or list of images. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos to generate per prompt. + image_latents (`Tensor`, *optional*): + image latents used to guide the image generation. Can be generated from vae_encoder step. + video_latents (`Tensor`, *optional*): + Encoded video latents for V2V generation. + image_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for image latent noise. + image_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for image latent noise. + video_noise_sigma_min (`float`, *optional*, defaults to 0.111): + Minimum sigma for video latent noise. 
+ video_noise_sigma_max (`float`, *optional*, defaults to 0.135): + Maximum sigma for video latent noise. + num_frames (`int`, *optional*, defaults to 132): + Total number of video frames to generate. + history_sizes (`list`): + Sizes of long/mid/short history buffers for temporal context. + keep_first_frame (`bool`, *optional*, defaults to True): + Whether to keep the first frame as a prefix in history. + pyramid_num_inference_steps_list (`list`, *optional*, defaults to [10, 10, 10]): + Number of denoising steps per pyramid stage. + latents (`Tensor`, *optional*): + Pre-generated noisy latents for image generation. + **denoiser_input_fields (`None`, *optional*): + conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc. + is_amplify_first_chunk (`bool`, *optional*, defaults to True): + Whether to double the first chunk's timesteps via the scheduler for amplified generation. + attention_kwargs (`dict`, *optional*): + Additional kwargs for attention processors. + fake_image_latents (`Tensor`, *optional*): + Fake image latents used as history seed for I2V generation. + output_type (`str`, *optional*, defaults to np): + Output format: 'pil', 'np', 'pt'. + + Outputs: + videos (`list`): + The generated videos. + """ + + model_name = "helios-pyramid" + + block_classes = DISTILLED_PYRAMID_AUTO_BLOCKS.values() + block_names = DISTILLED_PYRAMID_AUTO_BLOCKS.keys() + + _workflow_map = { + "text2video": {"prompt": True}, + "image2video": {"prompt": True, "image": True}, + "video2video": {"prompt": True, "video": True}, + } + + @property + def description(self): + return "Auto Modular pipeline for distilled pyramid progressive generation (T2V/I2V/V2V) using Helios." 
+ + @property + def outputs(self): + return [OutputParam.template("videos")] diff --git a/src/diffusers/modular_pipelines/helios/modular_pipeline.py b/src/diffusers/modular_pipelines/helios/modular_pipeline.py new file mode 100644 index 000000000000..fd3875381c56 --- /dev/null +++ b/src/diffusers/modular_pipelines/helios/modular_pipeline.py @@ -0,0 +1,87 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from ...loaders import HeliosLoraLoaderMixin +from ...utils import logging +from ..modular_pipeline import ModularPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class HeliosModularPipeline( + ModularPipeline, + HeliosLoraLoaderMixin, +): + """ + A ModularPipeline for Helios text-to-video generation. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "HeliosAutoBlocks" + + @property + def vae_scale_factor_spatial(self): + vae_scale_factor = 8 + if hasattr(self, "vae") and self.vae is not None: + vae_scale_factor = self.vae.config.scale_factor_spatial + return vae_scale_factor + + @property + def vae_scale_factor_temporal(self): + vae_scale_factor = 4 + if hasattr(self, "vae") and self.vae is not None: + vae_scale_factor = self.vae.config.scale_factor_temporal + return vae_scale_factor + + @property + def num_channels_latents(self): + # YiYi TODO: find out default value + num_channels_latents = 16 + if hasattr(self, "transformer") and self.transformer is not None: + num_channels_latents = self.transformer.config.in_channels + return num_channels_latents + + @property + def requires_unconditional_embeds(self): + requires_unconditional_embeds = False + + if hasattr(self, "guider") and self.guider is not None: + requires_unconditional_embeds = self.guider._enabled and self.guider.num_conditions > 1 + + return requires_unconditional_embeds + + +class HeliosPyramidModularPipeline(HeliosModularPipeline): + """ + A ModularPipeline for Helios pyramid (progressive resolution) video generation. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. + """ + + default_blocks_name = "HeliosPyramidAutoBlocks" + + +class HeliosPyramidDistilledModularPipeline(HeliosModularPipeline): + """ + A ModularPipeline for Helios distilled pyramid video generation using DMD scheduler. + + Uses guidance_scale=1.0 (no CFG) and supports is_amplify_first_chunk for the DMD scheduler. + + > [!WARNING] > This is an experimental feature and is likely to change in the future. 
+ """ + + default_blocks_name = "HeliosPyramidDistilledAutoBlocks" diff --git a/src/diffusers/modular_pipelines/modular_pipeline.py b/src/diffusers/modular_pipelines/modular_pipeline.py index a563d2aa99eb..9cd2f9f5c6ae 100644 --- a/src/diffusers/modular_pipelines/modular_pipeline.py +++ b/src/diffusers/modular_pipelines/modular_pipeline.py @@ -106,6 +106,16 @@ def _wan_i2v_map_fn(config_dict=None): return "WanImage2VideoModularPipeline" +def _helios_pyramid_map_fn(config_dict=None): + if config_dict is None: + return "HeliosPyramidModularPipeline" + + if config_dict.get("is_distilled", False): + return "HeliosPyramidDistilledModularPipeline" + else: + return "HeliosPyramidModularPipeline" + + MODULAR_PIPELINE_MAPPING = OrderedDict( [ ("stable-diffusion-xl", _create_default_map_fn("StableDiffusionXLModularPipeline")), @@ -120,6 +130,8 @@ def _wan_i2v_map_fn(config_dict=None): ("qwenimage-edit-plus", _create_default_map_fn("QwenImageEditPlusModularPipeline")), ("qwenimage-layered", _create_default_map_fn("QwenImageLayeredModularPipeline")), ("z-image", _create_default_map_fn("ZImageModularPipeline")), + ("helios", _create_default_map_fn("HeliosModularPipeline")), + ("helios-pyramid", _helios_pyramid_map_fn), ] ) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 157b04ef266a..730a788ed1b8 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,96 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class HeliosAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, 
**kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HeliosModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HeliosPyramidAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HeliosPyramidDistilledAutoBlocks(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HeliosPyramidDistilledModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class HeliosPyramidModularPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, 
**kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class QwenImageAutoBlocks(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/modular_pipelines/helios/__init__.py b/tests/modular_pipelines/helios/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/modular_pipelines/helios/test_modular_pipeline_helios.py b/tests/modular_pipelines/helios/test_modular_pipeline_helios.py new file mode 100644 index 000000000000..44a01dad6525 --- /dev/null +++ b/tests/modular_pipelines/helios/test_modular_pipeline_helios.py @@ -0,0 +1,166 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import pytest

from diffusers.modular_pipelines import (
    HeliosAutoBlocks,
    HeliosModularPipeline,
    HeliosPyramidAutoBlocks,
    HeliosPyramidModularPipeline,
)

from ..test_modular_pipelines_common import ModularPipelineTesterMixin


# Expected (sub-block path, block class name) sequence per workflow for the
# base Helios auto blocks. Consumed through `expected_workflow_blocks` by the
# tester mixin, which presumably verifies that the auto blocks resolve each
# trigger-input combination to these sub-blocks — TODO confirm against
# ModularPipelineTesterMixin.
HELIOS_WORKFLOWS = {
    "text2video": [
        ("text_encoder", "HeliosTextEncoderStep"),
        ("denoise.input", "HeliosTextInputStep"),
        ("denoise.prepare_history", "HeliosPrepareHistoryStep"),
        ("denoise.set_timesteps", "HeliosSetTimestepsStep"),
        ("denoise.chunk_denoise", "HeliosChunkDenoiseStep"),
        ("decode", "HeliosDecodeStep"),
    ],
    "image2video": [
        ("text_encoder", "HeliosTextEncoderStep"),
        ("vae_encoder", "HeliosImageVaeEncoderStep"),
        ("denoise.input", "HeliosTextInputStep"),
        ("denoise.additional_inputs", "HeliosAdditionalInputsStep"),
        ("denoise.add_noise_image", "HeliosAddNoiseToImageLatentsStep"),
        ("denoise.prepare_history", "HeliosPrepareHistoryStep"),
        ("denoise.seed_history", "HeliosI2VSeedHistoryStep"),
        ("denoise.set_timesteps", "HeliosSetTimestepsStep"),
        ("denoise.chunk_denoise", "HeliosI2VChunkDenoiseStep"),
        ("decode", "HeliosDecodeStep"),
    ],
    "video2video": [
        ("text_encoder", "HeliosTextEncoderStep"),
        ("vae_encoder", "HeliosVideoVaeEncoderStep"),
        ("denoise.input", "HeliosTextInputStep"),
        ("denoise.additional_inputs", "HeliosAdditionalInputsStep"),
        ("denoise.add_noise_video", "HeliosAddNoiseToVideoLatentsStep"),
        ("denoise.prepare_history", "HeliosPrepareHistoryStep"),
        ("denoise.seed_history", "HeliosV2VSeedHistoryStep"),
        ("denoise.set_timesteps", "HeliosSetTimestepsStep"),
        # NOTE(review): V2V reuses the I2V chunk-denoise block here — confirm
        # this mirrors HeliosAutoBlocks rather than being a copy-paste slip.
        ("denoise.chunk_denoise", "HeliosI2VChunkDenoiseStep"),
        ("decode", "HeliosDecodeStep"),
    ],
}


class TestHeliosModularPipelineFast(ModularPipelineTesterMixin):
    """Fast tests for the base (non-pyramid) Helios modular pipeline."""

    pipeline_class = HeliosModularPipeline
    pipeline_blocks_class = HeliosAutoBlocks
    pretrained_model_name_or_path = "hf-internal-testing/tiny-helios-modular-pipe"

    # Required call arguments exercised by the mixin's signature checks.
    params = frozenset(["prompt", "height", "width", "num_frames"])
    batch_params = frozenset(["prompt"])
    optional_params = frozenset(["num_inference_steps", "num_videos_per_prompt", "latents"])
    output_name = "videos"
    expected_workflow_blocks = HELIOS_WORKFLOWS

    def get_dummy_inputs(self, seed=0):
        """Return minimal deterministic inputs (tiny sizes keep the fast suite fast)."""
        generator = self.get_generator(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "num_inference_steps": 2,
            "height": 16,
            "width": 16,
            "num_frames": 9,
            "max_sequence_length": 16,
            "output_type": "pt",
        }
        return inputs

    # The common test targets `num_images_per_prompt`; this pipeline exposes
    # `num_videos_per_prompt` instead, so the generic test does not apply.
    @pytest.mark.skip(reason="num_videos_per_prompt")
    def test_num_images_per_prompt(self):
        pass


# Same structure as HELIOS_WORKFLOWS, but for the pyramid auto blocks: the
# per-step scheduler blocks are replaced by the pyramid chunk-denoise blocks
# (no explicit set_timesteps entry).
HELIOS_PYRAMID_WORKFLOWS = {
    "text2video": [
        ("text_encoder", "HeliosTextEncoderStep"),
        ("denoise.input", "HeliosTextInputStep"),
        ("denoise.prepare_history", "HeliosPrepareHistoryStep"),
        ("denoise.pyramid_chunk_denoise", "HeliosPyramidChunkDenoiseStep"),
        ("decode", "HeliosDecodeStep"),
    ],
    "image2video": [
        ("text_encoder", "HeliosTextEncoderStep"),
        ("vae_encoder", "HeliosImageVaeEncoderStep"),
        ("denoise.input", "HeliosTextInputStep"),
        ("denoise.additional_inputs", "HeliosAdditionalInputsStep"),
        ("denoise.add_noise_image", "HeliosAddNoiseToImageLatentsStep"),
        ("denoise.prepare_history", "HeliosPrepareHistoryStep"),
        ("denoise.seed_history", "HeliosI2VSeedHistoryStep"),
        ("denoise.pyramid_chunk_denoise", "HeliosPyramidI2VChunkDenoiseStep"),
        ("decode", "HeliosDecodeStep"),
    ],
    "video2video": [
        ("text_encoder", "HeliosTextEncoderStep"),
        ("vae_encoder", "HeliosVideoVaeEncoderStep"),
        ("denoise.input", "HeliosTextInputStep"),
        ("denoise.additional_inputs", "HeliosAdditionalInputsStep"),
        ("denoise.add_noise_video", "HeliosAddNoiseToVideoLatentsStep"),
        ("denoise.prepare_history", "HeliosPrepareHistoryStep"),
        ("denoise.seed_history", "HeliosV2VSeedHistoryStep"),
        ("denoise.pyramid_chunk_denoise", "HeliosPyramidI2VChunkDenoiseStep"),
        ("decode", "HeliosDecodeStep"),
    ],
}


class TestHeliosPyramidModularPipelineFast(ModularPipelineTesterMixin):
    """Fast tests for the Helios pyramid (progressive) modular pipeline."""

    pipeline_class = HeliosPyramidModularPipeline
    pipeline_blocks_class = HeliosPyramidAutoBlocks
    pretrained_model_name_or_path = "hf-internal-testing/tiny-helios-pyramid-modular-pipe"

    params = frozenset(["prompt", "height", "width", "num_frames"])
    batch_params = frozenset(["prompt"])
    # The pyramid pipeline takes a per-stage step list instead of a single
    # num_inference_steps.
    optional_params = frozenset(["pyramid_num_inference_steps_list", "num_videos_per_prompt", "latents"])
    output_name = "videos"
    expected_workflow_blocks = HELIOS_PYRAMID_WORKFLOWS

    def get_dummy_inputs(self, seed=0):
        """Return minimal deterministic inputs for the pyramid pipeline."""
        generator = self.get_generator(seed)
        inputs = {
            "prompt": "A painting of a squirrel eating a burger",
            "generator": generator,
            "pyramid_num_inference_steps_list": [2, 2],
            "height": 64,
            "width": 64,
            "num_frames": 9,
            "max_sequence_length": 16,
            "output_type": "pt",
        }
        return inputs

    def test_inference_batch_single_identical(self):
        # Pyramid pipeline injects noise at each stage, so batch vs single can differ more
        super().test_inference_batch_single_identical(expected_max_diff=5e-1)

    @pytest.mark.skip(reason="Pyramid multi-stage noise makes offload comparison unreliable with tiny models")
    def test_components_auto_cpu_offload_inference_consistent(self):
        pass

    @pytest.mark.skip(reason="Pyramid multi-stage noise makes save/load comparison unreliable with tiny models")
    def test_save_from_pretrained(self):
        pass

    # Same rationale as the base class: this pipeline exposes
    # num_videos_per_prompt, not num_images_per_prompt.
    @pytest.mark.skip(reason="num_videos_per_prompt")
    def test_num_images_per_prompt(self):
        pass