Skip to content

[modular] helios#13216

Open
yiyixuxu wants to merge 4 commits into main from
helios-modular
Open

[modular] helios#13216
yiyixuxu wants to merge 4 commits into main from
helios-modular

Conversation

@yiyixuxu
Copy link
Collaborator

@yiyixuxu yiyixuxu commented Mar 5, 2026

import torch
from diffusers import ModularPipeline, ClassifierFreeGuidance, ClassifierFreeZeroStarGuidance
from diffusers.utils import export_to_video, load_image, load_video
import gc

# Tag interpolated into exported file names (e.g. the git branch under test);
# empty by default.
branch = ""

#############################
# helios-pyramid-distilled
#############################
print("=== Helios-Distilled ===")

# Build the modular pipeline from the checkpoint repo, load its components in
# bfloat16, and move it to the GPU.
distilled_repo = "BestWishYsh/Helios-Distilled"
mod_pipe = ModularPipeline.from_pretrained(distilled_repo)
mod_pipe.load_components(torch_dtype=torch.bfloat16)
mod_pipe.to("cuda")

# TODO: upload a guider to each model repo so that every checkpoint can
# configure its own guidance; until then the guider is set by hand here.
guider = ClassifierFreeGuidance(guidance_scale=1.0)
mod_pipe.update_components(guider=guider)

# --- T2V ---
# Text-to-video with the distilled checkpoint: only a text prompt is passed,
# with no `image` or `video` conditioning input.
print("=== T2V ===")
prompt = (
    "A vibrant tropical fish swimming gracefully among colorful coral reefs in a clear, turquoise ocean. "
    "The fish has bright blue and yellow scales with a small, distinctive orange spot on its side, its fins moving "
    "fluidly. The coral reefs are alive with a variety of marine life, including small schools of colorful fish and "
    "sea turtles gliding by. The water is crystal clear, allowing for a view of the sandy ocean floor below. The reef "
    "itself is adorned with a mix of hard and soft corals in shades of red, orange, and green. The photo captures "
    "the fish from a slightly elevated angle, emphasizing its lively movements and the vivid colors of its surroundings. "
    "A close-up shot with dynamic movement."
)

# Seed the global RNG and also hand the pipeline an explicitly seeded CUDA
# generator, so the run is repeatable.
torch.manual_seed(42)
output = mod_pipe(
    prompt=prompt,
    height=384,
    width=640,
    num_frames=240,
    pyramid_num_inference_steps_list=[2, 2, 2],
    is_amplify_first_chunk=True,
    generator=torch.Generator("cuda").manual_seed(42),
    output="videos",
)

# Tag the file name with `branch` like every other export in this script, so
# runs from different branches do not overwrite each other's T2V result.
export_to_video(output[0], f"helios_distilled_modular_t2v_output_{branch}.mp4", fps=24)
print(f"T2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
# Drop cached allocations and reset the peak counter so the next run reports
# its own peak rather than this one's.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# --- I2V ---
# Image-to-video with the distilled checkpoint: a text prompt plus a
# conditioning image fetched from the documentation-images dataset.
print("=== I2V ===")
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/wave.jpg"
)
i2v_prompt = (
    "A towering emerald wave surges forward, its crest curling with raw power and energy. "
    "Sunlight glints off the translucent water, illuminating the intricate textures and deep "
    "green hues within the wave’s body. A thick spray erupts from the breaking crest, casting "
    "a misty veil that dances above the churning surface. As the perspective widens, the immense "
    "scale of the wave becomes apparent, revealing the restless expanse of the ocean stretching beyond. "
    "The scene captures the ocean’s untamed beauty and relentless force, with every droplet and ripple "
    "shimmering in the light. The dynamic motion and vivid colors evoke both awe and respect for nature’s might."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
i2v_inputs = {
    "prompt": i2v_prompt,
    "image": image,
    "height": 384,
    "width": 640,
    "num_frames": 240,
    "pyramid_num_inference_steps_list": [2, 2, 2],
    "is_amplify_first_chunk": True,
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = mod_pipe(**i2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_distilled_modular_i2v_output_{branch}.mp4", fps=24)
print(f"I2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
# Clear the cache and the peak counter before the next measurement.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# --- V2V ---
# Video-to-video with the distilled checkpoint: a text prompt plus a
# conditioning video fetched from the documentation-images dataset.
print("=== V2V ===")
video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/car.mp4"
)
# NOTE(review): "Huracn" is probably "Huracán" with the accent dropped; kept
# verbatim so the prompt sent to the model is unchanged — confirm.
v2v_prompt = (
    "A bright yellow Lamborghini Huracn Tecnica speeds along a curving mountain road, surrounded by lush "
    "green trees under a partly cloudy sky. The car's sleek design and vibrant color stand out against "
    "the natural backdrop, emphasizing its dynamic movement. The road curves gently, with a guardrail "
    "visible on one side, adding depth to the scene. The motion blur captures the sense of speed and energy, "
    "creating a thrilling and exhilarating atmosphere. A front-facing shot from a slightly elevated "
    "angle, highlighting the car's aggressive stance and the surrounding greenery."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
v2v_inputs = {
    "prompt": v2v_prompt,
    "video": video,
    "height": 384,
    "width": 640,
    "num_frames": 240,
    "pyramid_num_inference_steps_list": [2, 2, 2],
    "is_amplify_first_chunk": True,
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = mod_pipe(**v2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_distilled_modular_v2v_output_{branch}.mp4", fps=24)
print(f"V2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")

# Done with this checkpoint: drop the pipeline reference, collect garbage,
# and clear the CUDA cache and peak counter before the next model is loaded.
del mod_pipe
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

#############################
# helios-pyramid
#############################

print("=== Helios-Pyramid ===")
# Build the modular pipeline from the checkpoint repo, load its components in
# bfloat16, and move it to the GPU.
pyramid_repo = "BestWishYsh/Helios-Mid"
mod_pipe = ModularPipeline.from_pretrained(pyramid_repo)
mod_pipe.load_components(torch_dtype=torch.bfloat16)
mod_pipe.to("cuda")

# Use CFG-Zero* as the guider, mirroring the non-modular pipeline settings
# use_cfg_zero_star=True, use_zero_init=True, zero_steps=1.
# In Helios, zero_steps=1 zeroes steps 0 and 1 (i <= 1), which corresponds to
# zero_init_steps=2 here.
guider = ClassifierFreeZeroStarGuidance(guidance_scale=5.0, zero_init_steps=2)
mod_pipe.update_components(guider=guider)

# Print the guider the pipeline now holds, and its config, as a sanity check.
print(f"Guider: {mod_pipe.guider}")
print(f"Guider config: {mod_pipe.guider.config}")

# Negative prompt shared by the three runs of this section.
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"

# --- T2V ---
# Text-to-video with the pyramid checkpoint: a text prompt and the section's
# negative prompt, with no `image` or `video` conditioning input.
print("=== T2V ===")
prompt = (
    "A vibrant tropical fish swimming gracefully among colorful coral reefs in a clear, turquoise ocean. "
    "The fish has bright blue and yellow scales with a small, distinctive orange spot on its side, its fins moving "
    "fluidly. The coral reefs are alive with a variety of marine life, including small schools of colorful fish and "
    "sea turtles gliding by. The water is crystal clear, allowing for a view of the sandy ocean floor below. The reef "
    "itself is adorned with a mix of hard and soft corals in shades of red, orange, and green. The photo captures "
    "the fish from a slightly elevated angle, emphasizing its lively movements and the vivid colors of its surroundings. "
    "A close-up shot with dynamic movement."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
t2v_inputs = {
    "prompt": prompt,
    "negative_prompt": negative_prompt,
    "height": 384,
    "width": 640,
    "num_frames": 99,
    "pyramid_num_inference_steps_list": [20, 20, 20],
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = mod_pipe(**t2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_pyramid_modular_t2v_output_after_{branch}.mp4", fps=24)
print(f"T2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
# Clear the cache and the peak counter before the next measurement.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# --- I2V ---
# Image-to-video with the pyramid checkpoint: text prompt, negative prompt,
# and a conditioning image fetched from the documentation-images dataset.
print("=== I2V ===")
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/wave.jpg"
)
i2v_prompt = (
    "A towering emerald wave surges forward, its crest curling with raw power and energy. "
    "Sunlight glints off the translucent water, illuminating the intricate textures and deep "
    "green hues within the wave’s body. A thick spray erupts from the breaking crest, casting "
    "a misty veil that dances above the churning surface. As the perspective widens, the immense "
    "scale of the wave becomes apparent, revealing the restless expanse of the ocean stretching beyond. "
    "The scene captures the ocean’s untamed beauty and relentless force, with every droplet and ripple "
    "shimmering in the light. The dynamic motion and vivid colors evoke both awe and respect for nature’s might."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
i2v_inputs = {
    "prompt": i2v_prompt,
    "negative_prompt": negative_prompt,
    "image": image,
    "height": 384,
    "width": 640,
    "num_frames": 99,
    "pyramid_num_inference_steps_list": [20, 20, 20],
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = mod_pipe(**i2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_pyramid_modular_i2v_output_after_{branch}.mp4", fps=24)
print(f"I2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
# Clear the cache and the peak counter before the next measurement.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# --- V2V ---
# Video-to-video with the pyramid checkpoint: text prompt, negative prompt,
# and a conditioning video fetched from the documentation-images dataset.
print("=== V2V ===")
video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/car.mp4"
)
# NOTE(review): "Huracn" is probably "Huracán" with the accent dropped; kept
# verbatim so the prompt sent to the model is unchanged — confirm.
v2v_prompt = (
    "A bright yellow Lamborghini Huracn Tecnica speeds along a curving mountain road, surrounded by lush "
    "green trees under a partly cloudy sky. The car's sleek design and vibrant color stand out against "
    "the natural backdrop, emphasizing its dynamic movement. The road curves gently, with a guardrail "
    "visible on one side, adding depth to the scene. The motion blur captures the sense of speed and energy, "
    "creating a thrilling and exhilarating atmosphere. A front-facing shot from a slightly elevated "
    "angle, highlighting the car's aggressive stance and the surrounding greenery."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
v2v_inputs = {
    "prompt": v2v_prompt,
    "negative_prompt": negative_prompt,
    "video": video,
    "height": 384,
    "width": 640,
    "num_frames": 99,
    "pyramid_num_inference_steps_list": [20, 20, 20],
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = mod_pipe(**v2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_pyramid_modular_v2v_output_after_{branch}.mp4", fps=24)
print(f"V2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")

# Done with this checkpoint: drop the pipeline reference, collect garbage,
# and clear the CUDA cache and peak counter before the next model is loaded.
del mod_pipe
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()


#############################
# helios
#############################
print("=== Helios ===")
# Build the modular pipeline from the base checkpoint repo, load its
# components in bfloat16, and move it to the GPU. Unlike the two sections
# above, no guider is swapped in here.
base_repo = "BestWishYsh/Helios-Base"
pipe = ModularPipeline.from_pretrained(base_repo)
pipe.load_components(torch_dtype=torch.bfloat16)
pipe.to("cuda")


# --- T2V ---
# Text-to-video with the base checkpoint: only a text prompt is passed, and
# the step count is given as a single `num_inference_steps` value.
print("=== T2V ===")
prompt = (
    "A vibrant tropical fish swimming gracefully among colorful coral reefs in a clear, turquoise ocean. "
    "The fish has bright blue and yellow scales with a small, distinctive orange spot on its side, its fins moving "
    "fluidly. The coral reefs are alive with a variety of marine life, including small schools of colorful fish and "
    "sea turtles gliding by. The water is crystal clear, allowing for a view of the sandy ocean floor below. The reef "
    "itself is adorned with a mix of hard and soft corals in shades of red, orange, and green. The photo captures "
    "the fish from a slightly elevated angle, emphasizing its lively movements and the vivid colors of its surroundings. "
    "A close-up shot with dynamic movement."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
t2v_inputs = {
    "prompt": prompt,
    "height": 384,
    "width": 640,
    "num_frames": 99,
    "num_inference_steps": 50,
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = pipe(**t2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_auto_t2v_output_{branch}.mp4", fps=24)
print(f"T2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
# Clear the cache and the peak counter before the next measurement.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# --- I2V ---
# Image-to-video with the base checkpoint: a text prompt plus a conditioning
# image fetched from the documentation-images dataset.
print("=== I2V ===")
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/wave.jpg"
)
i2v_prompt = (
    "A towering emerald wave surges forward, its crest curling with raw power and energy. "
    "Sunlight glints off the translucent water, illuminating the intricate textures and deep "
    "green hues within the wave’s body. A thick spray erupts from the breaking crest, casting "
    "a misty veil that dances above the churning surface. As the perspective widens, the immense "
    "scale of the wave becomes apparent, revealing the restless expanse of the ocean stretching beyond. "
    "The scene captures the ocean’s untamed beauty and relentless force, with every droplet and ripple "
    "shimmering in the light. The dynamic motion and vivid colors evoke both awe and respect for nature’s might."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
i2v_inputs = {
    "prompt": i2v_prompt,
    "image": image,
    "height": 384,
    "width": 640,
    "num_frames": 99,
    "num_inference_steps": 50,
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = pipe(**i2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_auto_i2v_output_{branch}.mp4", fps=24)
print(f"I2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
# Clear the cache and the peak counter before the next measurement.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# --- V2V ---
# Video-to-video with the base checkpoint: a text prompt plus a conditioning
# video fetched from the documentation-images dataset.
print("=== V2V ===")
video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/car.mp4"
)
# NOTE(review): "Huracn" is probably "Huracán" with the accent dropped; kept
# verbatim so the prompt sent to the model is unchanged — confirm.
v2v_prompt = (
    "A bright yellow Lamborghini Huracn Tecnica speeds along a curving mountain road, surrounded by lush "
    "green trees under a partly cloudy sky. The car's sleek design and vibrant color stand out against "
    "the natural backdrop, emphasizing its dynamic movement. The road curves gently, with a guardrail "
    "visible on one side, adding depth to the scene. The motion blur captures the sense of speed and energy, "
    "creating a thrilling and exhilarating atmosphere. A front-facing shot from a slightly elevated "
    "angle, highlighting the car's aggressive stance and the surrounding greenery."
)

# Fixed seeds: the global RNG first, then a dedicated CUDA generator that is
# passed to the pipeline.
torch.manual_seed(42)
v2v_inputs = {
    "prompt": v2v_prompt,
    "video": video,
    "height": 384,
    "width": 640,
    "num_frames": 99,
    "num_inference_steps": 50,
    "generator": torch.Generator("cuda").manual_seed(42),
    "output": "videos",
}
output = pipe(**v2v_inputs)

# Save the first returned video and report the peak GPU memory of this run.
export_to_video(output[0], f"helios_auto_v2v_output_{branch}.mp4", fps=24)
print(f"V2V max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")

@SHYuanBest
Copy link
Contributor

@yiyixuxu I think cfg-zero-star with higher precision is better, and I have opened a PR to fix the regular pipeline:
#13214

yiyi@huggingface.co added 2 commits March 6, 2026 09:41
@yiyixuxu yiyixuxu requested a review from sayakpaul March 6, 2026 18:41
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants