reverted

prishajain1 · prishajain1 · commit 5de4c304c35a · 2025-12-31T11:26:56.000+05:30
diff --git a/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py b/src/maxdiffusion/pipelines/wan/wan_pipeline_i2v_2p1.py
@@ -152,8 +152,9 @@ def prepare_latents(
         jax.debug.print("condition stats: mask_mean={mm}, latent_mean={lm}",
                         mm=jnp.mean(condition[..., 0]),
                         lm=jnp.mean(condition[..., 1:]))
+        jax.debug.print("condition latent std={std}", std=jnp.std(condition[..., 1:]))
 
-        return latents, condition, None
+        return latents, condition, first_frame_mask
 
 
   def __call__(
@@ -212,6 +213,12 @@ def __call__(
         last_image=last_image_tensor,
         num_videos_per_prompt=num_videos_per_prompt,
     )
+    if first_frame_mask is not None:
+       jax.debug.print("FIRST FRAME MASK stats: min={mn}, max={mx}, mean={mean}, shape={shape}",
+                       mn=jnp.min(first_frame_mask),
+                       mx=jnp.max(first_frame_mask),
+                       mean=jnp.mean(first_frame_mask),
+                       shape=first_frame_mask.shape)
 
     scheduler_state = self.scheduler.set_timesteps(
         self.scheduler_state, num_inference_steps=num_inference_steps, shape=latents.shape
@@ -291,25 +298,43 @@ def run_inference_2_1_i2v(
   if do_classifier_free_guidance:
     prompt_embeds = jnp.concatenate([prompt_embeds, negative_prompt_embeds], axis=0)
     image_embeds = jnp.concatenate([image_embeds, image_embeds], axis=0)
-    condition = jnp.concatenate([condition] * 2)
-    if first_frame_mask is not None:
+    if  expand_timesteps:
+        condition = jnp.concatenate([condition] * 2)
         first_frame_mask = jnp.concatenate([first_frame_mask] * 2)
+    else:
+        condition = jnp.concatenate([condition] * 2)
 
 
   def loop_body(step, vals):
     latents, scheduler_state, rng = vals
     original_dtype = latents.dtype
     rng, timestep_rng = jax.random.split(rng)
     t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+    jax.debug.print("Step {s}: timestep={t}", s=step, t=t)
 
     latents_input = latents
     if do_classifier_free_guidance:
         latents_input = jnp.concatenate([latents, latents], axis=0)
+    jax.debug.print("Step{s}: latents_input stats min={mn}, max={mx}, mean={mean}, std={std}",
+                    s=step,
+                    mn=jnp.min(latents_input),
+                    mx=jnp.max(latents_input),
+                    mean=jnp.mean(latents_input),
+                    std=jnp.std(latents_input))
 
     latent_model_input = jnp.concatenate([latents_input, condition], axis=-1)
     timestep = jnp.broadcast_to(t, latents_input.shape[0])
     latent_model_input = jnp.transpose(latent_model_input, (0, 4, 1, 2, 3))
 
+    jax.debug.print("Step {s}: latent_model_input shape: {shape}",
+                    s=step,
+                    shape=latent_model_input.shape)
+
+    channel_energy = jnp.sum(latent_model_input*latent_model_input,axis=(0,2,3,4))
+    jax.debug.print("Step {s}: channel energy first 10={ce}",
+                    s=step,
+                    ce=channel_energy[:10])
+
     prompt_embeds_input = prompt_embeds
     image_embeds_input = image_embeds
 
@@ -322,19 +347,28 @@ def loop_body(step, vals):
         encoder_hidden_states_image=image_embeds_input,
     )
     noise_pred = jnp.transpose(noise_pred, (0, 2, 3, 4, 1))
+    jax.debug.print("Step {s}: noise_pred stats min={mn}, max={mx}, mean={mean}, std={std}",
+                    s=step,
+                    mn=jnp.min(noise_pred),
+                    mx=jnp.max(noise_pred),
+                    mean=jnp.mean(noise_pred),
+                    std=jnp.std(noise_pred))
     jax.debug.print("Step {s}: latents_prev std={std}, mean={mean}",
                     s=step,
                     std=jnp.std(latents),
                     mean=jnp.mean(latents))
+    jax.debug.print("first_frame_mask shape:", first_frame_mask.shape if first_frame_mask is not None else (-1,))
+    jax.debug.print("first_frame_mask unique values:", jnp.unique(first_frame_mask))
+    jax.debug.print("condition shape:", condition.shape)
+    jax.debug.print("condition stats:", jnp.min(condition), jnp.max(condition), jnp.mean(condition))
+    if first_frame_mask is not None:
+       clean_latents = condition[..., 4:]
+       latents = first_frame_mask * clean_latents + (1 - first_frame_mask) * latents
     latents, scheduler_state = scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
     jax.debug.print("Step {s}: latents_next std={std}, mean={mean}",
                     s=step,
                     std=jnp.std(latents),
                     mean=jnp.mean(latents))
-    # Apply first frame preservation
-    if first_frame_mask is not None:
-      clean_latents = condition[..., 4:]
-      latents = first_frame_mask * clean_latents + (1 - first_frame_mask) * latents
     latents = latents.astype(original_dtype)
     return latents, scheduler_state, rng
 
@@ -346,4 +380,4 @@ def loop_body(step, vals):
                   lmean=jnp.mean(latents),
                   lstd=jnp.std(latents))
   max_logging.log("Finished fori_loop.")
-  return latents
+  return latents