Agenta-AI · ardaerzin · Dec 8, 2025 · Nov 21, 2025 · Nov 21, 2025 · Nov 21, 2025
diff --git a/api/ee/databases/postgres/migrations/core/versions/652f6113b5f5_populate_runs_flags.py b/api/ee/databases/postgres/migrations/core/versions/652f6113b5f5_populate_runs_flags.py
@@ -0,0 +1,101 @@
+"""Populate runs flags
+
+Revision ID: 652f6113b5f5
+Revises: 395af3695bca
+Create Date: 2025-11-23 12:00:00
+"""
+
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+import json
+
+# Revision identifiers, used by Alembic.
+# `down_revision` names the parent revision this migration is applied after.
+revision: str = "652f6113b5f5"
+down_revision: Union[str, None] = "395af3695bca"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _as_dict(value):
+    """Coerce a raw JSON column value into a dict.
+
+    Args:
+        value: A dict (returned unchanged), a JSON-encoded string, or
+            anything else.
+
+    Returns:
+        The dict, or ``None`` when ``value`` is neither a dict nor a string
+        that decodes to a JSON object.
+    """
+    if isinstance(value, dict):
+        return value
+    if not isinstance(value, str):
+        return None
+    try:
+        parsed = json.loads(value)
+    except ValueError:
+        # json.JSONDecodeError subclasses ValueError; malformed JSON means
+        # "no usable value" rather than a reason to abort the migration.
+        return None
+    # Valid JSON that is not an object (list, number, string, null) does not
+    # satisfy the dict contract this helper advertises.
+    return parsed if isinstance(parsed, dict) else None
+
+
+def _compute_run_flags(data, existing_flags):
+    """Build the complete flags dict for one evaluation run.
+
+    Args:
+        data: Parsed ``data`` column of the run (a dict) or ``None``.
+        existing_flags: Parsed ``flags`` column of the run (a dict) or ``None``.
+
+    Returns:
+        A dict holding all nine boolean flags. The ``is_*`` flags are carried
+        over from ``existing_flags``; the ``has_*`` flags are derived from
+        ``data["steps"]``.
+    """
+    # Every flag defaults to False so each row ends up with the full key set.
+    flags = {
+        "is_live": False,
+        "is_active": False,
+        "is_closed": False,
+        "has_queries": False,
+        "has_testsets": False,
+        "has_evaluators": False,
+        "has_custom": False,
+        "has_human": False,
+        "has_auto": False,
+    }
+
+    # 1) Overlay existing is_* flags; a missing key or None keeps the default.
+    if isinstance(existing_flags, dict):
+        for key in ("is_live", "is_active", "is_closed"):
+            if existing_flags.get(key) is not None:
+                # Expecting booleans here, but be defensive
+                flags[key] = bool(existing_flags[key])
+
+    # 2) Recompute has_* flags from data.steps (like _make_run_flags)
+    steps = data.get("steps", []) if isinstance(data, dict) else None
+    if not isinstance(steps, list):
+        return flags
+
+    for step in steps:
+        if not isinstance(step, dict):
+            continue
+
+        step_type = step.get("type")
+
+        if step_type == "input":
+            # Input steps: infer queries/testsets from reference keys
+            refs = step.get("references") or {}
+            if isinstance(refs, dict):
+                for key in refs:
+                    key_str = str(key).lower()
+                    if "query" in key_str:
+                        flags["has_queries"] = True
+                    if "testset" in key_str:
+                        flags["has_testsets"] = True
+        elif step_type == "annotation":
+            # Annotation steps: evaluators + origin
+            flags["has_evaluators"] = True
+            origin = step.get("origin")
+            if origin in ("custom", "human", "auto"):
+                flags[f"has_{origin}"] = True
+
+    return flags
+
+
+def upgrade() -> None:
+    """Rewrite ``evaluation_runs.flags`` for every run with the full flag set.
+
+    All runs are read in one query, their flags are recomputed in Python, and
+    each row is updated individually with the JSON-encoded result.
+    """
+    conn = op.get_bind()
+
+    rows = conn.execute(
+        sa.text("SELECT id, data, flags FROM evaluation_runs")
+    ).fetchall()
+
+    # The statement is loop-invariant, so it is built once and reused.
+    update_flags = sa.text("UPDATE evaluation_runs SET flags = :flags WHERE id = :id")
+
+    for run_id, data_raw, flags_raw in rows:
+        flags_out = _compute_run_flags(_as_dict(data_raw), _as_dict(flags_raw))
+        conn.execute(
+            update_flags,
+            {"flags": json.dumps(flags_out), "id": run_id},
+        )
+
+
+def downgrade() -> None:
+    """Do nothing.
+
+    ``upgrade`` overwrites ``flags`` in place without saving the previous
+    value, so there is no earlier state to restore.
+    """
+    return None
diff --git a/...es/postgres/migrations/core/versions/a1b2c3d4e5f6_drop_corrupted_metrics_for_some_runs.py b/...es/postgres/migrations/core/versions/a1b2c3d4e5f6_drop_corrupted_metrics_for_some_runs.py
@@ -0,0 +1,111 @@
+"""Drop corrupted metrics for some runs
+
+Revision ID: a1b2c3d4e5f6
+Revises: 652f6113b5f5
+Create Date: 2025-11-28 00:00:00
+
+"""
+
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+
+# Revision identifiers, used by Alembic.
+# `down_revision` names the parent revision this migration is applied after.
+revision: str = "a1b2c3d4e5f6"
+down_revision: Union[str, None] = "652f6113b5f5"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _delete_metrics_in_batches(conn, select_ids, batch_size):
+    """Select a batch of metric ids and delete them, until none are left.
+
+    Args:
+        conn: Connection obtained from ``op.get_bind()``.
+        select_ids: Text statement whose first column is the metric id and
+            which takes a ``:batch_size`` bind parameter.
+        batch_size: Maximum number of rows selected and deleted per round.
+    """
+    while True:
+        rows = conn.execute(select_ids, {"batch_size": batch_size}).fetchall()
+
+        # Deleted rows no longer match the select, so an empty batch means
+        # the work is done.
+        if not rows:
+            break
+
+        conn.execute(
+            sa.text(
+                """
+                DELETE FROM evaluation_metrics
+                WHERE id = ANY(:ids)
+                """
+            ),
+            {"ids": [row[0] for row in rows]},
+        )
+
+
+def upgrade() -> None:
+    """Delete corrupted metric rows in batches of 100.
+
+    Two passes run one after the other: metrics attached to runs flagged
+    ``has_human``, then metrics whose ``data`` contains a second-level key
+    that does not start with ``attribute``.
+    """
+    conn = op.get_bind()
+    batch_size = 100
+
+    # ------------------------------------------------------------------
+    # 1) Drop metrics for runs where flags.has_human = true (batched)
+    # ------------------------------------------------------------------
+    _delete_metrics_in_batches(
+        conn,
+        sa.text(
+            """
+            SELECT em.id
+            FROM evaluation_metrics AS em
+            JOIN evaluation_runs AS er
+              ON er.project_id = em.project_id
+             AND er.id = em.run_id
+            WHERE er.flags::jsonb ->> 'has_human' = 'true'
+            LIMIT :batch_size
+            """
+        ),
+        batch_size,
+    )
+
+    # ------------------------------------------------------------------
+    # 2) Drop metrics whose data has at least one 2nd-level key
+    #    that does NOT start with 'attribute' (batched)
+    # ------------------------------------------------------------------
+    # The CASE guards substitute an empty object for non-object values so
+    # json_each is only ever called on JSON objects.
+    _delete_metrics_in_batches(
+        conn,
+        sa.text(
+            """
+            SELECT id
+            FROM evaluation_metrics AS em
+            WHERE EXISTS (
+                SELECT 1
+                FROM json_each(
+                         CASE
+                             WHEN json_typeof(em.data) = 'object'
+                             THEN em.data
+                             ELSE '{}'::json
+                         END
+                     ) AS top(top_key, top_value)
+                CROSS JOIN LATERAL json_each(
+                         CASE
+                             WHEN json_typeof(top_value) = 'object'
+                             THEN top_value
+                             ELSE '{}'::json
+                         END
+                     ) AS second(second_key, second_value)
+                WHERE second_key NOT LIKE 'attribute%'
+            )
+            LIMIT :batch_size
+            """
+        ),
+        batch_size,
+    )
+
+
+def downgrade() -> None:
+    """Do nothing; the metric rows removed by ``upgrade`` are not restored."""
+    # Data-destructive; nothing to restore
+    return None
diff --git a/...atabases/postgres/migrations/core/versions/b2c3d4e5f6a1_add_metrics_upsert_constraints.py b/...atabases/postgres/migrations/core/versions/b2c3d4e5f6a1_add_metrics_upsert_constraints.py
@@ -0,0 +1,75 @@
+"""add metrics upsert constraints (partial unique indexes)
+
+Revision ID: b2c3d4e5f6a1
+Revises: c1c2c3c4c5c6
+Create Date: 2025-12-04 12:00:00.000000
+
+This migration replaces the broken unique constraint (which allows multiple
+NULLs) with three partial unique indexes that enforce uniqueness for valid
+NULL combinations.
+"""
+
+from typing import Sequence, Union
+from alembic import op
+
+# Revision identifiers, used by Alembic.
+# `down_revision` names the parent revision this migration is applied after.
+revision: str = "b2c3d4e5f6a1"
+down_revision: Union[str, None] = "c1c2c3c4c5c6"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Replace the unique constraint with three partial unique indexes."""
+
+    # PostgreSQL treats NULLs as distinct inside a unique constraint, so the
+    # old constraint accepted several rows sharing (run_id, NULL, NULL).
+    op.drop_constraint(
+        "uq_evaluation_metrics_project_run_scenario_timestamp_interval",
+        "evaluation_metrics",
+        type_="unique",
+    )
+
+    # Global metrics: at most one row per (project_id, run_id) when both
+    # scenario_id and timestamp are NULL.
+    global_index_ddl = """
+        CREATE UNIQUE INDEX ux_evaluation_metrics_global
+        ON evaluation_metrics (project_id, run_id)
+        WHERE scenario_id IS NULL AND timestamp IS NULL
+    """
+
+    # Variational metrics: at most one row per
+    # (project_id, run_id, scenario_id) when timestamp is NULL.
+    variational_index_ddl = """
+        CREATE UNIQUE INDEX ux_evaluation_metrics_variational
+        ON evaluation_metrics (project_id, run_id, scenario_id)
+        WHERE timestamp IS NULL AND scenario_id IS NOT NULL
+    """
+
+    # Temporal metrics: at most one row per
+    # (project_id, run_id, timestamp) when scenario_id is NULL.
+    temporal_index_ddl = """
+        CREATE UNIQUE INDEX ux_evaluation_metrics_temporal
+        ON evaluation_metrics (project_id, run_id, timestamp)
+        WHERE scenario_id IS NULL AND timestamp IS NOT NULL
+    """
+
+    # Created in the same order as listed above.
+    for ddl in (global_index_ddl, variational_index_ddl, temporal_index_ddl):
+        op.execute(ddl)
+
+
+def downgrade() -> None:
+    """Drop the partial unique indexes and recreate the old unique constraint."""
+
+    # IF EXISTS keeps the rollback usable even when an index is already gone.
+    for index_name in (
+        "ux_evaluation_metrics_global",
+        "ux_evaluation_metrics_variational",
+        "ux_evaluation_metrics_temporal",
+    ):
+        op.execute(f"DROP INDEX IF EXISTS {index_name}")
+
+    # Recreated under the same name that upgrade() dropped.
+    constrained_columns = ["project_id", "run_id", "scenario_id", "timestamp", "interval"]
+    op.create_unique_constraint(
+        "uq_evaluation_metrics_project_run_scenario_timestamp_interval",
+        "evaluation_metrics",
+        constrained_columns,
+    )
diff --git a/...abases/postgres/migrations/core/versions/c1c2c3c4c5c6_cleanup_duplicate_global_metrics.py b/...abases/postgres/migrations/core/versions/c1c2c3c4c5c6_cleanup_duplicate_global_metrics.py
@@ -0,0 +1,94 @@
+"""cleanup duplicate metrics (global, variational, temporal)
+
+Revision ID: c1c2c3c4c5c6
+Revises: a1b2c3d4e5f6
+Create Date: 2025-12-04 12:00:00.000000
+
+This migration removes duplicated metrics before the partial unique indexes
+are created. It covers global rows (scenario_id and timestamp both NULL),
+variational rows (timestamp NULL, scenario_id set) and temporal rows
+(scenario_id NULL, timestamp set). Every row of a duplicated group is
+deleted; no row of the group is kept.
+"""
+
+from typing import Sequence, Union
+from alembic import op
+import sqlalchemy as sa
+
+# Revision identifiers, used by Alembic.
+# `down_revision` names the parent revision this migration is applied after.
+revision: str = "c1c2c3c4c5c6"
+down_revision: Union[str, None] = "a1b2c3d4e5f6"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Delete every metric row that belongs to a duplicated key group.
+
+    Three DELETE statements run in order, one per metric kind. Each finds the
+    key groups holding more than one row and deletes all rows of those groups.
+    """
+    conn = op.get_bind()
+
+    cleanup_statements = (
+        # Global metrics: grouped by (project_id, run_id), restricted to rows
+        # where scenario_id IS NULL AND timestamp IS NULL.
+        sa.text(
+            """
+            DELETE FROM evaluation_metrics
+            WHERE (project_id, run_id) IN (
+                SELECT project_id, run_id
+                FROM evaluation_metrics
+                WHERE scenario_id IS NULL
+                  AND timestamp IS NULL
+                GROUP BY project_id, run_id
+                HAVING COUNT(*) > 1
+            )
+              AND scenario_id IS NULL
+              AND timestamp IS NULL
+            """
+        ),
+        # Variational metrics: grouped by (project_id, run_id, scenario_id),
+        # restricted to rows where timestamp IS NULL and scenario_id is set.
+        sa.text(
+            """
+            DELETE FROM evaluation_metrics
+            WHERE (project_id, run_id, scenario_id) IN (
+                SELECT project_id, run_id, scenario_id
+                FROM evaluation_metrics
+                WHERE timestamp IS NULL
+                  AND scenario_id IS NOT NULL
+                GROUP BY project_id, run_id, scenario_id
+                HAVING COUNT(*) > 1
+            )
+              AND timestamp IS NULL
+              AND scenario_id IS NOT NULL
+            """
+        ),
+        # Temporal metrics: grouped by (project_id, run_id, timestamp),
+        # restricted to rows where scenario_id IS NULL and timestamp is set.
+        sa.text(
+            """
+            DELETE FROM evaluation_metrics
+            WHERE (project_id, run_id, timestamp) IN (
+                SELECT project_id, run_id, timestamp
+                FROM evaluation_metrics
+                WHERE scenario_id IS NULL
+                  AND timestamp IS NOT NULL
+                GROUP BY project_id, run_id, timestamp
+                HAVING COUNT(*) > 1
+            )
+              AND scenario_id IS NULL
+              AND timestamp IS NOT NULL
+            """
+        ),
+    )
+
+    for statement in cleanup_statements:
+        conn.execute(statement)
+
+
+def downgrade() -> None:
+    """Do nothing; the rows deleted by ``upgrade`` are not restored."""
+    # Data-destructive; nothing to restore
+    return None