Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
279 commits
Select commit Hold shift + click to select a range
7736369
fix evaluate
jp-agenta Nov 21, 2025
53e3f3d
cell improvements
ardaerzin Nov 21, 2025
eed2e18
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Nov 21, 2025
c61ae7f
fix scenario metrics fetch
ardaerzin Nov 21, 2025
4fb5cb7
style improvements
ardaerzin Nov 22, 2025
f94cc8f
scenario metrics
ardaerzin Nov 22, 2025
52b1f1a
Add extra flags to run
jp-agenta Nov 23, 2025
c3811dd
fix existing evaluation runs
jp-agenta Nov 23, 2025
5a29423
default meta to undefined
jp-agenta Nov 23, 2025
40ec82b
fix downgrades
jp-agenta Nov 23, 2025
fefcb55
clean up changes
jp-agenta Nov 23, 2025
ece36e9
improved popover
ardaerzin Nov 23, 2025
e2f10b8
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Nov 23, 2025
d396f5f
update and adjust filters
ardaerzin Nov 23, 2025
7eaf70d
fixes focus drawer opening
ardaerzin Nov 24, 2025
ea5d1f0
single scenario view and layout improvements
ardaerzin Nov 25, 2025
7dfd1d9
remove testsets change
ardaerzin Nov 25, 2025
0fcb502
fix evaluator filtering
ardaerzin Nov 25, 2025
b2fe2d8
remove `_include_run` util and its usage
ardaerzin Nov 25, 2025
b67b90a
fix unpack
ardaerzin Nov 25, 2025
7190407
api cleanup continued
ardaerzin Nov 25, 2025
3eb8018
[FE] improve evaluation runs filtering
ardaerzin Nov 25, 2025
f1a4cf1
remove extra in-memory filtering for metrics
ardaerzin Nov 25, 2025
d66a5a0
testset cleanup
ardaerzin Nov 26, 2025
e66b761
type cleanup
ardaerzin Nov 26, 2025
1e369ad
cleanup
ardaerzin Nov 26, 2025
dccafdb
annotate panels
ardaerzin Nov 26, 2025
1a25a8c
remove dummy import
jp-agenta Nov 26, 2025
2254e5f
revert testset changes
jp-agenta Nov 26, 2025
c4bcf29
unify scenario info and scenario navigation in focus tab
ardaerzin Nov 26, 2025
2b386a0
adds missing `GenerationResultUtils` to focus tab
ardaerzin Nov 26, 2025
02ea2f2
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Nov 26, 2025
07768ab
ui improvements / fixes
ardaerzin Nov 26, 2025
f1562ad
cell improvements
ardaerzin Nov 27, 2025
e684a6d
fix visibility popover
ardaerzin Nov 27, 2025
93f256b
improve column visibility popover
ardaerzin Nov 27, 2025
4efd297
new generation result util component
ardaerzin Nov 27, 2025
5949994
replace legacy pages
ardaerzin Nov 27, 2025
b75261c
Merge branch 'main' into frontend-feat/new-evaluations-pages
ardaerzin Nov 27, 2025
4805c5c
improve single scenario view layout
ardaerzin Nov 27, 2025
e50377f
fix duration display for SharedGenerationResultUtils
ardaerzin Nov 27, 2025
0899212
disable status display in evaluation results pages' SharedGenerationR…
ardaerzin Nov 27, 2025
1980d31
output cell layout improvement
ardaerzin Nov 27, 2025
f332f6b
monofont for output cells
ardaerzin Nov 27, 2025
4194bd4
font-size adjustments
ardaerzin Nov 27, 2025
93d0e9e
remove scenario hydration hook & atoms
ardaerzin Nov 27, 2025
6b7804a
cleanup
ardaerzin Nov 27, 2025
c2ec43d
cleanup
ardaerzin Nov 27, 2025
b2537b7
cleaning up legacy components
ardaerzin Nov 28, 2025
b1118c2
relocate renderChatMessages helper
ardaerzin Nov 28, 2025
e8f7c01
fix import path casing
ardaerzin Nov 28, 2025
e5aff71
cleanup
ardaerzin Nov 28, 2025
c779faf
cleanup continued
ardaerzin Nov 28, 2025
6ca8deb
cleanup continued
ardaerzin Nov 28, 2025
108775b
fix chat message rendering
ardaerzin Nov 28, 2025
3d084cb
prevent row jumpiness
ardaerzin Nov 28, 2025
0e405eb
performance improvements
ardaerzin Nov 28, 2025
c44bdaf
Merge branch 'main' into frontend-feat/new-evaluations-pages
ardaerzin Nov 28, 2025
8e2a144
fix incorrect reference query
ardaerzin Nov 28, 2025
5abe0f2
Merge branch 'main' into frontend-feat/new-evaluations-pages
ardaerzin Nov 28, 2025
4224b1c
refresh fix
ardaerzin Nov 28, 2025
5882a0f
add drop metrics migration
jp-agenta Nov 28, 2025
7373b89
fixinf refresh
jp-agenta Nov 28, 2025
22b24a7
Merge branch 'fix/clean-up-metrics-in-infinite-scrolling' into fronte…
jp-agenta Nov 28, 2025
0bf057b
fixing metrics ?
jp-agenta Nov 28, 2025
abcd1c7
metric display fixes
ardaerzin Nov 28, 2025
240a959
fixes AGE-3382
ardaerzin Nov 28, 2025
40de97a
Merge branch 'main' into frontend-feat/new-evaluations-pages
jp-agenta Nov 28, 2025
99ad0ad
fix migration tree
jp-agenta Nov 28, 2025
c696677
online eval scenario render improvements
ardaerzin Nov 28, 2025
810e673
improve invocation output display
ardaerzin Nov 28, 2025
4c0f7fc
trace drawer improvements
ardaerzin Nov 28, 2025
27c93b5
layout improvements
ardaerzin Nov 28, 2025
c3c5c33
fixing traces for SingleScenarioViewerPOC
ardaerzin Nov 28, 2025
932a0e6
improve SingleScenarioViewerPOC structure and annotations
ardaerzin Nov 28, 2025
fc9310e
fix array type metric value display
ardaerzin Nov 28, 2025
8d9be3e
fixes action button for human annotation scenarios table
ardaerzin Nov 29, 2025
c9406a5
free string metric display
ardaerzin Nov 29, 2025
044834f
improve metric displays
ardaerzin Nov 29, 2025
14e2f94
fix human evaluator metrics refresh
jp-agenta Nov 29, 2025
44b7b52
removing log.debug()
jp-agenta Nov 29, 2025
bd2f405
fix find_head.py
jp-agenta Nov 29, 2025
3c3e753
human eval annotation
ardaerzin Nov 29, 2025
733fe1e
improve spider metric chart invalid state
ardaerzin Nov 29, 2025
20be910
human evaluation invocation and several other fixes
ardaerzin Nov 29, 2025
67b3bd5
row height change
ardaerzin Nov 29, 2025
d564e72
auto evaluation tab rename
ardaerzin Nov 29, 2025
b675e30
cleanup
ardaerzin Nov 29, 2025
c083ed2
improve new evaluation modal evaluator selection step
ardaerzin Nov 29, 2025
ba33713
fix columns not updating issue after tab change
ardaerzin Nov 29, 2025
6fc62c2
improve evaluation kind label copy
ardaerzin Nov 29, 2025
c8ea0e5
shorten tab labels
ardaerzin Nov 29, 2025
e948f0e
improve vertical scroll
ardaerzin Nov 29, 2025
e4e968f
code split
ardaerzin Nov 30, 2025
34d545a
disable row cell click to copy run id action
ardaerzin Nov 30, 2025
06c05b1
fix message imports
ardaerzin Nov 30, 2025
795cc8c
cleanup
ardaerzin Nov 30, 2025
fa788c7
fix message usage
ardaerzin Nov 30, 2025
28bcdf2
more space for the scenarios table
ardaerzin Nov 30, 2025
1e97584
fix loading state issue
ardaerzin Nov 30, 2025
e7f06f3
perf
ardaerzin Nov 30, 2025
7397750
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Nov 30, 2025
31a172c
fix evaluator selection
ardaerzin Dec 1, 2025
6a42c31
antd deprecated prop fixes
ardaerzin Dec 1, 2025
11df8cc
narrow screen layout improvements
ardaerzin Dec 1, 2025
ec1866c
log clean
ardaerzin Dec 1, 2025
f3b6feb
fix revalidations after creating an evaluation
ardaerzin Dec 1, 2025
4c9ff88
fix online evals
jp-agenta Dec 1, 2025
c98b946
Merge branch 'frontend-feat/new-evaluations-pages' of github.com:Agen…
jp-agenta Dec 1, 2025
37655c0
fix deprecated prop
ardaerzin Dec 1, 2025
914be98
ugh, fixed sdk evals
jp-agenta Dec 1, 2025
23e9ae4
improve human eval SingleScenarioViewerPOC
ardaerzin Dec 1, 2025
1bd46e5
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 1, 2025
f0ca559
fix invalidations
ardaerzin Dec 1, 2025
6df7b6b
display names in delete evaluations modal instead of ids
ardaerzin Dec 1, 2025
ff69a6d
fixes breadcrumbs
ardaerzin Dec 1, 2025
bdb98c9
fix llm-as-a-judge ?
jp-agenta Dec 1, 2025
c94c73e
Merge branch 'frontend-feat/new-evaluations-pages' of github.com:Agen…
jp-agenta Dec 1, 2025
9f1873a
fix evaluators typo
jp-agenta Dec 1, 2025
ecf0e21
ok fix basic/advanced mode
jp-agenta Dec 1, 2025
49c92cd
online metrics
ardaerzin Dec 1, 2025
35c3053
filters improvements
ardaerzin Dec 1, 2025
080097f
testset matching fixes
ardaerzin Dec 1, 2025
1dc10b2
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 1, 2025
b32b174
online eval revalidation
ardaerzin Dec 1, 2025
b3adaf5
optimize for evaluator fetches
ardaerzin Dec 1, 2025
7529573
cleanup
ardaerzin Dec 1, 2025
086a673
fix hallucinations typo
jp-agenta Dec 1, 2025
a1763de
Merge branch 'frontend-feat/new-evaluations-pages' of github.com:Agen…
jp-agenta Dec 1, 2025
96d5851
prevent extra fetching
ardaerzin Dec 1, 2025
bf76d92
fix creation error after NewEvaluationModalInner
ardaerzin Dec 1, 2025
1a0e0bc
metric cell fix and improvements
ardaerzin Dec 1, 2025
a6ed302
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 1, 2025
bc9f743
fixing stuck loading states
ardaerzin Dec 1, 2025
dbe7101
cleanup
ardaerzin Dec 1, 2025
fad10f6
fix not-available for evaluator metric cells
ardaerzin Dec 1, 2025
24dd218
loading state improvements
ardaerzin Dec 1, 2025
35bcf44
small / med / large row height controls
ardaerzin Dec 1, 2025
ee3dd89
cleanup
ardaerzin Dec 1, 2025
8102b64
Merge branch 'main' into frontend-feat/new-evaluations-pages
ardaerzin Dec 1, 2025
ac19476
index cell alignment
ardaerzin Dec 1, 2025
8ff38ca
improve SingleScenarioViewerPOC run overlay
ardaerzin Dec 1, 2025
5b893d6
prevent triggering refresh for scenarios without successful invocatio…
ardaerzin Dec 1, 2025
510f644
human eval column fix
ardaerzin Dec 1, 2025
d2a1bfd
Merge branch 'main' into frontend-feat/new-evaluations-pages
jp-agenta Dec 2, 2025
e2c5e65
improve annotate panels
ardaerzin Dec 2, 2025
e577c09
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 2, 2025
1427042
improve focus drawer metric labels
ardaerzin Dec 2, 2025
86e6944
string metric value display
ardaerzin Dec 2, 2025
508cb02
rename revalidation
ardaerzin Dec 2, 2025
8db7435
metric polishes
ardaerzin Dec 2, 2025
ba88ea3
improved colors for compare
ardaerzin Dec 2, 2025
398daca
app scoped evaluations page fix
ardaerzin Dec 2, 2025
782226c
app level evals page layout improvement
ardaerzin Dec 2, 2025
9ce8726
evaluator dimension for comparable evaluations criteria
ardaerzin Dec 2, 2025
f1c5cc0
fix QA feedback item #2
ardaerzin Dec 2, 2025
63e1ed2
resize improvements & cleanup
ardaerzin Dec 2, 2025
f7373a3
no more live evaluation creation button in app scoped page
ardaerzin Dec 2, 2025
c035ebb
layout fixes
ardaerzin Dec 2, 2025
6524ef2
fix qa feedback #4
ardaerzin Dec 2, 2025
757cb27
fixes latest eval runs tables feedback #1 & #2
ardaerzin Dec 2, 2025
247156b
fix integer issue
ardaerzin Dec 2, 2025
0d735da
fix table filter content layout issue
ardaerzin Dec 2, 2025
acf924e
improve filters content
ardaerzin Dec 2, 2025
857e458
reference updates
ardaerzin Dec 3, 2025
6c92c83
polling improvements
ardaerzin Dec 3, 2025
f0f2b95
fix and improve status updates
ardaerzin Dec 3, 2025
642af5d
filter button improvements
ardaerzin Dec 3, 2025
5544dd6
QA feedback round fixes
ardaerzin Dec 3, 2025
0932b9e
qa round fixes
ardaerzin Dec 3, 2025
b1520f6
Merge branch 'main' into frontend-feat/new-evaluations-pages
ardaerzin Dec 3, 2025
c67e1ea
fix migrations ?
jp-agenta Dec 3, 2025
99f610d
improve reference labels of deleted entities
ardaerzin Dec 3, 2025
d1f099c
dummy change
jp-agenta Dec 3, 2025
ad245d7
dummy change again
jp-agenta Dec 3, 2025
0f07a7c
fixes
ardaerzin Dec 3, 2025
f3ba729
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 3, 2025
87851c8
refactor: prevent premature run-level metrics refresh during evaluati…
ardaerzin Dec 3, 2025
39226cd
Merge branch 'main' into frontend-feat/new-evaluations-pages
ardaerzin Dec 3, 2025
7f8435b
bump to solve issues
ardaerzin Dec 3, 2025
86a4b66
supertokens bump
ardaerzin Dec 3, 2025
60f9b86
improve metric refresh
ardaerzin Dec 3, 2025
eaf44ad
Merge branch 'frontend-feat/new-evaluations-pages' into frontend-chor…
ardaerzin Dec 3, 2025
eadb70c
Add debug
jp-agenta Dec 3, 2025
b6cb18e
more logs
jp-agenta Dec 3, 2025
ada9a71
remove and add logs
jp-agenta Dec 3, 2025
dbc99ea
fixes chart alignment issues
ardaerzin Dec 4, 2025
e64ea76
spider chart alignments
ardaerzin Dec 4, 2025
a416c3d
fixes trace drawer header layout
ardaerzin Dec 4, 2025
333f2a0
shouldn't run refresh while pending
ardaerzin Dec 4, 2025
b5a5a03
fix refresh condition 1.
jp-agenta Dec 4, 2025
adf91f1
Merge branch 'frontend-feat/new-evaluations-pages' of github.com:Agen…
jp-agenta Dec 4, 2025
2b5b235
fix(frontend): prevent repeated run-level metric refresh requests
ardaerzin Dec 4, 2025
dbf7e48
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 4, 2025
26f70d2
Merge branch 'frontend-feat/new-evaluations-pages' into frontend-chor…
ardaerzin Dec 4, 2025
ced7527
fix dropdown button icon
ardaerzin Dec 4, 2025
feb5725
add upsert to create metrics
jp-agenta Dec 4, 2025
8dc8a1f
fixing migrations
jp-agenta Dec 4, 2025
0581c86
try preventing unnecessary refreshes
ardaerzin Dec 4, 2025
5e0eb8a
project change fix
ardaerzin Dec 4, 2025
a4bc184
add more logs
jp-agenta Dec 4, 2025
1bc1283
fix(frontend): copy full ID value instead of truncated version in bre…
ardaerzin Dec 4, 2025
b3fa515
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 4, 2025
0457fd8
remove duplicates
jp-agenta Dec 4, 2025
5eaf792
Merge branch 'frontend-feat/new-evaluations-pages' of github.com:Agen…
jp-agenta Dec 4, 2025
81055e1
add logs
jp-agenta Dec 4, 2025
1f8cdc8
try and fix upsert
jp-agenta Dec 4, 2025
ec685d5
adding more trace/deug
jp-agenta Dec 4, 2025
9b86710
fix missing windowing in refresh_metrics
jp-agenta Dec 4, 2025
dc911dd
fix migration issue ?
jp-agenta Dec 4, 2025
e77db0a
attempt at fixing upsert?
jp-agenta Dec 4, 2025
2bfddc4
split conditions for upsert
jp-agenta Dec 4, 2025
2498b37
enable refresh
ardaerzin Dec 4, 2025
80b000a
fixing upsert ?
jp-agenta Dec 4, 2025
e7de3a0
using unique constraints instead
jp-agenta Dec 4, 2025
e858e73
back to creating unique constraints
jp-agenta Dec 4, 2025
b56a8bf
fix upsert ?
jp-agenta Dec 4, 2025
81ffea2
human eval initial view improvements
ardaerzin Dec 4, 2025
a0eaaa5
fix projectId for metric query
ardaerzin Dec 4, 2025
206b355
initial commit (WIP)
jp-agenta Dec 4, 2025
4dc7971
enable testset comparison highlighting in metadata table
ardaerzin Dec 4, 2025
0208fe0
scenario metric refresh
ardaerzin Dec 4, 2025
00055f1
fix reqs
jp-agenta Dec 4, 2025
3b19054
add logging fix migrations down revision
jp-agenta Dec 4, 2025
e94e5fb
initial commit (WIP)
jp-agenta Dec 4, 2025
70ebcb1
fix reqs
jp-agenta Dec 4, 2025
72b075a
add logging fix migrations down revision
jp-agenta Dec 4, 2025
069a1f6
Merge branch 'release/v0.66.0' into frontend-feat/new-evaluations-pages
jp-agenta Dec 4, 2025
b9810c3
fix daytona for now ?
jp-agenta Dec 4, 2025
5c85f22
Merge branch 'frontend-feat/new-evaluations-pages' into poc/custom-co…
jp-agenta Dec 4, 2025
10dc7e3
apply ruff
jp-agenta Dec 4, 2025
c131a68
remove logs
jp-agenta Dec 4, 2025
779a4f2
fix llm as a judge regression
jp-agenta Dec 4, 2025
737e88b
fix regression for real
jp-agenta Dec 4, 2025
8169a36
fix migrations
jp-agenta Dec 4, 2025
de8f377
fix & improve human annotatiion
ardaerzin Dec 4, 2025
d4394ad
adjust invocation cell heights
ardaerzin Dec 4, 2025
4d06834
human eval scenario metric cell popover fix
ardaerzin Dec 4, 2025
5f57973
improve string metric popover display
ardaerzin Dec 4, 2025
b8f5928
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 4, 2025
851eaae
Merge branch 'frontend-feat/new-evaluations-pages' into poc/custom-co…
jp-agenta Dec 4, 2025
f25ea4a
fix metric display: use invocation step for analytics, aggregate hist…
ardaerzin Dec 4, 2025
4722746
remove commented code, improve error handling, add input validation, …
ardaerzin Dec 4, 2025
d28db9d
fix daytona
jp-agenta Dec 4, 2025
996051c
Merge branch 'poc/custom-code-daytona-sandbox' into frontend-feat/new…
jp-agenta Dec 4, 2025
8bffcb0
remove humanizeEvaluatorName transformation logic and return label as-is
ardaerzin Dec 5, 2025
87692e8
Merge branch 'frontend-feat/new-evaluations-pages' of https://github.…
ardaerzin Dec 5, 2025
6a5c264
Merge branch 'frontend-feat/new-evaluations-pages' into frontend-chor…
ardaerzin Dec 5, 2025
8ab1f6d
cr fixes
ardaerzin Dec 5, 2025
2a6323e
Merge branch 'frontend-feat/new-evaluations-pages' into frontend-chor…
ardaerzin Dec 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
"""Populate runs flags

Revision ID: 652f6113b5f5
Revises: 395af3695bca
Create Date: 2025-11-23 12:00:00
"""

from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
import json

# Revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` names the migration
# it is applied on top of. No branch labels or extra dependencies are declared.
revision: str = "652f6113b5f5"
down_revision: Union[str, None] = "395af3695bca"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def _as_dict(value):
if isinstance(value, dict):
return value
if isinstance(value, str):
try:
return json.loads(value)
except Exception:
return None
return None


def _compute_run_flags(data, existing_flags):
    """Build the complete flags dict for one evaluation run.

    Args:
        data: Decoded ``data`` column of the run (a dict), or None.
        existing_flags: Decoded ``flags`` column of the run (a dict), or None.

    Returns:
        A dict holding every ``is_*`` and ``has_*`` flag. ``is_*`` values are
        carried over from ``existing_flags`` when present and not None;
        ``has_*`` values are recomputed from ``data["steps"]``. Any flag that
        cannot be determined stays False.
    """
    # Start with all flags False.
    flags_out = {
        "is_live": False,
        "is_active": False,
        "is_closed": False,
        "has_queries": False,
        "has_testsets": False,
        "has_evaluators": False,
        "has_custom": False,
        "has_human": False,
        "has_auto": False,
    }

    # 1) Overlay existing is_* flags (if any) onto the base flags.
    if isinstance(existing_flags, dict):
        for key in ("is_live", "is_active", "is_closed"):
            if existing_flags.get(key) is not None:
                # Expecting booleans here, but be defensive.
                flags_out[key] = bool(existing_flags[key])

    # 2) Recompute has_* flags from data.steps.
    steps = data.get("steps", []) if isinstance(data, dict) else []
    if not isinstance(steps, list):
        return flags_out

    for step in steps:
        if not isinstance(step, dict):
            continue

        step_type = step.get("type")

        if step_type == "input":
            # Input steps: infer queries/testsets from the reference keys.
            refs = step.get("references") or {}
            if isinstance(refs, dict):
                for ref_key in refs:
                    key_str = str(ref_key).lower()
                    if "query" in key_str:
                        flags_out["has_queries"] = True
                    if "testset" in key_str:
                        flags_out["has_testsets"] = True
        elif step_type == "annotation":
            # Annotation steps: an evaluator is present, plus its origin.
            flags_out["has_evaluators"] = True
            origin = step.get("origin")
            if origin in ("custom", "human", "auto"):
                flags_out["has_" + origin] = True

    return flags_out


def upgrade() -> None:
    """Rewrite ``evaluation_runs.flags`` for every run.

    Reads each run's ``data`` and ``flags``, derives the full flag set with
    ``_compute_run_flags`` and stores it back as JSON text. All rows are
    fetched up front and updated one at a time.
    """
    conn = op.get_bind()

    rows = conn.execute(
        sa.text("SELECT id, data, flags FROM evaluation_runs")
    ).fetchall()

    # The statement is identical for every row; build it once.
    update_flags = sa.text("UPDATE evaluation_runs SET flags = :flags WHERE id = :id")

    for run_id, data_raw, flags_raw in rows:
        flags_out = _compute_run_flags(_as_dict(data_raw), _as_dict(flags_raw))
        conn.execute(
            update_flags,
            {"flags": json.dumps(flags_out), "id": run_id},
        )


def downgrade() -> None:
    """Do nothing: the flags written by ``upgrade`` are left in place."""
    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
"""Drop corrupted metrics for some runs

Revision ID: a1b2c3d4e5f6
Revises: 652f6113b5f5
Create Date: 2025-11-28 00:00:00

"""

from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa

# Revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` names the migration
# it is applied on top of. No branch labels or extra dependencies are declared.
revision: str = "a1b2c3d4e5f6"
down_revision: Union[str, None] = "652f6113b5f5"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def _delete_metrics_in_batches(conn, select_ids_sql: str, batch_size: int) -> None:
    """Delete ``evaluation_metrics`` rows chosen by a query, one batch at a time.

    Args:
        conn: Connection the statements are executed on.
        select_ids_sql: SQL that returns the metric id in its first column and
            accepts a ``:batch_size`` bind parameter limiting the row count.
        batch_size: Maximum number of rows selected and deleted per round.

    The select/delete cycle repeats until the selection comes back empty, so
    the query must stop matching rows once they have been deleted.
    """
    select_ids = sa.text(select_ids_sql)
    delete_by_ids = sa.text(
        """
        DELETE FROM evaluation_metrics
        WHERE id = ANY(:ids)
        """
    )

    while True:
        rows = conn.execute(select_ids, {"batch_size": batch_size}).fetchall()
        if not rows:
            break

        conn.execute(delete_by_ids, {"ids": [row[0] for row in rows]})


def upgrade() -> None:
    """Delete metrics rows considered corrupted, in batches of 100.

    Two passes are made over ``evaluation_metrics``:

    1. rows whose run has ``flags.has_human`` set to true;
    2. rows whose ``data`` contains at least one second-level key that does
       not start with ``attribute``.
    """
    conn = op.get_bind()
    batch_size = 100

    # ------------------------------------------------------------------
    # 1) Drop metrics for runs where flags.has_human = true (batched)
    # ------------------------------------------------------------------
    _delete_metrics_in_batches(
        conn,
        """
        SELECT em.id
        FROM evaluation_metrics AS em
        JOIN evaluation_runs AS er
        ON er.project_id = em.project_id
        AND er.id = em.run_id
        WHERE er.flags::jsonb ->> 'has_human' = 'true'
        LIMIT :batch_size
        """,
        batch_size,
    )

    # ------------------------------------------------------------------
    # 2) Drop metrics whose data has at least one 2nd-level key
    #    that does NOT start with 'attribute' (batched).
    #    Non-object values at either level are treated as empty objects.
    # ------------------------------------------------------------------
    _delete_metrics_in_batches(
        conn,
        """
        SELECT id
        FROM evaluation_metrics AS em
        WHERE EXISTS (
            SELECT 1
            FROM json_each(
                CASE
                    WHEN json_typeof(em.data) = 'object'
                    THEN em.data
                    ELSE '{}'::json
                END
            ) AS top(top_key, top_value)
            CROSS JOIN LATERAL json_each(
                CASE
                    WHEN json_typeof(top_value) = 'object'
                    THEN top_value
                    ELSE '{}'::json
                END
            ) AS second(second_key, second_value)
            WHERE second_key NOT LIKE 'attribute%'
        )
        LIMIT :batch_size
        """,
        batch_size,
    )


def downgrade() -> None:
    """Do nothing: the upgrade is data-destructive and there is nothing to restore."""
    return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""add metrics upsert constraints (partial unique indexes)

Revision ID: b2c3d4e5f6a1
Revises: c1c2c3c4c5c6
Create Date: 2025-12-04 12:00:00.000000

This migration replaces the broken unique constraint (which allows multiple
NULLs) with three partial unique indexes that enforce uniqueness for valid
NULL combinations.
"""

from typing import Sequence, Union
from alembic import op

# Revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` names the migration
# it is applied on top of (c1c2c3c4c5c6). No branch labels or extra
# dependencies are declared.
revision: str = "b2c3d4e5f6a1"
down_revision: Union[str, None] = "c1c2c3c4c5c6"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Replace the old unique constraint with three partial unique indexes."""

    # The old constraint let several rows share (run_id, NULL, NULL), because
    # PostgreSQL treats NULL != NULL in unique constraints. Drop it first.
    op.drop_constraint(
        "uq_evaluation_metrics_project_run_scenario_timestamp_interval",
        "evaluation_metrics",
        type_="unique",
    )

    # Global metrics: at most ONE row per (project_id, run_id) when both
    # scenario_id and timestamp are NULL.
    global_index_sql = """
        CREATE UNIQUE INDEX ux_evaluation_metrics_global
        ON evaluation_metrics (project_id, run_id)
        WHERE scenario_id IS NULL AND timestamp IS NULL
    """

    # Variational metrics: at most ONE row per (project_id, run_id,
    # scenario_id) when timestamp is NULL.
    variational_index_sql = """
        CREATE UNIQUE INDEX ux_evaluation_metrics_variational
        ON evaluation_metrics (project_id, run_id, scenario_id)
        WHERE timestamp IS NULL AND scenario_id IS NOT NULL
    """

    # Temporal metrics: at most ONE row per (project_id, run_id, timestamp)
    # when scenario_id is NULL.
    temporal_index_sql = """
        CREATE UNIQUE INDEX ux_evaluation_metrics_temporal
        ON evaluation_metrics (project_id, run_id, timestamp)
        WHERE scenario_id IS NULL AND timestamp IS NOT NULL
    """

    # NOTE(review): rows with both scenario_id and timestamp set match none of
    # the three predicates — confirm that combination is not expected.
    for statement in (global_index_sql, variational_index_sql, temporal_index_sql):
        op.execute(statement)


def downgrade() -> None:
    """Drop the partial unique indexes and restore the previous constraint."""

    # Remove the three partial unique indexes (tolerating ones already gone).
    partial_indexes = (
        "ux_evaluation_metrics_global",
        "ux_evaluation_metrics_variational",
        "ux_evaluation_metrics_temporal",
    )
    for index_name in partial_indexes:
        op.execute("DROP INDEX IF EXISTS " + index_name)

    # Recreate the old (NULL-permissive) unique constraint.
    constraint_columns = ["project_id", "run_id", "scenario_id", "timestamp", "interval"]
    op.create_unique_constraint(
        "uq_evaluation_metrics_project_run_scenario_timestamp_interval",
        "evaluation_metrics",
        constraint_columns,
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""cleanup duplicate global metrics

Revision ID: c1c2c3c4c5c6
Revises: a1b2c3d4e5f6
Create Date: 2025-12-04 12:00:00.000000

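This migration removes duplicate metrics before the unique index constraints
are applied. It covers three cases: global metrics (scenario_id IS NULL and
timestamp IS NULL, grouped by project_id, run_id), variational metrics
(timestamp IS NULL, grouped by project_id, run_id, scenario_id) and temporal
metrics (scenario_id IS NULL, grouped by project_id, run_id, timestamp).
Every row that belongs to a duplicate set is deleted; no row is kept.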
"""

from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa

# Revision identifiers, used by Alembic.
# `revision` is this migration's own id; `down_revision` names the migration
# it is applied on top of. No branch labels or extra dependencies are declared.
revision: str = "c1c2c3c4c5c6"
down_revision: Union[str, None] = "a1b2c3d4e5f6"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Delete every metrics row that belongs to a duplicate set.

    Three kinds of duplicates are handled in turn: global, variational and
    temporal metrics. For each, all rows sharing the same key are removed;
    none of them is kept.
    """
    conn = op.get_bind()

    # Global metrics: scenario_id IS NULL AND timestamp IS NULL,
    # duplicated on (project_id, run_id).
    delete_global_duplicates = """
        DELETE FROM evaluation_metrics
        WHERE (project_id, run_id) IN (
            SELECT project_id, run_id
            FROM evaluation_metrics
            WHERE scenario_id IS NULL
            AND timestamp IS NULL
            GROUP BY project_id, run_id
            HAVING COUNT(*) > 1
        )
        AND scenario_id IS NULL
        AND timestamp IS NULL
    """

    # Variational metrics: timestamp IS NULL with a scenario_id,
    # duplicated on (project_id, run_id, scenario_id).
    delete_variational_duplicates = """
        DELETE FROM evaluation_metrics
        WHERE (project_id, run_id, scenario_id) IN (
            SELECT project_id, run_id, scenario_id
            FROM evaluation_metrics
            WHERE timestamp IS NULL
            AND scenario_id IS NOT NULL
            GROUP BY project_id, run_id, scenario_id
            HAVING COUNT(*) > 1
        )
        AND timestamp IS NULL
        AND scenario_id IS NOT NULL
    """

    # Temporal metrics: scenario_id IS NULL with a timestamp,
    # duplicated on (project_id, run_id, timestamp).
    delete_temporal_duplicates = """
        DELETE FROM evaluation_metrics
        WHERE (project_id, run_id, timestamp) IN (
            SELECT project_id, run_id, timestamp
            FROM evaluation_metrics
            WHERE scenario_id IS NULL
            AND timestamp IS NOT NULL
            GROUP BY project_id, run_id, timestamp
            HAVING COUNT(*) > 1
        )
        AND scenario_id IS NULL
        AND timestamp IS NOT NULL
    """

    for statement in (
        delete_global_duplicates,
        delete_variational_duplicates,
        delete_temporal_duplicates,
    ):
        conn.execute(sa.text(statement))


def downgrade() -> None:
    """Do nothing: the upgrade is data-destructive and there is nothing to restore."""
    return None
Loading