Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/bench-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ jobs:

echo '# Benchmarks: ${{ matrix.benchmark.name }}' > comment.md
echo '' >> comment.md
uv run --no-project scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.benchmark.name }}" \
uv run --project scripts scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.benchmark.name }}" \
>> comment.md
cat comment.md >> $GITHUB_STEP_SUMMARY

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:

sudo apt-get update && sudo apt-get install -y jq
bash scripts/commit-json.sh > new-commit.json
bash scripts/cat-s3.sh vortex-ci-benchmark-results commits.json new-commit.json
uv run --project scripts scripts/cat-s3.py vortex-ci-benchmark-results commits.json new-commit.json

bench:
timeout-minutes: 120
Expand Down Expand Up @@ -100,7 +100,7 @@ jobs:
- name: Upload Benchmark Results
shell: bash
run: |
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
uv run --project scripts scripts/cat-s3.py vortex-ci-benchmark-results data.json.gz results.json

- name: Alert incident.io
if: failure()
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ jobs:

echo '# Benchmarks: ${{ matrix.name }}' > comment.md
echo '' >> comment.md
uv run --no-project scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.name }}" \
uv run --project scripts scripts/compare-benchmark-jsons.py base.json results.json "${{ matrix.name }}" \
>> comment.md
cat comment.md >> $GITHUB_STEP_SUMMARY

Expand Down Expand Up @@ -274,7 +274,7 @@ jobs:
if: inputs.mode == 'develop'
shell: bash
run: |
bash scripts/cat-s3.sh vortex-ci-benchmark-results data.json.gz results.json
uv run --project scripts scripts/cat-s3.py vortex-ci-benchmark-results data.json.gz results.json

- name: Alert incident.io
if: failure() && inputs.mode == 'develop'
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
description = "Add your description here"
authors = [{ name = "Nicholas Gates", email = "nick@nickgates.com" }]
requires-python = ">= 3.11"
dependencies = ["bench-orchestrator", "vortex-data", "docs"]
dependencies = ["bench-orchestrator", "vortex-data", "vortex-scripts", "docs"]

[build-system]
requires = ["hatchling"]
Expand Down Expand Up @@ -32,11 +32,12 @@ managed = true
required-version = ">=0.8.0"

[tool.uv.workspace]
members = ["bench-orchestrator", "vortex-python", "docs"]
members = ["bench-orchestrator", "vortex-python", "scripts", "docs"]

[tool.uv.sources]
bench-orchestrator = { workspace = true }
vortex-data = { workspace = true }
vortex-scripts = { workspace = true }
docs = { workspace = true }

[tool.ruff]
Expand Down
174 changes: 174 additions & 0 deletions scripts/cat-s3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

"""Append JSONL benchmark results to an S3 object with duplicate-commit detection and optimistic locking."""

import gzip
import os
import subprocess
import sys
import tempfile
import time

import pandas as pd


def head_etag(bucket: str, key: str) -> str | None:
    """Return the current ETag of s3://bucket/key, or None if it can't be read.

    Shells out to the AWS CLI; a non-zero exit, empty output, or the literal
    string "null" all map to None.
    """
    cmd = [
        "aws",
        "s3api",
        "head-object",
        "--bucket",
        bucket,
        "--key",
        key,
        "--query",
        "ETag",
        "--output",
        "text",
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    if proc.returncode != 0:
        return None
    tag = proc.stdout.strip()
    # `--output text` prints "null" when the query matched nothing.
    return tag if tag and tag != "null" else None


def get_object(bucket: str, key: str, dest: str, if_match: str) -> bool:
    """Download s3://bucket/key to *dest*, conditional on the ETag.

    Uses `--if-match` so the download fails (returns False) if the object
    changed since *if_match* was observed — the optimistic-locking read side.
    """
    cmd = [
        "aws",
        "s3api",
        "get-object",
        "--bucket",
        bucket,
        "--key",
        key,
        "--if-match",
        if_match,
        dest,
    ]
    return subprocess.run(cmd).returncode == 0


def put_object(bucket: str, key: str, body: str, if_match: str) -> bool:
    """Upload the file at *body* to s3://bucket/key, conditional on the ETag.

    Uses `--if-match` so the write fails (returns False) if someone else
    updated the object first — the optimistic-locking write side.
    """
    cmd = [
        "aws",
        "s3api",
        "put-object",
        "--bucket",
        bucket,
        "--key",
        key,
        "--body",
        body,
        "--if-match",
        if_match,
    ]
    return subprocess.run(cmd).returncode == 0


def extract_commit_ids(path: str, is_gz: bool) -> set[str]:
    """Extract unique commit identifiers from a JSONL file using pandas.

    Supports both benchmark data ("commit_id" column) and commit metadata
    ("id" column).

    Args:
        path: Path to the JSONL file.
        is_gz: Whether the file at *path* is gzip-compressed.

    Returns:
        The set of unique, non-null identifiers, coerced to str to honor the
        declared return type even if a column parsed as numeric. Empty when
        the file is empty or contains neither recognized column.
    """
    try:
        df = pd.read_json(path, lines=True, compression="gzip" if is_gz else None)
    except ValueError:
        # pandas raises ValueError on an empty/whitespace-only input; a
        # freshly-created object has no commits, which is not an error.
        return set()
    ids: set[str] = set()
    for column in ("commit_id", "id"):
        if column in df.columns:
            ids.update(df[column].dropna().astype(str).unique())
    return ids


def main() -> None:
    """Append a local JSONL file to an S3 object with optimistic locking.

    Usage: cat-s3.py <bucket> <key> <local_file>

    Refuses to append if any commit id in the local file already exists in
    the remote object. Retries on ETag mismatch (concurrent writers) up to
    ``max_retries`` times, then exits non-zero.
    """
    if len(sys.argv) != 4:
        print(f"Usage: {sys.argv[0]} <bucket> <key> <local_file>", file=sys.stderr)
        sys.exit(1)

    bucket = sys.argv[1]
    key = sys.argv[2]
    local_file = sys.argv[3]
    max_retries = 100

    is_gz = key.endswith(".gz")

    with open(local_file) as f:
        new_data = f.read()
    # The local input is always plain text, even when the remote key is .gz.
    new_commit_ids = extract_commit_ids(local_file, is_gz=False)

    for attempt in range(1, max_retries + 1):
        etag = head_etag(bucket, key)
        if etag is None:
            print("Failed to retrieve ETag.", file=sys.stderr)
            sys.exit(1)

        # mkstemp (not the deprecated, race-prone mktemp) so the temp path is
        # created atomically; close the fd since the AWS CLI writes the file.
        fd, local_copy = tempfile.mkstemp()
        os.close(fd)
        try:
            if not get_object(bucket, key, local_copy, etag):
                print(
                    f"ETag mismatch during download (attempt {attempt}), retrying...",
                    file=sys.stderr,
                )
                # Back off briefly, matching the upload-conflict path.
                time.sleep(0.1)
                continue

            # Check for duplicate commits.
            existing_commit_ids = extract_commit_ids(local_copy, is_gz)
            duplicates = new_commit_ids & existing_commit_ids
            if duplicates:
                print(
                    f"ERROR: commit(s) {', '.join(sorted(duplicates))} already exist in "
                    f"s3://{bucket}/{key}. Refusing to append duplicate data.",
                    file=sys.stderr,
                )
                sys.exit(1)

            # Decompress existing data, concatenate, recompress.
            if is_gz:
                with gzip.open(local_copy, "rt") as f:
                    existing_data = f.read()
            else:
                with open(local_copy) as f:
                    existing_data = f.read()

            combined = existing_data + new_data
            fd, output_path = tempfile.mkstemp(suffix=".gz" if is_gz else "")
            os.close(fd)
            try:
                if is_gz:
                    with gzip.open(output_path, "wt") as f:
                        f.write(combined)
                else:
                    with open(output_path, "w") as f:
                        f.write(combined)

                if put_object(bucket, key, output_path, etag):
                    print("File updated and uploaded successfully.")
                    return

                print(
                    f"ETag mismatch during upload (attempt {attempt}), retrying...",
                    file=sys.stderr,
                )
                time.sleep(0.1)
            finally:
                if os.path.exists(output_path):
                    os.unlink(output_path)
        finally:
            if os.path.exists(local_copy):
                os.unlink(local_copy)

    print(f"Too many failures: {max_retries}.", file=sys.stderr)
    sys.exit(1)


# Entry point guard: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
52 changes: 0 additions & 52 deletions scripts/cat-s3.sh

This file was deleted.

9 changes: 0 additions & 9 deletions scripts/compare-benchmark-jsons.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "numpy",
# "pandas",
# "tabulate",
# ]
# ///

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

Expand Down
17 changes: 17 additions & 0 deletions scripts/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

[project]
name = "vortex-scripts"
version = "0.1.0"
description = "CI and benchmark scripts for Vortex"
requires-python = ">=3.11"
classifiers = ["Private :: Do Not Upload"]
dependencies = ["numpy", "pandas", "tabulate"]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["dummy"] # No importable package, just scripts
Loading
Loading