diff --git a/.github/workflows/release-dev.yml b/.github/workflows/release-dev.yml index 1114a2b74..6d0cb33e9 100644 --- a/.github/workflows/release-dev.yml +++ b/.github/workflows/release-dev.yml @@ -104,53 +104,44 @@ jobs: - arch: amd64 runner: linux-amd64-cpu8 artifact: linux-amd64 - task: python:build:linux:amd64 - output_path: target/wheels/linux-amd64/*.whl + target: x86_64-unknown-linux-gnu - arch: arm64 runner: linux-arm64-cpu8 artifact: linux-arm64 - task: python:build:linux:arm64 - output_path: target/wheels/linux-arm64/*.whl + target: aarch64-unknown-linux-gnu runs-on: ${{ matrix.runner }} timeout-minutes: 120 - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - env: - MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_IMAGE_TAG: dev steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: fetch-depth: 0 - - name: Mark workspace safe for git - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Set up mise + uses: jdx/mise-action@v2 - - name: Sync Python dependencies - run: uv sync - - - name: Cache Rust target and registry - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2 - with: - shared-key: python-wheel-linux-${{ matrix.arch }} - cache-directories: .cache/sccache - cache-targets: "true" + - name: Generate Python protobuf stubs + run: uv sync --group dev && mise run python:proto - - name: Build Python wheels + - name: Patch workspace version + if: needs.compute-versions.outputs.cargo_version != '' run: | - set -euo pipefail - OPENSHELL_CARGO_VERSION="${{ needs.compute-versions.outputs.cargo_version }}" mise run ${{ matrix.task }} - ls -la ${{ matrix.output_path }} + sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "${{ needs.compute-versions.outputs.cargo_version }}"/}' Cargo.toml + + - name: Build Python wheel 
+ uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + manylinux: 2_28 + args: --release --features bundled-z3 --compatibility manylinux_2_28 --out dist + before-script-linux: | + dnf install -y --setopt=install_weak_deps=False \ + clang llvm-devel openssl-devel perl-core perl-IPC-Cmd - name: Upload wheel artifacts uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: python-wheels-${{ matrix.artifact }} - path: ${{ matrix.output_path }} + path: dist/*.whl retention-days: 5 build-python-wheel-macos: diff --git a/.github/workflows/release-tag.yml b/.github/workflows/release-tag.yml index ae842494b..c1957d970 100644 --- a/.github/workflows/release-tag.yml +++ b/.github/workflows/release-tag.yml @@ -133,54 +133,45 @@ jobs: - arch: amd64 runner: linux-amd64-cpu8 artifact: linux-amd64 - task: python:build:linux:amd64 - output_path: target/wheels/linux-amd64/*.whl + target: x86_64-unknown-linux-gnu - arch: arm64 runner: linux-arm64-cpu8 artifact: linux-arm64 - task: python:build:linux:arm64 - output_path: target/wheels/linux-arm64/*.whl + target: aarch64-unknown-linux-gnu runs-on: ${{ matrix.runner }} timeout-minutes: 120 - container: - image: ghcr.io/nvidia/openshell/ci:latest - credentials: - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - options: --privileged - env: - MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - OPENSHELL_IMAGE_TAG: ${{ needs.compute-versions.outputs.semver }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: ref: ${{ inputs.tag || github.ref }} fetch-depth: 0 - - name: Mark workspace safe for git - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + - name: Set up mise + uses: jdx/mise-action@v2 - - name: Sync Python dependencies - run: uv sync - - - name: Cache Rust target and registry - uses: Swatinem/rust-cache@c19371144df3bb44fab255c43d04cbc2ab54d1c4 # v2 - with: - shared-key: python-wheel-linux-${{ matrix.arch }} - 
cache-directories: .cache/sccache - cache-targets: "true" + - name: Generate Python protobuf stubs + run: uv sync --group dev && mise run python:proto - - name: Build Python wheels + - name: Patch workspace version + if: needs.compute-versions.outputs.cargo_version != '' run: | - set -euo pipefail - OPENSHELL_CARGO_VERSION="${{ needs.compute-versions.outputs.cargo_version }}" mise run ${{ matrix.task }} - ls -la ${{ matrix.output_path }} + sed -i -E '/^\[workspace\.package\]/,/^\[/{s/^version[[:space:]]*=[[:space:]]*".*"/version = "${{ needs.compute-versions.outputs.cargo_version }}"/}' Cargo.toml + + - name: Build Python wheel + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + manylinux: 2_28 + args: --release --features bundled-z3 --compatibility manylinux_2_28 --out dist + before-script-linux: | + dnf install -y --setopt=install_weak_deps=False \ + clang llvm-devel openssl-devel perl-core perl-IPC-Cmd - name: Upload wheel artifacts uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7 with: name: python-wheels-${{ matrix.artifact }} - path: ${{ matrix.output_path }} + path: dist/*.whl retention-days: 5 build-python-wheel-macos: diff --git a/architecture/build-containers.md b/architecture/build-containers.md new file mode 100644 index 000000000..1be1e6d6b --- /dev/null +++ b/architecture/build-containers.md @@ -0,0 +1,99 @@ +# Container Images + +OpenShell produces two container images, both published for `linux/amd64` and `linux/arm64`. + +## Gateway (`openshell/gateway`) + +The gateway runs the control plane API server. It is deployed as a StatefulSet inside the cluster container via a bundled Helm chart. 
+ +- **Docker target**: `gateway` in `deploy/docker/Dockerfile.images` +- **Registry**: `ghcr.io/nvidia/openshell/gateway:latest` +- **Pulled when**: Cluster startup (the Helm chart triggers the pull) +- **Entrypoint**: `openshell-gateway --port 8080` (gRPC + HTTP, mTLS) + +## Cluster (`openshell/cluster`) + +The cluster image is a single-container Kubernetes distribution that bundles the Helm charts, Kubernetes manifests, and the `openshell-sandbox` supervisor binary needed to bootstrap the control plane. + +- **Docker target**: `cluster` in `deploy/docker/Dockerfile.images` +- **Registry**: `ghcr.io/nvidia/openshell/cluster:latest` +- **Pulled when**: `openshell gateway start` + +The supervisor binary (`openshell-sandbox`) is built before the image build, staged under `deploy/docker/.build/prebuilt-binaries/<arch>/`, and copied into the cluster image at `/opt/openshell/bin/openshell-sandbox`. It is exposed to sandbox pods at runtime via a read-only `hostPath` volume mount — it is not baked into sandbox images. + +## Image Build Pipeline + +`deploy/docker/Dockerfile.images` no longer compiles Rust. CI calls `.github/workflows/shadow-rust-native-build.yml` through `workflow_call` to build `openshell-gateway` or `openshell-sandbox` natively on the target architecture. `.github/workflows/docker-build.yml` downloads the resulting artifact, stages it at `deploy/docker/.build/prebuilt-binaries/<arch>/`, builds the per-arch image with the local Buildx driver, and merges multi-arch pushes with `docker buildx imagetools create`. + +Local Docker builds use `tasks/scripts/stage-prebuilt-binaries.sh` through `tasks/scripts/docker-build-image.sh` before invoking Docker, so clean checkouts do not need to create the staging directory manually. + +## Standalone Gateway Binary + +OpenShell also publishes a standalone `openshell-gateway` binary as a GitHub release asset. 
+ +- **Source crate**: `crates/openshell-server` +- **Artifact name**: `openshell-gateway-<target>.tar.gz` +- **Targets**: `x86_64-unknown-linux-gnu`, `aarch64-unknown-linux-gnu`, `aarch64-apple-darwin` +- **Release workflows**: `.github/workflows/release-dev.yml`, `.github/workflows/release-tag.yml` +- **Installer**: None yet. The binary is a manual-download asset. + +Both the standalone artifact and the deployed container image use the `openshell-gateway` binary. + +## Python Wheels + +OpenShell also publishes Python wheels for `linux/amd64`, `linux/arm64`, and macOS ARM64. + +- Released Linux wheels are built per-arch using `PyO3/maturin-action` with `manylinux: 2_28`. The action pulls the PyPA `manylinux_2_28` container, installs Rust from `rust-toolchain.toml`, and runs `maturin build --features bundled-z3 --compatibility manylinux_2_28`. The resulting wheels install on any Linux with glibc >= 2.28 (RHEL 8+, Ubuntu 18.10+, Debian 10+). This follows the same pattern used by ruff and uv. +- For fast local iteration, `build:python:wheel:linux:{amd64,arm64}` build natively on the host (wheels tagged for the host glibc, not portable). +- The macOS ARM64 wheel is cross-compiled with `deploy/docker/Dockerfile.python-wheels-macos` via `build:python:wheel:macos`. +- Release workflows mirror the CLI layout: a Linux matrix job for amd64/arm64, a separate macOS job, and release jobs that download the per-platform wheel artifacts directly before publishing. + +## Sandbox Images + +Sandbox images are **not built in this repository**. They are maintained in the [openshell-community](https://github.com/nvidia/openshell-community) repository and pulled from `ghcr.io/nvidia/openshell-community/sandboxes/` at runtime. + +The default sandbox image is `ghcr.io/nvidia/openshell-community/sandboxes/base:latest`. To use a named community sandbox: + +```bash +openshell sandbox create --from <name> +``` + +This pulls `ghcr.io/nvidia/openshell-community/sandboxes/<name>:latest`. 
+ +## Local Development + +`mise run cluster` is the primary development command. It bootstraps a cluster if one doesn't exist, then performs incremental deploys for subsequent runs. + +The incremental deploy (`cluster-deploy-fast.sh`) fingerprints local Git changes and only rebuilds components whose files have changed: + +| Changed files | Rebuild triggered | +|---|---| +| Cargo manifests, proto definitions, prebuilt staging script | Gateway + supervisor | +| `crates/openshell-server/*`, `crates/openshell-ocsf/*`, `deploy/docker/Dockerfile.images` | Gateway | +| `crates/openshell-sandbox/*`, `crates/openshell-policy/*` | Supervisor | +| `deploy/helm/openshell/*` | Helm upgrade | + +When no local changes are detected, the command is a no-op. + +**Gateway updates** are pushed to a local registry and the StatefulSet is restarted. **Supervisor updates** are copied directly into the running cluster container via `docker cp` — new sandbox pods pick up the updated binary immediately through the hostPath mount, with no image rebuild or cluster restart required. + +Fingerprints are stored in `.cache/cluster-deploy-fast.state`. 
You can also target specific components explicitly: + +```bash +mise run cluster -- gateway # rebuild gateway only +mise run cluster -- supervisor # rebuild supervisor only +mise run cluster -- chart # helm upgrade only +mise run cluster -- all # rebuild everything +``` + +To validate incremental routing and BuildKit cache reuse locally, run: + +```bash +mise run cluster:test:fast-deploy-cache +``` + +The harness runs isolated scenarios in temporary git worktrees, keeps its own state and cache under `.cache/cluster-deploy-fast-test/`, and writes a Markdown summary with: + +- auto-detection checks for gateway-only, supervisor-only, shared, Helm-only, unrelated, and explicit-target changes +- cold vs warm rebuild comparisons for gateway and supervisor code changes +- container-ID invalidation coverage to verify gateway + Helm are retriggered when the cluster container changes diff --git a/mise.lock b/mise.lock index 40050e701..6f1d1fda0 100644 --- a/mise.lock +++ b/mise.lock @@ -307,6 +307,7 @@ url = "https://ziglang.org/download/0.14.1/zig-aarch64-linux-0.14.1.tar.xz" [tools.zig."platforms.linux-x64"] checksum = "sha256:24aeeec8af16c381934a6cd7d95c807a8cb2cf7df9fa40d359aa884195c4716c" url = "https://ziglang.org/download/0.14.1/zig-x86_64-linux-0.14.1.tar.xz" +provenance = "minisign" [tools.zig."platforms.macos-arm64"] checksum = "sha256:39f3dc5e79c22088ce878edc821dedb4ca5a1cd9f5ef915e9b3cc3053e8faefa" diff --git a/tasks/python.toml b/tasks/python.toml index b95d96671..7daa5e1f8 100644 --- a/tasks/python.toml +++ b/tasks/python.toml @@ -83,22 +83,25 @@ ls -la "$WHEEL_OUTPUT_DIR"/*.whl hide = true ["build:python:wheel:linux:amd64"] -description = "Build Python wheel for Linux amd64 natively" +description = "Build Python wheel for Linux amd64 natively (host glibc; non-portable)" depends = ["EXPECTED_HOST_ARCH=amd64 WHEEL_OUTPUT_DIR=target/wheels/linux-amd64 build:python:wheel:linux"] hide = true -["python:build:linux:amd64"] -description = "Alias for 
build:python:wheel:linux:amd64" -depends = ["build:python:wheel:linux:amd64"] -hide = true - ["build:python:wheel:linux:arm64"] -description = "Build Python wheel for Linux arm64 natively" +description = "Build Python wheel for Linux arm64 natively (host glibc; non-portable)" depends = ["EXPECTED_HOST_ARCH=arm64 WHEEL_OUTPUT_DIR=target/wheels/linux-arm64 build:python:wheel:linux"] hide = true +# Legacy release-pipeline aliases. CI uses PyO3/maturin-action directly (see +# .github/workflows/release-*.yml); these aliases remain for local iteration +# and produce wheels tagged for the host glibc only. +["python:build:linux:amd64"] +description = "Build Python wheel for Linux amd64 (local dev; CI uses maturin-action)" +depends = ["build:python:wheel:linux:amd64"] +hide = true + ["python:build:linux:arm64"] -description = "Alias for build:python:wheel:linux:arm64" +description = "Build Python wheel for Linux arm64 (local dev; CI uses maturin-action)" depends = ["build:python:wheel:linux:arm64"] hide = true