diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..9bc45beeff --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,10 @@ +## What changes are proposed in this pull request? + + + +## How was this change tested? diff --git a/.github/actions/install-and-cache/action.yml b/.github/actions/install-and-cache/action.yml new file mode 100644 index 0000000000..8da88c8a41 --- /dev/null +++ b/.github/actions/install-and-cache/action.yml @@ -0,0 +1,101 @@ +# This is copied from https://github.com/tecolicom/actions-install-and-cache +# which is Copyright 2022 Office TECOLI, LLC + +name: install-and-cache generic backend +description: 'GitHub Action to run installer and cache the result' +branding: + color: orange + icon: type + +inputs: + run: { required: true, type: string } + path: { required: true, type: string } + cache: { required: false, type: string, default: yes } + key: { required: false, type: string } + sudo: { required: false, type: string } + verbose: { required: false, type: string, default: false } + +outputs: + cache-hit: + value: ${{ steps.cache.outputs.cache-hit }} + +runs: + using: composite + steps: + + - id: setup + shell: bash + run: | + : setup install-and-cache + define() { IFS='\n' read -r -d '' ${1} || true ; } + define script <<'EOS_cad8_c24e_' + ${{ inputs.run }} + EOS_cad8_c24e_ + directory="${{ inputs.path }}" + given_key="${{ inputs.key }}" + archive= key= + case "${{ inputs.cache }}" in + yes|workflow) + cache="${{ inputs.cache }}" + uname -mrs + hash=$( (uname -mrs ; cat <<< "$script" ; echo $directory) | \ + (md5sum||md5) | awk '{print $1}' ) + key="${hash}${given_key:+-$given_key}" + [ "$cache" == 'workflow' ] && \ + key+="-${{ github.run_id }}-${{ github.run_attempt }}" + archive=$HOME/archive-$hash.tz + ;; + *) + cache=no + ;; + esac + # use "--recursive-unlink" option if GNU tar is found + if tar --version | grep GNU > /dev/null + then + tar="tar --recursive-unlink" + elif gtar --version | grep GNU > /dev/null + then + tar="gtar --recursive-unlink" + else + tar=tar + fi + sed 's/^ *//' << END >> $GITHUB_OUTPUT + cache=$cache + archive=$archive + key=$key + tar=$tar + END + + - id: cache + if: steps.setup.outputs.cache != 'no' + uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ steps.setup.outputs.archive }} + key: ${{ steps.setup.outputs.key }} + + - id: extract + if: steps.setup.outputs.cache != 'no' && steps.cache.outputs.cache-hit == 'true' + shell: bash + run: | + : extract + archive="${{ steps.setup.outputs.archive }}" + verbose="${{ inputs.verbose }}" + tar="${{ steps.setup.outputs.tar }}" + ls -l $archive + if [ -s $archive ] + then + opt=-Pxz + [[ $verbose == yes || $verbose == true ]] && opt+=v + sudo $tar -C / $opt -f $archive + else + echo "$archive is empty" + fi + + - id: install-and-archive + if: steps.cache.outputs.cache-hit != 'true' + uses: tecolicom/actions-install-and-archive@9d5afb27f9900f2df47fe40de58fbd837032bddf # v1.3 + with: + run: ${{ inputs.run }} + archive: ${{ steps.setup.outputs.archive }} + path: ${{ inputs.path }} + sudo: ${{ inputs.sudo }} diff --git a/.github/actions/pr-title-validator/action.yml b/.github/actions/pr-title-validator/action.yml new file mode 100644 index 0000000000..179f44a68d --- /dev/null +++ b/.github/actions/pr-title-validator/action.yml @@ -0,0 +1,43 @@ +name: 'PR Title Validator' +description: 'Validates a pull request title against a regex pattern' + +inputs: + regex: + description: 
'Regular expression the PR title must match' + required: true + breaking-change-regex: + description: 'Regex to use instead when the breaking-change label is present' + required: false + default: '' + labels: + description: 'JSON array of label names on the PR' + required: false + default: '[]' + title: + description: 'PR title to validate. Defaults to github.event.pull_request.title.' + required: false + default: '' + +runs: + using: composite + steps: + - name: Validate PR title + shell: bash + env: + PR_TITLE: ${{ inputs.title || github.event.pull_request.title }} + INPUT_REGEX: ${{ inputs.regex }} + BREAKING_REGEX: ${{ inputs.breaking-change-regex }} + LABELS: ${{ inputs.labels }} + run: | + REGEX="$INPUT_REGEX" + if [[ -n "$BREAKING_REGEX" ]] && echo "$LABELS" | jq -e '.[] | select(. == "breaking-change")' > /dev/null 2>&1; then + REGEX="$BREAKING_REGEX" + echo "breaking-change label detected, using breaking change regex." + fi + + if [[ "$PR_TITLE" =~ $REGEX ]]; then + echo "PR title matches pattern." + exit 0 + fi + echo "::error::PR title \"$PR_TITLE\" does not match pattern: $REGEX" + exit 1 diff --git a/.github/actions/use-homebrew-tools/action.yml b/.github/actions/use-homebrew-tools/action.yml new file mode 100644 index 0000000000..bf8f65f1dd --- /dev/null +++ b/.github/actions/use-homebrew-tools/action.yml @@ -0,0 +1,47 @@ +# This is copied from https://github.com/tecolicom/actions-use-homebrew-tools/ +# which is Copyright 2022 Office TECOLI, LLC + +name: install-and-cache homebrew tools +description: 'GitHub Action to install and cache homebrew tools' +branding: + color: orange + icon: type + +inputs: + tools: { required: false, type: string } + key: { required: false, type: string } + path: { required: false, type: string } + cache: { required: false, type: string, default: yes } + verbose: { required: false, type: boolean, default: false } + +outputs: + cache-hit: + value: ${{ steps.update.outputs.cache-hit }} + +runs: + using: composite + steps: + + - id: setup + shell: bash + run: | + : setup use-homebrew-tools + given_key="${{ inputs.key }}" + brew_version="$(brew --version)" + echo "$brew_version" + version_key="$( echo "$brew_version" | (md5sum||md5) | awk '{print $1}' )" + key="${given_key:+$given_key-}${version_key}" + sed 's/^ *//' << END >> $GITHUB_OUTPUT + command=brew install + prefix=$(brew --prefix) + key=$key + END + + - id: update + uses: ./.github/actions/install-and-cache + with: + run: ${{ steps.setup.outputs.command }} ${{ inputs.tools }} + path: ${{ steps.setup.outputs.prefix }} ${{ inputs.path }} + key: ${{ steps.setup.outputs.key }} + cache: ${{ inputs.cache }} + verbose: ${{ inputs.verbose }} diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index ec2fd0d911..0000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,46 +0,0 @@ - - - - -## What changes are proposed in this pull request? - - - - - -## How was this change tested? 
- \ No newline at end of file diff --git a/.github/workflows/auto-assign-pr.yml b/.github/workflows/auto-assign-pr.yml index 30aa86703e..0ef4f28db6 100644 --- a/.github/workflows/auto-assign-pr.yml +++ b/.github/workflows/auto-assign-pr.yml @@ -12,4 +12,4 @@ jobs: assign-author: runs-on: ubuntu-latest steps: - - uses: toshimaru/auto-author-assign@v2.1.1 + - uses: toshimaru/auto-author-assign@16f0022cf3d7970c106d8d1105f75a1165edb516 # v2.1.1 diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000000..ad88b5e50e --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,82 @@ +# issue_comment is used here to trigger on PR comments, as opposed to pull_request_review +# (review submissions) or pull_request_review_comment (comments on the diff itself) +# we want to trigger this on comment creation or edit +on: + issue_comment: + types: [created, edited] +name: Benchmarking PR performance +jobs: + run-benchmark: + name: Run benchmarks + if: > + github.event.issue.pull_request && + (github.event.comment.body == '/bench' || startsWith(github.event.comment.body, '/bench ')) + runs-on: ubuntu-latest + permissions: + contents: read + outputs: + pr_number: ${{ steps.pr.outputs.pr_number }} + steps: + - name: Get PR metadata + id: pr + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + PR_NUMBER: ${{ github.event.issue.number }} + run: | + PR_DATA=$(gh api "repos/$REPO/pulls/$PR_NUMBER") + HEAD_SHA=$(echo "$PR_DATA" | jq -r .head.sha) + BASE_REF=$(echo "$PR_DATA" | jq -r .base.ref) + [[ "$HEAD_SHA" == *$'\n'* || "$BASE_REF" == *$'\n'* ]] && { echo "Unexpected newline in API response" >&2; exit 1; } + [[ "$BASE_REF" =~ ^[a-zA-Z0-9/_.-]+$ ]] || { echo "Invalid BASE_REF: $BASE_REF" >&2; exit 1; } + printf 'head_sha=%s\n' "$HEAD_SHA" >> "$GITHUB_OUTPUT" + printf 'base_ref=%s\n' "$BASE_REF" >> "$GITHUB_OUTPUT" + printf 'pr_number=%s\n' "$PR_NUMBER" >> "$GITHUB_OUTPUT" + - name: Install critcmp + # Installed before checkout so the PR's .cargo/config.toml cannot + # redirect the registry to a malicious source. The runner's + # pre-installed Rust is sufficient -- no toolchain setup needed here. + # --locked is omitted for cargo install (same exemption as cargo miri + # setup); --version pins the top-level crate. + run: cargo install critcmp --version 0.1.8 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + with: + ref: ${{ steps.pr.outputs.head_sha }} + - uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + # See build.yml top-level comment for why save-if is restricted to main. + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + - name: Run benchmarks + # The comment is posted in the post-comment job after this job completes. 
+ env: + COMMENT: ${{ github.event.comment.body }} + BASE_REF: ${{ steps.pr.outputs.base_ref }} + HEAD_SHA: ${{ steps.pr.outputs.head_sha }} + run: bash benchmarks/ci/run-benchmarks.sh + - name: Upload benchmark comment + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: bench-comment + path: /tmp/bench-comment.md + + post-comment: + name: Post benchmark results + needs: run-benchmark + runs-on: ubuntu-latest + permissions: + pull-requests: write + steps: + - name: Download benchmark comment + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + name: bench-comment + path: /tmp/ + - name: Post results as PR comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUMBER: ${{ needs.run-benchmark.outputs.pr_number }} + REPO: ${{ github.repository }} + run: gh pr comment "$PR_NUMBER" --repo "$REPO" --body-file /tmp/bench-comment.md diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cb25a4b915..dcc09b58aa 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -1,19 +1,50 @@ name: build -on: [push, pull_request] +on: [push, pull_request, merge_group] env: CARGO_TERM_COLOR: always RUST_BACKTRACE: 1 +# Supply chain security: all cargo commands that resolve dependencies use --locked to +# enforce the committed Cargo.lock. This prevents CI from silently resolving a newer +# (potentially compromised) dependency version. If Cargo.lock is out of sync with +# Cargo.toml, the build fails immediately. Any dependency change must be an explicit, +# reviewable update to Cargo.lock in the PR. Commands that skip --locked: cargo fmt +# (no dep resolution), cargo msrv verify/show (wrapper tool), cargo miri setup (tooling). +# +# Swatinem/rust-cache caches the cargo registry and target directory (~450MB per job). +# save-if restricts cache writes to main pushes only. PRs read from main's cache but +# never write their own entries. +# +# The key insight: Cargo.lock changes infrequently, so main's cache key almost always +# matches. PRs download and compile zero dependencies on cache hit. By only writing on +# main, we keep main's cache entries alive (no LRU eviction from PR churn), and every +# PR benefits from them. +# +# Without this, GHA's ref-scoped caching works against us: each PR writes ~6.3GB of +# cache entries (14 jobs x ~450MB) that only that PR can read. A handful of active PRs +# fills the 10GB cache budget, LRU evicts main's shared entries, and every subsequent +# PR compiles from scratch. +# +# The save-if condition checks both event_name == 'push' and ref == main because +# pull_request_target events set github.ref to the base branch (main), not the PR +# branch. Without the event_name check, those workflows would write cache entries on +# every PR. +# +# Note: actions-rust-lang/setup-rust-toolchain has built-in Swatinem/rust-cache that +# writes on every run with no save-if support. We disable it with cache: false and +# manage caching explicitly via the Swatinem/rust-cache steps below. 
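+#
+# Concretely, each job below pairs the toolchain action (cache: false) with an explicit cache step
+# following this pattern (a sketch of the snippet repeated per job):
+#
+#   - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
+#     with:
+#       save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}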
+ jobs: format: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install minimal stable with rustfmt - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 with: + cache: false components: rustfmt - name: format run: cargo fmt -- --check @@ -21,13 +52,17 @@ jobs: msrv: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install minimal stable and cargo msrv - uses: actions-rust-lang/setup-rust-toolchain@v1 - - name: Install cargo-msrv - shell: bash - run: | - cargo install cargo-msrv --locked + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + - uses: taiki-e/install-action@7bc99eee1f1b8902a125006cf790a1f4c8461e63 # v2.69.8 + with: + tool: cargo-msrv - name: verify-msrv run: | cargo msrv --path kernel/ verify --all-features @@ -37,38 +72,46 @@ jobs: msrv-run-tests: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install minimal stable and cargo msrv - uses: actions-rust-lang/setup-rust-toolchain@v1 - - uses: Swatinem/rust-cache@v2 - - name: Install cargo-msrv - shell: bash - run: | - cargo install cargo-msrv --locked + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + - uses: taiki-e/install-action@7bc99eee1f1b8902a125006cf790a1f4c8461e63 # v2.69.8 + with: + tool: cargo-msrv + - uses: taiki-e/install-action@98ec31d284eb962f41c14065e9391a955aa810cf # nextest - name: Get rust-version from Cargo.toml id: rust-version run: echo "RUST_VERSION=$(cargo msrv show --path kernel/ --output-format minimal)" >> $GITHUB_ENV - name: Install specified rust version - uses: actions-rs/toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 with: + cache: false toolchain: ${{ env.RUST_VERSION }} - profile: minimal - name: run tests run: | pushd kernel echo "Testing with $(cargo msrv show --output-format minimal)" - cargo +$(cargo msrv show --output-format minimal) test + cargo +$(cargo msrv show --output-format minimal) nextest run --locked docs: runs-on: ubuntu-latest env: RUSTDOCFLAGS: -D warnings steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install minimal stable - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - name: build docs - run: cargo doc --workspace --all-features - + run: cargo doc --locked --workspace --all-features --no-deps # When we run cargo { build, clippy } --no-default-features, we want to build/lint the kernel to # ensure that we can build 
the kernel without any features enabled. Unfortunately, due to how @@ -100,21 +143,29 @@ jobs: - ubuntu-latest - windows-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install minimal stable with clippy - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 with: + cache: false components: clippy + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - name: build and lint with clippy - run: cargo clippy --benches --tests --all-features -- -D warnings + run: cargo clippy --locked --benches --tests --all-features -- -D warnings - name: lint without default features - packages which depend on kernel with features enabled - run: cargo clippy --workspace --no-default-features --exclude delta_kernel --exclude delta_kernel_ffi --exclude delta_kernel_derive --exclude delta_kernel_ffi_macros -- -D warnings + run: cargo clippy --locked --workspace --no-default-features --exclude delta_kernel --exclude delta_kernel_ffi --exclude delta_kernel_derive --exclude delta_kernel_ffi_macros -- -D warnings - name: lint without default features - packages which don't depend on kernel with features enabled - run: cargo clippy --no-default-features --package delta_kernel --package delta_kernel_ffi --package delta_kernel_derive --package delta_kernel_ffi_macros -- -D warnings + run: cargo clippy --locked --no-default-features --package delta_kernel --package delta_kernel_ffi --package delta_kernel_derive --package delta_kernel_ffi_macros -- -D warnings - name: check kernel builds with default-engine-native-tls - run: cargo build -p feature_tests --features default-engine-native-tls + run: cargo build --locked -p feature_tests --features default-engine-native-tls + - name: test native-tls backend has no crypto provider conflict + run: cargo test --locked -p feature_tests --features default-engine-native-tls - name: check kernel builds with default-engine-rustls - run: cargo build -p feature_tests --features default-engine-rustls + run: cargo build --locked -p feature_tests --features default-engine-rustls + - name: test rustls TLS backend feature-tests + run: cargo test --locked -p feature_tests --features default-engine-rustls test: runs-on: ${{ matrix.os }} strategy: @@ -124,11 +175,27 @@ jobs: - ubuntu-latest - windows-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 + id: filter + with: + filters: | + ffi: + - 'ffi/src/handle.rs' + - 'ffi-proc-macros/**' - name: Install minimal stable with clippy and rustfmt - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + - uses: taiki-e/install-action@98ec31d284eb962f41c14065e9391a955aa810cf # nextest - name: test - run: cargo test --workspace --verbose --all-features -- --skip read_table_version_hdfs + run: cargo nextest run --locked --workspace --all-features -E 'not test(read_table_version_hdfs) and not test(invalid_handle_code)' + - name: trybuild tests + if: 
steps.filter.outputs.ffi == 'true' + run: cargo test --locked --package delta_kernel_ffi --features internal-api -- invalid_handle_code ffi_test: runs-on: ${{ matrix.os }} @@ -138,12 +205,12 @@ jobs: - macOS-latest - ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Setup cmake - uses: jwlawson/actions-setup-cmake@v2 + uses: jwlawson/actions-setup-cmake@0d6a7d60b009d01c9e7523be22153ff8f19460d3 # v2.2.0 with: - cmake-version: '3.30.x' - - name: Install arrow-glib + cmake-version: "3.30.x" + - name: Install arrow-glib-linux run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt update @@ -156,29 +223,28 @@ jobs: sudo apt install -y -V libarrow-dev # For C++ sudo apt install -y -V libarrow-glib-dev # For GLib (C) sudo apt install -y -V valgrind # For memory leak test - elif [ "$RUNNER_OS" == "macOS" ]; then - brew install apache-arrow - brew install apache-arrow-glib - else - echo "$RUNNER_OS not supported" - exit 1 fi - - name: Install minimal stable with clippy and rustfmt - uses: actions-rs/toolchain@v1 + - name: Install arrow-glib-macOS + if: runner.os == 'macOS' + uses: ./.github/actions/use-homebrew-tools + with: + tools: "apache-arrow apache-arrow-glib" + - name: Install minimal stable + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 with: - profile: default - toolchain: stable - override: true - - uses: Swatinem/rust-cache@v2 + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - name: Set output on fail run: echo "CTEST_OUTPUT_ON_FAILURE=1" >> "$GITHUB_ENV" - name: Build kernel run: | pushd acceptance - cargo build + cargo build --locked popd pushd ffi - cargo b --features default-engine-rustls,test-ffi,tracing + cargo build --locked --features default-engine-rustls,test-ffi,tracing,delta-kernel-unity-catalog popd - name: build and run read-table test run: | @@ -196,36 +262,55 @@ jobs: cmake .. make make test - + - name: build and run delta-kernel-unity-catalog-ffi test + run: | + pushd ffi/examples/delta-kernel-unity-catalog-example + mkdir build + pushd build + cmake .. 
+ make + make test miri: - name: "Miri" + name: "Miri (shard ${{ matrix.partition }}/3)" runs-on: ubuntu-latest + strategy: + matrix: + partition: [1, 2, 3] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install Miri run: | rustup toolchain install nightly --component miri rustup override set nightly cargo miri setup + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} + - uses: taiki-e/install-action@98ec31d284eb962f41c14065e9391a955aa810cf # nextest - name: Test with Miri run: | pushd ffi - MIRIFLAGS=-Zmiri-disable-isolation cargo miri test --features default-engine-rustls + MIRIFLAGS=-Zmiri-disable-isolation cargo miri nextest run --locked --features default-engine-rustls,delta-kernel-unity-catalog --partition slice:${{ matrix.partition }}/3 coverage: runs-on: ubuntu-latest env: CARGO_TERM_COLOR: always steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install rust - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - name: Install cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov + uses: taiki-e/install-action@2d15d02e710b40b6332201aba6af30d595b5cd96 # cargo-llvm-cov - name: Generate code coverage - run: cargo llvm-cov --all-features --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs + run: cargo llvm-cov --locked --all-features --workspace --codecov --output-path codecov.json -- --skip read_table_version_hdfs - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 + uses: codecov/codecov-action@1af58845a975a7985b0beb0cbe6fbbb71a41dbad # v5.5.3 with: files: codecov.json fail_ci_if_error: true diff --git a/.github/workflows/comment-on-title-failure.yml b/.github/workflows/comment-on-title-failure.yml new file mode 100644 index 0000000000..c52d85a215 --- /dev/null +++ b/.github/workflows/comment-on-title-failure.yml @@ -0,0 +1,61 @@ +name: Comment on PR Title Failure + +on: + workflow_run: + workflows: ["Validate PR Title"] + types: [completed] + +jobs: + comment: + runs-on: ubuntu-latest + permissions: + pull-requests: write + steps: + # Step taken from: https://github.com/orgs/community/discussions/25220#discussioncomment-11316244 + - name: Find PR info + id: pr-context + env: + GH_TOKEN: ${{ github.token }} + PR_TARGET_REPO: ${{ github.repository }} + # If the PR is from a fork, prefix it with `:`, otherwise only the PR branch name is relevant: + PR_BRANCH: |- + ${{ + (github.event.workflow_run.head_repository.owner.login != github.event.workflow_run.repository.owner.login) + && format('{0}:{1}', github.event.workflow_run.head_repository.owner.login, github.event.workflow_run.head_branch) + || github.event.workflow_run.head_branch + }} + # Query the PR number by repo + branch, then assign to step output: + run: | + gh pr view --repo "${PR_TARGET_REPO}" "${PR_BRANCH}" \ + --json 'number,title' --jq '"number=\(.number)\ntitle=\(.title)"' \ + >> "${GITHUB_OUTPUT}" + + - name: Find existing comment + id: find + uses: peter-evans/find-comment@3eae4d37986fb5a8592848f6a574fdf654e61f9e # v3.1.0 + with: + 
issue-number: ${{ steps.pr-context.outputs.number }} + comment-author: 'github-actions[bot]' + body-includes: PR title does not match the required pattern + + - name: Post or update failure comment + if: ${{ github.event.workflow_run.conclusion == 'failure' }} + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 + env: + PR_TITLE: ${{ steps.pr-context.outputs.title }} + with: + comment-id: ${{ steps.find.outputs.comment-id }} + issue-number: ${{ steps.pr-context.outputs.number }} + body: | + PR title does not match the required pattern. Please ensure you follow the [conventional commits](https://www.conventionalcommits.org/) spec. + + Your title should start with `feat:`, `fix:`, `chore:`, `docs:`, `perf:`, `refactor:`, `test:`, or `ci:` (suffixed with a `!`, like `feat!:`, if it's a breaking change), followed by a brief description of 1-72 characters. + + **Title:** `${{ env.PR_TITLE }}` + + - name: Delete comment on success + if: ${{ github.event.workflow_run.conclusion == 'success' && steps.find.outputs.comment-id != '' }} + env: + GH_TOKEN: ${{ github.token }} + run: | + gh api repos/${{ github.repository }}/issues/comments/${{ steps.find.outputs.comment-id }} -X DELETE diff --git a/.github/workflows/pr-validator.yml b/.github/workflows/pr-validator.yml new file mode 100644 index 0000000000..2b9b486b08 --- /dev/null +++ b/.github/workflows/pr-validator.yml @@ -0,0 +1,53 @@ +name: Validate PR Title + +on: + pull_request: + types: [opened, edited, reopened, synchronize, labeled, unlabeled] + workflow_run: + workflows: ["semver-label"] # needed because labels applied by another workflow's GITHUB_TOKEN don't retrigger pull_request events + types: [completed] + +jobs: + validate-title: + runs-on: ubuntu-latest + steps: + - name: Resolve PR metadata + id: pr + env: + GH_TOKEN: ${{ github.token }} + # Captured as env vars to prevent expression injection into the shell command. + PR_TITLE: ${{ github.event.pull_request.title }} + PR_LABELS_JSON: ${{ toJson(github.event.pull_request.labels.*.name) }} + run: | + if [[ "${{ github.event_name }}" == "workflow_run" ]]; then + pr_json=$(gh api --paginate repos/${{ github.repository }}/pulls \ + --jq ".[] | select(.head.sha == \"${{ github.event.workflow_run.head_sha }}\")") + echo "number=$(echo "$pr_json" | jq -r '.number')" >> "$GITHUB_OUTPUT" + # Use multiline delimiter syntax so a title containing newlines cannot inject + # additional key=value pairs into GITHUB_OUTPUT. + { + echo 'title<<EOF' + echo "$pr_json" | jq -r '.title' + echo 'EOF' + } >> "$GITHUB_OUTPUT" + echo "labels=$(echo "$pr_json" | jq -c '[.labels[].name]')" >> "$GITHUB_OUTPUT" + else + echo "number=${{ github.event.pull_request.number }}" >> "$GITHUB_OUTPUT" + # Use multiline delimiter syntax so a title containing newlines cannot inject + # additional key=value pairs into GITHUB_OUTPUT.
+ { + echo 'title<<EOF' + echo "$PR_TITLE" + echo 'EOF' + } >> "$GITHUB_OUTPUT" + echo "labels=$(echo "$PR_LABELS_JSON" | jq -c '.')" >> "$GITHUB_OUTPUT" + fi + + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + - uses: ./.github/actions/pr-title-validator + with: + regex: '^(feat|fix|chore|docs|perf|refactor|test|ci)!?(\(.+\))?: .{1,72}$' + breaking-change-regex: '^(feat|fix|chore|docs|perf|refactor|test|ci)!(\(.+\))?: .{1,72}$' + labels: ${{ steps.pr.outputs.labels }} + title: ${{ steps.pr.outputs.title }} diff --git a/.github/workflows/run-examples.yml b/.github/workflows/run-examples.yml index cdc9224173..9cf5bc9d7d 100644 --- a/.github/workflows/run-examples.yml +++ b/.github/workflows/run-examples.yml @@ -1,6 +1,6 @@ name: run-examples -on: [push, pull_request] +on: [push, pull_request, merge_group] env: CARGO_TERM_COLOR: always @@ -10,10 +10,15 @@ jobs: run-examples: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - name: Install minimal stable - uses: actions-rust-lang/setup-rust-toolchain@v1 - - uses: Swatinem/rust-cache@v2 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + # See build.yml top-level comment for why save-if is restricted to main. + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - name: Run all examples run: | @@ -38,23 +43,23 @@ jobs: # Special case for write-table: it needs a temp directory if [ "$example_dir" = "write-table" ]; then tmp_dir=$(mktemp -d) - cargo run --manifest-path "$example_dir/Cargo.toml" --release -- "$tmp_dir" + cargo run --locked --manifest-path "$example_dir/Cargo.toml" --release -- "$tmp_dir" rm -r "$tmp_dir" # Special case for inspect-table: it needs an operation/subcommand, run each one elif [ "$example_dir" = "inspect-table" ]; then for operation in table-version metadata schema scan-metadata actions; do echo " Running inspect-table with operation: $operation" - cargo run --manifest-path "$example_dir/Cargo.toml" --release -- ../tests/data/table-without-dv-small $operation + cargo run --locked --manifest-path "$example_dir/Cargo.toml" --release -- ../tests/data/table-without-dv-small $operation done # Special case for read-table-changes: skip running it in CI as it needs a specific CDF-enabled table # but still verify it compiles # TODO: Add a suitable test table for CDF elif [ "$example_dir" = "read-table-changes" ]; then echo "Building read-table-changes (skipping run - requires CDF-enabled table)" - cargo build --manifest-path "$example_dir/Cargo.toml" --release + cargo build --locked --manifest-path "$example_dir/Cargo.toml" --release else # All other examples run with the test table path - cargo run --manifest-path "$example_dir/Cargo.toml" --release -- ../tests/data/table-without-dv-small + cargo run --locked --manifest-path "$example_dir/Cargo.toml" --release -- ../tests/data/table-without-dv-small fi echo "" diff --git a/.github/workflows/run_integration_test.yml b/.github/workflows/run_integration_test.yml index 73ffd599c8..8d23a5ca45 100644 --- a/.github/workflows/run_integration_test.yml +++ b/.github/workflows/run_integration_test.yml @@ -1,29 +1,39 @@ -name: Run tests to ensure we can compile across arrow versions +# TODO: Disabled.
The test script runs cargo update which resolves fresh dependencies, +# bypassing the Cargo.lock supply chain policy (see build.yml top-level comment). -name: Run tests to ensure we can compile across arrow versions -on: [workflow_dispatch, push, pull_request] - -jobs: - arrow_integration_test: - runs-on: ${{ matrix.os }} - timeout-minutes: 20 - strategy: - fail-fast: false - matrix: - include: - - os: macOS-latest - - os: ubuntu-latest - - os: windows-latest - skip: ${{ github.event_name == 'pull_request' }} # skip running windows tests on every PR since they are slow - steps: - - name: Skip job for pull requests on Windows - if: ${{ matrix.skip }} - run: echo "Skipping job for pull requests on Windows." - - uses: actions/checkout@v4 - if: ${{ !matrix.skip }} - - name: Setup rust toolchain - if: ${{ !matrix.skip }} - uses: actions-rust-lang/setup-rust-toolchain@v1 - - name: Run integration tests - if: ${{ !matrix.skip }} - shell: bash - run: pushd integration-tests && ./test-all-arrow-versions.sh +# name: Run tests to ensure we can compile across arrow versions +# +# on: [workflow_dispatch, push, pull_request, merge_group] +# +# jobs: +# arrow_integration_test: +# runs-on: ${{ matrix.os }} +# timeout-minutes: 20 +# strategy: +# fail-fast: false +# matrix: +# include: +# - os: macOS-latest +# - os: ubuntu-latest +# - os: windows-latest +# skip: ${{ github.event_name == 'pull_request' || github.event_name == 'merge_group' }} # skip running windows tests on PRs and merge queue since they are slow +# steps: +# - name: Skip job for pull requests on Windows +# if: ${{ matrix.skip }} +# run: echo "Skipping job for pull requests on Windows." +# - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 +# if: ${{ !matrix.skip }} +# - name: Setup rust toolchain +# if: ${{ !matrix.skip }} +# uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 +# with: +# cache: false +# # See build.yml top-level comment for why save-if is restricted to main. +# - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 +# if: ${{ !matrix.skip }} +# with: +# save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} +# - name: Run integration tests +# if: ${{ !matrix.skip }} +# shell: bash +# run: pushd integration-tests && ./test-all-arrow-versions.sh diff --git a/.github/workflows/semver-checks.yml b/.github/workflows/semver-checks.yml index fd12063e5f..dbf76c2324 100644 --- a/.github/workflows/semver-checks.yml +++ b/.github/workflows/semver-checks.yml @@ -1,12 +1,15 @@ name: semver-checks -# Trigger when a PR is opened or changed +# Trigger when a PR is opened or changed. This runs with the `pull_request` trigger, which means it has +# only read perms. Adding the label happens in semver-label.yml via workflow_run, which will +# look at the status of this job and always runs in the base-repo context.
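+# (Flow: the pull_request run of this workflow uploads a `semver-outcome` artifact; when it
+# completes, workflow_run fires and semver-label.yml downloads that artifact to add or remove
+# the breaking-change label.)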
on: - pull_request_target: + pull_request: types: - opened - synchronize - reopened + merge_group: env: CARGO_TERM_COLOR: always @@ -16,60 +19,70 @@ jobs: check_if_pr_breaks_semver: runs-on: ubuntu-latest permissions: - # this job runs with read because it checks out the PR head which could contain malicious code contents: read steps: - - uses: actions/checkout@v4 - name: checkout full rep + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 with: fetch-depth: 0 - ref: ${{ github.event.pull_request.head.sha }} + ref: >- + ${{ github.event_name == 'merge_group' + && github.event.merge_group.head_sha + || github.event.pull_request.head.sha }} - name: Install minimal stable - uses: actions-rust-lang/setup-rust-toolchain@v1 + uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4 + with: + cache: false + # See build.yml top-level comment for why save-if is restricted to main. + - uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + save-if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} - name: Install cargo-semver-checks + uses: taiki-e/install-action@7bc99eee1f1b8902a125006cf790a1f4c8461e63 # v2.69.8 + with: + tool: cargo-semver-checks + - name: Compute baseline revision + id: baseline shell: bash + env: + MERGE_GROUP_BASE_SHA: ${{ github.event.merge_group.base_sha }} + PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }} + PR_BASE_SHA: ${{ github.event.pull_request.base.sha }} run: | - cargo install cargo-semver-checks --locked - - name: Run check + if [ "${{ github.event_name }}" = "merge_group" ]; then + echo "rev=${MERGE_GROUP_BASE_SHA}" >> "$GITHUB_OUTPUT" + else + # Use the merge-base instead of the PR base SHA. The base SHA is the tip of + # the target branch when the webhook fires, which can differ from where the PR + # actually diverged. Using merge-base avoids false positives when the PR branch + # is behind the target branch. + MERGE_BASE=$(git merge-base "$PR_HEAD_SHA" "$PR_BASE_SHA") + echo "rev=${MERGE_BASE}" >> "$GITHUB_OUTPUT" + fi + - name: Run semver check id: check continue-on-error: true shell: bash + env: + BASELINE_REV: ${{ steps.baseline.outputs.rev }} # only check semver on released crates (delta_kernel and delta_kernel_ffi). # note that this won't run on proc macro/derive crates, so don't need to include # delta_kernel_derive etc. 
run: | - cargo semver-checks -p delta_kernel -p delta_kernel_ffi --all-features --baseline-rev ${{ github.event.pull_request.base.sha }} - - name: On Failure - id: set_failure - if: ${{ steps.check.outcome == 'failure' }} - run: | - echo "Checks failed" - echo "check_status=failure" >> $GITHUB_OUTPUT - - name: On Success - id: set_success - if: ${{ steps.check.outcome == 'success' }} - run: | - echo "Checks succeed" - echo "check_status=success" >> $GITHUB_OUTPUT - outputs: - check_status: ${{ steps.set_failure.outputs.check_status || steps.set_success.outputs.check_status }} - add_label_if_needed: - needs: check_if_pr_breaks_semver - runs-on: ubuntu-latest - permissions: - # this job only looks at previous output and then sets a label, so malicious code in the PR - # isn't a concern - pull-requests: write - steps: - - name: On Failure - if: needs.check_if_pr_breaks_semver.outputs.check_status == 'failure' - uses: actions-ecosystem/action-add-labels@v1 + cargo semver-checks -p delta_kernel -p delta_kernel_ffi --all-features \ + --baseline-rev "$BASELINE_REV" + # Upload the step outcome as an artifact so semver-label.yml can read it via workflow_run. + # steps.check.outcome is the raw result *before* continue-on-error converts it to "success", + # so it correctly reflects whether a breaking change was detected. + # Only upload for pull_request events; merge_group runs have no PR to label. + - name: Save semver outcome + if: github.event_name == 'pull_request' + env: + SEMVER_OUTCOME: ${{ steps.check.outcome }} + run: echo "$SEMVER_OUTCOME" > semver-outcome.txt + - name: Upload semver outcome + if: github.event_name == 'pull_request' + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: - labels: breaking-change - - name: On Success - if: needs.check_if_pr_breaks_semver.outputs.check_status == 'success' - run: | - echo "Checks succeed" - - name: Fail On Incorrect Previous Output - if: needs.check_if_pr_breaks_semver.outputs.check_status != 'success' && needs.check_if_pr_breaks_semver.outputs.check_status != 'failure' - run: exit 1 + name: semver-outcome + path: semver-outcome.txt + retention-days: 1 diff --git a/.github/workflows/semver-label.yml b/.github/workflows/semver-label.yml new file mode 100644 index 0000000000..ad75d8bda8 --- /dev/null +++ b/.github/workflows/semver-label.yml @@ -0,0 +1,77 @@ +name: semver-label + +# Apply or remove the breaking-change label based on the outcome of the semver-checks workflow. +# This must be a separate workflow from semver-checks.yml: label writes require pull-requests:write, +# which is unavailable in pull_request workflows triggered by fork PRs. workflow_run always runs +# in the base-repo context with full write permissions, and never executes PR code. +on: + workflow_run: + workflows: ["semver-checks"] + types: [completed] + +jobs: + update_label_if_needed: + runs-on: ubuntu-latest + permissions: + pull-requests: write + actions: read + # Label updates only apply to PRs; merge_group runs have no associated PR to label. + if: github.event.workflow_run.event == 'pull_request' + steps: + # Resolve PR number from the triggering workflow run's branch. For fork PRs the branch + # must be prefixed with `:` so gh pr view can locate it. 
+ # Pattern from: https://github.com/orgs/community/discussions/25220#discussioncomment-11316244 + - name: Find PR number + id: pr-context + env: + GH_TOKEN: ${{ github.token }} + PR_TARGET_REPO: ${{ github.repository }} + PR_BRANCH: |- + ${{ + (github.event.workflow_run.head_repository.owner.login != github.event.workflow_run.repository.owner.login) + && format('{0}:{1}', github.event.workflow_run.head_repository.owner.login, github.event.workflow_run.head_branch) + || github.event.workflow_run.head_branch + }} + run: | + echo "Looking up PR for branch '${PR_BRANCH}' in repo '${PR_TARGET_REPO}'" + gh pr view --repo "${PR_TARGET_REPO}" "${PR_BRANCH}" \ + --json 'number' --jq '"number=\(.number)"' \ + >> "${GITHUB_OUTPUT}" + echo "PR lookup complete: $(cat "${GITHUB_OUTPUT}")" + + # Download the semver outcome artifact written by semver-checks.yml. + # steps.check.outcome in that workflow is the raw result before continue-on-error + # converts it to "success", so it correctly reflects whether a breaking change was found. + - name: Download semver outcome + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 + with: + name: semver-outcome + github-token: ${{ github.token }} + run-id: ${{ github.event.workflow_run.id }} + + - name: Update breaking-change label + if: steps.pr-context.outputs.number != '' + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ steps.pr-context.outputs.number }} + run: | + STEP_OUTCOME=$(cat semver-outcome.txt) + echo "Semver check outcome: '${STEP_OUTCOME}' for PR #${PR_NUMBER}" + + if [[ "$STEP_OUTCOME" == "failure" ]]; then + echo "Breaking change detected -- adding 'breaking-change' label to PR #$PR_NUMBER" + gh pr edit "$PR_NUMBER" --repo "$GITHUB_REPOSITORY" --add-label "breaking-change" + elif [[ "$STEP_OUTCOME" == "success" ]]; then + # Remove the label only if it is currently present; gh pr edit fails on absent labels. + CURRENT_LABELS=$(gh pr view "$PR_NUMBER" --repo "$GITHUB_REPOSITORY" --json labels --jq '[.labels[].name]') + echo "Current PR labels: $CURRENT_LABELS" + if echo "$CURRENT_LABELS" | jq -e '.[] | select(. == "breaking-change")' > /dev/null 2>&1; then + echo "Semver check passed -- removing 'breaking-change' label from PR #$PR_NUMBER" + gh pr edit "$PR_NUMBER" --repo "$GITHUB_REPOSITORY" --remove-label "breaking-change" + else + echo "Semver check passed -- 'breaking-change' label not present, nothing to do" + fi + else + echo "ERROR: unexpected semver outcome '${STEP_OUTCOME}' in semver-outcome.txt" + exit 1 + fi diff --git a/.gitignore b/.gitignore index fcc5024b5f..90fa2fbbfb 100644 --- a/.gitignore +++ b/.gitignore @@ -2,20 +2,29 @@ .DS_Store # IDE +.claude/ +.cursor/ .dir-locals.el .idea/ .vscode/ .vim .zed +.cache/ +.clangd +*.*~ # Rust +.cargo-home target/ -/Cargo.lock integration-tests/Cargo.lock # Project acceptance/tests/dat/ +acceptance/workloads/ ffi/examples/read-table/build +ffi/examples/visit-expression/build /build /kernel/target /target + +/benchmarks/workloads/ diff --git a/AI_POLICY.md b/AI_POLICY.md new file mode 100644 index 0000000000..feaacb9991 --- /dev/null +++ b/AI_POLICY.md @@ -0,0 +1,56 @@ +# Delta Kernel (Rust) AI Policy + +### Overview + +We recognize that AI coding assistants are part of many developers' workflows. Thoughtful use of these tools can improve productivity and help contributors explore unfamiliar parts of the codebase. 
However, delta-kernel-rs implements the Delta Lake protocol with strict correctness requirements, and contributions must reflect genuine understanding of the changes being made. + +### Guidelines for Contributors + +**Be respectful of reviewers and other contributors**. Reviewing takes time and effort, and changes that are needlessly complex, poorly structured, or bloated make that work harder. Plus, future contributors will have to work with (or around) whatever code you merge. If you're unsure whether your contribution is well-structured or appropriately scoped, seek guidance before investing significant effort. You can open a GitHub issue to discuss your approach, use a draft PR to get early feedback on direction, or ask in the Delta-Users Slack. + +**Understand and own your changes.** Every change you push and every review you leave reflects on your professional character and reputation – regardless of whether you used tools like AI. If you use AI tools to assist with code generation, you must fully understand every line of the resulting contribution. You should be able to explain the design, justify implementation choices, and debug issues during review. If you cannot, the contribution is not ready to submit. + +Additionally, please **write your own PR description** and ensure it is crisp, complete, and correct. + +**Call out unknowns.** If there are any parts of the change you are less confident about – AI generated or otherwise – leave comments on your own PR explaining the concern and what steps you took to verify correctness. Reviewers can then focus their attention where it matters most. + +**Match project conventions.** AI tools often generate code that is stylistically inconsistent with a project. Ensure your contributions follow the conventions used in the rest of the codebase, including PR titles (conventional commit format), doc comments, data model, use of helper/utility functions, and error handling patterns. + +**Watch for common AI pitfalls:** + +* Protocol-incompatible behavior that looks plausible but violates the Delta spec +* Incorrect or superficial fixes that mask the real problem +* Changing correct kernel code to match incorrect test expectations (or vice-versa) +* Bloated and/or duplicative code (AI agents often struggle with encapsulation and abstraction) +* Overly verbose, duplicated and/or unnecessary documentation +* Doc and code comments that are stale or refer to the development process (such as dead-end prototyping attempts or initial implementation bugs that were already fixed) rather than describing the current state of the code and the design behind it. These are often called "temporal references." +* Unnecessary test cases or test scaffolding, or bloated/duplicated test structure (use helpers\!) + +**Disclose copyrighted materials**. Contributors are responsible for ensuring that any copyrighted third-party material appearing in AI-generated output has appropriate attribution and licensing. See the [Linux Foundation's Generative AI Policy](https://www.linuxfoundation.org/legal/generative-ai) for further information on licensing considerations. + +### What We Will Not Accept + +**Unreviewed AI PRs**. PRs that appear to be raw AI output submitted without meaningful engagement from the author may be closed without review. Maintainers with access to AI tools could generate such code more efficiently themselves, and the contributor gains nothing from the review process. 
+ +**Unreviewed AI-assisted comments on issues or PRs.** The same ownership principle applies to review comments as to code: if you use AI tools to help draft a comment or review, you are responsible for its quality, completeness, and accuracy. Review and edit AI-assisted output before posting — do not paste raw AI output as-is, as such comments tend to be formulaic and consume attention without adding value. + +Automated bots or agents that post AI-generated content without human review are strictly prohibited, unless explicitly configured by project maintainers (e.g., CI-integrated review tools). + +### Why This Matters + +**delta-kernel-rs is a protocol implementation where correctness is critical**. A subtle bug can cause data loss or corruption for downstream connectors. Code review is a collaborative process that depends on the author understanding their changes well enough to engage meaningfully with reviewer feedback. + +**Our reviewing capacity is limited**. Large PRs that lack the requisite understanding may not get reviewed and may eventually be closed. If you want to contribute but are unfamiliar with the codebase, a high-quality issue with a clear problem statement and reproducible example is often a more valuable starting point than an AI-generated PR out of thin air. + +### Disclosures + +AI tools were used to refine early drafts of this policy, and the final content was edited, reviewed, and approved by human maintainers. + +### Sources + +This policy was written with input from: + +* [Linux Foundation Generative AI Policy](https://www.linuxfoundation.org/legal/generative-ai) +* [Matplotlib Contributing Guide — Restrictions on Generative AI Usage](https://matplotlib.org/devdocs/devel/contribute.html?utm_source=chatgpt.com#restrictions-on-generative-ai-usage) +* [Delta-RS Contributing Guide — AI Generated Code](https://github.com/delta-io/delta-rs/blob/main/CONTRIBUTING.md?utm_source=chatgpt.com#ai-generated-code) +* [DataFusion Contributor Guide — AI-Assisted Contributions](https://datafusion.apache.org/contributor-guide/index.html?#ai-assisted-contributions) diff --git a/CHANGELOG.md b/CHANGELOG.md index 491a397fe1..4405169ae2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,794 @@ # Changelog +## [v0.20.0](https://github.com/delta-io/delta-kernel-rs/tree/v0.20.0/) (2026-02-26) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.19.2...v0.20.0) + +### 🏗️ Breaking changes +1. Remove `DefaultEngine::new` ([#1583]) + - Use `DefaultEngineBuilder` instead like: `DefaultEngineBuilder::new(store).build()` +2. Add ParseJson expression ([#1586]) + - Implementors of the ExpressionHandler trait now need to handle this expression +3. Change CommitResponse::Committed to return a FileMeta ([#1599]) + - Committer implementations must now return a FileMeta of the written file after each commit, instead of only returning the committed version +4. Add stats_columns to ParquetHandler ([#1668]) + - Add stat_columns to `write_parquet_file` engine implementation, which specifies the columns to collect Delta stats on +5. Add StatisticsCollector core with numRecords ([#1662]) + - Renames `_stat_columns` above to `stat_columns` +6. Return updated Snapshot from `Snapshot::publish` ([#1694]) + - Snapshot::publish now takes self: Arc and returns DeltaResult instead of () +7. Pass engine to Snapshot::transaction() for domain metadata access ([#1707]) + - Snapshot::transaction() now requires an engine: &dyn Engine parameter to read domain metadata +8. 
Add tracing instrumentation to transaction and snapshot operations ([#1772]) + - snapshot and transaction have both stopped implementing auto traits UnwindSafe and RefUnwindSafe due to storing new instrumentation span fields +9. Use physical stats column names in `WriteContext` ([#1836]) + - `WriteContext.stats_columns` now uses _physical_ column names per column mapping. Ref: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#column-mapping +10. Generate `physical_schema` in `WriteContext` w.r.t column mapping and `materializePartitionColumns` ([#1837]) + - `WriteContext.physical_schema` now respects column mapping, and retains partition columns when `materializePartitionColumns` is enabled. Ref: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#column-mapping +11. Fix get_app_id_version to take &self ([#1770]) + - If you are calling `get_app_id` pass a reference to the `Snapshot` not `Arc` +12. Add ability to 'enter' the runtime to the default engine ([#1847]) + - Implementors of the `TaskExecutor` trait now need to support this + +### 🚀 Features / new APIs +1. Add doctests for `IntoEngineData` derive macro ([#1580]) +2. Create `DefaultEngineBuilder` to build `DefaultEngine` ([#1582]) +3. Implement `Scalar::From>` ([#1541]) +4. Add `logSegment.new_with_commit_appended` API ([#1602]) +5. `snapshot.new_post_commit` ([#1604]) + - Creates a new Snapshot reflecting a just-committed transaction without re-reading the log +6. Enable Arrow to convert nullable StructArray to RecordBatch ([#1635]) +7. Add `snapshot.checkpoint()` for all-in-one checkpointing ([#1600]) +8. Add a tracing statement to print table configuration for each version ([#1634]) +9. Add CheckpointDeduplicator for checkpoint phase of distributed log replay ([#1538]) +10. Add CreateTable API with simplified single-stage flow ([#1629]) +11. Add with_table_properties method to CreateTableTransactionBuilder ([#1649]) +12. Add post-commit Snapshot to txn ([#1633]) +13. Add CDF tracing for Phase 1 of Change Data feed ([#1654]) +14. Make Sequential phase schema only contain add and remove actions ([#1679]) +15. Add executor for distributed log replay ([#1539]) +16. Transaction stats API ([#1658]) +17. `Snapshot::publish` API with e2e in-memory UC test ([#1628]) +18. Expose a `Snapshot::get_domain_metadata_internal` API, guarded by `internal-api` feature flag ([#1692]) +19. Add nullCount support to StatisticsCollector ([#1663]) +20. Add minValues and maxValues support to StatisticsCollector ([#1664]) +21. Enable NullCount collection for complex data types ([#1706]) +22. Implement schema diffing for flat schemas (2/5]) ([#1478]) +23. Add API on Scan to perform 2-phase log replay ([#1547]) +24. Enable distributed log replay serde serialization for serializable scan state ([#1549]) +25. Add InCommitTimestamp support to ChangeDataFeed ([#1670]) +26. Add include_stats_columns API and output_stats_schema field ([#1728]) +27. Add write support for clustered tables behind feature flag ([#1704]) +28. Add snapshot load instrumentation ([#1750]) +29. Create table builder and domain metadata handling ([#1762]) +30. Add crc module with schema, visitor, reader, and lazy loader ([#1780]) +31. Add clustering support for CREATE TABLE ([#1763]) +32. Support owned runtime in `TokioMultiThreadExecutor` ([#1719]) +33. 
*(transaction)* Support blind append commit metadata ([#1783]) + - Adds `set_is_blind_append()` API to `Transaction`, includes `isBlindAppend` in generated `CommitInfo`, and validates blind-append semantics (add-only, no removals/DV updates, `dataChange` must be true) before commit. +34. Add stats transform module for checkpoint stats population ([#1646]) +35. Refactor data skipping to use stats_parsed directly ([#1715]) +36. Support using stats_columns and predicate together in scans ([#1691]) +38. Support creation of `DefaultEngine` with `TokioMultiThreadExecutor` in FFI ([#1755]) +39. Add column mapping support for CREATE TABLE ([#1764]) +40. Write parsed stats in checkpoints ([#1643]) +41. Implement ReadConfig for Benchmark Framework ([#1758]) +42. Implement TableInfo Deserialization for Benchmark Framework ([#1759]) +43. Implement Read Spec Deserialization for Benchmark Framework ([#1760]) +44. Allow visitors to visit REE Arrow columns. ([#1829]) +45. *(committer)* Add tracing instrumentation to FileSystemCommitter::commit ([#1811]) +46. Try and cache brew packages to speed up CI ([#1909]) +47. Extend GetData with float, double, date, timestamp, decimal types ([#1901]) +48. Define and use constants for protocol (3,7]) ([#1917]) +49. Generate transform in `WriteContext` w.r.t column mapping ([#1862]) +50. Support v2 checkpoints in create_table API ([#1864]) +51. Expand add files schema to include all stats fields ([#1748]) +52. Support write with both partition columns and column mapping in `DefaultEngine` ([#1870]) +53. Feat: support scanning for multiple specific domains in domain metadata replay ([#1881]) + - Allows callers to request multiple domain names in a single metadata replay pass, with early termination once all requested domains are found. Includes optimized skip of domain metadata fields when a domain has already been seen in a newer commit. +54. Allow ffi for uc_catalog stuff ([#1711]) +55. Support column mapping on writes ([#1863]) +56. Coerce parquet read nullability to match table schema ([#1903]) +57. Relax clustering column constraints to align with Delta protocol ([#1913]) +58. Auto-enable variantType feature during CREATE TABLE ([#1922]) ([#1949]) +59. Add type validation for `evaluate_expression` ([#1575]) +60. Use ReaderBuilder::with_coerce_primitive when parsing JSON ([#1651]) +61. Allow to change tracing level and callback more than once ([#1111]) +62. Simplify checkpoint-table with Snapshot::checkpoint ([#1813]) +63. Add size metadata to the CdfScanFile ([#1935]) +64. Add deletion vector APIs to transaction ([#1430]) +65. Include max known published commit version inside of `LogSegment` ([#1587]) +66. Use CRC for In-Commit-Timestamp reading ([#1806]) +67. Refactor `ListedLogFiles::try_new` to be more extensible and with default values by using builder pattern ([#1585]) +68. Implement the read metadata workload runner ([#1919]) +69. Provide expected stats schema ([#1592]) +70. Add checkpoint schema discovery for stats_parsed detection ([#1550]) +71. Add function to check if schema supports parsed stats ([#1573]) +72. Read parsed-stats from checkpoint ([#1638]) +73. feat: add get clustering columns in transactions ([#1693]) +74. Change expected_stats_schema to return logical schema + physical schema ([#1749]) +75. Add support for outputting parsed file statistics to scan batches ([#1720]) +76. Checkpoint and sidecar row group skipping via stats_parsed ([#1853]) +77. Add serialization/deserialization support for Predicates and Expressions ([#1543]) +78. 
Distributed Log Replay serialization/deserialization ([#1503]) +79. Introduce Deduplicator trait to unify mutable and immutable deduplication ([#1537]) +80. Add ffi api to perform a checkpoint ([#1619]) + +### 🐛 Bug Fixes + +1. Make parquet read actually use the executor ([#1596]) +2. Deadlock for `TokioMultiThreadExecutor` ([#1606]) +3. Remove `breaking-change` tag after semver passes ([#1621]) +4. Enable arrow conversion from Int96 ([#1653]) +5. Preserve null bitmap in nested transform expressions ([#1645]) +6. Include domain metadata in checkpoints ([#1718]) +- Domain metadata was not being written to checkpoint files, causing it to be lost after checkpoints +7. Propagate struct-level nulls when computing nested column stats ([#1745]) +8. Express One Zone URLs do not support lexicographical ordering ([#1753]) +9. Preserve non-commit files (CRC, checkpoints, compactions) at log tail versions ([#1817]) + - Fixes `list_log_files` to no longer discard CRC, checkpoint, and compaction files at the log tail boundary, ensuring these auxiliary files are preserved alongside their commit files. +10. Fix Miri CI failure by cleaning stale Miri artifacts before test run ([#1845]) +11. Strip parquet field IDs from physical stats schema for checkpoint reading ([#1839]) +12. Unify v2 checkpoint batch schemas ([#1833]) +13. Improve performance and correctness of EngineMap implementation in default engine ([#1785]) +14. Parquet footer skipping cannot trust nullcount=0 stat ([#1914]) +15. Column extraction for visitors should not rely on schema order ([#1818]) +16. Ensure consistent usage of parquet.field.id and conversion to PARQUET:field_id in kernel/default engine ([#1850]) +17. Make log segment merging in `Snapshot::try_new_from` deduplicate compaction files ([#1954]) + +### ⚡ Performance + +1. Pre-allocate Vecs and HashSets when size is known ([#1676]) +2. Add skip_stats option to skip reading file statistics ([#1738]) +3. Use CRC in Protocol + Metadata log replay ([#1790]) + +### 🚜 Refactor + +1. Move doctest into mods ([#1574]) +2. Deny panics in ffi crate ([#1576]) +3. Extract shared HTTP utilities to http.rs ([#1590]) +4. Rename `Snapshot.checkpoint` ([#1608]) +5. Extract stats from `ActionReconciliationIterator` ([#1618]) +6. Cleanup repeated schema definitions in `kernel/tests/write.rs` ([#1637]) +7. Split `committer.rs` into multiple files ([#1622]) +8. Consolidate nullable stat transforms ([#1636]) +9. Add Expression::coalesce helper method ([#1648]) +10. Add checkpoint info to ScanLogReplayProcessor ([#1752]) +11. Extract protocol & metadata replay into log_segment submodule ([#1782]) +12. Define constants for table property keys ([#1797]) + - Replaces scattered string literals for Delta table property keys (e.g. `delta.appendOnly`, `delta.enableChangeDataFeed`) with named constants, improving maintainability and preventing typos. +13. Update metadata schema to be a SchemaRef and add appropriate Arcs ([#1802]) +14. Rename `set_is_blind_append` to `with_blind_append`, returning `Self` ([#1838]) + - Adopts builder-style API for the blind append flag, allowing method chaining (e.g. `txn.with_blind_append(true).commit(...)`). +15. Extract clustering tests into sub-module ([#1828]) +16. Split `UCCommitsClient` into `UCCommitClient` and `UCGetCommitsClient` ([#1854]) + - Separates the Unity Catalog commits client into two focused traits — one for committing and one for reading commits — enabling cleaner dependency boundaries and testability. +17. 
Use type-state pattern for `CreateTableTransaction` compile-time API safety ([#1842]) + - Encodes the create-table workflow states (building → ready → committed) in the type system, so invalid transitions (e.g. committing before setting schema) are caught at compile time. Reorganizes create-table code and moves tests to integration tests. +18. Simplify table feature parsing ([#1878]) +19. Define and use new TableConfiguration methods ([#1905]) +20. Improve Protocol::try_new and make tests call it reliably ([#1907]) +21. Simplify GetData impls with bool::then() ([#1918]) +22. Split transaction module into `mod.rs` and `update.rs` ([#1877]) + - Breaks the growing transaction module into separate files: core transaction logic in `mod.rs` and update/DV-related logic in `update.rs`, improving navigability. +23. Rename FeatureType::Writer as WriterOnly ([#1934]) +24. Clean up TableConfiguration validation and unit tests ([#1947]) +26. StructType modification method and stat_transform schema boilerplate code refactor. ([#1872]) + +### 🧪 Testing + +1. In-Memory UC-Commits-Client ([#1644]) +2. Add test for post_commit_snapshot with create table API ([#1680]) +3. Add rs-test support ([#1708]) +4. Add test validating collect_stats() output against Spark ([#1778]) +5. Add test for parquet id when CM enabled ([#1946]) +6. [Test Only] Minor refactor to log_segment tests ([#1581] +7. Add file size to the unit test of Engine's ParquetReader ([#1921]) + +### ⚙️ Chores/CI +1. Remove unnecessary spaces in PR description ([#1598]) +2. Upgrade to reqwest 0.13 and rustls as default ([#1588]) +3. Stats-schema improvements ([#1642]) +4. Add Rust caching to build and test jobs ([#1672]) +5. Use cargo-nextest for parallel test execution ([#1673]) +- ~19x faster locally via per-test process isolation +6. Fix ffi_test cache miss by using consistent toolchain action ([#1702]) +7. Add caching and optimize tool installation across all jobs ([#1674]) +8. Remove unused remove metadata ([#1732]) +9. Prefer `append_value_n` over `append_value` ([#1868]) +10. Pin native-tls to 0.2.16 due to upstream breakage ([#1880]) +11. Fix unit tests with bad protocol versions ([#1879]) +12. Add nextest support for miri tests ([#1685]) +13. Unpin Miri nightly toolchain ([#1900]) +14. Bring 0.19.1 changes into main ([#1632]) +15. Remove comfy-table dependency declaration ([#1860]) +16. Update review policy in CONTRIBUTING.md ([#1945]) +17. Revert "chore: pin native-tls to 0.2.16 due to upstream breakage" ([#1915]) + +### Other +4. 
Remove comments and text from `pull_request_template.md` ([#1589]) + +[#1581]: https://github.com/delta-io/delta-kernel-rs/pull/1581 +[#1585]: https://github.com/delta-io/delta-kernel-rs/pull/1585 +[#1575]: https://github.com/delta-io/delta-kernel-rs/pull/1575 +[#1574]: https://github.com/delta-io/delta-kernel-rs/pull/1574 +[#1550]: https://github.com/delta-io/delta-kernel-rs/pull/1550 +[#1576]: https://github.com/delta-io/delta-kernel-rs/pull/1576 +[#1589]: https://github.com/delta-io/delta-kernel-rs/pull/1589 +[#1430]: https://github.com/delta-io/delta-kernel-rs/pull/1430 +[#1580]: https://github.com/delta-io/delta-kernel-rs/pull/1580 +[#1582]: https://github.com/delta-io/delta-kernel-rs/pull/1582 +[#1590]: https://github.com/delta-io/delta-kernel-rs/pull/1590 +[#1598]: https://github.com/delta-io/delta-kernel-rs/pull/1598 +[#1583]: https://github.com/delta-io/delta-kernel-rs/pull/1583 +[#1591]: https://github.com/delta-io/delta-kernel-rs/pull/1591 +[#1587]: https://github.com/delta-io/delta-kernel-rs/pull/1587 +[#1586]: https://github.com/delta-io/delta-kernel-rs/pull/1586 +[#1596]: https://github.com/delta-io/delta-kernel-rs/pull/1596 +[#1541]: https://github.com/delta-io/delta-kernel-rs/pull/1541 +[#1588]: https://github.com/delta-io/delta-kernel-rs/pull/1588 +[#1599]: https://github.com/delta-io/delta-kernel-rs/pull/1599 +[#1573]: https://github.com/delta-io/delta-kernel-rs/pull/1573 +[#1609]: https://github.com/delta-io/delta-kernel-rs/pull/1609 +[#1606]: https://github.com/delta-io/delta-kernel-rs/pull/1606 +[#1608]: https://github.com/delta-io/delta-kernel-rs/pull/1608 +[#1543]: https://github.com/delta-io/delta-kernel-rs/pull/1543 +[#1592]: https://github.com/delta-io/delta-kernel-rs/pull/1592 +[#1503]: https://github.com/delta-io/delta-kernel-rs/pull/1503 +[#1602]: https://github.com/delta-io/delta-kernel-rs/pull/1602 +[#1537]: https://github.com/delta-io/delta-kernel-rs/pull/1537 +[#1621]: https://github.com/delta-io/delta-kernel-rs/pull/1621 +[#1618]: https://github.com/delta-io/delta-kernel-rs/pull/1618 +[#1604]: https://github.com/delta-io/delta-kernel-rs/pull/1604 +[#1637]: https://github.com/delta-io/delta-kernel-rs/pull/1637 +[#1622]: https://github.com/delta-io/delta-kernel-rs/pull/1622 +[#1651]: https://github.com/delta-io/delta-kernel-rs/pull/1651 +[#1635]: https://github.com/delta-io/delta-kernel-rs/pull/1635 +[#1636]: https://github.com/delta-io/delta-kernel-rs/pull/1636 +[#1600]: https://github.com/delta-io/delta-kernel-rs/pull/1600 +[#1619]: https://github.com/delta-io/delta-kernel-rs/pull/1619 +[#1653]: https://github.com/delta-io/delta-kernel-rs/pull/1653 +[#1642]: https://github.com/delta-io/delta-kernel-rs/pull/1642 +[#1645]: https://github.com/delta-io/delta-kernel-rs/pull/1645 +[#1648]: https://github.com/delta-io/delta-kernel-rs/pull/1648 +[#1634]: https://github.com/delta-io/delta-kernel-rs/pull/1634 +[#1625]: https://github.com/delta-io/delta-kernel-rs/pull/1625 +[#1538]: https://github.com/delta-io/delta-kernel-rs/pull/1538 +[#1626]: https://github.com/delta-io/delta-kernel-rs/pull/1626 +[#1672]: https://github.com/delta-io/delta-kernel-rs/pull/1672 +[#1673]: https://github.com/delta-io/delta-kernel-rs/pull/1673 +[#1629]: https://github.com/delta-io/delta-kernel-rs/pull/1629 +[#1649]: https://github.com/delta-io/delta-kernel-rs/pull/1649 +[#1633]: https://github.com/delta-io/delta-kernel-rs/pull/1633 +[#1654]: https://github.com/delta-io/delta-kernel-rs/pull/1654 +[#1679]: https://github.com/delta-io/delta-kernel-rs/pull/1679 +[#1644]: 
https://github.com/delta-io/delta-kernel-rs/pull/1644 +[#1680]: https://github.com/delta-io/delta-kernel-rs/pull/1680 +[#1539]: https://github.com/delta-io/delta-kernel-rs/pull/1539 +[#1658]: https://github.com/delta-io/delta-kernel-rs/pull/1658 +[#1668]: https://github.com/delta-io/delta-kernel-rs/pull/1668 +[#1662]: https://github.com/delta-io/delta-kernel-rs/pull/1662 +[#1628]: https://github.com/delta-io/delta-kernel-rs/pull/1628 +[#1692]: https://github.com/delta-io/delta-kernel-rs/pull/1692 +[#1111]: https://github.com/delta-io/delta-kernel-rs/pull/1111 +[#1702]: https://github.com/delta-io/delta-kernel-rs/pull/1702 +[#1674]: https://github.com/delta-io/delta-kernel-rs/pull/1674 +[#1663]: https://github.com/delta-io/delta-kernel-rs/pull/1663 +[#1694]: https://github.com/delta-io/delta-kernel-rs/pull/1694 +[#1707]: https://github.com/delta-io/delta-kernel-rs/pull/1707 +[#1664]: https://github.com/delta-io/delta-kernel-rs/pull/1664 +[#1706]: https://github.com/delta-io/delta-kernel-rs/pull/1706 +[#1478]: https://github.com/delta-io/delta-kernel-rs/pull/1478 +[#1547]: https://github.com/delta-io/delta-kernel-rs/pull/1547 +[#1718]: https://github.com/delta-io/delta-kernel-rs/pull/1718 +[#1549]: https://github.com/delta-io/delta-kernel-rs/pull/1549 +[#1638]: https://github.com/delta-io/delta-kernel-rs/pull/1638 +[#1693]: https://github.com/delta-io/delta-kernel-rs/pull/1693 +[#1732]: https://github.com/delta-io/delta-kernel-rs/pull/1732 +[#1745]: https://github.com/delta-io/delta-kernel-rs/pull/1745 +[#1670]: https://github.com/delta-io/delta-kernel-rs/pull/1670 +[#1749]: https://github.com/delta-io/delta-kernel-rs/pull/1749 +[#1728]: https://github.com/delta-io/delta-kernel-rs/pull/1728 +[#1752]: https://github.com/delta-io/delta-kernel-rs/pull/1752 +[#1753]: https://github.com/delta-io/delta-kernel-rs/pull/1753 +[#1704]: https://github.com/delta-io/delta-kernel-rs/pull/1704 +[#1720]: https://github.com/delta-io/delta-kernel-rs/pull/1720 +[#1750]: https://github.com/delta-io/delta-kernel-rs/pull/1750 +[#1772]: https://github.com/delta-io/delta-kernel-rs/pull/1772 +[#1708]: https://github.com/delta-io/delta-kernel-rs/pull/1708 +[#1762]: https://github.com/delta-io/delta-kernel-rs/pull/1762 +[#1782]: https://github.com/delta-io/delta-kernel-rs/pull/1782 +[#1780]: https://github.com/delta-io/delta-kernel-rs/pull/1780 +[#1770]: https://github.com/delta-io/delta-kernel-rs/pull/1770 +[#1797]: https://github.com/delta-io/delta-kernel-rs/pull/1797 +[#1763]: https://github.com/delta-io/delta-kernel-rs/pull/1763 +[#1719]: https://github.com/delta-io/delta-kernel-rs/pull/1719 +[#1783]: https://github.com/delta-io/delta-kernel-rs/pull/1783 +[#1802]: https://github.com/delta-io/delta-kernel-rs/pull/1802 +[#1778]: https://github.com/delta-io/delta-kernel-rs/pull/1778 +[#1646]: https://github.com/delta-io/delta-kernel-rs/pull/1646 +[#1817]: https://github.com/delta-io/delta-kernel-rs/pull/1817 +[#1715]: https://github.com/delta-io/delta-kernel-rs/pull/1715 +[#1691]: https://github.com/delta-io/delta-kernel-rs/pull/1691 +[#1838]: https://github.com/delta-io/delta-kernel-rs/pull/1838 +[#1790]: https://github.com/delta-io/delta-kernel-rs/pull/1790 +[#1845]: https://github.com/delta-io/delta-kernel-rs/pull/1845 +[#1839]: https://github.com/delta-io/delta-kernel-rs/pull/1839 +[#1833]: https://github.com/delta-io/delta-kernel-rs/pull/1833 +[#1785]: https://github.com/delta-io/delta-kernel-rs/pull/1785 +[#1755]: https://github.com/delta-io/delta-kernel-rs/pull/1755 +[#1764]: 
https://github.com/delta-io/delta-kernel-rs/pull/1764 +[#1828]: https://github.com/delta-io/delta-kernel-rs/pull/1828 +[#1643]: https://github.com/delta-io/delta-kernel-rs/pull/1643 +[#1758]: https://github.com/delta-io/delta-kernel-rs/pull/1758 +[#1854]: https://github.com/delta-io/delta-kernel-rs/pull/1854 +[#1853]: https://github.com/delta-io/delta-kernel-rs/pull/1853 +[#1868]: https://github.com/delta-io/delta-kernel-rs/pull/1868 +[#1880]: https://github.com/delta-io/delta-kernel-rs/pull/1880 +[#1759]: https://github.com/delta-io/delta-kernel-rs/pull/1759 +[#1760]: https://github.com/delta-io/delta-kernel-rs/pull/1760 +[#1842]: https://github.com/delta-io/delta-kernel-rs/pull/1842 +[#1878]: https://github.com/delta-io/delta-kernel-rs/pull/1878 +[#1879]: https://github.com/delta-io/delta-kernel-rs/pull/1879 +[#1685]: https://github.com/delta-io/delta-kernel-rs/pull/1685 +[#1847]: https://github.com/delta-io/delta-kernel-rs/pull/1847 +[#1900]: https://github.com/delta-io/delta-kernel-rs/pull/1900 +[#1829]: https://github.com/delta-io/delta-kernel-rs/pull/1829 +[#1811]: https://github.com/delta-io/delta-kernel-rs/pull/1811 +[#1632]: https://github.com/delta-io/delta-kernel-rs/pull/1632 +[#1813]: https://github.com/delta-io/delta-kernel-rs/pull/1813 +[#1836]: https://github.com/delta-io/delta-kernel-rs/pull/1836 +[#1837]: https://github.com/delta-io/delta-kernel-rs/pull/1837 +[#1905]: https://github.com/delta-io/delta-kernel-rs/pull/1905 +[#1909]: https://github.com/delta-io/delta-kernel-rs/pull/1909 +[#1914]: https://github.com/delta-io/delta-kernel-rs/pull/1914 +[#1676]: https://github.com/delta-io/delta-kernel-rs/pull/1676 +[#1915]: https://github.com/delta-io/delta-kernel-rs/pull/1915 +[#1907]: https://github.com/delta-io/delta-kernel-rs/pull/1907 +[#1901]: https://github.com/delta-io/delta-kernel-rs/pull/1901 +[#1918]: https://github.com/delta-io/delta-kernel-rs/pull/1918 +[#1917]: https://github.com/delta-io/delta-kernel-rs/pull/1917 +[#1818]: https://github.com/delta-io/delta-kernel-rs/pull/1818 +[#1862]: https://github.com/delta-io/delta-kernel-rs/pull/1862 +[#1872]: https://github.com/delta-io/delta-kernel-rs/pull/1872 +[#1738]: https://github.com/delta-io/delta-kernel-rs/pull/1738 +[#1860]: https://github.com/delta-io/delta-kernel-rs/pull/1860 +[#1864]: https://github.com/delta-io/delta-kernel-rs/pull/1864 +[#1877]: https://github.com/delta-io/delta-kernel-rs/pull/1877 +[#1748]: https://github.com/delta-io/delta-kernel-rs/pull/1748 +[#1870]: https://github.com/delta-io/delta-kernel-rs/pull/1870 +[#1881]: https://github.com/delta-io/delta-kernel-rs/pull/1881 +[#1806]: https://github.com/delta-io/delta-kernel-rs/pull/1806 +[#1711]: https://github.com/delta-io/delta-kernel-rs/pull/1711 +[#1850]: https://github.com/delta-io/delta-kernel-rs/pull/1850 +[#1934]: https://github.com/delta-io/delta-kernel-rs/pull/1934 +[#1863]: https://github.com/delta-io/delta-kernel-rs/pull/1863 +[#1919]: https://github.com/delta-io/delta-kernel-rs/pull/1919 +[#1903]: https://github.com/delta-io/delta-kernel-rs/pull/1903 +[#1921]: https://github.com/delta-io/delta-kernel-rs/pull/1921 +[#1935]: https://github.com/delta-io/delta-kernel-rs/pull/1935 +[#1913]: https://github.com/delta-io/delta-kernel-rs/pull/1913 +[#1946]: https://github.com/delta-io/delta-kernel-rs/pull/1946 +[#1945]: https://github.com/delta-io/delta-kernel-rs/pull/1945 +[#1954]: https://github.com/delta-io/delta-kernel-rs/pull/1954 +[#1947]: https://github.com/delta-io/delta-kernel-rs/pull/1947 +[#1949]: 
https://github.com/delta-io/delta-kernel-rs/pull/1949 + + +## [v0.19.1](https://github.com/delta-io/delta-kernel-rs/tree/v0.19.0/) (2026-01-20) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.19.0...v0.19.1) + +### 🐛 Bug Fixes + +1. fix: deadlock for `TokioMultiThreadExecutor` ([#1606]) (see [#1605] for a description of the issue) + +[#1606]: https://github.com/delta-io/delta-kernel-rs/pull/1606 +[#1605]: https://github.com/delta-io/delta-kernel-rs/issues/1605 + +## [v0.19.0](https://github.com/delta-io/delta-kernel-rs/tree/v0.19.0/) (2025-12-19) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.18.2...v0.19.0) + +### 🏗️ Breaking changes +1. Error on surplus columns in output schema ([#1528]) +2. Remove `arrow-55` support (upgrate to arrow 56 or 57 required) ([#1507]) +3. Add a new `read_parquet_schema` function to the `ParquetHandler` trait ([#1498]) +4. Add a new `write_parquet_file` function to the `ParquetHandler` trait ([#1392]) +5. Make PartialEq for Scalar a physical comparison ([#1554]) +> [!CAUTION] +> Note this is a **breaking** behavior change. Code that previously relied on `PartialEq` as a +> logical comparison will still compile, but its runtime behavior will silently change to perform +> structural comparisons. +> +> This change moves the current definition of `PartialEq` for `Scalar` to a new `Scalar::logical_eq` +> method, and derives `PartialEq` (= physical comparison). +> +> We also remove PartialOrd for Scalar because it, too, would become physical (required to match +> PartialEq), and the result would be largely nonsensical. The logical comparison moves to +> Scalar::logical_partial_cmp instead. +> +> These changes are needed because today there's no reliable way to physically compare two scalars, +> and most comparisons are physical in practice. Only predicate evaluation needs logical +> comparisons, and that code already has a narrow waist. +6. Expose mod time in scan metadata callbacks: users must change the scan callback function to take +a struct which has all the previous arguments as members (and the mod time). See an example of the +needed change [here][change1]. For FFI code, your callback function needs an extra argument. See an +example of the change needed [here][change2]. ([#1565]) + +[change1]: https://github.com/delta-io/delta-kernel-rs/pull/1565/files#diff-3f44d0b7f8cfbe763cbe0cdbb2e2450a84833065de32fc102aef9d38b21b3daaR62 +[change2]: https://github.com/delta-io/delta-kernel-rs/pull/1565/files#diff-60493959e34a593831a075fbf2cc7a03a45d8f423f98e2d6b6a4a6ce479dd25bR54 + +### 🚀 Features / new APIs + +1. Initial Metrics implementation ([#1448]) +2. Build TableConfiguration for each version of change data feed ([#1531]) +3. Add ability for engines to specify a scan schema ([#1463]) +4. Add bidirectional expression round-trip test with visitor functions ([#1467]) +5. Add support for the materializePartitionColumns writer feature ([#1476]) +6. Allow comfy-table 7.2.x ([#1545]) +7. Rustls for uc-client ([#1533]) +8. Add file name metadata column to parquet reading. ([#1512]) +9. Add checkpoint example ([#1544]) +10. Commit Reader for processing commit actions ([#1499]) +11. Add CheckpointManifestReader to process sidecar files ([#1500]) +12. Distributed Log Replay Sequential Phase ([#1502]) +13. Passing schema from C, plus example/tests in C ([#1535]) +14. Support sidecar in inspect-table ([#1566]) +15. short-circuit coalesce evaluation when array has no nulls ([#1568]) + +### 🐛 Bug Fixes + +1. 
Force usage of ListedLogFiles::try_new() ([#1562]) +2. Improve parse_json performance by removing line-by-line parsing ([#1561]) + +### 🚜 Refactor + +1. Move ensure_read_support/ensure_write_support to operation entry points ([#1518]) +2. Migrate custom feature functions to generic is_feature_enabled/is_feature_supported ([#1519]) +3. Separate async handler logic from sync bridge logic ([#1435]) + +### 🧪 Testing + +1. Migrated protocol validation tests to table_configuration ([#1517]) +2. Move scan/mod.rs to scan/tests.rs and scan/test_utils.rs ([#1485]) + +### ⚙️ Chores/CI + +1. Remove macOS metadata from test data tarballs ([#1534]) +2. Make tests async if they rely on async ([#1438]) +3. Cleanup scalar eq workaround ([#1560]) + +### Other +1. Remove architecture.md from readme ([#1551]) + + +[#1517]: https://github.com/delta-io/delta-kernel-rs/pull/1517 +[#1448]: https://github.com/delta-io/delta-kernel-rs/pull/1448 +[#1518]: https://github.com/delta-io/delta-kernel-rs/pull/1518 +[#1528]: https://github.com/delta-io/delta-kernel-rs/pull/1528 +[#1507]: https://github.com/delta-io/delta-kernel-rs/pull/1507 +[#1531]: https://github.com/delta-io/delta-kernel-rs/pull/1531 +[#1463]: https://github.com/delta-io/delta-kernel-rs/pull/1463 +[#1485]: https://github.com/delta-io/delta-kernel-rs/pull/1485 +[#1534]: https://github.com/delta-io/delta-kernel-rs/pull/1534 +[#1438]: https://github.com/delta-io/delta-kernel-rs/pull/1438 +[#1467]: https://github.com/delta-io/delta-kernel-rs/pull/1467 +[#1519]: https://github.com/delta-io/delta-kernel-rs/pull/1519 +[#1435]: https://github.com/delta-io/delta-kernel-rs/pull/1435 +[#1476]: https://github.com/delta-io/delta-kernel-rs/pull/1476 +[#1498]: https://github.com/delta-io/delta-kernel-rs/pull/1498 +[#1545]: https://github.com/delta-io/delta-kernel-rs/pull/1545 +[#1392]: https://github.com/delta-io/delta-kernel-rs/pull/1392 +[#1551]: https://github.com/delta-io/delta-kernel-rs/pull/1551 +[#1533]: https://github.com/delta-io/delta-kernel-rs/pull/1533 +[#1512]: https://github.com/delta-io/delta-kernel-rs/pull/1512 +[#1544]: https://github.com/delta-io/delta-kernel-rs/pull/1544 +[#1554]: https://github.com/delta-io/delta-kernel-rs/pull/1554 +[#1499]: https://github.com/delta-io/delta-kernel-rs/pull/1499 +[#1560]: https://github.com/delta-io/delta-kernel-rs/pull/1560 +[#1500]: https://github.com/delta-io/delta-kernel-rs/pull/1500 +[#1502]: https://github.com/delta-io/delta-kernel-rs/pull/1502 +[#1535]: https://github.com/delta-io/delta-kernel-rs/pull/1535 +[#1565]: https://github.com/delta-io/delta-kernel-rs/pull/1565 +[#1566]: https://github.com/delta-io/delta-kernel-rs/pull/1566 +[#1562]: https://github.com/delta-io/delta-kernel-rs/pull/1562 +[#1561]: https://github.com/delta-io/delta-kernel-rs/pull/1561 +[#1568]: https://github.com/delta-io/delta-kernel-rs/pull/1568 + + +## [v0.18.2](https://github.com/delta-io/delta-kernel-rs/tree/v0.18.2/) (2025-12-03) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.18.1...v0.18.2) + + +### 🐛 Bug Fixes + +1. Address column mapping edge case in protocol validation ([#1513]) + +### 🧪 Testing +1. 
Remove arrow error message dependency from test ([#1529]) + + +[#1513]: https://github.com/delta-io/delta-kernel-rs/pull/1513 +[#1529]: https://github.com/delta-io/delta-kernel-rs/pull/1529 + + +## [v0.18.1](https://github.com/delta-io/delta-kernel-rs/tree/v0.18.1/) (2025-11-24) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.18.0...v0.18.1) + + +### 🚀 Features / new APIs + +1. Scan::execute no longer requires lifetime bound ([#1515]) +2. Migrate protocol validation to table_configuration ([#1411]) +3. Add Display for StructType, StructField, and MetadataColumnSpec ([#1494]) +5. Add EngineDataArrowExt and use it everywhere ([#1516]) +6. Implement builder for StructType ([#1492]) +7. Enable CDF for column-mapped tables ([#1510]) + +### 🧪 Testing + +1. Extract File Action tests ([#1365]) + + +[#1515]: https://github.com/delta-io/delta-kernel-rs/pull/1515 +[#1365]: https://github.com/delta-io/delta-kernel-rs/pull/1365 +[#1411]: https://github.com/delta-io/delta-kernel-rs/pull/1411 +[#1494]: https://github.com/delta-io/delta-kernel-rs/pull/1494 +[#1516]: https://github.com/delta-io/delta-kernel-rs/pull/1516 +[#1492]: https://github.com/delta-io/delta-kernel-rs/pull/1492 +[#1510]: https://github.com/delta-io/delta-kernel-rs/pull/1510 + + +## [v0.18.0](https://github.com/delta-io/delta-kernel-rs/tree/v0.18.0/) (2025-11-19) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.17.1...v0.18.0) + +### 🏗️ Breaking changes +1. New Engine StorageHandler head API ([#1465]) + - Engine API implementers must add the `head` API to StorageHandler which fetches metadata about a file in storage +2. Add remove_files API ([#1353]) + - The schema for scan rows (from `Scan::scan_metadata`) has been updated to include two + new fields: `fileConstantValues.tags` and `fileConstantValues.defaultRowCommitVersion`. + +### 🚀 Features / new APIs + +1. Add parser for iceberg compat properties ([#1466]) +2. Pass ColumnMappingMode to physical_name ([#1403]) +3. Allow visiting entire domain metadata ([#1384]) +4. Add Table Feature Info ([#1462]) +5. *(FFI)* Snapshot log tail FFI ([#1379]) +6. Add generic is_feature_supported and is_feature_enabled methods to TableConfiguration ([#1405]) +7. Un-deprecate ArrayData.array_elements() ([#1493]) +8. Allow writes to CDF tables for add-only, remove-only, and non-data-change transactions ([#1490]) +9. *(catalog-managed)* UCCommitter ([#1418]) + +### 🐛 Bug Fixes + +1. Eliminate endless busy looping in read_json_files on failed read ([#1489]) +2. Handle array/map types in ffi schema example and test ([#1497]) + +### 📚 Documentation + +1. Fix docs for rustc 1.92+ ([#1470]) + +### 🚜 Refactor + +1. Harmonize checkpoint and log compaction iterators ([#1436]) +2. Avoid overly complex itertools methods in log listing code ([#1434]) +3. Simplify creation of default engine in tests ([#1437]) + +### 🧪 Testing + +1. 
Add tests for StructField.physical_name ([#1469]) + +[#1466]: https://github.com/delta-io/delta-kernel-rs/pull/1466 +[#1403]: https://github.com/delta-io/delta-kernel-rs/pull/1403 +[#1465]: https://github.com/delta-io/delta-kernel-rs/pull/1465 +[#1436]: https://github.com/delta-io/delta-kernel-rs/pull/1436 +[#1470]: https://github.com/delta-io/delta-kernel-rs/pull/1470 +[#1384]: https://github.com/delta-io/delta-kernel-rs/pull/1384 +[#1462]: https://github.com/delta-io/delta-kernel-rs/pull/1462 +[#1474]: https://github.com/delta-io/delta-kernel-rs/pull/1474 +[#1379]: https://github.com/delta-io/delta-kernel-rs/pull/1379 +[#1434]: https://github.com/delta-io/delta-kernel-rs/pull/1434 +[#1437]: https://github.com/delta-io/delta-kernel-rs/pull/1437 +[#1353]: https://github.com/delta-io/delta-kernel-rs/pull/1353 +[#1489]: https://github.com/delta-io/delta-kernel-rs/pull/1489 +[#1405]: https://github.com/delta-io/delta-kernel-rs/pull/1405 +[#1469]: https://github.com/delta-io/delta-kernel-rs/pull/1469 +[#1493]: https://github.com/delta-io/delta-kernel-rs/pull/1493 +[#1497]: https://github.com/delta-io/delta-kernel-rs/pull/1497 +[#1490]: https://github.com/delta-io/delta-kernel-rs/pull/1490 +[#1418]: https://github.com/delta-io/delta-kernel-rs/pull/1418 + + +## [v0.17.1](https://github.com/delta-io/delta-kernel-rs/tree/v0.17.1/) (2025-11-13) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.17.0...v0.17.1) + + +### 📚 Documentation + +1. Fix docs for rustc 1.92+ ([#1470]) + + +[#1470]: https://github.com/delta-io/delta-kernel-rs/pull/1470 + + +## [v0.17.0](https://github.com/delta-io/delta-kernel-rs/tree/v0.17.0/) (2025-11-10) + +[Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.16.0...v0.17.0) + +### 🏗️ Breaking changes +1. (catalog-managed): New copy_atomic StorageHandler method ([#1400]) + - StorageHandler implementers must implement the copy_atomic method. +2. Make expression and predicate evaluator constructors fallible ([#1452]) + - Predicate and expression evaluator constructors return DeltaResult. +3. (catalog-managed): add `log_tail` to `SnapshotBuilder` ([#1290]) + - `into_scan_builder()` no longer exists on `Snapshot`. Must create an `Arc` +4. Arrow 57, MSRV 1.85+ ([#1424]) + - The Minimum Required Rust Version to use kernel-rs is now 1.85. +5. Add ffi for idempotent write primitives ([#1191]) + - get_transform_for_row now returns new FFI-safe OptionalValue instead of Option +6. Rearchitect `CommitResult` ([#1343]) + - CommitResult is now an enum containing CommittedTransaction, ConflictedTransaction, + and RetryableTransaction +7. Add with_data_change to transaction ([#1281]) + - Engines must use with_data_change on the transaction level instead of + passing it to the method. add_files_schema is moved to be scoped on a the + transaction. +8. *(catalog-managed)* Introduce Committer (with FileSystemCommitter) ([#1349]) + - Constructing a transaction now requires a committer. Ex: FileSystemCommitter +9. Switch scan.execute to return pre-filtered data ([#1429]) + - Connectors no longer need to filter data that is returned from `scan.execute()` + + +### 🚀 Features / new APIs + +1. Add visit_string_map to the ffi ([#1342]) +2. Add tags field to LastCheckpointHint ([#1455]) +3. Support writing domain metadata (1/2]) ([#1274]) +4. Change input to write_json_file to be FilteredEngineData ([#1312]) +5. Convert DV `storage_type` to enum ([#1366]) +6. Add latest_commit_file field to LogSegment ([#1364]) +7. 
No staged commits in checkpoint/compaction ([#1374]) +8. Generate In Commit Timestamp on write ([#1314]) +9. *(catalog-managed)* Add `uc-catalog` crate with load_table ([#1324]) +10. Snapshot should not expose delta implementation details ([#1339]) +11. *(catalog-managed)* Uc-client commit API ([#1399]) +12. Add row tracking support ([#1375]) +13. Support writing domain metadata (2/2]) ([#1275]) +14. Add parser for enableTypeWidening table property ([#1456]) +15. Implement `From` trait `EngineData` into `FilteredEngineData` ([#1397]) +16. Unify TableFeatures followups ([#1404]) +18. Accept nullable values in "tags" HashMap in `Add` action ([#1395]) +19. Enable writes to CDF enabled tables only if append only is supported ([#1449]) +20. Add deletion vector file writer ([#1425]) +21. Allow converting `bytes::Bytes` into a Binary Scalar ([#1373]) +22. CDF API for FFI ([#1335]) +23. Add optional stats field to remove action ([#1390]) +24. Modify read_actions to not require callers to know details about checkpoints. ([#1407]) +25. Add Accessor for `Binary` data ([#1383]) + +### 🐛 Bug Fixes + +1. Change InCommitTimestamp enablement getter function ([#1357]) +2. Be adaptive to the log schema changing in inspect-table ([#1368]) +3. Typo on variable name for ScanTransformFieldClassifierieldClassifier ([#1394]) +4. Pin cbindgen to 0.29.0 ([#1412]) +5. Unpin cbindgen ([#1414]) +6. Don't return errors from ParsedLogPath::try_from ([#1433]) +7. Doc issue, stray ' ([#1445]) +8. Replace todo!() with proper error handling in deletion vector ([#1447]) + +### 📚 Documentation + +1. Fix scan_metadata docs ([#1450]) + +### 🚜 Refactor + +1. Pull out transform spec utils and definitions ([#1326]) +2. Use expression transforms in change data feed ([#1330]) +3. Remove raw pointer indexing and add unit tests for RowIndexBuilder ([#1334]) +4. Make `Metadata` fields private ([#1347]) +5. Remove storing UUID in LogPathFileType::UuidCheckpoint ([#1317]) +6. Consolidate physical/logical info into StateInfo ([#1350]) +7. Consolidate regular scan and CDF scan field handling ([#1359]) +8. Make get_cdf_transform_expr return Option ([#1401]) +9. Separate domain metadata additions and removals ([#1421]) +10. Unify Reader/WriterFeature into a single TableFeature ([#1345]) +11. Put `DataFileMetadata::as_record_batch` under `#[internal_api]` ([#1409]) +12. Create static variables for magic values in deletion vector ([#1446]) + +### 🧪 Testing + +1. E2e test for log compaction ([#1308]) +2. Tombstone expiration e2e test for log compaction ([#1341]) +3. Add memory tests (via DHAT) ([#1009]) +4. One liner to skip read_table_version_hdfs ([#1428]) + +### ⚙️ Chores/CI + +1. Add CI for examples ([#1393]) +2. Small typo's in `log_segment.rs` ([#1396]) +3. Reduce log verbosity when encountering non-standard files in _delta_log ([#1416]) +4. Follow up on TODO in `log_replay.rs` ([#1408]) +5. Remove a stray comment in the kernel visitor ([#1457]) +6. Allow passing more on the command line for all the cli examples ([#1352]) +7. add back arrow-55 support ([#1458]) +8. 
Rename log_schema to commit_schema ([#1419]) + +[#1326]: https://github.com/delta-io/delta-kernel-rs/pull/1326 +[#1308]: https://github.com/delta-io/delta-kernel-rs/pull/1308 +[#1342]: https://github.com/delta-io/delta-kernel-rs/pull/1342 +[#1290]: https://github.com/delta-io/delta-kernel-rs/pull/1290 +[#1274]: https://github.com/delta-io/delta-kernel-rs/pull/1274 +[#1330]: https://github.com/delta-io/delta-kernel-rs/pull/1330 +[#1334]: https://github.com/delta-io/delta-kernel-rs/pull/1334 +[#1347]: https://github.com/delta-io/delta-kernel-rs/pull/1347 +[#1312]: https://github.com/delta-io/delta-kernel-rs/pull/1312 +[#1352]: https://github.com/delta-io/delta-kernel-rs/pull/1352 +[#1357]: https://github.com/delta-io/delta-kernel-rs/pull/1357 +[#1317]: https://github.com/delta-io/delta-kernel-rs/pull/1317 +[#1341]: https://github.com/delta-io/delta-kernel-rs/pull/1341 +[#1350]: https://github.com/delta-io/delta-kernel-rs/pull/1350 +[#1009]: https://github.com/delta-io/delta-kernel-rs/pull/1009 +[#1366]: https://github.com/delta-io/delta-kernel-rs/pull/1366 +[#1364]: https://github.com/delta-io/delta-kernel-rs/pull/1364 +[#1368]: https://github.com/delta-io/delta-kernel-rs/pull/1368 +[#1339]: https://github.com/delta-io/delta-kernel-rs/pull/1339 +[#1373]: https://github.com/delta-io/delta-kernel-rs/pull/1373 +[#1359]: https://github.com/delta-io/delta-kernel-rs/pull/1359 +[#1343]: https://github.com/delta-io/delta-kernel-rs/pull/1343 +[#1374]: https://github.com/delta-io/delta-kernel-rs/pull/1374 +[#1314]: https://github.com/delta-io/delta-kernel-rs/pull/1314 +[#1394]: https://github.com/delta-io/delta-kernel-rs/pull/1394 +[#1393]: https://github.com/delta-io/delta-kernel-rs/pull/1393 +[#1396]: https://github.com/delta-io/delta-kernel-rs/pull/1396 +[#1281]: https://github.com/delta-io/delta-kernel-rs/pull/1281 +[#1324]: https://github.com/delta-io/delta-kernel-rs/pull/1324 +[#1401]: https://github.com/delta-io/delta-kernel-rs/pull/1401 +[#1412]: https://github.com/delta-io/delta-kernel-rs/pull/1412 +[#1349]: https://github.com/delta-io/delta-kernel-rs/pull/1349 +[#1407]: https://github.com/delta-io/delta-kernel-rs/pull/1407 +[#1414]: https://github.com/delta-io/delta-kernel-rs/pull/1414 +[#1416]: https://github.com/delta-io/delta-kernel-rs/pull/1416 +[#1191]: https://github.com/delta-io/delta-kernel-rs/pull/1191 +[#1399]: https://github.com/delta-io/delta-kernel-rs/pull/1399 +[#1375]: https://github.com/delta-io/delta-kernel-rs/pull/1375 +[#1419]: https://github.com/delta-io/delta-kernel-rs/pull/1419 +[#1275]: https://github.com/delta-io/delta-kernel-rs/pull/1275 +[#1400]: https://github.com/delta-io/delta-kernel-rs/pull/1400 +[#1335]: https://github.com/delta-io/delta-kernel-rs/pull/1335 +[#1397]: https://github.com/delta-io/delta-kernel-rs/pull/1397 +[#1421]: https://github.com/delta-io/delta-kernel-rs/pull/1421 +[#1345]: https://github.com/delta-io/delta-kernel-rs/pull/1345 +[#1428]: https://github.com/delta-io/delta-kernel-rs/pull/1428 +[#1404]: https://github.com/delta-io/delta-kernel-rs/pull/1404 +[#1433]: https://github.com/delta-io/delta-kernel-rs/pull/1433 +[#1445]: https://github.com/delta-io/delta-kernel-rs/pull/1445 +[#1408]: https://github.com/delta-io/delta-kernel-rs/pull/1408 +[#1429]: https://github.com/delta-io/delta-kernel-rs/pull/1429 +[#1450]: https://github.com/delta-io/delta-kernel-rs/pull/1450 +[#1395]: https://github.com/delta-io/delta-kernel-rs/pull/1395 +[#1390]: https://github.com/delta-io/delta-kernel-rs/pull/1390 +[#1449]: 
https://github.com/delta-io/delta-kernel-rs/pull/1449 +[#1425]: https://github.com/delta-io/delta-kernel-rs/pull/1425 +[#1409]: https://github.com/delta-io/delta-kernel-rs/pull/1409 +[#1452]: https://github.com/delta-io/delta-kernel-rs/pull/1452 +[#1424]: https://github.com/delta-io/delta-kernel-rs/pull/1424 +[#1447]: https://github.com/delta-io/delta-kernel-rs/pull/1447 +[#1456]: https://github.com/delta-io/delta-kernel-rs/pull/1456 +[#1455]: https://github.com/delta-io/delta-kernel-rs/pull/1455 +[#1457]: https://github.com/delta-io/delta-kernel-rs/pull/1457 +[#1458]: https://github.com/delta-io/delta-kernel-rs/pull/1458 +[#1383]: https://github.com/delta-io/delta-kernel-rs/pull/1383 +[#1446]: https://github.com/delta-io/delta-kernel-rs/pull/1446 + + ## [v0.16.0](https://github.com/delta-io/delta-kernel-rs/tree/v0.16.0/) (2025-09-19) [Full Changelog](https://github.com/delta-io/delta-kernel-rs/compare/v0.15.2...v0.16.0) @@ -1312,4 +2101,4 @@ validation. [\#435] ## [v0.1.0](https://github.com/delta-io/delta-kernel-rs/tree/v0.1.0/) (2024-06-12) -Initial public release \ No newline at end of file +Initial public release diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000..6872934fdb --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,245 @@ +# CLAUDE.md + +## Project Overview + +Delta-kernel-rs is a Rust library for building Delta Lake connectors. It encapsulates the +Delta protocol so connectors can read and write Delta tables without understanding protocol +internals. Kernel never does I/O directly -- it defines _what_ to do via its APIs +(`Snapshot`, `Scan`, `Transaction`) and delegates _how_ to the `Engine` trait. + +Current capabilities: table reads with predicates, data skipping, deletion vectors, change +data feed, checkpoints (V1 & V2), log compaction, blind append writes, table creation +(including clustered tables), and catalog-managed table support. 
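+
+As a rough orientation, the snippet below sketches the basic read flow implied by the API split
+above (Snapshot -> Scan -> data). It is illustrative only: module paths, the `scan_builder()`
+method name, and the exact return types are assumptions rather than verified signatures -- see
+`CLAUDE/architecture.md` and the crate docs for the real APIs.
+
+```rust
+// Illustrative read-path sketch; method names and paths are assumptions, not verified APIs.
+use std::sync::Arc;
+
+use delta_kernel::{DeltaResult, Engine, Snapshot};
+
+fn read_table(url: url::Url, engine: &dyn Engine) -> DeltaResult<()> {
+    // Snapshot is the entry point: an immutable view of the table at one version.
+    let snapshot = Arc::new(Snapshot::builder_for(url).build(engine)?);
+    // Build a Scan from the snapshot (`scan_builder()` is an assumed method name).
+    let scan = snapshot.scan_builder().build()?;
+    // `execute` drives log replay, data skipping, file reads, and physical-to-logical
+    // transforms, yielding opaque EngineData batches for the connector to consume.
+    for batch in scan.execute(engine)? {
+        let _data = batch?;
+    }
+    Ok(())
+}
+```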
+ +## Build & Test Commands + +```bash +# Build +cargo build --workspace --all-features + +# Run all tests (prefer nextest over cargo test) +cargo nextest run --workspace --all-features + +# Run tests for a specific crate +cargo nextest run -p delta_kernel --all-features + +# Run a single test in a specific crate (fastest -- only compiles that crate) +cargo nextest run -p delta_kernel --lib --all-features test_name_here + +# Run a test by name, searching all crates (slow -- compiles everything) +cargo nextest run --workspace --all-features test_name_here + +# Format, lint, and doc check (always run after code changes) +cargo fmt \ + && cargo clippy --workspace --benches --tests --all-features -- -D warnings \ + && cargo doc --workspace --all-features --no-deps + +# Workspace no-default-features lint for crates that depend on kernel's Arrow APIs +cargo clippy --workspace --no-default-features --features arrow \ + --exclude delta_kernel --exclude delta_kernel_ffi --exclude delta_kernel_derive --exclude delta_kernel_ffi_macros -- -D warnings + +# Quick pre-push check (mimics CI) +cargo fmt \ + && cargo clippy --workspace --benches --tests --all-features -- -D warnings \ + && cargo doc --workspace --all-features --no-deps \ + && cargo nextest run --workspace --all-features +``` + +### Crate Names for `-p` Flag + +| Crate | Directory | Description | +|------------------------------------------|--------------------------------------------|---------------------------------------------------------| +| `delta_kernel` | `kernel/` | Core library | +| `delta_kernel_ffi` | `ffi/` | C/C++ FFI bindings | +| `delta_kernel_derive` | `derive-macros/` | Proc macros | +| `acceptance` | `acceptance/` | Acceptance tests (DAT) | +| `test_utils` | `test-utils/` | Shared test utilities | +| `feature_tests` | `feature-tests/` | Feature flag tests | +| `delta-kernel-unity-catalog` | `delta-kernel-unity-catalog/` | Unity Catalog integration (UCKernelClient, UCCommitter) | +| `unity-catalog-delta-client-api` | `unity-catalog-delta-client-api/` | Unity Catalog client traits and shared models | +| `unity-catalog-delta-rest-client` | `unity-catalog-delta-rest-client/` | Unity Catalog REST client | + +### Feature Flags + +- `default-engine` / `default-engine-rustls` / `default-engine-native-tls` -- async + Arrow/Tokio engine (pick one TLS backend) +- `arrow`, `arrow-XX`, `arrow-YY` -- Arrow version selection (kernel tracks the latest two + major Arrow releases; `arrow` defaults to latest). Kernel itself does not depend on Arrow, + but default-engine does. +- `arrow-conversion`, `arrow-expression` -- Arrow interop (auto-enabled by default engine) +- `prettyprint` -- enables Arrow pretty-print helpers (primarily test/example oriented) +- `clustered-table` -- clustered table write support (experimental) +- `internal-api` -- unstable APIs like `parallel_scan_metadata`. Items are marked with the + `#[internal_api]` proc macro attribute. +- `test-utils`, `integration-test` -- development only (`test-utils` enables `prettyprint`) + +## Architecture at a Glance + +**Snapshot** is the entry point for everything -- an immutable view of a table at a specific +version. From it you build a `Scan` (reads) or `Transaction` (writes). + +**Read path:** `Snapshot` -> `ScanBuilder` -> `Scan` -> data. Three execution paths: +`execute()` (simple), `scan_metadata()` (advanced/distributed), +`parallel_scan_metadata()` (two-phase distributed log replay). + +**Write path:** `Snapshot` -> `Transaction` -> `commit()`. 
Kernel provides `WriteContext`, +assembles commit actions, enforces protocol compliance, delegates atomic commit to a +`Committer`. + +**Engine trait:** five handlers (`StorageHandler`, `JsonHandler`, `ParquetHandler`, +`EvaluationHandler`, optional `MetricsReporter`). `DefaultEngine` lives in +`kernel/src/engine/default/`. + +**EngineData:** opaque columnar data interface. IMPORTANT: never access `EngineData` columns +directly -- always use the visitor pattern (`visit_rows` with typed `GetData` accessors). + +## Testing + +- **Unit tests** test internal APIs and module internals. It is fine to use public APIs + like `create_table` in a unit test as setup (e.g. to create a table for testing reads, + writes, or state loading). +- **Integration tests** exercise only public APIs end-to-end. See `kernel/tests/README.md` + for a catalog of available test tables (schema, protocol, features, and which tests use + them). Consult it before creating new test data to avoid duplication. +- Consider how the feature interacts with Delta table features (see Protocol TLDR below). +- Consider write paths: normal commits, checkpointing, CRC files, log compaction files. +- Consider read paths: loading a snapshot from scratch at latest version, at a specific + version (time travel), and updating from an existing snapshot. +- Consider table state: only deltas (`00.json` to `0N.json`), after a checkpoint, after + a CRC (`0N.crc`) file, after log compaction, etc. +- Prefer descriptive test names over doc comments. Encode the scenario and expected + behavior in the test name. Only add a test doc comment when the intent is too + verbose or complex to express succinctly in the name. +- Use `rstest` to parameterize tests that share the same logic but differ in setup + or inputs. Prefer `#[case]` over duplicating test functions. When parameters are + independent and form a cartesian product, prefer `#[values]` over enumerating + every combination with `#[case]`. +- Reuse helpers from `test_utils` instead of writing custom ones when possible. +- **`add_commit` and table setup in tests:** `add_commit` takes a `table_root` string and + resolves it to an absolute object-store path. The `table_root` must be a proper URL string + with a trailing slash (e.g. `"memory:///"`, `"file:///tmp/my_table/"`). Avoid using the + `Url` type directly -- most test helpers and kernel APIs accept `impl AsRef`, so pass + URL strings instead. When using local storage, use an un-prefixed store + (`LocalFileSystem::new()`) with a `file:///` URL string. Do NOT use + `LocalFileSystem::new_with_prefix()` with `add_commit` -- the prefix causes double-nesting + because `add_commit` already resolves the full path from the URL. For in-memory tests, use + `InMemory::new()` with `"memory:///"`. Always use the same `table_root` URL string for + both `add_commit` (writing log files) and `snapshot`/`Snapshot::try_new` (reading the + table). Always include a trailing slash in directory URLs to ensure correct path joining. + +## Protocol TLDR + +The [Delta protocol spec](https://raw.githubusercontent.com/delta-io/delta/master/PROTOCOL.md) +is the source of truth. 
Key concepts: + +- **Actions** -- atomic units of a transaction: Metadata, Add File, Remove File, Add CDC + File, Protocol, CommitInfo, Domain Metadata, Sidecar, Checkpoint Metadata +- **Log structure** -- JSON commit files, checkpoints (V1 parquet, V2 multi-part), log + compaction files, version checksum (CRC) files, `_last_checkpoint` +- **Protocol versioning** -- (readerVersion, writerVersion) pair. At (3, 7) switches to + explicit table features via `readerFeatures`/`writerFeatures` arrays. Features cannot be + removed once added. +- **Data skipping** -- per-file column statistics (min, max, null count, row count) with + tight/wide bounds +- **Schemas** -- JSON serialization format for StructType/StructField/DataType +- **Stats and partition values** -- per-file column statistics (min, max, nullCount) and + partition values are stored as JSON strings in Add file actions. The stats JSON structure + mirrors the table schema. See the protocol spec sections on "Per-file Statistics" and + "Partition Value Serialization" for the exact formats. + +**Table features**: + +- Writer: `appendOnly`, `invariants`, `checkConstraints`, `generatedColumns`, + `allowColumnDefaults`, `changeDataFeed`, `identityColumns`, `rowTracking`, + `domainMetadata`, `icebergCompatV1`, `icebergCompatV2`, `clustering`, + `inCommitTimestamp` +- Reader + writer: `columnMapping`, `deletionVectors`, `timestampNtz`, + `v2Checkpoint`, `vacuumProtocolCheck`, `variantType`, `variantType-preview`, + `typeWidening` + +Keep this list updated when new protocol features are added to kernel. + +## Common Gotchas + +- **EngineData is opaque:** Never downcast to `ArrowEngineData` or any concrete type + in production code (ok in tests). Never assume one batch per file -- always iterate. +- **Column mapping:** Physical column names can differ from logical names. Always use + the schema from `Snapshot::schema()` for user data columns. Metadata/system schema + column names (defined by the protocol) are not subject to column mapping. +- **Transforms:** Generic recursive schema and expression transform traits and helpers + are in `kernel/src/transforms/`. + +## Code Style / Documentation + +- Line width is 100 characters. Wrap comments and string literals at 100, not 80. +- Use `==` as a visual section divider in comments (e.g. `// === Helpers ===` or + `// ============`). +- MUST include doc comments for all public functions, structs, enums, and methods. +- MUST document function parameters, return values, and errors. +- Keep comments up-to-date with code changes. +- Include examples in doc comments for complex functions only. +- NEVER use emoji or unicode in comments that emulates emoji (e.g. special arrows, + checkmarks). Use ASCII equivalents (`->`, `=>`, etc.) instead. +- Comments should be concise and non-repetitive -- find the right place to say it once. +- Comments should never include temporal references -- only refer to current code and + design, not past iterations. +- Doc comments focus on "what" (contract with caller) more than "how" (implementation), + unless the "how" meaningfully impacts the "what". +- Code comments state intent and explain "why" -- don't restate what the code self-documents. +- Place `use` imports at the top of the file (for non-test code) or at the top of the + `mod tests` block (for test code) -- never inside function bodies. +- NEVER panic in production code -- use errors instead. Panicking + (including `unwrap()`, `expect()`, `panic!()`, `unreachable!()`, etc) is acceptable in test code only. 
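+
+To make the doc-comment requirements above concrete, here is a small, hypothetical example (the
+function and its lookup helper are invented purely for illustration) showing parameters, return
+value, and errors documented in the expected style:
+
+```rust
+/// Returns the physical column name for `logical_name` under the table's column mapping mode.
+///
+/// # Parameters
+/// - `logical_name`: the column name as it appears in the logical table schema.
+///
+/// # Returns
+/// The physical column name to use when reading or writing Parquet data.
+///
+/// # Errors
+/// Returns an error if `logical_name` is not present in the table schema.
+pub fn physical_name_for(logical_name: &str) -> Result<String, String> {
+    // Delegate to the schema lookup; callers never construct physical names themselves.
+    schema_lookup(logical_name).ok_or_else(|| format!("unknown column: {logical_name}"))
+}
+
+// Hypothetical stand-in for a real schema lookup.
+fn schema_lookup(name: &str) -> Option<String> {
+    (name == "id").then(|| "col-1234".to_string())
+}
+```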
+ +## Pull Requests + +**Title:** use conventional commit format, lowercase after prefix, no period at the end. +Allowed types: `feat`, `fix`, `refactor`, `chore`, `docs`, `perf`, `test`, `ci`. +If the pull request contains a breaking change, the type must have a `!` suffix. +Examples: `feat: add checkpoint stream support`, `fix: handle empty log segment`, +`refactor: extract common log replay logic` +Breaking change examples: `feat!: make_physical takes column mapping and sets parquet field ids`, +`chore!: remove the arrow-55 feature` + +**Description:** follow the template in `.github/PULL_REQUEST_TEMPLATE.md`. Error on the +side of simplicity -- don't list every change. Focus on key API changes, functionality, +and data flow. Keep it concise. + +### CI Jobs and Github Actions + +**Supply chain security:** every `cargo` command in CI that resolves dependencies MUST use +`--locked` to enforce the committed `Cargo.lock`. This prevents CI from silently picking up +a newer (potentially compromised) transitive dependency. If `Cargo.lock` is out of sync with +`Cargo.toml`, the build fails immediately, forcing dependency changes to be explicit and +reviewable. See the top-level comment in `build.yml` for full rationale. Commands exempt from +`--locked`: `cargo fmt` (no dep resolution), `cargo msrv verify/show` (wrapper tool), +`cargo miri setup` (tooling setup). + +Ensure that when writing any github action you are considering safety including thinking of +and mitigating common attack vectors such expression injection and pull request target attacks. + +Example: +```yaml +# The code below is vulnerable to expression injection +run: | + echo "Comment: ${{ github.event.comment.body }}" + +# To mitigate instead use environment variables +env: + COMMENT_BODY: ${{ github.event.comment.body }} +run: | + echo "Comment: $COMMENT_BODY" +``` + +## Deep Context + +Read these when relevant to the task at hand: +- `CLAUDE/architecture.md` -- kernel architecture: snapshot loading, read/write paths, + engine trait system, EngineData, key modules, catalog-managed tables +- Always cross-check protocol behavior against the + [Delta protocol spec](https://raw.githubusercontent.com/delta-io/delta/master/PROTOCOL.md) + +**Keeping docs current:** If you notice anything inaccurate in these docs -- renamed +structs, traits, functions, modules, crates, APIs, stale data flows, wrong file paths -- +inform the user so they can be updated. After major changes, update this file, +`CLAUDE/architecture.md`, `ffi/CLAUDE.md`, and any relevant `/CLAUDE.md` files. diff --git a/CLAUDE/architecture.md b/CLAUDE/architecture.md new file mode 100644 index 0000000000..2a8a3641f8 --- /dev/null +++ b/CLAUDE/architecture.md @@ -0,0 +1,151 @@ +# Architecture + +## Layered Design + +``` +Compute Engine (Spark, Flink, DuckDB, Polars, ...) + -> Your Delta Connector (implements compute engine's DataSource API) + -> Delta Kernel (snapshot loading, scan orchestration, write transaction coordination, + log replay, data skipping, schema enforcement, predicate evaluation, + physical-to-logical transforms, deletion vector handling, checkpointing) + -> Engine trait (abstraction for I/O and compute) + -> DefaultEngine (Arrow + object_store + Tokio) or custom engine + -> Storage (local FS, S3, GCS, Azure, HDFS, ...) +``` + +Kernel handles the Delta protocol; connectors handle execution, distribution, and data flow. +Kernel never does I/O directly -- it delegates all I/O to the Engine trait. 
Kernel also avoids +making memory allocation and scheduling decisions, leaving those to the connector and engine. +For example, during log replay or checkpoint writes, kernel receives opaque `EngineData` batches, +inspects them via the visitor pattern, updates a selection vector, and hands them back to the +engine -- it never deserializes the full batch into in-memory structs. + +## Snapshot + +`Snapshot` (`kernel/src/snapshot.rs`) is the entry point for everything. It is an immutable +point-in-time view of a Delta table at a specific version, providing the table schema, metadata, +properties, and version number. + +Built via `Snapshot::builder_for(url).build(engine)` (latest version) or +`.at_version(v).build(engine)` (specific version). For catalog-managed tables, +`.with_log_tail(commits)` supplies recent unpublished commits from the catalog. + +**Snapshot loading internals:** +1. **LogSegment** (`kernel/src/log_segment/`) -- discovers commits + checkpoints for the + requested version, replays Protocol and Metadata (`protocol_metadata_replay.rs`), and + replays domain metadata (`domain_metadata_replay.rs`) +2. **Log replay** (`kernel/src/log_replay.rs`) -- file-action deduplication via + `FileActionDeduplicator` and `LogReplayProcessor` trait (distinct from Protocol/Metadata + replay above) + +From a snapshot you can: read the schema and table properties, build a `Scan` to read data, +start a `Transaction` to write data, or create a checkpoint. + +## Read Path + +`Snapshot` -> `ScanBuilder` -> `Scan` -> data + +The scan pipeline: log replay (build active file list) -> data skipping (prune files via stats +and partition values) -> file reading -> physical-to-logical transform (partition values, +column mapping, schema evolution) -> deletion vector filtering. + +**Key modules** (`kernel/src/scan/`): `log_replay.rs` (reconcile Add/Remove into active file +set), `data_skipping.rs` (rewrite predicates against min/max/nullCount stats and partition values). + +**Execution paths:** +- `scan.execute(engine)` -- kernel handles everything end-to-end, returns `EngineData` +- `scan.scan_metadata(engine)` -- returns file list + transforms; connector reads files and + calls `transform_to_logical` / `DvInfo::get_selection_vector` +- `scan.parallel_scan_metadata(engine)` -- two-phase distributed log replay (`pub(crate)`, + requires `internal-api` feature) + +## Write Path + +`Snapshot` -> `Transaction` -> commit + +The kernel coordinates the write transaction: it provides the write context (target directory, +physical schema, stats columns), assembles commit actions (CommitInfo, Add files), enforces +protocol compliance (table features, schema validation), and delegates the atomic commit to a +`Committer`. + +**Steps:** +1. Create `Transaction` from a snapshot with a `Committer` (e.g. `FileSystemCommitter`) +2. Get `WriteContext` for target dir, physical schema, and stats columns +3. Write Parquet files (via engine), collect file metadata +4. Register files via `txn.add_files(metadata)` +5. Commit: returns `CommittedTransaction`, `ConflictedTransaction`, or `RetryableTransaction` + +- **Transaction** (`kernel/src/transaction/`) -- blind append writes, table creation (via + `create_table` builder, including clustered tables via `DataLayout`) +- **Committer** (`kernel/src/committer/`) -- commit coordination. `FileSystemCommitter` for + filesystem tables (atomic put-if-absent to `_delta_log/`); custom `Committer` implementations + for catalog-managed tables (staging, ratifying, publishing). 
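+
+The sketch below ties the five steps above together. It is a rough illustration, not the crate's
+exact API: the methods for creating the transaction and fetching the `WriteContext`, the
+`write_data_files` helper, and the precise `CommitResult` variant shapes are assumptions; only
+the overall flow (Committer -> WriteContext -> engine writes -> `add_files` -> commit) follows
+the description above.
+
+```rust
+// Rough write-path sketch; items marked "assumed" or "hypothetical" are not verified APIs.
+fn blind_append(
+    snapshot: std::sync::Arc<delta_kernel::Snapshot>,
+    engine: &dyn delta_kernel::Engine,
+) -> delta_kernel::DeltaResult<()> {
+    // 1. Start a Transaction with a Committer (FileSystemCommitter for filesystem tables).
+    let mut txn = snapshot.transaction(FileSystemCommitter::new())?; // assumed constructor
+    // 2. Ask kernel for the WriteContext: target directory, physical schema, stats columns.
+    let write_context = txn.get_write_context(); // assumed method name
+    // 3. Write Parquet data files through the engine using the physical schema and collect
+    //    per-file metadata (path, size, stats). `write_data_files` is a hypothetical helper.
+    let file_metadata = write_data_files(engine, &write_context)?;
+    // 4. Register the new files on the transaction.
+    txn.add_files(file_metadata);
+    // 5. Commit. The result distinguishes committed, conflicted, and retryable outcomes.
+    match txn.commit(engine)? {
+        CommitResult::CommittedTransaction(_) => { /* success: new version is visible */ }
+        CommitResult::ConflictedTransaction(_) => { /* another writer won; rebase and retry */ }
+        CommitResult::RetryableTransaction(_) => { /* transient failure; safe to retry commit */ }
+    }
+    Ok(())
+}
+```
+
+Note that step 3 stays entirely in the engine: kernel supplies the context and validates the
+resulting actions, but never performs the Parquet I/O itself, which is what keeps kernel free of
+I/O and allocation decisions.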
+ +## Engine Trait System + +The kernel is built around the `Engine` trait (`kernel/src/lib.rs`), which provides four handlers: + +| Handler | Purpose | Key Methods | +|----------------------|----------------------------------|--------------------------------------------| +| `StorageHandler` | File system operations | `list_from`, `read_files`, etc. | +| `JsonHandler` | Delta log commit parsing/writing | `parse_json`, `read_json_files` | +| `ParquetHandler` | Data file and checkpoint I/O | `read_parquet_files`, `write_parquet_file` | +| `EvaluationHandler` | Expression/predicate evaluation | `new_expression_evaluator`, etc. | +| `MetricsReporter` | Optional observability | `get_metrics_reporter` (default: None) | + +A `DefaultEngine` (Arrow + `object_store` + Tokio) lives in `kernel/src/engine/default/`. Custom +engines only need to replace specific handlers -- they can reuse defaults for the rest. + +## EngineData Trait + +Kernel never assumes data is Arrow. It uses the `EngineData` trait -- an opaque columnar data +interface. The kernel extracts data via a visitor pattern (`visit_rows` with typed `GetData` +accessors), not by inspecting columns directly. Never downcast `EngineData` to a concrete type +(e.g. `ArrowEngineData`) in prod kernel code -- only engine *implementations* know the concrete +type. (Unit tests using the default engine may legitimately downcast.) + +`DefaultEngine` uses `ArrowEngineData` (wrapping Arrow `RecordBatch`). Custom engines implement +`EngineData` for their own columnar format. + +Key methods: `visit_rows`, `len`, `append_columns` (for partition value injection/column mapping), +`apply_selection_vector` (for deletion vectors). + +**IMPORTANT:** Never assume that reading one file produces exactly one batch. Always iterate over +all returned batches -- the engine may split a single file across multiple batches. + +## Key Modules + +- `kernel/src/snapshot/` -- `Snapshot`, `SnapshotBuilder`, entry point for reads/writes +- `kernel/src/scan/` -- `Scan`, `ScanBuilder`, log replay, data skipping +- `kernel/src/transaction/` -- `Transaction`, `WriteContext`, `create_table` builder +- `kernel/src/committer/` -- `Committer` trait, `FileSystemCommitter` +- `kernel/src/log_segment/` -- log file discovery, Protocol/Metadata replay +- `kernel/src/log_replay.rs` -- file-action deduplication, `LogReplayProcessor` trait +- `kernel/src/log_reader/` -- I/O layer for reading commit and checkpoint files +- `kernel/src/actions/` -- Delta action types (Protocol, Metadata, CommitInfo, Add, Remove, Cdc, + SetTransaction, DomainMetadata, Sidecar, CheckpointMetadata) +- `kernel/src/schema/` -- `StructType`/`StructField`/`DataType`, projections +- `kernel/src/expressions/` -- expression AST (`Expression`, `Predicate`, `Scalar`), + `column_expr!` macro +- `kernel/src/transforms/` -- generic recursive transforms (`ExpressionTransform`, + `SchemaTransform`) +- `kernel/src/checkpoint/` -- checkpoint writing (V1 and V2 single-file classic-named) +- `kernel/src/table_configuration.rs` -- table metadata, properties, feature management +- `kernel/src/table_features/` -- protocol feature definitions, `TableFeature` enum +- `kernel/src/table_properties.rs` -- table property parsing (delta.appendOnly, etc.) +- `kernel/src/table_changes/` -- Change Data Feed (CDF) API (`TableChanges`) +- `kernel/src/path.rs` -- Delta log path parsing + +## Catalog-Managed Tables + +Tables whose commits go through a catalog (e.g. Unity Catalog) instead of direct filesystem +writes. 
Kernel doesn't know about catalogs -- the catalog client provides a log tail via +`SnapshotBuilder::with_log_tail()` and a custom `Committer` for staging/ratifying/publishing +commits. + +The `UCCommitter` (in the `delta-kernel-unity-catalog` crate) is the reference implementation of a catalog +committer for Unity Catalog. It stages commits to `_staged_commits/`, calls the UC commit API to +ratify them, and publishes by copying to `_delta_log/`. + +Commit types: staged (written to `_staged_commits/`), ratified (accepted by catalog for a +version), published (copied to `_delta_log/` as a normal delta file). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7a3ba95fdc..e781798435 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,8 +9,9 @@ ## How to Contribute -For trivial fixes, etc. please feel free to open a PR directly. For larger changes, we follow a -structured contribution process to ensure high-quality code: +For trivial fixes, etc. please feel free to open a PR directly (Please see [Getting you PR +reviewed](#getting-your-pr-reviewed) below). For larger changes, we follow a structured contribution +process to ensure high-quality code: 1. **Start with an issue and/or design sketch**: Open an issue describing what you want to contribute and why. Continue to step 2 after reaching some consensus. This helps us avoid wasted @@ -18,13 +19,39 @@ structured contribution process to ensure high-quality code: explored and rejected). Including a design sketch will help drive consensus (often a simple diagram or bullet points outlining high-level changes is sufficient). 2. **Prototype/POC**: Create a PR marked as prototype/draft (not intended to merge) and gather - feedback to further derisk the design. This PR is not intended to be merged but will guide the + feedback to further de-risk the design. This PR is not intended to be merged but will guide the implementation and serve as a proving ground for the design. Then, pieces are torn out into smaller PRs that can be merged. 3. **Implementation**: Finally, create PR(s) to implement the feature (production code, tests, thorough docs, etc.). Often the initial POC will be split into multiple smaller PRs (e.g., refactors, then feature additions, then public APIs specifically). Care should be taken to ensure - each PR is easily reviewable and thoroughly tested. + each PR is easily review-able and thoroughly tested. + +## Getting your PR reviewed + +We invite everyone who would like their PRs reviewed to review _other_ open PRs as +well. Like most open source projects, our review bandwidth is limited, and help from users is +greatly appreciated. This helps increase the overall review rate, and allows contributors to build +credibility within the project. + +PRs from contributors who do not review others' work will be lower priority, though we do try to +review all PRs as promptly as possible. + +We also encourage contributors to optimize their PRs for review: +- A crisp and complete PR description that explains clearly what the change is for cross-checking + against the code. +- Tightly-scoped. That is, don't mix multiple changes in a single PR. +- Code structure and doc comments optimized for understandability (e.g. avoid bloat, redundancy, and +convoluted control flow). + +PRs that do not follow these principles are much more time consuming to review, and less likely to +get prompt reviews. 
+ +## AI-Assisted Contributions + +We welcome contributors who use AI coding tools, but all contributions must reflect genuine +understanding of the changes being made. Please read our [AI Policy](AI_POLICY.md) before +submitting a PR. ## Forking and Setup @@ -84,7 +111,34 @@ Our trunk branch is named `main`. Here's the typical workflow: **Note**: We require two approvals from code owners for any PR to be merged. +## Pull Request Best Practices + +#### General Tips + +1. When making your first PR, please read our contributor guidelines: https://github.com/delta-incubator/delta-kernel-rs/blob/main/CONTRIBUTING.md +2. Run `cargo t --all-features --all-targets` to get started testing, and run `cargo fmt`. +3. Ensure you have added or run the appropriate tests for your PR. +4. If the PR is unfinished, add '[WIP]' in your PR title, e.g., '[WIP] Your PR title ...'. +5. Be sure to keep the PR description updated to reflect all changes. + +#### PR Title Formatting + +This project uses conventional commits: https://www.conventionalcommits.org/ + +Each PR corresponds to a commit on the `main` branch, with the title of the PR (typically) being +used for the commit message on main. In order to ensure proper formatting in the CHANGELOG please +ensure your PR title adheres to the conventional commit specification. + +Examples: +- new feature PR: "feat: new API for snapshot.update()" +- bugfix PR: "fix: correctly apply DV in read-table example" + +#### PR Testing + +Please make sure to add test cases that check the changes thoroughly including negative and positive cases if possible. +If it was tested in a way different from regular unit tests, please clarify how you tested, ideally via a reproducible test documented in the PR description. + ## Resources - [Delta Protocol](https://github.com/delta-io/delta/blob/master/PROTOCOL.md) -- [Delta Lake Slack](https://go.delta.io/slack) - Join us in the `#delta-kernel` channel \ No newline at end of file +- [Delta Lake Slack](https://go.delta.io/slack) - Join us in the `#delta-kernel` channel diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000000..9370212df9 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,5401 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "acceptance" +version = "0.20.0" +dependencies = [ + "datatest-stable", + "delta_kernel", + "delta_kernel_benchmarks", + "flate2", + "futures", + "itertools 0.14.0", + "serde", + "serde_json", + "tar", + "tempfile", + "test-case", + "test-log", + "test_utils", + "thiserror 2.0.18", + "tokio", + "tracing", + "tracing-subscriber", + "ureq", + "url", +] + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures 0.2.17", +] + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse 0.2.7", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse 1.0.0", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + +[[package]] +name = "arrow" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" +dependencies = [ + "arrow-arith 56.2.0", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-csv 56.2.0", + "arrow-data 56.2.0", + "arrow-ipc 56.2.0", + "arrow-json 56.2.0", + "arrow-ord 56.2.0", + "arrow-row 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "arrow-string 56.2.0", +] + +[[package]] +name = "arrow" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" +dependencies = [ + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-csv 57.3.0", + "arrow-data 57.3.0", + "arrow-ipc 57.3.0", + "arrow-json 57.3.0", + "arrow-ord 57.3.0", + "arrow-row 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "arrow-string 57.3.0", +] + +[[package]] +name = "arrow-arith" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "num", +] + +[[package]] +name = "arrow-arith" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" +dependencies = [ + "ahash", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.1", + "num", +] + +[[package]] +name = "arrow-array" +version = "57.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +dependencies = [ + "ahash", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-cast" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-cast" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" +dependencies = [ + "arrow-array 56.2.0", + "arrow-cast 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-csv" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" +dependencies = [ + "arrow-array 57.3.0", + "arrow-cast 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" +dependencies = [ + "arrow-buffer 56.2.0", + "arrow-schema 56.2.0", + "half", + "num", +] + +[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-ipc" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "flatbuffers", +] + +[[package]] +name = "arrow-ipc" +version = "57.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "chrono", + "half", + "indexmap", + "lexical-core", + "memchr", + "num", + "serde", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-json" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "half", + "indexmap", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", +] + +[[package]] +name = "arrow-ord" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", +] + +[[package]] +name = "arrow-row" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "half", +] + +[[package]] +name = "arrow-row" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "half", +] + +[[package]] +name = "arrow-schema" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" +dependencies = [ + "bitflags", +] + +[[package]] +name = "arrow-schema" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +dependencies = [ + "bitflags", +] + +[[package]] +name = "arrow-select" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" +dependencies = [ + "ahash", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "num", +] + +[[package]] +name = "arrow-select" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "num-traits", +] + +[[package]] +name = "arrow-string" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" +dependencies = [ + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-data 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "memchr", + "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "arrow-string" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-lc-rs" +version = "1.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "camino" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629a66d692cb9ff1a1c664e41771b3dcaf961985a9774c0eb0bd1b51cf60a48" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + +[[package]] +name = "cbindgen" +version = "0.29.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "befbfd072a8e81c02f8c507aefce431fe5e7d051f83d48a23ffc9b9fe5a11799" +dependencies = [ + "clap", + "heck", + "indexmap", + "log", + "proc-macro2", + "quote", + "serde", + "serde_json", + "syn", + "tempfile", + "toml 0.9.12+spec-1.1.0", +] + +[[package]] +name = "cc" +version = "1.2.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + 
+[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chacha20" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.0", +] + +[[package]] +name = "checkpoint-table" +version = "0.1.0" +dependencies = [ + "bytes", + "chrono", + "clap", + "common", + "delta_kernel", + "env_logger", + "futures", + "tokio", + "url", +] + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "clap" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream 1.0.0", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "comfy-table" +version = "7.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +dependencies = [ + "strum 0.26.3", + "strum_macros 0.26.4", + "unicode-width", +] + +[[package]] +name = "common" +version = "0.20.0" +dependencies = [ + "clap", + "delta_kernel", + "url", +] + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools 0.10.5", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + 
"tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools 0.10.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "ctr" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0369ee1ad671834580515889b80f2ea915f23b8be8d0daa4bbaf2ac5c7590835" +dependencies = [ + "cipher", +] + +[[package]] +name = "datatest-stable" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a867d7322eb69cf3a68a5426387a25b45cb3b9c5ee41023ee6cea92e2afadd82" +dependencies = [ + "camino", + "fancy-regex", + "libtest-mimic", + "walkdir", +] + +[[package]] +name = "delta-kernel-unity-catalog" +version = "0.20.0" +dependencies = [ + "delta_kernel", + "itertools 0.14.0", + "serde", + "serde_json", + "tempfile", + "tokio", + "tracing", + "unity-catalog-delta-client-api", + "unity-catalog-delta-rest-client", + "url", + "uuid", +] + +[[package]] +name = "delta_kernel" +version = "0.20.0" +dependencies = [ + "arrow 56.2.0", + "arrow 57.3.0", + "async-trait", + "bytes", + "chrono", + "crc", + "criterion", + "delta_kernel", + "delta_kernel_derive", + "futures", + "hdfs-native", + "hdfs-native-object-store", + "indexmap", + "itertools 0.14.0", + "object_store", + "parquet 56.2.0", + "parquet 57.3.0", + "paste", + "reqwest 0.13.2", + "roaring", + "rstest", + "rustc_version", + "serde", + "serde_json", + "strum 0.27.2", + "tempfile", + "test-log", + "test_utils", + "thiserror 2.0.18", + "tokio", + "tracing", + "tracing-subscriber", + "url", + "uuid", + "walkdir", + "z85", +] + +[[package]] +name = "delta_kernel_benchmarks" +version = "0.20.0" +dependencies = [ + "criterion", + "delta-kernel-unity-catalog", + 
"delta_kernel", + "flate2", + "object_store", + "rayon", + "rstest", + "serde", + "serde_json", + "sqlparser", + "tar", + "test_utils", + "tokio", + "unity-catalog-delta-client-api", + "unity-catalog-delta-rest-client", + "ureq", + "url", +] + +[[package]] +name = "delta_kernel_derive" +version = "0.20.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "delta_kernel_ffi" +version = "0.20.0" +dependencies = [ + "cbindgen", + "delta-kernel-unity-catalog", + "delta_kernel", + "delta_kernel_ffi_macros", + "itertools 0.14.0", + "libc", + "paste", + "rand 0.9.2", + "rstest", + "serde", + "serde_json", + "tempfile", + "test_utils", + "tokio", + "tracing", + "tracing-core", + "tracing-subscriber", + "trybuild", + "unity-catalog-delta-client-api", + "url", +] + +[[package]] +name = "delta_kernel_ffi_macros" +version = "0.20.0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "des" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdd80ce8ce993de27e9f063a444a4d53ce8e8db4c1f00cc03af5ad5a9867a1e" +dependencies = [ + "cipher", +] + +[[package]] +name = "dhat" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98cd11d84628e233de0ce467de10b8633f4ddaecafadefc86e13b84b8739b827" +dependencies = [ + "backtrace", + "lazy_static", + "mintex", + "parking_lot", + "rustc-hash 1.1.0", + "serde", + "serde_json", + "thousands", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dns-lookup" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5597a4b7fe5275fc9dcf88ce26326bc8e4cb87d0130f33752d4c5f717793cf" +dependencies = [ + "cfg-if", + "libc", + "socket2", + "windows-sys 0.60.2", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "env_filter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a1c3cc8e57274ec99de65301228b537f1e4eedc1b8e0f9411c6caac8ae7308f" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "env_logger" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" +dependencies = [ + "anstream 0.6.21", + "anstyle", + "env_filter", + "jiff", + "log", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "escape8259" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5692dd7b5a1978a5aeb0ce83b7655c58ca8efdcb79d21036ea249da95afec2c6" + +[[package]] +name = "fancy-regex" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e24cb5a94bcae1e5408b0effca5cd7172ea3c5755049c5f3af4cd283a165298" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "feature_tests" +version = "0.20.0" +dependencies = [ + "delta_kernel", + "reqwest 0.13.2", + "rustls", +] + +[[package]] +name = "filetime" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", + "zlib-rs", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "futures" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-timer" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "g2gen" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5a7e0eb46f83a20260b850117d204366674e85d3a908d90865c78df9a6b1dfc" +dependencies = [ + "g2poly", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "g2p" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "539e2644c030d3bf4cd208cb842d2ce2f80e82e6e8472390bcef83ceba0d80ad" +dependencies = [ + "g2gen", + "g2poly", +] + +[[package]] +name = "g2poly" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "312d2295c7302019c395cfb90dacd00a82a2eabd700429bba9c7a3f38dbbe11b" + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "rand_core 0.10.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "glob" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" + +[[package]] +name = "hdfs-native" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08603b51f970930b0025b92d3f6c0ba39a0a6d0dfb4b3f527af58768adc2f3b2" +dependencies = [ + "aes", + "base64", + "bitflags", + "bumpalo", + "bytes", + "cbc", + "chrono", + "cipher", + "crc", + "ctr", + "des", + "dns-lookup", + "futures", + "g2p", + "hex", + "hmac", + "libc", + "libloading", + "log", + "md-5", + "num-traits", + "once_cell", + "prost", + "prost-types", + "rand 0.9.2", + "regex", + "roxmltree", + "socket2", + "thiserror 2.0.18", + "tokio", + "url", + "uuid", + "which", + "whoami", +] + +[[package]] +name = "hdfs-native-object-store" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d5495a763df493ea3883271e727914e83a0ce188a32e230ca820e3cb5e188d2" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "hdfs-native", + "object_store", + "thiserror 2.0.18", + "tokio", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" 
+dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "block-padding", + "generic-array", +] + +[[package]] +name = "inspect-table" +version = "0.1.0" +dependencies = [ + "chrono", + "clap", + "common", + "delta_kernel", + "env_logger", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is-terminal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "jiff" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +dependencies = [ + "jiff-static", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", +] + +[[package]] +name = "jiff-static" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "jni" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a87aa2bb7d2af34197c04845522473242e1aa17c12f4935d5856491a7fb8c97" +dependencies = [ + "cesu8", + "cfg-if", + "combine", + "jni-sys", + "log", + "thiserror 1.0.69", + "walkdir", + "windows-sys 0.45.0", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libc" +version = "0.2.183" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" + +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "libredox" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +dependencies = [ + "bitflags", + "libc", + "plain", + "redox_syscall 0.7.3", +] + +[[package]] +name = "libtest-mimic" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14e6ba06f0ade6e504aff834d7c34298e5155c6baca353cc6a4aaff2f9fd7f33" +dependencies = [ + "anstream 1.0.0", + "anstyle", + "clap", + "escape8259", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = 
"litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "lz4_flex" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "mem-test" +version = "0.20.0" +dependencies = [ + "delta_kernel", + "dhat", + "rayon", + "serde_json", + "tempfile", + "tracing", + "url", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mintex" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c505b3e17ed6b70a7ed2e67fbb2c560ee327353556120d6e72f5232b6880d536" + +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "num" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" +dependencies = [ + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "form_urlencoded", + "futures", + "http", + "http-body-util", + "httparse", + "humantime", + "hyper", + "itertools 0.14.0", + "md-5", + "parking_lot", + "percent-encoding", + "quick-xml", + "rand 0.9.2", + "reqwest 0.12.28", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "thiserror 2.0.18", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "openssl" +version = "0.10.76" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.112" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.18", + "smallvec", + "windows-link", +] + +[[package]] +name = "parquet" +version = "56.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" +dependencies = [ + "ahash", + "arrow-array 56.2.0", + "arrow-buffer 56.2.0", + "arrow-cast 56.2.0", + "arrow-data 56.2.0", + "arrow-ipc 56.2.0", + "arrow-schema 56.2.0", + "arrow-select 56.2.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.11.6", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "parquet" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" +dependencies = [ + "ahash", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-ipc 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown 0.16.1", + "lz4_flex 0.12.1", + "num-bigint", + "num-integer", + "num-traits", + "object_store", + "paste", + "seq-macro", + "simdutf8", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "proc-macro-crate" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" +dependencies = [ + "toml_edit", +] + +[[package]] +name = "proc-macro2" 
+version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost", +] + +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash 2.1.1", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "aws-lc-rs", + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash 2.1.1", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha", + "rand_core 0.9.5", +] + +[[package]] +name = "rand" +version = 
"0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.0", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rand_core" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "read-table-changes" +version = "0.1.0" +dependencies = [ + "clap", + "common", + "delta_kernel", + "itertools 0.14.0", +] + +[[package]] +name = "read-table-multi-threaded" +version = "0.1.0" +dependencies = [ + "clap", + "common", + "delta_kernel", + "env_logger", + "spmc", + "url", +] + +[[package]] +name = "read-table-single-threaded" +version = "0.1.0" +dependencies = [ + "clap", + "common", + "delta_kernel", + "env_logger", + "itertools 0.14.0", +] + +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_syscall" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "relative-path" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", +] + +[[package]] +name = "reqwest" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab3f43e3283ab1488b624b44b0e988d0acea0b3214e694730a055cb6b2efa801" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "rustls-platform-verifier", + "serde", + "serde_json", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-rustls", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "roaring" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" +dependencies = [ + "bytemuck", + "byteorder", +] + +[[package]] +name = "roxmltree" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1964b10c76125c36f8afe190065a4bf9a87bf324842c05701330bba9f1cacbb" +dependencies = [ + "memchr", +] + +[[package]] +name = "rstest" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a2c585be59b6b5dd66a9d2084aa1d8bd52fbdb806eafdeffb52791147862035" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "825ea780781b15345a146be27eaefb05085e337e869bff01b4306a4fd4a9ad5a" +dependencies = [ + "cfg-if", + "glob", + "proc-macro-crate", + "proc-macro2", + "quote", + "regex", + "relative-path", + "rustc_version", + "syn", + "unicode-ident", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc-hash" 
+version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.23.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-platform-verifier" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d99feebc72bae7ab76ba994bb5e121b8d83d910ca40b36e0921f53becc41784" +dependencies = [ + "core-foundation 0.10.1", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls-platform-verifier-android" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f87165f0995f63a9fbeea62b64d10b4d9d8e78ec6d7d51fb2125fda7bb36788f" + +[[package]] +name = "rustls-webpki" +version = "0.103.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags", + "core-foundation 0.10.1", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "seq-macro" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_spanned" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "spmc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02a8428da277a8e3a15271d79943e80ccc2ef254e78813a166a08d65e4c3ece5" + +[[package]] +name = "sqlparser" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" +dependencies = [ + "log", + "recursive", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros 0.27.2", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + 
"proc-macro2", + "quote", + "rustversion", + "syn", +] + +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tar" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + +[[package]] +name = "target-triple" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "591ef38edfb78ca4771ee32cf494cb8771944bee237a9b91fc9c1424ac4b777b" + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "termcolor" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "test-case" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2550dd13afcd286853192af8601920d959b14c401fcece38071d53bf0768a8" +dependencies = [ + "test-case-macros", +] + +[[package]] +name = "test-case-core" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adcb7fd841cd518e279be3d5a3eb0636409487998a4aff22f3de87b81e88384f" +dependencies = [ + "cfg-if", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "test-case-macros" +version = "3.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "test-case-core", +] + +[[package]] +name = "test-log" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d53ac171c92a39e4769491c4b4dde7022c60042254b5fc044ae409d34a24d4" +dependencies = [ + "test-log-macros", + "tracing-subscriber", +] + +[[package]] +name = "test-log-macros" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be35209fd0781c5401458ab66e4f98accf63553e8fae7425503e92fdd319783b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "test_utils" +version = "0.20.0" +dependencies = [ + "delta_kernel", + "itertools 0.14.0", + "serde_json", + "tar", + "tempfile", + "tokio", + "tracing", + "tracing-subscriber", + "url", + "uuid", + "zstd", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thousands" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +dependencies = [ + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.9.12+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf92845e79fc2e2def6a5d828f0801e29a2f8acc037becc5ab08595c7d5e9863" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime 0.7.5+spec-1.1.0", + "toml_parser", + "toml_writer", + "winnow 0.7.15", +] + +[[package]] +name = "toml" +version = "1.0.7+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd28d57d8a6f6e458bc0b8784f8fdcc4b99a437936056fa122cb234f18656a96" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime 1.0.1+spec-1.1.0", + "toml_parser", + "toml_writer", + "winnow 1.0.0", +] + +[[package]] +name = "toml_datetime" +version = "0.7.5+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_datetime" +version = "1.0.1+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_edit" +version = "0.25.5+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" +dependencies = [ + "indexmap", + "toml_datetime 1.0.1+spec-1.1.0", + "toml_parser", + "winnow 1.0.0", +] + +[[package]] +name = "toml_parser" +version = "1.0.10+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" +dependencies = [ + "winnow 1.0.0", +] + 
+[[package]] +name = "toml_writer" +version = "1.0.7+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f17aaa1c6e3dc22b1da4b6bba97d066e354c7945cac2f7852d4e4e7ca7a6b56d" + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "trybuild" +version = "1.0.116" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c635f0191bd3a2941013e5062667100969f8c4e9cd787c14f977265d73616e" +dependencies = [ + "glob", + "serde", + "serde_derive", + 
"serde_json", + "target-triple", + "termcolor", + "toml 1.0.7+spec-1.1.0", +] + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "unity-catalog-delta-client-api" +version = "0.20.0" +dependencies = [ + "chrono", + "serde", + "serde_json", + "thiserror 2.0.18", + "tokio", +] + +[[package]] +name = "unity-catalog-delta-rest-client" +version = "0.20.0" +dependencies = [ + "chrono", + "clap", + "reqwest 0.13.2", + "serde", + "serde_json", + "thiserror 2.0.18", + "tokio", + "tracing", + "tracing-subscriber", + "unity-catalog-delta-client-api", + "url", +] + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc97a28575b85cfedf2a7e7d3cc64b3e11bd8ac766666318003abbacc7a21fc" +dependencies = [ + "base64", + "flate2", + "log", + "percent-encoding", + "rustls", + "rustls-pki-types", + "ureq-proto", + "utf-8", + "webpki-roots", +] + +[[package]] +name = "ureq-proto" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +dependencies = [ + "base64", + "http", + "httparse", + "log", +] + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", + "serde_derive", +] + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "rand 0.10.0", + "wasm-bindgen", +] + +[[package]] +name = "valuable" 
+version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" + +[[package]] +name = "wasm-bindgen" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-root-certs" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "which" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81995fafaaaf6ae47a7d0cc83c67caf92aeb7e5331650ae6ff856f7c0c60c459" +dependencies = [ + "libc", +] + +[[package]] +name = "whoami" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4a4db5077702ca3015d3d02d74974948aba2ad9e12ab7df718ee64ccd7e97d" +dependencies = [ + "libredox", + "wasite", + "web-sys", +] + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 
0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" + +[[package]] +name = "winnow" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" +dependencies = [ + "memchr", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", 
+ "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "write-table" +version = "0.1.0" +dependencies = [ + "chrono", + "clap", + "common", + "delta_kernel", + "env_logger", + "itertools 0.14.0", + "serde_json", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "z85" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6e61e59a957b7ccee15d2049f86e8bfd6f66968fcd88f018950662d9b86e675" + +[[package]] +name = "zerocopy" +version = "0.8.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = 
"zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index f73ada89c7..42685445a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "acceptance", + "benchmarks", "derive-macros", "ffi", "kernel", @@ -8,8 +9,9 @@ members = [ "test-utils", "feature-tests", "mem-test", - "uc-client", # WIP: this is an experimental UC client for catalog-managed table work - "uc-catalog", # WIP: this is an experimental UC catalog implementation + "unity-catalog-delta-rest-client", # WIP: this is an experimental UC client for catalog-managed table work + "delta-kernel-unity-catalog", # WIP: this is an experimental UC catalog implementation + "unity-catalog-delta-client-api", # WIP: transport-agnostic UC client API traits ] # note that in addition to the members above, the workspace includes examples: # - inspect-table @@ -26,5 +28,5 @@ keywords = ["deltalake", "delta", "datalake"] license = "Apache-2.0" repository = "https://github.com/delta-io/delta-kernel-rs" readme = "README.md" -rust-version = "1.84" -version = "0.16.0" +rust-version = "1.85" +version = "0.20.0" diff --git a/README.md b/README.md index 9b808d8494..e01cedb403 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# Delta Kernel (rust)   [![build-status]][actions] [![latest-version]][crates.io] [![docs]][docs.rs] [![rustc-version-1.84+]][rustc] +# Delta Kernel (rust)   [![build-status]][actions] [![latest-version]][crates.io] [![docs]][docs.rs] [![rustc-version-1.85+]][rustc] [build-status]: https://img.shields.io/github/actions/workflow/status/delta-io/delta-kernel-rs/build.yml?branch=main [actions]: https://github.com/delta-io/delta-kernel-rs/actions/workflows/build.yml?query=branch%3Amain [latest-version]: https://img.shields.io/crates/v/delta_kernel.svg [crates.io]: https://crates.io/crates/delta\_kernel -[rustc-version-1.84+]: https://img.shields.io/badge/rustc-1.84+-lightgray.svg -[rustc]: https://blog.rust-lang.org/2025/01/09/Rust-1.84.0/ +[rustc-version-1.85+]: https://img.shields.io/badge/rustc-1.85+-lightgray.svg +[rustc]: https://blog.rust-lang.org/2025/02/20/Rust-1.85.0/ [docs]: https://img.shields.io/docsrs/delta_kernel [docs.rs]: https://docs.rs/delta_kernel/latest/delta_kernel/ @@ 
-52,10 +52,10 @@ consumer's own `Engine` trait, the kernel has a feature flag to enable a default ```toml # fewer dependencies, requires consumer to implement Engine trait. # allows consumers to implement their own in-memory format -delta_kernel = "0.16.0" +delta_kernel = "0.20.0" -# or turn on the default engine, based on arrow -delta_kernel = { version = "0.16.0", features = ["default-engine", "arrow-56"] } +# or turn on the default engine, based on latest arrow +delta_kernel = { version = "0.20.0", features = ["default-engine", "arrow"] } ``` ### Feature flags @@ -85,8 +85,8 @@ arrow versions as we can. We allow selecting the version of arrow to use via feature flags. Currently we support the following flags: -- `arrow-55`: Use arrow version 55 - `arrow-56`: Use arrow version 56 +- `arrow-57`: Use arrow version 57 - `arrow`: Use the latest arrow version. Note that this is an _unstable_ flag: we will bump this to the latest arrow version at every arrow version release. Only removing old arrow versions will cause a breaking change for kernel. If you require a specific version N of arrow, you should @@ -106,7 +106,6 @@ and then checking what version of `object_store` it depends on. ## Documentation - [API Docs](https://docs.rs/delta_kernel/latest/delta_kernel/) -- [architecture.md](doc/architecture.md) document describing the kernel architecture (currently wip) ## Examples @@ -153,15 +152,7 @@ Some design principles which should be considered: - If using `emacs`, both [eglot](https://github.com/joaotavora/eglot) and [lsp-mode](https://github.com/emacs-lsp/lsp-mode) provide excellent integration with `rust-analyzer`. [rustic](https://github.com/brotzeit/rustic) is a nice mode as well. -- When also developing in VS Code it's sometimes convenient to configure rust-analyzer in - `.vscode/settings.json`. - -```json -{ - "editor.formatOnSave": true, - "rust-analyzer.cargo.features": ["default-engine"] -} -``` +- When also developing in VS Code it's convenient to add rust-analyzer to your workspace. - The crate's documentation can be easily reviewed with: `cargo docs --open` - Code coverage is available on codecov via [cargo-llvm-cov]. See their docs for instructions to install/run locally. 
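Following up on the arrow feature-flag notes in the README hunk above, here is a minimal sketch of how a consumer crate might pin a specific arrow major version instead of tracking the unstable `arrow` alias. The crate name, version, and feature names (`default-engine`, `arrow-56`) are taken from the README text in this diff; the surrounding `[dependencies]` table is an assumed, hypothetical consumer manifest, not part of this change.

```toml
# Hypothetical consumer Cargo.toml (sketch): pin arrow 56 explicitly so a future
# bump of the unstable `arrow` alias (which always tracks the latest supported
# arrow release) cannot change the build.
[dependencies]
delta_kernel = { version = "0.20.0", features = ["default-engine", "arrow-56"] }
```

Pinning `arrow-56` trades automatic upgrades for reproducibility; switching the feature back to `arrow` opts into whatever the latest supported arrow version is at each kernel release.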
diff --git a/acceptance/Cargo.toml b/acceptance/Cargo.toml index b9926441a8..52a2c85aa5 100644 --- a/acceptance/Cargo.toml +++ b/acceptance/Cargo.toml @@ -14,18 +14,25 @@ rust-version.workspace = true [package.metadata.release] release = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57", "test_utils/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56", "test_utils/arrow-56"] + [dependencies] delta_kernel = { path = "../kernel", features = [ "default-engine-rustls", - "arrow", "internal-api", + "prettyprint", ] } +delta_kernel_benchmarks = { path = "../benchmarks" } futures = "0.3" itertools = "0.14" -object_store = "0.12.3" # must 'match' arrow version above serde = { version = "1", features = ["derive"] } serde_json = "1" thiserror = "2" +tracing = { version = "0.1", default-features = false } url = "2" [build-dependencies] @@ -36,6 +43,7 @@ tar = "0.4" [dev-dependencies] datatest-stable = "0.3" test-log = { version = "0.2", default-features = false, features = ["trace"] } +test_utils = { path = "../test-utils", default-features = false } tempfile = "3" test-case = { version = "3.3.1" } tokio = { version = "1.47" } @@ -47,3 +55,7 @@ tracing-subscriber = { version = "0.3", default-features = false, features = [ [[test]] name = "dat_reader" harness = false + +[[test]] +name = "acceptance_workloads_reader" +harness = false diff --git a/acceptance/build.rs b/acceptance/build.rs index 22bac27f3a..b7ba02616d 100644 --- a/acceptance/build.rs +++ b/acceptance/build.rs @@ -1,26 +1,36 @@ -//! Build script for DAT +//! Build script for DAT and acceptance workload specs use std::env; use std::fs::File; use std::io::{BufReader, BufWriter, Read, Write}; -use std::path::Path; +use std::path::{Path, PathBuf}; use flate2::read::GzDecoder; use tar::Archive; use ureq::{Agent, Proxy}; const DAT_EXISTS_FILE_CHECK: &str = "tests/dat/.done"; -const OUTPUT_FOLDER: &str = "tests/dat"; -const VERSION: &str = "0.0.3"; +const DAT_OUTPUT_FOLDER: &str = "tests/dat"; +const DAT_VERSION: &str = "0.0.3"; +const ACCEPTANCE_WORKLOADS_VERSION: &str = "0.0.4"; + +/// Workloads to skip on Windows due to invalid filename characters. +/// Windows does not support these characters in filenames: < > : " | ? * +/// Additionally, some percent-encoded characters cause issues. +#[cfg(windows)] +const WINDOWS_SKIP_WORKLOADS: &[&str] = &[ + // Contains files with #, %, and ? 
in filenames + "fpe_special_chars_path/", +]; fn main() { - if dat_exists() { - return; + if !dat_exists() { + let tarball_data = download_dat_files(); + extract_dat_tarball(tarball_data); + write_done_file(); } - let tarball_data = download_dat_files(); - extract_tarball(tarball_data); - write_done_file(); + extract_acceptance_workloads(); } fn dat_exists() -> bool { @@ -29,16 +39,20 @@ fn dat_exists() -> bool { fn download_dat_files() -> Vec { let tarball_url = format!( - "https://github.com/delta-incubator/dat/releases/download/v{VERSION}/deltalake-dat-v{VERSION}.tar.gz" + "https://github.com/delta-incubator/dat/releases/download/v{DAT_VERSION}/deltalake-dat-v{DAT_VERSION}.tar.gz" ); + download_tarball(&tarball_url) +} + +fn download_tarball(url: &str) -> Vec { let response = if let Ok(proxy_url) = env::var("HTTPS_PROXY") { let proxy = Proxy::new(&proxy_url).unwrap(); let config = Agent::config_builder().proxy(proxy.into()).build(); let agent = Agent::new_with_config(config); - agent.get(&tarball_url).call().unwrap() + agent.get(url).call().unwrap() } else { - ureq::get(&tarball_url).call().unwrap() + ureq::get(url).call().unwrap() }; let mut tarball_data: Vec = Vec::new(); @@ -51,12 +65,12 @@ fn download_dat_files() -> Vec { tarball_data } -fn extract_tarball(tarball_data: Vec) { +fn extract_dat_tarball(tarball_data: Vec) { let tarball = GzDecoder::new(BufReader::new(&tarball_data[..])); let mut archive = Archive::new(tarball); - std::fs::create_dir_all(OUTPUT_FOLDER).expect("Failed to create output directory"); + std::fs::create_dir_all(DAT_OUTPUT_FOLDER).expect("Failed to create output directory"); archive - .unpack(OUTPUT_FOLDER) + .unpack(DAT_OUTPUT_FOLDER) .expect("Failed to unpack tarball"); } @@ -65,3 +79,65 @@ fn write_done_file() { BufWriter::new(File::create(DAT_EXISTS_FILE_CHECK).expect("Failed to create .done file")); write!(done_file, "done").expect("Failed to write .done file"); } + +/// Download and extract acceptance workload specs if not already done. +/// Downloads from GitHub releases at `v{ACCEPTANCE_WORKLOADS_VERSION}_dat` tag. +/// Extracts to `acceptance/workloads/`. +fn extract_acceptance_workloads() { + let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set"); + let dir = PathBuf::from(manifest_dir); + + let output_dir = dir.join("workloads"); + let done_marker = output_dir.join(".done"); + + // Tell Cargo to re-run if the done marker changes + println!("cargo::rerun-if-changed={}", done_marker.display()); + + if done_marker.exists() { + return; + } + + // Download from GitHub releases + let tarball_url = format!( + "https://github.com/delta-incubator/dat/releases/download/v0.04-preview/v{ACCEPTANCE_WORKLOADS_VERSION}_dat_workloads.tar.gz" + ); + + let tarball_data = download_tarball(&tarball_url); + + // Extract tarball, skipping files with invalid Windows filenames + let decoder = GzDecoder::new(BufReader::new(&tarball_data[..])); + let mut archive = Archive::new(decoder); + std::fs::create_dir_all(&output_dir).expect("Failed to create acceptance output directory"); + + // Extract workloads one-by-one to skip tests that contain delta logs with special characters + // on Windows machines. This is because Windows does not support such files in the filesystem. 
+ for entry in archive.entries().expect("Failed to read tarball entries") { + let mut entry = entry.expect("Failed to read tarball entry"); + + // On Windows, skip workloads that contain files with invalid filename characters + #[cfg(windows)] + { + let path = entry.path().expect("Failed to get entry path"); + let path_str = path.to_string_lossy(); + + // Skip entire workloads known to have problematic filenames + let should_skip = WINDOWS_SKIP_WORKLOADS + .iter() + .any(|skip| path_str.contains(skip)); + if should_skip { + eprintln!("Skipping Windows-incompatible workload file: {}", path_str); + continue; + } + } + + entry + .unpack_in(&output_dir) + .expect("Failed to unpack entry"); + } + + // Write .done marker + let mut done_file = BufWriter::new( + File::create(&done_marker).expect("Failed to create acceptance workloads .done file"), + ); + write!(done_file, "done").expect("Failed to write acceptance workloads .done file"); +} diff --git a/acceptance/src/acceptance_workloads/mod.rs b/acceptance/src/acceptance_workloads/mod.rs new file mode 100644 index 0000000000..dbc416e0bd --- /dev/null +++ b/acceptance/src/acceptance_workloads/mod.rs @@ -0,0 +1,148 @@ +//! Delta workload specification framework. +//! +//! Provides shared types and loading logic for correctness testing and benchmarking +//! workloads. All workload specs follow a flat file layout: +//! +//! ```text +//! / +//! table_info.json # Table metadata (optional) +//! delta/ # The Delta table +//! _delta_log/ +//! specs/ # Workload specifications (flat files) +//! workload_a.json +//! workload_b.json +//! expected/ # Expected results +//! workload_a/ +//! expected_data/ # Expected output as Parquet files +//! part-00000.parquet +//! part-00001.parquet +//! expected_metadata/ # Expected add files after data skipping as Parquet files +//! part-aaaaa.parquet +//! part-bbbbb.parquet +//! +//! ``` +//! +//! ## Expected Data Format +//! +//! The `expected_data/` directory contains one or more Parquet files representing the +//! expected table content after executing the workload. Files starting with `.` or `_` +//! are ignored (e.g., `_SUCCESS`, `.crc` files). The data is compared order-independently +//! against the actual scan output by sorting rows before comparison. +//! +//! ## Read Spec Format +//! +//! Read workloads execute a scan and compare results against expected data: +//! +//! ```json +//! { +//! "type": "read", +//! "version": 5, // optional: time travel to version +//! "predicate": "id > 10", // optional: filter predicate +//! "columns": ["id", "name"], // optional: column projection +//! "expected": { "rowCount": 100 } +//! } +//! ``` +//! +//! For error cases, use `"error"` instead of `"expected"`: +//! +//! ```json +//! { +//! "type": "read", +//! "error": { "errorCode": "TABLE_NOT_FOUND" } +//! } +//! ``` +//! +//! ## Snapshot Construction Spec Format +//! +//! Snapshot workloads verify that a snapshot can be constructed and its metadata matches: +//! +//! ```json +//! { +//! "type": "snapshotConstruction", +//! "version": 5, // optional: time travel to version +//! "expected": { +//! "protocol": { "minReaderVersion": 1, "minWriterVersion": 2 }, +//! "metadata": { "id": "...", "schemaString": "...", ... } +//! } +//! } +//! ``` +//! +//! For error cases: +//! +//! ```json +//! { +//! "type": "snapshotConstruction", +//! "error": { "errorCode": "INVALID_TABLE" } +//! } +//! 
``` + +pub mod validation; +pub mod workload; + +use std::path::{Path, PathBuf}; + +use delta_kernel_benchmarks::models::{Spec, TableInfo}; +use url::Url; + +/// A fully resolved test case ready for execution. +#[derive(Debug)] +pub struct TestCase { + /// Table metadata (absent if `table_info.json` doesn't exist). This + /// occurs when the table is corrupt and used for error testing. + pub table_info: Option, + /// Root directory of the test case + pub root_dir: PathBuf, + /// The workload specification + pub spec: Spec, + /// Name of the workload (spec filename without extension) + pub workload_name: String, +} + +impl TestCase { + /// Create a test case from a spec file path. + /// + /// Given a path like `.../workloads//specs/.json`, + /// loads the spec and resolves the test case root directory. + pub fn from_spec_path(spec_path: impl AsRef) -> Self { + let spec_path = spec_path.as_ref(); + + let workload_name = spec_path + .file_stem() + .and_then(|s| s.to_str()) + .expect("Invalid spec path: missing filename") + .to_string(); + + let root_dir = spec_path + .parent() // specs/ + .and_then(|p| p.parent()) // test_case/ + .expect("Invalid spec path: must be /specs/.json") + .to_path_buf(); + + let content = std::fs::read_to_string(spec_path).expect("Failed to read spec file"); + let spec: Spec = serde_json::from_str(&content).expect("Failed to parse spec file"); + + Self { + table_info: None, + root_dir, + spec, + workload_name, + } + } + + /// URL to the Delta table directory. + /// + /// Uses `table_info.table_path` if available, otherwise defaults to + /// `/delta`. + pub fn table_root(&self) -> Result { + match self.table_info.as_ref() { + Some(table_info) => Ok(table_info.resolved_table_root()), + None => Url::from_directory_path(self.root_dir.join("delta")) + .map_err(|_| format!("Failed to construct table root for {:?}", self.root_dir)), + } + } + + /// Path to the expected results directory for this workload. + pub fn expected_dir(&self) -> PathBuf { + self.root_dir.join("expected").join(&self.workload_name) + } +} diff --git a/acceptance/src/acceptance_workloads/validation.rs b/acceptance/src/acceptance_workloads/validation.rs new file mode 100644 index 0000000000..77e2e96747 --- /dev/null +++ b/acceptance/src/acceptance_workloads/validation.rs @@ -0,0 +1,168 @@ +//! Result validation for acceptance workload test cases. +//! +//! Compares actual kernel results against expected outcomes from the spec. For read workloads, +//! expected data is loaded from Parquet files in `expected_data/` and compared order-independently. +//! For snapshot workloads, protocol and metadata are compared directly. + +use std::fs::{self, File}; +use std::path::Path; + +use delta_kernel::arrow::datatypes::Schema as ArrowSchema; +use delta_kernel::engine::arrow_conversion::TryFromKernel; +use itertools::Itertools; +use tracing::debug; + +use delta_kernel::arrow::array::RecordBatch; +use delta_kernel::arrow::compute::concat_batches; +use delta_kernel::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use delta_kernel::DeltaResult; +use delta_kernel_benchmarks::models::{ReadExpected, SnapshotExpected}; + +use crate::data::assert_data_matches; + +use super::workload::{ReadResult, SnapshotResult}; + +/// Read expected data from parquet files in expected_dir/expected_data/. 
+fn read_expected_data(expected_dir: &Path) -> Result { + let expected_data_dir = expected_dir.join("expected_data"); + if !expected_data_dir.exists() { + return Err(format!( + "Expected data directory not found: {}", + expected_data_dir.display() + )); + } + + let parquet_paths = fs::read_dir(&expected_data_dir) + .map_err(|e| format!("Failed to read expected_data dir: {e}"))? + .filter_map(|entry| { + let path = entry.ok()?.path(); + let filename = path.file_name()?.to_str()?; + + if filename.starts_with('.') || filename.starts_with('_') { + return None; + } + + if path.extension()?.to_str()? == "parquet" { + Some(path) + } else { + None + } + }) + .collect_vec(); + + let mut batches = vec![]; + let mut schema = None; + + for path in parquet_paths { + let file = File::open(&path) + .map_err(|e| format!("Failed to open parquet file {}: {e}", path.display()))?; + let builder = ParquetRecordBatchReaderBuilder::try_new(file) + .map_err(|e| format!("Failed to create parquet reader: {e}"))?; + + if schema.is_none() { + schema = Some(builder.schema().clone()); + } + + let reader = builder + .build() + .map_err(|e| format!("Failed to build parquet reader: {e}"))?; + + for batch in reader { + let batch = batch.map_err(|e| format!("Failed to read batch: {e}"))?; + batches.push(batch); + } + } + + let schema = schema + .ok_or_else(|| format!("No parquet files found in {}", expected_data_dir.display()))?; + let all_data = + concat_batches(&schema, &batches).map_err(|e| format!("Failed to concat batches: {e}"))?; + Ok(all_data) +} + +/// Validate read results against expected outcome. +pub fn validate_read_result( + result: DeltaResult, + expected_dir: &Path, + expected: &ReadExpected, +) -> Result<(), String> { + match (result, expected) { + (Ok(read_result), ReadExpected::Success { expected: exp }) => { + // TODO: Check file_count and files_skipped against scan metrics once available. + // Note: These would be informational only, not authoritative, since different + // data skipping implementations may produce different results. + let _ = (exp.file_count, exp.files_skipped); + + // Validate data content + let expected_data = read_expected_data(expected_dir)?; + + let schema = ArrowSchema::try_from_kernel(read_result.schema.as_ref()) + .map_err(|e| e.to_string())?; + let schema = std::sync::Arc::new(schema); + assert_data_matches(read_result.batches, &schema, expected_data) + .map_err(|e| e.to_string())?; + + // Validate row count against spec's expected row counts + if read_result.row_count != exp.row_count { + return Err(format!( + "Row count mismatch: expected {}, got {}", + exp.row_count, read_result.row_count + )); + } + + Ok(()) + } + (Err(kernel_err), ReadExpected::Error { error }) => { + debug!( + "Got expected error '{}' with message: {:?}\nKernel error: {}", + error.error_code, error.error_message, kernel_err + ); + Ok(()) + } + (Ok(_), ReadExpected::Error { error }) => Err(format!( + "Expected error '{}' but succeeded", + error.error_code + )), + (Err(e), ReadExpected::Success { .. }) => { + Err(format!("Expected success but got error: {}", e)) + } + } +} + +/// Validate snapshot result against expected outcome. 
+pub fn validate_snapshot( + result: DeltaResult, + expected: &SnapshotExpected, +) -> Result<(), String> { + match (result, expected) { + (Ok(snapshot_result), SnapshotExpected::Success { expected }) => { + if snapshot_result.protocol != *expected.protocol { + return Err(format!( + "Expected protocol to match:\n{:?}\n{:?}", + snapshot_result.protocol, expected.protocol + )); + } + if snapshot_result.metadata != *expected.metadata { + return Err(format!( + "Expected metadata to match:\n{:?}\n{:?}", + snapshot_result.metadata, expected.metadata + )); + } + Ok(()) + } + (Err(kernel_err), SnapshotExpected::Error { error }) => { + debug!( + "Got expected error '{}' with message: {:?}\nKernel error: {}", + error.error_code, error.error_message, kernel_err + ); + Ok(()) + } + (Ok(_), SnapshotExpected::Error { error }) => Err(format!( + "Expected error '{}' but succeeded", + error.error_code + )), + (Err(e), SnapshotExpected::Success { .. }) => { + Err(format!("Expected success but got error: {}", e)) + } + } +} diff --git a/acceptance/src/acceptance_workloads/workload.rs b/acceptance/src/acceptance_workloads/workload.rs new file mode 100644 index 0000000000..f97a9bde8a --- /dev/null +++ b/acceptance/src/acceptance_workloads/workload.rs @@ -0,0 +1,175 @@ +//! Workload execution logic for Delta workload specifications. + +use std::sync::Arc; + +use super::validation::{validate_read_result, validate_snapshot}; +use delta_kernel::actions::{Metadata, Protocol}; +use delta_kernel::arrow::array::RecordBatch; +use delta_kernel::arrow::compute::filter_record_batch; +use delta_kernel::engine::arrow_data::EngineDataArrowExt as _; +use delta_kernel::engine::arrow_expression::evaluate_expression::evaluate_predicate; +use delta_kernel::expressions::Predicate; +use delta_kernel::schema::Schema; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::{DeltaResult, Engine, Error, Version}; +use delta_kernel_benchmarks::models::{ReadSpec, SnapshotConstructionSpec, Spec, TimeTravel}; +use delta_kernel_benchmarks::predicate_parser::parse_predicate; +use itertools::Itertools; +use url::Url; + +/// Result of executing a read workload. +#[derive(Debug)] +pub struct ReadResult { + /// The record batches from the scan. + pub batches: Vec, + /// The kernel schema of the data. + pub schema: Arc, + /// Total number of rows in the result. + pub row_count: u64, +} + +/// Result of executing a snapshot workload. +#[derive(Debug)] +pub struct SnapshotResult { + /// The version of the snapshot. + pub version: Version, + /// The protocol at this version. + pub protocol: Protocol, + /// The table metadata at this version. + pub metadata: Metadata, +} + +/// Build a snapshot with optional time travel. +fn build_snapshot( + engine: &dyn Engine, + table_root: &Url, + time_travel: Option<&TimeTravel>, +) -> DeltaResult> { + let version = time_travel + .map(TimeTravel::as_version) + .transpose() + .map_err(Error::generic)?; + + let mut builder = Snapshot::builder_for(table_root.clone()); + if let Some(v) = version { + builder = builder.at_version(v); + } + builder.build(engine) +} + +/// Execute a read workload. 
+pub fn execute_read_workload( + engine: Arc, + table_root: &Url, + read_spec: &ReadSpec, +) -> DeltaResult { + // Parse predicate if present + let predicate = read_spec + .predicate + .as_deref() + .map(|p| parse_predicate(p)) + .transpose() + .map_err(|e| Error::generic(format!("Failed to parse predicate: {e}")))?; + + let snapshot = build_snapshot(engine.as_ref(), table_root, read_spec.time_travel.as_ref())?; + + let table_schema = snapshot.schema(); + + // Build scan with column projection (no predicate pushdown - we filter after) + let mut scan_builder = snapshot.scan_builder(); + if let Some(ref cols) = read_spec.columns { + let projected_schema = table_schema.project(cols)?; + scan_builder = scan_builder.with_schema(projected_schema); + } + let scan = scan_builder.build()?; + + let schema = scan.logical_schema(); + + // Execute scan to get all batches + let batches: Vec = scan + .execute(engine)? + .map(|data| data?.try_into_record_batch()) + .try_collect()?; + + // Filter batches using the predicate if present + let batches = filter_batches_with_predicate(batches, predicate.as_ref())?; + + // Compute row count from filtered batches + let row_count: u64 = batches.iter().map(|b| b.num_rows() as u64).sum(); + + Ok(ReadResult { + batches, + schema: schema.clone(), + row_count, + }) +} + +/// Filter record batches using a predicate expression. +fn filter_batches_with_predicate( + batches: Vec, + predicate: Option<&Predicate>, +) -> DeltaResult> { + let Some(predicate) = predicate else { + return Ok(batches); + }; + + batches + .into_iter() + .map(|batch| { + // Evaluate predicate to get boolean selection array + let selection = evaluate_predicate(predicate, &batch, false)?; + // Filter the batch using the selection + let filtered = filter_record_batch(&batch, &selection)?; + Ok(filtered) + }) + .collect() +} + +/// Execute a snapshot workload (for metadata validation). +pub fn execute_snapshot_workload( + engine: Arc, + table_root: &Url, + snapshot_spec: &SnapshotConstructionSpec, +) -> DeltaResult { + let snapshot = build_snapshot( + engine.as_ref(), + table_root, + snapshot_spec.time_travel.as_ref(), + )?; + + let config = snapshot.table_configuration(); + + Ok(SnapshotResult { + version: snapshot.version(), + protocol: config.protocol().clone(), + metadata: config.metadata().clone(), + }) +} + +/// Execute a workload and validate results. 
+pub fn execute_and_validate_workload( + engine: Arc, + table_root: &Url, + spec: &Spec, + expected_dir: &std::path::Path, +) -> Result<(), Box> { + match spec { + Spec::Read(read_spec) => { + let expected = read_spec + .expected + .as_ref() + .ok_or("ReadSpec must have expected or error field")?; + let result = execute_read_workload(engine, table_root, read_spec); + validate_read_result(result, expected_dir, expected)?; + } + Spec::SnapshotConstruction(snapshot_spec) => { + let expected = snapshot_spec + .expected + .as_ref() + .ok_or("SnapshotSpec must have expected or error field")?; + let result = execute_snapshot_workload(engine, table_root, snapshot_spec.as_ref()); + validate_snapshot(result, expected)?; + } + } + Ok(()) +} diff --git a/acceptance/src/data.rs b/acceptance/src/data.rs index 95e4892282..ce6314957e 100644 --- a/acceptance/src/data.rs +++ b/acceptance/src/data.rs @@ -1,19 +1,19 @@ use std::{path::Path, sync::Arc}; -use delta_kernel::arrow::array::{Array, RecordBatch}; -use delta_kernel::arrow::compute::{ - concat_batches, filter_record_batch, lexsort_to_indices, take, SortColumn, -}; -use delta_kernel::arrow::datatypes::{DataType, Schema}; +use delta_kernel::arrow::array::RecordBatch; +use delta_kernel::arrow::compute::concat_batches; +use delta_kernel::arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; +use delta_kernel::arrow::util::pretty::pretty_format_batches; +use delta_kernel::engine::arrow_data::EngineDataArrowExt as _; +use delta_kernel::object_store::{local::LocalFileSystem, ObjectStore}; use delta_kernel::parquet::arrow::async_reader::{ ParquetObjectReader, ParquetRecordBatchStreamBuilder, }; use delta_kernel::snapshot::Snapshot; -use delta_kernel::{engine::arrow_data::ArrowEngineData, DeltaResult, Engine, Error}; +use delta_kernel::{DeltaResult, Engine, Error}; use futures::{stream::TryStreamExt, StreamExt}; use itertools::Itertools; -use object_store::{local::LocalFileSystem, ObjectStore}; use crate::{TestCaseInfo, TestResult}; @@ -42,72 +42,85 @@ pub async fn read_golden(path: &Path, _version: Option<&str>) -> DeltaResult DeltaResult { - // Sort by all columns - let mut sort_columns = vec![]; - for col in batch.columns() { - match col.data_type() { - DataType::Struct(_) | DataType::List(_) | DataType::Map(_, _) => { - // can't sort structs, lists, or maps - } - _ => sort_columns.push(SortColumn { - values: col.clone(), - options: None, - }), - } +fn assert_schema_fields_match(schema: &Schema, golden: &Schema) -> DeltaResult<()> { + let schema_stripped = strip_metadata(schema); + let golden_stripped = strip_metadata(golden); + if schema_stripped.fields() != golden_stripped.fields() { + return Err(Error::generic(format!( + "Schema mismatch:\nActual: {:?}\nExpected: {:?}", + schema_stripped.fields(), + golden_stripped.fields() + ))); } - let indices = lexsort_to_indices(&sort_columns, None)?; - let columns = batch - .columns() - .iter() - .map(|c| take(c, &indices, None).unwrap()) - .collect(); - Ok(RecordBatch::try_new(batch.schema(), columns)?) 
+ Ok(()) } -// Ensure that two schema have the same field names, and dict_is_ordered -// We ignore: -// - data type: This is checked already in `assert_columns_match` -// - nullability: parquet marks many things as nullable that we don't in our schema -// - metadata: because that diverges from the real data to the golden tabled data -fn assert_schema_fields_match(schema: &Schema, golden: &Schema) { - for (schema_field, golden_field) in schema.fields.iter().zip(golden.fields.iter()) { - assert!( - schema_field.name() == golden_field.name(), - "Field names don't match" - ); - assert!( - schema_field.dict_is_ordered() == golden_field.dict_is_ordered(), - "Field dict_is_ordered doesn't match" - ); +/// Recursively strip metadata from schema and all nested fields. +fn strip_metadata(schema: &Schema) -> Schema { + fn strip_field(field: &Field) -> Field { + Field::new( + field.name(), + strip_type(field.data_type()), + field.is_nullable(), + ) } -} -// some things are equivalent, but don't show up as equivalent for `==`, so we normalize here -fn normalize_col(col: Arc) -> Arc { - if let DataType::Timestamp(unit, Some(zone)) = col.data_type() { - if **zone == *"+00:00" { - let data_type = DataType::Timestamp(*unit, Some("UTC".into())); - delta_kernel::arrow::compute::cast(&col, &data_type).expect("Could not cast to UTC") - } else { - col + fn strip_type(dt: &DataType) -> DataType { + match dt { + DataType::Struct(fields) => DataType::Struct(Fields::from( + fields.iter().map(|f| strip_field(f)).collect_vec(), + )), + DataType::List(f) => DataType::List(Arc::new(strip_field(f))), + DataType::LargeList(f) => DataType::LargeList(Arc::new(strip_field(f))), + DataType::FixedSizeList(f, n) => DataType::FixedSizeList(Arc::new(strip_field(f)), *n), + DataType::Map(f, sorted) => DataType::Map(Arc::new(strip_field(f)), *sorted), + other => other.clone(), } - } else { - col } + Schema::new(schema.fields().iter().map(|f| strip_field(f)).collect_vec()) } -fn assert_columns_match(actual: &[Arc], expected: &[Arc]) { - for (actual, expected) in actual.iter().zip(expected) { - let actual = normalize_col(actual.clone()); - let expected = normalize_col(expected.clone()); - // note that array equality includes data_type equality - // See: https://arrow.apache.org/rust/arrow_data/equal/fn.equal.html - assert_eq!( - &actual, &expected, - "Column data didn't match. Got {actual:?}, expected {expected:?}" - ); +pub fn assert_data_matches( + result: Vec, + result_schema: &SchemaRef, + expected: RecordBatch, +) -> DeltaResult<()> { + let all_data = concat_batches(result_schema, result.iter())?; + + // Validate schemas match + assert_schema_fields_match(all_data.schema().as_ref(), expected.schema().as_ref())?; + + // Format both batches as strings for order-independent comparison + let actual_str = pretty_format_batches(std::slice::from_ref(&all_data)) + .map_err(|e| Error::generic(format!("Failed to format actual: {}", e)))? + .to_string(); + let expected_str = pretty_format_batches(std::slice::from_ref(&expected)) + .map_err(|e| Error::generic(format!("Failed to format expected: {}", e)))? 
+ .to_string(); + + let mut actual_lines: Vec<&str> = actual_str.trim().lines().collect(); + let mut expected_lines: Vec<&str> = expected_str.trim().lines().collect(); + + // Sort data lines (skip header at indices 0-1 and footer at last index) + let num_actual = actual_lines.len(); + let num_expected = expected_lines.len(); + if num_actual > 3 { + actual_lines[2..num_actual - 1].sort_unstable(); + } + if num_expected > 3 { + expected_lines[2..num_expected - 1].sort_unstable(); + } + + // Compare sorted lines + if actual_lines != expected_lines { + return Err(Error::generic(format!( + "Data mismatch:\nExpected:\n{}\nActual:\n{}", + expected_lines.join("\n"), + actual_lines.join("\n") + ))); } + + Ok(()) } pub async fn assert_scan_metadata( @@ -120,36 +133,16 @@ pub async fn assert_scan_metadata( let mut schema = None; let batches: Vec = scan .execute(engine)? - .map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch: RecordBatch = data - .into_any() - .downcast::() - .unwrap() - .into(); + .map(|data| -> DeltaResult<_> { + let record_batch = data?.try_into_record_batch()?; if schema.is_none() { schema = Some(record_batch.schema()); } - if let Some(mask) = mask { - Ok(filter_record_batch(&record_batch, &mask.into())?) - } else { - Ok(record_batch) - } + Ok(record_batch) }) .try_collect()?; - let all_data = concat_batches(&schema.unwrap(), batches.iter()).map_err(Error::from)?; - let all_data = sort_record_batch(all_data)?; let golden = read_golden(test_case.root_dir(), None).await?; - let golden = sort_record_batch(golden)?; - - assert_columns_match(all_data.columns(), golden.columns()); - assert_schema_fields_match(all_data.schema().as_ref(), golden.schema().as_ref()); - assert!( - all_data.num_rows() == golden.num_rows(), - "Didn't have same number of rows" - ); + assert_data_matches(batches, &schema.unwrap(), golden)?; Ok(()) } diff --git a/acceptance/src/lib.rs b/acceptance/src/lib.rs index 72658cdb4b..dbd2c1aeed 100644 --- a/acceptance/src/lib.rs +++ b/acceptance/src/lib.rs @@ -1,5 +1,6 @@ //! Helpers to validate Engine implementations +pub mod acceptance_workloads; pub mod data; pub mod meta; pub use meta::*; diff --git a/acceptance/src/meta.rs b/acceptance/src/meta.rs index 90253d006c..ed325a5a9f 100644 --- a/acceptance/src/meta.rs +++ b/acceptance/src/meta.rs @@ -3,8 +3,8 @@ use std::fs::File; use std::path::{Path, PathBuf}; use std::sync::Arc; +use delta_kernel::object_store::{local::LocalFileSystem, path::Path as ObjectPath, ObjectStore}; use futures::stream::TryStreamExt; -use object_store::{self, local::LocalFileSystem, ObjectStore}; use serde::{Deserialize, Serialize}; use url::Url; @@ -53,9 +53,7 @@ impl TestCaseInfo { let raw_cases = files.into_iter().filter(|meta| { meta.location.filename() == Some("table_version_metadata.json") - && !meta - .location - .prefix_matches(&object_store::path::Path::from("latest")) + && !meta.location.prefix_matches(&ObjectPath::from("latest")) }); let mut cases = Vec::new(); diff --git a/acceptance/tests/acceptance_workloads_reader.rs b/acceptance/tests/acceptance_workloads_reader.rs new file mode 100644 index 0000000000..09a0ebd141 --- /dev/null +++ b/acceptance/tests/acceptance_workloads_reader.rs @@ -0,0 +1,831 @@ +//! Test harness for acceptance workloads. +//! +//! This test uses datatest-stable to discover and run all workload specs in the +//! acceptance_workloads directory. Each spec file becomes its own test. 
+ +use std::path::Path; + +use acceptance::acceptance_workloads::{workload::execute_and_validate_workload, TestCase}; + +/// Tests that cannot be executed due to test harness limitations. +/// These fail at parse time or cause infrastructure issues (OOM, hang). +/// All other failures (bugs, divergences, missing features) go in EXPECTED_KERNEL_FAILURES. +const SKIP_LIST: &[(&str, &str)] = &[ + ("DV-017/", "Huge table (2B rows) causes OOM/hang"), + // The Spec enum only supports "read" and "snapshot" types. + // CDF, transaction, and domain metadata specs fail at parse time. + ("_cdf", "CDF (Change Data Feed) test type not yet supported"), + ("_txn", "Transaction test type not yet supported"), + ( + "_domain_metadata", + "Domain metadata test type not yet supported", + ), + // These have `"type": "cdf"` which the harness cannot parse. + ( + "cdc_err_", + "CDF test type not yet supported (fails at parse time)", + ), +]; + +fn should_skip_test(test_path: &str) -> Option<&'static str> { + for (pattern, reason) in SKIP_LIST { + if test_path.contains(pattern) { + return Some(reason); + } + } + None +} + +/// Tests that CAN be executed but are expected to fail (kernel bugs or divergences). +/// Unlike SKIP_LIST, these workloads run and we assert they produce wrong results or errors. +/// When a kernel fix lands, the test will pass and the entry should be removed. +const EXPECTED_KERNEL_FAILURES: &[(&str, &[&str])] = &[ + // Kernel reads timestamps as Timestamp(Microsecond, Some("UTC")), + // Spark writes expected data as Timestamp(Nanosecond, None). + // Same instant, different Arrow representation. + ( + "Timestamp type divergence (kernel uses Microsecond/UTC, Spark uses Nanosecond/None)", + &[ + "cloneDeepMultiType/specs/cloneDeepMultiType_readAll", + "cloneDeepMultiType/specs/cloneDeepMultiType_readFiltered", + "cr_timestamp_boundaries/specs/cr_timestamp_boundaries_read_all", + "dcscStructWithSpecialTypes/specs/dcscStructWithSpecialTypes_read_high_amount", + "dcscStructWithSpecialTypes/specs/dcscStructWithSpecialTypes_read_by_date", + "dcscStructWithSpecialTypes/specs/dcscStructWithSpecialTypes_read_all", + "dpReadPartitionTimestamp/specs/dpReadPartitionTimestamp_readAll", + "dsReadMultipleTypes/specs/dsReadMultipleTypes_readAll", + "dsReadTimestampType/specs/dsReadTimestampType_readAll", + "ds_datetime/specs/ds_datetime_hit_dt_eq", + "ds_datetime/specs/ds_datetime_hit_dt_gte", + "ds_datetime/specs/ds_datetime_hit_dt_lt", + "ds_datetime/specs/ds_datetime_miss_dt_2023", + "ds_datetime/specs/ds_datetime_miss_dt_2025", + "ds_date_trunc_timestamp/specs/ds_date_trunc_timestamp_hit_trunc_month_june", + "ds_date_trunc_timestamp/specs/ds_date_trunc_timestamp_hit_trunc_month_march", + "ds_date_trunc_timestamp/specs/ds_date_trunc_timestamp_miss_trunc_month_jan", + "gc_datetime/specs/gc_datetime_filter_date", + "gc_datetime/specs/gc_datetime_read_all", + "ntz_mixed_tz_ntz/specs/ntz_mixed_tz_ntz_filter_ntz_col", + "ntz_mixed_tz_ntz/specs/ntz_mixed_tz_ntz_full_scan", + "pve_timestamp_partition/specs/pve_timestamp_partition_read_all", + "restoreCheckData/specs/restoreCheckData_filterBoolean", + "restoreCheckData/specs/restoreCheckData_filterDecimal", + "restoreCheckData/specs/restoreCheckData_readAll", + "st_datetime_stats/specs/st_datetime_stats_filter_date_eq", + "st_datetime_stats/specs/st_datetime_stats_filter_date_range", + "st_datetime_stats/specs/st_datetime_stats_full_scan", + "cdc_multiple_types/specs/cdc_multiple_types_read_all", + 
"cdc_timestamp_tz_handling/specs/cdc_timestamp_tz_handling_read_all", + "gc_append_data/specs/gc_append_data_read_all", + "gc_delete/specs/gc_delete_read_all", + "gc_insert_by_name/specs/gc_insert_by_name_read_all", + "gc_update_source/specs/gc_update_source_read_all", + ], + ), + // Kernel produces {metadata, value}, Spark produces {value, metadata}. + ( + "Variant field order divergence (kernel {metadata,value}, Spark {value,metadata})", + &[ + "var_001_basic/specs/var_001_basic_read_all", + "var_001_basic/specs/var_001_basic_select_variant_col", + "var_002_basic_stats/specs/var_002_basic_stats_read_all", + "var_003_nested_stats/specs/var_003_nested_stats_read_all", + "var_004_non_objects/specs/var_004_non_objects_filter_first_three", + "var_004_non_objects/specs/var_004_non_objects_read_all", + "var_005_null_counts/specs/var_005_null_counts_filter_non_null", + "var_005_null_counts/specs/var_005_null_counts_read_all", + "var_006_different_types/specs/var_006_different_types_filter_by_id", + "var_006_different_types/specs/var_006_different_types_read_all", + "var_007_partitions/specs/var_007_partitions_filter_partition", + "var_007_partitions/specs/var_007_partitions_read_all", + "var_008_many_fields/specs/var_008_many_fields_read_all", + "var_009_unusual_chars/specs/var_009_unusual_chars_read_all", + "var_010_nested_fields/specs/var_010_nested_fields_read_all", + "var_011_missing_values/specs/var_011_missing_values_read_all", + "var_012_mixed_types/specs/var_012_mixed_types_filter_half", + "var_012_mixed_types/specs/var_012_mixed_types_read_all", + "var_013_extreme_values/specs/var_013_extreme_values_read_all", + "var_014_variant_in_struct/specs/var_014_variant_in_struct_filter_label", + "var_014_variant_in_struct/specs/var_014_variant_in_struct_read_all", + "var_015_string_skipping/specs/var_015_string_skipping_filter_middle", + "var_015_string_skipping/specs/var_015_string_skipping_read_all", + "var_016_array_variant/specs/var_016_array_variant_filter_array_size", + "var_016_array_variant/specs/var_016_array_variant_read_all", + "var_017_map_variant/specs/var_017_map_variant_filter_by_id", + "var_017_map_variant/specs/var_017_map_variant_read_all", + "var_018_column_mapping/specs/var_018_column_mapping_filter_by_id", + "var_018_column_mapping/specs/var_018_column_mapping_read_all", + "var_019_schema_evolution/specs/var_019_schema_evolution_filter_new_column", + "var_019_schema_evolution/specs/var_019_schema_evolution_read_all", + "var_019_schema_evolution/specs/var_019_schema_evolution_read_v2_before_evolution", + "var_020_time_travel/specs/var_020_time_travel_read_latest", + "var_020_time_travel/specs/var_020_time_travel_read_v1", + "var_020_time_travel/specs/var_020_time_travel_read_v2", + "var_021_optimized/specs/var_021_optimized_filter_after_optimize", + "var_021_optimized/specs/var_021_optimized_read_all", + "var_022_stat_fields/specs/var_022_stat_fields_read_all", + "var_all_json_types_read_all", + "var_cdf_read/specs/var_cdf_read_read_all", + "var_deeply_nested/specs/var_deeply_nested_read_all", + "var_large_array/specs/var_large_array_read_all", + "var_null_top_level/specs/var_null_top_level_filter_not_null", + "var_null_top_level/specs/var_null_top_level_read_all", + "var_numeric_precision/specs/var_numeric_precision_read_all", + "var_predicate_non_variant/specs/var_predicate_non_variant_filter_category_A", + "var_predicate_non_variant/specs/var_predicate_non_variant_read_all", + "var_projection/specs/var_projection_project_id_data", + 
"var_projection/specs/var_projection_read_all", + "var_unicode_escapes/specs/var_unicode_escapes_read_all", + "ds_variant_null_stats/specs/ds_variant_null_stats_hit_null_v_is_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_hit_null_v_struct_v_is_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_hit_v_is_not_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_hit_v_struct_v_not_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_miss_null_v_is_not_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_miss_null_v_struct_v_not_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_miss_v_is_null", + "ds_variant_null_stats/specs/ds_variant_null_stats_miss_v_struct_v_is_null", + "var_null_top_level/specs/var_null_top_level_filter_null", + ], + ), + ( + "Kernel allows negative version, Spark rejects with DELTA_TABLE_RESTORE_VERSION_INVALID", + &["dsReadVersionNegative/specs/dsReadVersionNegative_error"], + ), + ( + "void/NullType not supported in schema deserialization", + &[ + "void_001_void_top_level/", + "void_002_void_nested_struct/", + "void_005_void_schema_evolution/", + "void_006_void_multiple_columns/", + "void_007_void_with_backticks/", + ], + ), + ( + "Interval types not supported in schema deserialization", + &[ + "intv_001_interval_ym_basic/", + "intv_002_interval_dt_basic/", + "intv_003_interval_partitioned/", + "intv_004_interval_negative/", + "intv_005_interval_mixed/", + "intv_boundary_values/", + "intv_sub_second/", + ], + ), + ( + "Cannot fall back to log replay when checkpoint files are missing or incomplete", + &[ + "corrupt_incomplete_multipart_checkpoint/", + "ckp_incomplete_multipart/", + "ckp_missing_checkpoint_file/", + ], + ), + ( + "Cannot cast list to non-list data types during type widening", + &[ + "tw_array_element/specs/tw_array_element_read_", + "tw_map_key_value_widening/specs/tw_map_key_value_widening_read_all", + ], + ), + ( + "Type widening metadata mismatch in nested struct fields", + &[ + "tw_nested_field/specs/tw_nested_field_read_all", + "tw_nested_field/specs/tw_nested_field_read_large_count", + ], + ), + ( + "Schema deserialization fails for TimestampNTZ type", + &["ds_multi_file_time/"], + ), + ( + "Column mapping id mode fails with None in final_fields_cols", + &[ + "cm_id_matching_swapped/specs/cm_id_matching_swapped_select_", + "cm_id_matching_nonexistent/specs/cm_id_matching_nonexistent_select_", + ], + ), + ( + "Absolute-path DV has invalid percent-encoded path", + &["dv_storage_type_p/specs/dv_storage_type_p_read_after_absolute_path_dv"], + ), + ( + "Fails on missing/empty delta log (no files in log segment)", + &[ + "ct_empty_delta_log/specs/ct_empty_delta_log_snapshot", + "ct_missing_delta_log/specs/ct_missing_delta_log_snapshot", + "dseReadNonDeltaPath/specs/dseReadNonDeltaPath_snapshot", + ], + ), + ( + "Accepts truncated log when initial commits are missing but CRC files exist", + &["prod_truncated_log/"], + ), + ( + "Accepts checkpoint-only tables (no commits)", + &["cp_checkpoint_only_table/specs/cp_checkpoint_only_table_error"], + ), + ( + "Projected column not found after column mapping/schema order change", + &[ + "ds_schema_order_mismatch/specs/ds_schema_order_mismatch_single_col_last", + "ds_with_dvs_edge/specs/ds_with_dvs_edge_proj_and_skip_with_dv", + "dv_projection_with_pred/specs/dv_projection_with_pred_proj_and_pred", + ], + ), + ( + "Does not reject unsupported column mapping mode", + &["cm_err_003_invalid_mode/specs/cm_err_003_invalid_mode_error"], + ), + ( + "Reads 
corrupt/invalid commit or checkpoint without error", + &[ + "corrupt_truncated_commit_json_error", + "cp_err_missing_protocol/specs/cp_err_missing_protocol_error", + "ct_corrupt_parquet/specs/ct_corrupt_parquet_error", + "ct_invalid_json_error", + "dsReadCorruptCheckpoint/specs/dsReadCorruptCheckpoint_error", + "dsReadCorruptJson_error", + "dsReadModifyCheckpoint/specs/dsReadModifyCheckpoint_error", + ], + ), + ( + "Does not reject duplicate actions in commit", + &[ + "ct_duplicate_metadata/specs/ct_duplicate_metadata_error", + "ct_duplicate_protocol/specs/ct_duplicate_protocol_error", + "err_duplicate_add_same_version/specs/err_duplicate_add_same_version_error", + ], + ), + ( + "Snapshot construction succeeds even when data files are missing", + &[ + "ct_missing_data_file/specs/ct_missing_data_file_error", + "dv_err_002_missing_file/specs/dv_err_002_missing_file_error", + ], + ), + ( + "Does not validate DV integrity", + &[ + "dv_err_001_checksum/specs/dv_err_001_checksum_error", + "dv_err_003_malformed_path/specs/dv_err_003_malformed_path_error", + "err_dv_invalid_storage_type/specs/err_dv_invalid_storage_type_error", + "err_add_and_remove_same_path_dv/specs/err_add_and_remove_same_path_dv_error", + ], + ), + ( + "Does not validate schema integrity", + &[ + "err_schema_empty/specs/err_schema_empty_error", + "err_schema_invalid_json_error", + ], + ), + ( + "Does not require version 0 to exist", + &["err_missing_version_0/specs/err_missing_version_0_error"], + ), + ( + "Does not reject unknown reader features", + &["ev_unknown_reader_feature/specs/ev_unknown_reader_feature_error"], + ), + ( + "Does not enforce time travel safety", + &[ + "tt_blocked_beyond_retention/specs/tt_blocked_beyond_retention_error", + "tt_after_vacuum/specs/tt_after_vacuum_error", + ], + ), + ( + "_metadata.file_path column projection not supported", + &["DV-003/specs/DV-003_metadata_file_path"], + ), + ( + "variantShredding feature not supported", + &["pv_002_upgrade_to_current/specs/pv_002_upgrade_to_current_read_latest"], + ), + // Predicate evaluation fails due to incomplete type resolution, unsupported expressions, + // or timestamp-based time travel not yet supported. 
+ ( + "Predicate evaluation or time travel failures", + &[ + "cc_001_create_with_constraint_filter_constrained_col", + "cdc_data_skipping_read_high", + "cdc_data_skipping_read_low", + "cks_histogram_read_value_lt_500", + "cks_partitioned_read_part_0", + "cks_partitioned_read_part_1", + "cloneDeepPartitioned_readPartition", + "cloneShallowPartitioned_readPartition", + "cm_mode_name_read_by_price", + "convertParquetPartitioned_readPartition", + "cp_partitioned_read_part_0", + "cp_partitioned_read_part_3", + "ds_and_one_side_unsupported_hit_both_unsupported", + "ds_and_one_side_unsupported_miss_a_supported", + "ds_and_one_side_unsupported_miss_b_supported", + "ds_and_two_fields_hit_and_a_b_like", + "ds_and_two_fields_miss_and_b_like_2016", + "ds_complex_nested_complex_and_or", + "ds_complex_nested_complex_nested_or", + "ds_complex_nested_complex_not", + "ds_complex_nested_three_way_and", + "ds_date_add_sub_hit_date_add_all", + "ds_date_add_sub_hit_date_add_gt", + "ds_date_add_sub_miss_date_add_future", + "ds_datediff_hit_datediff_gt_30", + "ds_datediff_hit_datediff_lt_20", + "ds_float_special_values_hit_float_gt_0", + "ds_float_special_values_hit_float_lt_0", + "ds_generated_col_skipping_filter_generated_col", + "ds_generated_col_skipping_filter_generated_col_range", + "ds_in_list_in_list", + "ds_in_list_in_list_single_file", + "ds_in_nested_hit_code_in_1_2", + "ds_in_nested_miss_code_in_99", + "ds_in_set_hit_in_1", + "ds_in_set_hit_in_12", + "ds_in_set_miss_in_456", + "ds_in_with_nulls_mixed_hit_in_1_null", + "ds_in_with_nulls_mixed_miss_in_99_null", + "ds_in_with_nulls_only_hit_in_1_2", + "ds_in_with_nulls_only_hit_in_5", + "ds_in_with_thresholds_hit_in_cross_files", + "ds_in_with_thresholds_hit_in_small", + "ds_in_with_thresholds_miss_in_no_match", + "ds_isnull_complex_expr_hit_complex_or_and", + "ds_long_strings_max_hit_like_A", + "ds_long_strings_max_hit_like_C", + "ds_long_strings_min_hit_like_A", + "ds_missing_stats_graceful_filter_no_stats", + "ds_missing_stats_graceful_filter_no_stats_range", + "ds_month_function_hit_month_1", + "ds_month_function_hit_month_6", + "ds_month_function_miss_month_12", + "ds_multi_file_ranges_hit_file1_and_2", + "ds_multi_file_ranges_hit_file1_only", + "ds_multi_file_ranges_hit_file2_and_3", + "ds_multi_file_ranges_hit_file2_only", + "ds_multi_file_ranges_hit_file3_only", + "ds_multi_file_ranges_miss_all_gt_100", + "ds_multi_file_ranges_miss_all_lt_0", + "ds_not_in_not_in_1_2", + "ds_not_in_not_in_3", + "ds_not_in_not_in_all", + "ds_null_safe_eq_nse_a_eq_1", + "ds_null_safe_eq_nse_a_eq_null", + "ds_null_safe_eq_nse_b_eq_null", + "ds_null_safe_eq_nse_not_a_eq_1", + "ds_null_safe_eq_nse_not_a_eq_null", + "ds_nulls_mixed_hit_a_eq_1", + "ds_nulls_mixed_hit_a_gt_0_is_null", + "ds_nulls_mixed_hit_a_lt_0_is_null", + "ds_nulls_mixed_hit_a_nse_1", + "ds_nulls_mixed_hit_a_nse_null", + "ds_nulls_mixed_hit_not_a_gt_0_is_null", + "ds_nulls_mixed_hit_not_a_nse_1", + "ds_nulls_mixed_hit_not_a_nse_null", + "ds_nulls_mixed_miss_a_gt_1", + "ds_nulls_mixed_miss_a_lt_1", + "ds_nulls_mixed_miss_a_neq_1", + "ds_nulls_mixed_miss_not_a_eq_1", + "ds_nulls_nonnulls_only_hit_not_isnull", + "ds_nulls_nonnulls_only_miss_and_isnull", + "ds_nulls_nonnulls_only_miss_gt0_isnull", + "ds_nulls_nonnulls_only_miss_lt0_isnull", + "ds_nulls_nonnulls_only_miss_or_isnull", + "ds_nulls_only_nonnull_miss_and_isnull", + "ds_nulls_only_nonnull_miss_or_isnull", + "ds_nulls_only_null_hit_a_gt_1_is_null", + "ds_nulls_only_null_hit_not_a_nse_1", + "ds_nulls_only_null_miss_a_eq_1", + 
"ds_nulls_only_null_miss_a_gt_1", + "ds_nulls_only_null_miss_a_lt_1", + "ds_nulls_only_null_miss_a_neq_1", + "ds_nulls_only_null_miss_a_nse_1", + "ds_nulls_only_null_miss_not_a_eq_1", + "ds_nulls_only_null_miss_not_a_gt_1_is_null", + "ds_nulls_partial_stats_hit_b_and_a_isnull", + "ds_nulls_partial_stats_hit_b_gt0_isnull", + "ds_nulls_partial_stats_hit_b_lt0_isnull", + "ds_nulls_partial_stats_hit_b_or_a_isnull", + "ds_nulls_partial_stats_hit_not_a_isnull", + "ds_nulls_partial_stats_miss_a_and_isnull", + "ds_nulls_partial_stats_miss_a_gt0_isnull", + "ds_nulls_partial_stats_miss_a_lt0_isnull", + "ds_nulls_partial_stats_miss_a_or_isnull", + "ds_numeric_types_hit_int_eq_25", + "ds_numeric_types_hit_int_lt_50", + "ds_numeric_types_miss_int_gt_200", + "ds_or_one_side_unsupported_hit_a_unsupported", + "ds_or_one_side_unsupported_hit_b_unsupported", + "ds_or_two_fields_hit_or_a_b_like", + "ds_or_two_fields_hit_or_a_b_like2", + "ds_or_two_fields_miss_or_b_like_2016", + "ds_partition_and_stats_data_filter_across_partitions", + "ds_partition_and_stats_partition_and_data_filter", + "ds_partitioned_hit_part_0", + "ds_partitioned_hit_part_0_id_lt_10", + "ds_partitioned_hit_part_1", + "ds_partitioned_miss_part_99", + "ds_starts_with_hit_like_a", + "ds_starts_with_hit_like_all", + "ds_starts_with_hit_like_ap", + "ds_starts_with_hit_like_m", + "ds_starts_with_hit_like_mic", + "ds_starts_with_miss_like_xyz", + "ds_starts_with_nested_hit_like_a", + "ds_starts_with_nested_hit_like_all", + "ds_starts_with_nested_hit_like_ap", + "ds_starts_with_nested_hit_like_m", + "ds_starts_with_nested_hit_like_mic", + "ds_starts_with_nested_miss_like_xyz", + "ds_stats_col_drop_filter_remaining_col", + "ds_stats_col_rename_filter_renamed_col", + "ds_stats_col_rename_filter_renamed_col_eq", + "ds_stats_config_change_filter_indexed_col", + "ds_stats_config_change_filter_non_indexed_col", + "ds_string_patterns_hit_like_a", + "ds_string_patterns_hit_like_all", + "ds_string_patterns_hit_like_b", + "ds_string_patterns_miss_like_z", + "ds_timestamp_ntz_skipping_filter_ntz_eq", + "ds_timestamp_ntz_skipping_filter_ntz_range", + "ds_tinyint_smallint_hit_small_gt_500", + "ds_tinyint_smallint_hit_tiny_lt_10", + "ds_tinyint_smallint_miss_small_lt_50", + "ds_tinyint_smallint_miss_tiny_gt_120", + "ds_trunc_date_hit_trunc_year_2023", + "ds_trunc_date_hit_trunc_year_2024", + "ds_trunc_date_miss_trunc_year_2020", + "ds_with_dvs_edge_1_filter_hits_dv_file", + "ds_with_dvs_edge_1_skip_file_with_dv", + "ds_with_dvs_edge_2_predicate_only_deleted_file", + "ds_with_dvs_edge_2_predicate_spanning_deleted_file", + "ds_with_dvs_edge_skip_exact_deleted", + "ds_with_dvs_edge_skip_with_dv", + "ds_year_function_hit_year_2023", + "ds_year_function_hit_year_2024", + "ds_year_function_miss_year_2020", + "dv_insert_readback_read_new_rows_only", + "dv_insert_readback_read_surviving_original", + "dv_partition_pruning_prune_north_or_east", + "dv_partition_pruning_prune_west_high_amount", + "DV-002_version_1_id_gt_25", + "DV-002_version_3_partCol_3_id_gt_25", + "DV-002_version_4_partCol_eq_3", + "ev_partitioned_read_partition_0", + "ice_column_mapping_read_by_id", + "pm_metadata_partitioned_read_date", + "pm_protocol_3_7_read_value_gt_200", + "pm_v2_checkpoints_read_lt_10", + "restorePartitioned_readPartition", + "tw_byte_to_int_read_gt_max_byte", + "tw_byte_to_int_read_gt_max_short", + "tw_decimal_precision_read_large_values", + "tw_row_tracking_combo_read_wide_values", + "tw_short_to_int_read_gt_max_short", + "tw_with_dv_read_wide_values", + 
"tw_with_partition_read_wide_values", + "cc_002_show_tblproperties/specs/cc_002_show_tblproperties_filter_x", + "cc_004_case_insensitive_drop/specs/cc_004_case_insensitive_drop_filter_negative", + "cc_005_varchar_constraint/specs/cc_005_varchar_constraint_filter_short_string", + "cc_006_basic_constraint/specs/cc_006_basic_constraint_filter_id", + "cc_007_multiple_constraints/specs/cc_007_multiple_constraints_filter_amount", + "cc_008_nested_constraint/specs/cc_008_nested_constraint_filter_nested_age", + "cc_009_array_constraint/specs/cc_009_array_constraint_filter_array_size", + "cc_011_compound_constraint/specs/cc_011_compound_constraint_filter_by_month", + "cc_016_allowed_expressions/specs/cc_016_allowed_expressions_filter_num", + "cc_020_decimal_constraint/specs/cc_020_decimal_constraint_filter_high_price", + "cc_complex_expr/specs/cc_complex_expr_filter_age", + "cc_null_aware/specs/cc_null_aware_filter_age", + "cdc_predicates/specs/cdc_predicates_read_high_count", + "ckp_after_100_commits/specs/ckp_after_100_commits_read_filtered", + "ckp_corrupt_last_checkpoint/specs/ckp_corrupt_last_checkpoint_read_filtered", + "ckp_struct_array_map/specs/ckp_struct_array_map_read_filtered", + "ckp_v2_multiple_sidecars/specs/ckp_v2_multiple_sidecars_read_filtered", + "ckp_wrong_version_hint/specs/ckp_wrong_version_hint_read_filtered", + "cks_dv_in_crc/specs/cks_dv_in_crc_read_remaining", + "cks_file_size_histogram_optimize/specs/cks_file_size_histogram_optimize_read_filtered", + "cl_clustered_read/specs/cl_clustered_read_read_filtered", + "cloneCheckpointData/specs/cloneCheckpointData_readFiltered", + "cloneChecksumData/specs/cloneChecksumData_filterRange", + "cloneHistoryAfterDml/specs/cloneHistoryAfterDml_filterDeleted", + "cloneHistoryAfterDml/specs/cloneHistoryAfterDml_filterInserted", + "cloneIdempotentData/specs/cloneIdempotentData_filterNew", + "cloneShallowDvData/specs/cloneShallowDvData_readFiltered", + "cloneSqlWithStats/specs/cloneSqlWithStats_filterStats", + "cm_filter_pushdown_physical_names/specs/cm_filter_pushdown_physical_names_filter_a_eq_100_should_be_empty", + "cm_filter_pushdown_physical_names/specs/cm_filter_pushdown_physical_names_filter_a_eq_1000", + "cm_filter_pushdown_physical_names/specs/cm_filter_pushdown_physical_names_filter_c_eq_100", + "cm_mode_id/specs/cm_mode_id_read_filtered", + "cm_nested_columns/specs/cm_nested_columns_filter_nested", + "cm_nested_struct_name/specs/cm_nested_struct_name_filter_nested_field", + "cm_predicate_after_rename/specs/cm_predicate_after_rename_filter_eq_on_renamed_col", + "cm_predicate_after_rename/specs/cm_predicate_after_rename_filter_on_renamed_col", + "cm_predicate_on_readded/specs/cm_predicate_on_readded_pred_x_gt_150", + "cp_classic_checkpoint/specs/cp_classic_checkpoint_read_filtered", + "cp_last_checkpoint/specs/cp_last_checkpoint_read_filtered", + "cr_multi_partition/specs/cr_multi_partition_filter_year", + "cr_multi_partition/specs/cr_multi_partition_filter_year_month", + "dcscArrayOfStruct/specs/dcscArrayOfStruct_read_filtered", + "dcscBasicStruct/specs/dcscBasicStruct_read_filtered", + "dcscDeeplyNestedStruct/specs/dcscDeeplyNestedStruct_read_deep_filter", + "dcscMapWithStructValue/specs/dcscMapWithStructValue_read_filtered", + "dcscNestedStruct/specs/dcscNestedStruct_read_filtered", + "dcscStructPartitioned/specs/dcscStructPartitioned_read_age_filter", + "dpReadPartitionAfterAppend/specs/dpReadPartitionAfterAppend_filterPartInCD", + "dpReadPartitionDecimal/specs/dpReadPartitionDecimal_filterDecimalEq", + 
"dpReadPartitionDecimal/specs/dpReadPartitionDecimal_filterDecimalGt", + "dpReadPartitionIn/specs/dpReadPartitionIn_filterPartInAC", + "dpReadPartitionIn/specs/dpReadPartitionIn_filterPartInBDE", + "dpReadPartitionMixed/specs/dpReadPartitionMixed_filterPartAndValue", + "dpReadPartitionMixed/specs/dpReadPartitionMixed_filterPartOrValue", + "dpReadPartitionRange/specs/dpReadPartitionRange_filterPartBetween2And4", + "dpReadPartitionRange/specs/dpReadPartitionRange_filterPartGt3", + "dpReadPartitionTimestamp/specs/dpReadPartitionTimestamp_filterTsEq", + "dpReadPartitionTimestamp/specs/dpReadPartitionTimestamp_filterTsGt", + "ds_in_set/specs/ds_in_set_hit_in_123", + "ds_not_in/specs/ds_not_in_not_in_outside", + "ds_nulls_mixed/specs/ds_nulls_mixed_hit_a_eq_null", + "ds_nulls_mixed/specs/ds_nulls_mixed_hit_not_a_eq_null", + "ds_nulls_only_null/specs/ds_nulls_only_null_hit_a_eq_null", + "ds_nulls_only_null/specs/ds_nulls_only_null_hit_a_nse_null", + "ds_nulls_only_null/specs/ds_nulls_only_null_hit_not_a_eq_null", + "ds_nulls_only_null/specs/ds_nulls_only_null_miss_not_a_nse_null", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c1_eq_1", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c10_gt_1_5", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c3_lt_1_5", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c4_gt_1_0", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c5_gte_ts", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c6_gte_ts_ntz", + "ds_stats_after_drop/specs/ds_stats_after_drop_hit_c7_eq_date", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c10_gt_2_5", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c3_lt_0_5", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c1_eq_10", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c4_gt_5_0", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c5_gte_future", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c6_gte_future_ntz", + "ds_stats_after_drop/specs/ds_stats_after_drop_miss_c7_eq_future", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc1_eq_1", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc10_gt_1_5", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc2_eq_2", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc3_lt_1_5", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc4_gt_1_0", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc5_gte_ts", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc6_gte_ts_ntz", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc7_eq_date", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc8_hex_1111", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc8_hex_3333", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc9_false", + "ds_stats_after_rename/specs/ds_stats_after_rename_hit_cc9_true", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc10_gt_2_5", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc3_lt_0_5", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc1_eq_10", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc2_eq_4", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc4_gt_5_0", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc5_gte_future", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc6_gte_future_ntz", + "ds_stats_after_rename/specs/ds_stats_after_rename_miss_cc7_eq_future", + "ds_timestamp_microsecond/specs/ds_timestamp_microsecond_filter_exact_microsecond", + 
"ds_timestamp_microsecond/specs/ds_timestamp_microsecond_filter_microsecond", + "ds_timestamp_microsecond/specs/ds_timestamp_microsecond_read_all", + "ds_typed_stats/specs/ds_typed_stats_hit_c1_eq_1", + "ds_typed_stats/specs/ds_typed_stats_hit_c10_gt_1_5", + "ds_typed_stats/specs/ds_typed_stats_hit_c2_eq_2", + "ds_typed_stats/specs/ds_typed_stats_hit_c3_lt_1_5", + "ds_typed_stats/specs/ds_typed_stats_hit_c4_gt_1_0", + "ds_typed_stats/specs/ds_typed_stats_hit_c5_gte_ts", + "ds_typed_stats/specs/ds_typed_stats_hit_c6_gte_ts_ntz", + "ds_typed_stats/specs/ds_typed_stats_hit_c7_eq_date", + "ds_typed_stats/specs/ds_typed_stats_hit_c8_hex_1111", + "ds_typed_stats/specs/ds_typed_stats_hit_c8_hex_3333", + "ds_typed_stats/specs/ds_typed_stats_hit_c9_false", + "ds_typed_stats/specs/ds_typed_stats_hit_c9_true", + "ds_typed_stats/specs/ds_typed_stats_miss_c10_gt_2_5", + "ds_typed_stats/specs/ds_typed_stats_miss_c3_lt_0_5", + "ds_typed_stats/specs/ds_typed_stats_miss_c1_eq_10", + "ds_typed_stats/specs/ds_typed_stats_miss_c2_eq_4", + "ds_typed_stats/specs/ds_typed_stats_miss_c4_gt_5_0", + "ds_typed_stats/specs/ds_typed_stats_miss_c5_gte_future", + "ds_typed_stats/specs/ds_typed_stats_miss_c6_gte_future_ntz", + "ds_typed_stats/specs/ds_typed_stats_miss_c7_eq_future", + "dseReadWithStats/specs/dseReadWithStats_filterIdEq", + "dseReadWithStats/specs/dseReadWithStats_filterIdRange", + "dsReadAndPredicate/specs/dsReadAndPredicate_filterAnd", + "dsReadBetweenPredicate/specs/dsReadBetweenPredicate_readBetween", + "dsReadCdcEnabled/specs/dsReadCdcEnabled_filterDeleted", + "dsReadCdcEnabled/specs/dsReadCdcEnabled_filterUpdated", + "dsReadDateType/specs/dsReadDateType_readAfterJune", + "dsReadDecimalType/specs/dsReadDecimalType_readExpensive", + "dsReadEmptyPartition/specs/dsReadEmptyPartition_readNonExistentPartition", + "dsReadInPredicate/specs/dsReadInPredicate_readInSet", + "dsReadLikePredicate/specs/dsReadLikePredicate_filterLikeAlp", + "dsReadOrPredicate/specs/dsReadOrPredicate_filterOr", + "dsReadPartitioned/specs/dsReadPartitioned_readPartition0", + "dsReadPartitioned/specs/dsReadPartitioned_readPartition3", + "dsReadUpdateThenRead/specs/dsReadUpdateThenRead_readUpdated", + "dsReadWithPredicate/specs/dsReadWithPredicate_readFiltered", + "dv_checkpoint_read/specs/dv_checkpoint_read_read_filtered", + "dv_column_mapping_read/specs/dv_column_mapping_read_filter_on_value", + "dv_predicate_on_deleted/specs/dv_predicate_on_deleted_predicate_only_deleted", + "dv_predicate_on_deleted/specs/dv_predicate_on_deleted_predicate_surviving", + "DV-004/specs/DV-004_filter_300_787_239", + "DV-011/specs/DV-011_filter_col1_eq_2", + "ev_batch_read/specs/ev_batch_read_read_filtered", + "ev_unknown_writer_feature/specs/ev_unknown_writer_feature_read_filtered", + "fpe_space_in_path/specs/fpe_space_in_path_filter_id_eq_1", + "gc_append_data/specs/gc_append_data_filter_partition", + "gc_complex_type/specs/gc_complex_type_filter_sum", + "gc_delete/specs/gc_delete_filter_remaining", + "gc_insert_by_name/specs/gc_insert_by_name_filter_c2_g", + "gc_merge_insert_star/specs/gc_merge_insert_star_filter_merged", + "gc_partitioned/specs/gc_partitioned_filter_2024", + "gc_update_source/specs/gc_update_source_filter_c2_g_updated", + "ic_010_restore_partitioned/specs/ic_010_restore_partitioned_filter_partition_1", + "ic_bigint_type/specs/ic_bigint_type_filter_high", + "ice_complex_types/specs/ice_complex_types_filter_id", + "ice_with_dv/specs/ice_with_dv_filter_remaining", + "lc_with_schema/specs/lc_with_schema_read_filtered", + 
"log_replay_dv_key_dedup/specs/log_replay_dv_key_dedup_filter_remaining", + "ntz_basic/specs/ntz_basic_filter_ntz", + "ntz_far_past/specs/ntz_far_past_filter_before_1900", + "ntz_partition/specs/ntz_partition_filter_partition", + "ntz_partition/specs/ntz_partition_filter_partition_range", + "ntz_stats/specs/ntz_stats_filter_ntz_late_year", + "ntz_stats/specs/ntz_stats_filter_ntz_range", + "part_date_type/specs/part_date_type_filter_date", + "part_date_type/specs/part_date_type_filter_date_range", + "part_multi_column/specs/part_multi_column_filter_a", + "part_multi_column/specs/part_multi_column_filter_a_b", + "part_multi_column/specs/part_multi_column_filter_all_three", + "ps_both_stats_and_parsed/specs/ps_both_stats_and_parsed_filter_category", + "ps_nested_fields/specs/ps_nested_fields_filter_nested_age", + "ps_partitioned/specs/ps_partitioned_filter_partition", + "ps_v2_checkpoint/specs/ps_v2_checkpoint_filter_id", + "pve_byte_partition/specs/pve_byte_partition_filter_max", + "pve_byte_partition/specs/pve_byte_partition_filter_min", + "pve_byte_partition/specs/pve_byte_partition_filter_positive", + "pve_decimal_partition/specs/pve_decimal_partition_filter_boundary", + "pve_decimal_partition/specs/pve_decimal_partition_filter_eq", + "pve_decimal_partition/specs/pve_decimal_partition_filter_positive", + "pve_float_partition/specs/pve_float_partition_filter_eq", + "pve_float_partition/specs/pve_float_partition_filter_positive", + "pve_multi_partition_cols/specs/pve_multi_partition_cols_filter_all_three", + "pve_multi_partition_cols/specs/pve_multi_partition_cols_filter_int_range", + "pve_multi_partition_cols/specs/pve_multi_partition_cols_filter_str_and_int", + "pve_short_partition/specs/pve_short_partition_filter_max", + "pve_short_partition/specs/pve_short_partition_filter_min", + "pve_short_partition/specs/pve_short_partition_filter_range", + "pve_timestamp_ntz_partition/specs/pve_timestamp_ntz_partition_filter_eq", + "pve_timestamp_ntz_partition/specs/pve_timestamp_ntz_partition_filter_range", + "pve_timestamp_partition/specs/pve_timestamp_partition_filter_eq", + "pve_timestamp_partition/specs/pve_timestamp_partition_filter_range", + "restoreMultipleVersions/specs/restoreMultipleVersions_filterMissing", + "restoreMultipleVersions/specs/restoreMultipleVersions_filterV0", + "restoreWithDv/specs/restoreWithDv_filterRestored", + "rt_filter_read/specs/rt_filter_read_read_filtered", + "se_add_col_pred_eq/specs/se_add_col_pred_eq_pred_score_eq_0", + "se_add_col_pred_eq/specs/se_add_col_pred_eq_pred_score_eq_95", + "se_add_col_pred_null/specs/se_add_col_pred_null_pred_score_gt_90", + "se_drop_col_pred/specs/se_drop_col_pred_pred_id_gt_2", + "se_readd_pred/specs/se_readd_pred_pred_x_eq_200", + "se_rename_pred/specs/se_rename_pred_pred_full_name_like", + "st_decimal_stats/specs/st_decimal_stats_filter_decimal_eq", + "st_decimal_stats/specs/st_decimal_stats_filter_decimal_range", + "st_float_nan_inf/specs/st_float_nan_inf_filter_finite", + "st_float_nan_inf/specs/st_float_nan_inf_filter_id_range", + "st_indexed_cols/specs/st_indexed_cols_filter_col1", + "st_indexed_cols/specs/st_indexed_cols_filter_col2", + "st_indexed_cols/specs/st_indexed_cols_filter_col3", + "st_multi_file_stats/specs/st_multi_file_stats_filter_part_0", + "st_multi_file_stats/specs/st_multi_file_stats_filter_part_5", + "st_multi_file_stats/specs/st_multi_file_stats_filter_part_range", + "st_nested_stats/specs/st_nested_stats_filter_nested_category", + "st_numeric_stats/specs/st_numeric_stats_filter_double_gt", + 
"st_numeric_stats/specs/st_numeric_stats_filter_int_eq", + "st_string_stats/specs/st_string_stats_filter_string_like", + "st_tinyint_smallint/specs/st_tinyint_smallint_filter_smallint_negative", + "st_tinyint_smallint/specs/st_tinyint_smallint_filter_tinyint_range", + "stats_empty_string/specs/stats_empty_string_filter_id_eq_3", + "stats_missing_entirely/specs/stats_missing_entirely_filter_id_eq_2", + "stats_numrecords_only/specs/stats_numrecords_only_filter_id_eq_1", + "stats_partition_col_no_stats/specs/stats_partition_col_no_stats_filter_by_data_col", + "tntz_column_mapping/specs/tntz_column_mapping_filter_ntz", + "tntz_epoch/specs/tntz_epoch_filter_epoch", + "tntz_partition_filter/specs/tntz_partition_filter_filter_exact_partition", + "tntz_partition_filter/specs/tntz_partition_filter_filter_range_partition", + "tt_dv_between_versions/specs/tt_dv_between_versions_read_v2_filtered", + "tt_partition_filter/specs/tt_partition_filter_v0_part_0_or_1", + "tw_colmap_rename/specs/tw_colmap_rename_filter_wide", + "tw_stats_after_change/specs/tw_stats_after_change_predicate_cross_range", + "tw_stats_after_change/specs/tw_stats_after_change_predicate_new_range", + "tw_stats_after_change/specs/tw_stats_after_change_predicate_old_range", + "var_022_stat_fields/specs/var_022_stat_fields_filter_variant_field", + ], + ), + ( + "Timestamp-based time travel not yet supported", + &[ + "ict_basic_read_v0_ts", + "ict_dml_read_v0_ts", + "ict_enable_later_read_v0_no_ict_ts", + "ict_enable_later_read_v1_no_ict_ts", + "ict_enable_later_read_v3_with_ict_ts", + "ict_enabled_mid_lifecycle_read_v0_no_ict_ts", + "ict_enabled_mid_lifecycle_read_v1_no_ict_ts", + "ict_from_crc_read_v0_ts", + "ict_from_crc_read_v1_ts", + "ict_multiple_commits_read_v0_ts", + "ict_multiple_commits_read_v1_ts", + "ict_multiple_commits_read_v2_ts", + "ict_time_travel_read_v0_ts", + "ict_time_travel_read_v1_ts", + "ict_time_travel_read_v2_ts", + "ict_timestamp_resolution_edges_read_ts_v2_exact", + "ict_timestamp_resolution_edges_read_ts_v2_minus_1ms", + "ict_timestamp_resolution_edges_read_ts_v2_plus_1ms", + "ict_timestamp_resolution_edges_read_ts_v3_exact", + "ict_timestamp_resolution_edges_read_ts_v3_minus_1ms", + "ict_with_checkpoint_read_v1_ts", + "ict_with_checkpoint_read_v3_ts", + "tt_at_syntax_timestamp_v0", + "tt_at_syntax_timestamp_v1", + "tt_column_defaults_timestamp_v0_empty", + "tt_column_defaults_timestamp_v1_null", + "tt_exact_timestamp_version_0_exact_ts", + "tt_exact_timestamp_version_1_exact_ts", + "tt_multi_version_scans_timestamp_v0", + "tt_multi_version_scans_timestamp_v1", + "tt_partition_evolution_timestamp_v0_old_partition", + "tt_partition_evolution_timestamp_v1_new_partition", + "tt_relation_caching_timestamp_v0", + "tt_relation_caching_timestamp_v1", + "tt_schema_evolution_timestamp_v0_old_schema", + "tt_schema_evolution_timestamp_v1_new_schema", + "tt_sql_syntax_timestamp_v0", + "tt_sql_syntax_timestamp_v1", + "tt_timestamp_between_commits_timestamp_v0", + "tt_timestamp_between_commits_timestamp_v1", + "tt_timestamp_between_commits_timestamp_v2", + "tt_version_read_timestamp_v0", + "tt_version_read_timestamp_v1", + "tt_version_read_timestamp_v2", + ], + ), +]; + +fn acceptance_workloads_test(spec_path: &Path) -> datatest_stable::Result<()> { + let spec_path_raw = format!( + "{}/{}", + env!["CARGO_MANIFEST_DIR"], + spec_path.to_str().unwrap() + ); + let spec_path_abs = std::fs::canonicalize(&spec_path_raw) + .unwrap_or_else(|_| std::path::PathBuf::from(&spec_path_raw)); + let spec_path_str = 
spec_path_abs.to_string_lossy().to_string(); + // Normalize Windows backslashes to forward slashes for pattern matching + #[cfg(windows)] + let spec_path_str = spec_path_str.replace('\\', "/"); + + // Check expected kernel failures FIRST (path matching only - these need to + // actually run to assert kernel still fails). Skip list checked second. + let expected_failure = EXPECTED_KERNEL_FAILURES + .iter() + .find(|(_, patterns)| patterns.iter().any(|p| spec_path_str.contains(p))); + + if expected_failure.is_none() && should_skip_test(&spec_path_str).is_some() { + return Ok(()); + } + + // Load and execute test case + let test_case = TestCase::from_spec_path(&spec_path_abs); + let table_root = test_case.table_root().expect("Failed to get table URL"); + let engine = test_utils::create_default_engine(&table_root).expect("Failed to create engine"); + let result = execute_and_validate_workload( + engine, + &table_root, + &test_case.spec, + &test_case.expected_dir(), + ); + + match (result, expected_failure) { + (Err(_), Some(_)) => {} // Expected to fail, did fail + (Ok(_), None) => {} // Expected to pass, did pass + (Ok(_), Some((reason, _))) => panic!( + "Workload '{}' was expected to fail but succeeded! \ + Reason: {reason}. Remove from EXPECTED_KERNEL_FAILURES!", + test_case.workload_name + ), + (Err(e), None) => panic!("Workload '{}' failed: {}", test_case.workload_name, e), + } + Ok(()) +} + +datatest_stable::harness! { + { + test = acceptance_workloads_test, + root = "workloads/", + pattern = r"specs/.*\.json$" + }, +} diff --git a/acceptance/tests/dat_reader.rs b/acceptance/tests/dat_reader.rs index 0d8d283e7b..596af4cf14 100644 --- a/acceptance/tests/dat_reader.rs +++ b/acceptance/tests/dat_reader.rs @@ -1,9 +1,6 @@ use std::path::Path; -use std::sync::Arc; use acceptance::read_dat_case; -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; // TODO(zach): skip iceberg_compat_v1 test until DAT is fixed static SKIPPED_TESTS: &[&str; 1] = &["iceberg_compat_v1"]; @@ -27,14 +24,7 @@ fn reader_test(path: &Path) -> datatest_stable::Result<()> { .block_on(async { let case = read_dat_case(root_dir).unwrap(); let table_root = case.table_root().unwrap(); - let engine = Arc::new( - DefaultEngine::try_new( - &table_root, - std::iter::empty::<(&str, &str)>(), - Arc::new(TokioBackgroundExecutor::new()), - ) - .unwrap(), - ); + let engine = test_utils::create_default_engine(&table_root).unwrap(); case.assert_metadata(engine.clone()).await.unwrap(); acceptance::data::assert_scan_metadata(engine.clone(), &case) diff --git a/acceptance/tests/other.rs b/acceptance/tests/other.rs index b6e9db6814..09d3163713 100644 --- a/acceptance/tests/other.rs +++ b/acceptance/tests/other.rs @@ -38,9 +38,7 @@ async fn test_read_table_with_checkpoint() { )) .unwrap(); let location = url::Url::from_directory_path(path).unwrap(); - let engine = Arc::new( - DefaultEngine::try_new(&location, HashMap::::new()).unwrap(), - ); + let engine = test_utils::create_default_engine(&location).unwrap(); let snapshot = Snapshot::try_new(location, engine, None) .await .unwrap(); diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 0000000000..2768f19689 --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,42 @@ +[package] +name = "delta_kernel_benchmarks" +publish = false +edition.workspace = true +homepage.workspace = true +license.workspace = true +repository.workspace = true +version.workspace = true +rust-version.workspace = 
true + +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] + +[build-dependencies] +flate2 = "1.1" +tar = "0.4" +ureq = "3.1" + +[dependencies] +delta_kernel = { path = "../kernel", features = ["default-engine-rustls", "internal-api"] } +delta-kernel-unity-catalog = { path = "../delta-kernel-unity-catalog" } +object_store = { version = "0.12.3", features = ["aws"] } +test_utils = { path = "../test-utils" } +rayon = "1" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +sqlparser = "0.55" +tokio = { version = "1", features = ["rt-multi-thread"] } +unity-catalog-delta-client-api = { path = "../unity-catalog-delta-client-api" } +unity-catalog-delta-rest-client = { path = "../unity-catalog-delta-rest-client" } +url = { version = "2", features = ["serde"] } + +[dev-dependencies] +criterion = "0.5" +rstest = "0.23" + +[[bench]] +name = "workload_bench" +harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 0000000000..617b2688d1 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,313 @@ +# Delta Kernel Benchmarking + +This crate contains benchmarking infrastructure for Delta Kernel using Criterion and JSON workload specs. It is separate from the `kernel` crate to keep benchmark-specific code and dependencies out of the core library. + +## Running benchmarks + +```bash +# run all benchmarks +cargo bench -p delta_kernel_benchmarks + +# run a specific bench binary +cargo bench -p delta_kernel_benchmarks --bench workload_bench + +# filter to benchmarks whose name contains a substring (Criterion substring matching) +cargo bench -p delta_kernel_benchmarks --bench workload_bench "some_name" + +# profile a benchmark and generate a flamegraph +cargo install samply +samply record cargo bench -p delta_kernel_benchmarks --bench workload_bench "some_name" +``` + +### Filtering benchmarks + +#### By benchmark name + +Benchmark names follow a hierarchical path structure assembled from the table name, the spec file name, the operation, and (for `Read` workloads) the read config name: + +``` +{table_name}/{spec_file_name}/{operation}/{config_name} +``` + +- `{table_name}` — the `name` field from `tableInfo.json` +- `{spec_file_name}` — the spec filename without its `.json` extension (the `case_name`) +- `{operation}` — `snapshotConstruction` or `readMetadata` +- `{config_name}` — only present for `Read` workloads; e.g. `serial`, `parallel2`, `parallel4` + +All path components use camelCase to match the JSON keys used throughout the workload spec format. 
+
+Examples:
+```
+101kAdds1000CommitsSinceChkpt1Chkpt/snapshotLatest/snapshotConstruction
+101kAdds1000CommitsSinceChkpt1Chkpt/snapshotLatest/readMetadata/serial
+10kAdds0CommitsSinceChkpt1V2Chkpt/snapshotLatest/readMetadata/parallel2
+```
+
+The filter argument is a regular expression, so you can create patterns to target the benchmarks that you want:
+
+```bash
+# all benchmarks for a specific table name
+cargo bench -p delta_kernel_benchmarks --bench workload_bench "101kAdds1000CommitsSinceChkpt1Chkpt"
+
+# all benchmarks for either of two tables (| for OR)
+cargo bench -p delta_kernel_benchmarks --bench workload_bench "101kAdds1000CommitsSinceChkpt1Chkpt|10kAdds0Chkpts"
+
+# all snapshotConstruction benchmarks
+cargo bench -p delta_kernel_benchmarks --bench workload_bench "snapshotConstruction"
+
+# snapshotConstruction workloads for a specific table (.* to AND two parts of the name)
+cargo bench -p delta_kernel_benchmarks --bench workload_bench "101kAdds1000CommitsSinceChkpt1Chkpt.*snapshotConstruction"
+
+# profile a specific benchmark with samply
+samply record cargo bench -p delta_kernel_benchmarks --bench workload_bench "101kAdds1000CommitsSinceChkpt1Chkpt/snapshotLatest/snapshotConstruction"
+```
+
+#### By tag (`BENCH_TAGS`)
+
+Set the `BENCH_TAGS` environment variable to a comma-separated list of tags to run only tables whose `tags` field (in `tableInfo.json`) contains at least one matching tag. If `BENCH_TAGS` is unset or empty, all tables are loaded and benchmarked.
+
+```bash
+# run only tables tagged "base"
+BENCH_TAGS=base cargo bench -p delta_kernel_benchmarks
+```
+
+Built-in tags with their current table assignments (for the most up-to-date assignments, run the benchmarks locally and inspect `benchmarks/workloads/benchmarks/<table_name>/tableInfo.json`):
+- **`base`** — a base set of tables run in CI
+  - Tables: `101kAdds1kCommitsSinceChkpt1Chkpt`
+- **`commit-size-scaling`** — tables for comparing how log replay time scales with the number of actions in the log; all are single-commit tables with varying action counts (100, 1k, 10k, 100k, 1M)
+  - Tables: `100Adds0Chkpts`, `1kAdds0Chkpts`, `10kAdds0Chkpts`, `100kAdds0Chkpts`, `1MAddsNoData0Chkpts`
+- **`checkpoint-reads-by-type`** — tables for comparing checkpoint reading performance for different kinds of checkpointing
+  - Tables: `10kAdds0CommitsSinceChkpt1Chkpt`, `10kAdds0CommitsSinceChkpt1V2Chkpt`
+- **`v2-checkpoint`** — tables with v2 checkpoints
+  - Tables: `10kAdds0CommitsSinceChkpt1V2Chkpt`
+- **`crc-optimization`** — tables for comparing how CRC files affect log replay timing; designed to isolate the effect of CRC files at different versions relative to the checkpoint and latest version
+  - Tables: `101kAdds1kCommitsSinceChkpt1Chkpt`, `20kAdds100CommitsSinceChkpt1Chkpt0CommitsSinceCrc`, `20kAdds100CommitsSinceChkpt1Chkpt50CommitsSinceCrc`, `20kAdds100CommitsSinceChkpt1ChkptNoCrc`
+- **`time-travel-optimization`** — tables with multiple specs or specs not at the latest version, useful for benchmarking snapshot construction at historical versions
+  - Tables: `101kAdds1kCommitsSinceChkpt1Chkpt`, `200kAdds0CommitsSinceChkpt2Chkpts0CommitsSinceCrc`
+- **`listing-optimization`** — table for benchmarking log listing efficiency (e.g. `list_from()` call patterns); useful for features that optimize how the delta log directory is scanned
+  - Tables: `200kAdds0CommitsSinceChkpt2Chkpts0CommitsSinceCrc`
+- **`metadata-only`** — tables with no actual data files, useful for isolating log metadata processing overhead
+  - Tables: `1MAddsNoData0Chkpts`
+
+
+You can also add custom tags to the `tags` field of any local `tableInfo.json` to group tables relevant to your work, then pass that tag via `BENCH_TAGS` without modifying any code:
+
+```bash
+BENCH_TAGS=my-feature cargo bench -p delta_kernel_benchmarks
+
+# run all tables tagged either "base" or "my-feature"
+BENCH_TAGS=base,my-feature cargo bench -p delta_kernel_benchmarks
+```
+
+### Running benchmarks on a PR
+
+To trigger benchmarks on a pull request, post a comment using the following syntax:
+
+```
+/bench [--tags <tags>] [--filter <regex>]
+```
+
+- `--tags` sets `BENCH_TAGS` (comma-separated), controlling which table groupings run.
+- `--filter` is a single-token Criterion regex matched against benchmark names.
+- Both flags are optional and independent; they can be given in any order.
+- When both are specified, they apply as AND: only benchmarks from tables that match the tag filter AND whose name matches the regex are run.
+- Running just `/bench` (with no flags) defaults to `BENCH_TAGS=base`; the same default applies if neither flag can be parsed.
+
+Examples:
+```
+/bench                                                      # BENCH_TAGS=base, all benchmark names
+/bench --tags base,my-tag                                   # BENCH_TAGS=base,my-tag, all benchmark names
+/bench --filter snapshotConstruction                        # no BENCH_TAGS set, only snapshotConstruction benchmarks
+/bench --tags base --filter 101kAdds.*snapshotConstruction  # only snapshotConstruction benchmarks from tables tagged "base"
+/bench --filter 101kAdds|10kAdds                            # no BENCH_TAGS set, OR two table names
+```
+
+See [By tag (`BENCH_TAGS`)](#by-tag-bench_tags) for how tags work and [By benchmark name](#by-benchmark-name) for regex pattern examples. Results are posted automatically as a PR comment, comparing the PR branch against the base branch.
+CI timings are noisy and tend to run higher than on dedicated hardware, but proportional differences between branches are a rough signal for performance changes.
+
+## Workload data layout
+
+Each table lives in its own subdirectory under `benchmarks/workloads/benchmarks/`:
+
+```
+benchmarks/workloads/
+├── benchmarks/
+│   └── <table_name>/
+│       ├── tableInfo.json       # describes the table (name, schema, protocol, etc.)
+│       ├── delta/               # Delta table data (if no explicit tablePath)
+│       └── specs/
+│           └── <spec_name>.json # one file per benchmark operation
+└── tests/                       # reserved for future test workloads (currently empty)
+```
+
+## Loading workloads
+
+Workloads are downloaded from the DAT GitHub release and extracted to `benchmarks/workloads/` automatically by `build.rs` when the crate is built. A `.done` marker file is written on success to skip re-downloading on subsequent builds. To force a fresh download, delete `benchmarks/workloads/.done`.
+
+Workloads are discovered automatically by path. `load_all_workloads()` scans every subdirectory of `benchmarks/workloads/benchmarks/`, loading `tableInfo.json` and every spec file under `specs/`. The spec filename (without extension) becomes the `case_name`.
+
+## Current benchmarking workloads
+
+This README does not maintain a single exhaustive list of all benchmark tables and their contents, as the set changes over time. The [built-in tags](#by-tag-bench_tags) section lists table names grouped by tag, but it is non-exhaustive and subject to change. To explore which tables exist and what each one contains, run benchmarks locally and inspect the `tableInfo.json` file in each table's directory under `benchmarks/workloads/benchmarks/`.
+
+## Adding a new table locally
+
+### Local tables
+
+To benchmark against a local Delta table:
+
+1. Extract the workload archive if you haven't already — the simplest way is to run any benchmark once, which auto-extracts it:
+   ```bash
+   cargo bench -p delta_kernel_benchmarks --bench workload_bench
+   ```
+2. Create a directory for the new table under `benchmarks/workloads/benchmarks/`:
+   ```
+   benchmarks/workloads/benchmarks/<table_name>/
+   ├── tableInfo.json       # see TableInfo section below for required fields
+   ├── delta/               # Delta table files (_delta_log/, parquet data, etc.)
+   └── specs/
+       └── <spec_name>.json # one or more spec files describing operations to benchmark
+   ```
+3. Run benchmarks — the new table is discovered automatically (you can filter by table name — see [By benchmark name](#by-benchmark-name)):
+   ```bash
+   cargo bench -p delta_kernel_benchmarks --bench workload_bench "<table_name>"
+   ```
+
+### Remote tables (S3 / UC)
+
+Remote tables are benchmarked via `KERNEL_BENCH_WORKLOAD_DIR`, which points to a directory of
+table configs outside of the workload archive. Each subdirectory has the same layout as local
+tables (`tableInfo.json` + `specs/`), but no `delta/` directory is needed.
+
+There are two types of remote tables, determined by the `tableInfo.json` fields:
+
+- **S3 tables** — set `tablePath` to the S3 URL (e.g. `s3://bucket/path`). Requires `AWS_*`
+  env vars for credentials.
+- **UC tables** — set `catalogInfo` with a `tableName` field (e.g. `catalog.schema.table`).
+  Credentials are vended via UC at runtime (`UC_WORKSPACE` / `UC_TOKEN` env vars). The
+  benchmark harness automatically detects catalog-managed tables (via UC properties) and
+  uses the appropriate snapshot loading path.
+
+Example `tableInfo.json` for a UC table:
+```json
+{
+  "name": "my_uc_table",
+  "description": "A UC-managed table",
+  "catalogInfo": {"tableName": "catalog.schema.table"},
+  "schema": {"type": "struct", "fields": [...]},
+  "protocol": {"minReaderVersion": 1, "minWriterVersion": 2},
+  "logInfo": {"numAddFiles": 100, "numRemoveFiles": 0, "sizeInBytes": 0, "numCommits": 1, "numActions": 100},
+  "properties": {},
+  "dataLayout": {},
+  "tags": []
+}
+```
+
+Example:
+```bash
+KERNEL_BENCH_WORKLOAD_DIR=/path/to/my/tables \
+  UC_WORKSPACE=https://my-workspace.cloud.databricks.com UC_TOKEN=... AWS_REGION=us-west-2 \
+  cargo bench --bench workload_bench
+```
+
+## Entities
+
+### `TableInfo`
+
+Deserialized from `tableInfo.json`. Captures the table's identity (`name`, `description`), Delta schema and protocol, log statistics (`logInfo`), physical data layout, table properties, and benchmark tags. See [`src/models.rs`](src/models.rs) for field-level documentation.
+ +#### Example + +```json +{ + "name": "myTable", + "description": "A basic table with two append writes.", + "schema": {"type": "struct", "fields": [ + {"name": "id", "type": "long", "nullable": true, "metadata": {}} + ]}, + "protocol": {"minReaderVersion": 3, "minWriterVersion": 7, "readerFeatures": [], "writerFeatures": []}, + "logInfo": { + "numAddFiles": 10, + "numRemoveFiles": 0, + "sizeInBytes": 4096, + "numCommits": 2, + "numActions": 12 + }, + "properties": {}, + "dataLayout": {}, + "tags": ["base"] +} +``` + +### `Spec` + +Deserialized from a JSON file in a table's `specs/` directory. Describes a single operation to benchmark (what to do, e.g. read at version 3). Two variants are supported: + +- **`Read`** — scan a table at an optional version (defaults to latest). A single `Read` spec expands into one benchmark per `ReadOperation` × `ReadConfig` combination — every relevant operation and parallelism mode is benchmarked. Currently only `ReadMetadata` is implemented; `ReadData` is not yet supported. +- **`SnapshotConstruction`** — measure the cost of building a `Snapshot` from scratch at an optional version (defaults to latest) + +Read specs: +```json +{ + "type": "read" +} +``` +Or with a specific version: + +```json +{ + "type": "read", + "version": 0 +} +``` + +With a predicate for data skipping (SQL WHERE clause syntax): + +```json +{ + "type": "read", + "predicate": "id < 500 AND value > 10" +} +``` + +The `predicate` field accepts a SQL WHERE clause expression that is parsed into a kernel `Predicate` and passed to the scan builder. See [`src/predicate_parser.rs`](src/predicate_parser.rs) for the full list of supported SQL features. + +Snapshot construction specs: +```json +{ + "type": "snapshotConstruction" +} +``` +Or with a specific version: + +```json +{ + "type": "snapshotConstruction", + "version": 0 +} +``` + +### `Workload` + +The concrete unit of work that gets benchmarked. Assembled when loading workloads by pairing a `Spec` (the operation) with a `TableInfo` (the table) and a `case_name`. A `Spec` file on its own solely describes an operation without context of the table it is performed on; when combined with a table, it becomes a `Workload`. A single table therefore produces multiple workloads, one for each spec file in its `specs/` directory. + +### `ReadConfig` + +Specifies runtime parameters for `Read` workloads that are not part of the spec JSON — currently whether to scan serially or in parallel, and how many threads to use. Multiple configs can be applied to the same workload to compare modes. By default all workloads run serial log replay; workloads with sidecar files additionally run parallel configs to benchmark parallel scanning. + +### `WorkloadRunner` + +Owns all pre-built state for a workload (e.g. a pre-constructed `Snapshot`) so that `execute()` measures only the target operation. Each runner corresponds to one `Workload` plus whatever additional configuration that workload type requires — `Read` workloads take a `ReadConfig`, while `SnapshotConstruction` workloads require no extra configuration. 
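+
+To make the relationships concrete, the sketch below shows roughly how these pieces could fit together. It is illustrative only: the field names and signatures are assumptions, and the actual definitions live in [`src/models.rs`](src/models.rs) and [`src/runners.rs`](src/runners.rs).
+
+```rust
+// Hypothetical shapes for orientation only; see src/models.rs and src/runners.rs
+// for the real definitions, which may differ.
+pub struct Workload {
+    pub table_info: TableInfo, // the table being benchmarked (from tableInfo.json)
+    pub case_name: String,     // spec filename without the .json extension
+    pub spec: Spec,            // the operation: Read or SnapshotConstruction
+}
+
+pub trait WorkloadRunner {
+    /// Hierarchical benchmark name, e.g. "{table}/{case}/{operation}/{config}".
+    fn name(&self) -> &str;
+    /// Runs only the operation being measured; all setup happens before benchmarking.
+    fn execute(&self) -> Result<(), Box<dyn std::error::Error>>;
+}
+```
+
+`benches/workload_bench.rs` builds one runner per workload (and per `ReadConfig` for `Read` workloads) and hands its `execute()` call to Criterion, so only the measured operation lands inside the timing loop.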
+ + +## Source Layout + +| File | Purpose | +|------|---------| +| `src/models.rs` | Data types: `TableInfo`, `Spec`, `Workload`, `ReadConfig`, `ReadOperation` | +| `src/predicate_parser.rs` | SQL WHERE clause to kernel `Predicate` parser | +| `src/runners.rs` | `WorkloadRunner` trait and implementations: `ReadMetadataRunner`, `SnapshotConstructionRunner` | +| `src/utils.rs` | Workload loading: deserializes workloads from the extracted data directory | +| `benches/workload_bench.rs` | Criterion entry point — loads workloads, builds runners, drives benchmarks | +| `build.rs` | Downloads and extracts benchmark workloads from the DAT GitHub release at build time | + diff --git a/benchmarks/benches/workload_bench.rs b/benchmarks/benches/workload_bench.rs new file mode 100644 index 0000000000..fb706c3e7f --- /dev/null +++ b/benchmarks/benches/workload_bench.rs @@ -0,0 +1,92 @@ +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; + +use criterion::{criterion_group, criterion_main, Criterion}; + +use delta_kernel_benchmarks::models::{ + default_read_configs, ParallelScan, ReadConfig, ReadOperation, Spec, +}; +use delta_kernel_benchmarks::runners::{ + create_read_runner, SnapshotConstructionRunner, WorkloadRunner, +}; +use delta_kernel_benchmarks::utils::load_all_workloads; +use test_utils::CountingReporter; + +// Loads all workloads and sets up a shared runtime, then registers each as a top-level benchmark. +// For each workload, builds a runner that encapsulates the state (table info, engine, config, etc.) +// and execution logic. After each Criterion timing pass, runs one IO-profiling iteration and +// prints per-call storage and log-replay counts. +fn workload_benchmarks(c: &mut Criterion) { + let workloads = match load_all_workloads() { + Ok(workloads) if !workloads.is_empty() => workloads, + Ok(_) => panic!("No workloads found"), + Err(e) => panic!("Failed to load workloads: {e}"), + }; + + let reporter = Arc::new(CountingReporter::new()); + let runtime = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create tokio runtime")); + + for workload in &workloads { + match &workload.spec { + Spec::Read(read_spec) => { + for operation in [ReadOperation::ReadMetadata] { + for config in build_read_configs(&workload.table_info.name) { + let runner = create_read_runner( + &workload.table_info, + &workload.case_name, + read_spec, + operation, + config, + runtime.clone(), + ) + .expect("Failed to create read runner"); + run_benchmark(c, runner.as_ref(), &reporter); + } + } + } + Spec::SnapshotConstruction(snapshot_construction_spec) => { + let runner = SnapshotConstructionRunner::setup( + &workload.table_info, + &workload.case_name, + snapshot_construction_spec, + runtime.clone(), + ) + .expect("Failed to create snapshot construction runner"); + run_benchmark(c, &runner, &reporter); + } + } + } +} + +// Registers a workload with Criterion and benchmarks its `execute()` function. +// After timing completes, runs one IO-profiling iteration and prints per-call storage and +// log-replay counts. The IO profile is skipped entirely when Criterion filters out the benchmark, +// since Criterion never calls the closure for filtered benchmarks. 
+fn run_benchmark(c: &mut Criterion, runner: &dyn WorkloadRunner, reporter: &CountingReporter) { + let bench_ran = AtomicBool::new(false); + c.bench_function(runner.name(), |b| { + bench_ran.store(true, Ordering::Relaxed); + b.iter(|| runner.execute().expect("Benchmark execution failed")) + }); + if bench_ran.load(Ordering::Relaxed) { + reporter.reset(); + runner.execute().expect("IO profiling iteration failed"); + reporter.print_summary(runner.name()); + } +} + +fn build_read_configs(table_name: &str) -> Vec { + // Choose which benchmark configurations to run for a given table + // TODO: This function will take in table info to choose the appropriate configs for a given table + let mut configs = default_read_configs(); + if table_name.contains("V2Chkpt") { + configs.push(ReadConfig { + name: "parallel2".into(), + parallel_scan: ParallelScan::Enabled { num_threads: 2 }, + }); + } + configs +} + +criterion_group!(benches, workload_benchmarks); +criterion_main!(benches); diff --git a/benchmarks/build.rs b/benchmarks/build.rs new file mode 100644 index 0000000000..b3d9cf1bea --- /dev/null +++ b/benchmarks/build.rs @@ -0,0 +1,69 @@ +//! Build script for benchmark workloads + +use std::env; +use std::fs::File; +use std::io::{BufReader, BufWriter, Read, Write}; +use std::path::{Path, PathBuf}; + +use flate2::read::GzDecoder; +use tar::Archive; +use ureq::{Agent, Proxy}; + +const VERSION: &str = "0.04-preview"; // release tag +const WORKLOADS_VERSION: &str = "0.0.4"; // version in filename + +fn main() { + let manifest_dir = env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR not set"); + let output_dir = PathBuf::from(manifest_dir).join("workloads"); + let done_marker = output_dir.join(".done"); + + println!("cargo::rerun-if-changed={}", done_marker.display()); + + if done_marker.exists() { + return; + } + + let tarball_data = download_workloads(); + extract_tarball(tarball_data, &output_dir); + write_done_file(&done_marker); +} + +fn download_workloads() -> Vec { + let tarball_url = format!( + "https://github.com/delta-incubator/dat/releases/download/v{VERSION}/v{WORKLOADS_VERSION}_benchmark_workloads.tar.gz" + ); + download_tarball(&tarball_url) +} + +fn download_tarball(url: &str) -> Vec { + let response = if let Ok(proxy_url) = env::var("HTTPS_PROXY") { + let proxy = Proxy::new(&proxy_url).unwrap(); + let config = Agent::config_builder().proxy(proxy.into()).build(); + Agent::new_with_config(config).get(url).call().unwrap() + } else { + ureq::get(url).call().unwrap() + }; + + let mut data: Vec = Vec::new(); + response + .into_body() + .as_reader() + .read_to_end(&mut data) + .unwrap(); + data +} + +fn extract_tarball(tarball_data: Vec, output_dir: &Path) { + let decoder = GzDecoder::new(BufReader::new(&tarball_data[..])); + let mut archive = Archive::new(decoder); + std::fs::create_dir_all(output_dir).expect("Failed to create output directory"); + archive + .unpack(output_dir) + .expect("Failed to unpack tarball"); +} + +fn write_done_file(done_marker: &Path) { + let mut done_file = + BufWriter::new(File::create(done_marker).expect("Failed to create .done file")); + write!(done_file, "done").expect("Failed to write .done file"); +} diff --git a/benchmarks/ci/parse_critcmp.py b/benchmarks/ci/parse_critcmp.py new file mode 100644 index 0000000000..14298ed016 --- /dev/null +++ b/benchmarks/ci/parse_critcmp.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Parse critcmp output and format it as a GitHub-flavoured Markdown table. 
+ +Reads the output of `critcmp base changes` from stdin and writes a table +with columns: Test | Base | PR | %, where the faster side and the % cell +are bolded when the difference is statistically significant (i.e. the +error bounds do not overlap). + +Usage: + critcmp base changes | python3 benchmarks/ci/parse_critcmp.py +""" +import sys, re + +def to_ms(value, units): + u = units.strip() + if u == 's': return value * 1e3 + if u == 'ms': return value + if u in ('µs', 'us', 'μs'): return value / 1e3 + if u == 'ns': return value / 1e6 + return value + +def is_significant(chg_dur, chg_err, base_dur, base_err): + """Return True if the difference between two measurements is statistically significant. + + Significance is determined by whether either center value (the `X` in `X±err`) lies outside the other's error bar. + Concretely, if chg is faster than base, the change is significant if: + - base's center value is above chg's entire error bar range (chg + chg_err < base), OR + - chg's center value is below base's entire error bar range (base - base_err > chg). + The symmetric conditions apply when chg is slower. This is an OR test: only one side + needs to show clear separation for the difference to be considered significant. + """ + if chg_dur < base_dur: + return chg_dur + chg_err < base_dur or base_dur - base_err > chg_dur + else: + return chg_dur - chg_err > base_dur or base_dur + base_err < chg_dur + +def parse_duration(s): + m = re.match(r'([0-9.]+)±([0-9.]+)(.+)', s.strip()) + if not m: + return None + return float(m.group(1)), float(m.group(2)), m.group(3).strip() + +def main(): + # Expected critcmp input format (2-space-separated columns): + # + # group base changes + # ----- ---- ------- + # bench_name 1.00 1.2±0.01µs 1.05 1.3±0.02µs + # bench_name/with_throughput 1.00 1.2±0.01µs 1.2 MB/s 1.05 1.3±0.02µs 1.1 MB/s + # + # Expected output (GitHub-flavored markdown table): + # Throughput/bandwidth columns from critcmp are ignored; output is the same either way. + # + # | Test | Base | PR | % | + # |----------------------------|------------|-----------------|--------| + # | bench_name | 1.2±0.01µs | **1.3±0.02µs** | **+8.33%** | + lines = sys.stdin.read().splitlines() + print("| Test | Base | PR | % |") + print("|------|--------------|------------------|---|") + + for line in lines[2:]: # skip critcmp header rows + if not line.strip(): + continue + # critcmp columns (split on 2+ spaces): + # with throughput: name, baseFactor, baseDuration, baseBandwidth, changesFactor, changesDuration, changesBandwidth + # without throughput: name, baseFactor, baseDuration, changesFactor, changesDuration + # Locate duration fields by the presence of "±" rather than hardcoding indices, + # so the script works correctly regardless of whether bandwidth columns are present. + fields = re.split(r' +', line) + name = fields[0].strip().replace('|', r'\|') if fields else '' + dur_fields = [f.strip() for f in fields[1:] if '±' in f] + base_dur_str = dur_fields[0] if len(dur_fields) > 0 else None + chg_dur_str = dur_fields[1] if len(dur_fields) > 1 else None + + if not name and not base_dur_str and not chg_dur_str: + continue + + # N/A when a benchmark only exists in one of the two runs (added or removed). + base_display = base_dur_str or 'N/A' + chg_display = chg_dur_str or 'N/A' + difference = 'N/A' + + # Only compute a percentage change when both runs have a measurement for this benchmark. + if base_dur_str and chg_dur_str: + # Parse each duration string into (mean, error, units), e.g. 
"1.2±0.01µs" -> (1.2, 0.01, "µs"). + base_p = parse_duration(base_dur_str) + chg_p = parse_duration(chg_dur_str) + if base_p and chg_p: + # Normalise both measurements to milliseconds so they can be compared directly. + base_ms = to_ms(base_p[0], base_p[2]) + base_err_ms = to_ms(base_p[1], base_p[2]) + chg_ms = to_ms(chg_p[0], chg_p[2]) + chg_err_ms = to_ms(chg_p[1], chg_p[2]) + + # Compute relative change: negative means faster, positive means slower. + pct = -(1 - chg_ms / base_ms) * 100 + prefix = '' if chg_ms <= base_ms else '+' + difference = f'{prefix}{pct:.2f}%' + + # Bold the slower of the two durations to draw attention to what changed, + # and always bold the difference column when the change is significant. + if is_significant(chg_ms, chg_err_ms, base_ms, base_err_ms): + if chg_ms < base_ms: + chg_display = f'**{chg_dur_str}**' + elif chg_ms > base_ms: + base_display = f'**{base_dur_str}**' + difference = f'**{difference}**' + + print(f'| {name} | {base_display} | {chg_display} | {difference} |') + +if __name__ == "__main__": + main() diff --git a/benchmarks/ci/run-benchmarks.sh b/benchmarks/ci/run-benchmarks.sh new file mode 100644 index 0000000000..197bd7030a --- /dev/null +++ b/benchmarks/ci/run-benchmarks.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# Benchmark comparison script for pull requests. +# +# Called by .github/workflows/benchmark.yml (run-benchmark job) after the repo +# has been checked out at the PR head. Writes the formatted markdown comparison +# to /tmp/bench-comment.md; the post-comment job picks it up and posts it. +# +# Expects the following environment variables: +# +# COMMENT - the /bench PR comment body +# BASE_REF - base branch ref (e.g. "main") +# HEAD_SHA - full SHA of the PR head commit + +set -euo pipefail +shopt -s extglob + +# --------------------------------------------------------------------------- +# 1. Parse the /bench comment +# Syntax: /bench [--tags ] [--filter ] +# --tags sets BENCH_TAGS (comma-separated tag list); defaults to "base" +# when the comment is just /bench +# --filter Criterion name regex passed as a positional arg to cargo bench +# --------------------------------------------------------------------------- + +ARGS="${COMMENT#/bench}" +ARGS="${ARGS##+( )}" + +TAGS="" +FILTER="" + +if [[ -z "$ARGS" ]]; then + # Bare /bench with no args: default to the "base" tag + TAGS="base" +else + # Normalize: strip /bench prefix, collapse all whitespace (including newlines) + # to spaces, then strip to a safe allowlist before parsing + ARGS=$(printf '%s' "$ARGS" | tr '\n\r\t' ' ' | tr -s ' ' | tr -cd 'a-zA-Z0-9,_./|*+?()[]^$ -') + ARGS="${ARGS## }" # strip leading space + ARGS="${ARGS%% }" # strip trailing space + + read -ra TOKENS <<< "$ARGS" + i=0 + while [[ $i -lt ${#TOKENS[@]} ]]; do + case "${TOKENS[$i]}" in + --tags) i=$((i + 1)); TAGS="${TOKENS[$i]:-}" ;; + --filter) i=$((i + 1)); FILTER="${TOKENS[$i]:-}" ;; + *) echo "Unknown token: '${TOKENS[$i]}'" >&2; exit 1 ;; + esac + i=$((i + 1)) + done +fi + +# Default: if nothing was parsed, run with BENCH_TAGS=base +if [[ -z "$TAGS" && -z "$FILTER" ]]; then + TAGS="base" +fi + +echo "Parsed tags: ${TAGS:-}" +echo "Parsed filter: ${FILTER:-}" + +[[ -n "$TAGS" ]] && export BENCH_TAGS="$TAGS" + +# --------------------------------------------------------------------------- +# 2. 
Benchmark the PR branch (already checked out by the workflow) +# --------------------------------------------------------------------------- +(cd benchmarks && cargo bench --locked --bench workload_bench -- --save-baseline changes "$FILTER") + +# --------------------------------------------------------------------------- +# 3. Switch to the base branch and benchmark it +# The benchmarks/target/ directory is not tracked by git, so the +# "changes" baseline files are preserved across the branch switch. +# --------------------------------------------------------------------------- +git fetch origin -- "$BASE_REF" +git checkout FETCH_HEAD +(cd benchmarks && cargo bench --locked --bench workload_bench -- --save-baseline base "$FILTER") + +# --------------------------------------------------------------------------- +# 4. Compare baselines with critcmp and format as a markdown table. +# - Parses actual duration values (not rank factors) for the % column +# - Bolds the faster duration and % cell when the difference is +# statistically significant (error bounds do not overlap) +# --------------------------------------------------------------------------- +# Use `critcmp` to compare the criterion output for `base` and `changes`. We use `critcmp` instead of manually +# parsing criterion outputs because criterion may update its output format. By using `critcmp`, we inherit all +# updated criterion output parsing. +COMPARISON=$((cd benchmarks && critcmp base changes) | python3 benchmarks/ci/parse_critcmp.py) + +# --------------------------------------------------------------------------- +# 5. Write results to /tmp/bench-comment.md +# The post-comment job in benchmark.yml downloads this file as an artifact +# and posts it as a PR comment using a step that holds GH_TOKEN. +# --------------------------------------------------------------------------- +SHORT_SHA="${HEAD_SHA:0:7}" + +SUMMARY="" +[[ -n "$TAGS" ]] && SUMMARY="tags: \`${TAGS}\`" +[[ -n "$FILTER" ]] && SUMMARY+="${SUMMARY:+ | }filter: \`${FILTER}\`" + +{ + echo "## Benchmark for ${SHORT_SHA}" + echo "
" + echo "${SUMMARY}" + echo "" + echo "$COMPARISON" + echo "" + echo "
" +} > /tmp/bench-comment.md diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 0000000000..0141c19247 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,4 @@ +pub mod models; +pub mod predicate_parser; +pub mod runners; +pub mod utils; diff --git a/benchmarks/src/models.rs b/benchmarks/src/models.rs new file mode 100644 index 0000000000..8e408d796b --- /dev/null +++ b/benchmarks/src/models.rs @@ -0,0 +1,523 @@ +//! Data models for workload specifications + +use delta_kernel::actions::{Metadata, Protocol}; +use delta_kernel::schema::Schema; +use serde::Deserialize; +use url::Url; + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; + +/// ReadConfig represents a specific configuration for a read operation +/// A config represents configurations for a specific benchmark that aren't specified in the spec JSON file +#[derive(Clone, Debug)] +pub struct ReadConfig { + pub name: String, + pub parallel_scan: ParallelScan, +} + +/// Provides a default set of read configs for a given table, read spec, and operation +pub fn default_read_configs() -> Vec { + vec![ReadConfig { + name: "serial".into(), + parallel_scan: ParallelScan::Disabled, + }] +} + +#[derive(Clone, Debug)] +pub enum ParallelScan { + Disabled, + Enabled { num_threads: usize }, +} + +/// Info needed to access a UC-managed table via credential vending. +/// This covers both catalog-managed and non-catalog-managed UC tables. +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct CatalogInfo { + /// Fully-qualified table name: "catalog.schema.table" + pub table_name: String, +} + +/// Table info JSON files are located at the root of each table directory +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct TableInfo { + /// Table name is a short identifier for the table (part of the final benchmark name), e.g. 100Adds0Chkpts + pub name: String, + /// Human-readable description of the table. Use this to capture context that the name alone + /// doesn't convey (e.g. "A table with 1 commit with 1M add actions. This includes a commit file + /// in the delta log, but no actual Parquet data files and no CRC files"). + /// The description is a free-form more verbose description for human readers. + pub description: String, + /// URL to the table. Used for remote tables (e.g. `s3://my-bucket/my-table`) or (rarely) + /// absolute local paths. If `None`, the table is assumed to be in the `delta/` subdirectory + /// next to `tableInfo.json`. Mutually exclusive with `catalog_info`. + pub table_path: Option, + /// Info needed to access a UC-managed table via credential vending. + /// When present, the engine is set up with UC-vended credentials instead of local/S3 access. + /// Whether to use `UCKernelClient` (catalog-managed) or standard snapshot builder is + /// determined by the `delta.feature.catalogManaged` property. + /// Mutually exclusive with `table_path`. + /// TODO(#2303): Create an enum type that ensures table_path and catalog_info are mutually exclusive + pub catalog_info: Option, + /// Schema at the latest version of the table, in Delta protocol JSON format + /// e.g. `{"type": "struct", "fields": [...]}` + pub schema: Schema, + /// Delta protocol requirements at the latest version of the table + /// e.g. 
`{"minReaderVersion": 3, "minWriterVersion": 7, "readerFeatures": [], "writerFeatures": []}` + pub protocol: Protocol, + /// Log-level statistics giving a quick overview of the table without requiring a full log + /// replay. See [`LogInfo`] for field details. + pub log_info: LogInfo, + /// Table properties from the Delta metadata action (string key-value pairs). Use `{}` if + /// none. e.g. `{"delta.enableDeletionVector": "true", "delta.columnMapping.mode": "none"}` + pub properties: HashMap, + /// Physical data layout of the table. Use `{}` for unpartitioned/unclustered tables. + /// See [`DataLayout`] for partitioned and clustered variants. + pub data_layout: DataLayout, + /// Tags for filtering which tables are benchmarked via `BENCH_TAGS`. Use `[]` if none. + /// Built-in tag: `base` (run in CI). e.g. `["base", "my-feature"]` + #[serde(default)] + pub tags: Vec, + /// Path to the directory containing the `tableInfo.json` file + #[serde(skip, default)] + pub table_info_dir: PathBuf, +} + +impl TableInfo { + /// Returns true if the table has at least one tag in common with `required` + /// Tag matching uses union semantics; any single matching tag is sufficient + pub fn matches_tags(&self, required: &[String]) -> bool { + required.iter().any(|r| self.tags.contains(r)) + } + + pub fn resolved_table_root(&self) -> Url { + self.table_path.clone().unwrap_or_else(|| { + // If table path is not provided, assume that the Delta table is in a delta/ subdirectory at the same level as tableInfo.json + Url::from_file_path(self.table_info_dir.join("delta")) + .expect("table_info_dir must be an absolute path") + }) + } + + pub fn from_json_path>(path: P) -> Result { + let content = std::fs::read_to_string(path.as_ref()).map_err(serde_json::Error::io)?; + let mut table_info: TableInfo = serde_json::from_str(&content)?; + if table_info.catalog_info.is_some() && table_info.table_path.is_some() { + return Err(serde_json::Error::io(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "catalog_info and table_path are mutually exclusive", + ))); + } + // Stores the parent directory of the `tableInfo.json` file + if let Some(parent) = path.as_ref().parent() { + table_info.table_info_dir = parent.to_path_buf(); + } + Ok(table_info) + } +} + +/// Log-level information describing the history and structure of a Delta table +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct LogInfo { + /// Number of active Add file actions in the table + pub num_add_files: u64, + /// Number of Remove file actions in the table + pub num_remove_files: u64, + /// Total on-disk size of all data files in bytes + pub size_in_bytes: Option, + /// Number of commits (JSON log files) in the table history + pub num_commits: u64, + /// Total number of actions across all commits + pub num_actions: u64, + /// Version of the most recent checkpoint, if any + pub last_checkpoint_version: Option, + /// Version of the most recent CRC file, if any + pub last_crc_version: Option, + /// Number of part files in the most recent multi-part checkpoint, if any. + /// For classic multi-part checkpoints this is the number of parquet parts; for V2 checkpoints this is the number of sidecar files + /// For workloads that don't have multi-part checkpoints/sidecars, this is `None` + pub num_checkpoint_files: Option, +} + +/// Physical data layout of a Delta table +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +pub enum DataLayout { + /// Partitioned table. 
Two tables with the same number of partition columns can differ + /// significantly in cardinality (e.g. 1 column with 100 distinct values vs. 10000), so both + /// are tracked. e.g. `{"numPartitionColumns": 2, "numDistinctPartitions": 100}` + Partitioned { + /// Number of partition columns + #[serde(rename = "numPartitionColumns")] + num_partition_columns: u32, + /// Number of distinct partition values observed in the table + #[serde(rename = "numDistinctPartitions")] + num_distinct_partitions: u64, + }, + /// Clustered table, e.g. `{"numClusteringColumns": 1}` + Clustered { + /// Number of clustering columns + #[serde(rename = "numClusteringColumns")] + num_clustering_columns: u32, + }, + /// No special data organization (unpartitioned, unclustered). Serializes as `{}` + None {}, +} + +/// Time travel parameter. Either version or timestamp. +#[derive(Clone, Debug, Deserialize)] +#[serde(untagged)] +pub enum TimeTravel { + Version { version: u64 }, + Timestamp { timestamp: String }, +} + +impl TimeTravel { + /// Returns the version if this is version-based time travel. + /// + /// Returns an error message for timestamp-based time travel, which is not yet supported. + pub fn as_version(&self) -> Result { + match self { + TimeTravel::Version { version } => Ok(*version), + TimeTravel::Timestamp { .. } => Err("Timestamp-based time travel is not yet supported"), + } + } +} + +/// Spec defines the operation performed on a table - defines what operation at what version (e.g. read at version 0) +/// There will be multiple specs for a given table +#[derive(Clone, Debug, Deserialize)] +#[serde(tag = "type", rename_all = "camelCase")] +pub enum Spec { + Read(ReadSpec), + #[serde(alias = "snapshot_construction")] + SnapshotConstruction(Box), +} + +/// Specification for a read workload. +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ReadSpec { + // Time travel version or timestamp for the read + #[serde(flatten)] + pub time_travel: Option, + /// SQL WHERE clause expression (e.g. "id < 500"). Parsed into a kernel `Predicate` + /// and passed to the scan builder for data skipping. + pub predicate: Option, + // Column projections to read + pub columns: Option>, + /// Expected outcome - either success with row count or error with code. + #[serde(flatten)] + pub expected: Option, +} + +impl ReadSpec { + pub fn as_str(&self) -> &str { + "read" + } +} + +/// Specification for a snapshot construction workload. +#[derive(Clone, Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SnapshotConstructionSpec { + // Time travel version or timestamp for the read + #[serde(flatten)] + pub time_travel: Option, + /// Expected outcome - either success with protocol/metadata or error with code. + #[serde(flatten)] + pub expected: Option, +} + +impl SnapshotConstructionSpec { + pub fn as_str(&self) -> &str { + "snapshotConstruction" + } +} + +impl Spec { + pub fn as_str(&self) -> &str { + match self { + Spec::Read(read_spec) => read_spec.as_str(), + Spec::SnapshotConstruction(snapshot_construction_spec) => { + snapshot_construction_spec.as_str() + } + } + } + + pub fn from_json_path>(path: P) -> Result { + let content = std::fs::read_to_string(path.as_ref()).map_err(serde_json::Error::io)?; + let spec: Spec = serde_json::from_str(&content)?; + Ok(spec) + } +} + +/// Expected error outcome for a workload. 
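+/// e.g. `{"errorCode": "...", "errorMessage": "..."}` (`errorMessage` is optional)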
+#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ExpectedError { + pub error_code: String, + pub error_message: Option, +} + +/// Expected success outcome for a read workload. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ReadExpectedSuccess { + pub row_count: u64, + pub file_count: Option, + pub files_skipped: Option, +} + +/// Expected result for read operations - either success or error. +#[derive(Debug, Clone, Deserialize)] +#[serde(untagged)] +pub enum ReadExpected { + Success { expected: ReadExpectedSuccess }, + Error { error: ExpectedError }, +} + +/// Expected success outcome for a snapshot construction workload. +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SnapshotExpectedSuccess { + pub protocol: Box, + pub metadata: Box, +} + +/// Expected result for snapshot operations - either success or error. +#[derive(Debug, Clone, Deserialize)] +#[serde(untagged)] +pub enum SnapshotExpected { + Success { expected: SnapshotExpectedSuccess }, + Error { error: ExpectedError }, +} + +/// For Read specs, we will either run a read data operation or a read metadata operation +#[derive(Clone, Copy, Debug)] +pub enum ReadOperation { + ReadData, + ReadMetadata, +} + +impl ReadOperation { + pub fn as_str(&self) -> &str { + match self { + ReadOperation::ReadData => "readData", + ReadOperation::ReadMetadata => "readMetadata", + } + } +} + +/// Partial workload specification loaded from JSON - table, case name, and spec only +#[derive(Clone, Debug)] +pub struct Workload { + pub table_info: TableInfo, + /// Spec filename without extension; used as the benchmark case label + pub case_name: String, + pub spec: Spec, +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + + fn make_table_info(tags: &[&str]) -> TableInfo { + let tags_json = serde_json::to_string(tags).unwrap(); + let json = format!( + r#"{{"name":"t","description":"d","schema":{{"type":"struct","fields":[]}},"protocol":{{"minReaderVersion":1,"minWriterVersion":2}},"logInfo":{{"numAddFiles":0,"numRemoveFiles":0,"sizeInBytes":0,"numCommits":1,"numActions":1}},"properties":{{}},"dataLayout":{{}},"tags":{}}}"#, + tags_json + ); + serde_json::from_str(&json).unwrap() + } + + #[rstest] + #[case(&["ci", "checkpoints"], &["ci"], true)] + #[case(&["ci", "checkpoints"], &["ci", "large"], true)] + #[case(&["large"], &["ci", "checkpoints"], false)] + #[case(&[], &["ci"], false)] + fn test_matches_tags( + #[case] table_tags: &[&str], + #[case] required: &[&str], + #[case] expected: bool, + ) { + let info = make_table_info(table_tags); + let required: Vec = required.iter().map(|s| s.to_string()).collect(); + assert_eq!(info.matches_tags(&required), expected); + } + + #[rstest] + #[case( + r#"{ + "name": "test_table", + "description": "A test table", + "tablePath": "s3://bucket/test_table", + "schema": {"type": "struct", "fields": [{"name": "id", "type": "long", "nullable": true, "metadata": {}}]}, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 2}, + "logInfo": {"numAddFiles": 3, "numRemoveFiles": 1, "sizeInBytes": 1535, "numCommits": 100, "numActions": 10000}, + "properties": {}, + "dataLayout": {"numPartitionColumns": 2, "numDistinctPartitions": 4}, + "tags": ["base"] + }"#, + "test_table", + "A test table", + Some(Url::parse("s3://bucket/test_table").unwrap()), + &["base"] + )] + #[case( + r#"{ + "name": "no_path_table", + "description": "No path specified", + "schema": {"type": "struct", "fields": []}, + 
"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}, + "logInfo": {"numAddFiles": 0, "numRemoveFiles": 0, "sizeInBytes": 0, "numCommits": 1, "numActions": 1}, + "properties": {}, + "dataLayout": {"numClusteringColumns": 2}, + "tags": ["base"] + }"#, + "no_path_table", + "No path specified", + None, + &["base"] + )] + #[case( + r#"{ + "name": "extra_fields_table", + "description": "Has extra fields", + "extraField": "should be ignored", + "schema": {"type": "struct", "fields": []}, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 2}, + "logInfo": {"numAddFiles": 0, "numRemoveFiles": 0, "sizeInBytes": 0, "numCommits": 1, "numActions": 1}, + "properties": {}, + "dataLayout": {"numClusteringColumns": 1}, + "tags": [] + }"#, + "extra_fields_table", + "Has extra fields", + None, + &[] + )] + fn test_deserialize_table_info( + #[case] json: &str, + #[case] expected_name: &str, + #[case] expected_description: &str, + #[case] expected_table_path: Option, + #[case] expected_tags: &[&str], + ) { + let table_info: TableInfo = + serde_json::from_str(json).expect("Failed to deserialize table info"); + + assert_eq!(table_info.name, expected_name); + assert_eq!(table_info.description, expected_description); + assert_eq!(table_info.table_path, expected_table_path); + let expected_tags: Vec = expected_tags.iter().map(|s| s.to_string()).collect(); + assert_eq!(table_info.tags, expected_tags); + } + + #[rstest] + #[case( + r#"{ + "name": "catalog_table", "description": "A catalog-managed table", + "catalogInfo": {"tableName": "main.schema.table"}, + "schema": {"type": "struct", "fields": []}, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 2}, + "logInfo": {"numAddFiles": 0, "numRemoveFiles": 0, "sizeInBytes": 0, "numCommits": 1, "numActions": 1}, + "properties": {}, "dataLayout": {}, "tags": [] + }"#, + true, + "main.schema.table" + )] + #[case( + r#"{ + "name": "local_table", "description": "A local table", + "schema": {"type": "struct", "fields": []}, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 2}, + "logInfo": {"numAddFiles": 0, "numRemoveFiles": 0, "sizeInBytes": 0, "numCommits": 1, "numActions": 1}, + "properties": {}, "dataLayout": {}, "tags": [] + }"#, + false, + "" + )] + fn test_deserialize_catalog_info_field( + #[case] json: &str, + #[case] expect_present: bool, + #[case] expected_table_name: &str, + ) { + let table_info: TableInfo = + serde_json::from_str(json).expect("Failed to deserialize table info"); + assert_eq!(table_info.catalog_info.is_some(), expect_present); + if expect_present { + assert_eq!( + table_info.catalog_info.unwrap().table_name, + expected_table_name + ); + } + } + + #[rstest] + #[case(r#"{"description": "missing name"}"#, "missing field")] + #[case( + r#"{"name": "missing_schema", "description": "d", + "protocol": {"minReaderVersion": 1, "minWriterVersion": 2}, + "logInfo": {"numAddFiles": 0, "numRemoveFiles": 0, "sizeInBytes": 0, "numCommits": 1, "numActions": 1}, + "properties": {}, "dataLayout": {"numClusteringColumns": 1}, "tags": []}"#, + "missing field" + )] + fn test_deserialize_table_info_errors(#[case] json: &str, #[case] expected_msg: &str) { + let error = serde_json::from_str::(json).unwrap_err(); + assert!(error.to_string().contains(expected_msg)); + } + + #[rstest] + #[case(r#"{"type": "read", "version": 5}"#, "read", Some(5))] + #[case(r#"{"type": "read"}"#, "read", None)] + #[case( + r#"{"type": "read", "version": 7, "extraField": "should be ignored"}"#, + "read", + Some(7) + )] + #[case( + r#"{"type": 
"snapshotConstruction", "version": 5}"#, + "snapshotConstruction", + Some(5) + )] + #[case(r#"{"type": "snapshotConstruction"}"#, "snapshotConstruction", None)] + #[case( + r#"{"type": "snapshotConstruction", "version": 7, "extraField": "should be ignored"}"#, + "snapshotConstruction", + Some(7) + )] + fn test_deserialize_spec( + #[case] json: &str, + #[case] expected_type: &str, + #[case] expected_version: Option, + ) { + let spec: Spec = serde_json::from_str(json).expect("Failed to deserialize spec"); + assert_eq!(spec.as_str(), expected_type); + let version = match &spec { + Spec::Read(read_spec) => match &read_spec.time_travel { + Some(TimeTravel::Version { version }) => Some(*version), + _ => None, + }, + Spec::SnapshotConstruction(snapshot_construction_spec) => { + match &snapshot_construction_spec.time_travel { + Some(TimeTravel::Version { version }) => Some(*version), + _ => None, + } + } + }; + + assert_eq!(version, expected_version); + } + + #[rstest] + #[case(r#"{"version": 10}"#, "missing field")] + #[case(r#"{"type": "write", "version": 3}"#, "unknown variant")] + fn test_deserialize_spec_errors(#[case] json: &str, #[case] expected_msg: &str) { + let error = serde_json::from_str::(json).unwrap_err(); + assert!(error.to_string().contains(expected_msg)); + } +} diff --git a/benchmarks/src/predicate_parser.rs b/benchmarks/src/predicate_parser.rs new file mode 100644 index 0000000000..c00bc7f325 --- /dev/null +++ b/benchmarks/src/predicate_parser.rs @@ -0,0 +1,550 @@ +//! Parses SQL WHERE clause expressions into kernel [`Predicate`] types. +//! +//! Supports a subset of SQL sufficient for benchmark predicates: +//! - Binary comparisons: `=`, `!=`, `<>`, `<`, `>`, `<=`, `>=` +//! - Null-safe equals: `<=>` +//! - Logical operators: `AND`, `OR`, `NOT` +//! - `IS NULL`, `IS NOT NULL` +//! - `IN (...)`, `NOT IN (...)` +//! - `BETWEEN ... AND ...` +//! - Column references and literal values (integers, floats, strings, booleans) +//! +//! Unsupported (returns error): `LIKE`, function calls (`HEX`, `size`, `length`), +//! arithmetic expressions (`a % 100`), typed literals (`TIME '...'`). + +use delta_kernel::expressions::{ + ArrayData, BinaryPredicateOp, ColumnName, Expression, Predicate, Scalar, +}; +use delta_kernel::schema::{ArrayType, DataType}; + +use sqlparser::ast::{self, Expr, UnaryOperator, Value}; +use sqlparser::dialect::GenericDialect; +use sqlparser::parser::Parser; + +/// Parses a SQL WHERE clause expression string into a kernel [`Predicate`]. +/// +/// Returns an error if the SQL cannot be parsed or contains unsupported features +/// (e.g. `LIKE`, function calls, arithmetic expressions, typed literals). +/// +/// # Example +/// ```ignore +/// let pred = parse_predicate("id < 500 AND value > 10").unwrap(); +/// ``` +pub fn parse_predicate(sql: &str) -> Result> { + let dialect = GenericDialect {}; + let expr = Parser::new(&dialect) + .try_with_sql(sql)? + .parse_expr() + .map_err(|e| format!("Failed to parse predicate: {e}"))?; + convert_expr_to_predicate(&expr) +} + +/// Converts a sqlparser AST expression into a kernel [`Predicate`]. 
+fn convert_expr_to_predicate(expr: &Expr) -> Result> { + match expr { + Expr::BinaryOp { left, op, right } => convert_binary_op(left, op, right), + Expr::UnaryOp { + op: UnaryOperator::Not, + expr, + } => { + let inner = convert_expr_to_predicate(expr)?; + Ok(Predicate::not(inner)) + } + Expr::IsNull(expr) => { + let inner = convert_expr_to_expression(expr)?; + Ok(Predicate::is_null(inner)) + } + Expr::IsNotNull(expr) => { + let inner = convert_expr_to_expression(expr)?; + Ok(Predicate::is_not_null(inner)) + } + Expr::Nested(inner) => convert_expr_to_predicate(inner), + // Only boolean literals are valid in predicate position (e.g. `TRUE`, `FALSE`). + // Other literals like numbers or strings are not valid standalone predicates. + Expr::Value(value) => match &value.value { + Value::Boolean(b) => Ok(Predicate::literal(*b)), + _ => Err(format!("Unsupported literal in predicate position: {value}").into()), + }, + Expr::Identifier(_) | Expr::CompoundIdentifier(_) => { + // Delegate to convert_expr_to_expression to keep identifier conversion in one place + let col_expr = convert_expr_to_expression(expr)?; + Ok(Predicate::from_expr(col_expr)) + } + Expr::InList { + expr, + list, + negated, + } => convert_in_list(expr, list, *negated), + Expr::Between { + expr, + negated, + low, + high, + } => convert_between(expr, *negated, low, high), + _ => Err(format!("Unsupported expression type: {expr}").into()), + } +} + +/// Converts an IN/NOT IN list into a kernel [`Predicate`]. +fn convert_in_list( + expr: &Expr, + list: &[Expr], + negated: bool, +) -> Result> { + let col = convert_expr_to_expression(expr)?; + + // Convert all list elements to scalars and infer the element type from the first element + let scalars: Vec = list + .iter() + .map(|e| { + let Expression::Literal(s) = convert_expr_to_expression(e)? else { + return Err(format!("IN list elements must be literals, got: {e}").into()); + }; + Ok(s) + }) + .collect::>>()?; + + let element_type = scalars + .first() + .map(|s| s.data_type()) + .unwrap_or(DataType::LONG); + + let array_data = ArrayData::try_new(ArrayType::new(element_type, false), scalars)?; + let array_expr = Expression::literal(Scalar::Array(array_data)); + + let pred = Predicate::binary(BinaryPredicateOp::In, col, array_expr); + if negated { + Ok(Predicate::not(pred)) + } else { + Ok(pred) + } +} + +/// Converts BETWEEN into `col >= low AND col <= high` (or NOT of that if negated). +fn convert_between( + expr: &Expr, + negated: bool, + low: &Expr, + high: &Expr, +) -> Result> { + let col = convert_expr_to_expression(expr)?; + let low_expr = convert_expr_to_expression(low)?; + let high_expr = convert_expr_to_expression(high)?; + + let pred = Predicate::and( + Predicate::ge(col.clone(), low_expr), + Predicate::le(col, high_expr), + ); + if negated { + Ok(Predicate::not(pred)) + } else { + Ok(pred) + } +} + +/// Converts a binary operation into a kernel [`Predicate`]. 
+fn convert_binary_op( + left: &Expr, + op: &ast::BinaryOperator, + right: &Expr, +) -> Result> { + match op { + ast::BinaryOperator::Eq => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::eq(l, r)) + } + ast::BinaryOperator::NotEq => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::ne(l, r)) + } + ast::BinaryOperator::Lt => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::lt(l, r)) + } + ast::BinaryOperator::LtEq => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::le(l, r)) + } + ast::BinaryOperator::Gt => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::gt(l, r)) + } + ast::BinaryOperator::GtEq => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::ge(l, r)) + } + ast::BinaryOperator::And => { + let l = convert_expr_to_predicate(left)?; + let r = convert_expr_to_predicate(right)?; + Ok(Predicate::and(l, r)) + } + ast::BinaryOperator::Or => { + let l = convert_expr_to_predicate(left)?; + let r = convert_expr_to_predicate(right)?; + Ok(Predicate::or(l, r)) + } + // <=> is null-safe equals, which is NOT DISTINCT in kernel + ast::BinaryOperator::Spaceship => { + let l = convert_expr_to_expression(left)?; + let r = convert_expr_to_expression(right)?; + Ok(Predicate::not(Predicate::distinct(l, r))) + } + _ => Err(format!("Unsupported binary operator: {op}").into()), + } +} + +/// Converts a sqlparser AST expression into a kernel [`Expression`] (for use as operands). +fn convert_expr_to_expression(expr: &Expr) -> Result> { + match expr { + Expr::Identifier(ident) => { + let name = ColumnName::new([ident.value.clone()]); + Ok(name.into()) + } + Expr::CompoundIdentifier(parts) => { + let names: Vec = parts.iter().map(|p| p.value.clone()).collect(); + Ok(Expression::column(names)) + } + Expr::Value(value) => convert_value(&value.value), + Expr::UnaryOp { + op: UnaryOperator::Minus, + expr, + } => { + if let Expr::Value(value) = expr.as_ref() { + convert_negative_value(&value.value) + } else { + Err(format!("Unsupported unary minus on: {expr}").into()) + } + } + Expr::Nested(inner) => convert_expr_to_expression(inner), + _ => Err(format!("Unsupported expression: {expr}").into()), + } +} + +/// Converts a sqlparser [`Value`] into a kernel [`Expression`]. +fn convert_value(value: &Value) -> Result> { + match value { + Value::Number(n, _long) => { + if let Ok(i) = n.parse::() { + Ok(Scalar::Long(i).into()) + } else if let Ok(f) = n.parse::() { + Ok(Scalar::Double(f).into()) + } else { + Err(format!("Cannot parse number: {n}").into()) + } + } + Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => { + Ok(Scalar::from(s.clone()).into()) + } + Value::Boolean(b) => Ok(Scalar::Boolean(*b).into()), + // SQL NULL has no inherent type; LONG is an arbitrary default since the kernel + // requires a DataType for Scalar::Null and predicates don't carry type context. + Value::Null => Ok(Scalar::Null(DataType::LONG).into()), + _ => Err(format!("Unsupported value: {value}").into()), + } +} + +/// Converts a negated sqlparser [`Value`] into a kernel [`Expression`] by delegating +/// to [`convert_value`] and negating the resulting numeric scalar. 
+fn convert_negative_value(value: &Value) -> Result> { + let expr = convert_value(value)?; + match expr { + Expression::Literal(Scalar::Long(n)) => Ok(Scalar::Long(-n).into()), + Expression::Literal(Scalar::Double(n)) => Ok(Scalar::Double(-n).into()), + _ => Err(format!("Unsupported negative value: {value}").into()), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use delta_kernel::expressions::{column_name, BinaryPredicateOp}; + use rstest::rstest; + + // Helper to build an IN predicate: `col IN (scalars...)` + fn in_list(col: ColumnName, scalars: Vec) -> Predicate { + let element_type = scalars + .first() + .map(|s| s.data_type()) + .unwrap_or(DataType::LONG); + let array = ArrayData::try_new(ArrayType::new(element_type, false), scalars).unwrap(); + Predicate::binary( + BinaryPredicateOp::In, + col, + Expression::literal(Scalar::Array(array)), + ) + } + + // -- Comparisons -- + #[rstest] + #[case("id < 5", Predicate::lt(column_name!("id"), Scalar::Long(5)))] + #[case("id = 999", Predicate::eq(column_name!("id"), Scalar::Long(999)))] + #[case("id > 250", Predicate::gt(column_name!("id"), Scalar::Long(250)))] + #[case("id <= 2", Predicate::le(column_name!("id"), Scalar::Long(2)))] + #[case("id >= 100", Predicate::ge(column_name!("id"), Scalar::Long(100)))] + #[case("a <> 1", Predicate::ne(column_name!("a"), Scalar::Long(1)))] + #[case("a != 1", Predicate::ne(column_name!("a"), Scalar::Long(1)))] + // Literal on the left + #[case("1 < a", Predicate::lt(Scalar::Long(1), column_name!("a")))] + #[case("1 = a", Predicate::eq(Scalar::Long(1), column_name!("a")))] + #[case("1 != a", Predicate::ne(Scalar::Long(1), column_name!("a")))] + fn comparison(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- Literal types -- + #[rstest] + #[case("name = 'bob'", Predicate::eq(column_name!("name"), Scalar::from("bob".to_string())))] + #[case("c3 < 1.5", Predicate::lt(column_name!("c3"), Scalar::Double(1.5)))] + #[case("val < -2.5", Predicate::lt(column_name!("val"), Scalar::Double(-2.5)))] + #[case("c4 > 5.0", Predicate::gt(column_name!("c4"), Scalar::Double(5.0)))] + #[case("flag = true", Predicate::eq(column_name!("flag"), Scalar::Boolean(true)))] + #[case("cc9 = false", Predicate::eq(column_name!("cc9"), Scalar::Boolean(false)))] + #[case("a = NULL", Predicate::eq(column_name!("a"), Scalar::Null(DataType::LONG)))] + #[case("id > -100", Predicate::gt(column_name!("id"), Scalar::Long(-100)))] + #[case("value > 1.0E300", Predicate::gt(column_name!("value"), Scalar::Double(1.0E300)))] + #[case("a > 2147483647", Predicate::gt(column_name!("a"), Scalar::Long(2147483647)))] + #[case("long_val < 50000000000", Predicate::lt(column_name!("long_val"), Scalar::Long(50000000000)))] + fn literal_types(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- Compound identifiers (nested columns) -- + #[rstest] + #[case("a.b > 1", Predicate::gt(Expression::column(["a", "b"]), Scalar::Long(1)))] + #[case("a.b.c = 2", Predicate::eq(Expression::column(["a", "b", "c"]), Scalar::Long(2)))] + #[case("b.c.f.i < 0", Predicate::lt(Expression::column(["b", "c", "f", "i"]), Scalar::Long(0)))] + #[case("data.value > 150", Predicate::gt(Expression::column(["data", "value"]), Scalar::Long(150)))] + fn nested_columns(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- Boolean literals -- + #[rstest] + #[case("TRUE", 
Predicate::literal(true))] + #[case("FALSE", Predicate::literal(false))] + fn boolean_literals(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- IS NULL / IS NOT NULL -- + #[rstest] + #[case("a IS NULL", Predicate::is_null(column_name!("a")))] + #[case("a IS NOT NULL", Predicate::is_not_null(column_name!("a")))] + #[case("null_v_struct.v IS NULL", Predicate::is_null(Expression::column(["null_v_struct", "v"])))] + fn is_null(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- NOT -- + #[rstest] + #[case("NOT a = 1", Predicate::not(Predicate::eq(column_name!("a"), Scalar::Long(1))))] + #[case("NOT a = NULL", Predicate::not(Predicate::eq(column_name!("a"), Scalar::Null(DataType::LONG))))] + #[case( + "NOT(a < 1 OR b > 20)", + Predicate::not(Predicate::or( + Predicate::lt(column_name!("a"), Scalar::Long(1)), + Predicate::gt(column_name!("b"), Scalar::Long(20)), + )) + )] + fn not_predicate(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- AND / OR -- + #[rstest] + #[case( + "id < 500 AND value > 10", + Predicate::and( + Predicate::lt(column_name!("id"), Scalar::Long(500)), + Predicate::gt(column_name!("value"), Scalar::Long(10)), + ) + )] + #[case( + "id = 1 OR id = 2", + Predicate::or( + Predicate::eq(column_name!("id"), Scalar::Long(1)), + Predicate::eq(column_name!("id"), Scalar::Long(2)), + ) + )] + #[case( + "a = 0 AND b = 0 AND c = 0", + Predicate::and( + Predicate::and( + Predicate::eq(column_name!("a"), Scalar::Long(0)), + Predicate::eq(column_name!("b"), Scalar::Long(0)), + ), + Predicate::eq(column_name!("c"), Scalar::Long(0)), + ) + )] + #[case( + "(a < 3 AND b < 3) OR (a > 7 AND b > 7)", + Predicate::or( + Predicate::and( + Predicate::lt(column_name!("a"), Scalar::Long(3)), + Predicate::lt(column_name!("b"), Scalar::Long(3)), + ), + Predicate::and( + Predicate::gt(column_name!("a"), Scalar::Long(7)), + Predicate::gt(column_name!("b"), Scalar::Long(7)), + ), + ) + )] + #[case( + "(a = 5 OR a = 7) AND b < 5", + Predicate::and( + Predicate::or( + Predicate::eq(column_name!("a"), Scalar::Long(5)), + Predicate::eq(column_name!("a"), Scalar::Long(7)), + ), + Predicate::lt(column_name!("b"), Scalar::Long(5)), + ) + )] + fn and_or(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- Null-safe equals (<=>) -> NOT DISTINCT -- + #[rstest] + #[case("a <=> 1", Predicate::not(Predicate::distinct(column_name!("a"), Scalar::Long(1))))] + #[case("a <=> NULL", Predicate::not(Predicate::distinct(column_name!("a"), Scalar::Null(DataType::LONG))))] + #[case("1 <=> a", Predicate::not(Predicate::distinct(Scalar::Long(1), column_name!("a"))))] + #[case( + "NOT a <=> 1", + Predicate::not(Predicate::not(Predicate::distinct(column_name!("a"), Scalar::Long(1)))) + )] + fn null_safe_equals(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- IN / NOT IN -- + #[rstest] + #[case( + "a in (1, 2, 3)", + in_list(column_name!("a"), vec![Scalar::Long(1), Scalar::Long(2), Scalar::Long(3)]) + )] + #[case( + "a in (1)", + in_list(column_name!("a"), vec![Scalar::Long(1)]) + )] + #[case( + "value in (300, 787, 239)", + in_list(column_name!("value"), vec![Scalar::Long(300), Scalar::Long(787), Scalar::Long(239)]) + )] + #[case( + "name in ('alice', 'bob')", + in_list(column_name!("name"), 
vec![Scalar::from("alice".to_string()), Scalar::from("bob".to_string())]) + )] + fn in_predicate(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + #[rstest] + #[case( + "a NOT IN (1, 2)", + Predicate::not(in_list(column_name!("a"), vec![Scalar::Long(1), Scalar::Long(2)])) + )] + #[case( + "a NOT IN (10, 20, 30)", + Predicate::not(in_list(column_name!("a"), vec![Scalar::Long(10), Scalar::Long(20), Scalar::Long(30)])) + )] + fn not_in_predicate(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- BETWEEN -> col >= low AND col <= high -- + #[rstest] + #[case( + "id BETWEEN 10 AND 20", + Predicate::and( + Predicate::ge(column_name!("id"), Scalar::Long(10)), + Predicate::le(column_name!("id"), Scalar::Long(20)), + ) + )] + #[case( + "id BETWEEN -10 AND -1", + Predicate::and( + Predicate::ge(column_name!("id"), Scalar::Long(-10)), + Predicate::le(column_name!("id"), Scalar::Long(-1)), + ) + )] + #[case( + "id NOT BETWEEN 10 AND 20", + Predicate::not(Predicate::and( + Predicate::ge(column_name!("id"), Scalar::Long(10)), + Predicate::le(column_name!("id"), Scalar::Long(20)), + )) + )] + fn between(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + // -- Complex predicates (multi-feature) -- + #[rstest] + #[case( + "id >= 0 AND id < 1000 AND version_tag = 'v0'", + Predicate::and( + Predicate::and( + Predicate::ge(column_name!("id"), Scalar::Long(0)), + Predicate::lt(column_name!("id"), Scalar::Long(1000)), + ), + Predicate::eq(column_name!("version_tag"), Scalar::from("v0".to_string())), + ) + )] + #[case( + "int_col IS NOT NULL AND str_col IS NOT NULL", + Predicate::and( + Predicate::is_not_null(column_name!("int_col")), + Predicate::is_not_null(column_name!("str_col")), + ) + )] + #[case( + "NOT (a >= 5 AND NOT (b < 5))", + Predicate::not(Predicate::and( + Predicate::ge(column_name!("a"), Scalar::Long(5)), + Predicate::not(Predicate::lt(column_name!("b"), Scalar::Long(5))), + )) + )] + #[case( + "partCol = 3 and id > 25", + Predicate::and( + Predicate::eq(column_name!("partCol"), Scalar::Long(3)), + Predicate::gt(column_name!("id"), Scalar::Long(25)), + ) + )] + fn complex(#[case] sql: &str, #[case] expected: Predicate) { + assert_eq!(parse_predicate(sql).unwrap(), expected); + } + + #[rstest] + // LIKE (no kernel support) + #[case("a like 'C%'")] + #[case("fruit like 'b%'")] + #[case("a > 0 AND b like '2016-%'")] + // Function calls + #[case("cc8 = HEX('1111')")] + #[case("size(items) > 2")] + #[case("length(s) < 4")] + // Arithmetic expressions + #[case("a % 100 < 10 OR b > 20")] + #[case("a < 10 AND b % 100 > 20")] + // Typed literals (TIME keyword) + #[case("time_col >= TIME '00:00:00'")] + #[case("time_col < TIME '12:00:00'")] + // IS NULL on non-column expressions + #[case("(a > 0) IS NULL")] + #[case("(a > 0 AND b > 1) IS NULL")] + fn unsupported_predicates_fail_gracefully(#[case] sql: &str) { + let result = parse_predicate(sql); + assert!( + result.is_err(), + "Expected {sql:?} to fail, but it parsed as: {:?}", + result.unwrap() + ); + } +} diff --git a/benchmarks/src/runners.rs b/benchmarks/src/runners.rs new file mode 100644 index 0000000000..216b7231f9 --- /dev/null +++ b/benchmarks/src/runners.rs @@ -0,0 +1,645 @@ +//! Benchmark runners for executing Delta table operations. +//! +//! Each runner holds all the state required for its workload (e.g. read metadata needs +//! 
pre-built snapshots and a config) so that `execute` measures only the operation itself. +//! Results are discarded for benchmarking purposes. +//! +//! Engine and snapshot construction is handled based on `TableInfo`: +//! - UC tables (`catalog_info`): UC-vended credentials; catalog-managed tables use +//! `UCKernelClient::load_snapshot`, others use the standard snapshot builder +//! - S3 tables (`table_path` with s3://): credentials from `AWS_*` env vars +//! - Local tables: local filesystem engine + +use crate::models::{ + ParallelScan, ReadConfig, ReadOperation, ReadSpec, SnapshotConstructionSpec, TableInfo, + TimeTravel, +}; +use crate::predicate_parser::parse_predicate; +use delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor; +use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::expressions::PredicateRef; +use delta_kernel::scan::{AfterSequentialScanMetadata, ParallelScanMetadata}; +use delta_kernel::Engine; +use delta_kernel::Snapshot; +use delta_kernel_unity_catalog::UCKernelClient; +use object_store::local::LocalFileSystem; +use unity_catalog_delta_client_api::{Error as UcApiError, Operation}; +use unity_catalog_delta_rest_client::{ + ClientConfig, Error as UcRestError, UCClient, UCCommitsRestClient, +}; + +use std::hint::black_box; +use std::sync::Arc; +use url::Url; + +/// Delta table property indicating catalog-managed support. +const CATALOG_MANAGED_PROPERTY: &str = "delta.feature.catalogManaged"; + +pub trait WorkloadRunner { + fn execute(&self) -> Result<(), Box>; + fn name(&self) -> &str; +} + +fn build_engine( + store: Arc, + runtime: Arc, +) -> Arc { + let executor = TokioMultiThreadExecutor::new(runtime.handle().clone()); + Arc::new( + DefaultEngine::builder(store) + .with_task_executor(Arc::new(executor)) + .build(), + ) +} + +/// Determines how a snapshot is loaded. Built once at setup via [`resolve_snapshot_strategy`], +/// then used by runners to construct snapshots. +enum SnapshotStrategy { + /// Standard snapshot builder (local, S3, or UC-managed non-catalog-managed tables). + Standard { url: Url }, + /// Catalog-managed table: snapshot loaded via `UCKernelClient::load_snapshot`. + CatalogManaged { + table_id: String, + table_uri: String, + commits_client: Box, + }, +} + +impl SnapshotStrategy { + /// Builds a snapshot using this strategy. + fn load_snapshot( + &self, + engine: &dyn Engine, + runtime: &tokio::runtime::Runtime, + time_travel: Option<&TimeTravel>, + ) -> Result, Box> { + match self { + SnapshotStrategy::Standard { url } => { + let mut builder = Snapshot::builder_for(url.clone()); + if let Some(tt) = time_travel { + builder = builder.at_version(tt.as_version()?); + } + Ok(builder.build(engine)?) + } + SnapshotStrategy::CatalogManaged { + table_id, + table_uri, + commits_client, + } => { + let catalog = UCKernelClient::new(commits_client.as_ref()); + let result = match time_travel { + Some(tt) => { + let version = tt.as_version()?; + runtime.block_on( + catalog.load_snapshot_at(table_id, table_uri, version, engine), + ) + } + None => runtime.block_on(catalog.load_snapshot(table_id, table_uri, engine)), + }; + result.map_err(|e| format!("Catalog snapshot failed: {e}").into()) + } + } + } +} + +/// Resolves engine credentials and snapshot strategy from a [`TableInfo`]. +/// +/// For UC-managed tables (`catalog_info` is present), credentials are vended via +/// `UCClient`. The `delta.feature.catalogManaged` property then determines whether to use +/// `UCKernelClient` (catalog-managed) or the standard snapshot builder. 
+/// +/// For non-UC tables, the engine is built from env vars (`AWS_*` for S3, local filesystem +/// otherwise). +fn resolve_snapshot_strategy( + table_info: &TableInfo, + runtime: Arc, +) -> Result<(Arc, SnapshotStrategy), Box> { + let Some(cm) = &table_info.catalog_info else { + let url = table_info.resolved_table_root(); + let engine = resolve_engine_for_url(&url, runtime)?; + return Ok((engine, SnapshotStrategy::Standard { url })); + }; + + let endpoint = std::env::var("UC_WORKSPACE").map_err(|_| "UC_WORKSPACE required")?; + let token = std::env::var("UC_TOKEN").map_err(|_| "UC_TOKEN required")?; + + let config = ClientConfig::build(&endpoint, &token).build()?; + let client = UCClient::new(config.clone())?; + + let result: Result<_, UcRestError> = runtime.block_on(async { + let table = client.get_table(&cm.table_name).await?; + let creds = client + .get_credentials(&table.table_id, Operation::Read) + .await?; + let aws = creds + .aws_temp_credentials + .ok_or(UcApiError::UnsupportedOperation( + // TODO(#2305): support non-AWS credential types + "Credential vending returned no AWS credentials".into(), + ))?; + Ok(( + table.table_id, + table.storage_location, + table.properties, + aws, + )) + }); + let (table_id, table_uri, uc_properties, aws) = result?; + + let table_url = Url::parse(&table_uri)?; + let region = std::env::var("AWS_REGION").map_err(|_| "AWS_REGION required")?; + let options = [ + ("region", region.as_str()), + ("access_key_id", aws.access_key_id.as_str()), + ("secret_access_key", aws.secret_access_key.as_str()), + ("session_token", aws.session_token.as_str()), + ]; + let (store, _) = object_store::parse_url_opts(&table_url, options)?; + let engine = build_engine(store.into(), runtime); + + let is_catalog_managed = uc_properties + .get(CATALOG_MANAGED_PROPERTY) + .is_some_and(|v| v == "supported"); + + let strategy = if is_catalog_managed { + let commits_client = Box::new(UCCommitsRestClient::new(config)?); + SnapshotStrategy::CatalogManaged { + table_id, + table_uri, + commits_client, + } + } else { + SnapshotStrategy::Standard { url: table_url } + }; + + Ok((engine, strategy)) +} + +/// Builds an engine from the table URL scheme and env vars (S3 or local). 
+fn resolve_engine_for_url( + url: &Url, + runtime: Arc, +) -> Result, Box> { + match url.scheme() { + "s3" | "s3a" => { + let region = + std::env::var("AWS_REGION").map_err(|_| "AWS_REGION required for S3 tables")?; + let mut opts: Vec<(&str, String)> = vec![("region", region)]; + for (env_key, opt_key) in [ + ("AWS_ACCESS_KEY_ID", "access_key_id"), + ("AWS_SECRET_ACCESS_KEY", "secret_access_key"), + ("AWS_SESSION_TOKEN", "session_token"), + ] { + if let Ok(v) = std::env::var(env_key) { + opts.push((opt_key, v)); + } + } + let (store, _) = object_store::parse_url_opts(url, opts)?; + Ok(build_engine(store.into(), runtime)) + } + "file" => Ok(build_engine(Arc::new(LocalFileSystem::new()), runtime)), + scheme => Err(format!( + "Unsupported scheme '{scheme}': only s3://, s3a://, and file:// are supported" + ) + .into()), + } +} + +pub struct ReadMetadataRunner { + snapshot: Arc, + engine: Arc, + name: String, + config: ReadConfig, + thread_pool: Option, // None for serial configuration, Some for parallel configuration + predicate: Option, +} + +impl ReadMetadataRunner { + pub fn setup( + table_info: &TableInfo, + case_name: &str, + read_spec: &ReadSpec, + config: ReadConfig, + runtime: Arc, + ) -> Result> { + let (engine, strategy) = resolve_snapshot_strategy(table_info, runtime.clone())?; + let snapshot = + strategy.load_snapshot(engine.as_ref(), &runtime, read_spec.time_travel.as_ref())?; + + let predicate = read_spec + .predicate + .as_deref() + .map(parse_predicate) + .transpose()? + .map(Arc::new); + + let name = format!( + "{}/{}/{}/{}", + table_info.name, + case_name, + ReadOperation::ReadMetadata.as_str(), + config.name, + ); + + let thread_pool = match &config.parallel_scan { + ParallelScan::Enabled { num_threads } => { + if *num_threads == 0 { + return Err("num_threads in ReadConfig must be greater than 0".into()); + } + let thread_pool = rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .build()?; + Some(thread_pool) + } + ParallelScan::Disabled => None, + }; + + Ok(Self { + snapshot, + engine, + name, + config, + thread_pool, + predicate, + }) + } + + fn execute_serial(&self) -> Result<(), Box> { + let scan = self + .snapshot + .clone() + .scan_builder() + .with_predicate(self.predicate.clone()) + .build()?; + let metadata_iter = scan.scan_metadata(self.engine.as_ref())?; + for result in metadata_iter { + black_box(result?); + } + Ok(()) + } + + fn execute_parallel(&self) -> Result<(), Box> { + let pool = self + .thread_pool + .as_ref() + .ok_or("thread_pool must be Some for parallel execution")?; + + let scan = self + .snapshot + .clone() + .scan_builder() + .with_predicate(self.predicate.clone()) + .build()?; + + let mut phase1 = scan.parallel_scan_metadata(self.engine.clone())?; + for result in phase1.by_ref() { + black_box(result?); + } + + match phase1.finish()? 
{ + AfterSequentialScanMetadata::Done => {} + AfterSequentialScanMetadata::Parallel { state, files } => { + let num_threads = pool.current_num_threads(); + let files_per_worker = files.len().div_ceil(num_threads); + + let partitions: Vec<_> = files + .chunks(files_per_worker) + .map(|chunk| chunk.to_vec()) + .collect(); + + let state = Arc::new(*state); + + pool.scope(|s| { + for partition_files in partitions { + let engine = self.engine.clone(); + let state = state.clone(); + + s.spawn(move |_| { + if partition_files.is_empty() { + return; + } + + let parallel = + ParallelScanMetadata::try_new(engine, state, partition_files) + .expect("Failed to create ParallelScanMetadata"); + for result in parallel { + black_box(result.expect("Parallel scan error")); + } + }); + } + }); + } + } + Ok(()) + } +} + +impl WorkloadRunner for ReadMetadataRunner { + fn execute(&self) -> Result<(), Box> { + match &self.config.parallel_scan { + ParallelScan::Disabled => self.execute_serial(), + ParallelScan::Enabled { .. } => self.execute_parallel(), + } + } + + fn name(&self) -> &str { + &self.name + } +} + +/// Factory function that creates the appropriate read runner for a given operation and config +pub fn create_read_runner( + table_info: &TableInfo, + case_name: &str, + read_spec: &ReadSpec, + operation: ReadOperation, + config: ReadConfig, + runtime: Arc, +) -> Result, Box> { + match operation { + ReadOperation::ReadMetadata => Ok(Box::new(ReadMetadataRunner::setup( + table_info, case_name, read_spec, config, runtime, + )?)), + ReadOperation::ReadData => Err("ReadDataRunner not yet implemented".into()), + } +} + +pub struct SnapshotConstructionRunner { + engine: Arc, + runtime: Arc, + snapshot_strategy: SnapshotStrategy, + time_travel: Option, + name: String, +} + +impl SnapshotConstructionRunner { + pub fn setup( + table_info: &TableInfo, + case_name: &str, + snapshot_spec: &SnapshotConstructionSpec, + runtime: Arc, + ) -> Result> { + let name = format!( + "{}/{}/{}", + table_info.name, + case_name, + snapshot_spec.as_str() + ); + + let (engine, snapshot_strategy) = resolve_snapshot_strategy(table_info, runtime.clone())?; + + Ok(Self { + engine, + runtime, + snapshot_strategy, + time_travel: snapshot_spec.time_travel.clone(), + name, + }) + } +} + +impl WorkloadRunner for SnapshotConstructionRunner { + fn execute(&self) -> Result<(), Box> { + let snapshot = self.snapshot_strategy.load_snapshot( + self.engine.as_ref(), + &self.runtime, + self.time_travel.as_ref(), + )?; + black_box(snapshot); + Ok(()) + } + + fn name(&self) -> &str { + &self.name + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::models::{ParallelScan, ReadConfig, ReadSpec, TableInfo, TimeTravel}; + use std::sync::LazyLock; + + fn test_runtime() -> Arc { + static RT: LazyLock> = LazyLock::new(|| { + Arc::new(tokio::runtime::Runtime::new().expect("failed to create runtime")) + }); + RT.clone() + } + + fn test_table_info() -> TableInfo { + let path = format!( + "{}/../kernel/tests/data/basic_partitioned", + env!("CARGO_MANIFEST_DIR") + ); + let json = format!( + r#"{{ + "name": "basic_partitioned", + "description": "basic partitioned table for testing", + "tablePath": "{}", + "schema": {{ + "type": "struct", + "fields": [ + {{"name": "letter", "type": "string", "nullable": true, "metadata": {{}}}}, + {{"name": "number", "type": "long", "nullable": true, "metadata": {{}}}}, + {{"name": "a_float", "type": "double", "nullable": true, "metadata": {{}}}} + ] + }}, + "protocol": {{"minReaderVersion": 1, "minWriterVersion": 2}}, + 
"logInfo": {{ + "numAddFiles": 6, + "numRemoveFiles": 0, + "sizeInBytes": 4505, + "numCommits": 2, + "numActions": 10 + }}, + "properties": {{}}, + "dataLayout": {{"numPartitionColumns": 1, "numDistinctPartitions": 5}}, + "tags": [] + }}"#, + Url::from_file_path(path).unwrap() + ); + serde_json::from_str(&json).expect("failed to build test TableInfo") + } + + fn test_read_spec() -> ReadSpec { + ReadSpec { + time_travel: None, + predicate: None, + columns: None, + expected: None, + } + } + + fn serial_config() -> ReadConfig { + ReadConfig { + name: "serial".to_string(), + parallel_scan: ParallelScan::Disabled, + } + } + + fn parallel_config() -> ReadConfig { + ReadConfig { + name: "parallel2".to_string(), + parallel_scan: ParallelScan::Enabled { num_threads: 2 }, + } + } + + #[test] + fn test_read_metadata_runner_serial() { + let runner = ReadMetadataRunner::setup( + &test_table_info(), + "testCase", + &test_read_spec(), + serial_config(), + test_runtime(), + ) + .expect("setup should succeed"); + assert_eq!( + runner.name(), + "basic_partitioned/testCase/readMetadata/serial" + ); + assert!(runner.execute().is_ok()); + } + + #[test] + fn test_read_metadata_runner_parallel() { + let runner = ReadMetadataRunner::setup( + &test_table_info(), + "testCase", + &test_read_spec(), + parallel_config(), + test_runtime(), + ) + .expect("setup should succeed"); + assert_eq!( + runner.name(), + "basic_partitioned/testCase/readMetadata/parallel2" + ); + assert!(runner.execute().is_ok()); + } + + fn test_snapshot_spec() -> SnapshotConstructionSpec { + SnapshotConstructionSpec { + time_travel: None, + expected: None, + } + } + + #[test] + fn test_snapshot_construction_runner_setup() { + let runner = SnapshotConstructionRunner::setup( + &test_table_info(), + "testCase", + &test_snapshot_spec(), + test_runtime(), + ); + assert!(runner.is_ok()); + } + + #[test] + fn test_snapshot_construction_runner_name() { + let runner = SnapshotConstructionRunner::setup( + &test_table_info(), + "testCase", + &test_snapshot_spec(), + test_runtime(), + ) + .expect("setup should succeed"); + assert_eq!( + runner.name(), + "basic_partitioned/testCase/snapshotConstruction" + ); + } + + #[test] + fn test_snapshot_construction_runner_execute() { + let runner = SnapshotConstructionRunner::setup( + &test_table_info(), + "testCase", + &test_snapshot_spec(), + test_runtime(), + ) + .expect("setup should succeed"); + assert!(runner.execute().is_ok()); + } + + #[test] + fn test_create_read_runner_read_metadata() { + let runner = create_read_runner( + &test_table_info(), + "testCase", + &test_read_spec(), + ReadOperation::ReadMetadata, + serial_config(), + test_runtime(), + ) + .expect("create_read_runner should succeed"); + assert!(runner.execute().is_ok()); + } + + #[test] + fn test_read_metadata_runner_with_valid_predicate() { + let mut spec = test_read_spec(); + spec.predicate = Some("letter = 'a'".to_string()); + let runner = ReadMetadataRunner::setup( + &test_table_info(), + "test_case", + &spec, + serial_config(), + test_runtime(), + ) + .expect("setup should succeed"); + assert!(runner.execute().is_ok()); + } + + #[test] + fn test_read_metadata_runner_with_invalid_predicate() { + let mut spec = test_read_spec(); + spec.predicate = Some("a LIKE '%foo'".to_string()); + let result = ReadMetadataRunner::setup( + &test_table_info(), + "test_case", + &spec, + serial_config(), + test_runtime(), + ); + assert!(result.is_err()); + } + + #[test] + fn test_create_read_runner_read_data_unimplemented() { + let result = create_read_runner( + 
&test_table_info(), + "testCase", + &test_read_spec(), + ReadOperation::ReadData, + serial_config(), + test_runtime(), + ); + assert!(result.is_err()); + } + + #[test] + fn test_resolve_engine_unsupported_scheme() { + let url = Url::parse("gs://bucket/table").unwrap(); + let result = resolve_engine_for_url(&url, test_runtime()); + assert!(result.is_err()); + } + + #[test] + fn test_snapshot_construction_with_time_travel() { + let spec = SnapshotConstructionSpec { + time_travel: Some(TimeTravel::Version { version: 0 }), + expected: None, + }; + let runner = SnapshotConstructionRunner::setup( + &test_table_info(), + "testCase", + &spec, + test_runtime(), + ) + .expect("setup should succeed"); + assert!(runner.execute().is_ok()); + } +} diff --git a/benchmarks/src/utils.rs b/benchmarks/src/utils.rs new file mode 100644 index 0000000000..02baadce63 --- /dev/null +++ b/benchmarks/src/utils.rs @@ -0,0 +1,199 @@ +//! Utility functions for loading workload specifications + +use crate::models::{Spec, TableInfo, Workload}; +use std::path::{Path, PathBuf}; + +// Environment variable used to filter benchmarks by tag (e.g. `BENCH_TAGS=base,feature_x`). +pub const BENCH_TAGS_ENV_VAR: &str = "BENCH_TAGS"; + +const OUTPUT_FOLDER: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/workloads"); +const TABLE_INFO_FILE_NAME: &str = "tableInfo.json"; +const SPECS_DIR_NAME: &str = "specs"; +const BENCHMARKS_DIR_NAME: &str = "benchmarks"; +const DELTA_DIR_NAME: &str = "delta"; + +/// Loads all workload specifications from `OUTPUT_FOLDER`, optionally filtered by `BENCH_TAGS`. +/// +/// If `KERNEL_BENCH_WORKLOAD_DIR` is set, loads from that directory instead (for remote/S3 tables). +/// +/// Workloads are downloaded and extracted into `OUTPUT_FOLDER` at build time by `build.rs`. +/// +/// If the `BENCH_TAGS` environment variable is set (e.g. `BENCH_TAGS=base`), +/// only workloads whose `table_info.json` has at least one matching tag are returned. +/// If `BENCH_TAGS` is unset or empty, all workloads are returned. +pub fn load_all_workloads() -> Result, Box> { + // When KERNEL_BENCH_WORKLOAD_DIR is set, tag filtering is skipped -- remote workload + // directories are assumed to be curated, so all tables in the directory are benchmarked. + let (base_dir, required_tags) = if let Ok(dir) = std::env::var("KERNEL_BENCH_WORKLOAD_DIR") { + (PathBuf::from(dir), None) + } else { + let benchmarks_dir = PathBuf::from(OUTPUT_FOLDER).join(BENCHMARKS_DIR_NAME); + (benchmarks_dir, get_required_tags()) + }; + + let table_directories = find_table_directories(&base_dir)?; + + let mut all_workloads = Vec::new(); + + for table_dir in table_directories { + all_workloads.extend(load_specs_from_table(&table_dir, required_tags.as_deref())?); + } + + Ok(all_workloads) +} + +/// Reads the `BENCH_TAGS` environment variable and returns the set of tags +/// Returns `None` if unset or empty (meaning we should run all workloads) +fn get_required_tags() -> Option> { + std::env::var(BENCH_TAGS_ENV_VAR) + .ok() + .filter(|s| !s.is_empty()) + .map(|s| s.split(',').map(|t| t.trim().to_string()).collect()) +} + +/// Returns all subdirectories of `base_dir`. 
In practice this is called with `base_dir` = `OUTPUT_FOLDER`/`BENCHMARKS_DIR_NAME`, +/// Each subdirectory returned represents a table to be benchmarked and contains the table itself, specs, and table info +fn find_table_directories(base_dir: &Path) -> Result, Box> { + let entries = std::fs::read_dir(base_dir) + .map_err(|e| format!("Cannot read directory {}: {}", base_dir.display(), e))?; + + let table_dirs: Vec = entries + .filter_map(|entry| entry.ok()) + .map(|entry| entry.path()) + .filter(|path| path.is_dir()) + .collect(); + + if table_dirs.is_empty() { + return Err(format!("No table directories found in {}", base_dir.display()).into()); + } + + Ok(table_dirs) +} + +/// Loads all workload specs for a single table, or returns an empty vec if required_tags is set +/// and the table has no matching tags. +/// +/// Reads table info from `TABLE_INFO_FILE_NAME` at the root of `table_dir`, +/// then loads each JSON spec from `table_dir`/`SPECS_DIR_NAME`. +/// +/// If `required_tags` is `None`, all tables are included (no tables will be skipped in this function) +/// Otherwise, a specific table is included (not skipped by this function) if any of its tags appear in `required_tags` (uses union semantics) +fn load_specs_from_table( + table_dir: &Path, + required_tags: Option<&[String]>, +) -> Result, Box> { + let specs_dir = table_dir.join(SPECS_DIR_NAME); + + if !specs_dir.is_dir() { + return Err(format!("Specs directory not found: {}", specs_dir.display()).into()); + } + + let table_info_path = table_dir.join(TABLE_INFO_FILE_NAME); + let table_info = TableInfo::from_json_path(&table_info_path).map_err(|e| { + format!( + "Failed to parse table_info.json at {}: {}", + table_info_path.display(), + e + ) + })?; + + // Skip this table if BENCH_TAGS is set and none of its tags match + if let Some(tags) = required_tags { + if !table_info.matches_tags(tags) { + return Ok(vec![]); + } + } + + // Remote tables (table_path or catalog_info present) don't need local data. + // Local tables must have a delta/ subdirectory next to tableInfo.json. + let is_remote = table_info.table_path.is_some() || table_info.catalog_info.is_some(); + if !is_remote { + let delta_dir = table_dir.join(DELTA_DIR_NAME); + if !delta_dir.is_dir() { + return Err(format!( + "Table data not found for '{}'. 
Expected a 'delta' directory in {}", + table_info.name, + table_dir.display() + ) + .into()); + } + } + + let spec_files = find_spec_files(&specs_dir)?; + + let mut workloads = Vec::new(); + for spec_file in spec_files { + workloads.push(load_single_spec(&spec_file, table_info.clone())?); + } + + Ok(workloads) +} + +/// Returns all JSON files in `specs_dir`, where each file is a benchmark spec for the table +fn find_spec_files(specs_dir: &Path) -> Result, Box> { + let entries = std::fs::read_dir(specs_dir) + .map_err(|e| format!("Cannot read directory {}: {}", specs_dir.display(), e))?; + + let spec_files: Vec = entries + .filter_map(|entry| entry.ok()) + .map(|entry| entry.path()) + .filter(|path| path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("json")) + .collect(); + + if spec_files.is_empty() { + return Err(format!("No JSON spec files found in {}", specs_dir.display()).into()); + } + + Ok(spec_files) +} + +/// Parses a single spec JSON file and builds a Workload from it, combining the spec with the +/// provided table info and using the spec file's name (without extension) as the case name +fn load_single_spec( + spec_file: &Path, + table_info: TableInfo, +) -> Result> { + let case_name = spec_file + .file_stem() + .and_then(|n| n.to_str()) + .ok_or_else(|| format!("Invalid spec file name: {}", spec_file.display()))? + .to_string(); + + let spec = Spec::from_json_path(spec_file) + .map_err(|e| format!("Failed to parse spec file {}: {}", spec_file.display(), e))?; + + Ok(Workload { + table_info, + case_name, + spec, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_required_tags() { + // These cases must run sequentially b/c env vars conflict when these tests are separate (as they run in parallel) + std::env::remove_var(BENCH_TAGS_ENV_VAR); + assert!(get_required_tags().is_none()); + + std::env::set_var(BENCH_TAGS_ENV_VAR, ""); + assert!(get_required_tags().is_none()); + + std::env::set_var(BENCH_TAGS_ENV_VAR, "ci"); + assert_eq!(get_required_tags().unwrap(), vec!["ci"]); + + std::env::set_var(BENCH_TAGS_ENV_VAR, "ci,checkpoints,v2"); + assert_eq!( + get_required_tags().unwrap(), + vec!["ci", "checkpoints", "v2"] + ); + + std::env::set_var(BENCH_TAGS_ENV_VAR, " ci , checkpoints "); + assert_eq!(get_required_tags().unwrap(), vec!["ci", "checkpoints"]); + + std::env::remove_var(BENCH_TAGS_ENV_VAR); + } +} diff --git a/delta-kernel-unity-catalog/Cargo.toml b/delta-kernel-unity-catalog/Cargo.toml new file mode 100644 index 0000000000..338c9501a5 --- /dev/null +++ b/delta-kernel-unity-catalog/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "delta-kernel-unity-catalog" +edition.workspace = true +homepage.workspace = true +keywords.workspace = true +license.workspace = true +repository.workspace = true +readme.workspace = true +rust-version.workspace = true +version.workspace = true + +# for cargo-release +[package.metadata.release] +release = false + +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] + +[dependencies] +delta_kernel = { path = "../kernel", features = ["internal-api"] } +unity-catalog-delta-client-api = { path = "../unity-catalog-delta-client-api" } +itertools = "0.14" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +tokio = { version = "1", features = ["full"] } +tracing = "0.1" +url = "2" +uuid = { version = "1", features = ["v4"] } + +[dev-dependencies] +delta_kernel = { path = "../kernel", features = 
["default-engine-rustls", "test-utils", "internal-api"] } +unity-catalog-delta-rest-client = { path = "../unity-catalog-delta-rest-client", features = ["test-utils"] } +unity-catalog-delta-client-api = { path = "../unity-catalog-delta-client-api" } +tempfile = "3" diff --git a/delta-kernel-unity-catalog/src/committer.rs b/delta-kernel-unity-catalog/src/committer.rs new file mode 100644 index 0000000000..873c67a6c0 --- /dev/null +++ b/delta-kernel-unity-catalog/src/committer.rs @@ -0,0 +1,594 @@ +use std::sync::Arc; + +use delta_kernel::committer::{ + CommitMetadata, CommitResponse, CommitType, Committer, PublishMetadata, +}; +use delta_kernel::{DeltaResult, Engine, Error as DeltaError, FilteredEngineData}; +use tracing::{debug, info}; +use unity_catalog_delta_client_api::{Commit, CommitClient, CommitRequest}; + +use crate::constants::{ + CATALOG_MANAGED_FEATURE, CLUSTERING_DOMAIN_NAME, ENABLE_IN_COMMIT_TIMESTAMPS, + IN_COMMIT_TIMESTAMP_FEATURE, UC_TABLE_ID_KEY, VACUUM_PROTOCOL_CHECK_FEATURE, +}; +use crate::errors; + +/// Convenience macro: returns an error if a condition is not met. +macro_rules! require { + ($cond:expr, $err:expr) => { + if !($cond) { + return Err($err); + } + }; +} + +/// A [UCCommitter] is a Unity Catalog [`Committer`] implementation for committing to a specific +/// delta table in UC. +/// +/// For version 0 (table creation), the committer writes `000.json` directly to the published +/// commit path. The caller (connector) is responsible for finalizing the table in UC via the +/// create table API. +/// +/// For version >= 1, the committer writes a staged commit and calls the UC commit API to ratify it. +/// +/// NOTE: this [`Committer`] requires a multi-threaded tokio runtime. That is, whatever +/// implementation consumes the Committer to commit to the table, must call `commit` from within a +/// muti-threaded tokio runtime context. Since the default engine uses tokio, this is compatible, +/// but must ensure that the multi-threaded runtime is used. +#[derive(Debug, Clone)] +pub struct UCCommitter { + commits_client: Arc, + table_id: String, +} + +impl UCCommitter { + /// Create a new [UCCommitter] to commit via the `commits_client` to the specific table with the given + /// `table_id`. + pub fn new(commits_client: Arc, table_id: impl Into) -> Self { + UCCommitter { + commits_client, + table_id: table_id.into(), + } + } + + /// Returns true if the commit metadata has the `catalogManaged` feature in both reader and + /// writer features. + fn has_catalog_managed_feature(commit_metadata: &CommitMetadata) -> bool { + commit_metadata.has_writer_feature(CATALOG_MANAGED_FEATURE) + && commit_metadata.has_reader_feature(CATALOG_MANAGED_FEATURE) + } + + /// Validates that protocol features and metadata properties are correct for a UC + /// catalog-managed table. 
+ fn validate_catalog_managed_state(&self, commit_metadata: &CommitMetadata) -> DeltaResult<()> { + require!( + commit_metadata.commit_type() != CommitType::UpgradeToCatalogManaged, + errors::upgrade_downgrade_unsupported("upgrade") + ); + require!( + commit_metadata.commit_type() != CommitType::DowngradeToPathBased, + errors::upgrade_downgrade_unsupported("downgrade") + ); + require!( + Self::has_catalog_managed_feature(commit_metadata), + errors::missing_feature(CATALOG_MANAGED_FEATURE) + ); + require!( + commit_metadata.has_writer_feature(VACUUM_PROTOCOL_CHECK_FEATURE) + && commit_metadata.has_reader_feature(VACUUM_PROTOCOL_CHECK_FEATURE), + errors::missing_feature(VACUUM_PROTOCOL_CHECK_FEATURE) + ); + require!( + commit_metadata.has_writer_feature(IN_COMMIT_TIMESTAMP_FEATURE), + errors::missing_feature(IN_COMMIT_TIMESTAMP_FEATURE) + ); + + let config = commit_metadata + .metadata_configuration() + .ok_or_else(errors::missing_metadata_configuration)?; + let table_id = config + .get(UC_TABLE_ID_KEY) + .ok_or_else(|| errors::missing_property(UC_TABLE_ID_KEY))?; + require!( + table_id == &self.table_id, + errors::table_id_mismatch(&self.table_id, table_id) + ); + require!( + config.get(ENABLE_IN_COMMIT_TIMESTAMPS).map(String::as_str) == Some("true"), + errors::ict_not_enabled() + ); + Ok(()) + } + + /// Validates that this commit does not include ALTER TABLE changes (protocol, metadata, + /// or clustering column changes). + fn validate_no_alter_table_changes(commit_metadata: &CommitMetadata) -> DeltaResult<()> { + require!( + !commit_metadata.has_protocol_change(), + errors::alter_table_unsupported("protocol") + ); + require!( + !commit_metadata.has_metadata_change(), + errors::alter_table_unsupported("metadata") + ); + require!( + !commit_metadata.has_domain_metadata_change(CLUSTERING_DOMAIN_NAME), + errors::alter_table_unsupported("clustering columns") + ); + Ok(()) + } + + /// Commit version 0 (table creation). Validates that all required UC properties are present, + /// then writes the version 0 commit file directly to the published commit path. + fn commit_version_0( + &self, + engine: &dyn Engine, + actions: Box> + Send + '_>, + commit_metadata: &CommitMetadata, + ) -> DeltaResult { + debug_assert!( + commit_metadata.version() == 0, + "commit_version_0 called with version {}", + commit_metadata.version() + ); + self.validate_catalog_managed_state(commit_metadata)?; + let published_commit_path = commit_metadata.published_commit_path()?; + match engine.json_handler().write_json_file( + &published_commit_path, + Box::new(actions), + false, + ) { + Ok(()) => { + info!("wrote version 0 commit file for UC table creation"); + let file_meta = engine.storage_handler().head(&published_commit_path)?; + Ok(CommitResponse::Committed { file_meta }) + } + Err(delta_kernel::Error::FileAlreadyExists(_)) => { + info!("version 0 commit conflict: commit file already exists"); + Ok(CommitResponse::Conflict { version: 0 }) + } + Err(e) => Err(e), + } + } + + /// Commit version >= 1. Validates catalog-managed status hasn't changed, writes a staged + /// commit file, and calls the UC commit API to ratify it. 
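+    ///
+    /// Roughly, for version `v` the flow is (illustrative file name; the real staged commit
+    /// path comes from `commit_metadata.staged_commit_path()`):
+    ///
+    /// ```ignore
+    /// // 1. write _delta_log/_staged_commits/{v:020}.<uuid>.json via the engine's JSON handler
+    /// // 2. ask UC to ratify the staged commit
+    /// let request = CommitRequest::new(table_id, table_root, commit, max_published_version);
+    /// commits_client.commit(request).await?;
+    /// ```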
+ fn commit_version_non_zero( + &self, + engine: &dyn Engine, + actions: Box> + Send + '_>, + commit_metadata: CommitMetadata, + ) -> DeltaResult + where + C: 'static, + { + debug_assert!( + commit_metadata.version() != 0, + "commit_version_non_zero called with version 0" + ); + self.validate_catalog_managed_state(&commit_metadata)?; + Self::validate_no_alter_table_changes(&commit_metadata)?; + let staged_commit_path = commit_metadata.staged_commit_path()?; + engine + .json_handler() + .write_json_file(&staged_commit_path, Box::new(actions), false)?; + + let committed = engine.storage_handler().head(&staged_commit_path)?; + debug!("wrote staged commit file: {:?}", committed); + + let commit_req = CommitRequest::new( + self.table_id.clone(), + commit_metadata.table_root().as_str(), + Commit::new( + commit_metadata.version().try_into().map_err(|_| { + DeltaError::generic("commit version does not fit into i64 for UC commit") + })?, + commit_metadata.in_commit_timestamp(), + staged_commit_path + .path_segments() + .ok_or_else(|| DeltaError::generic("staged commit contained no path segments"))? + .next_back() + .ok_or_else(|| { + DeltaError::generic("staged commit segments next_back was empty") + })?, + committed + .size + .try_into() + .map_err(|_| DeltaError::generic("committed size does not fit into i64"))?, + committed.last_modified, + ), + commit_metadata + .max_published_version() + .map(|v| { + v.try_into().map_err(|_| { + DeltaError::Generic(format!( + "Max published version {v} does not fit into i64 for UC commit" + )) + }) + }) + .transpose()?, + ); + let handle = tokio::runtime::Handle::try_current().map_err(|_| { + DeltaError::generic("UCCommitter may only be used within a tokio runtime") + })?; + tokio::task::block_in_place(|| { + handle.block_on(async move { + self.commits_client + .commit(commit_req) + .await + .map_err(|e| DeltaError::Generic(format!("UC commit error: {e}"))) + }) + })?; + Ok(CommitResponse::Committed { + file_meta: committed, + }) + } +} + +impl Committer for UCCommitter { + /// Commit the given `actions` to the delta table in UC. + /// + /// For version 0 (table creation), writes `000.json` directly to the published commit path. + /// The connector is responsible for finalizing the table in UC via the create table API. + /// + /// For version >= 1, writes a staged commit then calls the UC commit API to ratify it. + /// Connectors should publish staged commits to the delta log immediately after writing. + /// UC expects to be informed of the last known published version during commit. 
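+    ///
+    /// A rough end-to-end sketch from a connector's point of view (`commits_client`, `table_id`,
+    /// `snapshot`, and `engine` are assumed to come from the surrounding UC integration):
+    ///
+    /// ```ignore
+    /// let committer = Box::new(UCCommitter::new(commits_client.clone(), table_id.clone()));
+    /// let txn = snapshot.transaction(committer, &engine)?;
+    /// // ... stage writes via the transaction's write context ...
+    /// match txn.commit(&engine)? {
+    ///     CommitResult::CommittedTransaction(t) => println!("committed v{}", t.commit_version()),
+    ///     CommitResult::ConflictedTransaction(t) => println!("conflict at v{}", t.conflict_version()),
+    ///     CommitResult::RetryableTransaction(_) => println!("should retry"),
+    /// }
+    /// ```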
+ fn commit( + &self, + engine: &dyn Engine, + actions: Box> + Send + '_>, + commit_metadata: CommitMetadata, + ) -> DeltaResult { + if commit_metadata.version() == 0 { + return self.commit_version_0(engine, actions, &commit_metadata); + } + self.commit_version_non_zero(engine, actions, commit_metadata) + } + + fn is_catalog_committer(&self) -> bool { + true + } + + fn publish(&self, engine: &dyn Engine, publish_metadata: PublishMetadata) -> DeltaResult<()> { + if publish_metadata.commits_to_publish().is_empty() { + return Ok(()); + } + + for catalog_commit in publish_metadata.commits_to_publish() { + let src = catalog_commit.location(); + let dest = catalog_commit.published_location(); + match engine.storage_handler().copy_atomic(src, dest) { + Ok(_) => (), + Err(DeltaError::FileAlreadyExists(_)) => (), + Err(e) => return Err(e), + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::collections::HashMap; + use std::fs; + + use delta_kernel::committer::{CatalogCommit, CommitMetadata}; + use delta_kernel::engine::default::DefaultEngine; + use delta_kernel::object_store::local::LocalFileSystem; + use delta_kernel::Version; + use unity_catalog_delta_client_api::error::Result; + + struct MockCommitsClient; + + impl CommitClient for MockCommitsClient { + async fn commit(&self, _: CommitRequest) -> Result<()> { + unimplemented!() + } + } + + /// Creates a valid catalog-managed CommitMetadata with all required UC features and properties. + fn catalog_managed_commit_metadata(table_root: url::Url, version: Version) -> CommitMetadata { + CommitMetadata::new_unchecked_with( + table_root, + version, + vec!["catalogManaged", "vacuumProtocolCheck"], + vec!["catalogManaged", "inCommitTimestamp", "vacuumProtocolCheck"], + HashMap::from([ + ( + "io.unitycatalog.tableId".to_string(), + "test-table-id".to_string(), + ), + ( + "delta.enableInCommitTimestamps".to_string(), + "true".to_string(), + ), + ]), + ) + .unwrap() + } + + #[test] + fn commit_version_0_writes_published_commit() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let commit_metadata = catalog_managed_commit_metadata(table_root.clone(), 0); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + // Create the _delta_log directory + fs::create_dir_all(tmp_dir.path().join("_delta_log")).unwrap(); + + let result = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap(); + match result { + CommitResponse::Committed { file_meta } => { + assert!( + file_meta + .location + .as_str() + .ends_with("00000000000000000000.json"), + "expected published path for version 0, got: {}", + file_meta.location + ); + // Verify the file was written to disk + let commit_path = tmp_dir.path().join("_delta_log/00000000000000000000.json"); + assert!(commit_path.exists(), "000.json should exist on disk"); + } + CommitResponse::Conflict { .. 
} => { + panic!("expected Committed for version 0, got Conflict") + } + } + } + + #[test] + fn commit_version_0_conflict_when_file_exists() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + // Pre-create the commit file to trigger a conflict + let delta_log = tmp_dir.path().join("_delta_log"); + fs::create_dir_all(&delta_log).unwrap(); + fs::write(delta_log.join("00000000000000000000.json"), "existing").unwrap(); + + let commit_metadata = catalog_managed_commit_metadata(table_root, 0); + let result = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap(); + assert!( + matches!(result, CommitResponse::Conflict { version: 0 }), + "expected Conflict for version 0 when file exists, got: {result:?}" + ); + } + + #[test] + fn commit_version_0_rejects_missing_catalog_managed_feature() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let commit_metadata = CommitMetadata::new_unchecked(table_root, 0).unwrap(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + fs::create_dir_all(tmp_dir.path().join("_delta_log")).unwrap(); + + let err = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("catalogManaged"), + "expected catalogManaged error, got: {err}" + ); + } + + #[test] + fn commit_version_0_rejects_missing_table_id() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + // Has features but missing io.unitycatalog.tableId in config + let commit_metadata = CommitMetadata::new_unchecked_with( + table_root, + 0, + vec!["catalogManaged", "vacuumProtocolCheck"], + vec!["catalogManaged", "inCommitTimestamp", "vacuumProtocolCheck"], + HashMap::from([( + "delta.enableInCommitTimestamps".to_string(), + "true".to_string(), + )]), + ) + .unwrap(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + fs::create_dir_all(tmp_dir.path().join("_delta_log")).unwrap(); + + let err = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("io.unitycatalog.tableId"), + "expected tableId error, got: {err}" + ); + } + + #[test] + fn commit_version_0_rejects_missing_ict_enablement() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + // Has features and tableId but missing delta.enableInCommitTimestamps=true + let commit_metadata = CommitMetadata::new_unchecked_with( + table_root, + 0, + vec!["catalogManaged", "vacuumProtocolCheck"], + vec!["catalogManaged", "inCommitTimestamp", "vacuumProtocolCheck"], + HashMap::from([( + "io.unitycatalog.tableId".to_string(), + "test-table-id".to_string(), + )]), + ) + .unwrap(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + fs::create_dir_all(tmp_dir.path().join("_delta_log")).unwrap(); + + let err = committer 
+ .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("enableInCommitTimestamps"), + "expected ICT enablement error, got: {err}" + ); + } + + #[test] + fn commit_version_non_zero_rejects_non_catalog_managed_table() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + // Version >= 1 but without catalogManaged feature (simulates downgrade attempt) + let commit_metadata = CommitMetadata::new_unchecked(table_root, 1).unwrap(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + let err = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("catalogManaged"), + "expected catalogManaged error, got: {err}" + ); + } + + #[test] + fn commit_version_non_zero_rejects_protocol_change() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let commit_metadata = catalog_managed_commit_metadata(table_root, 1).with_protocol_change(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + let err = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("table protocol"), + "expected protocol change error, got: {err}" + ); + } + + #[test] + fn commit_version_non_zero_rejects_metadata_change() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let commit_metadata = catalog_managed_commit_metadata(table_root, 1).with_metadata_change(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + let err = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("table metadata"), + "expected metadata change error, got: {err}" + ); + } + + #[test] + fn commit_version_non_zero_rejects_clustering_change() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let commit_metadata = + catalog_managed_commit_metadata(table_root, 1).with_domain_change("delta.clustering"); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "test-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + let err = committer + .commit(&engine, Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("clustering columns"), + "expected clustering change error, got: {err}" + ); + } + + #[test] + fn commit_version_non_zero_rejects_mismatched_table_id() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let commit_metadata = catalog_managed_commit_metadata(table_root, 1); + // Committer initialized with a different table ID than what's in the metadata + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "different-table-id"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + + let err = committer + .commit(&engine, 
Box::new(std::iter::empty()), commit_metadata) + .unwrap_err(); + assert!( + err.to_string().contains("table ID mismatch"), + "expected table ID mismatch error, got: {err}" + ); + } + + fn staged_commit_url(table_root: &url::Url, version: Version) -> url::Url { + table_root + .join(&format!( + "_delta_log/_staged_commits/{version:020}.uuid.json" + )) + .unwrap() + } + + fn published_commit_url(table_root: &url::Url, version: Version) -> url::Url { + table_root + .join(&format!("_delta_log/{version:020}.json")) + .unwrap() + } + + #[tokio::test] + async fn test_publish() { + let tmp_dir = tempfile::tempdir().unwrap(); + let table_root = url::Url::from_directory_path(tmp_dir.path()).unwrap(); + let staged_dir = tmp_dir.path().join("_delta_log/_staged_commits"); + let versions = [10u64, 11, 12]; + + // ===== GIVEN ===== + // Create catalog commits + let catalog_commits: Vec = versions + .into_iter() + .map(|v| { + CatalogCommit::new_unchecked( + v, + staged_commit_url(&table_root, v), + published_commit_url(&table_root, v), + ) + }) + .collect(); + + // Write staged commit files to disk + fs::create_dir_all(&staged_dir).unwrap(); + for commit in &catalog_commits { + let path = commit.location().to_file_path().unwrap(); + fs::write(&path, format!("version: {}", commit.version())).unwrap(); + } + + // Write 10.json file to disk (should be skipped, not error) + let existing_published = published_commit_url(&table_root, 10) + .to_file_path() + .unwrap(); + fs::create_dir_all(existing_published.parent().unwrap()).unwrap(); + fs::write(&existing_published, "version: 10").unwrap(); + + // ===== WHEN ===== + let publish_metadata = PublishMetadata::try_new(12, catalog_commits).unwrap(); + let committer = UCCommitter::new(Arc::new(MockCommitsClient), "testUcTableId"); + let engine = DefaultEngine::builder(Arc::new(LocalFileSystem::new())).build(); + committer.publish(&engine, publish_metadata).unwrap(); + + // ===== THEN ===== + for v in versions { + let path = published_commit_url(&table_root, v).to_file_path().unwrap(); + assert!(path.exists()); + assert_eq!(fs::read_to_string(&path).unwrap(), format!("version: {v}")); + } + } +} diff --git a/delta-kernel-unity-catalog/src/constants.rs b/delta-kernel-unity-catalog/src/constants.rs new file mode 100644 index 0000000000..f6541ca8f6 --- /dev/null +++ b/delta-kernel-unity-catalog/src/constants.rs @@ -0,0 +1,24 @@ +//! Shared constants for UC catalog-managed table operations. + +/// Property key for the UC table ID, stored in Delta metadata configuration. +pub(crate) const UC_TABLE_ID_KEY: &str = "io.unitycatalog.tableId"; +/// Property key to enable in-commit timestamps. +pub(crate) const ENABLE_IN_COMMIT_TIMESTAMPS: &str = "delta.enableInCommitTimestamps"; +/// Feature supported value. +pub(crate) const FEATURE_SUPPORTED: &str = "supported"; +/// Feature signal key for catalog-managed tables. +pub(crate) const CATALOG_MANAGED_FEATURE_KEY: &str = "delta.feature.catalogManaged"; +/// Feature signal key for vacuum protocol check. +pub(crate) const VACUUM_PROTOCOL_CHECK_FEATURE_KEY: &str = "delta.feature.vacuumProtocolCheck"; +/// UC property for the last committed version. +pub(crate) const METASTORE_LAST_UPDATE_VERSION: &str = "delta.lastUpdateVersion"; +/// UC property for the last commit timestamp. +pub(crate) const METASTORE_LAST_COMMIT_TIMESTAMP: &str = "delta.lastCommitTimestamp"; +/// Feature name for catalog-managed tables (wire format). 
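+/// (i.e. the bare name as it appears in the protocol's reader/writer feature lists, as opposed
+/// to the `delta.feature.catalogManaged = "supported"` property-style key above that requests
+/// the feature at table creation)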
+pub(crate) const CATALOG_MANAGED_FEATURE: &str = "catalogManaged"; +/// Feature name for vacuum protocol check (wire format). +pub(crate) const VACUUM_PROTOCOL_CHECK_FEATURE: &str = "vacuumProtocolCheck"; +/// Feature name for in-commit timestamps (wire format). +pub(crate) const IN_COMMIT_TIMESTAMP_FEATURE: &str = "inCommitTimestamp"; +/// Domain name for clustering metadata. +pub(crate) const CLUSTERING_DOMAIN_NAME: &str = "delta.clustering"; diff --git a/delta-kernel-unity-catalog/src/errors.rs b/delta-kernel-unity-catalog/src/errors.rs new file mode 100644 index 0000000000..bfe32765ac --- /dev/null +++ b/delta-kernel-unity-catalog/src/errors.rs @@ -0,0 +1,43 @@ +//! Error helpers for UC operations. Centralizes error message construction to keep validation +//! logic concise. + +use delta_kernel::Error as DeltaError; + +pub(crate) fn missing_feature(feature: &str) -> DeltaError { + DeltaError::generic(format!( + "UC catalog-managed table requires the '{feature}' table feature" + )) +} + +pub(crate) fn missing_metadata_configuration() -> DeltaError { + DeltaError::generic("UC catalog-managed table requires metadata configuration") +} + +pub(crate) fn missing_property(key: &str) -> DeltaError { + DeltaError::generic(format!( + "UC catalog-managed table requires '{key}' in metadata configuration" + )) +} + +pub(crate) fn table_id_mismatch(expected: &str, actual: &str) -> DeltaError { + DeltaError::generic(format!( + "UC table ID mismatch: expected '{expected}' but found '{actual}'" + )) +} + +pub(crate) fn ict_not_enabled() -> DeltaError { + DeltaError::generic("UC catalog-managed table requires 'delta.enableInCommitTimestamps=true'") +} + +pub(crate) fn upgrade_downgrade_unsupported(direction: &str) -> DeltaError { + DeltaError::generic(format!( + "Table {direction} is not yet supported by the UCCommitter" + )) +} + +pub(crate) fn alter_table_unsupported(what: &str) -> DeltaError { + DeltaError::generic(format!( + "UCCommitter does not support commits that change the table {what}. \ + ALTER TABLE is not supported for catalog-managed tables." + )) +} diff --git a/delta-kernel-unity-catalog/src/lib.rs b/delta-kernel-unity-catalog/src/lib.rs new file mode 100644 index 0000000000..4571608ba9 --- /dev/null +++ b/delta-kernel-unity-catalog/src/lib.rs @@ -0,0 +1,318 @@ +//! UCKernelClient implements a high-level interface for interacting with Delta Tables in Unity Catalog. + +mod committer; +mod constants; +mod errors; +mod utils; +pub use committer::UCCommitter; +pub use utils::{get_final_required_properties_for_uc, get_required_properties_for_disk}; + +use std::sync::Arc; + +use delta_kernel::{Engine, LogPath, Snapshot, Version}; + +use unity_catalog_delta_client_api::{CommitsRequest, GetCommitsClient}; + +use itertools::Itertools; +use tracing::debug; +use url::Url; + +/// The [UCKernelClient] provides a high-level interface to interact with Delta Tables stored in +/// Unity Catalog. It is a lightweight wrapper around a [GetCommitsClient]. +pub struct UCKernelClient<'a, C: GetCommitsClient> { + get_commits_client: &'a C, +} + +impl<'a, C: GetCommitsClient> UCKernelClient<'a, C> { + /// Create a new [UCKernelClient] instance with the provided client. + pub fn new(get_commits_client: &'a C) -> Self { + UCKernelClient { get_commits_client } + } + + /// Load the latest snapshot of a Delta Table identified by `table_id` and `table_uri` in Unity + /// Catalog. Generally, a separate `get_table` call can be used to resolve the table id/uri from + /// the table name. 
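+    ///
+    /// A minimal sketch (assumes a configured `UCCommitsRestClient` as the commits client and a
+    /// default engine built for the table's object store; `table_id` and `table_uri` come from a
+    /// prior `get_table` call):
+    ///
+    /// ```ignore
+    /// let catalog = UCKernelClient::new(&uc_commits_client);
+    /// let snapshot = catalog.load_snapshot(&table_id, &table_uri, &engine).await?;
+    /// println!("loaded snapshot at version {}", snapshot.version());
+    /// ```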
+ pub async fn load_snapshot( + &self, + table_id: &str, + table_uri: &str, + engine: &dyn Engine, + ) -> Result, Box> { + self.load_snapshot_inner(table_id, table_uri, None, engine) + .await + } + + /// Load a snapshot of a Delta Table identified by `table_id` and `table_uri` for a specific + /// version. Generally, a separate `get_table` call can be used to resolve the table id/uri from + /// the table name. + pub async fn load_snapshot_at( + &self, + table_id: &str, + table_uri: &str, + version: Version, + engine: &dyn Engine, + ) -> Result, Box> { + self.load_snapshot_inner(table_id, table_uri, Some(version), engine) + .await + } + + pub(crate) async fn load_snapshot_inner( + &self, + table_id: &str, + table_uri: &str, + version: Option, + engine: &dyn Engine, + ) -> Result, Box> { + let table_uri = table_uri.to_string(); + let req = CommitsRequest { + table_id: table_id.to_string(), + table_uri: table_uri.clone(), + start_version: Some(0), + end_version: version.and_then(|v| v.try_into().ok()), + }; + let mut commits = self.get_commits_client.get_commits(req).await?; + if let Some(commits) = commits.commits.as_mut() { + commits.sort_by_key(|c| c.version) + } + + // if commits are present, we ensure they are sorted+contiguous + if let Some(commits) = &commits.commits { + if !commits.windows(2).all(|w| w[1].version == w[0].version + 1) { + return Err("Received non-contiguous commit versions".into()); + } + } + + // we always get back the latest version from commits response, and pass that in to + // kernel's Snapshot builder. basically, load_table for the latest version always looks + // like a time travel query since we know the latest version ahead of time. + // + // note there is a weird edge case: if the table was just created it will return + // latest_table_version = -1, but the 0.json will exist in the _delta_log. + let version: Version = match version { + Some(v) => v, + None => match commits.latest_table_version { + -1 => 0, + i => i.try_into()?, + }, + }; + + // consume the UC Commit and hand back a delta_kernel LogPath + let mut table_url = Url::parse(&table_uri)?; + // add trailing slash + if !table_url.path().ends_with('/') { + // NB: we push an empty segment which effectively adds a trailing slash + table_url + .path_segments_mut() + .map_err(|_| "Cannot modify URL path segments")? + .push(""); + } + let commits: Vec<_> = commits + .commits + .unwrap_or_default() + .into_iter() + .map( + |c| -> Result> { + LogPath::staged_commit( + table_url.clone(), + &c.file_name, + c.file_modification_timestamp, + c.file_size.try_into()?, + ) + .map_err(|e| e.into()) + }, + ) + .try_collect()?; + + debug!("commits for kernel: {:?}\n", commits); + + Snapshot::builder_for(table_url) + .at_version(version) + .with_log_tail(commits) + .build(engine) + .map_err(|e| e.into()) + } +} + +#[cfg(test)] +mod tests { + use std::env; + use std::sync::Arc; + + use delta_kernel::engine::default::DefaultEngineBuilder; + use delta_kernel::object_store; + use delta_kernel::object_store::memory::InMemory; + use delta_kernel::transaction::CommitResult; + + use tracing::info; + use unity_catalog_delta_client_api::{Commit, InMemoryCommitsClient, Operation, TableData}; + use unity_catalog_delta_rest_client::{UCClient, UCCommitsRestClient}; + + use super::*; + + // We could just re-export UCClient's get_table to not require consumers to directly import + // unity_catalog_delta_rest_client themselves. 
+ async fn get_table( + client: &UCClient, + table_name: &str, + ) -> Result<(String, String), Box> { + let res = client.get_table(table_name).await?; + let table_id = res.table_id; + let table_uri = res.storage_location; + + info!( + "[GET TABLE] got table_id: {}, table_uri: {}\n", + table_id, table_uri + ); + + Ok((table_id, table_uri)) + } + + // ignored test which you can run manually to play around with reading a UC table. run with: + // `ENDPOINT=".." TABLENAME=".." TOKEN=".." cargo t read_uc_table --nocapture -- --ignored` + #[ignore] + #[tokio::test] + async fn read_uc_table() -> Result<(), Box> { + let endpoint = env::var("ENDPOINT").expect("ENDPOINT environment variable not set"); + let token = env::var("TOKEN").expect("TOKEN environment variable not set"); + let table_name = env::var("TABLENAME").expect("TABLENAME environment variable not set"); + + // build shared config + let config = + unity_catalog_delta_rest_client::ClientConfig::build(&endpoint, &token).build()?; + + // build clients + let uc_client = UCClient::new(config.clone())?; + let uc_commits_client = UCCommitsRestClient::new(config)?; + + let (table_id, table_uri) = get_table(&uc_client, &table_name).await?; + let creds = uc_client + .get_credentials(&table_id, Operation::Read) + .await + .map_err(|e| format!("Failed to get credentials: {e}"))?; + + let catalog = UCKernelClient::new(&uc_commits_client); + + // TODO: support non-AWS + let creds = creds + .aws_temp_credentials + .ok_or("No AWS temporary credentials found")?; + + let options = [ + ("region", "us-west-2"), + ("access_key_id", &creds.access_key_id), + ("secret_access_key", &creds.secret_access_key), + ("session_token", &creds.session_token), + ]; + + let table_url = Url::parse(&table_uri)?; + let (store, path) = object_store::parse_url_opts(&table_url, options)?; + + info!("created object store: {:?}\npath: {:?}\n", store, path); + + let engine = DefaultEngineBuilder::new(store.into()).build(); + + // read table + let snapshot = catalog + .load_snapshot(&table_id, &table_uri, &engine) + .await?; + // or time travel + // let snapshot = catalog.load_snapshot_at(&table, 2).await?; + + println!("loaded snapshot: {snapshot:?}"); + + Ok(()) + } + + // ignored test which you can run manually to play around with writing to a UC table. run with: + // `ENDPOINT=".." TABLENAME=".." TOKEN=".." 
cargo t write_uc_table --nocapture -- --ignored` + #[ignore] + #[tokio::test(flavor = "multi_thread")] + async fn write_uc_table() -> Result<(), Box> { + let endpoint = env::var("ENDPOINT").expect("ENDPOINT environment variable not set"); + let token = env::var("TOKEN").expect("TOKEN environment variable not set"); + let table_name = env::var("TABLENAME").expect("TABLENAME environment variable not set"); + + // build shared config + let config = + unity_catalog_delta_rest_client::ClientConfig::build(&endpoint, &token).build()?; + + // build clients + let client = UCClient::new(config.clone())?; + let commits_client = Arc::new(UCCommitsRestClient::new(config)?); + + let (table_id, table_uri) = get_table(&client, &table_name).await?; + let creds = client + .get_credentials(&table_id, Operation::ReadWrite) + .await + .map_err(|e| format!("Failed to get credentials: {e}"))?; + + let catalog = UCKernelClient::new(commits_client.as_ref()); + + // TODO: support non-AWS + let creds = creds + .aws_temp_credentials + .ok_or("No AWS temporary credentials found")?; + + let options = [ + ("region", "us-west-2"), + ("access_key_id", &creds.access_key_id), + ("secret_access_key", &creds.secret_access_key), + ("session_token", &creds.session_token), + ]; + + let table_url = Url::parse(&table_uri)?; + let (store, _path) = object_store::parse_url_opts(&table_url, options)?; + let store = Arc::new(store); + + let engine = DefaultEngineBuilder::new(store.clone()).build(); + let committer = Box::new(UCCommitter::new(commits_client.clone(), table_id.clone())); + let snapshot = catalog + .load_snapshot(&table_id, &table_uri, &engine) + .await?; + println!("latest snapshot version: {:?}", snapshot.version()); + let txn = snapshot.clone().transaction(committer, &engine)?; + let _write_context = txn.get_write_context(); + + match txn.commit(&engine)? { + CommitResult::CommittedTransaction(t) => { + println!("committed version {}", t.commit_version()); + // TODO: should use post-commit snapshot here (plumb through log tail) + let _snapshot = catalog + .load_snapshot_at(&table_id, &table_uri, t.commit_version(), &engine) + .await?; + // then do publish + } + CommitResult::ConflictedTransaction(t) => { + println!("commit conflicted at version {}", t.conflict_version()); + } + CommitResult::RetryableTransaction(_) => { + println!("we should retry..."); + } + } + Ok(()) + } + + #[tokio::test] + async fn load_snapshot_errors_on_non_contiguous_commits() { + let client = InMemoryCommitsClient::new(); + client.insert_table( + "test_table", + TableData { + max_ratified_version: 3, + catalog_commits: vec![ + Commit::new(1, 0, "1.json", 100, 0), + Commit::new(3, 0, "3.json", 100, 0), // gap: version 2 missing + ], + }, + ); + let store = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(store).build(); + let catalog = UCKernelClient::new(&client); + + let result = catalog + .load_snapshot("test_table", "memory:///", &engine) + .await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("non-contiguous")); + } +} diff --git a/delta-kernel-unity-catalog/src/utils/create_table.rs b/delta-kernel-unity-catalog/src/utils/create_table.rs new file mode 100644 index 0000000000..88a5614ba4 --- /dev/null +++ b/delta-kernel-unity-catalog/src/utils/create_table.rs @@ -0,0 +1,305 @@ +//! Utilities for Unity Catalog catalog-managed table creation. +//! +//! These utilities help connectors create UC-managed tables by providing the required properties +//! 
for both the Delta log (disk) and the UC server registration. +//! +//! # Usage +//! +//! ```ignore +//! // Step 1: Get staging info from UC +//! let staging_info = my_uc_client.get_staging_table(..); +//! +//! // Step 2: Build and commit the create-table transaction +//! let disk_props = get_required_properties_for_disk(staging_info.table_id); +//! let create_table_txn = kernel::create_table(path, schema, "MyApp/1.0") +//! .with_table_properties(disk_props) +//! .build(engine, committer); +//! let result = create_table_txn.commit(engine); +//! +//! // Step 3: Finalize table in UC +//! let snapshot = /* load post-commit snapshot at version 0 */; +//! let uc_props = get_final_required_properties_for_uc(&snapshot, engine)?; +//! my_uc_client.create_table(.., uc_props); +//! ``` + +use std::collections::HashMap; + +use delta_kernel::{Engine, Snapshot}; + +use crate::constants::{ + CATALOG_MANAGED_FEATURE_KEY, FEATURE_SUPPORTED, METASTORE_LAST_COMMIT_TIMESTAMP, + METASTORE_LAST_UPDATE_VERSION, UC_TABLE_ID_KEY, VACUUM_PROTOCOL_CHECK_FEATURE_KEY, +}; + +/// Returns the table properties that must be written to disk (in `000.json`) for a UC +/// catalog-managed table creation. +/// +/// These properties must be persisted in the Delta log so that the table is recognized as +/// catalog-managed. Note: ICT enablement is handled automatically by kernel's CREATE TABLE +/// when the `catalogManaged` feature is present. +pub fn get_required_properties_for_disk(uc_table_id: &str) -> HashMap { + [ + (CATALOG_MANAGED_FEATURE_KEY, FEATURE_SUPPORTED), + (VACUUM_PROTOCOL_CHECK_FEATURE_KEY, FEATURE_SUPPORTED), + (UC_TABLE_ID_KEY, uc_table_id), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect() +} + +/// Extracts the properties that must be sent to the UC server when finalizing a table creation. +/// +/// These properties are derived from the post-commit snapshot (after `000.json` has +/// been written). The connector should pass these to the UC `create_table` API. +/// +/// # Properties returned +/// +/// - All entries from `Metadata.configuration` (includes `io.unitycatalog.tableId`, user props) +/// - `delta.minReaderVersion` and `delta.minWriterVersion` +/// - `delta.feature. = "supported"` for every reader and writer table feature +/// - `delta.lastUpdateVersion` -- the snapshot version +/// - `delta.lastCommitTimestamp` -- the snapshot's in-commit timestamp (requires ICT enabled) +/// - `clusteringColumns` -- JSON-serialized clustering columns (if clustering is enabled) +/// +/// # Clustering columns +/// +/// Clustering columns are returned as logical column names. When column mapping is enabled, +/// the physical names stored in domain metadata are converted to logical names using the +/// table schema. 
+pub fn get_final_required_properties_for_uc( + snapshot: &Snapshot, + engine: &dyn Engine, +) -> delta_kernel::DeltaResult> { + if snapshot.version() != 0 { + return Err(delta_kernel::Error::generic(format!( + "get_final_required_properties_for_uc is only valid for version 0 (table creation) \ + snapshots, but snapshot is at version {}", + snapshot.version() + ))); + } + + // Start with metadata configuration (user + delta properties) + let mut properties = snapshot.metadata_configuration().clone(); + + // Protocol-derived properties (versions + feature signals) + properties.extend(snapshot.get_protocol_derived_properties()); + + // UC-specific properties + properties.insert( + METASTORE_LAST_UPDATE_VERSION.to_string(), + snapshot.version().to_string(), + ); + let timestamp = snapshot.get_in_commit_timestamp(engine)?.ok_or_else(|| { + delta_kernel::Error::generic( + "In-commit timestamp is required for UC catalog-managed tables but was not found", + ) + })?; + properties.insert( + METASTORE_LAST_COMMIT_TIMESTAMP.to_string(), + timestamp.to_string(), + ); + + // Clustering columns as logical names (if present) + if let Some(columns) = snapshot.get_logical_clustering_columns(engine)? { + let column_arrays: Vec> = columns + .iter() + .map(|c| c.path().iter().map(|s| s.as_str()).collect()) + .collect(); + let json = serde_json::to_string(&column_arrays).map_err(|e| { + delta_kernel::Error::generic(format!("Failed to serialize clustering columns: {e}")) + })?; + properties.insert("clusteringColumns".to_string(), json); + } + + Ok(properties) +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::sync::Arc; + + use delta_kernel::committer::{CommitMetadata, CommitResponse, Committer, PublishMetadata}; + use delta_kernel::engine::default::DefaultEngineBuilder; + use delta_kernel::object_store::memory::InMemory; + use delta_kernel::schema::{DataType, StructField, StructType}; + use delta_kernel::snapshot::Snapshot; + use delta_kernel::transaction::create_table::create_table; + use delta_kernel::transaction::data_layout::DataLayout; + use delta_kernel::{DeltaResult, Engine, FileMeta, FilteredEngineData}; + + /// A mock catalog committer that writes directly to the published path. 
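+    /// Lets these tests drive `create_table` end-to-end (including the version 0 commit) without
+    /// talking to a real Unity Catalog endpoint.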
+ struct MockCatalogCommitter; + impl Committer for MockCatalogCommitter { + fn commit( + &self, + engine: &dyn Engine, + actions: Box> + Send + '_>, + commit_metadata: CommitMetadata, + ) -> DeltaResult { + let path = commit_metadata.published_commit_path()?; + engine + .json_handler() + .write_json_file(&path, Box::new(actions), false)?; + Ok(CommitResponse::Committed { + file_meta: FileMeta::new(path, commit_metadata.in_commit_timestamp(), 0), + }) + } + fn is_catalog_committer(&self) -> bool { + true + } + fn publish(&self, _: &dyn Engine, _: PublishMetadata) -> DeltaResult<()> { + Ok(()) + } + } + + #[test] + fn test_get_required_properties_for_disk() { + let props = get_required_properties_for_disk("my-uc-table-123"); + assert_eq!(props.len(), 3); + assert_eq!(props["delta.feature.catalogManaged"], "supported"); + assert_eq!(props["delta.feature.vacuumProtocolCheck"], "supported"); + assert_eq!(props["io.unitycatalog.tableId"], "my-uc-table-123"); + } + + #[tokio::test] + async fn test_get_final_required_properties_for_uc() { + let storage = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(storage).build(); + let table_path = "memory:///test_table/"; + let schema = Arc::new( + StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("region", DataType::STRING, true), + ]) + .unwrap(), + ); + + // Create a UC catalog-managed table with clustering + let disk_props = get_required_properties_for_disk("test-table-id-456"); + let _ = create_table(table_path, schema, "Test/1.0") + .with_table_properties(disk_props) + .with_data_layout(DataLayout::clustered(["region"])) + .build(&engine, Box::new(MockCatalogCommitter)) + .unwrap() + .commit(&engine) + .unwrap(); + + let snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); + assert_eq!(snapshot.version(), 0); + let uc_props = get_final_required_properties_for_uc(&snapshot, &engine).unwrap(); + + // Protocol-derived properties + assert_eq!(uc_props["delta.minReaderVersion"], "3"); + assert_eq!(uc_props["delta.minWriterVersion"], "7"); + assert_eq!(uc_props["delta.feature.catalogManaged"], "supported"); + assert_eq!(uc_props["delta.feature.vacuumProtocolCheck"], "supported"); + assert_eq!(uc_props["delta.feature.inCommitTimestamp"], "supported"); + assert_eq!(uc_props["delta.feature.clustering"], "supported"); + + // Metadata configuration + assert_eq!(uc_props["io.unitycatalog.tableId"], "test-table-id-456"); + + // UC-specific properties + assert_eq!(uc_props["delta.lastUpdateVersion"], "0"); + let timestamp: i64 = uc_props["delta.lastCommitTimestamp"] + .parse() + .expect("timestamp should be a valid i64"); + assert!( + timestamp > 0, + "ICT timestamp should be non-zero, got {timestamp}" + ); + + // Clustering columns: serialized as [[col1], [col2]] (array of path arrays) + let parsed: Vec> = + serde_json::from_str(&uc_props["clusteringColumns"]).unwrap(); + assert_eq!(parsed, vec![vec!["region"]]); + } + + #[tokio::test] + async fn test_clustering_columns_serialization_multiple_and_nested() { + let storage = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(storage).build(); + let table_path = "memory:///test_clustering_ser/"; + let address_struct = StructType::new_unchecked(vec![ + StructField::new("city", DataType::STRING, true), + StructField::new("zip", DataType::STRING, true), + ]); + let schema = Arc::new( + StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("region", DataType::STRING, true), + 
StructField::new("address", DataType::Struct(Box::new(address_struct)), true), + ]) + .unwrap(), + ); + + use delta_kernel::expressions::ColumnName; + + let disk_props = get_required_properties_for_disk("test-table-id"); + let _ = create_table(table_path, schema, "Test/1.0") + .with_table_properties(disk_props) + .with_data_layout(DataLayout::Clustered { + columns: vec![ + ColumnName::new(["region"]), + ColumnName::new(["address", "city"]), + ], + }) + .build(&engine, Box::new(MockCatalogCommitter)) + .unwrap() + .commit(&engine) + .unwrap(); + + let snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); + let uc_props = get_final_required_properties_for_uc(&snapshot, &engine).unwrap(); + + // Clustering columns serialized as array of path arrays: + // [["region"], ["address", "city"]] + let raw_json = &uc_props["clusteringColumns"]; + let parsed: Vec> = serde_json::from_str(raw_json).unwrap(); + assert_eq!( + parsed, + vec![vec!["region"], vec!["address", "city"]], + "Raw JSON: {raw_json}" + ); + } + + #[tokio::test] + async fn test_get_final_required_properties_for_uc_rejects_non_zero_version() { + let storage = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(storage).build(); + let table_path = "memory:///test_version_check/"; + let schema = Arc::new( + StructType::try_new(vec![StructField::new("id", DataType::INTEGER, false)]).unwrap(), + ); + + // Create a table (version 0) and append (version 1) + let disk_props = get_required_properties_for_disk("test-table-id"); + let _ = create_table(table_path, schema, "Test/1.0") + .with_table_properties(disk_props) + .build(&engine, Box::new(MockCatalogCommitter)) + .unwrap() + .commit(&engine) + .unwrap(); + let v0_snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); + let result = v0_snapshot + .transaction(Box::new(MockCatalogCommitter), &engine) + .unwrap() + .commit(&engine) + .unwrap(); + assert!(result.is_committed()); + + // Load snapshot at version 1 + let snapshot = Snapshot::builder_for(table_path).build(&engine).unwrap(); + assert_eq!(snapshot.version(), 1); + + // Should fail because version != 0 + let err = get_final_required_properties_for_uc(&snapshot, &engine).unwrap_err(); + assert!( + err.to_string().contains("version 0"), + "expected version 0 error, got: {err}" + ); + } +} diff --git a/delta-kernel-unity-catalog/src/utils/mod.rs b/delta-kernel-unity-catalog/src/utils/mod.rs new file mode 100644 index 0000000000..595410194c --- /dev/null +++ b/delta-kernel-unity-catalog/src/utils/mod.rs @@ -0,0 +1,3 @@ +pub(crate) mod create_table; + +pub use create_table::{get_final_required_properties_for_uc, get_required_properties_for_disk}; diff --git a/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/00000000000000000000.json b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..4aa0f02197 --- /dev/null +++ b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"inCommitTimestamp":1749830855993,"timestamp":1749830855992,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[\"part1\"]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"d108f896-9662-4eda-b4de-444a99850aa8"}} 
+{"metaData":{"id":"64dcd182-b3b4-4ee0-88e0-63c159a4121c","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"part1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"col1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part1"],"configuration":{"delta.enableInCommitTimestamps":"true","io.unitycatalog.tableId":"64dcd182-b3b4-4ee0-88e0-63c159a4121c"},"createdTime":1749830855646}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["catalogManaged","vacuumProtocolCheck"],"writerFeatures":["catalogManaged","inCommitTimestamp","vacuumProtocolCheck"]}} diff --git a/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/_staged_commits/00000000000000000001.4cb9708e-b478-44de-b203-53f9ba9b2876.json b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/_staged_commits/00000000000000000001.4cb9708e-b478-44de-b203-53f9ba9b2876.json new file mode 100644 index 0000000000..3970d2a02d --- /dev/null +++ b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/_staged_commits/00000000000000000001.4cb9708e-b478-44de-b203-53f9ba9b2876.json @@ -0,0 +1,2 @@ +{"commitInfo":{"inCommitTimestamp":1749830871085,"timestamp":1749830871084,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"889"},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"4cb9708e-b478-44de-b203-53f9ba9b2876"}} +{"add":{"path":"part1=0/part-00000-13fefaba-8ec2-4762-b17e-aeda657451c5.c000.snappy.parquet","partitionValues":{"part1":"0"},"size":889,"modificationTime":1749830870833,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"col1\":0},\"maxValues\":{\"col1\":99},\"nullCount\":{\"col1\":0}}"}} diff --git a/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/_staged_commits/00000000000000000002.5b9bba4a-0085-430d-a65e-b0d38c1afbe9.json b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/_staged_commits/00000000000000000002.5b9bba4a-0085-430d-a65e-b0d38c1afbe9.json new file mode 100644 index 0000000000..4c42e497d6 --- /dev/null +++ b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/_delta_log/_staged_commits/00000000000000000002.5b9bba4a-0085-430d-a65e-b0d38c1afbe9.json @@ -0,0 +1,2 @@ +{"commitInfo":{"inCommitTimestamp":1749830881799,"timestamp":1749830881798,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"891"},"engineInfo":"Apache-Spark/4.0.0 Delta-Lake/4.0.0","txnId":"5b9bba4a-0085-430d-a65e-b0d38c1afbe9"}} +{"add":{"path":"part1=1/part-00000-8afb1c56-2018-4af2-aa4f-4336c1b39efd.c000.snappy.parquet","partitionValues":{"part1":"1"},"size":891,"modificationTime":1749830881779,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"col1\":100},\"maxValues\":{\"col1\":199},\"nullCount\":{\"col1\":0}}"}} diff --git a/delta-kernel-unity-catalog/tests/data/catalog_managed_0/part1=0/part-00000-13fefaba-8ec2-4762-b17e-aeda657451c5.c000.snappy.parquet b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/part1=0/part-00000-13fefaba-8ec2-4762-b17e-aeda657451c5.c000.snappy.parquet new file mode 100644 index 0000000000..b3e92a479d Binary files /dev/null and 
b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/part1=0/part-00000-13fefaba-8ec2-4762-b17e-aeda657451c5.c000.snappy.parquet differ diff --git a/delta-kernel-unity-catalog/tests/data/catalog_managed_0/part1=1/part-00000-8afb1c56-2018-4af2-aa4f-4336c1b39efd.c000.snappy.parquet b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/part1=1/part-00000-8afb1c56-2018-4af2-aa4f-4336c1b39efd.c000.snappy.parquet new file mode 100644 index 0000000000..3457677546 Binary files /dev/null and b/delta-kernel-unity-catalog/tests/data/catalog_managed_0/part1=1/part-00000-8afb1c56-2018-4af2-aa4f-4336c1b39efd.c000.snappy.parquet differ diff --git a/delta-kernel-unity-catalog/tests/e2e_in_memory.rs b/delta-kernel-unity-catalog/tests/e2e_in_memory.rs new file mode 100644 index 0000000000..bed797500c --- /dev/null +++ b/delta-kernel-unity-catalog/tests/e2e_in_memory.rs @@ -0,0 +1,214 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor; +use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::object_store::local::LocalFileSystem; +use delta_kernel::transaction::CommitResult; +use delta_kernel::Snapshot; +use delta_kernel_unity_catalog::{UCCommitter, UCKernelClient}; +use unity_catalog_delta_client_api::{Commit, InMemoryCommitsClient, TableData}; + +// ============================================================================ +// Test Setup +// ============================================================================ + +type TestError = Box; + +const TABLE_ID: &str = "64dcd182-b3b4-4ee0-88e0-63c159a4121c"; + +/// Test fixtures: commits client, engine, snapshot at v2, and temp directory. +struct TestSetup { + commits_client: Arc, + engine: DefaultEngine, + snapshot: Arc, + table_uri: url::Url, + /// Tests must bind this field (not ignore with `..` or `_`) to prevent the temp directory + /// from being dropped and cleaned up before the test completes. + _tmp_dir: tempfile::TempDir, +} + +/// Copies test data to temp dir and loads snapshot at v2 with in-memory commits client. +async fn setup() -> Result { + let src = PathBuf::from("./tests/data/catalog_managed_0/"); + let tmp_dir = tempfile::tempdir()?; + copy_dir_recursive(&src, tmp_dir.path())?; + + // v0 published, v1/v2 ratified but unpublished + let commits_client = Arc::new(InMemoryCommitsClient::new()); + commits_client.insert_table( + TABLE_ID, + TableData { + max_ratified_version: 2, + catalog_commits: vec![ + Commit::new( + 1, + 1749830871085, + "00000000000000000001.4cb9708e-b478-44de-b203-53f9ba9b2876.json", + 889, + 1749830870833, + ), + Commit::new( + 2, + 1749830881799, + "00000000000000000002.5b9bba4a-0085-430d-a65e-b0d38c1afbe9.json", + 891, + 1749830881779, + ), + ], + }, + ); + + let store = Arc::new(LocalFileSystem::new()); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = delta_kernel::engine::default::DefaultEngineBuilder::new(store) + .with_task_executor(executor) + .build(); + let table_uri = url::Url::from_directory_path(tmp_dir.path()).map_err(|_| "invalid path")?; + let snapshot = UCKernelClient::new(commits_client.as_ref()) + .load_snapshot_at(TABLE_ID, table_uri.as_str(), 2, &engine) + .await?; + + Ok(TestSetup { + commits_client, + engine, + snapshot, + table_uri, + _tmp_dir: tmp_dir, + }) +} + +/// Recursively copies a directory tree. 
+fn copy_dir_recursive(src: &std::path::Path, dst: &std::path::Path) -> std::io::Result<()> { + std::fs::create_dir_all(dst)?; + for entry in std::fs::read_dir(src)? { + let entry = entry?; + let dst_path = dst.join(entry.file_name()); + if entry.path().is_dir() { + copy_dir_recursive(&entry.path(), &dst_path)?; + } else { + std::fs::copy(entry.path(), &dst_path)?; + } + } + Ok(()) +} + +/// Commits an empty transaction and returns the post-commit snapshot. +fn commit( + snapshot: &Arc, + commits_client: &Arc, + engine: &DefaultEngine, +) -> Result, TestError> { + let committer = Box::new(UCCommitter::new(commits_client.clone(), TABLE_ID)); + match snapshot + .clone() + .transaction(committer, engine)? + .commit(engine)? + { + CommitResult::CommittedTransaction(t) => Ok(t + .post_commit_snapshot() + .ok_or("no post commit snapshot")? + .clone()), + _ => Err("Expected committed transaction".into()), + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +// multi_thread required: UCCommitter uses block_on which panics on single-threaded runtime +#[tokio::test(flavor = "multi_thread")] +async fn test_insert_and_publish() -> Result<(), TestError> { + let TestSetup { + commits_client, + engine, + mut snapshot, + table_uri: _, + _tmp_dir, + } = setup().await?; + assert_eq!(snapshot.version(), 2); + + let beyond_max = TableData::MAX_UNPUBLISHED_COMMITS as u64 + 5; + + for _ in 3..=beyond_max { + snapshot = commit(&snapshot, &commits_client, &engine)?; + + let committer = UCCommitter::new(commits_client.clone(), TABLE_ID); + + snapshot = snapshot.publish(&engine, &committer)?; + } + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_insert_without_publish_hits_limit() -> Result<(), TestError> { + let TestSetup { + commits_client, + engine, + mut snapshot, + table_uri: _, + _tmp_dir, + } = setup().await?; + + // Start with 2 unpublished (v1, v2). Insert up to MAX, then the next should fail. + let max = TableData::MAX_UNPUBLISHED_COMMITS as u64; + for _ in 3..=max { + snapshot = commit(&snapshot, &commits_client, &engine)?; + } + assert_eq!(snapshot.version(), max); + + // Next insert should fail with MaxUnpublishedCommitsExceeded + let committer = Box::new(UCCommitter::new(commits_client.clone(), TABLE_ID)); + let err = snapshot + .clone() + .transaction(committer, &engine)? + .commit(&engine) + .unwrap_err(); + assert!( + matches!(err, delta_kernel::Error::Generic(msg) if msg.contains("Max unpublished commits")) + ); + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_checkpoint_after_publish() -> Result<(), TestError> { + let TestSetup { + commits_client, + engine, + snapshot, + table_uri, + _tmp_dir, + } = setup().await?; + + let committer = UCCommitter::new(commits_client.clone(), TABLE_ID); + + commit(&snapshot, &commits_client, &engine)? + .publish(&engine, &committer)? 
+ .checkpoint(&engine)?; + + // Load a fresh snapshot and verify checkpoint was written + let snapshot = Snapshot::builder_for(table_uri).build(&engine)?; + assert_eq!(snapshot.log_segment().checkpoint_version, Some(3)); + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_cannot_checkpoint_unpublished_snapshot() -> Result<(), TestError> { + let TestSetup { + commits_client, + engine, + snapshot, + table_uri: _, + _tmp_dir, + } = setup().await?; + + let snapshot = commit(&snapshot, &commits_client, &engine)?; + + let err = snapshot.checkpoint(&engine).unwrap_err(); + assert!(matches!(err, delta_kernel::Error::Generic(msg) if msg.contains("not published"))); + Ok(()) +} diff --git a/derive-macros/src/lib.rs b/derive-macros/src/lib.rs index 7d392aa490..50f8836e4c 100644 --- a/derive-macros/src/lib.rs +++ b/derive-macros/src/lib.rs @@ -82,6 +82,26 @@ fn get_schema_name(name: &Ident) -> Ident { Ident::new(&ret, name.span()) } +/// Check if a path segment is `Option>`. +fn is_option_of_hashmap(seg: &syn::PathSegment) -> bool { + if seg.ident != "Option" { + return false; + } + let PathArguments::AngleBracketed(angle_args) = &seg.arguments else { + return false; + }; + // Option has exactly one type argument + let Some(syn::GenericArgument::Type(Type::Path(inner_type))) = angle_args.args.first() else { + return false; + }; + // Check if the inner type's last segment is HashMap + inner_type + .path + .segments + .last() + .is_some_and(|seg| seg.ident == "HashMap") +} + fn gen_schema_fields(data: &Data) -> TokenStream { let fields = match data { Data::Struct(DataStruct { @@ -119,11 +139,13 @@ fn gen_schema_fields(data: &Data) -> TokenStream { } }); if have_schema_null { - if let Some(last_ident) = type_path.path.segments.last().map(|seg| &seg.ident) { - if last_ident != "HashMap" { - return Error::new( - last_ident.span(), - format!("Can only use allow_null_container_values on HashMap fields, not {last_ident}") + if let Some(last_seg) = type_path.path.segments.last() { + let is_valid = + last_seg.ident == "HashMap" || is_option_of_hashmap(last_seg); + if !is_valid { + return Error::new( + last_seg.ident.span(), + format!("Can only use allow_null_container_values on HashMap or Option fields, not {}", last_seg.ident) ).to_compile_error() } } @@ -138,7 +160,7 @@ fn gen_schema_fields(data: &Data) -> TokenStream { quote! { #(#schema_fields),* } } -/// Derive an IntoEngineData trait for a struct that has all fields implement `Into`. +/// Derive an IntoEngineData trait for a struct that has all fields implement `TryInto`. /// /// This is a relatively simple macro to produce the boilerplate for converting a struct into /// EngineData using the `create_one` method. TODO: (doc)tests included in the delta_kernel crate: @@ -163,23 +185,24 @@ pub fn into_engine_data_derive(input: proc_macro::TokenStream) -> proc_macro::To let fields = &fields.named; let field_idents = fields.iter().map(|f| &f.ident); - let field_types = fields.iter().map(|f| &f.ty); + let field_types: Vec<_> = fields.iter().map(|f| &f.ty).collect(); let expanded = quote! 
{ #[automatically_derived] - impl crate::IntoEngineData for #struct_name + impl delta_kernel::IntoEngineData for #struct_name where - #(#field_types: Into),* + #(#field_types: TryInto,)* + #(delta_kernel::Error: From<<#field_types as TryInto>::Error>,)* { fn into_engine_data( self, - schema: crate::schema::SchemaRef, - engine: &dyn crate::Engine) - -> crate::DeltaResult> { + schema: delta_kernel::schema::SchemaRef, + engine: &dyn delta_kernel::Engine) + -> delta_kernel::DeltaResult> { // NB: we `use` here to avoid polluting the caller's namespace - use crate::EvaluationHandlerExtension as _; + use delta_kernel::EvaluationHandlerExtension as _; let values = [ - #(self.#field_idents.into()),* + #(self.#field_idents.try_into()?),* ]; let evaluator = engine.evaluation_handler(); evaluator.create_one(schema, &values) @@ -191,9 +214,10 @@ pub fn into_engine_data_derive(input: proc_macro::TokenStream) -> proc_macro::To } /// Mark items as `internal_api` to make them public iff the `internal-api` feature is enabled. -/// Note this doesn't work for inline module definitions (see `internal_mod!` macro in delta_kernel -/// crate - can't export macro_rules! from proc macro crate). -/// Ref: +/// +/// NOTE: This macro does not support `mod` declarations because of nuances in how the mod expander +/// and proc macro system interact for non-inline modules such as `mod foo;`. Use explicit +/// cfg-gated `pub mod` / `pub(crate) mod` for module visibility control instead. #[proc_macro_attribute] pub fn internal_api( _attr: proc_macro::TokenStream, @@ -217,27 +241,39 @@ pub fn internal_api( } fn make_public(mut item: Item) -> Item { - fn set_pub(vis: &mut Visibility) -> Result<(), syn::Error> { + /// Transforms the passed visibility to be `pub`. We pass the original span that the visibility + /// came from, and attach it to the newly created pub token. This means that the compiler treats + /// it as user-written code and normal lints apply. We want this because it allows us to catch + /// "private_in_public" violations that are tricky to notice when just slapping + /// `#[internal_api]` on something. + fn set_pub(vis: &mut Visibility, span: Span) -> Result<(), syn::Error> { if matches!(vis, Visibility::Public(_)) { return Err(Error::new( vis.span(), "ineligible for #[internal_api]: item is already public", )); } - *vis = syn::parse_quote!(pub); + *vis = Visibility::Public(syn::token::Pub { span }); Ok(()) } + macro_rules! 
set_vis { + ($item:ident) => {{ + let vis_span = $item.vis.span(); + set_pub(&mut $item.vis, vis_span) + }}; + } + let result = match &mut item { - Item::Fn(f) => set_pub(&mut f.vis), - Item::Struct(s) => set_pub(&mut s.vis), - Item::Enum(e) => set_pub(&mut e.vis), - Item::Trait(t) => set_pub(&mut t.vis), - Item::Type(t) => set_pub(&mut t.vis), - Item::Mod(m) => set_pub(&mut m.vis), - Item::Static(s) => set_pub(&mut s.vis), - Item::Const(c) => set_pub(&mut c.vis), - Item::Union(u) => set_pub(&mut u.vis), + Item::Fn(f) => set_vis!(f), + Item::Struct(s) => set_vis!(s), + Item::Enum(e) => set_vis!(e), + Item::Trait(t) => set_vis!(t), + Item::Type(t) => set_vis!(t), + Item::Use(u) => set_vis!(u), + Item::Static(s) => set_vis!(s), + Item::Const(c) => set_vis!(c), + Item::Union(u) => set_vis!(u), // foreign mod, impl block, and all others not handled _ => Err(Error::new( item.span(), diff --git a/feature-tests/Cargo.toml b/feature-tests/Cargo.toml index 58984cba47..06c2d2f321 100644 --- a/feature-tests/Cargo.toml +++ b/feature-tests/Cargo.toml @@ -13,8 +13,15 @@ version.workspace = true release = false [dependencies] -delta_kernel = { path = "../kernel", features = ["arrow"] } +delta_kernel = { path = "../kernel" } +# Direct deps for TLS crypto provider regression tests (see lib.rs). +reqwest = { version = "0.13", default-features = false, optional = true } +rustls = { version = "0.23", default-features = false, optional = true } [features] -default-engine-native-tls= [ "delta_kernel/default-engine-native-tls" ] -default-engine-rustls = [ "delta_kernel/default-engine-rustls" ] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] +default-engine-native-tls = ["delta_kernel/default-engine-native-tls", "dep:rustls", "reqwest/native-tls"] +default-engine-rustls = ["delta_kernel/default-engine-rustls", "dep:rustls", "reqwest/rustls"] diff --git a/feature-tests/src/lib.rs b/feature-tests/src/lib.rs index 46355e5410..f43fbd59e4 100644 --- a/feature-tests/src/lib.rs +++ b/feature-tests/src/lib.rs @@ -3,7 +3,7 @@ /// /// Run (from workspace root) with: /// 1. `cargo b -p feature_tests --features default-engine-rustls` -/// 2. `cargo b -p feature_tests --features default-engine` +/// 2. `cargo b -p feature_tests --features default-engine-native-tls` /// /// These run in our build CI. pub fn test_default_engine_feature_flags() { @@ -16,3 +16,51 @@ pub fn test_default_engine_feature_flags() { use delta_kernel::engine::default::DefaultEngine; } } + +/// Regression tests for rustls crypto provider conflicts. +/// +/// rustls 0.23 panics at runtime if both `aws-lc-rs` and `ring` features are active and no +/// provider is explicitly installed. object_store always brings `ring` transitively, so kernel +/// must avoid adding `aws-lc-rs` to the same rustls instance. +/// +/// Two APIs are tested because they behave differently: +/// - `rustls::ClientConfig::builder()` relies on auto-detection and panics on dual providers. +/// - `reqwest::Client::new()` explicitly constructs its provider, so it always succeeds. +#[cfg(test)] +mod tests { + // Verifies that `default-engine-native-tls` does not leak aws-lc-rs into the rustls + // feature set. If this panics, kernel's reqwest is pulling in `default-tls` again. + // + // Excluded when `default-engine-rustls` is also active (e.g. --all-features) because + // that feature inherently adds aws-lc-rs, making the dual-provider panic unavoidable. 
+ #[test] + #[cfg(all( + feature = "default-engine-native-tls", + not(feature = "default-engine-rustls") + ))] + fn test_native_tls_rustls_builder_no_dual_provider_panic() { + let _config = rustls::ClientConfig::builder(); + } + + #[test] + #[cfg(feature = "default-engine-native-tls")] + fn test_native_tls_reqwest_client_no_panic() { + let _client = reqwest::Client::new(); + } + + // `default-engine-rustls` has an inherent dual-provider conflict: reqwest's `rustls` + // feature brings aws-lc-rs while object_store brings ring. This is an upstream limitation. + // If this test stops panicking, the upstream issue is fixed and the annotation can go. + #[test] + #[cfg(feature = "default-engine-rustls")] + #[should_panic(expected = "Could not automatically determine the process-level CryptoProvider")] + fn test_rustls_rustls_builder_has_dual_provider_panic() { + let _config = rustls::ClientConfig::builder(); + } + + #[test] + #[cfg(feature = "default-engine-rustls")] + fn test_rustls_reqwest_client_no_panic() { + let _client = reqwest::Client::new(); + } +} diff --git a/ffi/CLAUDE.md b/ffi/CLAUDE.md new file mode 100644 index 0000000000..d40a0c8e02 --- /dev/null +++ b/ffi/CLAUDE.md @@ -0,0 +1,68 @@ +# FFI Layer + +The `delta_kernel_ffi` crate exposes the kernel to C/C++ via a stable FFI boundary using +cbindgen-generated headers (`.h` and `.hpp`). + +## Handle System + +Objects crossing the FFI boundary may be wrapped in **handles** -- opaque pointers with +ownership semantics: +- **Mutable handles** (`Box`-like) -- exclusive ownership, neither `Copy` nor `Clone` +- **Shared handles** (`Arc`-like) -- shared ownership via reference counting + +A handle is needed when a value might outlive the function call that passes it across the +FFI boundary, or when the type is not representable in C/C++ (dyn trait references, slices, +options, etc.). Short-lived "plain old data" types like `ExternResult`, `KernelError`, +`KernelStringSlice`, and `EngineIterator` do not need handles. + +Every handle has a corresponding `free_*` function (e.g. `free_engine`, `free_snapshot`). + +## Error Handling + +Fallible functions return `ExternResult` (tagged union of Ok/Err). The caller provides an +`allocate_error` callback when creating the engine; kernel calls this to allocate errors in +the caller's memory space. + +## Key Files + +- `src/lib.rs` -- main FFI entry points and type definitions +- `src/handle.rs` -- opaque handle system for passing Rust objects across FFI +- `src/scan.rs` -- scan FFI interface +- `src/schema_visitor.rs` -- visitor pattern for schema traversal + +## Read Flow + +``` +get_default_engine() -> get_snapshot_builder() -> snapshot_builder_build() -> scan() -> scan_metadata() -> read + transform +``` + +Snapshot builder API (`ffi/src/lib.rs`): +- `get_snapshot_builder(path, engine)` -- fresh snapshot from a table path +- `get_snapshot_builder_from(old_snapshot, engine)` -- incremental update reusing an existing snapshot (avoids re-reading the log) +- `snapshot_builder_set_version(builder, version)` -- optional: pin to a specific version +- `snapshot_builder_set_log_tail(builder, log_tail)` -- optional: set log tail (for catalog-managed tables) +- `snapshot_builder_build(builder)` -- consume the builder and produce a `SharedSnapshot` +- `free_snapshot_builder(builder)` -- discard without building (e.g. on error paths) + +The caller owns the returned builder handle and must call either `snapshot_builder_build` or `free_snapshot_builder`. 
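+
+A minimal sketch of this read-path setup in C, mirroring `ffi/examples/read-table/read_table.c`. It assumes an already-built `SharedExternEngine`, uses the `print_error`/`free_error` helpers from `ffi/examples/common/kernel_utils.h`, and `print_table_version` is just an illustrative helper name; exact struct and tag names are whatever cbindgen generates:
+
+```c
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include "delta_kernel_ffi.h"
+#include "kernel_utils.h" // print_error / free_error helpers from the examples
+
+// Build a snapshot for `table_path` using an already-constructed engine and print its version.
+// Returns 0 on success, -1 on any kernel error.
+static int print_table_version(const char* table_path, SharedExternEngine* engine)
+{
+  KernelStringSlice path = { table_path, strlen(table_path) };
+
+  // 1. Get a builder. It must be consumed by snapshot_builder_build (or discarded with
+  //    free_snapshot_builder on error paths).
+  ExternResultHandleMutableFfiSnapshotBuilder builder_res = get_snapshot_builder(path, engine);
+  if (builder_res.tag != OkHandleMutableFfiSnapshotBuilder) {
+    print_error("Failed to get snapshot builder.", (Error*)builder_res.err);
+    free_error((Error*)builder_res.err);
+    return -1;
+  }
+
+  // 2. Consume the builder to produce a SharedSnapshot handle.
+  ExternResultHandleSharedSnapshot snapshot_res = snapshot_builder_build(builder_res.ok);
+  if (snapshot_res.tag != OkHandleSharedSnapshot) {
+    print_error("Failed to build snapshot.", (Error*)snapshot_res.err);
+    free_error((Error*)snapshot_res.err);
+    return -1;
+  }
+  SharedSnapshot* snapshot = snapshot_res.ok;
+
+  // 3. Use the snapshot (here: just read its version), then release the shared handle.
+  printf("version: %" PRIu64 "\n", version(snapshot));
+  free_snapshot(snapshot);
+  return 0;
+}
+```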
+ +## Write Flow + +``` +get_default_engine() -> transaction() -> with_engine_info() -> add_files() -> commit() +``` + +## Building + +```bash +cargo build -p delta_kernel_ffi --release +# Headers written to target/ffi-headers/ +``` + +Feature flags: +- `default-engine-rustls` (default) +- `default-engine-native-tls` +- `arrow` (default; currently maps to `arrow-57`) +- `arrow-57`, `arrow-56` +- `delta-kernel-unity-catalog` +- `tracing` diff --git a/ffi/Cargo.toml b/ffi/Cargo.toml index 84d379f75c..ee56cef77f 100644 --- a/ffi/Cargo.toml +++ b/ffi/Cargo.toml @@ -25,32 +25,39 @@ url = "2" delta_kernel = { path = "../kernel", default-features = false, features = [ "internal-api", ] } -delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.16.0" } +delta_kernel_ffi_macros = { path = "../ffi-proc-macros", version = "0.20.0" } +delta-kernel-unity-catalog = { path = "../delta-kernel-unity-catalog", optional = true } +unity-catalog-delta-client-api = { path = "../unity-catalog-delta-client-api", optional = true } [build-dependencies] cbindgen = "0.29.2" libc = "0.2.175" [dev-dependencies] +paste = "1.0" rand = "0.9.2" serde = "1.0.219" serde_json = "1.0.142" -test_utils = { path = "../test-utils" } +test_utils = { path = "../test-utils", default-features = false } tokio = { version = "1.47" } trybuild = "1.0" tempfile = "3.20.0" itertools = "0.14.0" -object_store = "0.12.3" +rstest = "0.23" [features] -default = ["default-engine-rustls"] +default = ["default-engine-rustls", "arrow"] default-engine-native-tls = ["delta_kernel/default-engine-native-tls", "default-engine-base"] default-engine-rustls = ["delta_kernel/default-engine-rustls", "default-engine-base"] +delta-kernel-unity-catalog = [ "dep:delta-kernel-unity-catalog", "dep:unity-catalog-delta-client-api" ] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] # This is an 'internal' feature flag which has all the shared bits from default-engine-native-tls and # default-engine-rustls. 
There is a check in kernel/lib.rs to ensure you have enabled one of # default-engine-native-tls or default-engine-rustls, so default-engine-base will not work by itself -default-engine-base = ["delta_kernel/default-engine-base", "delta_kernel/arrow"] +default-engine-base = ["delta_kernel/default-engine-base"] tracing = [ "tracing-core", "tracing-subscriber" ] internal-api = [] diff --git a/ffi/examples/read-table/kernel_utils.c b/ffi/examples/common/kernel_utils.c similarity index 81% rename from ffi/examples/read-table/kernel_utils.c rename to ffi/examples/common/kernel_utils.c index 64262414a8..002c3bfc1e 100644 --- a/ffi/examples/read-table/kernel_utils.c +++ b/ffi/examples/common/kernel_utils.c @@ -61,10 +61,17 @@ void* allocate_string(const KernelStringSlice slice) } // utility function to convert key/val into slices and set them on a builder -void set_builder_opt(EngineBuilder* engine_builder, char* key, char* val) +// returns false on failure +bool set_builder_opt(EngineBuilder* engine_builder, char* key, char* val) { KernelStringSlice key_slice = { key, strlen(key) }; KernelStringSlice val_slice = { val, strlen(val) }; - set_builder_option(engine_builder, key_slice, val_slice); + ExternResultbool res = set_builder_option(engine_builder, key_slice, val_slice); + if (res.tag != Okbool) { + print_error("Failed to set builder option.", (Error*)res.err); + free_error((Error*)res.err); + return false; + } + return true; } diff --git a/ffi/examples/read-table/kernel_utils.h b/ffi/examples/common/kernel_utils.h similarity index 88% rename from ffi/examples/read-table/kernel_utils.h rename to ffi/examples/common/kernel_utils.h index c6e60b960a..eca6eccd97 100644 --- a/ffi/examples/read-table/kernel_utils.h +++ b/ffi/examples/common/kernel_utils.h @@ -1,5 +1,6 @@ #pragma once +#include #include // This is how we represent our errors. 
The kernel will ask us to construct this struct whenever it
@@ -21,4 +22,5 @@ void* allocate_string(const KernelStringSlice slice);
 // error
 EngineError* allocate_error(KernelError etype, const KernelStringSlice msg);
 // utility function to convert key/val into slices and set them on a builder
-void set_builder_opt(EngineBuilder* engine_builder, char* key, char* val);
+// returns false on failure
+bool set_builder_opt(EngineBuilder* engine_builder, char* key, char* val);
diff --git a/ffi/examples/delta-kernel-unity-catalog-example/CMakeLists.txt b/ffi/examples/delta-kernel-unity-catalog-example/CMakeLists.txt
new file mode 100644
index 0000000000..d371209bb5
--- /dev/null
+++ b/ffi/examples/delta-kernel-unity-catalog-example/CMakeLists.txt
@@ -0,0 +1,27 @@
+cmake_minimum_required(VERSION 3.12)
+project(delta_kernel_unity_catalog_example)
+
+add_executable(delta_kernel_unity_catalog_example delta_kernel_unity_catalog_example.c ../common/kernel_utils.c)
+target_compile_definitions(delta_kernel_unity_catalog_example PUBLIC DEFINE_DEFAULT_ENGINE_BASE)
+target_include_directories(delta_kernel_unity_catalog_example PUBLIC
+  "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/ffi-headers"
+  "${CMAKE_CURRENT_SOURCE_DIR}/../common")
+target_link_directories(delta_kernel_unity_catalog_example PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/debug")
+target_link_libraries(delta_kernel_unity_catalog_example PUBLIC delta_kernel_ffi)
+
+if(WIN32)
+  set(CMAKE_C_FLAGS_DEBUG "/MT")
+  target_link_libraries(delta_kernel_unity_catalog_example PUBLIC ws2_32 userenv bcrypt ncrypt crypt32 secur32 ntdll RuntimeObject)
+endif(WIN32)
+
+if(MSVC)
+  target_compile_options(delta_kernel_unity_catalog_example PRIVATE /W3 /WX)
+else()
+  target_compile_options(delta_kernel_unity_catalog_example PRIVATE -Wall -Wextra -Wpedantic -Werror -g -fsanitize=address)
+  target_link_options(delta_kernel_unity_catalog_example PRIVATE -g -fsanitize=address)
+endif()
+
+# Testing
+include(CTest)
+set(ExprTestRunner "../../../tests/test-delta-kernel-unity-catalog-ffi/run_test.sh")
+add_test(NAME test_delta_kernel_unity_catalog_ffi COMMAND ${ExprTestRunner})
diff --git a/ffi/examples/delta-kernel-unity-catalog-example/README.md b/ffi/examples/delta-kernel-unity-catalog-example/README.md
new file mode 100644
index 0000000000..f01c441c31
--- /dev/null
+++ b/ffi/examples/delta-kernel-unity-catalog-example/README.md
@@ -0,0 +1,35 @@
+delta-kernel-unity-catalog example
+===================================
+
+Simple example to show how to use the delta-kernel-unity-catalog ffi features
+
+# Building
+
+This example is built with [cmake]. Instructions below assume you start in the directory containing this README.
+
+Note that prior to building these examples you must build `delta_kernel_ffi` with all features enabled (see [the FFI readme] for details). TLDR:
+```bash
+# from repo root
+$ cargo build -p delta_kernel_ffi [--release] --all-features
+# from ffi/ dir
+$ cargo build [--release] --all-features
+```
+
+There are two options that can currently be configured in cmake:
+```bash
+# turn on VERBOSE mode (default is off) - print more diagnostics
+$ cmake -DVERBOSE=yes ..
+# turn off PRINT_DATA (default is on) - see below
+$ cmake -DPRINT_DATA=no ..
+```
+
+## Linux / MacOS
+
+Most likely something like this should work:
+```
+$ mkdir build
+$ cd build
+$ cmake ..
+$ make +$ ./delta_kernel_unity_catalog_example [path/to/table] +``` diff --git a/ffi/examples/delta-kernel-unity-catalog-example/delta_kernel_unity_catalog_example.c b/ffi/examples/delta-kernel-unity-catalog-example/delta_kernel_unity_catalog_example.c new file mode 100644 index 0000000000..1b4c66c09b --- /dev/null +++ b/ffi/examples/delta-kernel-unity-catalog-example/delta_kernel_unity_catalog_example.c @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernel_utils.h" + +// Context struct to hold any state needed by our client +// This can hold connection info, auth tokens, etc. +typedef struct UCContext { + int call_count; + const char* base_url; +} UCContext; + + +// Check that a staging file matches what the commit info says, then remove it +void validate_and_clean_staging_file(char* table_uri, char* file_name, Commit *commit_info) { + char* uri = table_uri; + + // strip 'file://' if it's present + if(strncmp(table_uri, "file://", 7) == 0) { + uri = uri + 7; + } + + int path_len = strlen(uri) + strlen(file_name) + 28; + char path[path_len]; + snprintf(path, path_len, "%s_delta_log/_staged_commits/%s", uri, file_name); + printf("Checking that staging file at %s is valid\n", path); + struct stat buf; + if (stat(path, &buf)) { + // stat returned an error + perror("Could not stat the staging file!"); + exit(-1); + } else { + if (buf.st_size != commit_info->file_size) { + printf("staged has size: %9jd, but commit_info says something else\n", (intmax_t)buf.st_size); + exit(-1); + } +#if defined(__APPLE__) + time_t mt = buf.st_mtimespec.tv_sec; +#else + time_t mt = buf.st_mtim.tv_sec; +#endif + time_t expected_mt = commit_info->file_modification_timestamp / 1000; + if (mt != expected_mt) { + printf("staged has modification time: %ld, but commit_info has %ld\n", mt, expected_mt); + exit(-1); + } + printf("Staged file looks good\n"); + if (unlink(path)) { + perror("Couldn't removed staged file"); + } else { + printf("Removed staged file\n\n"); + } + } +} + +// our implementation of `commit` +OptionalValueHandleExclusiveRustString commit_callback( + NullableCvoid context_ptr, + CommitRequest request) +{ + UCContext* context = NULL; + if (context_ptr != NULL) { + context = (UCContext*)context_ptr; + context->call_count++; + printf("commit called (call #%d)\n", context->call_count); + printf("committing to catalog at: %s\n", context->base_url); + } else { + printf("commit called\n"); + } + + // Extract request information + char table_id[256]; + char table_uri[1024]; + snprintf(table_id, sizeof(table_id), "%.*s", (int)request.table_id.len, request.table_id.ptr); + snprintf(table_uri, sizeof(table_uri), "%.*s", (int)request.table_uri.len, request.table_uri.ptr); + + printf("Committing to table ID: %s\n", table_id); + printf("Table URI: %s\n", table_uri); + + if (request.commit_info.tag == SomeCommit) { + Commit commit_info = request.commit_info.some; + char* file_name = allocate_string(commit_info.file_name); + + printf("Commit info:\n"); + printf(" Version: %" PRId64 "\n", commit_info.version); + printf(" Timestamp: %" PRId64 "\n", commit_info.timestamp); + printf(" File name: %s\n", file_name); + printf(" File size: %" PRId64 "\n", commit_info.file_size); + printf(" File mod time: %" PRId64 "\n\n", commit_info.file_modification_timestamp); + + validate_and_clean_staging_file(table_uri, file_name, &commit_info); + free(file_name); + } + + if (request.latest_backfilled_version.tag == Somei64) { + printf("Latest backfilled version: %" 
PRId64 "\n", + request.latest_backfilled_version.some); + } + + // Return None to indicate success + OptionalValueHandleExclusiveRustString result; + result.tag = NoneHandleExclusiveRustString; + return result; +} + +int main(int argc, char* argv[]) +{ + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return -1; + } + + char* table_path = argv[1]; + + // Initialize our UC context + UCContext uc_context = { + .call_count = 0, + .base_url = "https://uc-catalog.example.com/api/v1" + }; + + // Create a UC commit client + NullableCvoid context = (void*)&uc_context; + HandleSharedFfiUCCommitClient uc_client = get_uc_commit_client(context, commit_callback); + + // Create a UC committer for a specific table + const char* table_id = "64dcd182-b3b4-4ee0-88e0-63c159a4121c"; + KernelStringSlice table_id_slice = { .ptr = table_id, .len = strlen(table_id) }; + + ExternResultHandleMutableCommitter committer_res = + get_uc_committer(uc_client, table_id_slice, allocate_error); + + if (committer_res.tag != OkHandleMutableCommitter) { + print_error("Failed to create UC committer", (Error*)committer_res.err); + free_error((Error*)committer_res.err); + free_uc_commit_client(uc_client); + return -1; + } + + HandleMutableCommitter uc_committer = committer_res.ok; + + // Get the default engine + KernelStringSlice table_path_slice = { .ptr = table_path, .len = strlen(table_path) }; + ExternResultEngineBuilder engine_builder_res = + get_engine_builder(table_path_slice, allocate_error); + + if (engine_builder_res.tag != OkEngineBuilder) { + print_error("Could not get engine builder", (Error*)engine_builder_res.err); + free_error((Error*)engine_builder_res.err); + free_uc_commit_client(uc_client); + return -1; + } + + EngineBuilder* engine_builder = engine_builder_res.ok; + ExternResultHandleSharedExternEngine engine_res = builder_build(engine_builder); + + if (engine_res.tag != OkHandleSharedExternEngine) { + print_error("Failed to build engine", (Error*)engine_res.err); + free_error((Error*)engine_res.err); + free_uc_commit_client(uc_client); + return -1; + } + + SharedExternEngine* engine = engine_res.ok; + + ExternResultHandleMutableFfiSnapshotBuilder snapshot_builder_res = get_snapshot_builder(table_path_slice, engine); + if (snapshot_builder_res.tag != OkHandleMutableFfiSnapshotBuilder) { + print_error("Failed to get snapshot builder.", (Error*)snapshot_builder_res.err); + free_error((Error*)snapshot_builder_res.err); + return -1; + } + ExternResultHandleSharedSnapshot snapshot_res = snapshot_builder_build(snapshot_builder_res.ok); + if (snapshot_res.tag != OkHandleSharedSnapshot) { + print_error("Failed to create snapshot.", (Error*)snapshot_res.err); + free_error((Error*)snapshot_res.err); + return -1; + } + + SharedSnapshot* snapshot = snapshot_res.ok; + + // Create a transaction with the UC committer + ExternResultHandleExclusiveTransaction txn_res = + transaction_with_committer(snapshot, engine, uc_committer); + + if (txn_res.tag != OkHandleExclusiveTransaction) { + print_error("Failed to create transaction with UC committer", (Error*)txn_res.err); + free_error((Error*)txn_res.err); + free_engine(engine); + free_uc_commit_client(uc_client); + return -1; + } + + HandleExclusiveTransaction txn = txn_res.ok; + + // In a real txn we could now add files using add_files() + + // Add engine info to the transaction + const char* engine_info = "uc_example_engine"; + KernelStringSlice engine_info_slice = { .ptr = engine_info, .len = strlen(engine_info) }; + + ExternResultHandleExclusiveTransaction 
txn_with_info_res = + with_engine_info(txn, engine_info_slice, engine); + + if (txn_with_info_res.tag != OkHandleExclusiveTransaction) { + print_error("Failed to set engine info", (Error*)txn_with_info_res.err); + free_error((Error*)txn_with_info_res.err); + free_engine(engine); + free_uc_commit_client(uc_client); + return -1; + } + + HandleExclusiveTransaction txn_with_info = txn_with_info_res.ok; + // calling commit here will end up calling our callback + ExternResultu64 commit_res = commit(txn_with_info, engine); + + if (commit_res.tag != Oku64) { + print_error("Commit failed", (Error*)commit_res.err); + free_error((Error*)commit_res.err); + free_engine(engine); + free_uc_commit_client(uc_client); + return -1; + } + + printf("\nCommitted version: %lu\n", (unsigned long)commit_res.ok); + + // Cleanup + // Note: txn_with_info was consumed by commit(), so we don't free it + free_engine(engine); + free_uc_commit_client(uc_client); + free_snapshot(snapshot); + + printf("Total UC API calls: %d\n", uc_context.call_count); + + return 0; +} diff --git a/ffi/examples/read-table/CMakeLists.txt b/ffi/examples/read-table/CMakeLists.txt index c8db73a383..050ed4b5be 100644 --- a/ffi/examples/read-table/CMakeLists.txt +++ b/ffi/examples/read-table/CMakeLists.txt @@ -2,9 +2,11 @@ cmake_minimum_required(VERSION 3.12) project(read_table) option(PRINT_DATA "Print out the table data. Requires arrow-glib" ON) option(VERBOSE "Enable for more diagnostics messages." OFF) -add_executable(read_table read_table.c arrow.c kernel_utils.c) +add_executable(read_table read_table.c arrow.c ../common/kernel_utils.c) target_compile_definitions(read_table PUBLIC DEFINE_DEFAULT_ENGINE_BASE) -target_include_directories(read_table PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/ffi-headers") +target_include_directories(read_table PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/ffi-headers" + "${CMAKE_CURRENT_SOURCE_DIR}/../common") target_link_directories(read_table PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/debug") target_link_libraries(read_table PUBLIC delta_kernel_ffi) target_compile_options(read_table PUBLIC) @@ -16,6 +18,9 @@ set(DatPath "../../../../acceptance/tests/dat/out/reader_tests/generated") set(ExpectedPath "../../../tests/read-table-testing/expected-data") set(KernelTestPath "../../../../kernel/tests/data") add_test(NAME read_and_print_all_prim COMMAND ${TestRunner} ${DatPath}/all_primitive_types/delta/ ${ExpectedPath}/all-prim-types.expected) +add_test(NAME read_and_print_all_prim_cols COMMAND ${TestRunner} ${DatPath}/all_primitive_types/delta/ ${ExpectedPath}/all-prim-types-cols.expected -cdecimal,float64) +add_test(NAME read_and_print_nested COMMAND ${TestRunner} ${DatPath}/nested_types/delta/ ${ExpectedPath}/nested-types.expected) +add_test(NAME read_and_print_nested_cols COMMAND ${TestRunner} ${DatPath}/nested_types/delta/ ${ExpectedPath}/nested-types-cols.expected -cmap,struct,array) add_test(NAME read_and_print_basic_partitioned COMMAND ${TestRunner} ${DatPath}/basic_partitioned/delta/ ${ExpectedPath}/basic-partitioned.expected) add_test(NAME read_and_print_with_dv_small COMMAND ${TestRunner} ${KernelTestPath}/table-with-dv-small/ ${ExpectedPath}/table-with-dv-small.expected) diff --git a/ffi/examples/read-table/arrow.c b/ffi/examples/read-table/arrow.c index 4b0c34dbe7..08515a4ec5 100644 --- a/ffi/examples/read-table/arrow.c +++ b/ffi/examples/read-table/arrow.c @@ -117,11 +117,18 @@ static ExclusiveEngineData* apply_transform( return data; } print_diag(" Applying transform\n"); - 
SharedExpressionEvaluator* evaluator = new_expression_evaluator( + ExternResultHandleSharedExpressionEvaluator evaluator_res = new_expression_evaluator( context->engine, context->physical_schema, // input schema context->arrow_context->cur_transform, context->logical_schema); // output schema + if (evaluator_res.tag != OkHandleSharedExpressionEvaluator) { + print_error("Failed to create expression evaluator.", (Error*)evaluator_res.err); + free_error((Error*)evaluator_res.err); + free_engine_data(data); + return NULL; + } + SharedExpressionEvaluator* evaluator = evaluator_res.ok; ExternResultHandleExclusiveEngineData transformed_res = evaluate_expression( context->engine, &data, diff --git a/ffi/examples/read-table/kernel_schema_visitor.h b/ffi/examples/read-table/kernel_schema_visitor.h new file mode 100644 index 0000000000..4aea1918a4 --- /dev/null +++ b/ffi/examples/read-table/kernel_schema_visitor.h @@ -0,0 +1,174 @@ +#include + +/* + * This header defines a visitor that allows kernel to learn about our schema. A + * `KernelSchemaVisitor` in kernel parlance. + */ + +// This function looks at tahe type field in the schema to figure out which visitor to call. It's a +// bit gross as the schema code is string based, a real implementation would have a more robust way +// to represent a schema. +uintptr_t visit_schema_item(SchemaItem* item, KernelSchemaVisitorState *state, CSchema *cschema) { + print_diag("Visiting schema item %s (%s)\n", item->name, item->type); + KernelStringSlice name = { item->name, strlen(item->name) }; + ExternResultusize visit_res; + if (strcmp(item->type, "string") == 0) { + visit_res = visit_field_string(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "integer") == 0) { + visit_res = visit_field_integer(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "short") == 0) { + visit_res = visit_field_short(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "byte") == 0) { + visit_res = visit_field_byte(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "long") == 0) { + visit_res = visit_field_long(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "float") == 0) { + visit_res = visit_field_float(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "double") == 0) { + visit_res = visit_field_double(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "boolean") == 0) { + visit_res = visit_field_boolean(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "binary") == 0) { + visit_res = visit_field_binary(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "date") == 0) { + visit_res = visit_field_date(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "timestamp") == 0) { + visit_res = visit_field_timestamp(state, name, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "timestamp_ntz") == 0) { + visit_res = visit_field_timestamp_ntz(state, name, item->is_nullable, allocate_error); + } else if (strncmp(item->type, "decimal", 7) == 0) { + unsigned int precision; + int scale; + sscanf(item->type, "decimal(%u)(%d)", &precision, &scale); + visit_res = visit_field_decimal(state, name, precision, scale, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "array") == 0) { + SchemaItemList child_list = cschema->builder->lists[item->children]; + // an 
array should always have 1 child + if (child_list.len != 1) { + printf("[ERROR] Invalid array child list"); + return 0; + } + uintptr_t child_visit_id = visit_schema_item(&child_list.list[0], state, cschema); + if (child_visit_id == 0) { + // previous visit will have printed the issue + return 0; + } + visit_res = visit_field_array(state, name, child_visit_id, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "map") == 0) { + SchemaItemList child_list = cschema->builder->lists[item->children]; + // an map should always have 2 children + if (child_list.len != 2) { + printf("[ERROR] Invalid map child list"); + return 0; + } + uintptr_t key_visit_id = visit_schema_item(&child_list.list[0], state, cschema); + if (key_visit_id == 0) { + // previous visit will have printed the issue + return 0; + } + uintptr_t val_visit_id = visit_schema_item(&child_list.list[1], state, cschema); + if (val_visit_id == 0) { + // previous visit will have printed the issue + return 0; + } + visit_res = visit_field_map(state, name, key_visit_id, val_visit_id, item->is_nullable, allocate_error); + } else if (strcmp(item->type, "struct") == 0) { + SchemaItemList child_list = cschema->builder->lists[item->children]; + uintptr_t child_visit_ids[child_list.len]; + for (uint32_t i = 0; i < child_list.len; i++) { + // visit all the children + SchemaItem *item = &child_list.list[i]; + uintptr_t child_id = visit_schema_item(item, state, cschema); + if (child_id == 0) { + // previous visit will have printed the issue + return 0; + } + child_visit_ids[i] = child_id; + } + visit_res = visit_field_struct( + state, + name, + child_visit_ids, + child_list.len, + item->is_nullable, + allocate_error); + } else { + printf("[ERROR] Can't visit unknown type: %s\n", item->type); + return 0; + } + + if (visit_res.tag != Okusize) { + print_error("Could not visit field", (Error*)visit_res.err); + return 0; + } + return visit_res.ok; +} + +typedef struct { + CSchema* cschema; + char* requested_cols; +} RequestedSchemaSpec; + +// This is the function kernel will call asking it to visit the schema in requested_spec +uintptr_t visit_requested_spec(void* requested_spec, KernelSchemaVisitorState *state) { + RequestedSchemaSpec *spec = (RequestedSchemaSpec*)requested_spec; + print_diag("Asked to visit: %s\n", spec->requested_cols); + + // figure out how many columns we are requesting. 
will be number of commas + 1 + int col_count = 1; + char* s = spec->requested_cols; + while (*s) { + if (*s == ',') { + col_count++; + } + s++; + } + + uintptr_t cols[col_count]; + int col_index = 0; + + CSchema* cschema = spec->cschema; + SchemaItemList* top_level_list = &cschema->builder->lists[cschema->list_id]; + + char* col = strtok(spec->requested_cols, ","); + + while (col != NULL) { + print_diag("Visiting requested col: %s\n", col); + char found_col = 0; + for (uint32_t i = 0; i < top_level_list->len; i++) { + SchemaItem* item = &top_level_list->list[i]; + if (strcmp(item->name, col) == 0) { + found_col = 1; + uintptr_t col_id = visit_schema_item(item, state, cschema); + if (col_id == 0) { + // error will have been printed above + return 0; + } + cols[col_index++] = col_id; + } + if (found_col) { + break; + } + } + if (!found_col) { + printf("[ERROR] No such column in table: %s\n", col); + return 0; + } + col = strtok(NULL, ","); + } + + KernelStringSlice name = { "s", 1 }; // name doesn't matter + ExternResultusize visit_res = visit_field_struct( + state, + name, + cols, + col_index, + false, + allocate_error); + + if (visit_res.tag != Okusize) { + print_error("Could not visit top_level schema", (Error*)visit_res.err); + return 0; + } + return visit_res.ok; +} diff --git a/ffi/examples/read-table/read_table.c b/ffi/examples/read-table/read_table.c index 2fc992aef9..984d65da72 100644 --- a/ffi/examples/read-table/read_table.c +++ b/ffi/examples/read-table/read_table.c @@ -1,11 +1,14 @@ +#include #include #include #include -#include +#include +#include #include "arrow.h" #include "read_table.h" #include "schema.h" +#include "kernel_schema_visitor.h" #include "kernel_utils.h" // Print the content of a selection vector if `VERBOSE` is defined in read_table.h @@ -28,7 +31,13 @@ void print_partition_info(struct EngineContext* context, const CStringMap* parti for (uintptr_t i = 0; i < context->partition_cols->len; i++) { char* col = context->partition_cols->cols[i]; KernelStringSlice key = { col, strlen(col) }; - char* partition_val = get_from_string_map(partition_values, key, allocate_string); + ExternResultNullableCvoid res = get_from_string_map(partition_values, key, allocate_string, context->engine); + if (res.tag != OkNullableCvoid) { + print_error("Failed to get from string map.", (Error*)res.err); + free_error((Error*)res.err); + continue; + } + char* partition_val = res.ok; if (partition_val) { print_diag(" partition '%s' here: %s\n", col, partition_val); free(partition_val); @@ -48,12 +57,16 @@ void scan_row_callback( void* engine_context, KernelStringSlice path, int64_t size, + int64_t mod_time, const Stats* stats, const CDvInfo* cdv_info, const Expression* transform, const CStringMap* partition_values) { - (void)size; // not using this at the moment + (void)mod_time; // not using this at the moment +#ifndef PRINT_ARROW_DATA + (void)transform; // only used when PRINT_ARROW_DATA is defined +#endif struct EngineContext* context = engine_context; print_diag("Called back to read file: %.*s. 
(size: %" PRIu64 ", num records: ", (int)path.len, path.ptr, size); if (stats) { @@ -110,7 +123,11 @@ void do_visit_scan_metadata(void* engine_context, HandleSharedScanMetadata scan_ // Ask kernel to iterate each individual file and call us back with extracted metadata print_diag("Asking kernel to call us back for each scan row (file to read)\n"); - visit_scan_metadata(scan_metadata, engine_context, scan_row_callback); + ExternResultbool visit_res = visit_scan_metadata(scan_metadata, context->engine, engine_context, scan_row_callback); + if (visit_res.tag != Okbool) { + print_error("Failed to visit scan metadata.", (Error*)visit_res.err); + free_error((Error*)visit_res.err); + } free_bool_slice(selection_vector); free_scan_metadata(scan_metadata); } @@ -216,22 +233,47 @@ void log_line_callback(KernelStringSlice line) { int main(int argc, char* argv[]) { - if (argc < 2) { - printf("Usage: %s table/path\n", argv[0]); + char* requested_cols = NULL; + int c; + while ((c = getopt (argc, argv, "c:")) != -1) { + switch (c) { + case 'c': + requested_cols = optarg; + break; + case '?': + if (optopt == 'c') { + fprintf (stderr, "Option -%c requires an argument.\n", optopt); + } + else if (isprint(optopt)) { + fprintf (stderr, "Unknown option `-%c'.\n", optopt); + } + else { + fprintf (stderr, + "Unknown option character `\\x%x'.\n", + optopt); + } + return 1; + default: + abort (); + } + } + + if (optind != (argc - 1)) { + printf("Usage: %s [-c top_level_column1,top_level_column2] table/path\n", argv[0]); return -1; } + char* table_path = argv[optind]; + printf("Reading table at %s\n", table_path); + #ifdef VERBOSE enable_event_tracing(tracing_callback, TRACE); // we could also do something like this if we want less control over formatting // enable_formatted_log_line_tracing(log_line_callback, TRACE, FULL, true, true, false, false); #else - enable_event_tracing(tracing_callback, INFO); + enable_event_tracing(tracing_callback, WARN); #endif - char* table_path = argv[1]; - printf("Reading table at %s\n", table_path); - KernelStringSlice table_path_slice = { table_path, strlen(table_path) }; ExternResultEngineBuilder engine_builder_res = @@ -244,7 +286,9 @@ int main(int argc, char* argv[]) // an example of using a builder to set options when building an engine EngineBuilder* engine_builder = engine_builder_res.ok; - set_builder_opt(engine_builder, "aws_region", "us-west-2"); + if (!set_builder_opt(engine_builder, "aws_region", "us-west-2")) { + return -1; + } // potentially set credentials here // set_builder_opt(engine_builder, "aws_access_key_id" , "[redacted]"); // set_builder_opt(engine_builder, "aws_secret_access_key", "[redacted]"); @@ -262,7 +306,13 @@ int main(int argc, char* argv[]) SharedExternEngine* engine = engine_res.ok; - ExternResultHandleSharedSnapshot snapshot_res = snapshot(table_path_slice, engine); + ExternResultHandleMutableFfiSnapshotBuilder snapshot_builder_res = get_snapshot_builder(table_path_slice, engine); + if (snapshot_builder_res.tag != OkHandleMutableFfiSnapshotBuilder) { + print_error("Failed to get snapshot builder.", (Error*)snapshot_builder_res.err); + free_error((Error*)snapshot_builder_res.err); + return -1; + } + ExternResultHandleSharedSnapshot snapshot_res = snapshot_builder_build(snapshot_builder_res.ok); if (snapshot_res.tag != OkHandleSharedSnapshot) { print_error("Failed to create snapshot.", (Error*)snapshot_res.err); free_error((Error*)snapshot_res.err); @@ -273,7 +323,9 @@ int main(int argc, char* argv[]) uint64_t v = version(snapshot); printf("version: 
%" PRIu64 "\n\n", v); - print_schema(snapshot); + + CSchema *cschema = get_cschema(snapshot, engine); + print_cschema(cschema); char* table_root = snapshot_table_root(snapshot, allocate_string); print_diag("Table root: %s\n", table_root); @@ -282,9 +334,37 @@ int main(int argc, char* argv[]) print_diag("Starting table scan\n\n"); - ExternResultHandleSharedScan scan_res = scan(snapshot, engine, NULL); + EngineSchema* engine_schema = NULL; + RequestedSchemaSpec *spec = NULL; + if (requested_cols != NULL) { + print_diag("Selecting columns: [%s]\n", requested_cols); + engine_schema = malloc(sizeof(EngineSchema)); + spec = malloc(sizeof(RequestedSchemaSpec)); + spec->cschema = cschema; + spec->requested_cols = requested_cols; + engine_schema->schema = spec; + engine_schema->visitor = visit_requested_spec; + } + + ExternResultHandleSharedScan scan_res = scan(snapshot, engine, NULL, engine_schema); + + if (engine_schema != NULL) { + free(engine_schema); + } + + if (spec != NULL) { + free(spec); + } + + free_cschema(cschema); + if (scan_res.tag != OkHandleSharedScan) { - printf("Failed to create scan\n"); + print_error("Failed to create scan", (Error*)scan_res.err); + free_error((Error*)scan_res.err); + free_snapshot(snapshot); + free_engine(engine); + free(table_root); + free_partition_list(partition_cols); return -1; } diff --git a/ffi/examples/read-table/schema.h b/ffi/examples/read-table/schema.h index a70bd5f5a0..13f8a1252b 100644 --- a/ffi/examples/read-table/schema.h +++ b/ffi/examples/read-table/schema.h @@ -1,6 +1,7 @@ #include "delta_kernel_ffi.h" #include "read_table.h" #include "kernel_utils.h" +#include #include /** @@ -48,8 +49,15 @@ typedef struct { int list_count; SchemaItemList* lists; + SharedExternEngine* engine; } SchemaBuilder; +typedef struct +{ + uintptr_t list_id; + SchemaBuilder* builder; +} CSchema; + // lists are preallocated to have exactly enough space, so we just fill in the next open slot and // increment our length SchemaItem* add_to_list(SchemaItemList* list, char* name, char* type, bool is_nullable) @@ -78,19 +86,42 @@ void print_list(SchemaBuilder* builder, uintptr_t list_id, int indent, int paren } SchemaItem* item = &list->list[i]; char* prefix = is_last ? "└" : "├"; - printf("%s─ %s: %s\n", prefix, item->name, item->type); + printf("%s─ %s: %s", prefix, item->name, item->type); + if (strcmp(item->type, "array") == 0) { + SchemaItemList child_list = builder->lists[item->children]; + if (child_list.len != 1) { + printf(" (invalid array child list)\n"); + } else { + printf(" (can contain null: %s)\n", child_list.list[0].is_nullable ? "true" : "false"); + } + } else if (strcmp(item->type, "map") == 0) { + SchemaItemList child_list = builder->lists[item->children]; + if (child_list.len != 2) { + printf(" (invalid map child list)\n"); + } else { + printf(" (can contain null: %s)\n", child_list.list[1].is_nullable ? 
"true" : "false"); + } + } else { + printf("\n"); + } if (list->list[i].children != UINTPTR_MAX) { print_list(builder, list->list[i].children, indent + 1, parents_on_last + is_last); } } } -void print_physical_name(const char *name, const CStringMap* metadata) +void print_physical_name(const char *name, const CStringMap* metadata, SharedExternEngine* engine) { #ifdef VERBOSE char* key_str = "delta.columnMapping.physicalName"; KernelStringSlice key = { key_str, strlen(key_str) }; - char* value = get_from_map(metadata, key, allocate_string); + ExternResultNullableCvoid res = get_from_string_map(metadata, key, allocate_string, engine); + if (res.tag != OkNullableCvoid) { + printf("Failed to get physical name\n"); + free_error((Error*)res.err); + return; + } + char* value = res.ok; if (value) { printf("Physical name of %s is %s\n", name, value); free(value); @@ -100,6 +131,7 @@ void print_physical_name(const char *name, const CStringMap* metadata) #else (void)name; (void)metadata; + (void)engine; #endif } @@ -133,7 +165,7 @@ void visit_struct( SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); PRINT_CHILD_VISIT("struct", name_ptr, sibling_list_id, "Children", child_list_id); - print_physical_name(name_ptr, metadata); + print_physical_name(name_ptr, metadata, builder->engine); SchemaItem* struct_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "struct", is_nullable); struct_item->children = child_list_id; } @@ -147,10 +179,8 @@ void visit_array( uintptr_t child_list_id) { SchemaBuilder* builder = data; - char* name_ptr = malloc(sizeof(char) * (name.len + 22)); - snprintf(name_ptr, name.len + 1, "%s", name.ptr); - snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); - print_physical_name(name_ptr, metadata); + char* name_ptr = allocate_string(name); + print_physical_name(name_ptr, metadata, builder->engine); PRINT_CHILD_VISIT("array", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* array_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "array", is_nullable); array_item->children = child_list_id; @@ -165,10 +195,8 @@ void visit_map( uintptr_t child_list_id) { SchemaBuilder* builder = data; - char* name_ptr = malloc(sizeof(char) * (name.len + 22)); - snprintf(name_ptr, name.len + 1, "%s", name.ptr); - snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? 
"true" : "false"); - print_physical_name(name_ptr, metadata); + char* name_ptr = allocate_string(name); + print_physical_name(name_ptr, metadata, builder->engine); PRINT_CHILD_VISIT("map", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* map_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "map", is_nullable); map_item->children = child_list_id; @@ -187,7 +215,7 @@ void visit_decimal( char* name_ptr = allocate_string(name); char* type = malloc(19 * sizeof(char)); snprintf(type, 19, "decimal(%u)(%d)", precision, scale); - print_physical_name(name_ptr, metadata); + print_physical_name(name_ptr, metadata, builder->engine); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } @@ -202,7 +230,7 @@ void visit_simple_type( { SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); - print_physical_name(name_ptr, metadata); + print_physical_name(name_ptr, metadata, builder->engine); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } @@ -226,11 +254,11 @@ DEFINE_VISIT_SIMPLE_TYPE(date) DEFINE_VISIT_SIMPLE_TYPE(timestamp) DEFINE_VISIT_SIMPLE_TYPE(timestamp_ntz) -// free all the data in the builder (but not the builder itself, it's stack allocated) -void free_builder(SchemaBuilder builder) +// free all the data in the builder and the builder itself +void free_builder(SchemaBuilder* builder) { - for (int i = 0; i < builder.list_count; i++) { - SchemaItemList* list = (builder.lists) + i; + for (int i = 0; i < builder->list_count; i++) { + SchemaItemList* list = (builder->lists) + i; for (uint32_t j = 0; j < list->len; j++) { SchemaItem* item = list->list + j; free(item->name); @@ -242,19 +270,26 @@ void free_builder(SchemaBuilder builder) } free(list->list); // free all the items in this list (we alloc'd them together) } - free(builder.lists); + free(builder->lists); + free(builder); } -// Print the schema of the snapshot -void print_schema(SharedSnapshot* snapshot) +// Free the schema and any associated builder data +void free_cschema(CSchema *schema) { + free_builder(schema->builder); + free(schema); +} + +// Get the schema of the snapshot +CSchema* get_cschema(SharedSnapshot* snapshot, SharedExternEngine* engine) { print_diag("Building schema\n"); - SchemaBuilder builder = { - .list_count = 0, - .lists = NULL, - }; + SchemaBuilder* builder = malloc(sizeof(SchemaBuilder)); + builder->list_count = 0; + builder->lists = NULL; + builder->engine = engine; EngineSchemaVisitor visitor = { - .data = &builder, + .data = builder, .make_field_list = make_field_list, .visit_struct = visit_struct, .visit_array = visit_array, @@ -279,9 +314,16 @@ void print_schema(SharedSnapshot* snapshot) printf("Schema returned in list %" PRIxPTR "\n", schema_list_id); #endif print_diag("Done building schema\n"); + CSchema* cschema = malloc(sizeof(CSchema)); + cschema->list_id = schema_list_id; + cschema->builder = builder; + free_schema(schema); + return cschema; +} + +// Print out a schema +void print_cschema(CSchema *schema) { printf("Schema:\n"); - print_list(&builder, schema_list_id, 0, 0); + print_list(schema->builder, schema->list_id, 0, 0); printf("\n"); - free_schema(schema); - free_builder(builder); } diff --git a/ffi/examples/visit-expression/CMakeLists.txt b/ffi/examples/visit-expression/CMakeLists.txt index 5e17253bce..ca863e579f 100644 --- a/ffi/examples/visit-expression/CMakeLists.txt +++ 
b/ffi/examples/visit-expression/CMakeLists.txt @@ -1,9 +1,11 @@ cmake_minimum_required(VERSION 3.12) project(visit_expressions) -add_executable(visit_expression visit_expression.c) +add_executable(visit_expression visit_expression.c ../common/kernel_utils.c) target_compile_definitions(visit_expression PUBLIC DEFINE_DEFAULT_ENGINE_BASE) -target_include_directories(visit_expression PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/ffi-headers") +target_include_directories(visit_expression PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/ffi-headers" + "${CMAKE_CURRENT_SOURCE_DIR}/../common") target_link_directories(visit_expression PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/../../../target/debug") target_link_libraries(visit_expression PUBLIC delta_kernel_ffi) target_compile_options(visit_expression PUBLIC) diff --git a/ffi/examples/visit-expression/engine_to_kernel_expression.h b/ffi/examples/visit-expression/engine_to_kernel_expression.h new file mode 100644 index 0000000000..87d3bbb5a8 --- /dev/null +++ b/ffi/examples/visit-expression/engine_to_kernel_expression.h @@ -0,0 +1,342 @@ +#pragma once + +#include "delta_kernel_ffi.h" +#include "expression.h" +#include "../common/kernel_utils.h" +#include +#include + +/** + * This module converts an engine expression (ExpressionItemList) back into + * a kernel expression (SharedExpression) by calling the appropriate visit_* + * functions from KernelExpressionVisitorState. + */ + +// Forward declarations +uintptr_t convert_engine_to_kernel_expression_item( + KernelExpressionVisitorState* state, + ExpressionItem item); + +uintptr_t convert_engine_to_kernel_literal( + KernelExpressionVisitorState* state, + struct Literal* lit) { + switch (lit->type) { + case Integer: + return visit_expression_literal_int(state, lit->value.integer_data); + case Long: + return visit_expression_literal_long(state, lit->value.long_data); + case Short: + return visit_expression_literal_short(state, lit->value.short_data); + case Byte: + return visit_expression_literal_byte(state, lit->value.byte_data); + case Float: + return visit_expression_literal_float(state, lit->value.float_data); + case Double: + return visit_expression_literal_double(state, lit->value.double_data); + case String: { + KernelStringSlice str_slice = { + .ptr = lit->value.string_data, + .len = strlen(lit->value.string_data) + }; + ExternResultusize result = visit_expression_literal_string( + state, str_slice, allocate_error); + if (result.tag == Errusize) { + print_error("visit_expression_literal_string failed", (Error*)result.err); + free_error((Error*)result.err); + abort(); + } + return result.ok; + } + case Boolean: + return visit_expression_literal_bool(state, lit->value.boolean_data); + case Timestamp: + return visit_expression_literal_timestamp(state, lit->value.long_data); + case TimestampNtz: + return visit_expression_literal_timestamp_ntz(state, + lit->value.long_data); + case Date: + return visit_expression_literal_date(state, lit->value.integer_data); + case Binary: { + return visit_expression_literal_binary( + state, lit->value.binary.buf, lit->value.binary.len); + } + case Decimal: { + struct Decimal* dec = &lit->value.decimal; + ExternResultusize result = visit_expression_literal_decimal( + state, dec->hi, dec->lo, dec->precision, dec->scale, allocate_error); + if (result.tag == Errusize) { + print_error("visit_expression_literal_decimal failed", (Error*)result.err); + free_error((Error*)result.err); + abort(); + } + return result.ok; + } + case Null: { + ExternResultusize result = 
visit_expression_literal_null(state, allocate_error); + if (result.tag == Errusize) { + print_error("visit_expression_literal_null failed", (Error*)result.err); + free_error((Error*)result.err); + abort(); + } + return result.ok; + } + case Struct: + fprintf(stderr, "Error: Struct literal type not supported\n"); + assert(0 && "Struct literal type not supported"); + abort(); // Explicitly abort even if assertions are disabled + case Array: + fprintf(stderr, "Error: Array literal type not supported\n"); + assert(0 && "Array literal type not supported"); + abort(); // Explicitly abort even if assertions are disabled + case Map: + fprintf(stderr, "Error: Map literal type not supported\n"); + assert(0 && "Map literal type not supported"); + abort(); // Explicitly abort even if assertions are disabled + default: + fprintf(stderr, + "Error: Unknown literal type in convert_engine_to_kernel_literal\n"); + assert(0 && "Unknown literal type in convert_engine_to_kernel_literal"); + abort(); // Explicitly abort even if assertions are disabled + } +} + +uintptr_t convert_engine_to_kernel_binop( + KernelExpressionVisitorState* state, + struct BinOp* binop) { + assert(binop->exprs.len == 2); + uintptr_t left = convert_engine_to_kernel_expression_item( + state, binop->exprs.list[0]); + uintptr_t right = convert_engine_to_kernel_expression_item( + state, binop->exprs.list[1]); + + switch (binop->op) { + case Add: + return visit_expression_plus(state, left, right); + case Minus: + return visit_expression_minus(state, left, right); + case Divide: + return visit_expression_divide(state, left, right); + case Multiply: + return visit_expression_multiply(state, left, right); + case LessThan: + return visit_predicate_lt(state, left, right); + case GreaterThan: + return visit_predicate_gt(state, left, right); + case Equal: + return visit_predicate_eq(state, left, right); + case Distinct: + return visit_predicate_distinct(state, left, right); + case In: + return visit_predicate_in(state, left, right); + default: + fprintf(stderr, "Error: Unknown binary op in convert_engine_to_kernel_binop\n"); + assert(0 && "Unknown binary op in convert_engine_to_kernel_binop"); + abort(); // Explicitly abort even if assertions are disabled + } +} + +// Helper to create an iterator from ExpressionItemList +typedef struct { + ExpressionItemList* list; + size_t current_index; + KernelExpressionVisitorState* state; +} EngineToKernelIteratorState; + +const void* convert_engine_to_kernel_next_fn(void* data) { + EngineToKernelIteratorState* iter_state = (EngineToKernelIteratorState*)data; + if (iter_state->current_index >= iter_state->list->len) { + // Return NULL to signal end of iteration + return NULL; + } + + ExpressionItem item = iter_state->list->list[iter_state->current_index]; + iter_state->current_index++; + + uintptr_t result = convert_engine_to_kernel_expression_item(iter_state->state, item); + // Return the result as a pointer (cast uintptr_t to void*) + return (const void*)result; +} + +uintptr_t convert_engine_to_kernel_variadic( + KernelExpressionVisitorState* state, + struct Variadic* variadic) { + EngineToKernelIteratorState iter_state = { + .list = &variadic->exprs, + .current_index = 0, + .state = state + }; + + EngineIterator iterator = { + .data = &iter_state, + .get_next = convert_engine_to_kernel_next_fn + }; + + switch (variadic->op) { + case And: + return visit_predicate_and(state, &iterator); + case Or: + return visit_predicate_or(state, &iterator); + case StructExpression: + return visit_expression_struct(state, 
&iterator); + default: + fprintf(stderr, + "Error: Unknown variadic op in convert_engine_to_kernel_variadic\n"); + assert(0 && "Unknown variadic op in convert_engine_to_kernel_variadic"); + abort(); // Explicitly abort even if assertions are disabled + } +} + +uintptr_t convert_engine_to_kernel_unary( + KernelExpressionVisitorState* state, + struct Unary* unary) { + assert(unary->sub_expr.len == 1); + uintptr_t inner = convert_engine_to_kernel_expression_item( + state, unary->sub_expr.list[0]); + + switch (unary->type) { + case Not: + return visit_predicate_not(state, inner); + case IsNull: + return visit_predicate_is_null(state, inner); + default: + fprintf(stderr, "Error: Unknown unary op in convert_engine_to_kernel_unary\n"); + assert(0 && "Unknown unary op in convert_engine_to_kernel_unary"); + abort(); // Explicitly abort even if assertions are disabled + } +} + +uintptr_t convert_engine_to_kernel_expression_item( + KernelExpressionVisitorState* state, + ExpressionItem item) { + switch (item.type) { + case Literal: + return convert_engine_to_kernel_literal(state, (struct Literal*)item.ref); + case BinOp: + return convert_engine_to_kernel_binop(state, (struct BinOp*)item.ref); + case Variadic: + return convert_engine_to_kernel_variadic(state, (struct Variadic*)item.ref); + case Unary: + return convert_engine_to_kernel_unary(state, (struct Unary*)item.ref); + case Column: { + char* column_name = (char*)item.ref; + KernelStringSlice str_slice = { + .ptr = column_name, + .len = strlen(column_name) + }; + ExternResultusize result = visit_expression_column( + state, str_slice, allocate_error); + if (result.tag == Errusize) { + print_error("visit_expression_column failed", (Error*)result.err); + free_error((Error*)result.err); + abort(); + } + return result.ok; + } + case Unknown: { + struct Unknown* unknown = (struct Unknown*)item.ref; + KernelStringSlice str_slice = { + .ptr = unknown->name, + .len = strlen(unknown->name) + }; + return visit_expression_unknown(state, str_slice); + } + case MapToStruct: { + struct MapToStructExpr* m2s = (struct MapToStructExpr*)item.ref; + assert(m2s->child_expr.len == 1); + uintptr_t child = convert_engine_to_kernel_expression_item( + state, m2s->child_expr.list[0]); + return visit_expression_map_to_struct(state, child); + } + case Transform: + case FieldTransform: + case OpaqueExpression: + case OpaquePredicate: + fprintf(stderr, + "Warning: Complex expression type not yet supported " + "for reconstruction\n"); + return visit_expression_literal_int(state, 0); + default: + fprintf(stderr, + "Error: Unknown expression type in " + "convert_engine_to_kernel_expression_item\n"); + assert(0 && + "Unknown expression type in convert_engine_to_kernel_expression_item"); + abort(); // Explicitly abort even if assertions are disabled + } +} + +// Visitor function for converting ExpressionItemList to kernel expression +static inline uintptr_t expression_item_list_visitor( + void* expr_list_ptr, + KernelExpressionVisitorState* state) { + ExpressionItemList* expr_list = (ExpressionItemList*)expr_list_ptr; + assert(expr_list->len > 0); + return convert_engine_to_kernel_expression_item(state, expr_list->list[0]); +} + +/** + * Convert an engine expression to a kernel expression using the visitor + * pattern. Returns a SharedExpression handle that must be freed with + * free_kernel_expression. + * + * This function uses the EngineExpression visitor pattern, completely + * hiding KernelExpressionVisitorState management from the caller. 
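+ *
+ * Illustrative usage (mirrors run_test_case in visit_expression.c; variable
+ * names here are placeholders):
+ *
+ *   /* expr: a SharedExpression* obtained from the test harness / engine */
+ *   ExpressionItemList list = construct_expression(expr);
+ *   SharedExpression* rekernel = convert_engine_to_kernel_expression(list);
+ *   /* ... e.g. compare with expressions_are_equal(&expr, &rekernel) ... */
+ *   free_kernel_expression(rekernel);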
+ */ +SharedExpression* convert_engine_to_kernel_expression( + ExpressionItemList expr_list) { + EngineExpression engine_expr = { + .expression = (void*)&expr_list, + .visitor = expression_item_list_visitor + }; + + ExternResultHandleSharedExpression result = visit_engine_expression( + &engine_expr, allocate_error); + + if (result.tag == OkHandleSharedExpression) { + return result.ok; + } else { + print_error("Failed to convert engine expression to kernel expression", + (Error*)result.err); + free_error((Error*)result.err); + abort(); + } +} + +// Visitor function for converting ExpressionItemList to kernel predicate +static inline uintptr_t predicate_item_list_visitor( + void* pred_list_ptr, + KernelExpressionVisitorState* state) { + ExpressionItemList* pred_list = (ExpressionItemList*)pred_list_ptr; + assert(pred_list->len > 0); + return convert_engine_to_kernel_expression_item(state, pred_list->list[0]); +} + +/** + * Convert an engine predicate to a kernel predicate using the visitor + * pattern. Returns a SharedPredicate handle that must be freed with + * free_kernel_predicate. + * + * This function uses the EnginePredicate visitor pattern, completely + * hiding KernelExpressionVisitorState management from the caller. + */ +SharedPredicate* convert_engine_to_kernel_predicate( + ExpressionItemList pred_list) { + EnginePredicate engine_pred = { + .predicate = (void*)&pred_list, + .visitor = predicate_item_list_visitor + }; + + ExternResultHandleSharedPredicate result = visit_engine_predicate( + &engine_pred, allocate_error); + + if (result.tag == OkHandleSharedPredicate) { + return result.ok; + } else { + print_error("Failed to convert engine predicate to kernel predicate", + (Error*)result.err); + free_error((Error*)result.err); + abort(); + } +} + + diff --git a/ffi/examples/visit-expression/expression.h b/ffi/examples/visit-expression/expression.h index 5e2ae6a85b..920b0d54da 100644 --- a/ffi/examples/visit-expression/expression.h +++ b/ffi/examples/visit-expression/expression.h @@ -1,6 +1,7 @@ #pragma once #include "delta_kernel_ffi.h" +#include "../common/kernel_utils.h" #include #include #include @@ -63,6 +64,7 @@ enum ExpressionType { OpaqueExpression, OpaquePredicate, Unknown, + MapToStruct, }; enum VariadicType { And, @@ -110,6 +112,9 @@ struct OpaquePredicate { struct Unknown { char* name; }; +struct MapToStructExpr { + ExpressionItemList child_expr; +}; struct BinaryData { uint8_t* buf; uintptr_t len; @@ -168,10 +173,6 @@ ExpressionItemList get_expr_list(void* data, size_t list_id) { assert(list_id < data_ptr->list_count); return data_ptr->lists[list_id]; } -// utility to turn a slice into a char* -char* allocate_string(const KernelStringSlice slice) { - return strndup(slice.ptr, slice.len); -} /************************************************************* * Binary Operations @@ -383,6 +384,14 @@ void visit_unknown(void *data, uintptr_t sibling_list_id, struct KernelStringSli put_expr_item(data, sibling_list_id, unknown, Unknown); } +void visit_map_to_struct_expr(void* data, + uintptr_t sibling_list_id, + uintptr_t child_list_id) { + struct MapToStructExpr* m2s = malloc(sizeof(struct MapToStructExpr)); + m2s->child_expr = get_expr_list(data, child_list_id); + put_expr_item(data, sibling_list_id, m2s, MapToStruct); +} + void visit_expr_array_literal(void* data, uintptr_t sibling_list_id, uintptr_t child_list_id) { struct Literal* literal = malloc(sizeof(struct Literal)); literal->type = Array; @@ -490,6 +499,7 @@ ExpressionItemList construct_expression(SharedExpression* 
expression) { .visit_opaque_pred = visit_opaque_pred, .visit_opaque_expr = visit_opaque_expr, .visit_unknown = visit_unknown, + .visit_map_to_struct = visit_map_to_struct_expr, }; uintptr_t top_level_id = visit_expression(&expression, &visitor); ExpressionItemList top_level_expr = data.lists[top_level_id]; @@ -536,6 +546,7 @@ ExpressionItemList construct_predicate(SharedPredicate* predicate) { .visit_opaque_pred = visit_opaque_pred, .visit_opaque_expr = visit_opaque_expr, .visit_unknown = visit_unknown, + .visit_map_to_struct = visit_map_to_struct_expr, }; uintptr_t top_level_id = visit_predicate(&predicate, &visitor); ExpressionItemList top_level_expr = data.lists[top_level_id]; @@ -648,6 +659,12 @@ void free_expression_item(ExpressionItem ref) { free(ref.ref); break; } + case MapToStruct: { + struct MapToStructExpr* m2s = ref.ref; + free_expression_list(m2s->child_expr); + free(m2s); + break; + } } } void free_expression_list(ExpressionItemList list) { diff --git a/ffi/examples/visit-expression/expression_print.h b/ffi/examples/visit-expression/expression_print.h index 1c2e2735ba..44aa491837 100644 --- a/ffi/examples/visit-expression/expression_print.h +++ b/ffi/examples/visit-expression/expression_print.h @@ -243,6 +243,12 @@ void print_tree_helper(ExpressionItem ref, int depth) { printf("Column(%s)\n", column_name); break; } + case MapToStruct: { + struct MapToStructExpr* m2s = ref.ref; + printf("MapToStruct\n"); + print_expression_item_list(m2s->child_expr, depth + 1); + break; + } } } diff --git a/ffi/examples/visit-expression/visit_expression.c b/ffi/examples/visit-expression/visit_expression.c index a41164cba7..1f56b9ba3b 100644 --- a/ffi/examples/visit-expression/visit_expression.c +++ b/ffi/examples/visit-expression/visit_expression.c @@ -1,18 +1,126 @@ #include "delta_kernel_ffi.h" #include "expression.h" #include "expression_print.h" +#include "engine_to_kernel_expression.h" +#include -int main() { - SharedExpression* expr = get_testing_kernel_expression(); +// Test case structure for organizing test runs +typedef struct { + const char* name; + SharedExpression* (*get_expression_fn)(void); + SharedPredicate* (*get_predicate_fn)(void); + bool validate_roundtrip; + const char* description; +} TestCase; + +// Run a single test case for expressions and predicates +// The C side owns the memory for the expressions and predicates +// and needs to be freed from the C side while emulates the engine side. +bool run_test_case(const TestCase* test) { + bool all_passed = true; + + printf("=== %s ===\n", test->name); + printf("%s\n\n", test->description); + + // Test expressions + SharedExpression* expr = test->get_expression_fn(); ExpressionItemList expr_list = construct_expression(expr); print_expression(expr_list); + + // The round-trip test for complex expressions is not supported yet. + // We need to add kernel visitor functions for complex expressions + // and enable the test for complex expressions. 
+ if (test->validate_roundtrip) { + SharedExpression* expr_rekernel = + convert_engine_to_kernel_expression(expr_list); + bool expr_equal = expressions_are_equal(&expr, &expr_rekernel); + + printf("\n=== Expression Round-trip Test ===\n"); + if (expr_equal) { + printf("SUCCESS: Round-trip expression matches original!\n"); + } else { + printf("FAILURE: Round-trip expression does NOT match original!\n"); + all_passed = false; + } + free_kernel_expression(expr_rekernel); + } + free_expression_list(expr_list); free_kernel_expression(expr); - - SharedPredicate* pred = get_testing_kernel_predicate(); + + // Test predicates + SharedPredicate* pred = test->get_predicate_fn(); ExpressionItemList pred_list = construct_predicate(pred); print_expression(pred_list); + + // The round-trip test for complex expressions is not supported yet. + // We need to add kernel visitor functions for complex expressions + // and enable the test for complex expressions. + if (test->validate_roundtrip) { + SharedPredicate* pred_rekernel = + convert_engine_to_kernel_predicate(pred_list); + bool pred_equal = predicates_are_equal(&pred, &pred_rekernel); + + printf("\n=== Predicate Round-trip Test ===\n"); + if (pred_equal) { + printf("SUCCESS: Round-trip predicate matches original!\n"); + } else { + printf("FAILURE: Round-trip predicate does NOT match original!\n"); + all_passed = false; + } + free_kernel_predicate(pred_rekernel); + } + free_expression_list(pred_list); free_kernel_predicate(pred); - return 0; + + return all_passed; +} + +int main() { + // Define test cases + // We use an iterator pattern to add tests + TestCase test_cases[] = { + { + .name = "Complex Expression Test", + .get_expression_fn = get_testing_kernel_expression, + .get_predicate_fn = get_testing_kernel_predicate, + // TODO: Enable this once #1471 (i.e complex expressions are supported) + .validate_roundtrip = false, + .description = + "This test demonstrates the full range of expression types.\n" + "Some types are not yet supported in round-trip reconstruction:\n" + " - Struct/Array/Map literals (nested data structures)\n" + " - Transform expressions (schema evolution operations)\n" + " - Opaque expressions (custom user-defined expressions)" + }, + { + .name = "Simple Round-trip Test", + .get_expression_fn = get_simple_testing_kernel_expression, + .get_predicate_fn = get_simple_testing_kernel_predicate, + .validate_roundtrip = true, + .description = + "This test validates expressions/predicates with full support.\n" + "Supported types: primitives (int, long, float, double, bool, " + "string),\n temporal (date, timestamp, timestamp_ntz), binary, " + "decimal, null,\n binary operations (+, -, *, /), struct " + "expressions, predicates (eq, ne, lt, le,\n gt, ge, distinct, " + "is_null, is_not_null, not, and, or)" + } + }; + + bool all_tests_passed = true; + size_t num_tests = sizeof(test_cases) / sizeof(test_cases[0]); + + // Run all test cases + for (size_t i = 0; i < num_tests; i++) { + if (!run_test_case(&test_cases[i])) { + all_tests_passed = false; + } + if (i < num_tests - 1) { + printf("\n"); // Separator between test cases + } + } + + return all_tests_passed ? 0 : 1; } diff --git a/ffi/src/delta_kernel_unity_catalog.rs b/ffi/src/delta_kernel_unity_catalog.rs new file mode 100644 index 0000000000..2265d4473e --- /dev/null +++ b/ffi/src/delta_kernel_unity_catalog.rs @@ -0,0 +1,419 @@ +//! FFI hooks that enable constructing a CommitClient. 
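+//!
+//! Intended flow, in brief (see the items below for details): the engine registers a C
+//! commit callback via `get_uc_commit_client`, wraps the resulting client in a committer
+//! with `get_uc_committer`, and passes that committer to
+//! `crate::transaction::transaction_with_committer` so commits are routed through Unity Catalog.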
+ +use crate::error::{ExternResult, IntoExternResult as _}; +use crate::{error::AllocateErrorFn, transaction::MutableCommitter}; +use crate::{ExclusiveRustString, NullableCvoid}; +use std::sync::Arc; + +use delta_kernel::committer::Committer; +use delta_kernel::engine::default::executor::tokio::{ + TokioBackgroundExecutor, TokioMultiThreadExecutor, +}; +use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::DeltaResult; +use delta_kernel_ffi::{ + handle::Handle, kernel_string_slice, KernelStringSlice, OptionalValue, TryFromStringSlice, +}; +use delta_kernel_ffi_macros::handle_descriptor; +use delta_kernel_unity_catalog::UCCommitter; + +use unity_catalog_delta_client_api::{ + CommitClient, CommitRequest as ClientCommitRequest, Result as ApiResult, +}; + +use tracing::debug; + +/// Data representing a commit. +#[repr(C)] +pub struct Commit { + pub version: i64, + pub timestamp: i64, + pub file_name: KernelStringSlice, + pub file_size: i64, + pub file_modification_timestamp: i64, +} + +/// Request to commit a new version to the table. It must include either a `commit_info` or +/// `latest_backfilled_version`. +#[repr(C)] +pub struct CommitRequest { + pub table_id: KernelStringSlice, + pub table_uri: KernelStringSlice, + pub commit_info: OptionalValue, + pub latest_backfilled_version: OptionalValue, + /// json serialized version of the metadata + pub metadata: OptionalValue, + /// json serialized version of the protocol + pub protocol: OptionalValue, +} + +/// The callback that will be called when the client wants to commit. Return `None` on success, or +/// `Some("error description")` if an error occured. +// Note, it doesn't make sense to return an ExternResult here because that can't hold the string +// error msg +pub type CCommit = extern "C" fn( + context: NullableCvoid, + request: CommitRequest, +) -> OptionalValue>; + +pub struct FfiUCCommitClient { + context: NullableCvoid, + commit_callback: CCommit, +} + +// NullableCvoid is NOT `Send` by itself. Here we declare our struct to be Send as it's up to the +// caller to ensure they pass a thread safe pointer that remains valid +unsafe impl Send for FfiUCCommitClient {} +unsafe impl Sync for FfiUCCommitClient {} + +impl CommitClient for FfiUCCommitClient { + /// Commit a new version to the table. + async fn commit(&self, request: ClientCommitRequest) -> ApiResult<()> { + let table_id = request.table_id; + let table_uri = request.table_uri; + + // there is a subtle issue here where we need to ensure that the string we use to refer to + // the commit_info.file_name stays in scope until _after_ the callback returns, so that the + // KernelStringSlice remains valid. This means we can't get clever with + // request.commit_info.map to build an Option. 
Rather we just use a closure to hold + // the common code and call it from a scope where the string remains valid until after the + // closure finishes + + let send_request = |commit_info| -> ApiResult<()> { + let c_commit_request = CommitRequest { + table_id: kernel_string_slice!(table_id), + table_uri: kernel_string_slice!(table_uri), + commit_info, + latest_backfilled_version: request.latest_backfilled_version.into(), + metadata: None.into(), + protocol: None.into(), + }; + + match (self.commit_callback)(self.context, c_commit_request) { + OptionalValue::Some(e) => { + let boxed_str = unsafe { e.into_inner() }; // get the string back into Box + let s: String = *boxed_str; // move back onto the stack + Err(unity_catalog_delta_client_api::Error::Generic(s)) + } + OptionalValue::None => Ok(()), + } + }; + + if let Some(client_commit_info) = request.commit_info { + let file_name = client_commit_info.file_name; + let commit_info = Some(Commit { + version: client_commit_info.version, + timestamp: client_commit_info.timestamp, + file_name: kernel_string_slice!(file_name), + file_size: client_commit_info.file_size, + file_modification_timestamp: client_commit_info.file_modification_timestamp, + }); + send_request(commit_info.into()) + } else { + send_request(None.into()) + } + } +} + +#[handle_descriptor(target=FfiUCCommitClient, mutable=false, sized=true)] +pub struct SharedFfiUCCommitClient; + +/// Get a commit client that will call the passed callbacks when it wants to make a commit. The +/// context will be passed back to the callback when called. +/// +/// IMPORTANT: The pointer passed for the context MUST be thread-safe (i.e. be able to be sent +/// between threads safely) and MUST remain valid for as long as the client is used. It is valid to +/// pass NULL as the context. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid pointer for the callback and a valid context pointer +#[no_mangle] +pub unsafe extern "C" fn get_uc_commit_client( + context: NullableCvoid, + commit_callback: CCommit, +) -> Handle { + Arc::new(FfiUCCommitClient { + context, + commit_callback, + }) + .into() +} + +/// # Safety +/// +/// Caller is responsible for passing a valid handle. 
+#[no_mangle] +pub unsafe extern "C" fn free_uc_commit_client(commit_client: Handle) { + debug!("released uc commit client"); + commit_client.drop_handle(); +} + +// we need our own struct here because we want to override the calls to enter the tokio runtime +// before calling into the standard committer +struct FfiUCCommitter { + inner: UCCommitter, +} + +impl Committer for FfiUCCommitter { + fn commit( + &self, + engine: &dyn delta_kernel::Engine, + actions: Box< + dyn Iterator> + Send + '_, + >, + commit_metadata: delta_kernel::committer::CommitMetadata, + ) -> DeltaResult { + // We hold this guard until the end of the function so we stay in the tokio context until + // we're done + let _guard = engine + .any_ref() + .downcast_ref::>() + .map(|e| e.enter()) + .or_else(|| { + engine + .any_ref() + .downcast_ref::>() + .map(|e| e.enter()) + }) + .ok_or_else(|| { + delta_kernel::Error::generic( + "FFIUCCommitter can only be used with the default engine", + ) + })?; + self.inner.commit(engine, actions, commit_metadata) + } + + fn is_catalog_committer(&self) -> bool { + self.inner.is_catalog_committer() + } + + fn publish( + &self, + engine: &dyn delta_kernel::Engine, + publish_metadata: delta_kernel::committer::PublishMetadata, + ) -> DeltaResult<()> { + self.inner.publish(engine, publish_metadata) + } +} + +/// Get a commit client that will call the passed callbacks when it wants to make a commit. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid pointer to a SharedFfiUCCommitClient, obtained via +/// calling [`get_uc_commit_client`], a valid KernelStringSlice as the table_id, and a valid error +/// function pointer. +#[no_mangle] +pub unsafe extern "C" fn get_uc_committer( + commit_client: Handle, + table_id: KernelStringSlice, + error_fn: AllocateErrorFn, +) -> ExternResult> { + get_uc_committer_impl(commit_client, table_id).into_extern_result(&error_fn) +} + +fn get_uc_committer_impl( + commit_client: Handle, + table_id: KernelStringSlice, +) -> DeltaResult> { + let client: Arc = unsafe { commit_client.clone_as_arc() }; + let table_id_str: String = unsafe { TryFromStringSlice::try_from_slice(&table_id) }?; + let committer: Box = Box::new(FfiUCCommitter { + inner: UCCommitter::new(client, table_id_str), + }); + Ok(committer.into()) +} + +/// Free a committer obtained via get_uc_committer. Warning! Normally the value returned here will +/// be consumed when creating a transaction via [`crate::transaction::transaction_with_committer`] +/// and will NOT need to be freed. 
+/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle obtained via `get_uc_committer` +#[no_mangle] +pub unsafe extern "C" fn free_uc_committer(commit_client: Handle) { + debug!("released uc committer"); + commit_client.drop_handle(); +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use crate::ffi_test_utils::{allocate_err, ok_or_panic}; + use crate::{allocate_kernel_string, kernel_string_slice, OptionalValue}; + use std::ffi::c_void; + use std::ptr::NonNull; + use std::sync::Arc; + use unity_catalog_delta_client_api::{Commit as ClientCommit, Error as ApiError}; + + pub(crate) struct TestContext { + pub(crate) commit_called: bool, + pub(crate) last_commit_request: Option<(String, String)>, + pub(crate) last_staged_filename: Option, + pub(crate) should_fail_commit: bool, + } + + pub(crate) fn get_test_context(should_fail_commit: bool) -> NullableCvoid { + let context = Box::new(TestContext { + commit_called: false, + last_commit_request: None, + last_staged_filename: None, + should_fail_commit, + }); + NonNull::new(Box::into_raw(context) as *mut c_void) + } + + // take back ownership of the context. be aware that you therefore cannot call this twice with + // the same context pointer + pub(crate) fn recover_test_context(context: NullableCvoid) -> Option> { + context.map(|context| unsafe { Box::from_raw(context.as_ptr() as *mut TestContext) }) + } + + // get the context without taking ownership + pub(crate) fn cast_test_context<'a>(context: NullableCvoid) -> Option<&'a mut TestContext> { + context.map(|ptr| unsafe { &mut *(ptr.as_ptr() as *mut TestContext) }) + } + + #[no_mangle] + extern "C" fn test_commit_callback( + context: NullableCvoid, + request: CommitRequest, + ) -> OptionalValue> { + let context = cast_test_context(context).unwrap(); + + context.commit_called = true; + + let table_id = unsafe { String::try_from_slice(&request.table_id).unwrap() }; + let table_uri = unsafe { String::try_from_slice(&request.table_uri).unwrap() }; + + if let OptionalValue::Some(commit_info) = request.commit_info { + let file_name = unsafe { + crate::TryFromStringSlice::try_from_slice(&commit_info.file_name).unwrap() + }; + context.last_staged_filename = Some(file_name); + } + context.last_commit_request = Some((table_id.clone(), table_uri.clone())); + if context.should_fail_commit { + let error_msg = "Test commit failure"; + let error_str = unsafe { + ok_or_panic(allocate_kernel_string( + kernel_string_slice!(error_msg), + allocate_err, + )) + }; + OptionalValue::Some(error_str) + } else { + OptionalValue::None + } + } + + #[test] + fn test_get_uc_commit_client() { + let client = unsafe { get_uc_commit_client(None, test_commit_callback) }; + + let _client_ref: Arc = unsafe { client.clone_as_arc() }; + unsafe { free_uc_commit_client(client) }; + } + + #[tokio::test] + async fn test_ffi_uc_commit_client_commit_success() { + let context = get_test_context(false); + + let client = unsafe { get_uc_commit_client(context, test_commit_callback) }; + + let client_arc: Arc = unsafe { client.clone_as_arc() }; + + let request = ClientCommitRequest { + table_id: "test_table_id".to_string(), + table_uri: "s3://bucket/path".to_string(), + commit_info: Some(ClientCommit { + version: 10, + timestamp: 2000000000, + file_name: "_staged_commits/00000000000000000010.uuid.json".to_string(), + file_size: 1024, + file_modification_timestamp: 2000000100, + }), + latest_backfilled_version: None, + metadata: None, + protocol: None, + }; + + let result: ApiResult<()> = 
client_arc.commit(request).await; + + assert!(result.is_ok()); + + let context = recover_test_context(context).unwrap(); + + assert!(context.commit_called); + let (table_id, table_uri) = context.last_commit_request.unwrap(); + assert_eq!(table_id, "test_table_id"); + assert_eq!(table_uri, "s3://bucket/path"); + assert_eq!( + context.last_staged_filename.unwrap(), + "_staged_commits/00000000000000000010.uuid.json" + ); + + unsafe { free_uc_commit_client(client) }; + } + + #[tokio::test] + async fn test_ffi_uc_commit_client_commit_failure() { + let context = get_test_context(true); + + let client = unsafe { get_uc_commit_client(context, test_commit_callback) }; + + let client_arc: Arc = unsafe { client.clone_as_arc() }; + + let request = ClientCommitRequest { + table_id: "test_table_id".to_string(), + table_uri: "s3://bucket/path".to_string(), + commit_info: Some(ClientCommit { + version: 10, + timestamp: 2000000000, + file_name: "00000000000000000010.uuid.json".to_string(), + file_size: 1024, + file_modification_timestamp: 2000000100, + }), + latest_backfilled_version: None, + metadata: None, + protocol: None, + }; + + let result: ApiResult<()> = client_arc.commit(request).await; + + assert!(result.is_err()); + + let context = recover_test_context(context).unwrap(); + + assert!(context.commit_called); + + let error = result.unwrap_err(); + assert!(matches!(error, ApiError::Generic(_))); + if let unity_catalog_delta_client_api::Error::Generic(msg) = error { + assert_eq!(msg, "Test commit failure"); + } + + unsafe { free_uc_commit_client(client) }; + } + + #[test] + fn test_get_uc_committer() { + let client = unsafe { get_uc_commit_client(None, test_commit_callback) }; + + let table_id = "test_table_id"; + let committer = unsafe { + ok_or_panic(get_uc_committer( + client.shallow_copy(), + kernel_string_slice!(table_id), + allocate_err, + )) + }; + + unsafe { + free_uc_commit_client(client); + free_uc_committer(committer); + } + } +} diff --git a/ffi/src/domain_metadata.rs b/ffi/src/domain_metadata.rs index 7f51dd6074..5c6ac9a595 100644 --- a/ffi/src/domain_metadata.rs +++ b/ffi/src/domain_metadata.rs @@ -37,20 +37,68 @@ fn get_domain_metadata_impl( .and_then(|config: String| allocate_fn(kernel_string_slice!(config)))) } +/// Get the domain metadata as an optional string allocated by `AllocatedStringFn` for a specific domain in this snapshot +/// +/// # Safety +/// +/// Caller is responsible for passing in a valid handle +#[no_mangle] +pub unsafe extern "C" fn visit_domain_metadata( + snapshot: Handle, + engine: Handle, + engine_context: NullableCvoid, + visitor: extern "C" fn( + engine_context: NullableCvoid, + domain: KernelStringSlice, + configuration: KernelStringSlice, + ), +) -> ExternResult { + let snapshot = unsafe { snapshot.as_ref() }; + let engine = unsafe { engine.as_ref() }; + + visit_domain_metadata_impl(snapshot, engine, engine_context, visitor) + .into_extern_result(&engine) +} + +fn visit_domain_metadata_impl( + snapshot: &Snapshot, + extern_engine: &dyn ExternEngine, + engine_context: NullableCvoid, + visitor: extern "C" fn( + engine_context: NullableCvoid, + key: KernelStringSlice, + value: KernelStringSlice, + ), +) -> DeltaResult { + let res = snapshot.get_all_domain_metadata(extern_engine.engine().as_ref())?; + res.iter().for_each(|metadata| { + let domain = &metadata.domain(); + let configuration = &metadata.configuration(); + visitor( + engine_context, + kernel_string_slice!(domain), + kernel_string_slice!(configuration), + ); + }); + + Ok(true) +} + #[cfg(test)] mod 
tests { use super::*; use crate::error::KernelError; use crate::ffi_test_utils::{ - allocate_err, allocate_str, assert_extern_result_error_with_message, ok_or_panic, - recover_string, + allocate_err, allocate_str, assert_extern_result_error_with_message, build_snapshot, + ok_or_panic, recover_string, }; - use crate::{engine_to_handle, free_engine, free_snapshot, kernel_string_slice, snapshot}; - use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; - use delta_kernel::engine::default::DefaultEngine; + use crate::{engine_to_handle, free_engine, free_snapshot, kernel_string_slice}; + use delta_kernel::engine::default::DefaultEngineBuilder; + use delta_kernel::object_store::memory::InMemory; use delta_kernel::DeltaResult; - use object_store::memory::InMemory; use serde_json::json; + use std::collections::HashMap; + use std::ptr::NonNull; use std::sync::Arc; use test_utils::add_commit; @@ -58,9 +106,9 @@ mod tests { async fn test_domain_metadata() -> DeltaResult<()> { let storage = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(storage.clone()).build(); let engine = engine_to_handle(Arc::new(engine), allocate_err); - let path = "memory:///"; + let table_root = "memory:///test_table/"; // commit0 // - domain1: not removed @@ -100,7 +148,7 @@ mod tests { .map(|json| json.to_string()) .join("\n"); - add_commit(storage.clone().as_ref(), 0, commit) + add_commit(table_root, storage.clone().as_ref(), 0, commit) .await .unwrap(); @@ -134,10 +182,12 @@ mod tests { .map(|json| json.to_string()) .join("\n"); - add_commit(storage.as_ref(), 1, commit).await.unwrap(); + add_commit(table_root, storage.as_ref(), 1, commit) + .await + .unwrap(); let snapshot = - unsafe { ok_or_panic(snapshot(kernel_string_slice!(path), engine.shallow_copy())) }; + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; let get_domain_metadata_helper = |domain: &str| unsafe { get_domain_metadata( @@ -148,6 +198,8 @@ mod tests { ) }; + // First, we test fetching the domain metadata one-by-one + let domain1 = "domain1"; let res = ok_or_panic(get_domain_metadata_helper(domain1)); assert!(res.is_none()); @@ -158,7 +210,50 @@ mod tests { let domain3 = "delta.domain3"; let res = get_domain_metadata_helper(domain3); - assert_extern_result_error_with_message(res, KernelError::GenericError, "Generic delta kernel error: User DomainMetadata are not allowed to use system-controlled 'delta.*' domain"); + assert_extern_result_error_with_message(res, KernelError::GenericError, Some("Generic delta kernel error: User DomainMetadata are not allowed to use system-controlled 'delta.*' domain")); + + // Secondly, we visit the entire domain metadata + + // Create visitor state + let visitor_state: Box> = Box::default(); + let visitor_state_ptr = Box::into_raw(visitor_state); + + // Test visitor function + extern "C" fn visitor( + state: NullableCvoid, + key: KernelStringSlice, + value: KernelStringSlice, + ) { + let mut collected_metadata = unsafe { + Box::from_raw( + state.unwrap().as_ptr() as *mut std::collections::HashMap + ) + }; + let key: DeltaResult = unsafe { TryFromStringSlice::try_from_slice(&key) }; + let value: DeltaResult = unsafe { TryFromStringSlice::try_from_slice(&value) }; + collected_metadata.insert(key.unwrap(), value.unwrap()); + Box::leak(collected_metadata); + } + + // Visit all (user) domain metadata + let res = unsafe { + ok_or_panic(visit_domain_metadata( + 
snapshot.shallow_copy(), + engine.shallow_copy(), + Some(NonNull::new_unchecked(visitor_state_ptr).cast()), + visitor, + )) + }; + + // Confirm visitor picked up all entries in map + let collected_metadata = unsafe { Box::from_raw(visitor_state_ptr) }; + assert!(res); + assert!(collected_metadata.get("domain1").is_none()); + assert!(collected_metadata.get("delta.domain3").is_none()); + assert_eq!( + collected_metadata.get("domain2").unwrap(), + "domain2_commit1" + ); unsafe { free_snapshot(snapshot) } unsafe { free_engine(engine) } diff --git a/ffi/src/engine_data.rs b/ffi/src/engine_data.rs index 0010bf7dee..621ea7197b 100644 --- a/ffi/src/engine_data.rs +++ b/ffi/src/engine_data.rs @@ -1,5 +1,4 @@ //! EngineData related ffi code - #[cfg(feature = "default-engine-base")] use delta_kernel::arrow; #[cfg(feature = "default-engine-base")] @@ -8,7 +7,7 @@ use delta_kernel::arrow::array::{ ArrayData, RecordBatch, StructArray, }; #[cfg(feature = "default-engine-base")] -use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::engine::arrow_data::{ArrowEngineData, EngineDataArrowExt as _}; #[cfg(feature = "default-engine-base")] use delta_kernel::DeltaResult; use delta_kernel::EngineData; @@ -64,6 +63,16 @@ pub struct ArrowFFIData { pub schema: FFI_ArrowSchema, } +#[cfg(feature = "default-engine-base")] +impl ArrowFFIData { + pub fn empty() -> Self { + Self { + array: FFI_ArrowArray::empty(), + schema: FFI_ArrowSchema::empty(), + } + } +} + // TODO: This should use a callback to avoid having to have the engine free the struct /// Get an [`ArrowFFIData`] to allow binding to the arrow [C Data /// Interface](https://arrow.apache.org/docs/format/CDataInterface.html). This includes the data and @@ -86,11 +95,7 @@ pub unsafe extern "C" fn get_raw_arrow_data( // TODO: This method leaks the returned pointer memory. How will the engine free it? #[cfg(feature = "default-engine-base")] fn get_raw_arrow_data_impl(data: Box) -> DeltaResult<*mut ArrowFFIData> { - let record_batch: delta_kernel::arrow::array::RecordBatch = data - .into_any() - .downcast::() - .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? - .into(); + let record_batch = data.try_into_record_batch()?; let sa: StructArray = record_batch.into(); let array_data: ArrayData = sa.into(); // these call `clone`. is there a way to not copy anything and what exactly are they cloning? 
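Note on the engine_funcs.rs change that follows: `new_expression_evaluator` now returns an `ExternResult` instead of a bare handle, so C callers should check the result tag the same way the read-table example checks the `snapshot`/`scan` results. A minimal sketch only: it assumes the evaluator handle is named `SharedExpressionEvaluator` and that cbindgen generates `ExternResultHandle.../OkHandle...` names following the pattern seen elsewhere in these examples; the real generated names may differ.

    ExternResultHandleSharedExpressionEvaluator eval_res =
        new_expression_evaluator(engine, input_schema, &expr, output_type);
    if (eval_res.tag != OkHandleSharedExpressionEvaluator) {
        print_error("Failed to create expression evaluator.", (Error*)eval_res.err);
        free_error((Error*)eval_res.err);
        return -1;
    }
    SharedExpressionEvaluator* evaluator = eval_res.ok;
    /* ... evaluate expressions against engine data ... */
    free_expression_evaluator(evaluator);
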
diff --git a/ffi/src/engine_funcs.rs b/ffi/src/engine_funcs.rs index 5eb4dacb3f..e1c76a5f89 100644 --- a/ffi/src/engine_funcs.rs +++ b/ffi/src/engine_funcs.rs @@ -156,12 +156,13 @@ pub unsafe extern "C" fn new_expression_evaluator( expression: &Expression, // TODO: Make this a data_type, and give a way for c code to go between schema <-> datatype output_type: Handle, -) -> Handle { +) -> ExternResult> { let engine = unsafe { engine.clone_as_arc() }; let input_schema = unsafe { input_schema.clone_as_arc() }; let output_type: DataType = output_type.as_ref().clone().into(); let expression = Arc::new(expression.clone()); - new_expression_evaluator_impl(engine, input_schema, expression, output_type) + let res = new_expression_evaluator_impl(engine.clone(), input_schema, expression, output_type); + res.into_extern_result(&engine.as_ref()) } fn new_expression_evaluator_impl( @@ -169,13 +170,14 @@ fn new_expression_evaluator_impl( input_schema: SchemaRef, expression: ExpressionRef, output_type: DataType, -) -> Handle { +) -> DeltaResult> { let engine = extern_engine.engine(); - let evaluator = - engine - .evaluation_handler() - .new_expression_evaluator(input_schema, expression, output_type); - evaluator.into() + let evaluator = engine.evaluation_handler().new_expression_evaluator( + input_schema, + expression, + output_type, + )?; + Ok(evaluator.into()) } /// Free an expression evaluator @@ -215,6 +217,7 @@ fn evaluate_expression_impl( #[cfg(test)] mod tests { use super::{free_expression_evaluator, new_expression_evaluator}; + use crate::ffi_test_utils::ok_or_panic; use crate::{free_engine, handle::Handle, tests::get_default_engine, SharedSchema}; use delta_kernel::{ schema::{DataType, StructField, StructType}, @@ -232,12 +235,13 @@ mod tests { let output_type: Handle = in_schema.clone().into(); let in_schema_handle: Handle = in_schema.into(); unsafe { - let evaluator = new_expression_evaluator( + let result = new_expression_evaluator( engine.shallow_copy(), in_schema_handle.shallow_copy(), &expr, output_type.shallow_copy(), ); + let evaluator = ok_or_panic(result); in_schema_handle.drop_handle(); output_type.drop_handle(); free_engine(engine); diff --git a/ffi/src/expressions/engine_visitor.rs b/ffi/src/expressions/engine_visitor.rs index 2387c23df5..61d8db4a50 100644 --- a/ffi/src/expressions/engine_visitor.rs +++ b/ffi/src/expressions/engine_visitor.rs @@ -3,14 +3,15 @@ use crate::expressions::{ SharedExpression, SharedOpaqueExpressionOp, SharedOpaquePredicateOp, SharedPredicate, }; -use crate::{handle::Handle, kernel_string_slice, KernelStringSlice}; +use crate::{handle::Handle, kernel_string_slice, KernelStringSlice, SharedSchema}; use delta_kernel::expressions::{ ArrayData, BinaryExpression, BinaryExpressionOp, BinaryPredicate, BinaryPredicateOp, ColumnName, Expression, ExpressionRef, JunctionPredicate, JunctionPredicateOp, MapData, - OpaqueExpression, OpaqueExpressionOpRef, OpaquePredicate, OpaquePredicateOpRef, Predicate, - Scalar, StructData, Transform, UnaryExpression, UnaryExpressionOp, UnaryPredicate, - UnaryPredicateOp, VariadicExpression, VariadicExpressionOp, + MapToStructExpression, OpaqueExpression, OpaqueExpressionOpRef, OpaquePredicate, + OpaquePredicateOpRef, ParseJsonExpression, Predicate, Scalar, StructData, Transform, + UnaryExpression, UnaryExpressionOp, UnaryPredicate, UnaryPredicateOp, VariadicExpression, + VariadicExpressionOp, }; use std::ffi::c_void; @@ -22,6 +23,12 @@ type VisitVariadicFn = extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: 
usize); type VisitJunctionFn = extern "C" fn(data: *mut c_void, sibling_list_id: usize, child_list_id: usize); +type VisitParseJsonFn = extern "C" fn( + data: *mut c_void, + sibling_list_id: usize, + child_list_id: usize, + output_schema: Handle, +); /// The [`EngineExpressionVisitor`] defines a visitor system to allow engines to build their own /// representation of a kernel expression or predicate. @@ -138,6 +145,14 @@ pub struct EngineExpressionVisitor { /// Visits the `ToJson` unary operator belonging to the list identified by `sibling_list_id`. /// The sub-expression will be in a _one_ item list identified by `child_list_id` pub visit_to_json: VisitUnaryFn, + /// Visits the `ParseJson` expression belonging to the list identified by `sibling_list_id`. + /// The sub-expression (JSON string) will be in a _one_ item list identified by `child_list_id`. + /// The `output_schema` handle specifies the schema to parse the JSON into. + pub visit_parse_json: VisitParseJsonFn, + /// Visits the `MapToStruct` expression belonging to the list identified by `sibling_list_id`. + /// The sub-expression (map column) will be in a _one_ item list identified by `child_list_id`. + /// The output struct schema is determined by the evaluator's result type. + pub visit_map_to_struct: VisitUnaryFn, /// Visits the `LessThan` binary operator belonging to the list identified by `sibling_list_id`. /// The operands will be in a _two_ item list identified by `child_list_id` pub visit_lt: VisitBinaryFn, @@ -317,7 +332,6 @@ fn visit_expression_array( array: &ArrayData, sibling_list_id: usize, ) { - #[allow(deprecated)] let elements = array.array_elements(); let child_list_id = call!(visitor, make_field_list, elements.len()); for scalar in elements { @@ -590,7 +604,7 @@ fn visit_expression_impl( match expression { Expression::Literal(scalar) => visit_expression_scalar(visitor, scalar, sibling_list_id), Expression::Column(name) => visit_expression_column(visitor, name, sibling_list_id), - Expression::Struct(exprs) => visit_expression_struct(visitor, exprs, sibling_list_id), + Expression::Struct(exprs, _) => visit_expression_struct(visitor, exprs, sibling_list_id), Expression::Transform(transform) => { visit_expression_transform(visitor, transform, sibling_list_id) } @@ -628,6 +642,26 @@ fn visit_expression_impl( Expression::Opaque(OpaqueExpression { op, exprs }) => { visit_expression_opaque(visitor, op, exprs, sibling_list_id) } + Expression::ParseJson(ParseJsonExpression { + json_expr, + output_schema, + }) => { + let child_list_id = call!(visitor, make_field_list, 1); + visit_expression_impl(visitor, json_expr, child_list_id); + let schema_handle = Handle::from(output_schema.clone()); + call!( + visitor, + visit_parse_json, + sibling_list_id, + child_list_id, + schema_handle + ); + } + Expression::MapToStruct(MapToStructExpression { map_expr }) => { + let child_list_id = call!(visitor, make_field_list, 1); + visit_expression_impl(visitor, map_expr, child_list_id); + call!(visitor, visit_map_to_struct, sibling_list_id, child_list_id); + } Expression::Unknown(name) => visit_unknown(visitor, sibling_list_id, name), } } diff --git a/ffi/src/expressions/kernel_visitor.rs b/ffi/src/expressions/kernel_visitor.rs index f912acf986..5102092032 100644 --- a/ffi/src/expressions/kernel_visitor.rs +++ b/ffi/src/expressions/kernel_visitor.rs @@ -1,15 +1,21 @@ //! Defines [`KernelExpressionVisitorState`]. This is a visitor that can be used to convert an //! 
engine's native expressions into kernel's [`Expression`] and [`Predicate`] types. -use crate::{ - AllocateErrorFn, EngineIterator, ExternResult, IntoExternResult, KernelStringSlice, - ReferenceSet, TryFromStringSlice, -}; +use std::sync::Arc; + use delta_kernel::expressions::{ BinaryExpressionOp, BinaryPredicateOp, ColumnName, Expression, Predicate, Scalar, UnaryPredicateOp, }; use delta_kernel::DeltaResult; +use crate::expressions::{SharedExpression, SharedPredicate}; +use crate::handle::Handle; +use crate::scan::{EngineExpression, EnginePredicate}; +use crate::{ + AllocateErrorFn, EngineIterator, ExternResult, IntoExternResult, KernelStringSlice, + ReferenceSet, TryFromStringSlice, +}; + pub(crate) enum ExpressionOrPredicate { Expression(Expression), Predicate(Predicate), @@ -266,7 +272,6 @@ fn visit_expression_literal_string_impl( } // We need to get parse.expand working to be able to macro everything below, see issue #255 - #[no_mangle] pub extern "C" fn visit_expression_literal_int( state: &mut KernelExpressionVisitorState, @@ -331,3 +336,195 @@ pub extern "C" fn visit_expression_literal_date( ) -> usize { wrap_expression(state, Expression::literal(Scalar::Date(value))) } + +/// visit a timestamp literal expression 'value' (i64 representing microseconds since unix epoch) +#[no_mangle] +pub extern "C" fn visit_expression_literal_timestamp( + state: &mut KernelExpressionVisitorState, + value: i64, +) -> usize { + wrap_expression(state, Expression::literal(Scalar::Timestamp(value))) +} + +/// visit a timestamp_ntz literal expression 'value' (i64 representing microseconds since unix epoch) +#[no_mangle] +pub extern "C" fn visit_expression_literal_timestamp_ntz( + state: &mut KernelExpressionVisitorState, + value: i64, +) -> usize { + wrap_expression(state, Expression::literal(Scalar::TimestampNtz(value))) +} + +/// visit a binary literal expression +/// +/// # Safety +/// The caller must ensure that `value` points to a valid array of at least `len` bytes. +#[no_mangle] +pub unsafe extern "C" fn visit_expression_literal_binary( + state: &mut KernelExpressionVisitorState, + value: *const u8, + len: usize, +) -> usize { + let bytes = std::slice::from_raw_parts(value, len); + wrap_expression(state, Expression::literal(Scalar::Binary(bytes.to_vec()))) +} + +/// visit a decimal literal expression +/// +/// Returns an error if the precision/scale combination is invalid. +#[no_mangle] +pub extern "C" fn visit_expression_literal_decimal( + state: &mut KernelExpressionVisitorState, + value_hi: u64, + value_lo: u64, + precision: u8, + scale: u8, + allocate_error: AllocateErrorFn, +) -> ExternResult { + // SAFETY: The allocate_error function pointer is provided by the engine and assumed valid. + unsafe { + visit_expression_literal_decimal_impl(state, value_hi, value_lo, precision, scale) + .into_extern_result(&allocate_error) + } +} + +fn visit_expression_literal_decimal_impl( + state: &mut KernelExpressionVisitorState, + value_hi: u64, + value_lo: u64, + precision: u8, + scale: u8, +) -> DeltaResult { + // Reconstruct the i128 from two u64 parts + let value = ((value_hi as i128) << 64) | (value_lo as i128); + let decimal = Scalar::decimal(value, precision, scale)?; + Ok(wrap_expression(state, Expression::literal(decimal))) +} + +/// Visit a null literal expression. +/// +/// Returns an error because NULL literal reconstruction is not supported - type information +/// is lost when converting from kernel to engine format, so we cannot faithfully reconstruct +/// the original NULL literal. 
+#[no_mangle] +pub extern "C" fn visit_expression_literal_null( + _state: &mut KernelExpressionVisitorState, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let err = delta_kernel::Error::generic("NULL literal reconstruction is not supported"); + // SAFETY: The allocate_error function pointer is provided by the engine and assumed valid. + unsafe { Err(err).into_extern_result(&allocate_error) } +} + +#[no_mangle] +pub extern "C" fn visit_predicate_distinct( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_predicate_binary(state, BinaryPredicateOp::Distinct, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_predicate_in( + state: &mut KernelExpressionVisitorState, + a: usize, + b: usize, +) -> usize { + visit_predicate_binary(state, BinaryPredicateOp::In, a, b) +} + +#[no_mangle] +pub extern "C" fn visit_predicate_or( + state: &mut KernelExpressionVisitorState, + children: &mut EngineIterator, +) -> usize { + use delta_kernel::expressions::JunctionPredicateOp; + let result = Predicate::junction( + JunctionPredicateOp::Or, + children.flat_map(|child| unwrap_kernel_predicate(state, child as usize)), + ); + wrap_predicate(state, result) +} + +#[no_mangle] +pub extern "C" fn visit_expression_struct( + state: &mut KernelExpressionVisitorState, + children: &mut EngineIterator, +) -> usize { + let exprs: Vec = children + .flat_map(|child| unwrap_kernel_expression(state, child as usize)) + .collect(); + wrap_expression(state, Expression::struct_from(exprs)) +} + +/// Visit a MapToStruct expression. The `child_expr` is the map expression. +#[no_mangle] +pub extern "C" fn visit_expression_map_to_struct( + state: &mut KernelExpressionVisitorState, + child_expr: usize, +) -> usize { + unwrap_kernel_expression(state, child_expr).map_or(0, |expr| { + wrap_expression(state, Expression::map_to_struct(expr)) + }) +} + +/// Convert an engine expression to a kernel expression using the visitor +/// pattern. +/// +/// # Safety +/// +/// Caller must ensure that `engine_expression` points to a valid +/// `EngineExpression` with a valid visitor function and expression pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_engine_expression( + engine_expression: &mut EngineExpression, + allocate_error: AllocateErrorFn, +) -> ExternResult> { + visit_engine_expression_impl(engine_expression).into_extern_result(&allocate_error) +} + +fn visit_engine_expression_impl( + engine_expression: &mut EngineExpression, +) -> DeltaResult> { + let mut visitor_state = KernelExpressionVisitorState::default(); + let expr_id = (engine_expression.visitor)(engine_expression.expression, &mut visitor_state); + + let expr = unwrap_kernel_expression(&mut visitor_state, expr_id).ok_or_else(|| { + delta_kernel::Error::generic(format!( + "Invalid expression ID {expr_id} returned from engine visitor" + )) + })?; + + Ok(Arc::new(expr).into()) +} + +/// Convert an engine predicate to a kernel predicate using the visitor +/// pattern. +/// +/// # Safety +/// +/// Caller must ensure that `engine_predicate` points to a valid +/// `EnginePredicate` with a valid visitor function and predicate pointer. 
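// Illustrative engine-side wiring for `visit_engine_expression` above (a sketch, not part
// of this patch). The engine hands the kernel an opaque pointer to its native expression
// plus a visitor callback; the callback rebuilds the expression through the
// `visit_expression_*` functions in this module and returns the id of the root node it
// registered. Here the "native expression" is assumed to be just an i32 literal.
extern "C" fn example_expression_visitor(
    expression: *mut std::os::raw::c_void,
    state: &mut KernelExpressionVisitorState,
) -> usize {
    // SAFETY: assumes `expression` really points at an i32 owned by the engine.
    let value = unsafe { *(expression as *const i32) };
    visit_expression_literal_int(state, value)
}
// The engine would then pass an `EngineExpression` whose `expression` field is that pointer
// and whose `visitor` field is `example_expression_visitor` to `visit_engine_expression`,
// which drives the callback and wraps the resulting kernel expression in a handle.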
+#[no_mangle] +pub unsafe extern "C" fn visit_engine_predicate( + engine_predicate: &mut EnginePredicate, + allocate_error: AllocateErrorFn, +) -> ExternResult> { + visit_engine_predicate_impl(engine_predicate).into_extern_result(&allocate_error) +} + +fn visit_engine_predicate_impl( + engine_predicate: &mut EnginePredicate, +) -> DeltaResult> { + let mut visitor_state = KernelExpressionVisitorState::default(); + let pred_id = (engine_predicate.visitor)(engine_predicate.predicate, &mut visitor_state); + + let pred = unwrap_kernel_predicate(&mut visitor_state, pred_id).ok_or_else(|| { + delta_kernel::Error::generic(format!( + "Invalid predicate ID {pred_id} returned from engine visitor" + )) + })?; + + Ok(Arc::new(pred).into()) +} diff --git a/ffi/src/ffi_test_utils.rs b/ffi/src/ffi_test_utils.rs index 0508c7e732..0fcf644ace 100644 --- a/ffi/src/ffi_test_utils.rs +++ b/ffi/src/ffi_test_utils.rs @@ -1,9 +1,23 @@ //! Utility functions used for tests in this crate. -use crate::error::{EngineError, ExternResult, KernelError}; -use crate::{KernelStringSlice, NullableCvoid, TryFromStringSlice}; use std::os::raw::c_void; use std::ptr::NonNull; +#[cfg(test)] +use std::sync::Arc; + +use crate::error::{EngineError, ExternResult, KernelError}; +#[cfg(test)] +use crate::{ + engine_to_handle, get_snapshot_builder, kernel_string_slice, snapshot_builder_build, + SharedExternEngine, SharedSnapshot, +}; +use crate::{KernelStringSlice, NullableCvoid, TryFromStringSlice}; +#[cfg(test)] +use delta_kernel::engine::default::DefaultEngineBuilder; +#[cfg(test)] +use delta_kernel::object_store::memory::InMemory; +#[cfg(test)] +use test_utils::add_commit; // Used to allocate EngineErrors with test information from Rust tests #[cfg(test)] @@ -56,19 +70,52 @@ pub(crate) fn ok_or_panic(result: ExternResult) -> T { } } +/// Build a latest-version snapshot via the FFI builder API. Panics on error. +#[cfg(test)] +pub(crate) unsafe fn build_snapshot( + path: KernelStringSlice, + engine: crate::handle::Handle, +) -> crate::handle::Handle { + let builder = ok_or_panic(get_snapshot_builder(path, engine)); + ok_or_panic(snapshot_builder_build(builder)) +} + +/// Create an in-memory engine and snapshot from the given commit data. Returns +/// `(engine_handle, snapshot_handle)` -- the caller must free both when done. 
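// Hypothetical usage of the helper documented above (a sketch only; assumes the same items
// the surrounding tests already use, e.g. `actions_to_string`, `TestAction`, `version`,
// `free_snapshot`, and `free_engine`):
#[tokio::test]
async fn example_setup_snapshot_usage() -> Result<(), Box<dyn std::error::Error>> {
    let (engine, snapshot) =
        setup_snapshot(actions_to_string(vec![TestAction::Metadata])).await?;
    assert_eq!(unsafe { version(snapshot.shallow_copy()) }, 0);
    unsafe { free_snapshot(snapshot) };
    unsafe { free_engine(engine) };
    Ok(())
}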
+#[cfg(test)] +pub(crate) async fn setup_snapshot( + commit_data: String, +) -> Result< + ( + crate::handle::Handle, + crate::handle::Handle, + ), + Box, +> { + let table_root = "memory:///"; + let storage = Arc::new(InMemory::new()); + add_commit(table_root, storage.as_ref(), 0, commit_data).await?; + let engine = DefaultEngineBuilder::new(storage.clone()).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + let snap = unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; + Ok((engine, snap)) +} + /// Check error type and message while also recovering the error to prevent leaks pub(crate) fn assert_extern_result_error_with_message( res: ExternResult, expected_etype: KernelError, - expected_message: &str, + opt_message: Option<&str>, ) { match res { ExternResult::Err(e) => { let error = unsafe { recover_error(e) }; assert_eq!(error.etype, expected_etype); - assert_eq!(error.message, expected_message); + if let Some(expected_message) = opt_message { + assert_eq!(error.message, expected_message); + } } - _ => panic!("Expected error of type '{expected_etype:?}' and message '{expected_message}'"), + _ => panic!("Expected error of type '{expected_etype:?}' and message '{opt_message:?}'"), } } @@ -107,18 +154,15 @@ mod tests { assert!( panic_str.contains("Got engine error with type"), - "Panic message should contain 'Got engine error with type', got: {}", - panic_str + "Panic message should contain 'Got engine error with type', got: {panic_str}" ); assert!( panic_str.contains("GenericError"), - "Panic message should contain error type 'GenericError', got: {}", - panic_str + "Panic message should contain error type 'GenericError', got: {panic_str}" ); assert!( panic_str.contains(message), - "Panic message should contain error message 'Test error message', got: {}", - panic_str + "Panic message should contain error message 'Test error message', got: {panic_str}" ); } diff --git a/ffi/src/ffi_tracing.rs b/ffi/src/ffi_tracing.rs index a84f8224fc..231d59332f 100644 --- a/ffi/src/ffi_tracing.rs +++ b/ffi/src/ffi_tracing.rs @@ -1,9 +1,11 @@ //! 
FFI functions to allow engines to receive log and tracing events from kernel +use std::sync::LazyLock; use std::sync::{Arc, Mutex}; use std::{fmt, io}; use delta_kernel::{DeltaResult, Error}; +use tracing::{error, warn}; use tracing::{ field::{Field as TracingField, Visit}, Event as TracingEvent, Subscriber, @@ -242,7 +244,7 @@ impl Visit for MessageFieldVisitor { } struct EventLayer { - callback: TracingEventFn, + callback: Arc>, } impl Layer for EventLayer @@ -267,25 +269,140 @@ where line: metadata.line().unwrap_or(0), file: kernel_string_slice!(file), }; - (self.callback)(event); + if let Ok(cb) = self.callback.lock() { + (cb)(event); + } else { + error!("Failed to lock event callback (mutex poisoned)."); + } } } } -fn get_event_dispatcher(callback: TracingEventFn, max_level: Level) -> tracing_core::Dispatch { +struct GlobalTracingState { + dispatch: Option, + reload_handle: + Option>, + /// callback for event subscriber + event_callback: Option>>, + /// callback for log line subscriber + log_line_callback: Option>>, +} + +impl GlobalTracingState { + fn uninitialized() -> Self { + GlobalTracingState { + dispatch: None, + reload_handle: None, + event_callback: None, + log_line_callback: None, + } + } + + fn register_event_callback( + &mut self, + callback: TracingEventFn, + max_level: Level, + ) -> DeltaResult<()> { + if !max_level.is_valid() { + return Err(Error::generic("max_level out of range")); + } + + if let (Some(reload), Some(event_cb)) = (&self.reload_handle, &self.event_callback) { + let mut event_cb = event_cb.lock().map_err(|_e| { + Error::generic("Failed to acquire lock for event callback (mutex poisoned).") + })?; + *event_cb = callback; + return reload.reload(LevelFilter::from(max_level)).map_err(|e| { + warn!("Failed to reload tracing level: {e}"); + Error::generic(format!("Unable to reload subscriber: {e}")) + }); + } + + let (dispatch, reload_handle, event_callback) = create_event_dispatch(callback, max_level); + set_global_default(dispatch.clone())?; + self.dispatch = Some(dispatch); + self.reload_handle = Some(reload_handle); + self.event_callback = Some(event_callback); + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + fn register_log_line_callback( + &mut self, + callback: TracingLogLineFn, + max_level: Level, + format: LogLineFormat, + ansi: bool, + with_time: bool, + with_level: bool, + with_target: bool, + ) -> DeltaResult<()> { + if !max_level.is_valid() { + return Err(Error::generic("max_level out of range")); + } + + if let (Some(reload), Some(log_cb)) = (&self.reload_handle, &self.log_line_callback) { + let mut log_cb = log_cb.lock().map_err(|_e| { + Error::generic("Failed to acquire lock for log callback (mutex poisoned).") + })?; + *log_cb = callback; + return reload.reload(LevelFilter::from(max_level)).map_err(|e| { + warn!("Failed to reload log level: {e}"); + Error::generic(format!("Unable to reload subscriber: {e}")) + }); + } + + let (dispatch, reload_handle, log_line_callback) = create_log_line_dispatch( + callback, + max_level, + format, + ansi, + with_time, + with_level, + with_target, + ); + set_global_default(dispatch.clone())?; + self.dispatch = Some(dispatch); + self.reload_handle = Some(reload_handle); + self.log_line_callback = Some(log_line_callback); + Ok(()) + } +} + +static TRACING_STATE: LazyLock> = + LazyLock::new(|| Mutex::new(GlobalTracingState::uninitialized())); + +fn create_event_dispatch( + callback: TracingEventFn, + max_level: Level, +) -> ( + tracing_core::Dispatch, + tracing_subscriber::reload::Handle, + Arc>, +) { use 
tracing_subscriber::{layer::SubscriberExt, registry::Registry}; - let filter: LevelFilter = max_level.into(); - let event_layer = EventLayer { callback }.with_filter(filter); + + let callback_arc = Arc::new(Mutex::new(callback)); + let (filter_layer, reload_handle) = + tracing_subscriber::reload::Layer::new(LevelFilter::from(max_level)); + let event_layer = EventLayer { + callback: callback_arc.clone(), + } + .with_filter(filter_layer); + let subscriber = Registry::default().with(event_layer); - tracing_core::Dispatch::new(subscriber) + ( + tracing_core::Dispatch::new(subscriber), + reload_handle, + callback_arc, + ) } fn setup_event_subscriber(callback: TracingEventFn, max_level: Level) -> DeltaResult<()> { - if !max_level.is_valid() { - return Err(Error::generic("max_level out of range")); - } - let dispatch = get_event_dispatcher(callback, max_level); - set_global_default(dispatch) + let mut state = TRACING_STATE + .lock() + .map_err(|_e| Error::generic("Poisoned mutex while setting up event subscriber"))?; + state.register_event_callback(callback, max_level) } // utility code below for setting up the tracing subscriber for log lines @@ -294,7 +411,7 @@ type SharedBuffer = Arc>>; struct TriggerLayer { buf: SharedBuffer, - callback: TracingLogLineFn, + callback: Arc>, } impl Layer for TriggerLayer @@ -306,13 +423,21 @@ where Ok(mut buf) => { let message = String::from_utf8_lossy(&buf); let message = kernel_string_slice!(message); - (self.callback)(message); + if let Ok(cb) = self.callback.lock() { + (cb)(message); + } else { + error!("Failed to lock event callback (mutex poisoned)."); + } buf.clear(); } Err(_) => { let message = "INTERNAL KERNEL ERROR: Could not lock message buffer."; let message = kernel_string_slice!(message); - (self.callback)(message); + if let Ok(cb) = self.callback.lock() { + (cb)(message); + } else { + error!("Failed to lock event callback (mutex poisoned)."); + } } } } @@ -347,7 +472,7 @@ impl<'a> MakeWriter<'a> for BufferedMessageWriter { } } -fn get_log_line_dispatch( +fn create_log_line_dispatch( callback: TracingLogLineFn, max_level: Level, format: LogLineFormat, @@ -355,34 +480,47 @@ fn get_log_line_dispatch( with_time: bool, with_level: bool, with_target: bool, -) -> tracing_core::Dispatch { +) -> ( + tracing_core::Dispatch, + tracing_subscriber::reload::Handle, + Arc>, +) { use tracing_subscriber::{layer::SubscriberExt, registry::Registry}; + let buffer = Arc::new(Mutex::new(vec![])); let writer = BufferedMessageWriter { current_buffer: buffer.clone(), }; + let fmt_layer = tracing_subscriber::fmt::layer() .with_writer(writer) .with_ansi(ansi) .with_level(with_level) .with_target(with_target); - let filter: LevelFilter = max_level.into(); + + let (filter_layer, reload_handle) = + tracing_subscriber::reload::Layer::new(LevelFilter::from(max_level)); + + let callback_arc = Arc::new(Mutex::new(callback)); let tracking_layer = TriggerLayer { buf: buffer.clone(), - callback, + callback: callback_arc.clone(), }; - // This repeats some code, but avoids some insane generic wrangling if we try to abstract the - // type of `fmt_layer` over the formatter macro_rules! 
setup_subscriber { ($($transform:ident()).*) => {{ - let fmt_layer = fmt_layer$(.$transform())*.with_filter(filter); let subscriber = Registry::default() + .with(filter_layer) .with(fmt_layer) - .with(tracking_layer.with_filter(filter)); - tracing_core::Dispatch::new(subscriber) + .with(tracking_layer); + ( + tracing_core::Dispatch::new(subscriber), + reload_handle, + callback_arc.clone(), + ) }}; } + use LogLineFormat::*; match (format, with_time) { (FULL, true) => setup_subscriber!(), @@ -405,10 +543,10 @@ fn setup_log_line_subscriber( with_level: bool, with_target: bool, ) -> DeltaResult<()> { - if !max_level.is_valid() { - return Err(Error::generic("max_level out of range")); - } - let dispatch = get_log_line_dispatch( + let mut state = TRACING_STATE + .lock() + .map_err(|_e| Error::generic("Poisoned mutex while setting up log_line_subscriber"))?; + state.register_log_line_callback( callback, max_level, format, @@ -416,15 +554,16 @@ fn setup_log_line_subscriber( with_time, with_level, with_target, - ); - set_global_default(dispatch) + ) } #[cfg(test)] mod tests { use std::sync::LazyLock; + use tracing::debug; use tracing::info; + use tracing::trace; use tracing_subscriber::fmt::time::FormatTime; use crate::TryFromStringSlice; @@ -436,23 +575,42 @@ mod tests { static TEST_LOCK: LazyLock> = LazyLock::new(|| Mutex::new(())); static MESSAGES: Mutex>> = Mutex::new(None); - extern "C" fn record_callback(line: KernelStringSlice) { - let s: &str = unsafe { TryFromStringSlice::try_from_slice(&line).unwrap() }; - let s = s.to_string(); - let mut lock = MESSAGES.lock().unwrap(); - if let Some(ref mut msgs) = *lock { - msgs.push(s); + fn record_callback_with_filter(line: KernelStringSlice, expected_log_lines: Vec<&str>) { + let line_str: &str = unsafe { TryFromStringSlice::try_from_slice(&line).unwrap() }; + let line_str = line_str.to_string(); + let ok = expected_log_lines.is_empty() + || expected_log_lines + .iter() + .any(|expected_log_line| line_str.ends_with(expected_log_line)); + if ok { + let mut lock = MESSAGES.lock().unwrap(); + if let Some(ref mut msgs) = *lock { + msgs.push(line_str); + } } } + // Note: record callbacks must be extern "C". Thus we cannot construct test callback closures in runtime. + extern "C" fn record_callback_with_filter_1(line: KernelStringSlice) { + record_callback_with_filter(line, vec!["Testing 1\n", "Another line\n"]) + } + + extern "C" fn record_callback_with_filter_2(line: KernelStringSlice) { + record_callback_with_filter(line, vec!["Testing 2\n", "Yet another line\n"]) + } + fn setup_messages() { *MESSAGES.lock().unwrap() = Some(vec![]); } - // get the string that we should ensure is in log messages for the time. If current time seconds - // is >= 50, return None because the minute might roll over before we actually log which would - // invalidate this check - fn get_time_test_str() -> Option { + /// Format the current time as a string using the same formatter that tracing uses, trimmed + /// to hours, minutes, and seconds (e.g. `"2026-03-10T14:32:45"`). This is used by tests to + /// verify that log output contains a reasonable timestamp. + /// + /// Must be called both before and after logging to bracket the log's timestamp -- a context + /// switch between capturing and logging can cross a second boundary, so the test asserts + /// that at least one of the two captured timestamps appears in the output. 
+ fn get_time_test_str() -> String { #[derive(Default)] struct W { s: String, @@ -469,71 +627,112 @@ mod tests { let now = tracing_subscriber::fmt::time::SystemTime; now.format_time(&mut writer).unwrap(); let tstr = w.s; - if tstr.len() < 19 { - return None; - } - let secs: u32 = tstr[17..19].parse().expect("Failed to parse secs"); - if secs >= 50 { - // risk of roll-over, don't check - return None; + assert!(tstr.len() >= 19, "Unexpected time format: {tstr:?}"); + // Trim to hours, minutes, and seconds + tstr[..19].to_string() + } + + /// Check that logged messages match the expected lines, level, and timestamp. + /// + /// Timestamps are captured before and after logging to handle second-boundary races: a + /// context switch between capturing the time and emitting the log line can cause the + /// seconds to differ. By bracketing with a range check (`time_before <= log_time <= + /// time_after`), we tolerate any amount of delay between capture and log emission. This + /// works because ISO 8601 timestamps sort lexicographically. + fn check_messages( + expected_lines: Vec<&str>, + time_before: &str, + time_after: &str, + expected_level_str: &str, + ) { + let lock = MESSAGES.lock().unwrap(); + let Some(ref msgs) = *lock else { + panic!("Messages wasn't Some"); + }; + assert_eq!(msgs.len(), expected_lines.len()); + for (got, expect) in msgs.iter().zip(expected_lines) { + assert!(got.ends_with(expect)); + assert!(got.contains(expected_level_str)); + assert!(got.contains("delta_kernel_ffi::ffi_tracing::tests")); + assert_timestamp_in_range(got, time_before, time_after); } - // Trim to just hours and minutes - Some(tstr[..19].to_string()) + } + + /// Assert that the log line contains a timestamp within [time_before, time_after]. + /// + /// Log lines may contain ANSI escape codes before the timestamp, so we locate the + /// timestamp by searching for the `time_before` prefix (up to the seconds). Once found, + /// we extract the full timestamp and do a lexicographic range check. ISO 8601 timestamps + /// are fixed-width and sort lexicographically. + fn assert_timestamp_in_range(log_line: &str, time_before: &str, time_after: &str) { + let len = time_before.len(); + // Search for the date+hour+minute prefix (first 16 chars, e.g. "2026-03-10T14:32") + // to locate the timestamp start. We use 16 chars (excluding seconds) because the + // seconds may differ between time_before and the log line. + let prefix = &time_before[..16]; + let ts_start = log_line + .find(prefix) + .unwrap_or_else(|| panic!("No timestamp found in log line: {log_line:?}")); + let log_time = &log_line[ts_start..ts_start + len]; + assert!( + log_time >= time_before && log_time <= time_after, + "Timestamp {log_time:?} not in expected range [{time_before:?}, {time_after:?}], \ + full log line: {log_line:?}" + ); } // IMPORTANT: This is the only test that should call the actual `extern "C"` function, as we can // only call it once to set the global subscriber. 
Other tests ALL need to use // `get_X_dispatcher` and set it locally using `with_default` #[test] - fn info_logs_with_log_line_tracing() { + fn test_enable_log_line_tracing() { let _lock = TEST_LOCK.lock().unwrap(); setup_messages(); unsafe { - enable_log_line_tracing(record_callback, Level::INFO); + // record_callback_with_filter_1 filters only "Testing 1\n", "Another line\n" + enable_log_line_tracing(record_callback_with_filter_1, Level::INFO); } - let lines = ["Testing 1\n", "Another line\n"]; - let test_time_str = get_time_test_str(); - for line in lines { - // remove final newline which will be added back by logging + let lines = [ + "Testing 1\n", + "Another line\n", + "Testing 2\n", + "Yet another line\n", + ]; + // We registered record_callback_with_filter_1, which filters only the first two lines. + let expected_lines = vec!["Testing 1\n", "Another line\n"]; + let time_before = get_time_test_str(); + for line in &lines { + // Remove final newline which will be added back by logging info!("{}", &line[..(line.len() - 1)]); } - let lock = MESSAGES.lock().unwrap(); - if let Some(ref msgs) = *lock { - assert_eq!(msgs.len(), lines.len()); - for (got, expect) in msgs.iter().zip(lines) { - assert!(got.ends_with(expect)); - assert!(got.contains("INFO")); - assert!(got.contains("delta_kernel_ffi::ffi_tracing::tests")); - if let Some(ref tstr) = test_time_str { - assert!(got.contains(tstr)); - } - } - } else { - panic!("Messages wasn't Some"); - } + let time_after = get_time_test_str(); - // ensure we can't setup again - // do in the same test to ensure ordering - let ok = unsafe { - enable_formatted_log_line_tracing( - record_callback, - Level::TRACE, - LogLineFormat::FULL, - true, // ansi - true, // with_time - true, // with_level - true, // with_target - ) - }; - assert!(!ok, "Should have not set up a second time") + check_messages(expected_lines, &time_before, &time_after, "INFO"); + setup_messages(); + + // Ensure we can setup again with a new callback and a new tracing level + let ok = unsafe { enable_log_line_tracing(record_callback_with_filter_2, Level::DEBUG) }; + assert!(ok, "Failed to set up second time"); + + // Ensure both callback and tracing level are reloaded. + // We registered record_callback_with_filter_2, which filters the other logging lines. 
+ let expected_lines = vec!["Testing 2\n", "Yet another line\n"]; + let time_before = get_time_test_str(); + for line in &lines { + debug!("{}", &line[..(line.len() - 1)]); + // Trace must not be visible in messages, because we changed level to debug + trace!("{}", &line[..(line.len() - 1)]); + } + let time_after = get_time_test_str(); + check_messages(expected_lines, &time_before, &time_after, "DEBUG"); } #[test] fn info_logs_with_formatted_log_line_tracing() { let _lock = TEST_LOCK.lock().unwrap(); setup_messages(); - let dispatch = get_log_line_dispatch( - record_callback, + let (dispatch, _, _) = create_log_line_dispatch( + record_callback_with_filter_1, Level::INFO, LogLineFormat::COMPACT, false, @@ -543,11 +742,12 @@ mod tests { ); tracing_core::dispatcher::with_default(&dispatch, || { let lines = ["Testing 1\n", "Another line\n"]; - let test_time_str = get_time_test_str(); + let time_before = get_time_test_str(); for line in lines { // remove final newline which will be added back by logging info!("{}", &line[..(line.len() - 1)]); } + let time_after = get_time_test_str(); let lock = MESSAGES.lock().unwrap(); if let Some(ref msgs) = *lock { assert_eq!(msgs.len(), lines.len()); @@ -555,9 +755,7 @@ mod tests { assert!(got.ends_with(expect)); assert!(!got.contains("INFO")); assert!(!got.contains("delta_kernel_ffi::ffi_tracing::tests")); - if let Some(ref tstr) = test_time_str { - assert!(got.contains(tstr)); - } + assert_timestamp_in_range(got, &time_before, &time_after); } } else { panic!("Messages wasn't Some"); @@ -565,12 +763,31 @@ mod tests { }) } - static EVENTS_OK: Mutex>> = Mutex::new(None); + static EVENTS_OK: Mutex>> = Mutex::new(None); fn setup_events() { *EVENTS_OK.lock().unwrap() = Some(vec![]); } - extern "C" fn event_callback(event: Event) { + fn events_to_string(events: Vec<(String, tracing::Level)>) -> String { + let events_str = events + .iter() + .map(|(s, lvl)| format!("{s}:{lvl}")) + .collect::>() + .join(", "); + events_str + } + + fn convert_level(level: Level) -> tracing::Level { + match level { + Level::ERROR => tracing::Level::ERROR, + Level::WARN => tracing::Level::WARN, + Level::INFO => tracing::Level::INFO, + Level::DEBUG => tracing::Level::DEBUG, + Level::TRACE => tracing::Level::TRACE, + } + } + + fn event_callback_with_filter(event: Event, expected_log_lines: Vec<&str>) { let msg: &str = unsafe { TryFromStringSlice::try_from_slice(&event.message).unwrap() }; let target: &str = unsafe { TryFromStringSlice::try_from_slice(&event.target).unwrap() }; let file: &str = unsafe { TryFromStringSlice::try_from_slice(&event.file).unwrap() }; @@ -579,13 +796,43 @@ mod tests { use std::path::MAIN_SEPARATOR; let expected_file = format!("ffi{MAIN_SEPARATOR}src{MAIN_SEPARATOR}ffi_tracing.rs"); - let ok = event.level == Level::INFO - && target == "delta_kernel_ffi::ffi_tracing::tests" + let ok = target == "delta_kernel_ffi::ffi_tracing::tests" && file == expected_file - && (msg == "Testing 1" || msg == "Another line"); - let mut lock = EVENTS_OK.lock().unwrap(); - if let Some(ref mut events) = *lock { - events.push(ok); + && expected_log_lines.contains(&msg); + if ok { + let mut lock = EVENTS_OK.lock().unwrap(); + if let Some(ref mut events) = *lock { + events.push((msg.to_string(), convert_level(event.level))); + } + } + } + + extern "C" fn event_callback_with_filter_1(event: Event) { + event_callback_with_filter(event, vec!["Testing 1", "Another line"]) + } + + extern "C" fn event_callback_with_filter_2(event: Event) { + event_callback_with_filter(event, vec!["Testing 
2", "Yet another line"]) + } + + fn check_events(expected_level: tracing::Level, expected_messages: Vec<&str>) { + let lock = EVENTS_OK.lock().unwrap(); + if let Some(ref results) = *lock { + assert!(!results.is_empty(), "No events were captured"); + + assert!( + results.iter().all(|(_msg, lvl)| *lvl == expected_level), + "Not all events were {expected_level}" + ); + let events_str = events_to_string(results.to_vec()); + assert!( + results + .iter() + .all(|(msg, _lvl)| expected_messages.contains(&msg.as_str())), + "Not all messages have expected format: {events_str}" + ) + } else { + panic!("Events wasn't Some"); } } @@ -593,19 +840,55 @@ mod tests { fn trace_event_tracking() { let _lock = TEST_LOCK.lock().unwrap(); setup_events(); - let dispatch = get_event_dispatcher(event_callback, Level::TRACE); + let (dispatch, _filter, _) = + create_event_dispatch(event_callback_with_filter_1, Level::TRACE); tracing_core::dispatcher::with_default(&dispatch, || { let lines = ["Testing 1", "Another line"]; for line in lines { info!("{line}"); } }); - let lock = EVENTS_OK.lock().unwrap(); - if let Some(ref results) = *lock { - assert!(results.iter().all(|x| *x)); - } else { - panic!("Events wasn't Some"); + check_events(tracing::Level::INFO, vec!["Testing 1", "Another line"]); + } + + #[test] + #[ignore] // We cannot run this test if test_enable_log_line_tracing was run before - see comment there, however this test works if run individually. + fn test_enable_event_tracing() { + let _lock = TEST_LOCK.lock().unwrap(); + setup_events(); + unsafe { + // Filters only "Testing 1", "Another line" + enable_event_tracing(event_callback_with_filter_1, Level::INFO); + } + let lines = ["Testing 1", "Another line", "Testing 2", "Yet another line"]; + // We registered record_callback_with_filter_1, which filters the first two logging lines + let expected_lines = vec!["Testing 1", "Another line"]; + for line in &lines { + info!("{}", &line); + } + + check_events(tracing::Level::INFO, expected_lines); + setup_events(); + assert!(EVENTS_OK + .lock() + .unwrap() + .as_ref() + .is_none_or(|v| v.is_empty())); + + // Ensure we can setup again with a new callback and a new tracing level + unsafe { + enable_event_tracing(event_callback_with_filter_2, Level::DEBUG); + }; + + // Ensure both callback and tracing level are reloaded. + // We registered record_callback_with_filter_2, which filters the other logging lines + let expected_lines = vec!["Testing 2", "Yet another line"]; + for line in &lines { + debug!("{}", &line); + // trace must not be visible in messages, because we changed level to debug + trace!("{}", &line); } + check_events(tracing::Level::DEBUG, expected_lines); } #[test] diff --git a/ffi/src/lib.rs b/ffi/src/lib.rs index 47d4881758..e6c8ff0bc0 100644 --- a/ffi/src/lib.rs +++ b/ffi/src/lib.rs @@ -2,19 +2,27 @@ //! //! 
Exposes that an engine needs to call from C/C++ to interface with kernel -#[cfg(feature = "default-engine-base")] -use std::collections::HashMap; +#![deny(clippy::unwrap_used, clippy::expect_used, clippy::panic)] +// we re-allow panics in tests +#![cfg_attr(test, allow(clippy::unwrap_used, clippy::expect_used, clippy::panic))] + use std::default::Default; use std::os::raw::{c_char, c_void}; use std::ptr::NonNull; use std::sync::Arc; use tracing::debug; use url::Url; +#[cfg(feature = "default-engine-base")] +use { + delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor, + std::collections::HashMap, +}; +use delta_kernel::actions::{Metadata, Protocol}; use delta_kernel::schema::Schema; -use delta_kernel::snapshot::Snapshot; -use delta_kernel::Version; -use delta_kernel::{DeltaResult, Engine, EngineData}; +use delta_kernel::snapshot::{Snapshot, SnapshotRef}; +use delta_kernel::LogPath; +use delta_kernel::{DeltaResult, Engine, EngineData, Version}; use delta_kernel_ffi_macros::handle_descriptor; // cbindgen doesn't understand our use of feature flags here, and by default it parses `mod handle` @@ -37,12 +45,18 @@ pub use domain_metadata::get_domain_metadata; pub mod engine_data; pub mod engine_funcs; pub mod error; +#[cfg(feature = "default-engine-base")] +pub mod table_changes; use error::{AllocateError, AllocateErrorFn, ExternResult, IntoExternResult}; +#[cfg(feature = "delta-kernel-unity-catalog")] +pub mod delta_kernel_unity_catalog; pub mod expressions; #[cfg(feature = "tracing")] pub mod ffi_tracing; +pub mod log_path; pub mod scan; pub mod schema; +pub mod schema_visitor; #[cfg(test)] mod ffi_test_utils; @@ -205,6 +219,32 @@ impl<'a> TryFromStringSlice<'a> for &'a str { /// function is that `kernel_str` is _only_ valid until the return from this function pub type AllocateStringFn = extern "C" fn(kernel_str: KernelStringSlice) -> NullableCvoid; +/// An opaque type that rust will understand as a string. This can be obtained by calling +/// [`allocate_kernel_string`] with a [`KernelStringSlice`] +#[handle_descriptor(target=String, mutable=true, sized=true)] +pub struct ExclusiveRustString; + +/// Allow engines to create an opaque pointer that Rust will understand as a String. Returns an +/// error if the slice contains invalid utf-8 data. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid KernelStringSlice +#[no_mangle] +pub unsafe extern "C" fn allocate_kernel_string( + kernel_str: KernelStringSlice, + error_fn: AllocateErrorFn, +) -> ExternResult> { + allocate_kernel_string_impl(kernel_str).into_extern_result(&error_fn) +} + +fn allocate_kernel_string_impl( + kernel_str: KernelStringSlice, +) -> DeltaResult> { + let s = unsafe { String::try_from_slice(&kernel_str) }?; + Ok(Box::new(s).into()) +} + // Put KernelBoolSlice in a sub-module, with non-public members, so rust code cannot instantiate it // directly. It can only be created by converting `From>`. 
mod private { @@ -270,8 +310,10 @@ mod private { let len = val.len(); let boxed = val.into_boxed_slice(); let leaked_ptr = Box::leak(boxed).as_mut_ptr(); + // safety: Box::leak always returns a valid, non-null pointer + #[allow(clippy::expect_used)] let ptr = NonNull::new(leaked_ptr) - .expect("This should never be non-null please report this bug."); + .expect("This should never be null please report this bug."); KernelBoolSlice { ptr, len } } } @@ -324,8 +366,10 @@ mod private { let len = vec.len(); let boxed = vec.into_boxed_slice(); let leaked_ptr = Box::leak(boxed).as_mut_ptr(); + // safety: Box::leak always returns a valid, non-null pointer + #[allow(clippy::expect_used)] let ptr = NonNull::new(leaked_ptr) - .expect("This should never be non-null please report this bug."); + .expect("This should never be null please report this bug."); KernelRowIndexArray { ptr, len } } } @@ -436,6 +480,17 @@ pub struct EngineBuilder { url: Url, allocate_fn: AllocateErrorFn, options: HashMap, + /// Configuration for multithreaded executor. If Some, use a multi-threaded executor + /// If None, use the default single-threaded background executor. + multithreaded_executor_config: Option, +} + +#[cfg(feature = "default-engine-base")] +struct MultithreadedExecutorConfig { + /// Number of worker threads for the tokio runtime. `None` uses Tokio's default. + worker_threads: Option, + /// Maximum number of threads for blocking operations. `None` uses Tokio's default. + max_blocking_threads: Option, } #[cfg(feature = "default-engine-base")] @@ -470,6 +525,7 @@ fn get_engine_builder_impl( url: url?, allocate_fn, options: HashMap::default(), + multithreaded_executor_config: None, }); Ok(Box::into_raw(builder)) } @@ -485,11 +541,46 @@ pub unsafe extern "C" fn set_builder_option( builder: &mut EngineBuilder, key: KernelStringSlice, value: KernelStringSlice, +) -> ExternResult { + set_builder_option_impl(builder, key, value).into_extern_result(&builder.allocate_fn) +} +#[cfg(feature = "default-engine-base")] +fn set_builder_option_impl( + builder: &mut EngineBuilder, + key: KernelStringSlice, + value: KernelStringSlice, +) -> DeltaResult { + let key = unsafe { String::try_from_slice(&key) }?; + let value = unsafe { String::try_from_slice(&value) }?; + builder.set_option(key, value); + Ok(true) +} + +/// Configure the builder to use a multi-threaded executor instead of the default +/// single-threaded background executor. +/// +/// # Parameters +/// - `builder`: The engine builder to configure. +/// - `worker_threads`: Number of worker threads. Pass 0 to use Tokio's default. +/// - `max_blocking_threads`: Maximum number of blocking threads. Pass 0 to use Tokio's default. +/// +/// # Safety +/// +/// Caller must pass a valid EngineBuilder pointer. +#[cfg(feature = "default-engine-base")] +#[no_mangle] +pub unsafe extern "C" fn set_builder_with_multithreaded_executor( + builder: &mut EngineBuilder, + worker_threads: usize, + max_blocking_threads: usize, ) { - let key = unsafe { String::try_from_slice(&key) }; - let value = unsafe { String::try_from_slice(&value) }; - // TODO: Return ExternalError if key or value is invalid? 
(builder has an error allocator) - builder.set_option(key.unwrap(), value.unwrap()); + let worker_threads = (worker_threads != 0).then_some(worker_threads); + let max_blocking_threads = (max_blocking_threads != 0).then_some(max_blocking_threads); + + builder.multithreaded_executor_config = Some(MultithreadedExecutorConfig { + worker_threads, + max_blocking_threads, + }); } /// Consume the builder and return a `default` engine. After calling, the passed pointer is _no @@ -509,6 +600,7 @@ pub unsafe extern "C" fn builder_build( get_default_engine_impl( builder_box.url, builder_box.options, + builder_box.multithreaded_executor_config, builder_box.allocate_fn, ) .into_extern_result(&builder_box.allocate_fn) @@ -533,7 +625,7 @@ fn get_default_default_engine_impl( url: DeltaResult, allocate_error: AllocateErrorFn, ) -> DeltaResult> { - get_default_engine_impl(url?, Default::default(), allocate_error) + get_default_engine_impl(url?, Default::default(), None, allocate_error) } /// Safety @@ -551,20 +643,37 @@ fn engine_to_handle( engine.into() } +/// Build the default engine +/// +/// If `executor_config` is `Some`, uses a multi-threaded executor that owns its runtime. Otherwise, +/// uses the default single-threaded background executor. #[cfg(feature = "default-engine-base")] fn get_default_engine_impl( url: Url, options: HashMap, + executor_config: Option, allocate_error: AllocateErrorFn, ) -> DeltaResult> { - use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; - use delta_kernel::engine::default::DefaultEngine; - let engine = DefaultEngine::::try_new( - &url, - options, - Arc::new(TokioBackgroundExecutor::new()), - ); - Ok(engine_to_handle(Arc::new(engine?), allocate_error)) + use delta_kernel::engine::default::storage::store_from_url_opts; + use delta_kernel::engine::default::DefaultEngineBuilder; + + let store = store_from_url_opts(&url, options)?; + + let engine: Arc = if let Some(config) = executor_config { + let executor = TokioMultiThreadExecutor::new_owned_runtime( + config.worker_threads, + config.max_blocking_threads, + )?; + Arc::new( + DefaultEngineBuilder::new(store) + .with_task_executor(Arc::new(executor)) + .build(), + ) + } else { + Arc::new(DefaultEngineBuilder::new(store).build()) + }; + + Ok(engine_to_handle(engine, allocate_error)) } /// # Safety @@ -582,53 +691,180 @@ pub struct SharedSchema; #[handle_descriptor(target=Snapshot, mutable=false, sized=true)] pub struct SharedSnapshot; -/// Get the latest snapshot from the specified table +#[handle_descriptor(target=Protocol, mutable=false, sized=true)] +pub struct SharedProtocol; + +#[handle_descriptor(target=Metadata, mutable=false, sized=true)] +pub struct SharedMetadata; + +/// Opaque builder for constructing a [`SharedSnapshot`]. +/// +/// Create with [`get_snapshot_builder`] (from a table path) or [`get_snapshot_builder_from`] +/// (incrementally from an existing snapshot). Configure with [`snapshot_builder_set_version`] and +/// [`snapshot_builder_set_log_tail`] (for catalog-managed tables). Finally, +/// call [`snapshot_builder_build`] to consume the builder and obtain the snapshot. If you need to +/// discard the builder without building, call [`free_snapshot_builder`]. +pub struct FfiSnapshotBuilder { + engine: Arc, + source: FfiSnapshotBuilderSource, + version: Option, + log_tail: Vec, +} + +/// An opaque handle with exclusive (Box-like) ownership of a [`FfiSnapshotBuilder`]. 
+#[handle_descriptor(target=FfiSnapshotBuilder, mutable=true, sized=true)] +pub struct MutableFfiSnapshotBuilder; + +enum FfiSnapshotBuilderSource { + TableRoot(Url), + ExistingSnapshot(SnapshotRef), +} + +fn make_snapshot_builder( + source: FfiSnapshotBuilderSource, + engine: Arc, +) -> DeltaResult> { + Ok(Box::new(FfiSnapshotBuilder { + engine, + source, + version: None, + log_tail: Vec::new(), + }) + .into()) +} + +/// Get a builder for creating a [`SharedSnapshot`] from a table path. +/// +/// Use [`snapshot_builder_set_version`] to pin a specific version, then call +/// [`snapshot_builder_build`] to obtain the snapshot. The caller owns the returned handle and must +/// eventually call either [`snapshot_builder_build`] to produce a [`SharedSnapshot`], or +/// [`free_snapshot_builder`] to drop it without building. /// /// # Safety /// -/// Caller is responsible for passing valid handles and path pointer. +/// Caller is responsible for passing a valid path and engine handle. #[no_mangle] -pub unsafe extern "C" fn snapshot( +pub unsafe extern "C" fn get_snapshot_builder( path: KernelStringSlice, engine: Handle, -) -> ExternResult> { +) -> ExternResult> { + let engine_ref = unsafe { engine.as_ref() }; + let engine_arc = unsafe { engine.clone_as_arc() }; let url = unsafe { unwrap_and_parse_path_as_url(path) }; - let engine = unsafe { engine.as_ref() }; - snapshot_impl(url, engine, None).into_extern_result(&engine) + let source = match url { + Ok(url) => FfiSnapshotBuilderSource::TableRoot(url), + Err(e) => return DeltaResult::Err(e).into_extern_result(&engine_ref), + }; + make_snapshot_builder(source, engine_arc).into_extern_result(&engine_ref) } -/// Get the snapshot from the specified table at a specific version +/// Get a builder for incrementally updating an existing snapshot. +/// +/// This avoids re-reading the full log. Use [`snapshot_builder_set_version`] to target a specific +/// version, then call [`snapshot_builder_build`] to obtain the updated snapshot. The caller owns +/// the returned handle and must eventually call either [`snapshot_builder_build`] to produce a +/// [`SharedSnapshot`], or [`free_snapshot_builder`] to drop it without building. /// /// # Safety /// -/// Caller is responsible for passing valid handles and path pointer. +/// Caller is responsible for passing valid handles. #[no_mangle] -pub unsafe extern "C" fn snapshot_at_version( - path: KernelStringSlice, +pub unsafe extern "C" fn get_snapshot_builder_from( + prev_snapshot: Handle, engine: Handle, +) -> ExternResult> { + let engine_ref = unsafe { engine.as_ref() }; + let engine_arc = unsafe { engine.clone_as_arc() }; + let snapshot_arc = unsafe { prev_snapshot.clone_as_arc() }; + make_snapshot_builder( + FfiSnapshotBuilderSource::ExistingSnapshot(snapshot_arc), + engine_arc, + ) + .into_extern_result(&engine_ref) +} + +/// Set the target version on a snapshot builder. When omitted, the snapshot is created at the +/// latest version of the table. +/// +/// # Safety +/// +/// Caller must pass a valid builder pointer. +#[no_mangle] +pub unsafe extern "C" fn snapshot_builder_set_version( + builder: &mut Handle, version: Version, +) { + unsafe { builder.as_mut() }.version = Some(version); +} + +/// Set the log tail on a snapshot builder for catalog-managed tables. +/// +/// # Safety +/// +/// Caller must pass a valid builder pointer. The log_tail array and its contents must remain valid +/// for the duration of this call. 
+#[no_mangle] +pub unsafe extern "C" fn snapshot_builder_set_log_tail( + builder: &mut Handle, + log_tail: log_path::LogPathArray, +) -> ExternResult { + let builder_mut = unsafe { builder.as_mut() }; + let engine_arc = builder_mut.engine.clone(); + let engine_ref = engine_arc.as_ref(); + snapshot_builder_set_log_tail_impl(builder_mut, log_tail).into_extern_result(&engine_ref) +} + +unsafe fn snapshot_builder_set_log_tail_impl( + builder: &mut FfiSnapshotBuilder, + log_tail: log_path::LogPathArray, +) -> DeltaResult { + builder.log_tail = unsafe { log_tail.log_paths() }?; + Ok(true) +} + +/// Consume the builder and return a snapshot. After calling, the builder pointer is _no longer +/// valid_. The builder is always freed by this call, whether or not it succeeds. +/// +/// # Safety +/// +/// Caller must pass a valid builder pointer and must not use it again after this call. +#[no_mangle] +pub unsafe extern "C" fn snapshot_builder_build( + mut builder: Handle, ) -> ExternResult> { - let url = unsafe { unwrap_and_parse_path_as_url(path) }; - let engine = unsafe { engine.as_ref() }; - snapshot_impl(url, engine, version.into()).into_extern_result(&engine) + // Clone the engine Arc before consuming the handle so we can still use it for error reporting + let engine_arc = unsafe { builder.as_mut() }.engine.clone(); + let engine_ref = engine_arc.as_ref(); + let builder_box = unsafe { builder.into_inner() }; + snapshot_builder_build_impl(*builder_box).into_extern_result(&engine_ref) } -fn snapshot_impl( - url: DeltaResult, - extern_engine: &dyn ExternEngine, - version: Option, -) -> DeltaResult> { - let builder = Snapshot::builder_for(url?); - let builder = if let Some(v) = version { - // TODO: should we include a `with_version_opt` method for the builder? - builder.at_version(v) - } else { - builder +fn snapshot_builder_build_impl(builder: FfiSnapshotBuilder) -> DeltaResult> { + let engine = builder.engine.engine(); + let mut rust_builder = match builder.source { + FfiSnapshotBuilderSource::TableRoot(url) => Snapshot::builder_for(url), + FfiSnapshotBuilderSource::ExistingSnapshot(snap) => Snapshot::builder_from(snap), }; - let snapshot = builder.build(extern_engine.engine().as_ref())?; + if let Some(v) = builder.version { + rust_builder = rust_builder.at_version(v); + } + if !builder.log_tail.is_empty() { + rust_builder = rust_builder.with_log_tail(builder.log_tail); + } + let snapshot = rust_builder.build(engine.as_ref())?; Ok(snapshot.into()) } +/// Free a snapshot builder without building a snapshot (e.g. on an error path). +/// +/// # Safety +/// +/// Caller must pass a valid builder pointer and must not use it again after this call. +#[no_mangle] +pub unsafe extern "C" fn free_snapshot_builder(builder: Handle) { + builder.drop_handle(); +} + /// # Safety /// /// Caller is responsible for passing a valid handle. @@ -638,6 +874,35 @@ pub unsafe extern "C" fn free_snapshot(snapshot: Handle) { snapshot.drop_handle(); } +/// Perform a full checkpoint of the specified snapshot using the supplied engine. +/// +/// This writes the checkpoint parquet file and the `_last_checkpoint` file. +/// +// TODO: Expose the updated snapshot via a new FFI function that returns a snapshot handle. +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles. 
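// End-to-end sketch of the snapshot builder and checkpoint APIs (modeled on the
// `test_snapshot` test later in this file, and using the crate's test helper `ok_or_panic`
// for brevity; `table_root` and `engine` are assumed to be a valid table path string and an
// engine handle obtained elsewhere):
let mut builder = unsafe {
    ok_or_panic(get_snapshot_builder(
        kernel_string_slice!(table_root),
        engine.shallow_copy(),
    ))
};
// Optional: pin a specific version instead of the latest.
unsafe { snapshot_builder_set_version(&mut builder, 0) };
// Consumes the builder; it must not be used again after this call.
let snapshot = unsafe { ok_or_panic(snapshot_builder_build(builder)) };
// Write a full checkpoint for the snapshot we just built.
let _checkpointed = unsafe {
    ok_or_panic(checkpoint_snapshot(
        snapshot.shallow_copy(),
        engine.shallow_copy(),
    ))
};
unsafe { free_snapshot(snapshot) };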
+#[no_mangle] +pub unsafe extern "C" fn checkpoint_snapshot( + snapshot: Handle, + engine: Handle, +) -> ExternResult { + let engine_ref = unsafe { engine.as_ref() }; + let snapshot = unsafe { snapshot.clone_as_arc() }; + snapshot_checkpoint_impl(snapshot, engine_ref).into_extern_result(&engine_ref) +} + +fn snapshot_checkpoint_impl( + snapshot: Arc, + extern_engine: &dyn ExternEngine, +) -> DeltaResult { + let (_result, _updated) = snapshot.checkpoint(extern_engine.engine().as_ref())?; + // We ignore the CheckpointWriteResult because both Written and AlreadyExists are non-error + // outcomes at the FFI layer. + Ok(true) +} + /// Get the version of the specified snapshot /// /// # Safety @@ -649,6 +914,30 @@ pub unsafe extern "C" fn version(snapshot: Handle) -> u64 { snapshot.version() } +/// Get the timestamp of the specified snapshot in milliseconds since the Unix epoch. +/// +/// When In-Commit Timestamp (ICT) is enabled, returns the ICT value from the commit's +/// `CommitInfo` action. Otherwise, falls back to the filesystem last-modified time of +/// the latest commit file. +/// +/// Returns an error if the commit file is missing, the ICT configuration is invalid, or the +/// ICT value cannot be read. +/// +/// # Safety +/// +/// Caller is responsible for passing valid snapshot handle and engine handle. +#[no_mangle] +pub unsafe extern "C" fn snapshot_timestamp( + snapshot: Handle, + engine: Handle, +) -> ExternResult { + let engine_ref = unsafe { engine.as_ref() }; + let snapshot = unsafe { snapshot.as_ref() }; + snapshot + .get_timestamp(engine_ref.engine().as_ref()) + .into_extern_result(&engine_ref) +} + /// Get the logical schema of the specified snapshot /// /// # Safety @@ -692,11 +981,7 @@ pub unsafe extern "C" fn snapshot_table_root( #[no_mangle] pub unsafe extern "C" fn get_partition_column_count(snapshot: Handle) -> usize { let snapshot = unsafe { snapshot.as_ref() }; - snapshot - .table_configuration() - .metadata() - .partition_columns() - .len() + snapshot.table_configuration().partition_columns().len() } /// Get an iterator of the list of partition columns for this snapshot. @@ -708,17 +993,187 @@ pub unsafe extern "C" fn get_partition_columns( snapshot: Handle, ) -> Handle { let snapshot = unsafe { snapshot.as_ref() }; - let iter: Box = Box::new( - snapshot - .table_configuration() - .metadata() - .partition_columns() - .clone() - .into_iter(), - ); + // NOTE: Clippy doesn't like it, but we need to_vec+into_iter to decouple lifetimes + let partition_columns = snapshot.table_configuration().partition_columns().to_vec(); + let iter: Box = Box::new(partition_columns.into_iter()); iter.into() } +/// Visit each metadata configuration (key/value pair) for the specified snapshot by invoking the provided +/// `visitor` callback once per entry. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid snapshot handle, a valid `engine_context` as an +/// opaque pointer passed to each `visitor` invocation, and a valid `visitor` function pointer. 
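// Sketch of an engine-side visitor for `visit_metadata_configuration` below (mirrors the
// `collect_property` callback used in this crate's tests; assumes `engine_context` points
// at a std::collections::HashMap<String, String> owned by the engine for the duration of
// the call):
extern "C" fn collect_table_property(
    engine_context: NullableCvoid,
    key: KernelStringSlice,
    value: KernelStringSlice,
) {
    // SAFETY: per the assumption above, the context is a live HashMap<String, String>.
    let map = unsafe {
        &mut *(engine_context.unwrap().as_ptr() as *mut std::collections::HashMap<String, String>)
    };
    let k = unsafe { String::try_from_slice(&key) }.unwrap();
    let v = unsafe { String::try_from_slice(&value) }.unwrap();
    map.insert(k, v);
}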
+#[no_mangle] +pub unsafe extern "C" fn visit_metadata_configuration( + snapshot: Handle, + engine_context: NullableCvoid, + visitor: extern "C" fn( + engine_context: NullableCvoid, + key: KernelStringSlice, + value: KernelStringSlice, + ), +) { + let snapshot = unsafe { snapshot.as_ref() }; + snapshot + .table_configuration() + .metadata() + .configuration() + .iter() + .for_each(|(key, value)| { + visitor( + engine_context, + kernel_string_slice!(key), + kernel_string_slice!(value), + ); + }); +} +// === Protocol handle FFI === + +/// Get the protocol for this snapshot. The returned handle must be freed with [`free_protocol`]. +/// +/// # Safety +/// Caller is responsible for providing a valid snapshot handle. +#[no_mangle] +pub unsafe extern "C" fn snapshot_get_protocol( + snapshot: Handle, +) -> Handle { + let snapshot = unsafe { snapshot.as_ref() }; + Arc::new(snapshot.table_configuration().protocol().clone()).into() +} + +/// Free a protocol handle obtained from [`snapshot_get_protocol`]. +/// +/// # Safety +/// Caller is responsible for providing a valid, non-freed protocol handle. +#[no_mangle] +pub unsafe extern "C" fn free_protocol(protocol: Handle) { + protocol.drop_handle(); +} + +/// Visit all fields of the protocol in a single FFI call. The caller provides: +/// - `visit_versions`: called once with `(context, min_reader_version, min_writer_version)` +/// - `visit_feature`: called once per feature with `(context, is_reader, feature_name)`. +/// `is_reader` is `true` for reader features, `false` for writer features. +/// If the protocol uses legacy versioning (no explicit feature lists), the `visit_feature` +/// callback will not fire. +/// +/// # Safety +/// Caller is responsible for providing a valid protocol handle, a valid `context` pointer, and +/// valid function pointers for `visit_versions` and `visit_feature`. +#[no_mangle] +pub unsafe extern "C" fn visit_protocol( + protocol: Handle, + context: NullableCvoid, + visit_versions: extern "C" fn(context: NullableCvoid, min_reader: i32, min_writer: i32), + visit_feature: extern "C" fn( + context: NullableCvoid, + is_reader: bool, + feature: KernelStringSlice, + ), +) { + let protocol = unsafe { protocol.as_ref() }; + visit_versions( + context, + protocol.min_reader_version(), + protocol.min_writer_version(), + ); + if let Some(features) = protocol.reader_features() { + for f in features { + let name = f.as_ref(); + visit_feature(context, true, kernel_string_slice!(name)); + } + } + if let Some(features) = protocol.writer_features() { + for f in features { + let name = f.as_ref(); + visit_feature(context, false, kernel_string_slice!(name)); + } + } +} + +// === Metadata handle FFI === + +/// Get the metadata for this snapshot. The returned handle must be freed with [`free_metadata`]. +/// +/// # Safety +/// Caller is responsible for providing a valid snapshot handle. +#[no_mangle] +pub unsafe extern "C" fn snapshot_get_metadata( + snapshot: Handle, +) -> Handle { + let snapshot = unsafe { snapshot.as_ref() }; + Arc::new(snapshot.table_configuration().metadata().clone()).into() +} + +/// Free a metadata handle obtained from [`snapshot_get_metadata`]. +/// +/// # Safety +/// Caller is responsible for providing a valid, non-freed metadata handle. +#[no_mangle] +pub unsafe extern "C" fn free_metadata(metadata: Handle) { + metadata.drop_handle(); +} + +/// Visit all fields of the metadata in a single FFI call. 
String fields are passed as +/// [`KernelStringSlice`] references that borrow from the metadata handle -- they are only valid +/// for the duration of the callback. +/// +/// The visitor receives: +/// - `id`: always present +/// - `name`: `OptionalValue::None` if not set +/// - `description`: `OptionalValue::None` if not set +/// - `format_provider`: always present +/// - `has_created_time`: whether `created_time_ms` is meaningful +/// - `created_time_ms`: milliseconds since epoch (only valid when `has_created_time` is true) +/// +/// # Safety +/// Caller is responsible for providing a valid metadata handle, a valid `context` pointer, and +/// a valid `visit_metadata_fields` function pointer. String slices must not be retained past +/// the callback return. +#[no_mangle] +pub unsafe extern "C" fn visit_metadata( + metadata: Handle, + context: NullableCvoid, + visit_metadata_fields: extern "C" fn( + context: NullableCvoid, + id: KernelStringSlice, + name: OptionalValue, + description: OptionalValue, + format_provider: KernelStringSlice, + has_created_time: bool, + created_time_ms: i64, + ), +) { + let metadata = unsafe { metadata.as_ref() }; + let id_str = metadata.id(); + let id = kernel_string_slice!(id_str); + let name = metadata.name().map(|s| kernel_string_slice!(s)).into(); + let description = metadata + .description() + .map(|s| kernel_string_slice!(s)) + .into(); + let fp_str = metadata.format_provider(); + let format_provider = kernel_string_slice!(fp_str); + let (has_created_time, created_time_ms) = match metadata.created_time() { + Some(t) => (true, t), + None => (false, 0), + }; + visit_metadata_fields( + context, + id, + name, + description, + format_provider, + has_created_time, + created_time_ms, + ); +} + +// === Snapshot-level computed property FFI === + type StringIter = dyn Iterator + Send; #[handle_descriptor(target=StringIter, mutable=true, sized=false)] @@ -818,12 +1273,25 @@ mod tests { use super::*; use crate::error::{EngineError, KernelError}; use crate::ffi_test_utils::{ - allocate_err, allocate_str, assert_extern_result_error_with_message, ok_or_panic, - recover_string, + allocate_err, allocate_str, assert_extern_result_error_with_message, build_snapshot, + ok_or_panic, recover_string, setup_snapshot, + }; + use delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor; + use delta_kernel::engine::default::DefaultEngineBuilder; + use delta_kernel::object_store::memory::InMemory; + use delta_kernel::object_store::path::Path; + use delta_kernel::object_store::ObjectStore; + use delta_kernel::schema::StructType; + use rstest::rstest; + use serde_json::Value; + use std::collections::HashMap; + use test_utils::add_staged_commit; + use test_utils::{ + actions_to_string, actions_to_string_partitioned, actions_to_string_with_metadata, + add_commit, create_table, TestAction, METADATA, METADATA_WITH_FEATURES, + METADATA_WITH_TABLE_PROPERTIES, }; - use delta_kernel::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; - use object_store::memory::InMemory; - use test_utils::{actions_to_string, actions_to_string_partitioned, add_commit, TestAction}; + use url::Url; #[no_mangle] extern "C" fn allocate_null_err(_: KernelError, _: KernelStringSlice) -> *mut EngineError { @@ -845,6 +1313,35 @@ mod tests { } } + /// Create an in-memory table with a single version-0 metadata commit, returning the storage, + /// engine handle, and a snapshot at version 0. The caller is responsible for freeing the + /// engine and snapshot handles. 
+ async fn make_engine_and_v0_snapshot( + path: &str, + ) -> Result< + ( + Arc, + Handle, + Handle, + ), + Box, + > { + let storage = Arc::new(InMemory::new()); + add_commit( + path, + storage.as_ref(), + 0, + actions_to_string(vec![TestAction::Metadata]), + ) + .await?; + let engine = engine_to_handle( + Arc::new(DefaultEngineBuilder::new(storage.clone()).build()), + allocate_err, + ); + let snap = unsafe { build_snapshot(kernel_string_slice!(path), engine.shallow_copy()) }; + Ok((storage, engine, snap)) + } + pub(crate) fn get_default_engine(path: &str) -> Handle { let path = kernel_string_slice!(path); let builder = unsafe { ok_or_panic(get_engine_builder(path, allocate_err)) }; @@ -861,43 +1358,41 @@ mod tests { #[tokio::test] async fn test_snapshot() -> Result<(), Box> { - let storage = Arc::new(InMemory::new()); - add_commit( - storage.as_ref(), - 0, - actions_to_string(vec![TestAction::Metadata]), - ) - .await?; - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); - let engine = engine_to_handle(Arc::new(engine), allocate_err); - let path = "memory:///"; + let table_root = "memory:///test_table/"; + let (_, engine, snapshot1) = make_engine_and_v0_snapshot(table_root).await?; // Test getting latest snapshot - let snapshot1 = - unsafe { ok_or_panic(snapshot(kernel_string_slice!(path), engine.shallow_copy())) }; let version1 = unsafe { version(snapshot1.shallow_copy()) }; assert_eq!(version1, 0); // Test getting snapshot at version let snapshot2 = unsafe { - ok_or_panic(snapshot_at_version( - kernel_string_slice!(path), + let mut ptr = ok_or_panic(get_snapshot_builder( + kernel_string_slice!(table_root), engine.shallow_copy(), - 0, - )) + )); + snapshot_builder_set_version(&mut ptr, 0); + ok_or_panic(snapshot_builder_build(ptr)) }; let version2 = unsafe { version(snapshot2.shallow_copy()) }; assert_eq!(version2, 0); // Test getting non-existent snapshot - let snapshot_at_non_existent_version = - unsafe { snapshot_at_version(kernel_string_slice!(path), engine.shallow_copy(), 1) }; - assert_extern_result_error_with_message(snapshot_at_non_existent_version, KernelError::GenericError, "Generic delta kernel error: LogSegment end version 0 not the same as the specified end version 1"); + let snapshot_at_non_existent_version = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder( + kernel_string_slice!(table_root), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 1); + snapshot_builder_build(ptr) + }; + assert_extern_result_error_with_message(snapshot_at_non_existent_version, KernelError::GenericError, Some("Generic delta kernel error: LogSegment end version 0 not the same as the specified end version 1")); - let table_root = unsafe { snapshot_table_root(snapshot1.shallow_copy(), allocate_str) }; - assert!(table_root.is_some()); - let s = recover_string(table_root.unwrap()); - assert_eq!(&s, path); + let snapshot_table_root_str = + unsafe { snapshot_table_root(snapshot1.shallow_copy(), allocate_str) }; + assert!(snapshot_table_root_str.is_some()); + let s = recover_string(snapshot_table_root_str.unwrap()); + assert_eq!(&s, table_root); unsafe { free_snapshot(snapshot1) } unsafe { free_snapshot(snapshot2) } @@ -905,21 +1400,312 @@ mod tests { Ok(()) } + // TODO: (PR #2307) will introduce a helper function for setting up storage, engine. + // The test will need to refactor to use the helper function. 
+ #[tokio::test] + async fn test_snapshot_timestamp_no_ict() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///test_table/"; + add_commit( + table_root, + storage.as_ref(), + 0, + actions_to_string(vec![TestAction::Metadata]), + ) + .await?; + + let engine = DefaultEngineBuilder::new(storage.clone()).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + let snap = + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; + + let ts = unsafe { + ok_or_panic(snapshot_timestamp( + snap.shallow_copy(), + engine.shallow_copy(), + )) + }; + // ICT is not enabled -- falls back to commit file mtime (written "now"). + let now_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as i64; + let two_days_ms = 2 * 24 * 60 * 60 * 1000_i64; + assert!( + (now_ms - two_days_ms..=now_ms).contains(&ts), + "timestamp {ts} not within 2 days of now {now_ms}" + ); + + unsafe { free_snapshot(snap) } + unsafe { free_engine(engine) } + Ok(()) + } + + // TODO: (PR #2307) will introduce a helper function for setting up storage, engine. + // The test will need to refactor to use the helper function. + #[tokio::test] + async fn test_snapshot_timestamp_ict_enabled() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///test_table/"; + + // create_table with "inCommitTimestamp" in writer_features sets up: + // - protocol v3.7 with writerFeatures=["inCommitTimestamp"] + // - metadata config: enableInCommitTimestamps=true, enablement version/timestamp + // - commitInfo with inCommitTimestamp=1612345678 (fixed test value) + create_table( + storage.clone(), + Url::parse(table_root)?, + Arc::new(StructType::try_new([]).unwrap()), + &[], + true, + vec![], + vec!["inCommitTimestamp"], + ) + .await?; + + let engine = DefaultEngineBuilder::new(storage.clone()).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + let snap = + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; + + let ts = unsafe { + ok_or_panic(snapshot_timestamp( + snap.shallow_copy(), + engine.shallow_copy(), + )) + }; + assert_eq!(ts, 1612345678_i64); + + unsafe { free_snapshot(snap) } + unsafe { free_engine(engine) } + Ok(()) + } + + #[rstest] + #[case( + METADATA_WITH_TABLE_PROPERTIES, + HashMap::from([ + (String::from("delta.appendOnly"), String::from("true")), + (String::from("custom.key"), String::from("custom_value")), + ]) + )] + #[case(METADATA, HashMap::new())] + #[tokio::test] + async fn test_visit_metadata_configuration( + #[case] metadata: &str, + #[case] expected: HashMap, + ) -> Result<(), Box> { + let table_root = "memory:///"; + let storage = Arc::new(InMemory::new()); + add_commit( + table_root, + storage.as_ref(), + 0, + actions_to_string_with_metadata(vec![TestAction::Metadata], metadata), + ) + .await?; + + let engine = DefaultEngineBuilder::new(storage.clone()).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + + let snap = + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; + + extern "C" fn collect_property( + engine_context: NullableCvoid, + key: KernelStringSlice, + value: KernelStringSlice, + ) { + let map = + unsafe { &mut *(engine_context.unwrap().as_ptr() as *mut HashMap) }; + let k = unsafe { String::try_from_slice(&key) }.unwrap(); + let v = unsafe { String::try_from_slice(&value) }.unwrap(); + map.insert(k, v); + } + + let mut collected: 
HashMap = HashMap::new(); + let ctx = NonNull::new(&mut collected as *mut _ as *mut c_void); + unsafe { visit_metadata_configuration(snap.shallow_copy(), ctx, collect_property) }; + + assert_eq!(collected, expected); + + unsafe { free_snapshot(snap) } + unsafe { free_engine(engine) } + Ok(()) + } + + // NOTE: Snapshot::checkpoint requires a multi-threaded tokio task executor to avoid deadlocks. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_snapshot_checkpoint() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + + // Create a minimal table history: initial metadata+protocol (no commitInfo), then some + // add/remove commits. + let protocol_and_metadata = METADATA + .lines() + .skip(1) // skip commitInfo + .collect::>() + .join("\n"); + add_commit(table_root, storage.as_ref(), 0, protocol_and_metadata).await?; + add_commit( + table_root, + storage.as_ref(), + 1, + actions_to_string(vec![ + TestAction::Add("file1.parquet".into()), + TestAction::Add("file2.parquet".into()), + ]), + ) + .await?; + add_commit( + table_root, + storage.as_ref(), + 2, + actions_to_string(vec![ + TestAction::Add("file3.parquet".into()), + TestAction::Remove("file1.parquet".into()), + ]), + ) + .await?; + + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(storage.clone()) + .with_task_executor(executor) + .build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + + let snapshot = + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; + + let did_checkpoint = unsafe { + ok_or_panic(checkpoint_snapshot( + snapshot.shallow_copy(), + engine.shallow_copy(), + )) + }; + assert!(did_checkpoint); + + // Verify `_last_checkpoint` exists and looks sane. + let last_checkpoint = storage + .get(&Path::from("_delta_log/_last_checkpoint")) + .await?; + let last_checkpoint_bytes = last_checkpoint.bytes().await?; + let v: Value = serde_json::from_slice(last_checkpoint_bytes.as_ref())?; + assert_eq!(v["version"].as_u64(), Some(2)); + // Here file1 was removed, so only file2 and + // file3 remain. + assert_eq!(v["numOfAddFiles"].as_u64(), Some(2)); + // size = 1 protocol + 1 metadata + 2 live adds + assert_eq!(v["size"].as_u64(), Some(4)); + + // Cross-check checkpoint file size against `_last_checkpoint.sizeInBytes`. + let checkpoint_path = Path::from("_delta_log/00000000000000000002.checkpoint.parquet"); + let checkpoint_size = storage.head(&checkpoint_path).await?.size; + assert_eq!(v["sizeInBytes"].as_u64(), Some(checkpoint_size)); + + unsafe { free_snapshot(snapshot) } + unsafe { free_engine(engine) } + Ok(()) + } + + // Test checkpoint using FFI engine builder APIs with multithreaded executor. + // NOTE: We made this a sync test to simulate the expected case: C code calling FFI APIs to build engine without existing tokio runtime. + #[cfg(feature = "default-engine-base")] + #[test] + fn test_setting_multithread_executor() -> Result<(), Box> { + use delta_kernel::object_store::local::LocalFileSystem; + use tempfile::tempdir; + + let tmp_dir = tempdir()?; + let tmp_path = tmp_dir.path(); + let table_root = tmp_path + .to_str() + .ok_or_else(|| delta_kernel::Error::generic("Invalid path"))?; + let storage = Arc::new(LocalFileSystem::new()); + + // Create a minimal table history: initial metadata+protocol (no commitInfo), then some + // add/remove commits. 
+ let protocol_and_metadata = METADATA + .lines() + .skip(1) // skip commitInfo + .collect::>() + .join("\n"); + + // Use a temporary runtime for async setup, then drop it before FFI calls + { + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + add_commit(&table_root, storage.as_ref(), 0, protocol_and_metadata).await?; + add_commit( + &table_root, + storage.as_ref(), + 1, + actions_to_string(vec![ + TestAction::Add("file1.parquet".into()), + TestAction::Add("file2.parquet".into()), + ]), + ) + .await?; + add_commit( + &table_root, + storage.as_ref(), + 2, + actions_to_string(vec![ + TestAction::Add("file3.parquet".into()), + TestAction::Remove("file1.parquet".into()), + ]), + ) + .await?; + Ok::<_, Box>(()) + })?; + } // runtime dropped here, before FFI calls + + // Build engine using FFI APIs + let builder = unsafe { + ok_or_panic(get_engine_builder( + kernel_string_slice!(table_root), + allocate_err, + )) + }; + unsafe { set_builder_with_multithreaded_executor(builder.as_mut().unwrap(), 2, 0) }; + let engine = unsafe { ok_or_panic(builder_build(builder)) }; + + let snapshot = + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; + + let did_checkpoint = unsafe { + ok_or_panic(checkpoint_snapshot( + snapshot.shallow_copy(), + engine.shallow_copy(), + )) + }; + assert!(did_checkpoint); + + unsafe { free_snapshot(snapshot) } + unsafe { free_engine(engine) } + Ok(()) + } + #[tokio::test] async fn test_snapshot_partition_cols() -> Result<(), Box> { let storage = Arc::new(InMemory::new()); + let table_root = "memory:///test_table/"; + add_commit( + table_root, storage.as_ref(), 0, actions_to_string_partitioned(vec![TestAction::Metadata]), ) .await?; - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(storage.clone()).build(); let engine = engine_to_handle(Arc::new(engine), allocate_err); - let path = "memory:///"; let snapshot = - unsafe { ok_or_panic(snapshot(kernel_string_slice!(path), engine.shallow_copy())) }; + unsafe { build_snapshot(kernel_string_slice!(table_root), engine.shallow_copy()) }; let partition_count = unsafe { get_partition_column_count(snapshot.shallow_copy()) }; assert_eq!(partition_count, 1, "Should have one partition"); @@ -944,22 +1730,607 @@ mod tests { #[tokio::test] async fn allocate_null_err_okay() -> Result<(), Box> { let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + add_commit( + table_root, storage.as_ref(), 0, actions_to_string(vec![TestAction::Metadata]), ) .await?; - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(storage.clone()).build(); let engine = engine_to_handle(Arc::new(engine), allocate_null_err); - let path = "memory:///"; // Get a non-existent snapshot, this will call allocate_null_err - let snapshot_at_non_existent_version = - unsafe { snapshot_at_version(kernel_string_slice!(path), engine.shallow_copy(), 1) }; + let snapshot_at_non_existent_version = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder( + kernel_string_slice!(table_root), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 1); + snapshot_builder_build(ptr) + }; assert!(snapshot_at_non_existent_version.is_err()); unsafe { free_engine(engine) } Ok(()) } + + #[tokio::test] + async fn test_snapshot_log_tail() -> Result<(), Box> { + let table_root = "memory:///test_table/"; + let (storage, engine, snap) = 
make_engine_and_v0_snapshot(table_root).await?; + unsafe { free_snapshot(snap) }; + let commit1 = add_staged_commit( + table_root, + storage.as_ref(), + 1, + actions_to_string(vec![TestAction::Add("path1".into())]), + ) + .await?; + + let commit1_path = format!( + "{}_delta_log/_staged_commits/{}", + table_root, + commit1.filename().unwrap() + ); + let log_path = + log_path::FfiLogPath::new(kernel_string_slice!(commit1_path), 123456789, 100); + let log_tail = [log_path]; + let log_tail = log_path::LogPathArray { + ptr: log_tail.as_ptr(), + len: log_tail.len(), + }; + let snapshot = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder( + kernel_string_slice!(table_root), + engine.shallow_copy(), + )); + ok_or_panic(snapshot_builder_set_log_tail(&mut ptr, log_tail.clone())); + ok_or_panic(snapshot_builder_build(ptr)) + }; + let snapshot_version = unsafe { version(snapshot.shallow_copy()) }; + assert_eq!(snapshot_version, 1); + + // Test getting snapshot at version + let snapshot2 = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder( + kernel_string_slice!(table_root), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 1); + ok_or_panic(snapshot_builder_set_log_tail(&mut ptr, log_tail)); + ok_or_panic(snapshot_builder_build(ptr)) + }; + let snapshot_version = unsafe { version(snapshot2.shallow_copy()) }; + assert_eq!(snapshot_version, 1); + + unsafe { free_snapshot(snapshot) } + unsafe { free_snapshot(snapshot2) } + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_builder_from_existing_snapshot_advances_to_latest_and_pinned_version( + ) -> Result<(), Box> { + let path = "memory:///"; + let (storage, engine, snapshot_at_v0) = make_engine_and_v0_snapshot(path).await?; + assert_eq!(unsafe { version(snapshot_at_v0.shallow_copy()) }, 0); + + add_commit( + path, + storage.as_ref(), + 1, + actions_to_string(vec![TestAction::Add("file1.parquet".into())]), + ) + .await?; + add_commit( + path, + storage.as_ref(), + 2, + actions_to_string(vec![TestAction::Add("file2.parquet".into())]), + ) + .await?; + + let snapshot_at_v2 = unsafe { + let ptr = ok_or_panic(get_snapshot_builder_from( + snapshot_at_v0.shallow_copy(), + engine.shallow_copy(), + )); + ok_or_panic(snapshot_builder_build(ptr)) + }; + assert_eq!(unsafe { version(snapshot_at_v2.shallow_copy()) }, 2); + + let snapshot_at_v1 = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder_from( + snapshot_at_v0.shallow_copy(), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 1); + ok_or_panic(snapshot_builder_build(ptr)) + }; + assert_eq!(unsafe { version(snapshot_at_v1.shallow_copy()) }, 1); + + unsafe { free_snapshot(snapshot_at_v2) } + unsafe { free_snapshot(snapshot_at_v1) } + unsafe { free_snapshot(snapshot_at_v0) } + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_builder_from_existing_snapshot_rejects_earlier_version( + ) -> Result<(), Box> { + let path = "memory:///"; + let (storage, engine, snapshot_at_v0) = make_engine_and_v0_snapshot(path).await?; + + add_commit( + path, + storage.as_ref(), + 1, + actions_to_string(vec![TestAction::Add("file1.parquet".into())]), + ) + .await?; + add_commit( + path, + storage.as_ref(), + 2, + actions_to_string(vec![TestAction::Add("file2.parquet".into())]), + ) + .await?; + + // build a v2 snapshot to use as the base + let snapshot_at_v2 = unsafe { + let ptr = ok_or_panic(get_snapshot_builder_from( + snapshot_at_v0.shallow_copy(), + engine.shallow_copy(), + )); + 
ok_or_panic(snapshot_builder_build(ptr)) + }; + assert_eq!(unsafe { version(snapshot_at_v2.shallow_copy()) }, 2); + + // pinning to a version older than the hint snapshot is rejected + let result = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder_from( + snapshot_at_v2.shallow_copy(), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 1); + snapshot_builder_build(ptr) + }; + assert_extern_result_error_with_message( + result, + KernelError::GenericError, + Some("Generic delta kernel error: Requested snapshot version 1 is older than snapshot hint version 2"), + ); + + unsafe { free_snapshot(snapshot_at_v2) } + unsafe { free_snapshot(snapshot_at_v0) } + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_snapshot_with_prev_snapshot_and_log_tail( + ) -> Result<(), Box> { + let path = "memory:///"; + let (storage, engine, snapshot_at_v0) = make_engine_and_v0_snapshot(path).await?; + assert_eq!(unsafe { version(snapshot_at_v0.shallow_copy()) }, 0); + + // Add staged commit (version 1) + let commit1 = add_staged_commit( + path, + storage.as_ref(), + 1, + actions_to_string(vec![TestAction::Add("path1.parquet".into())]), + ) + .await?; + + // Add another staged commit (version 2) + let commit2 = add_staged_commit( + path, + storage.as_ref(), + 2, + actions_to_string(vec![TestAction::Add("path2.parquet".into())]), + ) + .await?; + + // Build log tail with both commits + let commit1_path = format!( + "{}_delta_log/_staged_commits/{}", + path, + commit1.filename().unwrap() + ); + let commit2_path = format!( + "{}_delta_log/_staged_commits/{}", + path, + commit2.filename().unwrap() + ); + let log_path1 = + log_path::FfiLogPath::new(kernel_string_slice!(commit1_path), 123456789, 100); + let log_path2 = + log_path::FfiLogPath::new(kernel_string_slice!(commit2_path), 123456790, 101); + let log_tail = [log_path1, log_path2]; + let log_tail_array = log_path::LogPathArray { + ptr: log_tail.as_ptr(), + len: log_tail.len(), + }; + + let snapshot_at_v2 = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder_from( + snapshot_at_v0.shallow_copy(), + engine.shallow_copy(), + )); + ok_or_panic(snapshot_builder_set_log_tail( + &mut ptr, + log_tail_array.clone(), + )); + ok_or_panic(snapshot_builder_build(ptr)) + }; + assert_eq!(unsafe { version(snapshot_at_v2.shallow_copy()) }, 2); + + let snapshot_at_v1 = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder_from( + snapshot_at_v0.shallow_copy(), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 1); + ok_or_panic(snapshot_builder_set_log_tail(&mut ptr, log_tail_array)); + ok_or_panic(snapshot_builder_build(ptr)) + }; + assert_eq!(unsafe { version(snapshot_at_v1.shallow_copy()) }, 1); + + unsafe { free_snapshot(snapshot_at_v2) } + unsafe { free_snapshot(snapshot_at_v1) } + unsafe { free_snapshot(snapshot_at_v0) } + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_builder_from_table_path_builds_latest_version( + ) -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + let path = "memory:///"; + add_commit( + path, + storage.as_ref(), + 0, + actions_to_string(vec![TestAction::Metadata]), + ) + .await?; + add_commit( + path, + storage.as_ref(), + 1, + actions_to_string(vec![TestAction::Add("file1.parquet".into())]), + ) + .await?; + let engine = engine_to_handle( + Arc::new(DefaultEngineBuilder::new(storage).build()), + allocate_err, + ); + + let snapshot_at_v1 = unsafe { + let ptr = ok_or_panic(get_snapshot_builder( + 
kernel_string_slice!(path), + engine.shallow_copy(), + )); + ok_or_panic(snapshot_builder_build(ptr)) + }; + assert_eq!(unsafe { version(snapshot_at_v1.shallow_copy()) }, 1); + + unsafe { free_snapshot(snapshot_at_v1) } + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_free_snapshot_builder_without_building() -> Result<(), Box> + { + let path = "memory:///"; + let (_, engine, snap) = make_engine_and_v0_snapshot(path).await?; + unsafe { free_snapshot(snap) }; + + let ptr = unsafe { + ok_or_panic(get_snapshot_builder( + kernel_string_slice!(path), + engine.shallow_copy(), + )) + }; + + unsafe { free_snapshot_builder(ptr) }; + unsafe { free_engine(engine) } + Ok(()) + } + + // === Shared visitor state and callbacks for protocol/metadata tests === + + struct ProtocolVisitState { + min_reader: i32, + min_writer: i32, + reader_features: Vec, + writer_features: Vec, + } + + impl ProtocolVisitState { + fn new() -> Self { + Self { + min_reader: 0, + min_writer: 0, + reader_features: Vec::new(), + writer_features: Vec::new(), + } + } + } + + extern "C" fn protocol_version_cb(ctx: NullableCvoid, min_reader: i32, min_writer: i32) { + let state = unsafe { &mut *(ctx.unwrap().as_ptr() as *mut ProtocolVisitState) }; + state.min_reader = min_reader; + state.min_writer = min_writer; + } + + extern "C" fn protocol_feature_cb( + ctx: NullableCvoid, + is_reader: bool, + feature: KernelStringSlice, + ) { + let state = unsafe { &mut *(ctx.unwrap().as_ptr() as *mut ProtocolVisitState) }; + let name = unsafe { String::try_from_slice(&feature) }.unwrap(); + if is_reader { + state.reader_features.push(name); + } else { + state.writer_features.push(name); + } + } + + /// Visit protocol on a snapshot and return the collected state. + fn collect_protocol_state(snap: &handle::Handle) -> ProtocolVisitState { + let proto = unsafe { snapshot_get_protocol(snap.shallow_copy()) }; + let mut state = ProtocolVisitState::new(); + let ctx = NonNull::new(&mut state as *mut ProtocolVisitState as *mut c_void); + unsafe { + visit_protocol( + proto.shallow_copy(), + ctx, + protocol_version_cb, + protocol_feature_cb, + ) + }; + unsafe { free_protocol(proto) }; + state + } + + struct MetadataVisitState { + id: Option, + name: Option, + description: Option, + format_provider: Option, + has_created_time: bool, + created_time_ms: i64, + } + + impl MetadataVisitState { + fn new() -> Self { + Self { + id: None, + name: None, + description: None, + format_provider: None, + has_created_time: false, + created_time_ms: 0, + } + } + } + + /// Convert a [`KernelStringSlice`] to a [`String`] (test-only helper). 
+ fn slice_to_string(slice: KernelStringSlice) -> String { + unsafe { String::try_from_slice(&slice) }.unwrap() + } + + extern "C" fn metadata_visit_cb( + ctx: NullableCvoid, + id: KernelStringSlice, + name: OptionalValue, + description: OptionalValue, + format_provider: KernelStringSlice, + has_created_time: bool, + created_time_ms: i64, + ) { + let state = unsafe { &mut *(ctx.unwrap().as_ptr() as *mut MetadataVisitState) }; + state.id = Some(slice_to_string(id)); + state.name = match name { + OptionalValue::Some(s) => Some(slice_to_string(s)), + OptionalValue::None => None, + }; + state.description = match description { + OptionalValue::Some(s) => Some(slice_to_string(s)), + OptionalValue::None => None, + }; + state.format_provider = Some(slice_to_string(format_provider)); + state.has_created_time = has_created_time; + state.created_time_ms = created_time_ms; + } + + /// Visit metadata on a snapshot and return the collected state. + fn collect_metadata_state(snap: &handle::Handle) -> MetadataVisitState { + let meta = unsafe { snapshot_get_metadata(snap.shallow_copy()) }; + let mut state = MetadataVisitState::new(); + let ctx = NonNull::new(&mut state as *mut MetadataVisitState as *mut c_void); + unsafe { visit_metadata(meta.shallow_copy(), ctx, metadata_visit_cb) }; + unsafe { free_metadata(meta) }; + state + } + + // === visit_protocol tests === + + #[tokio::test] + async fn test_visit_protocol_legacy() -> Result<(), Box> { + let (engine, snap) = setup_snapshot(METADATA.to_string()).await?; + let state = collect_protocol_state(&snap); + + assert_eq!(state.min_reader, 1); + assert_eq!(state.min_writer, 2); + assert!(state.reader_features.is_empty()); + assert!(state.writer_features.is_empty()); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + async fn test_builder_with_nonexistent_path_returns_error( + ) -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + let engine = engine_to_handle( + Arc::new(DefaultEngineBuilder::new(storage).build()), + allocate_err, + ); + + let result = unsafe { + let invalid_path = "not a valid url!"; + get_snapshot_builder(kernel_string_slice!(invalid_path), engine.shallow_copy()) + }; + assert_extern_result_error_with_message( + result, + KernelError::InvalidTableLocationError, + None, + ); + + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_builder_at_nonexistent_version_returns_error( + ) -> Result<(), Box> { + let path = "memory:///"; + let (_, engine, snap) = make_engine_and_v0_snapshot(path).await?; + unsafe { free_snapshot(snap) }; + + let result = unsafe { + let mut ptr = ok_or_panic(get_snapshot_builder( + kernel_string_slice!(path), + engine.shallow_copy(), + )); + snapshot_builder_set_version(&mut ptr, 99); + snapshot_builder_build(ptr) + }; + assert_extern_result_error_with_message(result, KernelError::GenericError, None); + + unsafe { free_engine(engine) } + Ok(()) + } + + #[tokio::test] + async fn test_visit_protocol_with_features() -> Result<(), Box> { + let (engine, snap) = setup_snapshot(METADATA_WITH_FEATURES.to_string()).await?; + let state = collect_protocol_state(&snap); + + assert_eq!(state.min_reader, 3); + assert_eq!(state.min_writer, 7); + assert_eq!(state.reader_features, vec!["columnMapping"]); + let mut wf = state.writer_features.clone(); + wf.sort(); + assert_eq!(wf, vec!["columnMapping", "domainMetadata", "rowTracking"]); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + async fn 
test_visit_metadata_default() -> Result<(), Box> { + let (engine, snap) = setup_snapshot(METADATA.to_string()).await?; + let state = collect_metadata_state(&snap); + + assert_eq!( + state.id.as_deref(), + Some("5fba94ed-9794-4965-ba6e-6ee3c0d22af9") + ); + assert!( + state.name.is_none(), + "name should be None for default metadata" + ); + assert!( + state.description.is_none(), + "description should be None for default metadata" + ); + assert_eq!(state.format_provider.as_deref(), Some("parquet")); + assert!(state.has_created_time); + assert_eq!(state.created_time_ms, 1587968585495); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + async fn test_visit_metadata_with_name() -> Result<(), Box> { + let (engine, snap) = setup_snapshot(METADATA_WITH_FEATURES.to_string()).await?; + let state = collect_metadata_state(&snap); + + assert_eq!( + state.id.as_deref(), + Some("deadbeef-1234-5678-abcd-000000000000") + ); + assert_eq!(state.name.as_deref(), Some("test_table")); + assert!(state.description.is_none(), "description should be None"); + assert_eq!(state.format_provider.as_deref(), Some("parquet")); + assert!(state.has_created_time); + assert_eq!(state.created_time_ms, 1234567890000); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + async fn test_visit_metadata_with_description() -> Result<(), Box> { + let metadata_with_desc = concat!( + r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{},"isBlindAppend":true}}"#, + "\n", + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + "\n", + r#"{"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","name":"my_table","description":"A test table","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{},"createdTime":1587968585495}}"#, + ); + let (engine, snap) = setup_snapshot(metadata_with_desc.to_string()).await?; + let state = collect_metadata_state(&snap); + + assert_eq!(state.name.as_deref(), Some("my_table")); + assert_eq!(state.description.as_deref(), Some("A test table")); + assert_eq!(state.format_provider.as_deref(), Some("parquet")); + assert!(state.has_created_time); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + async fn test_visit_metadata_without_created_time() -> Result<(), Box> { + let metadata_no_time = concat!( + r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{},"isBlindAppend":true}}"#, + "\n", + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + "\n", + r#"{"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{}}}"#, + ); + let (engine, snap) = setup_snapshot(metadata_no_time.to_string()).await?; + let state = collect_metadata_state(&snap); + + assert_eq!( + state.id.as_deref(), + Some("5fba94ed-9794-4965-ba6e-6ee3c0d22af9") + ); + assert!(!state.has_created_time); + assert_eq!(state.created_time_ms, 0); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } } diff --git a/ffi/src/log_path.rs b/ffi/src/log_path.rs new file mode 100644 index 0000000000..cec6ae4178 --- /dev/null +++ b/ffi/src/log_path.rs @@ -0,0 +1,101 @@ +//! FFI interface for LogPath. 
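+//!
+//! Engines use these types to pass a "log tail" (for example, staged catalog commits) to the
+//! snapshot builder via `snapshot_builder_set_log_tail`. A minimal sketch, mirroring the FFI
+//! tests (the staged-commit path shown here is illustrative):
+//!
+//! ```ignore
+//! let commit_path = "memory:///table/_delta_log/_staged_commits/00000000000000000001.uuid.json";
+//! let log_path = FfiLogPath::new(kernel_string_slice!(commit_path), 123456789, 100);
+//! let paths = [log_path];
+//! let log_tail = LogPathArray { ptr: paths.as_ptr(), len: paths.len() };
+//! ```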
+ +use delta_kernel::{DeltaResult, FileMeta, LogPath}; +use url::Url; + +use crate::{KernelStringSlice, TryFromStringSlice}; + +/// FFI-safe array of LogPaths. Note that we _explicitly_ do not implement `Copy` on this struct +/// despite all types being `Copy`, to avoid accidental misuse of the pointer. +/// +/// This struct is essentially a borrowed view into an array. The owner must ensure the underlying +/// array remains valid for the duration of its use. +#[repr(C)] +#[derive(Debug, Clone)] +pub struct LogPathArray { + /// Pointer to the first element of the FfiLogPath array. If len is 0, this pointer may be null, + /// otherwise it must be non-null. + pub ptr: *const FfiLogPath, + /// Number of elements in the array + pub len: usize, +} + +impl LogPathArray { + /// Create an empty LogPathArray + pub fn empty() -> Self { + Self { + ptr: std::ptr::null(), + len: 0, + } + } + + /// Convert this array into a Vec of kernel LogPaths + /// + /// # Safety + /// The ptr must point to `len` valid FfiLogPath elements, and those elements + /// must remain valid for the duration of this call + pub(crate) unsafe fn log_paths(&self) -> DeltaResult> { + if self.ptr.is_null() || self.len == 0 { + return Ok(Vec::new()); + } + + let slice = unsafe { std::slice::from_raw_parts(self.ptr, self.len) }; + slice + .iter() + .map(|ffi_path| unsafe { ffi_path.log_path() }) + .collect::, _>>() + } +} + +/// FFI-safe LogPath representation that can be passed from the engine +#[repr(C)] +pub struct FfiLogPath { + /// URL location of the log file + location: KernelStringSlice, + /// Last modified time as milliseconds since unix epoch + last_modified: i64, + /// Size in bytes of the log file + size: u64, +} + +impl FfiLogPath { + /// Create a new FFI LogPath. The location string slice must be valid UTF-8. + pub fn new(location: KernelStringSlice, last_modified: i64, size: u64) -> Self { + Self { + location, + last_modified, + size, + } + } + + /// URL location of the log file as a string slice + pub fn location(&self) -> &KernelStringSlice { + &self.location + } + + /// Last modified time as milliseconds since unix epoch + pub fn last_modified(&self) -> i64 { + self.last_modified + } + + /// Size in bytes of the log file + pub fn size(&self) -> u64 { + self.size + } + + /// Convert this FFI log path into a kernel LogPath + /// + /// # Safety + /// + /// The `self.location` string slice must be valid UTF-8 and represent a valid URL. 
+ unsafe fn log_path(&self) -> DeltaResult { + let location_str = unsafe { TryFromStringSlice::try_from_slice(&self.location) }?; + let url = Url::parse(location_str)?; + let file_meta = FileMeta { + location: url, + last_modified: self.last_modified, + size: self.size, + }; + LogPath::try_new(file_meta) + } +} diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index 9de92303be..1b53275831 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -4,8 +4,8 @@ use std::collections::HashMap; use std::ffi::c_void; use std::sync::{Arc, Mutex}; -use delta_kernel::scan::state::DvInfo; -use delta_kernel::scan::{Scan, ScanMetadata}; +use delta_kernel::scan::state::{DvInfo, ScanFile}; +use delta_kernel::scan::{Scan, ScanBuilder, ScanMetadata}; use delta_kernel::snapshot::SnapshotRef; use delta_kernel::{DeltaResult, Error, Expression, ExpressionRef}; use delta_kernel_ffi_macros::handle_descriptor; @@ -14,6 +14,7 @@ use url::Url; use crate::expressions::kernel_visitor::{unwrap_kernel_predicate, KernelExpressionVisitorState}; use crate::expressions::SharedExpression; +use crate::schema_visitor::{extract_kernel_schema, KernelSchemaVisitorState}; use crate::{ kernel_string_slice, unwrap_and_parse_path_as_url, AllocateStringFn, ExternEngine, ExternResult, IntoExternResult, KernelBoolSlice, KernelRowIndexArray, KernelStringSlice, @@ -23,25 +24,27 @@ use crate::{ use super::handle::Handle; -// TODO: Why do we even need to expose a scan, when the only thing an engine can do with it is -// handit back to the kernel by calling `scan_metadata_iter_init`? There isn't even an FFI method to -// drop it! #[handle_descriptor(target=Scan, mutable=false, sized=true)] pub struct SharedScan; #[handle_descriptor(target=ScanMetadata, mutable=false, sized=true)] pub struct SharedScanMetadata; +/// An opaque, exclusive handle owning a [`ScanBuilder`]. +/// +/// The caller must eventually either call [`scan_builder_build`] (which consumes the handle +/// and produces a [`SharedScan`]) or [`free_scan_builder`] (which drops it without building). +#[handle_descriptor(target=ScanBuilder, mutable=true, sized=true)] +pub struct ExclusiveScanBuilder; + /// A predicate that can be used to skip data when scanning. /// -/// When invoking [`scan`], The engine provides a pointer to the (engine's native) predicate, along -/// with a visitor function that can be invoked to recursively visit the predicate. This engine -/// state must be valid until the call to [`scan`] returns. Inside that method, the kernel allocates -/// visitor state, which becomes the second argument to the predicate visitor invocation along with -/// the engine-provided predicate pointer. The visitor state is valid for the lifetime of the -/// predicate visitor invocation. Thanks to this double indirection, engine and kernel each retain -/// ownership of their respective objects, with no need to coordinate memory lifetimes with the -/// other. +/// Used by [`scan`] and [`scan_builder_with_predicate`]. The engine provides a pointer to its +/// native predicate along with a visitor function that recursively visits it. This engine state +/// must remain valid for the duration of the call. The kernel allocates visitor state internally, +/// which becomes the second argument to the visitor invocation. Thanks to this double indirection, +/// engine and kernel each retain ownership of their respective objects with no need to coordinate +/// memory lifetimes. 
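+///
+/// For illustration, a predicate visitor that builds `id < 10` (a sketch mirroring the FFI
+/// tests; `allocate_err` and `ok_or_panic` are test helpers):
+///
+/// ```ignore
+/// extern "C" fn visit_id_lt_10(
+///     _pred: *mut c_void,
+///     state: &mut KernelExpressionVisitorState,
+/// ) -> usize {
+///     let id = "id";
+///     let col = unsafe {
+///         ok_or_panic(visit_expression_column(state, kernel_string_slice!(id), allocate_err))
+///     };
+///     let lit = visit_expression_literal_int(state, 10);
+///     visit_predicate_lt(state, col, lit)
+/// }
+///
+/// let mut predicate = EnginePredicate {
+///     predicate: std::ptr::null_mut(),
+///     visitor: visit_id_lt_10,
+/// };
+/// ```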
#[repr(C)] pub struct EnginePredicate { pub predicate: *mut c_void, @@ -49,6 +52,34 @@ pub struct EnginePredicate { extern "C" fn(predicate: *mut c_void, state: &mut KernelExpressionVisitorState) -> usize, } +/// A schema for columns to select from the snapshot. +/// +/// Used by [`scan`] and [`scan_builder_with_schema`] for projection pushdown or to specify +/// metadata columns. The engine provides a pointer to its native schema representation along with +/// a visitor function. The kernel allocates visitor state internally, which becomes the second +/// argument to the schema visitor invocation. Thanks to this double indirection, engine and kernel +/// each retain ownership of their respective objects with no need to coordinate memory lifetimes. +#[repr(C)] +pub struct EngineSchema { + pub schema: *mut c_void, + pub visitor: extern "C" fn(schema: *mut c_void, state: &mut KernelSchemaVisitorState) -> usize, +} + +/// An engine-provided expression along with a visitor function to convert +/// it to a kernel expression. +/// +/// The engine provides a pointer to its own expression representation, along +/// with a visitor function that can convert it to a kernel expression by +/// calling the appropriate visitor methods on the kernel's +/// `KernelExpressionVisitorState`. The visitor function returns an expression +/// ID that can be converted to a kernel expression handle. +#[repr(C)] +pub struct EngineExpression { + pub expression: *mut c_void, + pub visitor: + extern "C" fn(expression: *mut c_void, state: &mut KernelExpressionVisitorState) -> usize, +} + /// Drop a `SharedScanMetadata`. /// /// # Safety @@ -98,26 +129,169 @@ pub unsafe extern "C" fn scan( snapshot: Handle, engine: Handle, predicate: Option<&mut EnginePredicate>, + schema: Option<&mut EngineSchema>, ) -> ExternResult> { let snapshot = unsafe { snapshot.clone_as_arc() }; - scan_impl(snapshot, predicate).into_extern_result(&engine.as_ref()) + scan_impl(snapshot, predicate, schema).into_extern_result(&engine.as_ref()) +} + +/// Decode an [`EnginePredicate`] and apply it to a [`ScanBuilder`]. +/// +/// Returns an error if the engine's visitor fails to produce a valid predicate (i.e. returns +/// an invalid expression ID). A `None` result from the visitor indicates the engine-side +/// predicate construction failed, which would silently produce a full-table scan if ignored. +fn apply_predicate( + builder: ScanBuilder, + predicate: &mut EnginePredicate, +) -> DeltaResult { + let mut visitor_state = KernelExpressionVisitorState::default(); + let pred_id = (predicate.visitor)(predicate.predicate, &mut visitor_state); + let predicate = unwrap_kernel_predicate(&mut visitor_state, pred_id).ok_or_else(|| { + delta_kernel::Error::generic( + "engine predicate visitor returned an invalid expression ID; \ + predicate could not be decoded", + ) + })?; + debug!("Got predicate: {:#?}", predicate); + Ok(builder.with_predicate(Some(Arc::new(predicate)))) +} + +/// Decode an [`EngineSchema`] and apply it as a column projection to a [`ScanBuilder`]. +/// +/// Returns an error if the schema visitor produces an invalid schema. 
+fn apply_schema(builder: ScanBuilder, schema: &mut EngineSchema) -> DeltaResult { + let mut visitor_state = KernelSchemaVisitorState::default(); + let schema_id = (schema.visitor)(schema.schema, &mut visitor_state); + let schema = extract_kernel_schema(&mut visitor_state, schema_id)?; + debug!("FFI scan projection schema: {:#?}", schema); + Ok(builder.with_schema(Arc::new(schema))) } fn scan_impl( snapshot: SnapshotRef, predicate: Option<&mut EnginePredicate>, + schema: Option<&mut EngineSchema>, ) -> DeltaResult> { let mut scan_builder = snapshot.scan_builder(); if let Some(predicate) = predicate { - let mut visitor_state = KernelExpressionVisitorState::default(); - let pred_id = (predicate.visitor)(predicate.predicate, &mut visitor_state); - let predicate = unwrap_kernel_predicate(&mut visitor_state, pred_id); - debug!("Got predicate: {:#?}", predicate); - scan_builder = scan_builder.with_predicate(predicate.map(Arc::new)); + scan_builder = apply_predicate(scan_builder, predicate)?; + } + if let Some(schema) = schema { + scan_builder = apply_schema(scan_builder, schema)?; } Ok(Arc::new(scan_builder.build()?).into()) } +/// Create a [`ScanBuilder`] for the given snapshot. +/// +/// The caller owns the returned handle and must eventually call either +/// [`scan_builder_build`] to produce a [`SharedScan`], or [`free_scan_builder`] to drop it +/// without building. +/// +/// This function is infallible; constructing a [`ScanBuilder`] from a snapshot always succeeds. +/// +/// # Safety +/// +/// `snapshot` must be a valid [`SharedSnapshot`] handle. +#[no_mangle] +pub unsafe extern "C" fn scan_builder( + snapshot: Handle, +) -> Handle { + let snapshot = unsafe { snapshot.clone_as_arc() }; + Box::new(snapshot.scan_builder()).into() +} + +/// Apply a predicate to an [`ExclusiveScanBuilder`] for data skipping and row-level filtering. +/// +/// Consumes the `builder` handle and returns a new handle with the predicate applied. The +/// `builder` handle must not be used after this call. Returns an error if the engine's predicate +/// visitor fails to produce a valid predicate (i.e. returns an invalid expression ID). On error, +/// the builder is dropped. +/// +/// # Safety +/// +/// `builder` and `engine` must be valid handles. The `builder` handle must not be used after this +/// call. `predicate` must be a valid, non-null [`EnginePredicate`] whose `visitor` and `predicate` +/// fields are safe to call and read. +#[no_mangle] +pub unsafe extern "C" fn scan_builder_with_predicate( + builder: Handle, + engine: Handle, + predicate: &mut EnginePredicate, +) -> ExternResult> { + let engine = unsafe { engine.as_ref() }; + let builder = unsafe { builder.into_inner() }; + apply_predicate(*builder, predicate) + .map(|b| Box::new(b).into()) + .into_extern_result(&engine) +} + +/// Apply a column projection schema to an [`ExclusiveScanBuilder`]. +/// +/// Consumes the `builder` handle and returns a new handle with the schema applied. The `builder` +/// handle must not be used after this call. Returns an error if the schema visitor produces an +/// invalid schema, such as a non-struct root or unconsumed field IDs. On error, the builder is +/// dropped. +/// +/// # Safety +/// +/// `builder` and `engine` must be valid handles. The `builder` handle must not be used after this +/// call. `schema` must be a valid, non-null [`EngineSchema`] whose `visitor` and `schema` fields +/// are safe to call and read. 
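+///
+/// For illustration, a schema visitor that projects a single nullable `id` integer column
+/// (a sketch mirroring the FFI tests; `allocate_err` and `ok_or_panic` are test helpers):
+///
+/// ```ignore
+/// extern "C" fn visit_id_only_schema(
+///     _schema: *mut c_void,
+///     state: &mut KernelSchemaVisitorState,
+/// ) -> usize {
+///     let id = "id";
+///     let id_field = unsafe {
+///         ok_or_panic(visit_field_integer(state, kernel_string_slice!(id), true, allocate_err))
+///     };
+///     let fields = [id_field];
+///     let root = "schema"; // the name of the root struct is ignored
+///     unsafe {
+///         ok_or_panic(visit_field_struct(
+///             state,
+///             kernel_string_slice!(root),
+///             fields.as_ptr(),
+///             1,
+///             false,
+///             allocate_err,
+///         ))
+///     }
+/// }
+///
+/// let mut schema = EngineSchema { schema: std::ptr::null_mut(), visitor: visit_id_only_schema };
+/// let builder = unsafe {
+///     ok_or_panic(scan_builder_with_schema(builder, engine.shallow_copy(), &mut schema))
+/// };
+/// ```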
+#[no_mangle] +pub unsafe extern "C" fn scan_builder_with_schema( + builder: Handle, + engine: Handle, + schema: &mut EngineSchema, +) -> ExternResult> { + let engine = unsafe { engine.as_ref() }; + scan_builder_with_schema_impl(builder, schema).into_extern_result(&engine) +} + +fn scan_builder_with_schema_impl( + builder: Handle, + schema: &mut EngineSchema, +) -> DeltaResult> { + let builder = unsafe { builder.into_inner() }; + Ok(Box::new(apply_schema(*builder, schema)?).into()) +} + +/// Consume an [`ExclusiveScanBuilder`] and produce a [`SharedScan`]. +/// +/// The `builder` handle is consumed and must not be used afterward. On error, the builder is +/// dropped and an error is returned. It is the responsibility of the caller to free the returned +/// scan handle by calling [`free_scan`]. +/// +/// # Safety +/// +/// `builder` and `engine` must be valid handles. The `builder` handle must not be used after +/// this call. +#[no_mangle] +pub unsafe extern "C" fn scan_builder_build( + builder: Handle, + engine: Handle, +) -> ExternResult> { + let engine = unsafe { engine.as_ref() }; + let builder = unsafe { builder.into_inner() }; + builder + .build() + .map(|scan| Arc::new(scan).into()) + .into_extern_result(&engine) +} + +/// Free an [`ExclusiveScanBuilder`] without building a scan. +/// +/// Only call this if you will not call [`scan_builder_build`]. If you have already called +/// [`scan_builder_build`], the builder handle was consumed and this must not be called. +/// +/// # Safety +/// +/// `builder` must be a valid handle that has not been previously consumed or freed. +#[no_mangle] +pub unsafe extern "C" fn free_scan_builder(builder: Handle) { + builder.drop_handle(); +} + /// Get the table root of a scan. /// /// # Safety @@ -296,6 +470,7 @@ pub struct CDvInfo<'a> { /// * `context`: a `void*` context this can be anything that engine needs to pass through to each call /// * `path`: a `KernelStringSlice` which is the path to the file /// * `size`: an `i64` which is the size of the file +/// * `mod_time`: an `i64` which is the time the file was created, as milliseconds since the epoch /// * `dv_info`: a [`CDvInfo`] struct, which allows getting the selection vector for this file /// * `transform`: An optional expression that, if not `NULL`, _must_ be applied to physical data to /// convert it to the correct logical format. If this is `NULL`, no transform is needed. @@ -304,6 +479,7 @@ type CScanCallback = extern "C" fn( engine_context: NullableCvoid, path: KernelStringSlice, size: i64, + mod_time: i64, stats: Option<&Stats>, dv_info: &CDvInfo, transform: Option<&Expression>, @@ -333,12 +509,21 @@ pub unsafe extern "C" fn get_from_string_map( map: &CStringMap, key: KernelStringSlice, allocate_fn: AllocateStringFn, -) -> NullableCvoid { - // TODO: Return ExternResult to caller instead of panicking? - let string_key = unsafe { TryFromStringSlice::try_from_slice(&key) }; - map.values - .get(string_key.unwrap()) - .and_then(|v| allocate_fn(kernel_string_slice!(v))) + engine: Handle, +) -> ExternResult { + let engine = unsafe { engine.as_ref() }; + get_from_string_map_impl(map, key, allocate_fn).into_extern_result(&engine) +} +fn get_from_string_map_impl( + map: &CStringMap, + key: KernelStringSlice, + allocate_fn: AllocateStringFn, +) -> DeltaResult { + let string_key = unsafe { TryFromStringSlice::try_from_slice(&key) }?; + Ok(map + .values + .get(string_key) + .and_then(|v| allocate_fn(kernel_string_slice!(v)))) } /// Visit all values in a CStringMap. 
The callback will be called once for each element of the map @@ -368,7 +553,7 @@ pub unsafe extern "C" fn visit_string_map( /// Transformation expressions that need to be applied to each row `i` in ScanMetadata. You can use /// [`get_transform_for_row`] to get the transform for a particular row. If that returns an /// associated expression, it _must_ be applied to the data read from the file specified by the -/// row. The resultant schema for this expression is guaranteed to be `Scan.schema()`. If +/// row. The resultant schema for this expression is guaranteed to be [`scan_logical_schema()`]. If /// `get_transform_for_row` returns `NULL` no expression need be applied and the data read from disk /// is already in the correct logical state. /// @@ -454,30 +639,24 @@ fn row_indexes_from_dv_impl( // Wrapper function that gets called by the kernel, transforms the arguments to make the ffi-able, // and then calls the ffi specified callback -fn rust_callback( - context: &mut ContextWrapper, - path: &str, - size: i64, - kernel_stats: Option, - dv_info: DvInfo, - transform: Option, - partition_values: HashMap, -) { - let transform = transform.map(|e| e.as_ref().clone()); +fn rust_callback(context: &mut ContextWrapper, scan_file: ScanFile) { + let transform = scan_file.transform.map(|e| e.as_ref().clone()); let partition_map = CStringMap { - values: partition_values, + values: scan_file.partition_values, }; - let stats = kernel_stats.map(|ks| Stats { + let stats = scan_file.stats.map(|ks| Stats { num_records: ks.num_records, }); let cdv_info = CDvInfo { - info: &dv_info, - has_vector: dv_info.has_vector(), + info: &scan_file.dv_info, + has_vector: scan_file.dv_info.has_vector(), }; + let path = scan_file.path.as_str(); (context.callback)( context.engine_context, kernel_string_slice!(path), - size, + scan_file.size, + scan_file.modification_time, stats.as_ref(), &cdv_info, transform.as_ref(), @@ -499,19 +678,306 @@ struct ContextWrapper { #[no_mangle] pub unsafe extern "C" fn visit_scan_metadata( scan_metadata: Handle, + engine: Handle, engine_context: NullableCvoid, callback: CScanCallback, -) { +) -> ExternResult { let scan_metadata = unsafe { scan_metadata.as_ref() }; + let engine = unsafe { engine.as_ref() }; + visit_scan_metadata_impl(scan_metadata, engine_context, callback).into_extern_result(&engine) +} +fn visit_scan_metadata_impl( + scan_metadata: &ScanMetadata, + engine_context: NullableCvoid, + callback: CScanCallback, +) -> DeltaResult { let context_wrapper = ContextWrapper { engine_context, callback, }; + scan_metadata.visit_scan_files(context_wrapper, rust_callback)?; + Ok(true) +} + +#[cfg(test)] +mod scan_builder_tests { + #![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] + + use std::ffi::c_void; + + use test_utils::{actions_to_string, TestAction}; + + use crate::error::KernelError; + use crate::expressions::kernel_visitor::{ + visit_expression_column, visit_expression_literal_int, visit_predicate_lt, + KernelExpressionVisitorState, + }; + use crate::ffi_test_utils::{allocate_err, ok_or_panic, recover_error, setup_snapshot}; + use crate::schema_visitor::{ + visit_field_integer, visit_field_struct, KernelSchemaVisitorState, + }; + use crate::{free_engine, free_schema, free_snapshot, kernel_string_slice, ExternResult}; + + use super::{ + free_scan, free_scan_builder, scan_builder, scan_builder_build, + scan_builder_with_predicate, scan_builder_with_schema, scan_logical_schema, + EnginePredicate, EngineSchema, + }; + + /// Schema visitor that produces `{id: integer 
(nullable)}` -- a single-column projection of + /// the standard test table schema. + extern "C" fn visit_id_only_schema( + _schema_ptr: *mut c_void, + state: &mut KernelSchemaVisitorState, + ) -> usize { + let id = "id"; + let id_field_id = unsafe { + ok_or_panic(visit_field_integer( + state, + kernel_string_slice!(id), + true, + allocate_err, + )) + }; + let field_ids = [id_field_id]; + let schema = "schema"; + unsafe { + ok_or_panic(visit_field_struct( + state, + kernel_string_slice!(schema), + field_ids.as_ptr(), + 1, + false, + allocate_err, + )) + } + } - // TODO: return ExternResult to caller instead of panicking? - scan_metadata - .visit_scan_files(context_wrapper, rust_callback) - .unwrap(); + /// Predicate visitor that constructs `id < 10`. + extern "C" fn visit_id_lt_10( + _pred_ptr: *mut c_void, + state: &mut KernelExpressionVisitorState, + ) -> usize { + let id = "id"; + let col = unsafe { + ok_or_panic(visit_expression_column( + state, + kernel_string_slice!(id), + allocate_err, + )) + }; + let lit = visit_expression_literal_int(state, 10); + visit_predicate_lt(state, col, lit) + } + + #[tokio::test] + async fn test_scan_builder_no_pushdown() { + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + let scan = unsafe { ok_or_panic(scan_builder_build(builder, engine.shallow_copy())) }; + // Full schema: both `id` and `val` columns + let schema = unsafe { scan_logical_schema(scan.shallow_copy()) }; + let schema_ref = unsafe { schema.as_ref() }; + assert_eq!(schema_ref.fields().count(), 2); + unsafe { free_schema(schema) }; + unsafe { free_scan(scan) }; + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } + + #[tokio::test] + async fn test_scan_builder_with_predicate() { + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + let mut predicate = EnginePredicate { + predicate: std::ptr::null_mut(), + visitor: visit_id_lt_10, + }; + let builder = unsafe { + ok_or_panic(scan_builder_with_predicate( + builder, + engine.shallow_copy(), + &mut predicate, + )) + }; + let scan = unsafe { ok_or_panic(scan_builder_build(builder, engine.shallow_copy())) }; + // Predicate does not reduce columns -- full schema is still returned + let schema = unsafe { scan_logical_schema(scan.shallow_copy()) }; + let schema_ref = unsafe { schema.as_ref() }; + assert_eq!(schema_ref.fields().count(), 2); + unsafe { free_schema(schema) }; + unsafe { free_scan(scan) }; + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } + + #[tokio::test] + async fn test_scan_builder_with_schema() { + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + let mut schema_arg = EngineSchema { + schema: std::ptr::null_mut(), + visitor: visit_id_only_schema, + }; + let builder = unsafe { + ok_or_panic(scan_builder_with_schema( + builder, + engine.shallow_copy(), + &mut schema_arg, + )) + }; + let scan = unsafe { ok_or_panic(scan_builder_build(builder, engine.shallow_copy())) }; + // Projection to `{id}` -- only one column in the logical schema + let schema = unsafe { scan_logical_schema(scan.shallow_copy()) }; + let schema_ref = unsafe { schema.as_ref() }; + assert_eq!(schema_ref.fields().count(), 1); + 
assert!(schema_ref.field("id").is_some()); + unsafe { free_schema(schema) }; + unsafe { free_scan(scan) }; + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } + + #[tokio::test] + async fn test_scan_builder_with_predicate_and_schema() { + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + let mut predicate = EnginePredicate { + predicate: std::ptr::null_mut(), + visitor: visit_id_lt_10, + }; + let builder = unsafe { + ok_or_panic(scan_builder_with_predicate( + builder, + engine.shallow_copy(), + &mut predicate, + )) + }; + let mut schema_arg = EngineSchema { + schema: std::ptr::null_mut(), + visitor: visit_id_only_schema, + }; + let builder = unsafe { + ok_or_panic(scan_builder_with_schema( + builder, + engine.shallow_copy(), + &mut schema_arg, + )) + }; + let scan = unsafe { ok_or_panic(scan_builder_build(builder, engine.shallow_copy())) }; + // Predicate + projection: only `id` in logical schema + let schema = unsafe { scan_logical_schema(scan.shallow_copy()) }; + let schema_ref = unsafe { schema.as_ref() }; + assert_eq!(schema_ref.fields().count(), 1); + assert!(schema_ref.field("id").is_some()); + unsafe { free_schema(schema) }; + unsafe { free_scan(scan) }; + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } + + /// Schema visitor that returns a bare integer field ID (not wrapped in a struct root). + /// `extract_kernel_schema` requires a struct root, so this produces an error. + extern "C" fn visit_invalid_schema_not_struct( + _schema_ptr: *mut c_void, + state: &mut KernelSchemaVisitorState, + ) -> usize { + let bare_field = "bare_field"; + unsafe { + ok_or_panic(visit_field_integer( + state, + kernel_string_slice!(bare_field), + true, + allocate_err, + )) + } + } + + #[tokio::test] + async fn test_scan_builder_with_schema_then_predicate() { + // Verify that applying schema before predicate produces the same result as the reverse + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + let mut schema_arg = EngineSchema { + schema: std::ptr::null_mut(), + visitor: visit_id_only_schema, + }; + let builder = unsafe { + ok_or_panic(scan_builder_with_schema( + builder, + engine.shallow_copy(), + &mut schema_arg, + )) + }; + let mut predicate = EnginePredicate { + predicate: std::ptr::null_mut(), + visitor: visit_id_lt_10, + }; + let builder = unsafe { + ok_or_panic(scan_builder_with_predicate( + builder, + engine.shallow_copy(), + &mut predicate, + )) + }; + let scan = unsafe { ok_or_panic(scan_builder_build(builder, engine.shallow_copy())) }; + // Projection to `{id}` regardless of application order + let schema = unsafe { scan_logical_schema(scan.shallow_copy()) }; + let schema_ref = unsafe { schema.as_ref() }; + assert_eq!(schema_ref.fields().count(), 1); + assert!(schema_ref.field("id").is_some()); + unsafe { free_schema(schema) }; + unsafe { free_scan(scan) }; + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } + + #[tokio::test] + async fn test_scan_builder_with_schema_error_propagates() { + // An invalid schema (bare primitive field, no struct root) must return ExternResult::Err + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + let mut 
schema_arg = EngineSchema { + schema: std::ptr::null_mut(), + visitor: visit_invalid_schema_not_struct, + }; + let result = + unsafe { scan_builder_with_schema(builder, engine.shallow_copy(), &mut schema_arg) }; + assert!( + matches!(result, ExternResult::Err(_)), + "expected ExternResult::Err for invalid schema" + ); + if let ExternResult::Err(e) = result { + let err = unsafe { recover_error(e) }; + assert_eq!(err.etype, KernelError::SchemaError); + } + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } + + #[tokio::test] + async fn test_free_scan_builder_without_build() { + let (engine, snapshot) = setup_snapshot(actions_to_string(vec![TestAction::Metadata])) + .await + .unwrap(); + let builder = unsafe { scan_builder(snapshot.shallow_copy()) }; + // Drop without building -- must not panic or leak + unsafe { free_scan_builder(builder) }; + unsafe { free_snapshot(snapshot) }; + unsafe { free_engine(engine) }; + } } #[cfg(test)] diff --git a/ffi/src/schema_visitor.rs b/ffi/src/schema_visitor.rs new file mode 100644 index 0000000000..82ce0d9c5a --- /dev/null +++ b/ffi/src/schema_visitor.rs @@ -0,0 +1,1334 @@ +//! The `KernelSchemaVisitor` defines a visitor system to allow engines to build kernel-native +//! representations of schemas for projection pushdown during scans. +//! +//! Building a schema requires creating elements in dependency order. Referenced elements must be +//! constructed before the elements that reference them. In other words, children must be created +//! before parents. +//! +//! The model is ID based. When the engine wants to create a schema element (a [`StructField`] in +//! kernel terms) it calls the appropriate visitor function which constructs the analogous kernel +//! schema field and returns an `id` (`usize`) that identifies the field. That ID can be passed to +//! other visitor functions to reference that element when building complex types. +//! +//! The final schema is built by visiting a struct field combining the field IDs of the top-level +//! fields. +//! +//! Note: Schemas are structs but can also contain struct fields. Use `visit_field_struct` for both +//! the root schema and for named struct fields. The name of the root struct is ignored and can be +//! anything. +//! +//! IDs are consumed when used. Each element takes ownership of its referenced child +//! elements. Trying to pass an ID more than once to a complex field visitor will result in an +//! error. +//! + +use crate::{ + AllocateErrorFn, ExternResult, IntoExternResult, KernelStringSlice, ReferenceSet, + TryFromStringSlice, +}; +use delta_kernel::schema::{ + ArrayType, DataType, DecimalType, MapType, PrimitiveType, StructField, StructType, +}; +use delta_kernel::{DeltaResult, Error}; +use tracing::warn; + +#[derive(Default)] +pub struct KernelSchemaVisitorState { + elements: ReferenceSet, +} + +/// Extract the final schema from the visitor state. +/// +/// This validates that the schema was properly constructed by ensuring: +/// 1. The schema_id points to a DataType::Struct (the root schema) +/// 2. 
No other elements remain in the state (all field IDs are consumed) +pub fn extract_kernel_schema( + state: &mut KernelSchemaVisitorState, + schema_id: usize, +) -> DeltaResult { + let schema_element = state + .elements + .take(schema_id) + .ok_or_else(|| Error::schema("Nonexistent id passed to extract_kernel_schema"))?; + let DataType::Struct(struct_type) = schema_element.data_type else { + warn!("Final returned id was not a struct, schema is invalid"); + return Err(Error::schema( + "Final returned id was not a struct, schema is invalid", + )); + }; + if !state.elements.is_empty() { + warn!("Didn't consume all visited fields, schema is invalid."); + Err(Error::schema( + "Didn't consume all visited fields, schema is invalid.", + )) + } else { + Ok(*struct_type) + } +} + +fn wrap_field(state: &mut KernelSchemaVisitorState, field: StructField) -> usize { + state.elements.insert(field) +} + +fn unwrap_field(state: &mut KernelSchemaVisitorState, field_id: usize) -> Option { + state.elements.take(field_id) +} + +// ============================================================================= +// FFI Visitor Functions for field creation - Primitive Types +// ============================================================================= + +/// Generic helper to create primitive fields +fn visit_field_primitive_impl( + state: &mut KernelSchemaVisitorState, + name: DeltaResult<&str>, + primitive_type: PrimitiveType, + nullable: bool, +) -> DeltaResult { + let name_str = name?.to_string(); + let field = StructField::new(name_str, DataType::Primitive(primitive_type), nullable); + Ok(wrap_field(state, field)) +} + +// TODO: turn all the primitive visitors below into a macro once cbindgen can run on macro expanded code +/// Visit a string field. Strings can hold arbitrary UTF-8 text data. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_string( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::String, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a long field. Long fields store 64-bit signed integers. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_long( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Long, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit an integer field. Integer fields store 32-bit signed integers. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. 
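+///
+/// # Example
+///
+/// A minimal sketch of an engine-side call creating a nullable integer field. The `state` and
+/// `allocate_err` values are assumed to be supplied by the engine; on success the returned
+/// `ExternResult` wraps the new field's ID (illustrative only, not compiled here):
+///
+/// ```ignore
+/// let mut state = KernelSchemaVisitorState::default();
+/// let name = "id";
+/// let id_field = unsafe {
+///     visit_field_integer(&mut state, kernel_string_slice!(name), true, allocate_err)
+/// };
+/// ```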
+#[no_mangle] +pub unsafe extern "C" fn visit_field_integer( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Integer, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a short field. Short fields store 16-bit signed integers. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_short( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Short, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a byte field. Byte fields store 8-bit signed integers. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_byte( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Byte, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a float field. Float fields store 32-bit floating point numbers. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_float( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Float, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a double field. Double fields store 64-bit floating point numbers. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_double( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Double, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a boolean field. Boolean fields store true/false values. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. 
+#[no_mangle] +pub unsafe extern "C" fn visit_field_boolean( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Boolean, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a binary field. Binary fields store arbitrary byte arrays. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_binary( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Binary, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a date field. Date fields store calendar dates without time information. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_date( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Date, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a timestamp field. Timestamp fields store date and time with microsecond precision in UTC. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_timestamp( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::Timestamp, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a timestamp_ntz field. Similar to timestamp but without timezone information. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_timestamp_ntz( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_primitive_impl(state, name_str, PrimitiveType::TimestampNtz, nullable) + .into_extern_result(&allocate_error) +} + +/// Visit a decimal field. Decimal fields store fixed-precision decimal numbers with specified precision and scale. +/// +/// # Safety +/// +/// Caller is responsible for providing a valid `state`, `name` slice with valid UTF-8 data, +/// and `allocate_error` function pointer. 
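+///
+/// # Example
+///
+/// A minimal sketch creating a non-nullable `decimal(10, 2)` field; `state` and `allocate_err`
+/// are assumed to be engine-provided (illustrative only, not compiled here):
+///
+/// ```ignore
+/// let name = "price";
+/// let price_field = unsafe {
+///     visit_field_decimal(&mut state, kernel_string_slice!(name), 10, 2, false, allocate_err)
+/// };
+/// ```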
+#[no_mangle] +pub unsafe extern "C" fn visit_field_decimal( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + precision: u8, + scale: u8, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_decimal_impl(state, name_str, precision, scale, nullable) + .into_extern_result(&allocate_error) +} + +fn visit_field_decimal_impl( + state: &mut KernelSchemaVisitorState, + name: DeltaResult<&str>, + precision: u8, + scale: u8, + nullable: bool, +) -> DeltaResult { + let name_str = name?.to_string(); + + let decimal_type = DecimalType::try_new(precision, scale)?; + let field = StructField::new( + name_str, + DataType::Primitive(PrimitiveType::Decimal(decimal_type)), + nullable, + ); + Ok(wrap_field(state, field)) +} + +// ============================================================================= +// FFI Visitor Functions for field creation - Complex Types +// ============================================================================= + +/// Visit a struct field. Struct fields contain nested fields organized as ordered key-value pairs. +/// +/// Note: This creates a named struct field (e.g. `address: struct`). This function +/// should _also_ be used to create the final schema element, where the field IDs of the top-level +/// fields should be passed as `field_ids`. The name for the final schema element is ignored. +/// +/// The `field_ids` array must contain IDs from previous `visit_field_*` field creation calls. +/// +/// # Safety +/// +/// Caller is responsible for providing valid `state`, `name` slice, `field_ids` array pointing +/// to valid field IDs previously returned by this visitor, and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_struct( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + field_ids: *const usize, + field_count: usize, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str: Result<&str, Error> = unsafe { TryFromStringSlice::try_from_slice(&name) }; + let field_ids = unsafe { std::slice::from_raw_parts(field_ids, field_count) }; + + visit_field_struct_impl(state, name_str, field_ids, nullable) + .into_extern_result(&allocate_error) +} + +// Helper to create struct DataType from field IDs +fn create_struct_data_type( + state: &mut KernelSchemaVisitorState, + field_ids: &[usize], +) -> DeltaResult { + let field_vec = field_ids + .iter() + .map(|&field_id| { + unwrap_field(state, field_id) + .ok_or_else(|| Error::generic(format!("Invalid field ID {field_id} in struct"))) + }) + .collect::>>()?; + + let struct_type = StructType::try_new(field_vec)?; + Ok(DataType::Struct(Box::new(struct_type))) +} + +fn visit_field_struct_impl( + state: &mut KernelSchemaVisitorState, + name: DeltaResult<&str>, + field_ids: &[usize], + nullable: bool, +) -> DeltaResult { + let name_str = name?.to_string(); + let data_type = create_struct_data_type(state, field_ids)?; + let field = StructField::new(name_str, data_type, nullable); + Ok(wrap_field(state, field)) +} + +/// Visit an array field. Array fields store ordered sequences of elements of the same type. +/// +/// The `element_type_id` must reference a field created by a previous `visit_field_*`. Elements of +/// the array can be null if and only if the field referenced by `element_type_id` is nullable. 
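+///
+/// # Example
+///
+/// A minimal sketch building `tags: array<string>`. The element field is visited first and its
+/// ID is then consumed by the array; `state`, `element_id` (the unwrapped ID returned for the
+/// element field), and `allocate_err` are assumed to exist on the engine side (illustrative
+/// only, not compiled here):
+///
+/// ```ignore
+/// // `element_id` was returned by a previous visit_field_string call for a
+/// // non-nullable field named "element".
+/// let name = "tags";
+/// let tags_field = unsafe {
+///     visit_field_array(&mut state, kernel_string_slice!(name), element_id, true, allocate_err)
+/// };
+/// ```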
+/// +/// # Safety +/// +/// Caller is responsible for providing valid `state`, `name` slice, `element_type_id` from +/// previous `visit_data_type_*` call, and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_array( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + element_type_id: usize, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_array_impl(state, name_str, element_type_id, nullable) + .into_extern_result(&allocate_error) +} + +fn visit_field_array_impl( + state: &mut KernelSchemaVisitorState, + name: DeltaResult<&str>, + element_type_id: usize, + nullable: bool, +) -> DeltaResult { + let name_str = name?.to_string(); + let element_field = unwrap_field(state, element_type_id).ok_or_else(|| { + Error::generic(format!( + "Invalid element type ID {element_type_id} for array" + )) + })?; + + let array_type = ArrayType::new(element_field.data_type, element_field.nullable); + let field = StructField::new(name_str, array_type, nullable); + Ok(wrap_field(state, field)) +} + +/// Visit a map field. Map fields store key-value pairs where all keys have the same type and all +/// values have the same type. +/// +/// Both `key_type_id` and `value_type_id` must reference fields created by previous `visit_field_*` +/// calls. The map can contain null values if and only if the field referenced by `value_type_id` is +/// nullable. +/// +/// # Safety +/// +/// Caller is responsible for providing valid `state`, `name` slice, `key_type_id` and `value_type_id` +/// from previous `visit_data_type_*` calls, and `allocate_error` function pointer. +#[no_mangle] +pub unsafe extern "C" fn visit_field_map( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + key_type_id: usize, + value_type_id: usize, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_map_impl(state, name_str, key_type_id, value_type_id, nullable) + .into_extern_result(&allocate_error) +} + +fn visit_field_map_impl( + state: &mut KernelSchemaVisitorState, + name: DeltaResult<&str>, + key_type_id: usize, + value_type_id: usize, + nullable: bool, +) -> DeltaResult { + let name_str = name?.to_string(); + + let key_field = unwrap_field(state, key_type_id) + .ok_or_else(|| Error::generic(format!("Invalid key type ID {key_type_id} for map")))?; + + if key_field.nullable { + return Err(Error::generic("Delta Map keys may not be nullable")); + } + + let value_field = unwrap_field(state, value_type_id) + .ok_or_else(|| Error::generic(format!("Invalid value type ID {value_type_id} for map")))?; + + let map_type = MapType::new( + key_field.data_type, + value_field.data_type, + value_field.nullable, + ); + let field = StructField::new(name_str, map_type, nullable); + Ok(wrap_field(state, field)) +} + +/// Visit a variant field. +/// +/// Takes a struct type ID that defines the variant schema. This must reference a field created by +/// previous `visit_field_struct` call. 
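+///
+/// # Example
+///
+/// A minimal sketch wrapping an existing struct in a variant field; `variant_struct_id` is
+/// assumed to be the unwrapped ID of a `struct<metadata: binary, value: binary>` created earlier
+/// with `visit_field_struct`, and `state`/`allocate_err` are engine-provided (illustrative only,
+/// not compiled here):
+///
+/// ```ignore
+/// let name = "payload";
+/// let payload_field = unsafe {
+///     visit_field_variant(&mut state, kernel_string_slice!(name), variant_struct_id, true, allocate_err)
+/// };
+/// ```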
+/// +/// # Safety +/// +/// Caller must ensure: +/// - All base parameters are valid as per visit_field_string +/// - `variant_struct_id` is a valid struct type ID from a previous visitor call +#[no_mangle] +pub unsafe extern "C" fn visit_field_variant( + state: &mut KernelSchemaVisitorState, + name: KernelStringSlice, + variant_struct_id: usize, + nullable: bool, + allocate_error: AllocateErrorFn, +) -> ExternResult { + let name_str = unsafe { TryFromStringSlice::try_from_slice(&name) }; + visit_field_variant_impl(state, name_str, variant_struct_id, nullable) + .into_extern_result(&allocate_error) +} + +fn visit_field_variant_impl( + state: &mut KernelSchemaVisitorState, + name: DeltaResult<&str>, + variant_struct_id: usize, + nullable: bool, +) -> DeltaResult { + let name_str = name?.to_string(); + let data_type = create_variant_data_type(state, variant_struct_id)?; + let field = StructField::new(name_str, data_type, nullable); + Ok(wrap_field(state, field)) +} + +// Helper to create variant DataType +fn create_variant_data_type( + state: &mut KernelSchemaVisitorState, + struct_type_id: usize, +) -> DeltaResult { + let Some(DataType::Struct(variant_struct)) = + state.elements.take(struct_type_id).map(|f| f.data_type) + else { + return Err(Error::generic(format!( + "Invalid variant struct ID {struct_type_id} - must be DataType::Struct" + ))); + }; + Ok(DataType::Variant(variant_struct)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::{EngineError, KernelError}; + use crate::ffi_test_utils::ok_or_panic; + use crate::KernelStringSlice; + use delta_kernel::schema::{DataType, PrimitiveType}; + + // Error allocator for tests that panics when invoked. It is used in tests where we don't expect errors. + #[no_mangle] + extern "C" fn test_allocate_error( + etype: KernelError, + msg: crate::KernelStringSlice, + ) -> *mut EngineError { + panic!( + "Error allocator called with type {:?}, message: {:?}", + etype, + unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + msg.ptr as *const u8, + msg.len, + )) + } + ); + } + + macro_rules! visit_field { + ($type:ident, $state:ident, $name:expr, $nullable:tt) => { + paste::paste! { ok_or_panic(unsafe { + []( + &mut $state, + KernelStringSlice::new_unsafe($name), + $nullable, + test_allocate_error, + ) + }) } + }; + + ($type:ident, $state:ident, $name:expr, $arg1:expr, $nullable:tt) => { + paste::paste! { ok_or_panic(#[allow(unused_unsafe)] unsafe { + let arg1 = $arg1; + []( + &mut $state, + KernelStringSlice::new_unsafe($name), + arg1, + $nullable, + test_allocate_error, + ) + }) } + }; + + ($type:ident, $state:ident, $name:expr, $arg1:expr, $arg2:expr, $nullable:tt) => { + paste::paste! { ok_or_panic(#[allow(unused_unsafe)] unsafe { + let arg1 = $arg1; + let arg2 = $arg2; + []( + &mut $state, + KernelStringSlice::new_unsafe($name), + arg1, + arg2, + $nullable, + test_allocate_error, + ) + }) } + }; + } + + macro_rules! visit_array_field { + ($state:ident, $name:expr, $nullable:tt, $elem_field:expr) => {{ + let ef = $elem_field; + ok_or_panic(unsafe { + visit_field_array( + &mut $state, + KernelStringSlice::new_unsafe($name), + ef, + $nullable, + test_allocate_error, + ) + }) + }}; + } + + macro_rules! 
visit_map_field { + ($state:ident, $name:expr, $nullable:tt, $key_field:expr, $val_field:expr) => {{ + let kf = $key_field; + let vf = $val_field; + ok_or_panic(unsafe { + visit_field_map( + &mut $state, + KernelStringSlice::new_unsafe($name), + kf, + vf, + $nullable, + test_allocate_error, + ) + }) + }}; + } + + macro_rules! visit_struct_field { + ($state:ident, $name:expr, $nullable:tt, $($fields:expr),* $(,)?) => {{ + let fields = vec![$($fields),*]; + let field_count = fields.len(); + ok_or_panic(unsafe { + visit_field_struct( + &mut $state, + KernelStringSlice::new_unsafe($name), + fields.as_ptr(), + field_count, + $nullable, + test_allocate_error, + ) + }) + }}; + } + + macro_rules! visit_variant_field { + ($state:ident, $name:expr, $nullable:tt) => {{ + visit_field!( + variant, + $state, + $name, + visit_struct_field!( + $state, + "variant", + false, + visit_field!(binary, $state, "metadata", false), + visit_field!(binary, $state, "value", false), + ), + false + ) + }}; + } + + fn assert_array(field: &StructField, element_type: DataType, contains_null: bool) { + let DataType::Array(array_type) = field.data_type() else { + panic!("Expected array type"); + }; + assert_eq!( + array_type.element_type(), + &element_type, + "Mismatch on array element type" + ); + assert_eq!( + array_type.contains_null(), + contains_null, + "Mismatch on array element nullability" + ); + } + + fn assert_map( + field: &StructField, + key_type: DataType, + value_type: DataType, + contains_null: bool, + ) { + let DataType::Map(map_type) = field.data_type() else { + panic!("Expected map type"); + }; + assert_eq!(map_type.key_type(), &key_type, "Mismatch on map key type"); + assert_eq!( + map_type.value_type(), + &value_type, + "Mismatch on map value type" + ); + assert_eq!( + map_type.value_contains_null(), + contains_null, + "Mismatch on map value nullability" + ); + } + + fn assert_struct(field: &StructField, inner_type: DataType, inner_is_nullable: bool) { + let DataType::Struct(struct_type) = field.data_type() else { + panic!("Expected struct type"); + }; + let inner_fields: Vec<_> = struct_type.fields().collect(); + assert_eq!(inner_fields.len(), 1); + assert_eq!(inner_fields[0].name(), "inner"); + assert_eq!( + inner_fields[0].data_type(), + &inner_type, + "Mismatch on inner field type" + ); + assert_eq!(inner_fields[0].is_nullable(), inner_is_nullable); + } + + #[test] + fn test_schema_all_types() { + // Schema: struct< + // col_string: string, + // col_long: long, + // col_int: int, + // col_short: short, + // col_byte: byte, + // col_double: double, + // col_float: float, + // col_boolean: boolean, + // col_binary: binary, + // col_date: date, + // col_timestamp: timestamp, + // col_timestamp_ntz: timestamp_ntz, + // col_decimal: decimal(10,2), + // col_array: array, + // col_map: map, + // col_struct: struct, + // col_variant: variant + // > + + let mut state = KernelSchemaVisitorState::default(); + + // Create all primitive fields + let col_string = visit_field!(string, state, "col_string", false); + let col_long = visit_field!(long, state, "col_long", false); + let col_int = visit_field!(integer, state, "col_int", false); + let col_short = visit_field!(short, state, "col_short", false); + let col_byte = visit_field!(byte, state, "col_byte", false); + let col_double = visit_field!(double, state, "col_double", false); + let col_float = visit_field!(float, state, "col_float", false); + let col_boolean = visit_field!(boolean, state, "col_boolean", false); + let col_binary = visit_field!(binary, state, 
"col_binary", false); + let col_date = visit_field!(date, state, "col_date", false); + let col_timestamp = visit_field!(timestamp, state, "col_timestamp", false); + let col_timestamp_ntz = visit_field!(timestamp_ntz, state, "col_timestamp_ntz", false); + let col_decimal = visit_field!(decimal, state, "col_decimal", 10, 2, false); + + // Create array + let col_array = visit_array_field!( + state, + "col_array", + false, + visit_field!(string, state, "element", false) + ); + + // Create map + let col_map = visit_map_field!( + state, + "col_map", + false, + visit_field!(string, state, "key", false), + visit_field!(long, state, "value", false) + ); + + // Create struct + let col_struct = visit_struct_field!( + state, + "col_struct", + false, + visit_field!(string, state, "inner", false), + ); + + // Create variant + let col_variant = visit_variant_field!(state, "col_variant", false); + + // Build the final schema + let all_columns = [ + col_string, + col_long, + col_int, + col_short, + col_byte, + col_double, + col_float, + col_boolean, + col_binary, + col_date, + col_timestamp, + col_timestamp_ntz, + col_decimal, + col_array, + col_map, + col_struct, + col_variant, + ]; + let schema_id = ok_or_panic(unsafe { + visit_field_struct( + &mut state, + KernelStringSlice::new_unsafe("schema"), + all_columns.as_ptr(), + all_columns.len(), + false, + test_allocate_error, + ) + }); + + // Verify the schema + let schema = extract_kernel_schema(&mut state, schema_id).unwrap(); + let fields: Vec<_> = schema.fields().collect(); + assert_eq!(fields.len(), 17); + + // Validate the primitive fields + let primitive_field_expectations = [ + ("col_string", PrimitiveType::String), + ("col_long", PrimitiveType::Long), + ("col_int", PrimitiveType::Integer), + ("col_short", PrimitiveType::Short), + ("col_byte", PrimitiveType::Byte), + ("col_double", PrimitiveType::Double), + ("col_float", PrimitiveType::Float), + ("col_boolean", PrimitiveType::Boolean), + ("col_binary", PrimitiveType::Binary), + ("col_date", PrimitiveType::Date), + ("col_timestamp", PrimitiveType::Timestamp), + ("col_timestamp_ntz", PrimitiveType::TimestampNtz), + ]; + + for (index, (expected_name, expected_type)) in + primitive_field_expectations.iter().enumerate() + { + assert_eq!(fields[index].name(), *expected_name); + assert_eq!( + fields[index].data_type(), + &DataType::Primitive(expected_type.clone()) + ); + assert!(!fields[index].is_nullable()); + } + + assert_eq!(fields[12].name(), "col_decimal"); + let DataType::Primitive(PrimitiveType::Decimal(decimal_type)) = fields[12].data_type() + else { + panic!("Field col_decimal is not a decimal type"); + }; + assert_eq!(decimal_type.precision(), 10); + assert_eq!(decimal_type.scale(), 2); + + assert_eq!(fields[13].name(), "col_array"); + assert_array(fields[13], DataType::STRING, false); + + assert_eq!(fields[14].name(), "col_map"); + assert_map(fields[14], DataType::STRING, DataType::LONG, false); + + assert_eq!(fields[15].name(), "col_struct"); + assert_struct(fields[15], DataType::STRING, false); + + assert_eq!(fields[16].name(), "col_variant"); + let DataType::Variant(variant_type) = fields[16].data_type() else { + panic!("Expected variant type for col_variant"); + }; + let variant_fields: Vec<_> = variant_type.fields().collect(); + assert_eq!(variant_fields.len(), 2); + assert_eq!(variant_fields[0].name(), "metadata"); + assert_eq!( + variant_fields[0].data_type(), + &DataType::Primitive(PrimitiveType::Binary) + ); + assert_eq!(variant_fields[1].name(), "value"); + assert_eq!( + 
variant_fields[1].data_type(), + &DataType::Primitive(PrimitiveType::Binary) + ); + } + + #[test] + fn test_deeply_nested_structures() { + let mut state = KernelSchemaVisitorState::default(); + + // This creates a deeply nested structure that tests every type containing every other type: + // - Arrays containing maps, structs, other arrays + // - Maps with complex keys (struct, variant) and complex values + // - Structs containing arrays, maps, variants, other structs + // - Variants with proper metadata/value binary fields + // + // Structure with clear numbering (same level = a,b,c): + // struct< + // col_nested: 1.array<2.map<2a.struct, 2b.struct< + // inner_arrays: 3.array<4.struct< + // deep_maps: 4a.map<4a1.variant, 4a2.array>, + // variant_data: 4b.variant, + // nested_struct: 4c.struct< + // final_array: 5.array<6.map<6a.struct, 6b.double>> + // > + // >> + // >>> + // > + + let schema_id = visit_struct_field!( + state, + "top_struct", + false, + visit_array_field!( + // nested field in struct is an array + state, + "col_nested", + true, + visit_map_field!( + // array element is a map + state, + "element", + false, + visit_struct_field!( + // map key is a struct + state, + "key", + false, + visit_field!(long, state, "key_id", false), + ), + visit_struct_field!( + // map value is a struct + state, + "value", + true, + visit_array_field!( + // even more nested array + state, + "inner_arrays", + false, + visit_struct_field!( + // inner array element is a struct + state, + "element", + true, + visit_map_field!( + // struct field 1 is map + state, + "deep_maps", + true, + visit_variant_field!( + // key is variant + state, "key", false + ), + visit_array_field!( + // value is an array + state, + "value", + false, + visit_field!( + // array element is decimal + decimal, state, "element", 10, 2, true + ) + ) + ), + visit_variant_field!( + // struct field 2 is variant + state, + "variant_data", + false + ), + visit_struct_field!( + // struct field 3 is nested_struct + state, + "nested_struct", + true, + visit_array_field!( + state, + "final_array", + false, + visit_map_field!( + state, + "element", + false, + visit_struct_field!( + state, + "key", + false, + visit_field!(double, state, "coord", false), + ), + visit_field!(double, state, "value", false) + ) + ), + ), + ) + ) + ) + ) + ) + ); + + let schema = extract_kernel_schema(&mut state, schema_id).unwrap(); + + let root_fields: Vec<_> = schema.fields().collect(); + assert_eq!(root_fields.len(), 1); + assert_eq!(root_fields[0].name(), "col_nested"); + assert!(root_fields[0].is_nullable()); + + // 1: col_nested: array<...> + let DataType::Array(level1_array) = root_fields[0].data_type() else { + panic!("Expected array type for col_nested (level 1)"); + }; + assert!(!level1_array.contains_null()); + + // 2: array element: map, ...> + let DataType::Map(level2_map) = level1_array.element_type() else { + panic!("Expected map type (level 2)"); + }; + assert!(level2_map.value_contains_null()); + + // 2a: map key: struct + let DataType::Struct(level2a_key_struct) = level2_map.key_type() else { + panic!("Expected struct type for map key (level 2a)"); + }; + let level2a_key_fields: Vec<_> = level2a_key_struct.fields().collect(); + assert_eq!(level2a_key_fields.len(), 1); + assert_eq!(level2a_key_fields[0].name(), "key_id"); + assert_eq!( + level2a_key_fields[0].data_type(), + &DataType::Primitive(PrimitiveType::Long) + ); + assert!(!level2a_key_fields[0].is_nullable()); + + // 2b: map value: struct + let DataType::Struct(level2b_value_struct) = 
level2_map.value_type() else { + panic!("Expected struct type for map value (level 2b)"); + }; + let level2b_value_fields: Vec<_> = level2b_value_struct.fields().collect(); + assert_eq!(level2b_value_fields.len(), 1); + assert_eq!(level2b_value_fields[0].name(), "inner_arrays"); + assert!(!level2b_value_fields[0].is_nullable()); + + // 3: inner_arrays: array> + let DataType::Array(level3_array) = level2b_value_fields[0].data_type() else { + panic!("Expected array type (level 3)"); + }; + assert!(level3_array.contains_null()); + + // 4: array element: struct + let DataType::Struct(level4_struct) = level3_array.element_type() else { + panic!("Expected struct type (level 4)"); + }; + let level4_fields: Vec<_> = level4_struct.fields().collect(); + assert_eq!(level4_fields.len(), 3); + assert_eq!(level4_fields[0].name(), "deep_maps"); + assert_eq!(level4_fields[1].name(), "variant_data"); + assert_eq!(level4_fields[2].name(), "nested_struct"); + + // 4a: deep_maps: map, array> + assert!(level4_fields[0].is_nullable()); + let DataType::Map(level4a_map) = level4_fields[0].data_type() else { + panic!("Expected map type (level 4a)"); + }; + assert!(!level4a_map.value_contains_null()); + + // 4a1: map key: variant + let DataType::Variant(level4a1_key_variant) = level4a_map.key_type() else { + panic!("Expected variant type for map key (level 4a1)"); + }; + let level4a1_key_fields: Vec<_> = level4a1_key_variant.fields().collect(); + assert_eq!(level4a1_key_fields.len(), 2); + assert_eq!(level4a1_key_fields[0].name(), "metadata"); + assert_eq!( + level4a1_key_fields[0].data_type(), + &DataType::Primitive(PrimitiveType::Binary) + ); + assert!(!level4a1_key_fields[0].is_nullable()); + assert_eq!(level4a1_key_fields[1].name(), "value"); + assert_eq!( + level4a1_key_fields[1].data_type(), + &DataType::Primitive(PrimitiveType::Binary) + ); + assert!(!level4a1_key_fields[1].is_nullable()); + + // 4a2: map value: array + let DataType::Array(level4a2_array) = level4a_map.value_type() else { + panic!("Expected array type (level 4a2)"); + }; + assert!(level4a2_array.contains_null()); + let DataType::Primitive(PrimitiveType::Decimal(decimal_type)) = + level4a2_array.element_type() + else { + panic!("Expected decimal type in array (level 4a2)"); + }; + assert_eq!(decimal_type.precision(), 10); + assert_eq!(decimal_type.scale(), 2); + + // 4b: variant_data: variant + assert!(!level4_fields[1].is_nullable()); + let DataType::Variant(level4b_variant) = level4_fields[1].data_type() else { + panic!("Expected variant type (level 4b)"); + }; + let level4b_fields: Vec<_> = level4b_variant.fields().collect(); + assert_eq!(level4b_fields.len(), 2); + assert_eq!(level4b_fields[0].name(), "metadata"); + assert_eq!( + level4b_fields[0].data_type(), + &DataType::Primitive(PrimitiveType::Binary) + ); + assert!(!level4b_fields[0].is_nullable()); + assert_eq!(level4b_fields[1].name(), "value"); + assert_eq!( + level4b_fields[1].data_type(), + &DataType::Primitive(PrimitiveType::Binary) + ); + assert!(!level4b_fields[1].is_nullable()); + + // 4c: nested_struct: struct + assert!(level4_fields[2].is_nullable()); + let DataType::Struct(level4c_struct) = level4_fields[2].data_type() else { + panic!("Expected struct type (level 4c)"); + }; + let level4c_fields: Vec<_> = level4c_struct.fields().collect(); + assert_eq!(level4c_fields.len(), 1); + assert_eq!(level4c_fields[0].name(), "final_array"); + assert!(!level4c_fields[0].is_nullable()); + + // 5: final_array: array<...> + let DataType::Array(level5_array) = 
level4c_fields[0].data_type() else { + panic!("Expected array type (level 5)"); + }; + assert!(!level5_array.contains_null()); + + // 6: array element: map, double> + let DataType::Map(level6_map) = level5_array.element_type() else { + panic!("Expected map type (level 6)"); + }; + + // 6b: map value: double + assert_eq!( + level6_map.value_type(), + &DataType::Primitive(PrimitiveType::Double) + ); + assert!(!level6_map.value_contains_null()); + + // 6a: map key: struct + let DataType::Struct(level6a_key_struct) = level6_map.key_type() else { + panic!("Expected struct type for map key (level 6a)"); + }; + let level6a_key_fields: Vec<_> = level6a_key_struct.fields().collect(); + assert_eq!(level6a_key_fields.len(), 1); + assert_eq!(level6a_key_fields[0].name(), "coord"); + assert_eq!( + level6a_key_fields[0].data_type(), + &DataType::Primitive(PrimitiveType::Double) + ); + assert!(!level6a_key_fields[0].is_nullable()); + } + + #[test] + fn test_nullability_combinations() { + let mut state = KernelSchemaVisitorState::default(); + + // Test more nullability cases: + // Schema: + // struct< + // col_required_string: string NOT NULL, + // col_nullable_string: string NULL, + // col_nullable_array_non_null_elements: array, + // col_non_null_array_nullable_elements: array NOT NULL, + // col_nullable_map_nullable_values: map , + // col_non_null_map_non_null_values: map NOT NULL, + // col_nullable_struct: struct NULL, + // col_non_null_struct_nullable_field: struct NOT NULL + // > + + // Required string field + let col_required_string = visit_field!(string, state, "col_required_string", false); + let col_nullable_string = visit_field!(string, state, "col_nullable_string", true); + + // Nullable array with non-null elements: array NULL (elements NOT NULL) + let col_nullable_array_non_null_elements = visit_array_field!( + state, + "col_nullable_array_non_null_elements", + true, // array can be null + visit_field!(string, state, "element", false) // elements cannot be null + ); + + // Non-null array with nullable elements: array NOT NULL (elements NULL) + let col_non_null_array_nullable_elements = visit_array_field!( + state, + "col_non_null_array_nullable_elements", + false, // array not null + visit_field!(string, state, "element", true) // elements can be null + ); + + // Nullable map with nullable values: map NULL (values NULL) + let col_nullable_map_nullable_values = visit_map_field!( + state, + "col_nullable_map_nullable_values", + true, // map can be null + visit_field!(string, state, "key", false), + visit_field!(integer, state, "value", true) // values can be null + ); + + // Non-null map with non-null values: map NOT NULL (values NOT NULL) + let col_non_null_map_non_null_values = visit_map_field!( + state, + "col_non_null_map_non_null_values", + false, // map cannot be null + visit_field!(string, state, "key", false), + visit_field!(integer, state, "value", false) // values cannot be null + ); + + let col_nullable_struct = visit_struct_field!( + state, + "col_nullable_struct", + true, // struct is nullable + visit_field!(string, state, "inner", false), // inner is not nullable + ); + + // Non-null struct with nullable field: struct NOT NULL + let col_non_null_struct_nullable_field = visit_struct_field!( + state, + "col_non_null_struct_nullable_field", + false, // struct not null + visit_field!(string, state, "inner", true), // inner is nullable + ); + + // Build final schema + let schema_id = visit_struct_field!( + state, + "top_struct", + false, + col_required_string, + col_nullable_string, + 
col_nullable_array_non_null_elements, + col_non_null_array_nullable_elements, + col_nullable_map_nullable_values, + col_non_null_map_non_null_values, + col_nullable_struct, + col_non_null_struct_nullable_field, + ); + + // Verify nullability settings + let schema = extract_kernel_schema(&mut state, schema_id).unwrap(); + let fields: Vec<_> = schema.fields().collect(); + assert_eq!(fields.len(), 8); + + let expected_names_and_nulls = [ + ("col_required_string", false), + ("col_nullable_string", true), + ("col_nullable_array_non_null_elements", true), + ("col_non_null_array_nullable_elements", false), + ("col_nullable_map_nullable_values", true), + ("col_non_null_map_non_null_values", false), + ("col_nullable_struct", true), + ("col_non_null_struct_nullable_field", false), + ]; + + for (field, (name, nullability)) in fields.iter().zip(expected_names_and_nulls) { + assert_eq!(field.name(), name); + assert_eq!( + field.is_nullable(), + nullability, + "Nullablity didn't match for {}", + field.name() + ); + } + + assert_array(fields[2], DataType::STRING, false); + assert_array(fields[3], DataType::STRING, true); + + assert_map(fields[4], DataType::STRING, DataType::INTEGER, true); + assert_map(fields[5], DataType::STRING, DataType::INTEGER, false); + + assert_struct(fields[6], DataType::STRING, false); + assert_struct(fields[7], DataType::STRING, true); + } + + #[test] + fn cannot_use_nullable_as_map_keys() { + // Error allocator for tests that panics when invoked. It is used in tests where we don't expect errors. + #[no_mangle] + extern "C" fn ensure_map_err( + _etype: KernelError, + msg: crate::KernelStringSlice, + ) -> *mut EngineError { + let msg = unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts( + msg.ptr as *const u8, + msg.len, + )) + }; + assert_eq!( + msg, + "Generic delta kernel error: Delta Map keys may not be nullable" + ); + std::ptr::null_mut() + } + + let mut state = KernelSchemaVisitorState::default(); + let kf = visit_field!(string, state, "key", true); // should fail due to this being nullable + let vf = visit_field!(integer, state, "value", false); + let res = unsafe { + visit_field_map( + &mut state, + KernelStringSlice::new_unsafe("map_check"), + kf, + vf, + false, + ensure_map_err, + ) + }; + assert!(res.is_err()); + } +} diff --git a/ffi/src/table_changes.rs b/ffi/src/table_changes.rs new file mode 100644 index 0000000000..ee2b2cfc55 --- /dev/null +++ b/ffi/src/table_changes.rs @@ -0,0 +1,822 @@ +//! 
TableChanges related ffi code + +use std::sync::Arc; +use std::sync::Mutex; + +use delta_kernel::arrow::array::{Array, ArrayData, StructArray}; +use delta_kernel::arrow::ffi::to_ffi; +use delta_kernel::engine::arrow_data::EngineDataArrowExt; +use delta_kernel::table_changes::scan::TableChangesScan; +use delta_kernel::table_changes::TableChanges; +use delta_kernel::EngineData; +use delta_kernel::Error; +use delta_kernel::{DeltaResult, Version}; +use delta_kernel_ffi_macros::handle_descriptor; +use tracing::debug; + +use super::handle::Handle; +use url::Url; + +use crate::engine_data::ArrowFFIData; +use crate::expressions::kernel_visitor::{unwrap_kernel_predicate, KernelExpressionVisitorState}; +use crate::scan::EnginePredicate; +use crate::{ + kernel_string_slice, unwrap_and_parse_path_as_url, AllocateStringFn, ExternEngine, + ExternResult, IntoExternResult, KernelStringSlice, NullableCvoid, SharedExternEngine, + SharedSchema, +}; + +#[handle_descriptor(target=TableChanges, mutable=true, sized=true)] +pub struct ExclusiveTableChanges; + +/// Get the table changes from the specified table at a specific version +/// +/// - `table_root`: url pointing at the table root (where `_delta_log` folder is located) +/// - `engine`: Implementation of `Engine` apis. +/// - `start_version`: The start version of the change data feed +/// End version will be the newest table version. +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles and path pointer. +#[no_mangle] +pub unsafe extern "C" fn table_changes_from_version( + path: KernelStringSlice, + engine: Handle, + start_version: Version, +) -> ExternResult> { + let url = unsafe { unwrap_and_parse_path_as_url(path) }; + let engine = unsafe { engine.as_ref() }; + table_changes_impl(url, engine, start_version, None).into_extern_result(&engine) +} + +/// Get the table changes from the specified table between two versions +/// +/// - `table_root`: url pointing at the table root (where `_delta_log` folder is located) +/// - `engine`: Implementation of `Engine` apis. +/// - `start_version`: The start version of the change data feed +/// - `end_version`: The end version (inclusive) of the change data feed. +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles and path pointer. +#[no_mangle] +pub unsafe extern "C" fn table_changes_between_versions( + path: KernelStringSlice, + engine: Handle, + start_version: Version, + end_version: Version, +) -> ExternResult> { + let url = unsafe { unwrap_and_parse_path_as_url(path) }; + let engine = unsafe { engine.as_ref() }; + table_changes_impl(url, engine, start_version, end_version.into()).into_extern_result(&engine) +} + +fn table_changes_impl( + url: DeltaResult, + extern_engine: &dyn ExternEngine, + start_version: Version, + end_version: Option, +) -> DeltaResult> { + let table_changes = TableChanges::try_new( + url?, + extern_engine.engine().as_ref(), + start_version, + end_version, + ); + Ok(Box::new(table_changes?).into()) +} + +/// Drops table changes. +/// +/// # Safety +/// Caller is responsible for passing a valid table changes handle. +#[no_mangle] +pub unsafe extern "C" fn free_table_changes(table_changes: Handle) { + table_changes.drop_handle(); +} + +/// Get schema from the specified TableChanges. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid table changes handle. 
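+///
+/// # Example
+///
+/// A minimal sketch of reading the schema and releasing it afterwards; `table_changes` is
+/// assumed to be a valid handle owned by the engine (illustrative only, not compiled here):
+///
+/// ```ignore
+/// let schema = unsafe { table_changes_schema(table_changes.shallow_copy()) };
+/// // ... inspect the schema ...
+/// unsafe { free_schema(schema) };
+/// ```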
+#[no_mangle] +pub unsafe extern "C" fn table_changes_schema( + table_changes: Handle, +) -> Handle { + let table_changes = unsafe { table_changes.as_ref() }; + Arc::new(table_changes.schema().clone()).into() +} + +/// Get table root from the specified TableChanges. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid table changes handle. +#[no_mangle] +pub unsafe extern "C" fn table_changes_table_root( + table_changes: Handle, + allocate_fn: AllocateStringFn, +) -> NullableCvoid { + let table_changes = unsafe { table_changes.as_ref() }; + let table_root = table_changes.table_root().to_string(); + allocate_fn(kernel_string_slice!(table_root)) +} + +/// Get start version from the specified TableChanges. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid table changes handle. +#[no_mangle] +pub unsafe extern "C" fn table_changes_start_version( + table_changes: Handle, +) -> u64 { + let table_changes = unsafe { table_changes.as_ref() }; + table_changes.start_version() +} + +/// Get end version from the specified TableChanges. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid table changes handle. +#[no_mangle] +pub unsafe extern "C" fn table_changes_end_version( + table_changes: Handle, +) -> u64 { + let table_changes = unsafe { table_changes.as_ref() }; + table_changes.end_version() +} + +#[handle_descriptor(target=TableChangesScan, mutable=false, sized=true)] +pub struct SharedTableChangesScan; + +/// Get a [`TableChangesScan`] over the table specified by the passed table changes. +/// It is the responsibility of the _engine_ to free this scan when complete by calling [`free_table_changes_scan`]. +/// Consumes TableChanges. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid table changes pointer, and engine pointer +#[no_mangle] +pub unsafe extern "C" fn table_changes_scan( + table_changes: Handle, + engine: Handle, + predicate: Option<&mut EnginePredicate>, +) -> ExternResult> { + let table_changes = unsafe { table_changes.into_inner() }; + table_changes_scan_impl(*table_changes, predicate).into_extern_result(&engine.as_ref()) +} + +fn table_changes_scan_impl( + table_changes: TableChanges, + predicate: Option<&mut EnginePredicate>, +) -> DeltaResult> { + let mut scan_builder = table_changes.into_scan_builder(); + if let Some(predicate) = predicate { + let mut visitor_state = KernelExpressionVisitorState::default(); + let pred_id = (predicate.visitor)(predicate.predicate, &mut visitor_state); + let predicate = unwrap_kernel_predicate(&mut visitor_state, pred_id); + debug!("Table changes got predicate: {:#?}", predicate); + scan_builder = scan_builder.with_predicate(predicate.map(Arc::new)); + } + Ok(Arc::new(scan_builder.build()?).into()) +} + +/// Drops a table changes scan. +/// +/// # Safety +/// Caller is responsible for passing a valid scan handle. +#[no_mangle] +pub unsafe extern "C" fn free_table_changes_scan( + table_changes_scan: Handle, +) { + table_changes_scan.drop_handle(); +} + +/// Get the table root of a table changes scan. 
+///
+/// # Safety
+/// Engine is responsible for providing a valid scan pointer and a valid `allocate_fn` (for
+/// allocating the string).
+#[no_mangle]
+pub unsafe extern "C" fn table_changes_scan_table_root(
+    table_changes_scan: Handle<SharedTableChangesScan>,
+    allocate_fn: AllocateStringFn,
+) -> NullableCvoid {
+    let table_changes_scan = unsafe { table_changes_scan.as_ref() };
+    let table_root = table_changes_scan.table_root().to_string();
+    allocate_fn(kernel_string_slice!(table_root))
+}
+
+/// Get the logical schema of the specified table changes scan.
+///
+/// # Safety
+///
+/// Caller is responsible for passing a valid table changes scan handle.
+#[no_mangle]
+pub unsafe extern "C" fn table_changes_scan_logical_schema(
+    table_changes_scan: Handle<SharedTableChangesScan>,
+) -> Handle<SharedSchema> {
+    let table_changes_scan = unsafe { table_changes_scan.as_ref() };
+    table_changes_scan.logical_schema().clone().into()
+}
+
+/// Get the physical schema of the specified table changes scan.
+///
+/// # Safety
+///
+/// Caller is responsible for passing a valid table changes scan handle.
+#[no_mangle]
+pub unsafe extern "C" fn table_changes_scan_physical_schema(
+    table_changes_scan: Handle<SharedTableChangesScan>,
+) -> Handle<SharedSchema> {
+    let table_changes_scan = unsafe { table_changes_scan.as_ref() };
+    table_changes_scan.physical_schema().clone().into()
+}
+
+type TableChangesData =
+    Mutex<Box<dyn Iterator<Item = DeltaResult<Box<dyn EngineData>>> + Send>>;
+
+pub struct ScanTableChangesIterator {
+    data: TableChangesData,
+    engine: Arc<dyn ExternEngine>,
+}
+
+#[handle_descriptor(target=ScanTableChangesIterator, mutable=false, sized=true)]
+pub struct SharedScanTableChangesIterator;
+
+impl Drop for ScanTableChangesIterator {
+    fn drop(&mut self) {
+        debug!("dropping ScanTableChangesIterator");
+    }
+}
+
+/// Get an iterator over the data needed to perform a table changes scan. This will return a
+/// [`ScanTableChangesIterator`] which can be passed to [`scan_table_changes_next`] to get the
+/// actual data in the iterator.
+///
+/// # Safety
+///
+/// Engine is responsible for passing a valid [`SharedExternEngine`] and [`SharedTableChangesScan`].
+#[no_mangle]
+pub unsafe extern "C" fn table_changes_scan_execute(
+    table_changes_scan: Handle<SharedTableChangesScan>,
+    engine: Handle<SharedExternEngine>,
+) -> ExternResult<Handle<SharedScanTableChangesIterator>> {
+    let table_changes_scan = unsafe { table_changes_scan.as_ref() };
+    let engine = unsafe { engine.clone_as_arc() };
+    table_changes_scan_execute_impl(table_changes_scan, engine.clone())
+        .into_extern_result(&engine.as_ref())
+}
+
+fn table_changes_scan_execute_impl(
+    table_changes_scan: &TableChangesScan,
+    engine: Arc<dyn ExternEngine>,
+) -> DeltaResult<Handle<SharedScanTableChangesIterator>> {
+    let table_changes_iter = table_changes_scan.execute(engine.engine().clone())?;
+    let data = ScanTableChangesIterator {
+        data: Mutex::new(Box::new(table_changes_iter)),
+        engine: engine.clone(),
+    };
+    Ok(Arc::new(data).into())
+}
+
+/// Drops the table changes iterator.
+///
+/// # Safety
+///
+/// Caller is responsible for (at most once) passing a valid pointer returned by a call to
+/// [`table_changes_scan_execute`].
+#[no_mangle]
+pub unsafe extern "C" fn free_scan_table_changes_iter(
+    data: Handle<SharedScanTableChangesIterator>,
+) {
+    data.drop_handle();
+}
+
+/// Get next batch of data from the table changes iterator.
+///
+/// # Safety
+///
+/// The iterator must be valid (returned by [`table_changes_scan_execute`]) and not yet freed by
+/// [`free_scan_table_changes_iter`].
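+///
+/// # Example
+///
+/// A minimal sketch of draining the iterator. `scan` and `engine` are assumed to be valid
+/// handles owned by the engine; `ExternResult` unwrapping and the surrounding `unsafe` blocks
+/// are elided for brevity (illustrative only, not compiled here):
+///
+/// ```ignore
+/// let iter = table_changes_scan_execute(scan.shallow_copy(), engine.shallow_copy());
+/// loop {
+///     let batch = scan_table_changes_next(iter.shallow_copy());
+///     // An empty array signals that the iterator is exhausted.
+///     if batch.array.is_empty() {
+///         break;
+///     }
+///     // ... hand the ArrowFFIData over to the engine ...
+/// }
+/// free_scan_table_changes_iter(iter);
+/// ```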
+#[no_mangle] +pub unsafe extern "C" fn scan_table_changes_next( + data: Handle, +) -> ExternResult { + let data = unsafe { data.as_ref() }; + scan_table_changes_next_impl(data).into_extern_result(&data.engine.as_ref()) +} + +fn scan_table_changes_next_impl(data: &ScanTableChangesIterator) -> DeltaResult { + let mut data = data + .data + .lock() + .map_err(|_| Error::generic("poisoned scan table changes iterator mutex"))?; + + let Some(data) = data.next().transpose()? else { + return Ok(ArrowFFIData::empty()); + }; + + let record_batch = data.try_into_record_batch()?; + + let batch_struct_array: StructArray = record_batch.into(); + let array_data: ArrayData = batch_struct_array.into_data(); + let (out_array, out_schema) = to_ffi(&array_data)?; + Ok(ArrowFFIData { + array: out_array, + schema: out_schema, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ffi_test_utils::{allocate_err, allocate_str, ok_or_panic, recover_string}; + use crate::{engine_to_handle, free_engine, free_schema, kernel_string_slice}; + + use delta_kernel::arrow::array::{ArrayRef, Int32Array, StringArray}; + use delta_kernel::arrow::datatypes::{Field, Schema}; + use delta_kernel::arrow::error::ArrowError; + use delta_kernel::arrow::record_batch::RecordBatch; + use delta_kernel::arrow::util::pretty::pretty_format_batches; + use delta_kernel::engine::arrow_conversion::TryIntoArrow as _; + use delta_kernel::engine::arrow_data::ArrowEngineData; + use delta_kernel::engine::default::DefaultEngineBuilder; + use delta_kernel::object_store::{memory::InMemory, path::Path, DynObjectStore}; + use delta_kernel::schema::{DataType, StructField, StructType}; + use delta_kernel::Engine; + use delta_kernel_ffi::engine_data::get_engine_data; + use itertools::Itertools; + use std::sync::Arc; + use test_utils::{ + actions_to_string_with_metadata, add_commit, generate_batch, record_batch_to_bytes, + IntoArray as _, TestAction, + }; + + const PARQUET_FILE1: &str = + "part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c000.snappy.parquet"; + const PARQUET_FILE2: &str = + "part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet"; + + pub const METADATA: &str = r#" + {"commitInfo": { + "timestamp": 1587968586154, + "operation": "WRITE", + "operationParameters": { + "mode": "ErrorIfExists", + "partitionBy": "[]" + }, + "isBlindAppend": true + }} + {"protocol": { + "minReaderVersion": 1, + "minWriterVersion": 4 + }} + {"metaData": { + "id": "5fba94ed-9794-4965-ba6e-6ee3c0d22af9", + "format": { + "provider": "parquet", + "options": {} + }, + "schemaString": "{ + \"type\": \"struct\", + \"fields\": [ + { + \"name\": \"id\", + \"type\": \"integer\", + \"nullable\": true, + \"metadata\": {} + }, + { + \"name\": \"val\", + \"type\": \"string\", + \"nullable\": true, + \"metadata\": {} + } + ] + }", + "partitionColumns": [], + "configuration": { + "delta.enableChangeDataFeed": "true" + }, + "createdTime": 1587968585495 + }} + "#; + + async fn commit_add_file( + table_root: &str, + storage: &DynObjectStore, + version: u64, + file: String, + ) -> Result<(), Box> { + let metadata = storage.head(&Path::from(file.as_ref())).await?; + add_commit( + table_root, + storage, + version, + actions_to_string_with_metadata( + vec![ + TestAction::Metadata, + TestAction::AddWithSize(file, metadata.size), + ], + METADATA, + ), + ) + .await + } + + async fn commit_remove_file( + table_root: &str, + storage: &DynObjectStore, + version: u64, + file: String, + ) -> Result<(), Box> { + let metadata = storage.head(&Path::from(file.as_ref())).await?; 
+ add_commit( + table_root, + storage, + version, + actions_to_string_with_metadata( + vec![ + TestAction::Metadata, + TestAction::RemoveWithSize(file, metadata.size), + ], + METADATA, + ), + ) + .await + } + + async fn put_file( + storage: &DynObjectStore, + file: String, + batch: &RecordBatch, + ) -> Result<(), Box> { + storage + .put(&Path::from(file), record_batch_to_bytes(batch).into()) + .await?; + Ok(()) + } + + pub fn generate_batch_with_id(start_i: i32) -> Result { + generate_batch(vec![ + ("id", vec![start_i, start_i + 1, start_i + 2].into_array()), + ("val", vec!["a", "b", "c"].into_array()), + ]) + } + + pub fn get_batch_schema() -> Arc { + Arc::new( + StructType::try_new(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("val", DataType::STRING), + StructField::nullable("_change_type", DataType::STRING), + StructField::nullable("_commit_version", DataType::INTEGER), + ]) + .unwrap(), + ) + } + + fn check_columns_in_schema(fields: &[&str], schema: &StructType) -> bool { + fields.iter().all(|f| schema.contains(f)) + } + + fn read_scan( + scan: &TableChangesScan, + engine: Arc, + ) -> DeltaResult> { + let scan_results = scan.execute(engine)?; + scan_results + .map(EngineDataArrowExt::try_into_record_batch) + .try_collect() + } + + fn filter_batches(batches: Vec) -> Vec { + batches + .into_iter() + .map(|batch| { + let schema = batch.schema(); + let keep_indices: Vec = schema + .fields() + .iter() + .enumerate() + .filter_map(|(i, field)| { + if field.name() != "_commit_timestamp" { + Some(i) + } else { + None + } + }) + .collect(); + + let columns: Vec = keep_indices + .iter() + .map(|&i| batch.column(i).clone()) + .collect(); + + let fields: Vec> = keep_indices + .iter() + .map(|&i| Arc::new(schema.field(i).clone())) + .collect(); + + let filtered_schema = Arc::new(Schema::new(fields)); + RecordBatch::try_new(filtered_schema, columns).unwrap() + }) + .collect() + } + + #[tokio::test] + async fn test_table_changes_getters() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + + let batch = generate_batch_with_id(1)?; + put_file(storage.as_ref(), PARQUET_FILE1.to_string(), &batch).await?; + let batch = generate_batch_with_id(4)?; + put_file(storage.as_ref(), PARQUET_FILE2.to_string(), &batch).await?; + + let table_root = "memory:///"; + commit_add_file(table_root, storage.as_ref(), 0, PARQUET_FILE1.to_string()).await?; + commit_add_file(table_root, storage.as_ref(), 1, PARQUET_FILE2.to_string()).await?; + + let engine = DefaultEngineBuilder::new(storage).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + + let table_changes = ok_or_panic(unsafe { + table_changes_from_version(kernel_string_slice!(table_root), engine.shallow_copy(), 0) + }); + + assert_eq!( + unsafe { table_changes_start_version(table_changes.shallow_copy()) }, + 0 + ); + assert_eq!( + unsafe { table_changes_end_version(table_changes.shallow_copy()) }, + 1 + ); + + let table_root_str = + unsafe { table_changes_table_root(table_changes.shallow_copy(), allocate_str) }; + assert_eq!(recover_string(table_root_str.unwrap()), table_root); + + let schema = unsafe { table_changes_schema(table_changes.shallow_copy()).shallow_copy() }; + let schema_ref = unsafe { schema.as_ref() }; + assert_eq!(schema_ref.fields().len(), 5); + check_columns_in_schema( + &[ + "id", + "val", + "_change_type", + "_commit_version", + "_commit_timestamp", + ], + schema_ref, + ); + + let table_changes_scan = + ok_or_panic(unsafe { table_changes_scan(table_changes, 
engine.shallow_copy(), None) }); + + let scan_table_root = unsafe { + table_changes_scan_table_root(table_changes_scan.shallow_copy(), allocate_str) + }; + assert_eq!(recover_string(scan_table_root.unwrap()), table_root); + + let logical_schema = unsafe { + table_changes_scan_logical_schema(table_changes_scan.shallow_copy()).shallow_copy() + }; + let logical_schema_ref = unsafe { logical_schema.as_ref() }; + assert_eq!(logical_schema_ref.fields().len(), 5); + check_columns_in_schema( + &[ + "id", + "val", + "_change_type", + "_commit_version", + "_commit_timestamp", + ], + logical_schema_ref, + ); + + let physical_schema = unsafe { + table_changes_scan_physical_schema(table_changes_scan.shallow_copy()).shallow_copy() + }; + let physical_schema_ref = unsafe { physical_schema.as_ref() }; + assert_eq!(physical_schema_ref.fields().len(), 2); + check_columns_in_schema(&["id", "val"], physical_schema_ref); + + unsafe { + free_table_changes_scan(table_changes_scan); + free_engine(engine); + free_schema(schema); + free_schema(logical_schema); + free_schema(physical_schema); + } + Ok(()) + } + + #[tokio::test] + async fn test_table_changes_scan() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + + let batch = generate_batch_with_id(1)?; + put_file(storage.as_ref(), PARQUET_FILE1.to_string(), &batch).await?; + let batch = generate_batch_with_id(4)?; + put_file(storage.as_ref(), PARQUET_FILE2.to_string(), &batch).await?; + + let table_root = "memory:///"; + commit_add_file(table_root, storage.as_ref(), 0, PARQUET_FILE1.to_string()).await?; + commit_add_file(table_root, storage.as_ref(), 1, PARQUET_FILE2.to_string()).await?; + + let engine = DefaultEngineBuilder::new(storage).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + + let table_changes = ok_or_panic(unsafe { + table_changes_from_version(kernel_string_slice!(table_root), engine.shallow_copy(), 0) + }); + let table_changes_scan = + ok_or_panic(unsafe { table_changes_scan(table_changes, engine.shallow_copy(), None) }); + let batches = unsafe { + read_scan( + &table_changes_scan.into_inner(), + engine.into_inner().engine(), + ) + }; + let batches: Vec = batches.into_iter().flatten().collect(); + let filtered_batches: Vec = filter_batches(batches); + + let table_schema = get_batch_schema(); + let expected = &ArrowEngineData::new(RecordBatch::try_new( + Arc::new(table_schema.as_ref().try_into_arrow()?), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])), + Arc::new(StringArray::from(vec!["a", "b", "c", "a", "b", "c"])), + Arc::new(StringArray::from(vec![ + "insert", "insert", "insert", "insert", "insert", "insert", + ])), + Arc::new(Int32Array::from(vec![0, 0, 0, 1, 1, 1])), + ], + )?); + + let formatted = pretty_format_batches(&filtered_batches) + .unwrap() + .to_string(); + let expected = pretty_format_batches(&[expected.record_batch().clone()]) + .unwrap() + .to_string(); + + println!("actual:\n{formatted}"); + println!("expected:\n{expected}"); + assert_eq!(formatted, expected); + + Ok(()) + } + + #[tokio::test] + async fn test_table_changes_scan_iterator() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + + let batch = generate_batch_with_id(1)?; + put_file(storage.as_ref(), PARQUET_FILE1.to_string(), &batch).await?; + let batch = generate_batch_with_id(4)?; + put_file(storage.as_ref(), PARQUET_FILE2.to_string(), &batch).await?; + + let table_root = "memory:///"; + commit_add_file(table_root, storage.as_ref(), 0, PARQUET_FILE1.to_string()).await?; + commit_add_file(table_root, 
storage.as_ref(), 1, PARQUET_FILE2.to_string()).await?; + + let engine = DefaultEngineBuilder::new(storage).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + + let table_changes = ok_or_panic(unsafe { + table_changes_from_version(kernel_string_slice!(table_root), engine.shallow_copy(), 0) + }); + + let table_changes_scan = + ok_or_panic(unsafe { table_changes_scan(table_changes, engine.shallow_copy(), None) }); + + let table_changes_scan_iter_result = ok_or_panic(unsafe { + table_changes_scan_execute(table_changes_scan.shallow_copy(), engine.shallow_copy()) + }); + + let mut batches: Vec = Vec::new(); + let mut i: i32 = 0; + loop { + i += 1; + let data = ok_or_panic(unsafe { + scan_table_changes_next(table_changes_scan_iter_result.shallow_copy()) + }); + if data.array.is_empty() { + break; + } + let engine_data = + ok_or_panic(unsafe { get_engine_data(data.array, &data.schema, allocate_err) }); + let record_batch = unsafe { engine_data.into_inner().try_into_record_batch() }?; + + println!("Batch ({i}) num rows {:?}", record_batch.num_rows()); + batches.push(record_batch); + } + + let filtered_batches: Vec = filter_batches(batches); + let formatted = pretty_format_batches(&filtered_batches) + .unwrap() + .to_string(); + + let table_schema = get_batch_schema(); + let expected = &ArrowEngineData::new(RecordBatch::try_new( + Arc::new(table_schema.as_ref().try_into_arrow()?), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5, 6])), + Arc::new(StringArray::from(vec!["a", "b", "c", "a", "b", "c"])), + Arc::new(StringArray::from(vec![ + "insert", "insert", "insert", "insert", "insert", "insert", + ])), + Arc::new(Int32Array::from(vec![0, 0, 0, 1, 1, 1])), + ], + )?); + + let expected = pretty_format_batches(&[expected.record_batch().clone()]) + .unwrap() + .to_string(); + + println!("actual:\n{formatted}"); + println!("expected:\n{expected}"); + assert_eq!(formatted, expected); + + unsafe { + free_table_changes_scan(table_changes_scan); + free_scan_table_changes_iter(table_changes_scan_iter_result); + free_engine(engine); + } + Ok(()) + } + + #[tokio::test] + async fn test_table_changes_between_commits() -> Result<(), Box> { + let storage = Arc::new(InMemory::new()); + + let batch = generate_batch_with_id(1)?; + put_file(storage.as_ref(), PARQUET_FILE1.to_string(), &batch).await?; + let batch = generate_batch_with_id(4)?; + put_file(storage.as_ref(), PARQUET_FILE2.to_string(), &batch).await?; + + let table_root = "memory:///"; + commit_add_file(table_root, storage.as_ref(), 0, PARQUET_FILE1.to_string()).await?; + commit_add_file(table_root, storage.as_ref(), 1, PARQUET_FILE2.to_string()).await?; + commit_remove_file(table_root, storage.as_ref(), 2, PARQUET_FILE1.to_string()).await?; + commit_remove_file(table_root, storage.as_ref(), 3, PARQUET_FILE2.to_string()).await?; + + let engine = DefaultEngineBuilder::new(storage).build(); + let engine = engine_to_handle(Arc::new(engine), allocate_err); + + let table_changes = ok_or_panic(unsafe { + table_changes_between_versions( + kernel_string_slice!(table_root), + engine.shallow_copy(), + 1, + 2, + ) + }); + let table_changes_scan = + ok_or_panic(unsafe { table_changes_scan(table_changes, engine.shallow_copy(), None) }); + let batches = unsafe { + read_scan( + &table_changes_scan.into_inner(), + engine.into_inner().engine(), + ) + }; + let batches: Vec = batches.into_iter().flatten().collect(); + let filtered_batches: Vec = filter_batches(batches); + + let table_schema = Arc::new(StructType::try_new(vec![ + 
StructField::nullable("id", DataType::INTEGER), + StructField::nullable("val", DataType::STRING), + StructField::nullable("_change_type", DataType::STRING), + StructField::nullable("_commit_version", DataType::INTEGER), + ])?); + let expected = &ArrowEngineData::new(RecordBatch::try_new( + Arc::new(table_schema.as_ref().try_into_arrow()?), + vec![ + Arc::new(Int32Array::from(vec![4, 5, 6, 1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c", "a", "b", "c"])), + Arc::new(StringArray::from(vec![ + "insert", "insert", "insert", "delete", "delete", "delete", + ])), + Arc::new(Int32Array::from(vec![1, 1, 1, 2, 2, 2])), + ], + )?); + + let formatted = pretty_format_batches(&filtered_batches) + .unwrap() + .to_string(); + let expected = pretty_format_batches(&[expected.record_batch().clone()]) + .unwrap() + .to_string(); + + println!("actual:\n{formatted}"); + println!("expected:\n{expected}"); + assert_eq!(formatted, expected); + + Ok(()) + } +} diff --git a/ffi/src/test_ffi.rs b/ffi/src/test_ffi.rs index 715cee25d5..c5ef94292b 100644 --- a/ffi/src/test_ffi.rs +++ b/ffi/src/test_ffi.rs @@ -1,5 +1,7 @@ //! Utility functions used for testing ffi code +#![allow(clippy::unwrap_used, clippy::expect_used, clippy::panic)] + use std::sync::Arc; use crate::expressions::{SharedExpression, SharedPredicate}; @@ -156,6 +158,7 @@ pub unsafe extern "C" fn get_testing_kernel_expression() -> Handle Handle Handle { + let sub_exprs = vec![ + column_expr!("simple_col"), + Expr::literal(42i32), + Expr::literal(100i64), + Expr::literal(2.5f64), // Using 2.5 to avoid clippy::approx_constant warning + Expr::literal(true), + Expr::literal(false), + Expr::literal("test string"), + Scalar::Date(19000).into(), + Scalar::Timestamp(1234567890).into(), + Scalar::TimestampNtz(9876543210).into(), + Expr::binary( + BinaryExpressionOp::Plus, + Expr::literal(10), + Expr::literal(20), + ), + Expr::binary( + BinaryExpressionOp::Minus, + Expr::literal(50), + Expr::literal(30), + ), + Expr::binary( + BinaryExpressionOp::Multiply, + Expr::literal(5), + Expr::literal(6), + ), + Expr::binary( + BinaryExpressionOp::Divide, + Expr::literal(100), + Expr::literal(4), + ), + Expr::struct_from([ + Expr::literal(1_i32), + Expr::literal(2_i64), + Expr::literal(3.0_f64), + ]), + Expr::map_to_struct(column_expr!("partitionValues")), + ]; + Arc::new(Expr::struct_from(sub_exprs)).into() +} + +/// Constructs a simple kernel predicate using only primitive types for round-trip testing. +/// This predicate only uses types that have full visitor support. +/// +/// # Safety +/// The caller is responsible for freeing the returned memory. +#[no_mangle] +pub unsafe extern "C" fn get_simple_testing_kernel_predicate() -> Handle { + let sub_preds = vec![ + column_pred!("pred_col"), + Pred::literal(true), + Pred::literal(false), + Pred::eq(Expr::literal(10), Expr::literal(10)), + Pred::ne(Expr::literal(5), Expr::literal(10)), + Pred::lt(Expr::literal(5), Expr::literal(10)), + Pred::le(Expr::literal(10), Expr::literal(10)), + Pred::gt(Expr::literal(20), Expr::literal(10)), + Pred::ge(Expr::literal(10), Expr::literal(10)), + Pred::distinct(Expr::literal(1), Expr::literal(2)), + Pred::is_null(column_expr!("nullable_col")), + Pred::is_not_null(column_expr!("nonnull_col")), + Pred::not(Pred::literal(false)), + Pred::or_from(vec![ + Pred::eq(Expr::literal(1), Expr::literal(1)), + Pred::eq(Expr::literal(2), Expr::literal(2)), + ]), + ]; + Arc::new(Pred::and_from(sub_preds)).into() +} + +/// Compare two kernel expressions for equality. 
Returns true if they are +/// structurally equal, false otherwise. +/// +/// # Safety +/// Both expr1 and expr2 must be valid SharedExpression handles. +#[no_mangle] +pub unsafe extern "C" fn expressions_are_equal( + expr1: &Handle<SharedExpression>, + expr2: &Handle<SharedExpression>, +) -> bool { + let expr1: &Expr = expr1.as_ref(); + let expr2: &Expr = expr2.as_ref(); + expr1 == expr2 +} + +/// Compare two kernel predicates for equality. Returns true if they are +/// structurally equal, false otherwise. +/// +/// # Safety +/// Both pred1 and pred2 must be valid SharedPredicate handles. +#[no_mangle] +pub unsafe extern "C" fn predicates_are_equal( + pred1: &Handle<SharedPredicate>, + pred2: &Handle<SharedPredicate>, +) -> bool { + let pred1: &Pred = pred1.as_ref(); + let pred2: &Pred = pred2.as_ref(); + pred1 == pred2 +} diff --git a/ffi/src/transaction/mod.rs b/ffi/src/transaction/mod.rs index 9242719242..c85f72a31c 100644 --- a/ffi/src/transaction/mod.rs +++ b/ffi/src/transaction/mod.rs @@ -2,24 +2,41 @@ mod transaction_id; mod write_context; +use std::sync::Arc; + use crate::error::{ExternResult, IntoExternResult}; use crate::handle::Handle; -use crate::KernelStringSlice; use crate::{unwrap_and_parse_path_as_url, TryFromStringSlice}; use crate::{DeltaResult, ExternEngine, Snapshot, Url}; use crate::{ExclusiveEngineData, SharedExternEngine}; -use delta_kernel::committer::FileSystemCommitter; +use crate::{KernelStringSlice, SharedSchema, SharedSnapshot}; +use delta_kernel::committer::{Committer, FileSystemCommitter}; +use delta_kernel::engine_data::FilteredEngineData; +use delta_kernel::transaction::create_table::{ + CreateTableTransaction, CreateTableTransactionBuilder, +}; use delta_kernel::transaction::{CommitResult, Transaction}; use delta_kernel_ffi_macros::handle_descriptor; -/// A handle representing an exclusive transaction on a Delta table. (Similar to a Box<_>) +/// A handle for an existing-table transaction (`Transaction`). /// -/// This struct provides a safe wrapper around the underlying `Transaction` type, -/// ensuring exclusive access to transaction operations. The transaction can be used -/// to stage changes and commit them atomically to the Delta table. +/// Returned by [`transaction`] and [`transaction_with_committer`]. Supports all transaction +/// operations including existing-table-only operations like blind append and file removal. #[handle_descriptor(target=Transaction, mutable=true, sized=true)] pub struct ExclusiveTransaction; +/// A handle for a create-table transaction (`CreateTableTransaction`). +/// +/// Returned by [`create_table_builder_build`]. Only supports operations valid during table +/// creation: adding files, setting data change, engine info, and committing. Operations like +/// file removal, blind append, and deletion vector updates are not available. +#[handle_descriptor(target=CreateTableTransaction, mutable=true, sized=true)] +pub struct ExclusiveCreateTransaction; + +/// Handle for a mutable boxed committer that can be passed across FFI +#[handle_descriptor(target = dyn Committer, mutable = true, sized = false)] +pub struct MutableCommitter; + /// Start a transaction on the latest snapshot of the table.
/// /// # Safety @@ -39,12 +56,66 @@ fn transaction_impl( url: DeltaResult<Url>, extern_engine: &dyn ExternEngine, ) -> DeltaResult<Handle<ExclusiveTransaction>> { - let snapshot = Snapshot::builder_for(url?).build(extern_engine.engine().as_ref())?; + let engine = extern_engine.engine(); + let snapshot = Snapshot::builder_for(url?).build(engine.as_ref())?; let committer = Box::new(FileSystemCommitter::new()); - let transaction = snapshot.transaction(committer); + let transaction = snapshot.transaction(committer, engine.as_ref()); Ok(Box::new(transaction?).into()) } +/// Start a transaction with a custom committer +/// NOTE: This consumes the committer handle +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles +#[no_mangle] +pub unsafe extern "C" fn transaction_with_committer( + snapshot: Handle<SharedSnapshot>, + engine: Handle<SharedExternEngine>, + committer: Handle<MutableCommitter>, +) -> ExternResult<Handle<ExclusiveTransaction>> { + let snapshot = unsafe { snapshot.clone_as_arc() }; + let engine = unsafe { engine.as_ref() }; + let committer = unsafe { committer.into_inner() }; + transaction_with_committer_impl(snapshot, engine, committer).into_extern_result(&engine) +} + +fn transaction_with_committer_impl( + snapshot: Arc<Snapshot>, + extern_engine: &dyn ExternEngine, + committer: Box<dyn Committer>, +) -> DeltaResult<Handle<ExclusiveTransaction>> { + let engine = extern_engine.engine(); + let transaction = snapshot.transaction(committer, engine.as_ref()); + Ok(Box::new(transaction?).into()) +} + +/// Convert a [`CommitResult`] into a committed version number, or an error if the commit was not +/// successful. +/// +/// TODO: expose the full `CommitResult` enum through FFI for conflict resolution. +fn commit_result_to_version<T>(result: DeltaResult<CommitResult<T>>) -> DeltaResult<u64> { + match result? { + CommitResult::CommittedTransaction(committed) => Ok(committed.commit_version()), + CommitResult::RetryableTransaction(_) => Err(delta_kernel::Error::unsupported( + "commit failed: retryable transaction not supported in FFI (yet)", + )), + CommitResult::ConflictedTransaction(conflicted) => { + Err(delta_kernel::Error::Generic(format!( + "commit conflict at version {}", + conflicted.conflict_version() + ))) + } + } +} + +// ============================================================================ +// Existing-table transaction FFI functions +// ============================================================================ + +/// Free an existing-table transaction handle without committing. +/// /// # Safety /// /// Caller is responsible for passing a valid handle. @@ -53,12 +124,11 @@ pub unsafe extern "C" fn free_transaction(txn: Handle<ExclusiveTransaction>) { txn.drop_handle(); } -/// Attaches commit information to a transaction. The commit info contains metadata about the -/// transaction that will be written to the log during commit. +/// Attaches engine info to an existing-table transaction. /// /// # Safety /// -/// Caller is responsible for passing a valid handle. CONSUMES TRANSACTION and commit info +/// Caller is responsible for passing a valid handle. CONSUMES the transaction handle.
#[no_mangle] pub unsafe extern "C" fn with_engine_info( txn: Handle<ExclusiveTransaction>, @@ -67,7 +137,6 @@ ) -> ExternResult<Handle<ExclusiveTransaction>> { let txn = unsafe { txn.into_inner() }; let engine = unsafe { engine.as_ref() }; - with_engine_info_impl(*txn, engine_info).into_extern_result(&engine) } @@ -75,9 +144,73 @@ fn with_engine_info_impl( txn: Transaction, engine_info: KernelStringSlice, ) -> DeltaResult<Handle<ExclusiveTransaction>> { - let info_string: DeltaResult<&str> = - unsafe { TryFromStringSlice::try_from_slice(&engine_info) }; - Ok(Box::new(txn.with_engine_info(info_string?)).into()) + let info: &str = unsafe { TryFromStringSlice::try_from_slice(&engine_info) }?; + Ok(Box::new(txn.with_engine_info(info)).into()) +} + +/// Add domain metadata to the transaction. The domain metadata will be written to the Delta log +/// as a `domainMetadata` action when the transaction is committed. +/// +/// `domain` identifies the metadata domain (e.g. `"myApp"`). `configuration` is an arbitrary +/// string value associated with the domain (typically JSON). +/// +/// Each domain can only appear once per transaction. Setting metadata for multiple distinct +/// domains is allowed. Duplicate domains or setting and removing the same domain in a single +/// transaction will cause the commit to fail. +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles. CONSUMES the transaction handle and returns +/// a new one. +#[no_mangle] +pub unsafe extern "C" fn with_domain_metadata( + txn: Handle<ExclusiveTransaction>, + domain: KernelStringSlice, + configuration: KernelStringSlice, + engine: Handle<SharedExternEngine>, +) -> ExternResult<Handle<ExclusiveTransaction>> { + let txn = unsafe { txn.into_inner() }; + let engine = unsafe { engine.as_ref() }; + with_domain_metadata_impl(*txn, domain, configuration).into_extern_result(&engine) +} + +fn with_domain_metadata_impl( + txn: Transaction, + domain: KernelStringSlice, + configuration: KernelStringSlice, +) -> DeltaResult<Handle<ExclusiveTransaction>> { + let domain = unsafe { TryFromStringSlice::try_from_slice(&domain) }?; + let configuration = unsafe { TryFromStringSlice::try_from_slice(&configuration) }?; + Ok(Box::new(txn.with_domain_metadata(domain, configuration)).into()) +} + +/// Remove domain metadata from the table in this transaction. A tombstone action with +/// `removed: true` will be written to the Delta log when the transaction is committed. +/// +/// The caller does not need to provide a configuration value -- the existing value is +/// automatically preserved in the tombstone. +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles. CONSUMES the transaction handle and returns +/// a new one. +#[no_mangle] +pub unsafe extern "C" fn with_domain_metadata_removed( + txn: Handle<ExclusiveTransaction>, + domain: KernelStringSlice, + engine: Handle<SharedExternEngine>, +) -> ExternResult<Handle<ExclusiveTransaction>> { + let txn = unsafe { txn.into_inner() }; + let engine = unsafe { engine.as_ref() }; + with_domain_metadata_removed_impl(*txn, domain).into_extern_result(&engine) +} + +fn with_domain_metadata_removed_impl( + txn: Transaction, + domain: KernelStringSlice, +) -> DeltaResult<Handle<ExclusiveTransaction>> { + let domain = unsafe { TryFromStringSlice::try_from_slice(&domain) }?; + Ok(Box::new(txn.with_domain_metadata_removed(domain)).into()) } /// Add file metadata to the transaction for files that have been written. The metadata contains @@ -97,7 +230,6 @@ pub unsafe extern "C" fn add_files( txn.add_files(write_metadata); } -/// /// Mark the transaction as having data changes or not (these are recorded at the file level).
/// /// # Safety @@ -124,27 +256,304 @@ pub unsafe extern "C" fn commit( let txn = unsafe { txn.into_inner() }; let extern_engine = unsafe { engine.as_ref() }; let engine = extern_engine.engine(); - // TODO: for now this removes the enum, which prevents doing any conflict resolution. We should fix - // this by making the commit function return the enum somehow. - match txn.commit(engine.as_ref()) { - Ok(CommitResult::CommittedTransaction(committed)) => Ok(committed.commit_version()), - Ok(CommitResult::RetryableTransaction(_)) => Err(delta_kernel::Error::unsupported( - "commit failed: retryable transaction not supported in FFI (yet)", - )), - Ok(CommitResult::ConflictedTransaction(conflicted)) => { - Err(delta_kernel::Error::Generic(format!( - "commit conflict at version {}", - conflicted.conflict_version() - ))) - } - Err(e) => Err(e), - } - .into_extern_result(&extern_engine) + commit_result_to_version(txn.commit(engine.as_ref())).into_extern_result(&extern_engine) +} + +// ============================================================================ +// Create-table transaction FFI functions +// ============================================================================ + +/// Free a create-table transaction handle without committing. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle. +#[no_mangle] +pub unsafe extern "C" fn create_table_free_transaction(txn: Handle) { + txn.drop_handle(); +} + +/// Attaches engine info to a create-table transaction. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle. CONSUMES the transaction handle. +#[no_mangle] +pub unsafe extern "C" fn create_table_with_engine_info( + txn: Handle, + engine_info: KernelStringSlice, + engine: Handle, +) -> ExternResult> { + let txn = unsafe { txn.into_inner() }; + let engine = unsafe { engine.as_ref() }; + create_table_with_engine_info_impl(*txn, engine_info).into_extern_result(&engine) +} + +fn create_table_with_engine_info_impl( + txn: CreateTableTransaction, + engine_info: KernelStringSlice, +) -> DeltaResult> { + let info: &str = unsafe { TryFromStringSlice::try_from_slice(&engine_info) }?; + Ok(Box::new(txn.with_engine_info(info)).into()) +} + +/// Add file metadata to a create-table transaction for files that have been written. The metadata +/// contains information about files written during the transaction that will be added to the +/// Delta log during commit. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle. Consumes write_metadata. +#[no_mangle] +pub unsafe extern "C" fn create_table_add_files( + mut txn: Handle, + write_metadata: Handle, +) { + let txn = unsafe { txn.as_mut() }; + let write_metadata = unsafe { write_metadata.into_inner() }; + txn.add_files(write_metadata); +} + +/// Mark the create-table transaction as having data changes or not (these are recorded at the +/// file level). +/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle. +#[no_mangle] +pub unsafe extern "C" fn create_table_set_data_change( + mut txn: Handle, + data_change: bool, +) { + let underlying_txn = unsafe { txn.as_mut() }; + underlying_txn.set_data_change(data_change); +} + +/// Attempt to commit a create-table transaction. Returns version number if successful. +/// Returns error if the commit fails. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle. And MUST NOT USE transaction after this +/// method is called. 
+#[no_mangle] +pub unsafe extern "C" fn create_table_commit( + txn: Handle, + engine: Handle, +) -> ExternResult { + let txn = unsafe { txn.into_inner() }; + let extern_engine = unsafe { engine.as_ref() }; + let engine = extern_engine.engine(); + commit_result_to_version(txn.commit(engine.as_ref())).into_extern_result(&extern_engine) +} + +// ============================================================================ +// Create Table DDL +// ============================================================================ + +/// A handle representing an exclusive [`CreateTableTransactionBuilder`]. +/// +/// The caller must eventually either call [`create_table_builder_build`] (which consumes the +/// handle and returns a transaction) or [`free_create_table_builder`] (which drops it without +/// creating anything). +#[handle_descriptor(target=CreateTableTransactionBuilder, mutable=true, sized=true)] +pub struct ExclusiveCreateTableBuilder; + +// TODO: Add `create_table_builder_with_data_layout` FFI function to support partitioned and +// clustered table creation. The kernel's +// `CreateTableTransactionBuilder::with_data_layout(DataLayout)` supports this but is not yet +// exposed through FFI. + +/// Create a new [`CreateTableTransactionBuilder`] for creating a Delta table at the given path. +/// +/// The returned builder can be configured with [`create_table_builder_with_table_property`] +/// before building with [`create_table_builder_build`]. The engine is only used for error +/// reporting at this stage. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid `path`, `schema`, `engine_info`, and `engine`. +/// Does NOT consume the `schema` handle -- the caller is still responsible for freeing it. +#[no_mangle] +pub unsafe extern "C" fn get_create_table_builder( + path: KernelStringSlice, + schema: Handle, + engine_info: KernelStringSlice, + engine: Handle, +) -> ExternResult> { + let engine = unsafe { engine.as_ref() }; + let path = unsafe { TryFromStringSlice::try_from_slice(&path) }; + let info = unsafe { TryFromStringSlice::try_from_slice(&engine_info) }; + let schema = unsafe { schema.clone_as_arc() }; + get_create_table_builder_impl(path, schema, info).into_extern_result(&engine) +} + +fn get_create_table_builder_impl( + path: DeltaResult<&str>, + schema: Arc, + engine_info: DeltaResult<&str>, +) -> DeltaResult> { + let builder = delta_kernel::transaction::create_table::create_table( + path?, + schema, + engine_info?.to_string(), + ); + Ok(Box::new(builder).into()) +} + +/// Add a single table property to a [`CreateTableTransactionBuilder`]. +/// +/// This consumes the builder handle and returns a new one. The caller MUST replace their handle +/// pointer with the returned handle. On error, the old builder handle is consumed and gone -- +/// do not free or reuse it. There is no new handle to free either. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid builder handle, `key`, `value`, and `engine`. +/// CONSUMES the builder handle unconditionally (even on error). 
+#[no_mangle] +pub unsafe extern "C" fn create_table_builder_with_table_property( + builder: Handle, + key: KernelStringSlice, + value: KernelStringSlice, + engine: Handle, +) -> ExternResult> { + let engine = unsafe { engine.as_ref() }; + let builder = unsafe { *builder.into_inner() }; + let key = unsafe { TryFromStringSlice::try_from_slice(&key) }; + let value = unsafe { TryFromStringSlice::try_from_slice(&value) }; + create_table_builder_with_table_property_impl(builder, key, value).into_extern_result(&engine) +} + +fn create_table_builder_with_table_property_impl( + builder: CreateTableTransactionBuilder, + key: DeltaResult, + value: DeltaResult, +) -> DeltaResult> { + let builder = builder.with_table_properties([(key?, value?)]); + Ok(Box::new(builder).into()) +} + +/// Build a create-table transaction using the default [`FileSystemCommitter`]. Returns a +/// create-table transaction handle that can be used with [`create_table_add_files`], +/// [`create_table_set_data_change`], [`create_table_with_engine_info`], and +/// [`create_table_commit`] to optionally stage initial data before committing. +/// +/// # Safety +/// +/// Caller is responsible for passing valid builder and engine handles. +/// CONSUMES the builder handle -- caller must not use it after this call. +#[no_mangle] +pub unsafe extern "C" fn create_table_builder_build( + builder: Handle, + engine: Handle, +) -> ExternResult> { + let builder = unsafe { *builder.into_inner() }; + let extern_engine = unsafe { engine.as_ref() }; + let committer = Box::new(FileSystemCommitter::new()); + create_table_builder_build_impl(builder, committer, extern_engine) + .into_extern_result(&extern_engine) +} + +/// Build a create-table transaction with a custom committer. Same as +/// [`create_table_builder_build`] but uses the provided committer instead of the default. +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles. +/// CONSUMES both the builder and committer handles -- caller must not use them after this call. +#[no_mangle] +pub unsafe extern "C" fn create_table_builder_build_with_committer( + builder: Handle, + committer: Handle, + engine: Handle, +) -> ExternResult> { + let builder = unsafe { *builder.into_inner() }; + let committer = unsafe { committer.into_inner() }; + let extern_engine = unsafe { engine.as_ref() }; + create_table_builder_build_impl(builder, committer, extern_engine) + .into_extern_result(&extern_engine) +} + +fn create_table_builder_build_impl( + builder: CreateTableTransactionBuilder, + committer: Box, + extern_engine: &dyn ExternEngine, +) -> DeltaResult> { + let engine = extern_engine.engine(); + let create_txn = builder.build(engine.as_ref(), committer)?; + Ok(Box::new(create_txn).into()) +} + +/// Free a [`CreateTableTransactionBuilder`] without building. +/// +/// Use this on failure paths when the builder will not be built. +/// +/// # Safety +/// +/// Caller is responsible for passing a valid handle. +#[no_mangle] +pub unsafe extern "C" fn free_create_table_builder(builder: Handle) { + builder.drop_handle(); +} + +// ============================================================================ +// Remove Files DML +// ============================================================================ + +/// Remove files from a transaction using engine data and a selection vector. +/// +/// The `data` handle is consumed. The selection vector indicates which rows in `data` represent +/// files to remove: nonzero means the row is selected for removal, `0` means it is skipped. 
+/// If `selection_vector` is null or `selection_vector_len` is 0, all rows are selected. When +/// `selection_vector_len` is 0, the `selection_vector` pointer is not accessed and may be null +/// or any arbitrary value. +/// +/// The `data` and `selection_vector` should be derived from +/// [`scan_metadata_next`](crate::scan::scan_metadata_next): `data` is the engine data batch and +/// `selection_vector` is the scan's selection vector, modified to select only the rows (files) to +/// remove. Selecting rows that were not active in the original scan selection vector produces +/// invalid Remove actions in the commit log. +/// +/// Note: Unlike [`add_files`], this function takes an `engine` handle and returns +/// [`ExternResult`] because the selection vector validation can fail. Returns `true` on +/// success (the value itself is not meaningful). +/// +/// # Safety +/// +/// Caller is responsible for passing valid handles. The `selection_vector` pointer must be valid +/// for `selection_vector_len` bytes, or null. Consumes the `data` handle. Does NOT consume +/// the `txn` handle. +#[no_mangle] +pub unsafe extern "C" fn remove_files( + mut txn: Handle, + data: Handle, + selection_vector: *const u8, + selection_vector_len: usize, + engine: Handle, +) -> ExternResult { + let engine = unsafe { engine.as_ref() }; + let data = unsafe { data.into_inner() }; + let txn = unsafe { txn.as_mut() }; + // empty sv = all rows selected (per FilteredEngineData contract) + let sv = if selection_vector.is_null() || selection_vector_len == 0 { + vec![] + } else { + let raw = unsafe { std::slice::from_raw_parts(selection_vector, selection_vector_len) }; + raw.iter().map(|&b| b != 0).collect() + }; + let result: DeltaResult = (|| { + let filtered = FilteredEngineData::try_new(data, sv)?; + txn.remove_files(filtered); + Ok(true) + })(); + result.into_extern_result(&engine) } #[cfg(test)] mod tests { use delta_kernel::schema::{DataType, StructField, StructType}; + use delta_kernel::table_features::TableFeature; use delta_kernel::arrow::array::{Array, ArrayRef, Int32Array, StringArray, StructArray}; use delta_kernel::arrow::datatypes::Schema as ArrowSchema; @@ -154,23 +563,28 @@ mod tests { use delta_kernel::engine::arrow_conversion::TryIntoArrow; use delta_kernel::engine::arrow_data::ArrowEngineData; + use delta_kernel::object_store::path::Path; + use delta_kernel::object_store::ObjectStore; use delta_kernel::parquet::arrow::arrow_writer::ArrowWriter; use delta_kernel::parquet::file::properties::WriterProperties; use delta_kernel_ffi::engine_data::get_engine_data; use delta_kernel_ffi::engine_data::ArrowFFIData; - use delta_kernel_ffi::ffi_test_utils::{allocate_str, ok_or_panic, recover_string}; + use delta_kernel_ffi::error::KernelError; + use delta_kernel_ffi::ffi_test_utils::{ + allocate_err, allocate_str, assert_extern_result_error_with_message, build_snapshot, + ok_or_panic, recover_error, recover_string, + }; use delta_kernel_ffi::tests::get_default_engine; - use crate::{free_engine, free_schema, kernel_string_slice}; + use crate::{free_engine, free_schema, free_snapshot, kernel_string_slice}; + use crate::{logical_schema, version}; use write_context::{free_write_context, get_write_context, get_write_path, get_write_schema}; use test_utils::{set_json_value, setup_test_tables, test_read}; use itertools::Itertools; - use object_store::path::Path; - use object_store::ObjectStore; use serde_json::json; use serde_json::Deserializer; @@ -208,6 +622,7 @@ mod tests { fn create_file_metadata( path: &str, + 
file_size_bytes: u64, num_rows: i64, metadata_schema: ArrowSchema, ) -> Result> { @@ -217,7 +632,7 @@ mod tests { .as_millis() as i64; let file_metadata = format!( - r#"{{"path":"{path}", "partitionValues": {{}}, "size": {num_rows}, "modificationTime": {current_time}, "stats": {{"numRecords": {num_rows}}}}}"#, + r#"{{"path":"{path}", "partitionValues": {{}}, "size": {file_size_bytes}, "modificationTime": {current_time}, "stats": {{"numRecords": {num_rows}}}}}"#, ); create_arrow_ffi_from_json(metadata_schema, file_metadata.as_str()) @@ -241,7 +656,14 @@ mod tests { // writer must be closed to write footer let res = writer.close().unwrap(); - create_file_metadata(file_path, res.num_rows, metadata_schema) + let file_size_bytes = std::fs::metadata(&full_path)?.len(); + + #[cfg(any(not(feature = "arrow-56"), feature = "arrow-57"))] + let num_rows = res.file_metadata().num_rows(); + #[cfg(all(feature = "arrow-56", not(feature = "arrow-57")))] + let num_rows = res.num_rows; + + create_file_metadata(file_path, file_size_bytes, num_rows, metadata_schema) } #[tokio::test] @@ -343,11 +765,7 @@ mod tests { )?; let file_info_engine_data = ok_or_panic(unsafe { - get_engine_data( - file_info.array, - &file_info.schema, - crate::ffi_test_utils::allocate_err, - ) + get_engine_data(file_info.array, &file_info.schema, allocate_err) }); unsafe { add_files(txn_with_engine_info.shallow_copy(), file_info_engine_data) }; @@ -410,4 +828,947 @@ mod tests { Ok(()) } + + /// Read the commit log at `version` and return the `domainMetadata` action JSON. + async fn read_domain_metadata_action( + store: &dyn ObjectStore, + table_url: &Url, + version: u64, + ) -> serde_json::Value { + let path = format!("_delta_log/{version:020}.json"); + let commit_url = table_url.join(&path).unwrap(); + let data = store + .get(&Path::from_url_path(commit_url.path()).unwrap()) + .await + .unwrap(); + let actions: Vec = + Deserializer::from_slice(&data.bytes().await.unwrap()) + .into_iter::() + .try_collect() + .unwrap(); + actions + .into_iter() + .find(|a| a.get("domainMetadata").is_some()) + .expect("commit should contain a domainMetadata action") + } + + /// Create a table with the `domainMetadata` writer feature enabled and return the table + /// URL, object store, and FFI engine handle. 
+ async fn setup_domain_metadata_table( + dir_url: &Url, + name: &str, + ) -> Result<(Url, Arc, Handle), Box> + { + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + let (store, _test_engine, table_location) = + test_utils::engine_store_setup(name, Some(dir_url)); + let table_url = test_utils::create_table( + store.clone(), + table_location, + schema, + &[], + true, + vec![], + vec!["domainMetadata"], + ) + .await?; + let table_path = table_url.to_file_path().unwrap(); + let table_path_str = table_path.to_str().unwrap(); + let engine = get_default_engine(table_path_str); + Ok((table_url, store, engine)) + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_domain_metadata_add_and_remove() -> Result<(), Box> { + let tmp_test_dir = tempdir()?; + let tmp_dir_url = Url::from_directory_path(tmp_test_dir.path()).unwrap(); + let (table_url, store, engine) = + setup_domain_metadata_table(&tmp_dir_url, "test_dm").await?; + let table_path = table_url.to_file_path().unwrap(); + let table_path_str = table_path.to_str().unwrap(); + + // === Transaction 1: add domain metadata === + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }); + unsafe { set_data_change(txn.shallow_copy(), false) }; + + let domain = "testDomain"; + let configuration = r#"{"key": "value"}"#; + let txn = ok_or_panic(unsafe { + with_domain_metadata( + txn, + kernel_string_slice!(domain), + kernel_string_slice!(configuration), + engine.shallow_copy(), + ) + }); + + let version = ok_or_panic(unsafe { commit(txn, engine.shallow_copy()) }); + assert_eq!(version, 1); + + let dm = read_domain_metadata_action(&*store, &table_url, 1).await; + assert_eq!(dm["domainMetadata"]["domain"], "testDomain"); + assert_eq!(dm["domainMetadata"]["configuration"], r#"{"key": "value"}"#); + assert_eq!(dm["domainMetadata"]["removed"], false); + + // === Transaction 2: remove domain metadata === + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }); + unsafe { set_data_change(txn.shallow_copy(), false) }; + + let txn = ok_or_panic(unsafe { + with_domain_metadata_removed(txn, kernel_string_slice!(domain), engine.shallow_copy()) + }); + + let version = ok_or_panic(unsafe { commit(txn, engine.shallow_copy()) }); + assert_eq!(version, 2); + + let dm = read_domain_metadata_action(&*store, &table_url, 2).await; + assert_eq!(dm["domainMetadata"]["domain"], "testDomain"); + assert_eq!(dm["domainMetadata"]["removed"], true); + assert_eq!(dm["domainMetadata"]["configuration"], r#"{"key": "value"}"#); + + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_domain_metadata_system_domain_rejected_at_commit( + ) -> Result<(), Box> { + let tmp_test_dir = tempdir()?; + let tmp_dir_url = Url::from_directory_path(tmp_test_dir.path()).unwrap(); + let (table_url, _store, engine) = + setup_domain_metadata_table(&tmp_dir_url, "test_dm_sys").await?; + let table_path = table_url.to_file_path().unwrap(); + let table_path_str = table_path.to_str().unwrap(); + + // with_domain_metadata succeeds (validation is lazy), but commit should fail + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }); + unsafe { set_data_change(txn.shallow_copy(), false) }; + + let sys_domain = "delta.system"; + let config = "config"; + let txn = ok_or_panic(unsafe { + with_domain_metadata( + txn, + 
kernel_string_slice!(sys_domain), + kernel_string_slice!(config), + engine.shallow_copy(), + ) + }); + + let result = unsafe { commit(txn, engine.shallow_copy()) }; + assert_extern_result_error_with_message( + result, + KernelError::GenericError, + Some("Generic delta kernel error: Cannot modify domains that start with 'delta.' as those are system controlled"), + ); + + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_domain_metadata_duplicate_domain_rejected_at_commit( + ) -> Result<(), Box> { + let tmp_test_dir = tempdir()?; + let tmp_dir_url = Url::from_directory_path(tmp_test_dir.path()).unwrap(); + let (table_url, _store, engine) = + setup_domain_metadata_table(&tmp_dir_url, "test_dm_dup").await?; + let table_path = table_url.to_file_path().unwrap(); + let table_path_str = table_path.to_str().unwrap(); + + // Adding the same domain twice should cause commit to fail + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }); + unsafe { set_data_change(txn.shallow_copy(), false) }; + + let dup_domain = "dup"; + let config_a = "a"; + let config_b = "b"; + let txn = ok_or_panic(unsafe { + with_domain_metadata( + txn, + kernel_string_slice!(dup_domain), + kernel_string_slice!(config_a), + engine.shallow_copy(), + ) + }); + let txn = ok_or_panic(unsafe { + with_domain_metadata( + txn, + kernel_string_slice!(dup_domain), + kernel_string_slice!(config_b), + engine.shallow_copy(), + ) + }); + + let result = unsafe { commit(txn, engine.shallow_copy()) }; + assert_extern_result_error_with_message( + result, + KernelError::GenericError, + Some("Generic delta kernel error: Metadata for domain dup already specified in this transaction"), + ); + + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_domain_metadata_rejected_without_feature( + ) -> Result<(), Box> { + let tmp_test_dir = tempdir()?; + let tmp_dir_url = Url::from_directory_path(tmp_test_dir.path()).unwrap(); + + // Create a table WITHOUT the domainMetadata writer feature (v1/v1 protocol) + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + let (store, _test_engine, table_location) = + test_utils::engine_store_setup("test_dm_no_feature", Some(&tmp_dir_url)); + let table_url = test_utils::create_table( + store.clone(), + table_location, + schema, + &[], + false, + vec![], + vec![], + ) + .await?; + let table_path = table_url.to_file_path().unwrap(); + let table_path_str = table_path.to_str().unwrap(); + let engine = get_default_engine(table_path_str); + + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }); + unsafe { set_data_change(txn.shallow_copy(), false) }; + + let domain = "myDomain"; + let config = "config"; + let txn = ok_or_panic(unsafe { + with_domain_metadata( + txn, + kernel_string_slice!(domain), + kernel_string_slice!(config), + engine.shallow_copy(), + ) + }); + + let result = unsafe { commit(txn, engine.shallow_copy()) }; + assert_extern_result_error_with_message( + result, + KernelError::UnsupportedError, + Some("Unsupported: Domain metadata operations require writer version 7 and the 'domainMetadata' writer feature"), + ); + + unsafe { free_engine(engine) }; + Ok(()) + } + + #[cfg(feature = "delta-kernel-unity-catalog")] + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[cfg_attr(miri, ignore)] + async fn 
test_transaction_with_uc_committer() -> Result<(), Box> { + use crate::delta_kernel_unity_catalog::{ + free_uc_commit_client, get_uc_commit_client, get_uc_committer, + tests::{cast_test_context, get_test_context, recover_test_context}, + CommitRequest, + }; + use crate::{Handle, NullableCvoid, OptionalValue}; + + #[no_mangle] + extern "C" fn test_uc_commit( + context: NullableCvoid, + request: CommitRequest, + ) -> OptionalValue> { + let context = cast_test_context(context).unwrap(); + context.commit_called = true; + + let table_id = unsafe { String::try_from_slice(&request.table_id).unwrap() }; + let table_uri = unsafe { String::try_from_slice(&request.table_uri).unwrap() }; + + context.last_commit_request = Some((table_id.clone(), table_uri.clone())); + + // Capture the staged commit file name if present + if let OptionalValue::Some(commit_info) = request.commit_info { + let file_name = unsafe { String::try_from_slice(&commit_info.file_name).unwrap() }; + context.last_staged_filename = Some(file_name); + } + + OptionalValue::None + } + + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("number", DataType::INTEGER), + StructField::nullable("string", DataType::STRING), + ])); + + let tmp_test_dir = tempdir()?; + let tmp_dir_local_url = Url::from_directory_path(tmp_test_dir.path()).unwrap(); + + // Create a catalog-managed table so UCCommitter (a catalog committer) is allowed. + let (store, _test_engine, table_location) = + test_utils::engine_store_setup("test_uc_table", Some(&tmp_dir_local_url)); + let table_url = test_utils::create_table( + store.clone(), + table_location, + schema, + &[], + true, // use v3/v7 protocol + vec!["catalogManaged", "vacuumProtocolCheck"], + vec!["inCommitTimestamp", "catalogManaged", "vacuumProtocolCheck"], + ) + .await?; + + { + let table_path = table_url.to_file_path().unwrap(); + let table_path_str = table_path.to_str().unwrap(); + let engine = get_default_engine(table_path_str); + + let snapshot = unsafe { + build_snapshot(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }; + + let context = get_test_context(false); + + let uc_client = unsafe { get_uc_commit_client(context, test_uc_commit) }; + let table_id = "test_id"; + let uc_committer = unsafe { + ok_or_panic(get_uc_committer( + uc_client.shallow_copy(), + kernel_string_slice!(table_id), + allocate_err, + )) + }; + + let txn = ok_or_panic(unsafe { + transaction_with_committer(snapshot, engine.shallow_copy(), uc_committer) + }); + unsafe { set_data_change(txn.shallow_copy(), false) }; + + let engine_info = "uc_test_engine"; + let engine_info_kernel_string = kernel_string_slice!(engine_info); + let txn_with_engine_info = unsafe { + ok_or_panic(with_engine_info( + txn, + engine_info_kernel_string, + engine.shallow_copy(), + )) + }; + + let write_context = unsafe { get_write_context(txn_with_engine_info.shallow_copy()) }; + + let batch = RecordBatch::try_from_iter(vec![ + ( + "number", + Arc::new(Int32Array::from(vec![10, 20, 30])) as ArrayRef, + ), + ( + "string", + Arc::new(StringArray::from(vec!["uc-1", "uc-2", "uc-3"])) as ArrayRef, + ), + ]) + .unwrap(); + + let parquet_schema = unsafe { + txn_with_engine_info + .shallow_copy() + .as_ref() + .add_files_schema() + }; + let file_info = write_parquet_file( + table_path_str, + "uc_test_file.parquet", + &batch, + parquet_schema.as_ref().try_into_arrow()?, + )?; + + let file_info_engine_data = ok_or_panic(unsafe { + get_engine_data(file_info.array, &file_info.schema, allocate_err) + }); + + unsafe { 
add_files(txn_with_engine_info.shallow_copy(), file_info_engine_data) }; + + let commit_result = unsafe { commit(txn_with_engine_info, engine.shallow_copy()) }; + + // UC committer returns success from our mock callback + assert!(commit_result.is_ok(), "Commit should succeed"); + + let context = recover_test_context(context).unwrap(); + + assert!( + context.commit_called, + "Commit callback should be called after commit" + ); + + { + // scope so we don't hold mutex across the await lower down + let (last_table_id, _) = context.last_commit_request.unwrap(); + assert_eq!( + last_table_id, "test_id", + "Table ID should match the one passed to UCCommitter" + ); + } + + let staged_file_name = { + // scope so we don't hold mutex across await + assert!( + context.last_staged_filename.is_some(), + "Staged commit file name should be captured" + ); + + context.last_staged_filename.clone().unwrap() + }; + + // Read the staged commit + let staged_commit_url = table_url + .join(&format!("_delta_log/_staged_commits/{staged_file_name}")) + .unwrap(); + let staged_commit = store + .get(&Path::from_url_path(staged_commit_url.path()).unwrap()) + .await?; + let staged_content = staged_commit.bytes().await?; + let staged_actions: Vec<_> = Deserializer::from_slice(&staged_content) + .into_iter::() + .try_collect()?; + + assert!( + !staged_actions.is_empty(), + "Staged commit should have actions" + ); + + // Should have commitInfo and add action + let has_commit_info = staged_actions.iter().any(|a| a.get("commitInfo").is_some()); + let has_add = staged_actions.iter().any(|a| a.get("add").is_some()); + + assert!(has_commit_info, "Staged commit should contain commitInfo"); + assert!(has_add, "Staged commit should contain add action"); + + let add_action = staged_actions + .iter() + .find(|a| a.get("add").is_some()) + .unwrap(); + assert_eq!( + add_action["add"]["path"].as_str().unwrap(), + "uc_test_file.parquet", + "Add action should reference the correct file" + ); + + let commit_info = staged_actions + .iter() + .find(|a| a.get("commitInfo").is_some()) + .unwrap(); + assert_eq!( + commit_info["commitInfo"]["engineInfo"].as_str().unwrap(), + "uc_test_engine", + "CommitInfo should contain the engine info" + ); + + unsafe { free_write_context(write_context) }; + unsafe { free_engine(engine) }; + unsafe { free_uc_commit_client(uc_client) }; + } + + Ok(()) + } + + /// Create a [`CreateTableTransactionBuilder`] handle via the FFI, using the given schema + /// fields. Returns `(table_path, engine_handle, builder_handle)`. The caller is responsible + /// for freeing/consuming the engine and builder handles. + fn create_table_builder( + tmp_dir: &tempfile::TempDir, + fields: Vec, + ) -> ( + String, + Handle, + Handle, + ) { + let table_path = tmp_dir.path().to_str().unwrap().to_string(); + let schema = Arc::new(StructType::try_new(fields).unwrap()); + let engine = get_default_engine(&table_path); + let schema_handle: Handle = schema.into(); + let engine_info = "test-engine/1.0"; + let builder = ok_or_panic(unsafe { + get_create_table_builder( + kernel_string_slice!(table_path), + schema_handle.shallow_copy(), + kernel_string_slice!(engine_info), + engine.shallow_copy(), + ) + }); + // get_create_table_builder does NOT consume the schema handle -- free it + unsafe { free_schema(schema_handle) }; + (table_path, engine, builder) + } + + /// Build and commit a create-table builder with default (null) committer, asserting that + /// the committed version is 0. 
+ fn build_and_commit( + builder: Handle, + engine: &Handle, + ) -> u64 { + let txn = + ok_or_panic(unsafe { create_table_builder_build(builder, engine.shallow_copy()) }); + let version = ok_or_panic(unsafe { create_table_commit(txn, engine.shallow_copy()) }); + assert_eq!(version, 0); + version + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_create_table_basic() -> Result<(), Box> { + let tmp_dir = tempdir()?; + let (table_path, engine, builder) = create_table_builder( + &tmp_dir, + vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("name", DataType::STRING), + ], + ); + let table_path_str = table_path.as_str(); + + build_and_commit(builder, &engine); + + // Verify by opening a snapshot of the created table + let snap = + unsafe { build_snapshot(kernel_string_slice!(table_path_str), engine.shallow_copy()) }; + let snap_version = unsafe { version(snap.shallow_copy()) }; + assert_eq!(snap_version, 0); + + // Verify schema + let snap_schema = unsafe { logical_schema(snap.shallow_copy()) }; + let snap_schema_ref = unsafe { snap_schema.as_ref() }; + assert_eq!(snap_schema_ref.num_fields(), 2); + assert_eq!(snap_schema_ref.field_at_index(0).unwrap().name, "id"); + assert_eq!(snap_schema_ref.field_at_index(1).unwrap().name, "name"); + + unsafe { free_schema(snap_schema) }; + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_create_table_with_properties() { + let tmp_dir = tempdir().unwrap(); + let (table_path, engine, builder) = create_table_builder( + &tmp_dir, + vec![StructField::nullable("id", DataType::INTEGER)], + ); + + // Set properties + let prop_key1 = "delta.appendOnly"; + let prop_val1 = "true"; + let builder = ok_or_panic(unsafe { + create_table_builder_with_table_property( + builder, + kernel_string_slice!(prop_key1), + kernel_string_slice!(prop_val1), + engine.shallow_copy(), + ) + }); + let prop_key2 = "custom.key"; + let prop_val2 = "custom_value"; + let builder = ok_or_panic(unsafe { + create_table_builder_with_table_property( + builder, + kernel_string_slice!(prop_key2), + kernel_string_slice!(prop_val2), + engine.shallow_copy(), + ) + }); + + build_and_commit(builder, &engine); + + // Verify properties via snapshot + let table_path_str = table_path.as_str(); + let snap = + unsafe { build_snapshot(kernel_string_slice!(table_path_str), engine.shallow_copy()) }; + let snap_ref = unsafe { snap.as_ref() }; + + // Verify parsed table properties + let props = snap_ref.table_properties(); + assert_eq!(props.append_only, Some(true)); + assert_eq!( + props + .unknown_properties + .get("custom.key") + .map(|s| s.as_str()), + Some("custom_value") + ); + + // Verify feature is enabled in protocol (property set + protocol supports it) + let config = snap_ref.table_configuration(); + assert!(config.is_feature_enabled(&TableFeature::AppendOnly)); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_create_table_already_exists() -> Result<(), Box> { + let tmp_dir = tempdir()?; + + // Create the table first time -- should succeed + let (_table_path, engine, builder) = create_table_builder( + &tmp_dir, + vec![StructField::nullable("id", DataType::INTEGER)], + ); + build_and_commit(builder, &engine); + + // Try to create the same table again -- build should error (table already exists) + let (_, engine2, builder2) = create_table_builder( + &tmp_dir, + vec![StructField::nullable("id", 
DataType::INTEGER)], + ); + let result = unsafe { create_table_builder_build(builder2, engine2.shallow_copy()) }; + match result { + ExternResult::Err(e) => { + // Clean up the error to prevent leaks + let error = unsafe { recover_error(e) }; + assert!( + error.message.contains("already exists"), + "Expected 'already exists' error, got: {}", + error.message + ); + } + ExternResult::Ok(txn) => { + unsafe { create_table_free_transaction(txn) }; + panic!("Expected error for table that already exists"); + } + } + + unsafe { free_engine(engine2) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + #[test] + fn test_free_create_table_builder() { + let schema = Arc::new( + StructType::try_new(vec![StructField::nullable("id", DataType::INTEGER)]).unwrap(), + ); + let builder = + delta_kernel::transaction::create_table::create_table("memory:///test", schema, "test"); + let handle: Handle = Box::new(builder).into(); + // Should not panic or leak + unsafe { free_create_table_builder(handle) }; + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_create_table_build_with_empty_schema_returns_error( + ) -> Result<(), Box> { + let tmp_dir = tempdir()?; + // An empty schema is always invalid for table creation + let (_table_path, engine, builder) = create_table_builder(&tmp_dir, vec![]); + + let result = unsafe { create_table_builder_build(builder, engine.shallow_copy()) }; + match result { + ExternResult::Err(e) => { + let error = unsafe { recover_error(e) }; + assert!( + error.message.contains("schema") + || error.message.contains("field") + || error.message.contains("empty"), + "Expected schema-related error, got: {}", + error.message + ); + } + ExternResult::Ok(txn) => { + unsafe { create_table_free_transaction(txn) }; + panic!("Expected error for empty schema"); + } + } + + unsafe { free_engine(engine) }; + Ok(()) + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_create_table_with_custom_committer() -> Result<(), Box> { + let tmp_dir = tempdir()?; + let (table_path, engine, builder) = create_table_builder( + &tmp_dir, + vec![StructField::nullable("id", DataType::INTEGER)], + ); + let table_path_str = table_path.as_str(); + + // Create a FileSystemCommitter handle and pass it to build_with_committer + let committer: Box = + Box::new(FileSystemCommitter::new()); + let committer_handle: Handle = committer.into(); + + let txn = ok_or_panic(unsafe { + create_table_builder_build_with_committer( + builder, + committer_handle, + engine.shallow_copy(), + ) + }); + let committed_version = + ok_or_panic(unsafe { create_table_commit(txn, engine.shallow_copy()) }); + assert_eq!(committed_version, 0); + + // Verify the table was created + let snap = + unsafe { build_snapshot(kernel_string_slice!(table_path_str), engine.shallow_copy()) }; + assert_eq!(unsafe { version(snap.shallow_copy()) }, 0); + + unsafe { free_snapshot(snap) }; + unsafe { free_engine(engine) }; + Ok(()) + } + + /// Helper: create a table, write one parquet file, and return (table_path, engine_handle). + /// The caller is responsible for freeing the engine handle. 
+ fn create_table_with_one_file( + tmp_dir: &tempfile::TempDir, + ) -> Result<(String, Handle), Box> { + let table_path = tmp_dir.path().to_str().unwrap(); + let schema = Arc::new(StructType::try_new(vec![ + StructField::nullable("number", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])?); + + let engine = get_default_engine(table_path); + let schema_handle: Handle = schema.into(); + + // Create the table + let engine_info = "test-engine/1.0"; + let builder = ok_or_panic(unsafe { + get_create_table_builder( + kernel_string_slice!(table_path), + schema_handle.shallow_copy(), + kernel_string_slice!(engine_info), + engine.shallow_copy(), + ) + }); + // get_create_table_builder does NOT consume the schema handle -- free it + unsafe { free_schema(schema_handle) }; + build_and_commit(builder, &engine); + + // Write a parquet file and commit it + let batch = RecordBatch::try_from_iter(vec![ + ( + "number", + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + ), + ( + "value", + Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef, + ), + ])?; + + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path), engine.shallow_copy()) + }); + let engine_info = "test-engine/1.0"; + let txn = ok_or_panic(unsafe { + with_engine_info( + txn, + kernel_string_slice!(engine_info), + engine.shallow_copy(), + ) + }); + + let parquet_schema = unsafe { txn.shallow_copy().as_ref().add_files_schema() }; + let file_info = write_parquet_file( + table_path, + "file1.parquet", + &batch, + parquet_schema.as_ref().try_into_arrow()?, + )?; + let file_info_engine_data = ok_or_panic(unsafe { + get_engine_data( + file_info.array, + &file_info.schema, + crate::ffi_test_utils::allocate_err, + ) + }); + unsafe { add_files(txn.shallow_copy(), file_info_engine_data) }; + ok_or_panic(unsafe { commit(txn, engine.shallow_copy()) }); + + Ok((table_path.to_string(), engine)) + } + + fn assert_no_active_files( + kernel_engine: &Arc, + table_path: &str, + ) -> Result<(), Box> { + let snapshot = + delta_kernel::snapshot::Snapshot::builder_for(delta_kernel::try_parse_uri(table_path)?) + .build(kernel_engine.as_ref())?; + let scan = snapshot.scan_builder().build()?; + let scan_meta: Vec<_> = scan + .scan_metadata(kernel_engine.as_ref())? + .collect::>()?; + let total_selected: usize = scan_meta + .iter() + .map(|m| { + let sv = m.scan_files.selection_vector(); + let data_len = m.scan_files.data().len(); + if sv.is_empty() { + data_len + } else { + sv.iter().filter(|&&b| b).count() + } + }) + .sum(); + assert_eq!(total_selected, 0, "Expected 0 files after removal"); + Ok(()) + } + + /// Helper: create a table with one file, build a snapshot, extract scan metadata, and + /// return the components needed for remove_files tests. Caller must free the engine handle + /// (and txn if not committed). + #[allow(clippy::type_complexity)] + fn setup_remove_files_test( + tmp_dir: &tempfile::TempDir, + ) -> Result< + ( + Box, + Vec, + Handle, + Handle, + Arc, + String, + ), + Box, + > { + let (table_path, engine) = create_table_with_one_file(tmp_dir)?; + let table_path_str = table_path.as_str(); + + let kernel_engine = unsafe { engine.as_ref() }.engine(); + let snapshot = delta_kernel::snapshot::Snapshot::builder_for(delta_kernel::try_parse_uri( + table_path_str, + )?) + .build(kernel_engine.as_ref())?; + + let scan = snapshot.scan_builder().build()?; + let scan_meta_items: Vec<_> = scan + .scan_metadata(kernel_engine.as_ref())? 
+ .collect::>()?; + assert_eq!(scan_meta_items.len(), 1); + + let scan_meta = scan_meta_items.into_iter().next().unwrap(); + let (data, sv) = scan_meta.scan_files.into_parts(); + + let txn = ok_or_panic(unsafe { + transaction(kernel_string_slice!(table_path_str), engine.shallow_copy()) + }); + let engine_info = "test-engine/1.0"; + let txn = ok_or_panic(unsafe { + with_engine_info( + txn, + kernel_string_slice!(engine_info), + engine.shallow_copy(), + ) + }); + + Ok((data, sv, txn, engine, kernel_engine, table_path)) + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_remove_files_with_null_sv_commits_and_removes_all( + ) -> Result<(), Box> { + let tmp_dir = tempdir()?; + let (data, sv, txn, engine, kernel_engine, table_path) = setup_remove_files_test(&tmp_dir)?; + let data_handle: Handle = data.into(); + + // Pass the original SV as u8 values + let sv_u8: Vec = sv.iter().map(|&b| b as u8).collect(); + let sv_ptr = if sv_u8.is_empty() { + std::ptr::null() + } else { + sv_u8.as_ptr() + }; + ok_or_panic(unsafe { + remove_files( + txn.shallow_copy(), + data_handle, + sv_ptr, + sv_u8.len(), + engine.shallow_copy(), + ) + }); + + ok_or_panic(unsafe { commit(txn, engine.shallow_copy()) }); + assert_no_active_files(&kernel_engine, table_path.as_str())?; + + unsafe { free_engine(engine) }; + Ok(()) + } + + #[test] + fn test_empty_sv_creates_filtered_data_selecting_all_rows() { + // Verifies the FilteredEngineData contract that an empty selection vector means + // "all rows selected". The remove_files FFI wrapper normalizes null/zero-length + // pointers to an empty vec before constructing FilteredEngineData. + let batch = RecordBatch::try_from_iter(vec![( + "id", + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]) + .unwrap(); + let data: Box = Box::new(ArrowEngineData::from(batch)); + + let filtered = FilteredEngineData::try_new(data, vec![]).unwrap(); + assert_eq!(filtered.selection_vector().len(), 0); + assert_eq!(filtered.data().len(), 3); + } + + #[tokio::test] + #[cfg_attr(miri, ignore)] + async fn test_remove_files_with_non_empty_sv_exercises_from_raw_parts( + ) -> Result<(), Box> { + // Exercises the from_raw_parts code path in the remove_files FFI wrapper by passing + // a non-null selection vector pointer with non-zero length. The null-SV test always + // passes a null pointer because scan_metadata for a single-file table returns an + // empty selection vector. + let tmp_dir = tempdir()?; + let (data, _sv, txn, engine, _kernel_engine, _table_path) = + setup_remove_files_test(&tmp_dir)?; + let num_rows = data.len(); + assert!(num_rows > 0); + let data_handle: Handle = data.into(); + + // Construct a non-empty all-true SV selecting every row. Uses u8 values to match + // the FFI signature (nonzero = selected). + let sv: Vec = vec![1; num_rows]; + ok_or_panic(unsafe { + remove_files( + txn.shallow_copy(), + data_handle, + sv.as_ptr(), + sv.len(), + engine.shallow_copy(), + ) + }); + + // Don't commit -- the from_raw_parts path has been exercised. Committing with + // a non-empty SV triggers a different code path in generate_remove_actions that + // requires additional scan row fields not present in this test setup. 
+ unsafe { free_transaction(txn) }; + unsafe { free_engine(engine) }; + Ok(()) + } } diff --git a/ffi/src/transaction/transaction_id.rs b/ffi/src/transaction/transaction_id.rs index face362986..32b234cade 100644 --- a/ffi/src/transaction/transaction_id.rs +++ b/ffi/src/transaction/transaction_id.rs @@ -142,18 +142,9 @@ mod tests { .build(&engine)?; // Check versions - assert_eq!( - snapshot.clone().get_app_id_version("app_id1", &engine)?, - Some(1) - ); - assert_eq!( - snapshot.clone().get_app_id_version("app_id2", &engine)?, - Some(2) - ); - assert_eq!( - snapshot.clone().get_app_id_version("app_id3", &engine)?, - None - ); + assert_eq!(snapshot.get_app_id_version("app_id1", &engine)?, Some(1)); + assert_eq!(snapshot.get_app_id_version("app_id2", &engine)?, Some(2)); + assert_eq!(snapshot.get_app_id_version("app_id3", &engine)?, None); // Check versions through ffi handles let version1 = ok_or_panic(unsafe { diff --git a/ffi/src/transaction/write_context.rs b/ffi/src/transaction/write_context.rs index 60ddd14c99..c3d51ae180 100644 --- a/ffi/src/transaction/write_context.rs +++ b/ffi/src/transaction/write_context.rs @@ -5,7 +5,7 @@ use delta_kernel_ffi_macros::handle_descriptor; use std::sync::Arc; -use super::ExclusiveTransaction; +use super::{ExclusiveCreateTransaction, ExclusiveTransaction}; /// A [`WriteContext`] that provides schema and path information needed for writing data. /// This is a shared reference that can be cloned and used across multiple consumers. @@ -14,8 +14,8 @@ use super::ExclusiveTransaction; #[handle_descriptor(target=WriteContext, mutable=false, sized=true)] pub struct SharedWriteContext; -/// Gets the write context from a transaction. The write context provides schema and path information -/// needed for writing data. +/// Gets the write context from a transaction. The write context provides schema and path +/// information needed for writing data. /// /// # Safety /// @@ -28,6 +28,20 @@ pub unsafe extern "C" fn get_write_context( Arc::new(txn.get_write_context()).into() } +/// Gets the write context from a create-table transaction. The write context provides schema +/// and path information needed for writing data. +/// +/// # Safety +/// +/// Caller is responsible for passing a [valid][Handle#Validity] transaction handle. +#[no_mangle] +pub unsafe extern "C" fn create_table_get_write_context( + txn: Handle, +) -> Handle { + let txn = unsafe { txn.as_ref() }; + Arc::new(txn.get_write_context()).into() +} + #[no_mangle] pub unsafe extern "C" fn free_write_context(write_context: Handle) { write_context.drop_handle(); @@ -43,7 +57,7 @@ pub unsafe extern "C" fn get_write_schema( write_context: Handle, ) -> Handle { let write_context = unsafe { write_context.as_ref() }; - write_context.schema().clone().into() + write_context.logical_schema().clone().into() } /// Get write path from WriteContext handle. 
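The `test_empty_sv_creates_filtered_data_selecting_all_rows` test earlier in this diff pins down the `FilteredEngineData` contract that the `remove_files` FFI wrapper relies on: a null/zero-length selection vector is normalized to an empty vec, which means "all rows selected". A minimal sketch of that normalization, assuming the import paths used by these tests (`delta_kernel::engine_data::FilteredEngineData` plus the re-exported arrow types); the helper name `filtered_from_optional_sv` is illustrative only, not part of this change:

```rust
// Sketch: normalizing an optional selection vector before building
// FilteredEngineData, mirroring how the remove_files FFI wrapper treats a
// null/zero-length pointer. Import paths are assumed from the tests above.
use std::sync::Arc;

use delta_kernel::arrow::array::{ArrayRef, Int32Array, RecordBatch};
use delta_kernel::engine::arrow_data::ArrowEngineData;
use delta_kernel::engine_data::FilteredEngineData;
use delta_kernel::{DeltaResult, EngineData};

/// Hypothetical helper: `None` (a null pointer on the FFI side) becomes an
/// empty vec, which FilteredEngineData interprets as "all rows selected".
fn filtered_from_optional_sv(
    data: Box<dyn EngineData>,
    sv: Option<Vec<bool>>,
) -> DeltaResult<FilteredEngineData> {
    FilteredEngineData::try_new(data, sv.unwrap_or_default())
}

fn main() -> DeltaResult<()> {
    let batch = RecordBatch::try_from_iter(vec![(
        "id",
        Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef,
    )])
    .expect("valid record batch");
    let data: Box<dyn EngineData> = Box::new(ArrowEngineData::from(batch));

    // No selection vector: every row stays selected.
    let all = filtered_from_optional_sv(data, None)?;
    assert_eq!(all.selection_vector().len(), 0);
    assert_eq!(all.data().len(), 3);
    Ok(())
}
```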
diff --git a/ffi/tests/read-table-testing/expected-data/all-prim-types-cols.expected b/ffi/tests/read-table-testing/expected-data/all-prim-types-cols.expected new file mode 100644 index 0000000000..ec6b7b51c0 --- /dev/null +++ b/ffi/tests/read-table-testing/expected-data/all-prim-types-cols.expected @@ -0,0 +1,31 @@ +Reading table at ../../../../acceptance/tests/dat/out/reader_tests/generated/all_primitive_types/delta/ +version: 0 + +Schema: +├─ utf8: string +├─ int64: long +├─ int32: integer +├─ int16: short +├─ int8: byte +├─ float32: float +├─ float64: double +├─ bool: boolean +├─ binary: binary +├─ decimal: decimal(5)(3) +├─ date32: date +└─ timestamp: timestamp + +decimal: [ + 10.000, + 11.000, + 12.000, + 13.000, + 14.000 +] +float64: [ + 0, + 1, + 2, + 3, + 4 +] diff --git a/ffi/tests/read-table-testing/expected-data/nested-types-cols.expected b/ffi/tests/read-table-testing/expected-data/nested-types-cols.expected new file mode 100644 index 0000000000..b2af6ea8db --- /dev/null +++ b/ffi/tests/read-table-testing/expected-data/nested-types-cols.expected @@ -0,0 +1,108 @@ +Reading table at ../../../../acceptance/tests/dat/out/reader_tests/generated/nested_types/delta/ +version: 0 + +Schema: +├─ pk: integer +├─ struct: struct +│ ├─ float64: double +│ └─ bool: boolean +├─ array: array (can contain null: true) +│ └─ array_element: short +└─ map: map (can contain null: true) + ├─ map_key: string + └─ map_value: integer + +map: [ + keys: + [] + values: + [], + keys: + [ + "0" + ] + values: + [ + 0 + ], + keys: + [ + "0", + "1" + ] + values: + [ + 0, + 1 + ], + keys: + [ + "0", + "1", + "2" + ] + values: + [ + 0, + 1, + 2 + ], + keys: + [ + "0", + "1", + "2", + "3" + ] + values: + [ + 0, + 1, + 2, + 3 + ] +] +struct: -- is_valid: all not null +-- child 0 type: double + [ + 0, + 1, + 2, + 3, + 4 + ] +-- child 1 type: bool + [ + true, + false, + true, + false, + true + ] +array: [ + [ + 0 + ], + [ + 0, + 1 + ], + [ + 0, + 1, + 2 + ], + [ + 0, + 1, + 2, + 3 + ], + [ + 0, + 1, + 2, + 3, + 4 + ] +] diff --git a/ffi/tests/read-table-testing/expected-data/nested-types.expected b/ffi/tests/read-table-testing/expected-data/nested-types.expected new file mode 100644 index 0000000000..b701bf6092 --- /dev/null +++ b/ffi/tests/read-table-testing/expected-data/nested-types.expected @@ -0,0 +1,115 @@ +Reading table at ../../../../acceptance/tests/dat/out/reader_tests/generated/nested_types/delta/ +version: 0 + +Schema: +├─ pk: integer +├─ struct: struct +│ ├─ float64: double +│ └─ bool: boolean +├─ array: array (can contain null: true) +│ └─ array_element: short +└─ map: map (can contain null: true) + ├─ map_key: string + └─ map_value: integer + +pk: [ + 0, + 1, + 2, + 3, + 4 +] +struct: -- is_valid: all not null +-- child 0 type: double + [ + 0, + 1, + 2, + 3, + 4 + ] +-- child 1 type: bool + [ + true, + false, + true, + false, + true + ] +array: [ + [ + 0 + ], + [ + 0, + 1 + ], + [ + 0, + 1, + 2 + ], + [ + 0, + 1, + 2, + 3 + ], + [ + 0, + 1, + 2, + 3, + 4 + ] +] +map: [ + keys: + [] + values: + [], + keys: + [ + "0" + ] + values: + [ + 0 + ], + keys: + [ + "0", + "1" + ] + values: + [ + 0, + 1 + ], + keys: + [ + "0", + "1", + "2" + ] + values: + [ + 0, + 1, + 2 + ], + keys: + [ + "0", + "1", + "2", + "3" + ] + values: + [ + 0, + 1, + 2, + 3 + ] +] diff --git a/ffi/tests/read-table-testing/run_test.sh b/ffi/tests/read-table-testing/run_test.sh index 970a444b1c..1c86c3f7dc 100755 --- a/ffi/tests/read-table-testing/run_test.sh +++ b/ffi/tests/read-table-testing/run_test.sh @@ -3,7 +3,10 @@ set -euxo 
pipefail OUT_FILE=$(mktemp) -./read_table "$1" | tee "$OUT_FILE" +# if we've passed a third arg, use it as the first arg to ./read_table, otherwise set it to the +# empty string +COLS="${3:-}" +./read_table $COLS "$1" | tee "$OUT_FILE" diff -s "$OUT_FILE" "$2" DIFF_EXIT_CODE=$? echo "Diff exited with $DIFF_EXIT_CODE" diff --git a/ffi/tests/test-delta-kernel-unity-catalog-ffi/run_test.sh b/ffi/tests/test-delta-kernel-unity-catalog-ffi/run_test.sh new file mode 100755 index 0000000000..64113b45e8 --- /dev/null +++ b/ffi/tests/test-delta-kernel-unity-catalog-ffi/run_test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -euxo pipefail + +OUT_FILE=$(mktemp "${TMPDIR:-/tmp}/catalog_test.out.XXXX") +TMP_TABLE_DIR=$(mktemp -d "${TMPDIR:-/tmp}/catalog_test.table.XXXX") +cp -r ../../../../delta-kernel-unity-catalog/tests/data/catalog_managed_0 "$TMP_TABLE_DIR" + +./delta_kernel_unity_catalog_example "$TMP_TABLE_DIR/catalog_managed_0" +CATALOG_EXIT_CODE=$? + +rm "$OUT_FILE" +rm -r "$TMP_TABLE_DIR" + +exit "$CATALOG_EXIT_CODE" + diff --git a/ffi/tests/test-expression-visitor/expected.txt b/ffi/tests/test-expression-visitor/expected.txt index d56f5d8db8..ce11c4beef 100644 --- a/ffi/tests/test-expression-visitor/expected.txt +++ b/ffi/tests/test-expression-visitor/expected.txt @@ -1,3 +1,10 @@ +=== Complex Expression Test === +This test demonstrates the full range of expression types. +Some types are not yet supported in round-trip reconstruction: + - Struct/Array/Map literals (nested data structures) + - Transform expressions (schema evolution operations) + - Opaque expressions (custom user-defined expressions) + StructExpression Column(col) Byte(127) @@ -59,6 +66,8 @@ StructExpression Integer(42) Double(1.111000) Unknown(mystery) + MapToStruct + Column(pv) Divide Integer(0) Integer(0) @@ -125,3 +134,88 @@ And Distinct Integer(0) Integer(0) + +=== Simple Round-trip Test === +This test validates expressions/predicates with full support. +Supported types: primitives (int, long, float, double, bool, string), + temporal (date, timestamp, timestamp_ntz), binary, decimal, null, + binary operations (+, -, *, /), struct expressions, predicates (eq, ne, lt, le, + gt, ge, distinct, is_null, is_not_null, not, and, or) + +StructExpression + Column(simple_col) + Integer(42) + Long(100) + Double(2.500000) + Boolean(1) + Boolean(0) + String(test string) + Date(19000) + Timestamp(1234567890) + TimestampNtz(9876543210) + Add + Integer(10) + Integer(20) + Minus + Integer(50) + Integer(30) + Multiply + Integer(5) + Integer(6) + Divide + Integer(100) + Integer(4) + StructExpression + Integer(1) + Long(2) + Double(3.000000) + MapToStruct + Column(partitionValues) + +=== Expression Round-trip Test === +SUCCESS: Round-trip expression matches original! +And + Column(pred_col) + Boolean(1) + Boolean(0) + Equal + Integer(10) + Integer(10) + Not + Equal + Integer(5) + Integer(10) + LessThan + Integer(5) + Integer(10) + Not + GreaterThan + Integer(10) + Integer(10) + GreaterThan + Integer(20) + Integer(10) + Not + LessThan + Integer(10) + Integer(10) + Distinct + Integer(1) + Integer(2) + IsNull + Column(nullable_col) + Not + IsNull + Column(nonnull_col) + Not + Boolean(0) + Or + Equal + Integer(1) + Integer(1) + Equal + Integer(2) + Integer(2) + +=== Predicate Round-trip Test === +SUCCESS: Round-trip predicate matches original! 
diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml index eb21b37022..a730424858 100644 --- a/kernel/Cargo.toml +++ b/kernel/Cargo.toml @@ -39,9 +39,10 @@ pre-release-hook = [ ] [dependencies] -delta_kernel_derive = { path = "../derive-macros", version = "0.16.0" } +delta_kernel_derive = { path = "../derive-macros", version = "0.20.0" } bytes = "1.10" chrono = "0.4.41" +crc = "3.2.2" indexmap = "2.10.0" itertools = "0.14" roaring = "0.11.2" @@ -57,36 +58,36 @@ z85 = "3.0.6" # optional deps futures = { version = "0.3", optional = true } -# Used for fetching direct urls (like pre-signed urls) -reqwest = { version = "0.12.23", default-features = false, optional = true } +# Used for fetching direct urls (like pre-signed urls). We disable `default-tls` to avoid +# pulling in a second rustls crypto provider that conflicts with transitive deps (see +# feature-tests for a regression test). Each default-engine-* feature selects its TLS backend. +reqwest = { version = "0.13", default-features = false, optional = true, features = ["charset", "http2", "system-proxy"] } # optionally used with default engine (though not required) tokio = { version = "1.47", optional = true, features = ["rt-multi-thread"] } # both arrow versions below are optional and require object_store -object_store = { version = "0.12.3", optional = true, features = ["aws", "azure", "gcp", "http"] } -# TODO: Remove this once https://github.com/apache/arrow-rs/pull/8244 ships -comfy-table = { version = "~7.1", optional = true } +object_store_12 = { package = "object_store", version = "0.12.3", optional = true, features = ["aws", "azure", "gcp", "http"] } -# arrow 55 -[dependencies.arrow_55] +# arrow 56 +[dependencies.arrow_56] package = "arrow" -version = "55" -features = ["chrono-tz", "ffi", "json", "prettyprint"] +version = "56" +features = ["chrono-tz", "ffi", "json"] optional = true -[dependencies.parquet_55] +[dependencies.parquet_56] package = "parquet" -version = "55" +version = "56" features = ["async", "object_store"] optional = true -# arrow 56 -[dependencies.arrow_56] +# arrow 57 +[dependencies.arrow_57] package = "arrow" -version = "56" -features = ["chrono-tz", "ffi", "json", "prettyprint"] +version = "57" +features = ["chrono-tz", "ffi", "json"] optional = true -[dependencies.parquet_56] +[dependencies.parquet_57] package = "parquet" -version = "56" +version = "57" features = ["async", "object_store"] optional = true @@ -95,21 +96,24 @@ optional = true default = [] # internal-api will make everything marked #[internal_api] public internal-api = [] +# prettyprint enables Arrow pretty-print helpers (test/example oriented). 
+prettyprint = ["arrow_56?/prettyprint", "arrow_57?/prettyprint"] +# test-utils exposes test-only constructors for downstream crate tests +test-utils = ["prettyprint"] # integration-test turns on a particularly heavy test for hdfs-object-store integration-test = ["hdfs-native-object-store/integration-test"] # The default versions for arrow/parquet/object_store -arrow = ["arrow-56"] # latest arrow version +arrow = ["arrow-57"] # latest arrow version need-arrow = [] # need-arrow is a marker that the feature needs arrow dep -arrow-55 = ["dep:arrow_55", "dep:parquet_55", "object_store", "comfy-table"] -arrow-56 = ["dep:arrow_56", "dep:parquet_56", "object_store", "comfy-table"] +arrow-56 = ["dep:arrow_56", "dep:parquet_56", "dep:object_store_12"] +arrow-57 = ["dep:arrow_57", "dep:parquet_57", "dep:object_store_12"] arrow-conversion = ["need-arrow"] arrow-expression = ["need-arrow"] -# WARNING: experimental feature, still under active development -# enables new experimental catalog-managed tables support -catalog-managed = [] +# Schema diffing functionality (experimental) +schema-diff = [] # this is an 'internal' feature flag which has all the shared bits from default-engine and # default-engine-rustls @@ -120,21 +124,19 @@ default-engine-base = [ "need-arrow", "tokio", ] -# the default-engine-native-tls use the reqwest crate with default features which uses native-tls. if you want -# to instead use rustls, use 'default-engine-rustls' which has no native-tls dependency -default-engine-native-tls = ["default-engine-base", "reqwest/default"] -default-engine-rustls = [ - "default-engine-base", - "reqwest/rustls-tls-native-roots", - "reqwest/http2", -] +# default-engine-native-tls enables reqwest with the native-tls backend. If you want to use +# rustls instead, use 'default-engine-rustls' which has no native-tls dependency. +default-engine-native-tls = ["default-engine-base", "reqwest/native-tls"] +default-engine-rustls = ["default-engine-base", "reqwest/rustls"] [build-dependencies] rustc_version = "0.4.1" [dev-dependencies] -delta_kernel = { path = ".", features = ["arrow", "catalog-managed", "default-engine-rustls", "internal-api"] } -test_utils = { path = "../test-utils" } +delta_kernel = { path = ".", features = ["test-utils"] } +# NOTE: test_utils forcibly pulls the default engine and arrow-56. By default it ALSO pulls +# arrow-57, which takes precedence and would make it impossible to test against arrow-56. +test_utils = { path = "../test-utils", default-features = false } criterion = "0.5" # Used for testing parse_url_opts extensibility hdfs-native-object-store = { version = "0.15.0" } @@ -148,7 +150,7 @@ tracing-subscriber = { version = "0.3", default-features = false, features = [ "env-filter", "fmt", ] } - +rstest = "0.23" [[bench]] name = "metadata_bench" harness = false diff --git a/kernel/benches/metadata_bench.rs b/kernel/benches/metadata_bench.rs index 86168baf1c..db974af0f7 100644 --- a/kernel/benches/metadata_bench.rs +++ b/kernel/benches/metadata_bench.rs @@ -17,11 +17,10 @@ //! //! 
Follow-ups: -use std::collections::HashMap; use std::sync::Arc; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::default::{DefaultEngine, DefaultEngineBuilder}; use delta_kernel::snapshot::Snapshot; use delta_kernel::try_parse_uri; @@ -41,9 +40,9 @@ fn setup() -> (TempDir, Url, Arc>) { let table_path = tempdir.path().join(table); let url = try_parse_uri(table_path.to_str().unwrap()).expect("Failed to parse table path"); // TODO: use multi-threaded executor - let executor = Arc::new(TokioBackgroundExecutor::new()); - let engine = DefaultEngine::try_new(&url, HashMap::::new(), executor) - .expect("Failed to create engine"); + use delta_kernel::engine::default::storage::store_from_url; + let store = store_from_url(&url).expect("Failed to create store"); + let engine = DefaultEngineBuilder::new(store).build(); (tempdir, url, Arc::new(engine)) } diff --git a/kernel/examples/checkpoint-table/Cargo.toml b/kernel/examples/checkpoint-table/Cargo.toml new file mode 100644 index 0000000000..0702616a37 --- /dev/null +++ b/kernel/examples/checkpoint-table/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "checkpoint-table" +version = "0.1.0" +edition = "2021" +publish = false + +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["common/arrow-57"] +arrow-56 = ["common/arrow-56"] + +[dependencies] +clap = { version = "4.5", features = ["derive"] } +chrono = { version = "0.4", features = ["serde"] } +common = { path = "../common", default-features = false } +delta_kernel = { path = "../../../kernel", features = [ + "default-engine-rustls", + "internal-api", +] } +env_logger = "0.11.8" +tokio = { version = "1.0", features = ["full"] } +url = "2" +bytes = "1.11.0" +futures = "0.3.31" + +# for cargo-release +[package.metadata.release] +release = false diff --git a/kernel/examples/checkpoint-table/README.md b/kernel/examples/checkpoint-table/README.md new file mode 100644 index 0000000000..a2cb3605cd --- /dev/null +++ b/kernel/examples/checkpoint-table/README.md @@ -0,0 +1,35 @@ +Checkpoint Table +=========== + +# About + +This example shows how to checkpoint a Delta table. + +You can run this example from anywhere in this repository by running `cargo run -p checkpoint-table -- [args]` or by navigating to this directory and running `cargo run -- [args]`. + +!!!WARNING!!!: This doesn't use put-if-absent, or a catalog based commit, so it is UNSAFE. As such + you need to pass --unsafe-i-know-what-im-doing as an argument to get this to actually write the + checkpoint, otherwise it will just do all the work it _would_ have done, but not actually write the + final checkpoint. 
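For orientation, here is a condensed, illustrative sketch of the dry-run flow described above (see `src/main.rs` in this example for the full, runnable version). The `dry_run` name is made up, and the engine argument is assumed to be accepted as `&dyn Engine`, as elsewhere in the kernel API:

```rust
// Condensed sketch of the dry run: stream the checkpoint data through without
// writing anything, so you can see the path and per-batch sizes.
use delta_kernel::engine::arrow_data::EngineDataArrowExt;
use delta_kernel::{DeltaResult, Engine, Snapshot};
use url::Url;

fn dry_run(engine: &dyn Engine, url: Url) -> DeltaResult<()> {
    let snapshot = Snapshot::builder_for(url).build(engine)?;
    let writer = snapshot.create_checkpoint_writer()?;
    println!("would write checkpoint at: {}", writer.checkpoint_path()?);
    for data in writer.checkpoint_data(engine)? {
        // Unselected rows must never end up in a checkpoint, so apply the
        // selection vector before handing the batch to a Parquet writer.
        let batch = data?.apply_selection_vector()?.try_into_record_batch()?;
        println!("batch with {} reconciled actions", batch.num_rows());
    }
    // A real checkpoint write would finish by calling finalize() on the writer.
    Ok(())
}
```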
+ +# Examples + +Assuming you're running in the directory of this example: + +- checkpoint the table at "/tmp/my_table" + +```bash +cargo run -- --unsafe-i-know-what-im-doing /tmp/my_table +``` + +- Just see that the checkpoint would work and how large it would be: + +```bash +cargo run -- /tmp/my_table +``` + +- Get usage info: + +```bash +cargo run -- --help +``` diff --git a/kernel/examples/checkpoint-table/src/main.rs b/kernel/examples/checkpoint-table/src/main.rs new file mode 100644 index 0000000000..a846f1f853 --- /dev/null +++ b/kernel/examples/checkpoint-table/src/main.rs @@ -0,0 +1,138 @@ +use std::process::ExitCode; +use std::sync::Arc; + +use clap::Parser; +use common::{LocationArgs, ParseWithExamples}; +use delta_kernel::arrow::array::RecordBatch; +use delta_kernel::parquet::arrow::async_writer::AsyncFileWriter; +use delta_kernel::parquet::arrow::AsyncArrowWriter; +use delta_kernel::parquet::errors::Result as ParquetResult; +use futures::future::{BoxFuture, FutureExt}; + +use delta_kernel::engine::arrow_data::EngineDataArrowExt; +use delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor; +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::{ActionReconciliationIterator, DeltaResult, Error, Snapshot}; + +/// An example program that checkpoints a table. +/// !!!WARNING!!!: This doesn't use put-if-absent, or a catalog based commit, so it is UNSAFE. +/// As such you need to pass --unsafe_i_know_what_im_doing as an argument to get this to actually +/// write the checkpoint, otherwise it will just do all the work it _would_ have done, but not +/// actually write the final checkpoint. +#[derive(Parser)] +#[command(author, version, about, verbatim_doc_comment)] +#[command(propagate_version = true)] +struct Cli { + #[command(flatten)] + location_args: LocationArgs, + + /// This program doesn't use put-if-absent, or a catalog based commit, so it is UNSAFE. 
As such + /// you need to pass --unsafe-i-know-what-im-doing as an argument to get this to actually write + /// the checkpoint + #[arg(long)] + unsafe_i_know_what_im_doing: bool, +} + +#[tokio::main] +async fn main() -> ExitCode { + env_logger::init(); + match try_main().await { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { + println!("{e:#?}"); + ExitCode::FAILURE + } + } +} + +async fn write_data( + first_batch: &RecordBatch, + batch_iter: &mut ActionReconciliationIterator, + parquet_writer: &mut AsyncArrowWriter, +) -> DeltaResult<()> { + parquet_writer.write(first_batch).await?; + for data_res in batch_iter { + let data = data_res?.apply_selection_vector()?; + let batch = data.try_into_record_batch()?; + parquet_writer.write(&batch).await?; + } + Ok(()) +} + +async fn try_main() -> DeltaResult<()> { + let cli = Cli::parse_with_examples(env!("CARGO_PKG_NAME"), "Write", "write", ""); + + let url = delta_kernel::try_parse_uri(&cli.location_args.path)?; + println!("Checkpointing Delta table at: {url}"); + + use delta_kernel::engine::default::storage::store_from_url; + let store = store_from_url(&url)?; + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(store) + .with_task_executor(executor) + .build(); + let snapshot = Snapshot::builder_for(url).build(&engine)?; + + if cli.unsafe_i_know_what_im_doing { + snapshot.checkpoint(&engine)?; + println!("Table checkpointed"); + } else { + // first we create a checkpoint writer + let writer = snapshot.create_checkpoint_writer()?; + + // this tells us the path where we should write the checkpoint file + let checkpoint_path = writer.checkpoint_path()?; + // this gives us a iterator of `FilteredEngineData` that needs to be written to the file + let mut data_iter = writer.checkpoint_data(&engine)?; + + let batch_iter = data_iter.by_ref(); + // we'll use the first batch to determine the schema + let first = batch_iter.next(); + let Some(first) = first else { + return Err(Error::generic("No batches in checkpoint data")); + }; + // Note that with `FilteredEngineData` it's important to `apply_selection_vector` to remove any + // filtered out rows. It's also possible to use `into_parts` to get the unfiltered batch and the + // selection vector individually, such that an engine could write only the selected rows out + // without having to allocate a new engine data. + // NB: Unselected rows MUST NOT be written to the checkpoint! Doing so will create an invalid + // checkpoint + let first_data = first?.apply_selection_vector()?; + let first_batch = first_data.try_into_record_batch()?; + + println!("--unsafe-i-know-what-im-doing not specified, just doing a dry run"); + // this block just writes the checkpoint to a blackhole + let mut parquet_writer = + AsyncArrowWriter::try_new(BlackholeWriter::default(), first_batch.schema(), None)?; + write_data(&first_batch, batch_iter, &mut parquet_writer).await?; + parquet_writer.finish().await?; + let blackhole_writer = parquet_writer.into_inner(); + println!( + "Would have written a checkpoint as:\n\tpath: {checkpoint_path}\n\tsize: {}", + blackhole_writer.len + ); + // in this example we don't call `finalize` because we don't want to actually write + // anything, but if really checkpointing, it's important to call finalize as we do above + } + Ok(()) +} + +/// Simple struct to allow us to go through the motions of writing the data without actually writing +/// it anywhere. Verifies that the actual flow of data does work. 
+#[derive(Default)] +pub struct BlackholeWriter { + len: u64, +} + +impl AsyncFileWriter for BlackholeWriter { + fn write(&mut self, bs: bytes::Bytes) -> BoxFuture<'_, ParquetResult<()>> { + self.len += bs.len() as u64; + async move { Ok(()) }.boxed() + } + + fn complete(&mut self) -> BoxFuture<'_, ParquetResult<()>> { + async move { Ok(()) }.boxed() + } +} diff --git a/kernel/examples/common/Cargo.toml b/kernel/examples/common/Cargo.toml index 2a6baae2d5..fad44854a1 100644 --- a/kernel/examples/common/Cargo.toml +++ b/kernel/examples/common/Cargo.toml @@ -10,12 +10,16 @@ version.workspace = true [package.metadata.release] release = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] + [dependencies] clap = { version = "4.5", features = ["derive"] } delta_kernel = { path = "../../../kernel", features = [ - "arrow", "default-engine-rustls", "internal-api", ] } -object_store = { version = "0.12.3", features = ["aws", "azure", "gcp", "http"] } url = "2" diff --git a/kernel/examples/common/src/lib.rs b/kernel/examples/common/src/lib.rs index 7023f26322..c847489cf7 100644 --- a/kernel/examples/common/src/lib.rs +++ b/kernel/examples/common/src/lib.rs @@ -4,14 +4,13 @@ use std::{collections::HashMap, sync::Arc}; use clap::{Args, CommandFactory, FromArgMatches}; use delta_kernel::{ - arrow::array::RecordBatch, - engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}, - scan::Scan, - schema::MetadataColumnSpec, - DeltaResult, SnapshotRef, + arrow::array::RecordBatch, engine::default::executor::tokio::TokioBackgroundExecutor, + engine::default::storage::store_from_url_opts, engine::default::DefaultEngine, + engine::default::DefaultEngineBuilder, scan::Scan, schema::MetadataColumnSpec, DeltaResult, + SnapshotRef, }; -use object_store::{ +use delta_kernel::object_store::{ aws::AmazonS3Builder, azure::MicrosoftAzureBuilder, gcp::GoogleCloudStorageBuilder, DynObjectStore, ObjectStoreScheme, }; @@ -130,7 +129,7 @@ pub fn get_engine( ) -> DeltaResult> { if args.env_creds { let (scheme, _path) = ObjectStoreScheme::parse(url).map_err(|e| { - delta_kernel::Error::Generic(format!("Object store could not parse url: {}", e)) + delta_kernel::Error::Generic(format!("Object store could not parse url: {e}")) })?; use ObjectStoreScheme::*; let url_str = url.to_string(); @@ -158,16 +157,13 @@ pub fn get_engine( ))); } }; - Ok(DefaultEngine::new( - store, - Arc::new(TokioBackgroundExecutor::new()), - )) + Ok(DefaultEngineBuilder::new(Arc::new(store)).build()) } else if !args.option.is_empty() { let opts = args.option.iter().map(|option| { let parts: Vec<&str> = option.split("=").collect(); (parts[0].to_ascii_lowercase(), parts[1]) }); - DefaultEngine::try_new(url, opts, Arc::new(TokioBackgroundExecutor::new())) + Ok(DefaultEngineBuilder::new(store_from_url_opts(url, opts)?).build()) } else { let mut options = if let Some(ref region) = args.region { HashMap::from([("region", region.clone())]) @@ -177,7 +173,7 @@ pub fn get_engine( if args.public { options.insert("skip_signature", "true".to_string()); } - DefaultEngine::try_new(url, options, Arc::new(TokioBackgroundExecutor::new())) + Ok(DefaultEngineBuilder::new(store_from_url_opts(url, options)?).build()) } } diff --git a/kernel/examples/inspect-table/Cargo.toml b/kernel/examples/inspect-table/Cargo.toml index 545baf9119..54461bf47a 100644 --- a/kernel/examples/inspect-table/Cargo.toml +++ b/kernel/examples/inspect-table/Cargo.toml @@ -4,11 +4,17 @@ version = 
"0.1.0" edition = "2021" publish = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["common/arrow-57"] +arrow-56 = ["common/arrow-56"] + [dependencies] +chrono = "0.4.42" clap = { version = "4.5", features = ["derive"] } -common = { path = "../common" } +common = { path = "../common", default-features = false } delta_kernel = { path = "../../../kernel", features = [ - "arrow", "default-engine-rustls", "internal-api", ] } diff --git a/kernel/examples/inspect-table/src/main.rs b/kernel/examples/inspect-table/src/main.rs index a3f4709dea..ba1552ae19 100644 --- a/kernel/examples/inspect-table/src/main.rs +++ b/kernel/examples/inspect-table/src/main.rs @@ -1,18 +1,18 @@ use common::{LocationArgs, ParseWithExamples}; use delta_kernel::actions::visitors::{ visit_metadata_at, visit_protocol_at, AddVisitor, CdcVisitor, RemoveVisitor, - SetTransactionVisitor, + SetTransactionVisitor, SidecarVisitor, }; use delta_kernel::actions::{ - get_commit_schema, ADD_NAME, CDC_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, - SET_TRANSACTION_NAME, + get_all_actions_schema, ADD_NAME, CDC_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, + SET_TRANSACTION_NAME, SIDECAR_NAME, }; use delta_kernel::engine_data::{GetData, RowVisitor, TypedGetData as _}; use delta_kernel::expressions::ColumnName; -use delta_kernel::scan::state::{DvInfo, Stats}; +use delta_kernel::scan::state::ScanFile; use delta_kernel::scan::ScanBuilder; use delta_kernel::schema::{ColumnNamesAndTypes, DataType}; -use delta_kernel::{DeltaResult, Error, ExpressionRef, Snapshot}; +use delta_kernel::{DeltaResult, Error, Snapshot}; use std::collections::HashMap; use std::process::ExitCode; @@ -67,10 +67,11 @@ enum Action { Add(delta_kernel::actions::Add), SetTransaction(delta_kernel::actions::SetTransaction), Cdc(delta_kernel::actions::Cdc), + Sidecar(delta_kernel::actions::Sidecar), } static NAMES_AND_TYPES: LazyLock = - LazyLock::new(|| get_commit_schema().leaves(None)); + LazyLock::new(|| get_all_actions_schema().leaves(None)); struct LogVisitor { actions: Vec<(Action, usize)>, @@ -117,6 +118,7 @@ impl RowVisitor for LogVisitor { let (metadata_start, metadata_end) = self.offsets[METADATA_NAME]; let (protocol_start, protocol_end) = self.offsets[PROTOCOL_NAME]; let (txn_start, txn_end) = self.offsets[SET_TRANSACTION_NAME]; + let (sidecar_start, sidecar_end) = self.offsets[SIDECAR_NAME]; let (cdc_start, cdc_end) = self.offsets[CDC_NAME]; for i in 0..row_count { let action = if let Some(path) = getters[add_start].get_opt(i, "add.path")? { @@ -138,6 +140,10 @@ impl RowVisitor for LogVisitor { let txn = SetTransactionVisitor::visit_txn(i, app_id, &getters[txn_start..txn_end])?; Action::SetTransaction(txn) + } else if let Some(path) = getters[sidecar_start].get_opt(i, "sidecar.path")? { + let sidecar = + SidecarVisitor::visit_sidecar(i, path, &getters[sidecar_start..sidecar_end])?; + Action::Sidecar(sidecar) } else if let Some(path) = getters[cdc_start].get_opt(i, "cdc.path")? 
{ let cdc = CdcVisitor::visit_cdc(i, path, &getters[cdc_start..cdc_end])?; Action::Cdc(cdc) @@ -153,29 +159,30 @@ impl RowVisitor for LogVisitor { } // This is the callback that will be called for each valid scan row -fn print_scan_file( - _: &mut (), - path: &str, - size: i64, - stats: Option, - dv_info: DvInfo, - transform: Option, - partition_values: HashMap, -) { - let num_record_str = if let Some(s) = stats { +fn print_scan_file(_: &mut (), file: ScanFile) { + let num_record_str = if let Some(s) = file.stats { format!("{}", s.num_records) } else { "[unknown]".to_string() }; + let mod_str = match chrono::DateTime::from_timestamp(file.modification_time / 1000, 0) { + Some(dt) => format!("{} ({})", dt, file.modification_time), + None => format!("Invalid mod time: {}", file.modification_time), + }; println!( "Data to process:\n \ - Path:\t\t{path}\n \ - Size (bytes):\t{size}\n \ + Path:\t\t{0}\n \ + Size (bytes):\t{1}\n \ + Mod Time:\t{mod_str}\n \ Num Records:\t{num_record_str}\n \ - Has DV?:\t{}\n \ - Transform:\t{transform:?}\n \ - Part Vals:\t{partition_values:?}", - dv_info.has_vector() + Has DV?:\t{2}\n \ + Transform:\t{3:?}\n \ + Part Vals:\t{4:?}", + file.path, + file.size, + file.dv_info.has_vector(), + file.transform, + file.partition_values, ); } @@ -205,11 +212,10 @@ fn try_main() -> DeltaResult<()> { } } Commands::Actions { oldest_first } => { - let commit_schema = get_commit_schema(); - let actions = - snapshot - .log_segment() - .read_actions(&engine, commit_schema.clone(), None)?; + let actions_schema = get_all_actions_schema(); + let actions = snapshot + .log_segment() + .read_actions(&engine, actions_schema.clone())?; let mut visitor = LogVisitor::new(); for action in actions { @@ -227,6 +233,7 @@ fn try_main() -> DeltaResult<()> { Action::Add(a) => println!("\nAction {row}:\n{a:#?}"), Action::SetTransaction(t) => println!("\nAction {row}:\n{t:#?}"), Action::Cdc(c) => println!("\nAction {row}:\n{c:#?}"), + Action::Sidecar(s) => println!("\nAction {row}:\n{s:#?}"), } } } diff --git a/kernel/examples/read-table-changes/Cargo.toml b/kernel/examples/read-table-changes/Cargo.toml index 17c57ef574..394d998f3d 100644 --- a/kernel/examples/read-table-changes/Cargo.toml +++ b/kernel/examples/read-table-changes/Cargo.toml @@ -8,12 +8,18 @@ publish = false [package.metadata.release] release = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["common/arrow-57"] +arrow-56 = ["common/arrow-56"] + [dependencies] clap = { version = "4.5", features = ["derive"] } -common = { path = "../common" } +common = { path = "../common", default-features = false } delta_kernel = { path = "../../../kernel", features = [ - "arrow", "default-engine-rustls", "internal-api", + "prettyprint", ] } itertools = "0.14" diff --git a/kernel/examples/read-table-changes/src/main.rs b/kernel/examples/read-table-changes/src/main.rs index db5e7a7be1..cf303e09f1 100644 --- a/kernel/examples/read-table-changes/src/main.rs +++ b/kernel/examples/read-table-changes/src/main.rs @@ -3,8 +3,8 @@ use std::sync::Arc; use clap::Parser; use common::{LocationArgs, ParseWithExamples}; use delta_kernel::arrow::array::RecordBatch; -use delta_kernel::arrow::{compute::filter_record_batch, util::pretty::print_batches}; -use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::arrow::util::pretty::print_batches; +use delta_kernel::engine::arrow_data::EngineDataArrowExt; use delta_kernel::table_changes::TableChanges; use delta_kernel::DeltaResult; use itertools::Itertools; @@ -38,21 +38,7 @@ fn 
main() -> DeltaResult<()> { let table_changes_scan = table_changes.into_scan_builder().build()?; let batches: Vec = table_changes_scan .execute(Arc::new(engine))? - .map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch: RecordBatch = data - .into_any() - .downcast::() - .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? - .into(); - if let Some(mask) = mask { - Ok(filter_record_batch(&record_batch, &mask.into())?) - } else { - Ok(record_batch) - } - }) + .map(EngineDataArrowExt::try_into_record_batch) .try_collect()?; print_batches(&batches)?; Ok(()) diff --git a/kernel/examples/read-table-multi-threaded/Cargo.toml b/kernel/examples/read-table-multi-threaded/Cargo.toml index 17253b98dc..d5e12399eb 100644 --- a/kernel/examples/read-table-multi-threaded/Cargo.toml +++ b/kernel/examples/read-table-multi-threaded/Cargo.toml @@ -4,14 +4,19 @@ version = "0.1.0" edition = "2021" publish = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["common/arrow-57"] +arrow-56 = ["common/arrow-56"] + [dependencies] -arrow = { version = "56", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } -common = { path = "../common" } +common = { path = "../common", default-features = false } delta_kernel = { path = "../../../kernel", features = [ - "arrow-56", "default-engine-rustls", "internal-api", + "prettyprint", ] } env_logger = "0.11.8" spmc = "0.3.0" diff --git a/kernel/examples/read-table-multi-threaded/src/main.rs b/kernel/examples/read-table-multi-threaded/src/main.rs index 3effd89946..1ff0c42b85 100644 --- a/kernel/examples/read-table-multi-threaded/src/main.rs +++ b/kernel/examples/read-table-multi-threaded/src/main.rs @@ -1,18 +1,17 @@ -use std::collections::HashMap; use std::process::ExitCode; use std::sync::mpsc::Sender; use std::sync::{mpsc, Arc}; use std::thread; -use arrow::compute::filter_record_batch; -use arrow::record_batch::RecordBatch; -use arrow::util::pretty::print_batches; use common::{LocationArgs, ParseWithExamples, ScanArgs}; use delta_kernel::actions::deletion_vector::split_vector; -use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::scan::state::{transform_to_logical, DvInfo, Stats}; +use delta_kernel::arrow::compute::filter_record_batch; +use delta_kernel::arrow::record_batch::RecordBatch; +use delta_kernel::arrow::util::pretty::print_batches; +use delta_kernel::engine::arrow_data::EngineDataArrowExt as _; +use delta_kernel::scan::state::{transform_to_logical, DvInfo, ScanFile}; use delta_kernel::schema::SchemaRef; -use delta_kernel::{DeltaResult, Engine, EngineData, ExpressionRef, FileMeta, Snapshot}; +use delta_kernel::{DeltaResult, Engine, ExpressionRef, FileMeta, Snapshot}; use clap::Parser; use url::Url; @@ -52,37 +51,20 @@ fn main() -> ExitCode { // the way we as a connector represent data to scan. this is computed from the raw data returned // from the scan, and could be any format the engine chooses to use to facilitate distributing work. -struct ScanFile { +struct FileToScan { path: String, size: i64, transform: Option, dv_info: DvInfo, } -// we know we're using arrow under the hood, so cast an EngineData into something we can work with -fn to_arrow(data: Box) -> DeltaResult { - Ok(data - .into_any() - .downcast::() - .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? 
- .into()) -} - // This is the callback that will be called for each valid scan row -fn send_scan_file( - scan_tx: &mut spmc::Sender, - path: &str, - size: i64, - _stats: Option, - dv_info: DvInfo, - transform: Option, - _: HashMap, -) { - let scan_file = ScanFile { - path: path.to_string(), - size, - transform, - dv_info, +fn send_scan_file(scan_tx: &mut spmc::Sender, scan_file: ScanFile) { + let scan_file = FileToScan { + path: scan_file.path.to_string(), + size: scan_file.size, + transform: scan_file.transform, + dv_info: scan_file.dv_info, }; scan_tx.send(scan_file).unwrap(); } @@ -183,7 +165,7 @@ fn do_work( engine: &dyn Engine, scan_state: Arc, record_batch_tx: Sender, - scan_file_rx: spmc::Receiver, + scan_file_rx: spmc::Receiver, ) { // in a loop, try and get a ScanFile. Note that `recv` will return an `Err` when the other side // hangs up, which indicates there's no more data to process. @@ -231,7 +213,7 @@ fn do_work( ) .unwrap(); - let record_batch = to_arrow(logical).unwrap(); + let record_batch = logical.try_into_record_batch().unwrap(); // need to split the dv_mask. what's left in dv_mask covers this result, and rest // will cover the following results diff --git a/kernel/examples/read-table-single-threaded/Cargo.toml b/kernel/examples/read-table-single-threaded/Cargo.toml index d13457a5a1..0dda1128f0 100644 --- a/kernel/examples/read-table-single-threaded/Cargo.toml +++ b/kernel/examples/read-table-single-threaded/Cargo.toml @@ -4,14 +4,19 @@ version = "0.1.0" edition = "2021" publish = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["common/arrow-57"] +arrow-56 = ["common/arrow-56"] + [dependencies] -arrow = { version = "56", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } -common = { path = "../common" } +common = { path = "../common", default-features = false } delta_kernel = { path = "../../../kernel", features = [ - "arrow-56", "default-engine-rustls", "internal-api", + "prettyprint", ] } env_logger = "0.11.8" itertools = "0.14" diff --git a/kernel/examples/read-table-single-threaded/src/main.rs b/kernel/examples/read-table-single-threaded/src/main.rs index caa5dccf70..207c792361 100644 --- a/kernel/examples/read-table-single-threaded/src/main.rs +++ b/kernel/examples/read-table-single-threaded/src/main.rs @@ -1,11 +1,10 @@ use std::process::ExitCode; use std::sync::Arc; -use arrow::compute::filter_record_batch; -use arrow::record_batch::RecordBatch; -use arrow::util::pretty::print_batches; use common::{LocationArgs, ParseWithExamples, ScanArgs}; -use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::arrow::record_batch::RecordBatch; +use delta_kernel::arrow::util::pretty::print_batches; +use delta_kernel::engine::arrow_data::EngineDataArrowExt; use delta_kernel::{DeltaResult, Snapshot}; use clap::Parser; @@ -50,22 +49,7 @@ fn try_main() -> DeltaResult<()> { let mut rows_so_far = 0; let batches: Vec = scan .execute(Arc::new(engine))? - .map(|scan_result| -> DeltaResult<_> { - // extract the batches and filter them if they have deletion vectors - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch: RecordBatch = data - .into_any() - .downcast::() - .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? - .into(); - if let Some(mask) = mask { - Ok(filter_record_batch(&record_batch, &mask.into())?) 
- } else { - Ok(record_batch) - } - }) + .map(EngineDataArrowExt::try_into_record_batch) .scan(&mut rows_so_far, |rows_so_far, record_batch| { // handle truncation if we've specified a limit let Ok(batch) = record_batch else { diff --git a/kernel/examples/write-table/Cargo.toml b/kernel/examples/write-table/Cargo.toml index 3291944f63..883d0b20e5 100644 --- a/kernel/examples/write-table/Cargo.toml +++ b/kernel/examples/write-table/Cargo.toml @@ -4,14 +4,19 @@ version = "0.1.0" edition = "2021" publish = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["common/arrow-57"] +arrow-56 = ["common/arrow-56"] + [dependencies] -arrow = { version = "56", features = ["prettyprint", "chrono-tz"] } clap = { version = "4.5", features = ["derive"] } -common = { path = "../common" } +common = { path = "../common", default-features = false } delta_kernel = { path = "../../../kernel", features = [ - "arrow-56", "default-engine-rustls", "internal-api", + "prettyprint", ] } env_logger = "0.11.8" itertools = "0.14" diff --git a/kernel/examples/write-table/src/main.rs b/kernel/examples/write-table/src/main.rs index a1335be7ab..e29786b207 100644 --- a/kernel/examples/write-table/src/main.rs +++ b/kernel/examples/write-table/src/main.rs @@ -1,25 +1,26 @@ use std::collections::HashMap; -use std::fs::{create_dir_all, write}; +use std::fs::create_dir_all; use std::path::Path; use std::process::ExitCode; use std::sync::Arc; -use arrow::array::{BooleanArray, Float64Array, Int32Array, RecordBatch, StringArray}; -use arrow::util::pretty::print_batches; use clap::Parser; use common::{LocationArgs, ParseWithExamples}; +use delta_kernel::arrow::array::{ + ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, RecordBatch, StringArray, +}; +use delta_kernel::arrow::util::pretty::print_batches; use itertools::Itertools; -use serde_json::{json, to_vec}; use url::Url; -use uuid::Uuid; use delta_kernel::arrow::array::TimestampMicrosecondArray; use delta_kernel::committer::FileSystemCommitter; use delta_kernel::engine::arrow_conversion::TryIntoArrow; -use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::engine::arrow_data::{ArrowEngineData, EngineDataArrowExt}; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::default::{DefaultEngine, DefaultEngineBuilder}; use delta_kernel::schema::{DataType, SchemaRef, StructField, StructType}; +use delta_kernel::transaction::create_table::create_table as create_delta_table; use delta_kernel::transaction::{CommitResult, RetryableTransaction}; use delta_kernel::{DeltaResult, Engine, Error, Snapshot, SnapshotRef}; @@ -76,11 +77,8 @@ async fn try_main() -> DeltaResult<()> { println!("Using Delta table at: {url}"); // Get the engine for local filesystem - let engine = DefaultEngine::try_new( - &url, - HashMap::::new(), - Arc::new(TokioBackgroundExecutor::new()), - )?; + use delta_kernel::engine::default::storage::store_from_url; + let engine = DefaultEngineBuilder::new(store_from_url(&url)?).build(); // Create or get the table let snapshot = create_or_get_base_snapshot(&url, &engine, &cli.schema).await?; @@ -91,7 +89,7 @@ async fn try_main() -> DeltaResult<()> { // Write sample data to the table let committer = Box::new(FileSystemCommitter::new()); let mut txn = snapshot - .transaction(committer)? + .transaction(committer, &engine)? 
.with_operation("INSERT".to_string()) .with_engine_info("default_engine/write-table-example") .with_data_change(true); @@ -155,7 +153,7 @@ async fn create_or_get_base_snapshot( // Create new table println!("Creating new Delta table..."); let schema = parse_schema(schema_str)?; - create_table(url, &schema).await?; + create_table(url, &schema, engine).await?; Snapshot::builder_for(url.clone()).build(engine) } } @@ -195,66 +193,13 @@ fn parse_schema(schema_str: &str) -> DeltaResult { Ok(Arc::new(StructType::try_new(fields)?)) } -/// Create a new Delta table with the given schema. -/// -/// Creating a Delta table is not officially supported by kernel-rs yet, so we manually create the -/// initial transaction log. -async fn create_table(table_url: &Url, schema: &SchemaRef) -> DeltaResult<()> { - let table_id = Uuid::new_v4().to_string(); - let schema_str = serde_json::to_string(&schema)?; - - let (reader_features, writer_features) = { - let reader_features: Vec<&'static str> = vec![]; - let writer_features: Vec<&'static str> = vec![]; - - // TODO: Support adding specific table features - (reader_features, writer_features) - }; - - let protocol = json!({ - "protocol": { - "minReaderVersion": 3, - "minWriterVersion": 7, - "readerFeatures": reader_features, - "writerFeatures": writer_features, - } - }); - let partition_columns: Vec = vec![]; - let metadata = json!({ - "metaData": { - "id": table_id, - "format": { - "provider": "parquet", - "options": {} - }, - "schemaString": schema_str, - "partitionColumns": partition_columns, - "configuration": {}, - "createdTime": 1677811175819u64 - } - }); - - let data = [ - to_vec(&protocol).unwrap(), - b"\n".to_vec(), - to_vec(&metadata).unwrap(), - ] - .concat(); - - // Write the initial transaction with protocol and metadata to 0.json - let delta_log_path = table_url - .join("_delta_log/")? - .to_file_path() - .map_err(|_e| Error::generic("URL cannot be converted to local file path"))?; - let file_path = delta_log_path.join("00000000000000000000.json"); - - // Create the _delta_log directory if it doesn't exist - create_dir_all(&delta_log_path) - .map_err(|e| Error::generic(format!("Failed to create _delta_log directory: {e}")))?; - - // Write the file using standard filesystem operations - write(&file_path, data) - .map_err(|e| Error::generic(format!("Failed to write initial transaction log: {e}")))?; +/// Create a new Delta table with the given schema using the official CreateTable API. +async fn create_table(table_url: &Url, schema: &SchemaRef, engine: &dyn Engine) -> DeltaResult<()> { + // Use the create_table API to create the table + let table_path = table_url.as_str(); + let _result = create_delta_table(table_path, schema.clone(), "write-table-example/1.0") + .build(engine, Box::new(FileSystemCommitter::new()))? 
+ .commit(engine)?; println!("✓ Created Delta table with schema: {schema:#?}"); Ok(()) @@ -266,7 +211,7 @@ fn create_sample_data(schema: &SchemaRef, num_rows: usize) -> DeltaResult = match *field.data_type() { + let column: ArrayRef = match *field.data_type() { DataType::STRING => { let data: Vec = (0..num_rows).map(|i| format!("item_{i}")).collect(); Arc::new(StringArray::from(data)) @@ -277,7 +222,7 @@ fn create_sample_data(schema: &SchemaRef, num_rows: usize) -> DeltaResult { let data: Vec = (0..num_rows).map(|i| i as i64).collect(); - Arc::new(arrow::array::Int64Array::from(data)) + Arc::new(Int64Array::from(data)) } DataType::DOUBLE => { let data: Vec = (0..num_rows).map(|i| i as f64 * 1.5).collect(); @@ -320,25 +265,7 @@ async fn read_and_display_data( let batches: Vec = scan .execute(Arc::new(engine))? - .map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch: RecordBatch = data - .into_any() - .downcast::() - .map_err(|_| Error::EngineDataType("ArrowEngineData".to_string()))? - .into(); - - if let Some(mask) = mask { - Ok(arrow::compute::filter_record_batch( - &record_batch, - &mask.into(), - )?) - } else { - Ok(record_batch) - } - }) + .map(EngineDataArrowExt::try_into_record_batch) .try_collect()?; print_batches(&batches)?; diff --git a/kernel/src/action_reconciliation/log_replay.rs b/kernel/src/action_reconciliation/log_replay.rs index a0e923715c..33402000bf 100644 --- a/kernel/src/action_reconciliation/log_replay.rs +++ b/kernel/src/action_reconciliation/log_replay.rs @@ -31,6 +31,7 @@ //! actions selected //! use crate::engine_data::{FilteredEngineData, GetData, RowVisitor, TypedGetData as _}; +use crate::log_replay::deduplicator::Deduplicator as _; use crate::log_replay::{ ActionsBatch, FileActionDeduplicator, FileActionKey, HasSelectionVector, LogReplayProcessor, }; @@ -40,7 +41,8 @@ use crate::utils::require; use crate::{DeltaResult, Error}; use std::collections::HashSet; -use std::sync::LazyLock; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering}; +use std::sync::{Arc, LazyLock}; /// The [`ActionReconciliationProcessor`] is an implementation of the [`LogReplayProcessor`] /// trait that filters log segment actions. @@ -54,6 +56,9 @@ pub(crate) struct ActionReconciliationProcessor { seen_metadata: bool, /// Set of transaction app IDs that have been processed to avoid duplicates. seen_txns: HashSet, + /// Set of domain names that have been processed to avoid duplicates. + /// For each unique domain, only the first (newest) domain metadata action is kept. + seen_domains: HashSet, /// Minimum timestamp for file retention, used for filtering expired tombstones. minimum_file_retention_timestamp: i64, /// Transaction expiration timestamp for filtering old transactions @@ -64,6 +69,12 @@ pub(crate) struct ActionReconciliationProcessor { /// /// It contains the filtered batch of actions to be included, along with statistics about the /// number of actions filtered for inclusion. +/// +/// # Warning +/// +/// This iterator must be fully consumed to ensure proper collection of statistics. Additionally, +/// all yielded data must be written to the specified path before e.g. calling +/// [`CheckpointWriter::finalize`]. Failing to do so may result in data loss or corruption. pub(crate) struct ActionReconciliationBatch { /// The filtered batch of actions. 
pub(crate) filtered_data: FilteredEngineData, @@ -79,6 +90,94 @@ impl HasSelectionVector for ActionReconciliationBatch { } } +/// Stats for ActionReconciliationIterator +#[derive(Debug, Default)] +pub struct ActionReconciliationIteratorState { + actions_count: AtomicI64, + add_actions_count: AtomicI64, + is_exhausted: AtomicBool, +} + +impl ActionReconciliationIteratorState { + /// Get the total number of actions processed + pub fn actions_count(&self) -> i64 { + self.actions_count.load(Ordering::Acquire) + } + + /// Get the total number of add actions processed + pub fn add_actions_count(&self) -> i64 { + self.add_actions_count.load(Ordering::Acquire) + } + + /// True if the iterator has been exhausted (all batches processed) + pub fn is_exhausted(&self) -> bool { + self.is_exhausted.load(Ordering::Acquire) + } +} + +/// Iterator over action reconciliation data. +/// +/// This iterator yields a stream of [`FilteredEngineData`] items while, tracking action +/// counts. Used by both checkpoint and log compaction workflows. +pub struct ActionReconciliationIterator { + inner: Box> + Send>, + state: Arc, +} + +impl ActionReconciliationIterator { + /// Create a new iterator with counters initialized to 0 + pub(crate) fn new( + inner: Box> + Send>, + ) -> Self { + Self { + inner, + state: Arc::new(ActionReconciliationIteratorState::default()), + } + } + + /// Get the shared state. This allows sharing of stats. + pub fn state(&self) -> Arc { + Arc::clone(&self.state) + } + + /// Helper to transform a batch: update metrics and extract filtered data + fn transform_batch( + &mut self, + batch: Option>, + ) -> Option> { + let Some(batch) = batch else { + self.state.is_exhausted.store(true, Ordering::Release); + return None; + }; + Some(batch.map(|batch| { + self.state + .actions_count + .fetch_add(batch.actions_count, Ordering::Release); + self.state + .add_actions_count + .fetch_add(batch.add_actions_count, Ordering::Release); + batch.filtered_data + })) + } +} + +impl std::fmt::Debug for ActionReconciliationIterator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ActionReconciliationIterator") + .field("state", &self.state) + .finish() + } +} + +impl Iterator for ActionReconciliationIterator { + type Item = DeltaResult; + + fn next(&mut self) -> Option { + let batch = self.inner.next(); + self.transform_batch(batch) + } +} + impl LogReplayProcessor for ActionReconciliationProcessor { type Output = ActionReconciliationBatch; @@ -106,6 +205,7 @@ impl LogReplayProcessor for ActionReconciliationProcessor { self.seen_protocol, self.seen_metadata, &mut self.seen_txns, + &mut self.seen_domains, self.txn_expiration_timestamp, ); visitor.visit_rows_of(actions.as_ref())?; @@ -139,6 +239,7 @@ impl ActionReconciliationProcessor { seen_protocol: false, seen_metadata: false, seen_txns: Default::default(), + seen_domains: Default::default(), minimum_file_retention_timestamp, txn_expiration_timestamp, } @@ -168,6 +269,7 @@ impl ActionReconciliationProcessor { /// - Keeps only the first protocol action (newest version) /// - Keeps only the first metadata action (most recent table metadata) /// - Keeps only the first txn action for each unique app ID +/// - Keeps only the first domainMetadata action for each unique domain name /// /// # Excluded Actions /// - CommitInfo, CDC, and CheckpointMetadata actions should not appear in the action @@ -179,9 +281,10 @@ impl ActionReconciliationProcessor { /// - The CheckpointMetadata action is included down the wire when writing a V2 
spec checkpoint. /// /// # Memory Usage -/// This struct has O(N + M) memory usage where: +/// This struct has O(N + M + D) memory usage where: /// - N = number of txn actions with unique appIds /// - M = number of file actions with unique (path, dvId) pairs +/// - D = number of domainMetadata actions with unique domain names /// /// The resulting filtered set of actions are the reconciled actions. pub(crate) struct ActionReconciliationVisitor<'seen> { @@ -206,24 +309,57 @@ pub(crate) struct ActionReconciliationVisitor<'seen> { // Set of transaction IDs to deduplicate by appId // This set has O(N) memory usage where N = number of txn actions with unique appIds seen_txns: &'seen mut HashSet, + // Set of domain names to deduplicate domainMetadata by domain + // This set has O(D) memory usage where D = number of domainMetadata actions with unique domains + seen_domains: &'seen mut HashSet, /// Transaction expiration timestamp for filtering old transactions txn_expiration_timestamp: Option, } +/// A projected column used by `ActionReconciliationVisitor`. +/// +/// `index` is the position in the `getters: &[&dyn GetData]` slice. +/// `name` is the fully-qualified field path used when calling `get_*` (and appears in errors). +/// +/// Invariant: these constants must match the order in +/// `ActionReconciliationVisitor::selected_column_names_and_types()`. +#[derive(Debug, Copy, Clone)] +struct GetterColumn { + index: usize, + name: &'static str, +} + +impl GetterColumn { + const fn new(index: usize, name: &'static str) -> Self { + GetterColumn { index, name } + } +} + #[allow(unused)] impl ActionReconciliationVisitor<'_> { - // These index positions correspond to the order of columns defined in - // `selected_column_names_and_types()` - const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters - const ADD_DV_START_INDEX: usize = 1; // Start position of add deletion vector columns - const REMOVE_PATH_INDEX: usize = 4; // Position of "remove.path" in getters - const REMOVE_DELETION_TIMESTAMP_INDEX: usize = 5; // Position of "remove.deletionTimestamp" in getters - const REMOVE_DV_START_INDEX: usize = 6; // Start position of remove deletion vector columns - - // These are the column names used to access the data in the getters - const REMOVE_DELETION_TIMESTAMP: &'static str = "remove.deletionTimestamp"; - const PROTOCOL_MIN_READER_VERSION: &'static str = "protocol.minReaderVersion"; - const METADATA_ID: &'static str = "metaData.id"; + // Projected columns in the same order as `selected_column_names_and_types()`. + // DV columns are defined individually for completeness, even when accessed via a start index. 
+ const ADD_PATH: GetterColumn = GetterColumn::new(0, "add.path"); + const ADD_DV_STORAGE_TYPE: GetterColumn = + GetterColumn::new(1, "add.deletionVector.storageType"); + const ADD_DV_PATH_OR_INLINE_DV: GetterColumn = + GetterColumn::new(2, "add.deletionVector.pathOrInlineDv"); + const ADD_DV_OFFSET: GetterColumn = GetterColumn::new(3, "add.deletionVector.offset"); + const REMOVE_PATH: GetterColumn = GetterColumn::new(4, "remove.path"); + const REMOVE_DELETION_TIMESTAMP: GetterColumn = + GetterColumn::new(5, "remove.deletionTimestamp"); + const REMOVE_DV_STORAGE_TYPE: GetterColumn = + GetterColumn::new(6, "remove.deletionVector.storageType"); + const REMOVE_DV_PATH_OR_INLINE_DV: GetterColumn = + GetterColumn::new(7, "remove.deletionVector.pathOrInlineDv"); + const REMOVE_DV_OFFSET: GetterColumn = GetterColumn::new(8, "remove.deletionVector.offset"); + const METADATA_ID: GetterColumn = GetterColumn::new(9, "metaData.id"); + const PROTOCOL_MIN_READER_VERSION: GetterColumn = + GetterColumn::new(10, "protocol.minReaderVersion"); + const TXN_APP_ID: GetterColumn = GetterColumn::new(11, "txn.appId"); + const TXN_LAST_UPDATED: GetterColumn = GetterColumn::new(12, "txn.lastUpdated"); + const DOMAIN_METADATA_DOMAIN: GetterColumn = GetterColumn::new(13, "domainMetadata.domain"); + const DOMAIN_METADATA_REMOVED: GetterColumn = GetterColumn::new(14, "domainMetadata.removed"); #[allow(clippy::too_many_arguments)] pub(crate) fn new<'seen>( @@ -234,16 +370,17 @@ impl ActionReconciliationVisitor<'_> { seen_protocol: bool, seen_metadata: bool, seen_txns: &'seen mut HashSet, + seen_domains: &'seen mut HashSet, txn_expiration_timestamp: Option, ) -> ActionReconciliationVisitor<'seen> { ActionReconciliationVisitor { deduplicator: FileActionDeduplicator::new( seen_file_keys, is_log_batch, - Self::ADD_PATH_INDEX, - Self::REMOVE_PATH_INDEX, - Self::ADD_DV_START_INDEX, - Self::REMOVE_DV_START_INDEX, + Self::ADD_PATH.index, + Self::REMOVE_PATH.index, + Self::ADD_DV_STORAGE_TYPE.index, + Self::REMOVE_DV_STORAGE_TYPE.index, ), selection_vector, actions_count: 0, @@ -252,6 +389,7 @@ impl ActionReconciliationVisitor<'_> { seen_protocol, seen_metadata, seen_txns, + seen_domains, txn_expiration_timestamp, } } @@ -271,7 +409,7 @@ impl ActionReconciliationVisitor<'_> { // Spark and the Java Kernel. // Note: When remove.deletion_timestamp is not present (defaulting to 0), the remove action // will be excluded as it will be treated as expired. - let deletion_timestamp = getter.get_opt(i, "remove.deletionTimestamp")?; + let deletion_timestamp = getter.get_opt(i, Self::REMOVE_DELETION_TIMESTAMP.name)?; let deletion_timestamp = deletion_timestamp.unwrap_or(0i64); Ok(deletion_timestamp <= self.minimum_file_retention_timestamp) @@ -305,7 +443,7 @@ impl ActionReconciliationVisitor<'_> { true } else { // Expired remove actions are not valid - !self.is_expired_tombstone(i, getters[Self::REMOVE_DELETION_TIMESTAMP_INDEX])? + !self.is_expired_tombstone(i, getters[Self::REMOVE_DELETION_TIMESTAMP.index])? }; Ok(Some(is_valid)) } @@ -324,7 +462,7 @@ impl ActionReconciliationVisitor<'_> { // minReaderVersion is a required field, so we check for its presence to determine if this is a protocol action. // Only return the first (newest) protocol action we see, ignoring other types let result = getter - .get_int(i, Self::PROTOCOL_MIN_READER_VERSION)? + .get_int(i, Self::PROTOCOL_MIN_READER_VERSION.name)? 
.is_some() .then(|| !std::mem::replace(&mut self.seen_protocol, true)); Ok(result) @@ -344,7 +482,7 @@ impl ActionReconciliationVisitor<'_> { // id is a required field, so we check for its presence to determine if this is a metadata action. // Only return the first (newest) metadata action we see, ignoring other types let result = getter - .get_str(i, Self::METADATA_ID)? + .get_str(i, Self::METADATA_ID.name)? .is_some() .then(|| !std::mem::replace(&mut self.seen_metadata, true)); Ok(result) @@ -359,16 +497,18 @@ impl ActionReconciliationVisitor<'_> { fn check_txn_action<'a>( &mut self, i: usize, - getter: &[&'a dyn GetData<'a>], + getters: &[&'a dyn GetData<'a>], ) -> DeltaResult> { - // Check for txn field - let Some(app_id) = getter[11].get_str(i, "txn.appId")? else { + let Some(app_id) = getters[Self::TXN_APP_ID.index].get_str(i, Self::TXN_APP_ID.name)? + else { return Ok(None); // Not a txn action, continue checking other types }; // Check retention if last_updated is present if let Some(retention_ts) = self.txn_expiration_timestamp { - if let Some(last_updated) = getter[12].get_opt(i, "txn.lastUpdated")? { + if let Some(last_updated) = + getters[Self::TXN_LAST_UPDATED.index].get_opt(i, Self::TXN_LAST_UPDATED.name)? + { let last_updated: i64 = last_updated; if last_updated <= retention_ts { // Transaction is old, exclude it @@ -383,6 +523,37 @@ impl ActionReconciliationVisitor<'_> { Ok(Some(self.seen_txns.insert(app_id.to_string()))) } + /// Processes a potential domainMetadata action to determine if it should be included. + /// + /// Returns `Ok(Some(true))` if the row contains a valid domainMetadata action. + /// Returns `Ok(Some(false))` if the row contains a domainMetadata action but it's suppressed + /// (duplicate or tombstone with removed=true). + /// Returns `Ok(None)` if the row doesn't contain a domainMetadata action (continue checking other action types). + /// Returns `Err(...)` if there was an error processing the action. + fn check_domain_metadata_action<'a>( + &mut self, + i: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult> { + let Some(domain) = getters[Self::DOMAIN_METADATA_DOMAIN.index] + .get_str(i, Self::DOMAIN_METADATA_DOMAIN.name)? + else { + return Ok(None); // Not a domainMetadata action, continue checking other types + }; + + // Exclude tombstones (removed=true) from checkpoint per protocol spec + let removed: bool = getters[Self::DOMAIN_METADATA_REMOVED.index] + .get_opt(i, Self::DOMAIN_METADATA_REMOVED.name)? + .unwrap_or(false); + if removed { + return Ok(Some(false)); + } + + // If the domain already exists in the set, the insertion will return false, + // indicating that this is a duplicate. + Ok(Some(self.seen_domains.insert(domain.to_string()))) + } + /// Determines if a row in the batch should be included. /// /// This method checks each action type in sequence, short-circuiting when: @@ -393,7 +564,8 @@ impl ActionReconciliationVisitor<'_> { /// Actions are checked in order of expected frequency of occurrence to optimize performance: /// 1. File actions (most frequent) /// 2. Txn actions - /// 3. Protocol & Metadata actions (least frequent) + /// 3. DomainMetadata actions + /// 4. Protocol & Metadata actions (least frequent) /// /// Returns `Ok(true)` if the row should be included. /// Returns `Ok(false)` if the row should be skipped. 
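To make the deduplication rules above concrete, here is a minimal standalone sketch, using only the standard library, of the idiom the visitor applies to txn and domainMetadata actions: `HashSet::insert` returns `false` when the key is already present, so the first action encountered during log replay (the newest one) wins and later duplicates are suppressed, while domainMetadata tombstones (`removed = true`) are dropped outright. The `keep_domain` helper and the domain names are purely illustrative, not kernel API.

```rust
use std::collections::HashSet;

// Illustrative helper mirroring the visitor's domainMetadata handling:
// tombstones are excluded, and `HashSet::insert` suppresses later duplicates.
fn keep_domain(seen_domains: &mut HashSet<String>, domain: &str, removed: bool) -> bool {
    if removed {
        return false; // tombstone: never written to the checkpoint
    }
    seen_domains.insert(domain.to_string()) // false if this domain was already seen
}

fn main() {
    let mut seen = HashSet::new();
    assert!(keep_domain(&mut seen, "example.domain", false)); // newest action kept
    assert!(!keep_domain(&mut seen, "example.domain", false)); // older duplicate dropped
    assert!(!keep_domain(&mut seen, "other.domain", true)); // tombstone dropped
}
```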
@@ -403,15 +575,18 @@ impl ActionReconciliationVisitor<'_> { i: usize, getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { - // Check each action type in sequence, short-circuiting when an action is found let is_valid = if let Some(result) = self.check_file_action(i, getters)? { result } else if let Some(result) = self.check_txn_action(i, getters)? { result - } else if let Some(result) = self.check_protocol_action(i, getters[10])? { + } else if let Some(result) = self.check_domain_metadata_action(i, getters)? { + result + } else if let Some(result) = + self.check_protocol_action(i, getters[Self::PROTOCOL_MIN_READER_VERSION.index])? + { result } else { - self.check_metadata_action(i, getters[9])? + self.check_metadata_action(i, getters[Self::METADATA_ID.index])? .unwrap_or_default() }; @@ -425,16 +600,19 @@ impl ActionReconciliationVisitor<'_> { impl RowVisitor for ActionReconciliationVisitor<'_> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // The data columns visited must be in the following order: + // The data columns visited must be in the following order, which must match + // the order of fields in CHECKPOINT_ACTIONS_SCHEMA / COMPACTION_ACTIONS_SCHEMA: // 1. ADD // 2. REMOVE // 3. METADATA // 4. PROTOCOL // 5. TXN + // 6. DOMAIN_METADATA static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { const STRING: DataType = DataType::STRING; const INTEGER: DataType = DataType::INTEGER; const LONG: DataType = DataType::LONG; + const BOOLEAN: DataType = DataType::BOOLEAN; let types_and_names = vec![ // File action columns (STRING, column_name!("add.path")), @@ -451,6 +629,8 @@ impl RowVisitor for ActionReconciliationVisitor<'_> { (INTEGER, column_name!("protocol.minReaderVersion")), (STRING, column_name!("txn.appId")), (LONG, column_name!("txn.lastUpdated")), + (STRING, column_name!("domainMetadata.domain")), + (BOOLEAN, column_name!("domainMetadata.removed")), ]; let (types, names) = types_and_names.into_iter().unzip(); (names, types).into() @@ -460,7 +640,7 @@ impl RowVisitor for ActionReconciliationVisitor<'_> { fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 13, + getters.len() == 15, Error::InternalError(format!( "Wrong number of visitor getters: {}", getters.len() @@ -513,6 +693,7 @@ mod tests { let data = action_batch(); let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); let mut visitor = ActionReconciliationVisitor::new( &mut seen_file_keys, true, @@ -521,6 +702,7 @@ mod tests { false, false, &mut seen_txns, + &mut seen_domains, None, ); @@ -569,6 +751,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); let mut visitor = ActionReconciliationVisitor::new( &mut seen_file_keys, true, @@ -577,6 +760,7 @@ mod tests { false, false, &mut seen_txns, + &mut seen_domains, None, ); @@ -600,6 +784,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); let mut visitor = ActionReconciliationVisitor::new( &mut seen_file_keys, false, // is_log_batch = false (batch) @@ -608,6 +793,7 @@ mod tests { false, false, &mut seen_txns, + &mut seen_domains, None, ); @@ -639,6 +825,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); let mut visitor = 
ActionReconciliationVisitor::new( &mut seen_file_keys, true, @@ -647,6 +834,7 @@ mod tests { false, false, &mut seen_txns, + &mut seen_domains, None, ); @@ -672,6 +860,7 @@ mod tests { // Pre-populate with txn app1 let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); seen_txns.insert("app1".to_string()); let mut visitor = ActionReconciliationVisitor::new( @@ -682,6 +871,7 @@ mod tests { true, // The visitor has already seen a protocol action true, // The visitor has already seen a metadata action &mut seen_txns, // Pre-populated transaction + &mut seen_domains, None, ); @@ -712,6 +902,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); let mut visitor = ActionReconciliationVisitor::new( &mut seen_file_keys, true, // is_log_batch @@ -720,6 +911,7 @@ mod tests { false, false, &mut seen_txns, + &mut seen_domains, None, ); @@ -870,6 +1062,7 @@ mod tests { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); + let mut seen_domains = HashSet::new(); let mut visitor = ActionReconciliationVisitor::new( &mut seen_file_keys, true, @@ -878,6 +1071,7 @@ mod tests { false, false, &mut seen_txns, + &mut seen_domains, Some(1000), // expiration timestamp ); @@ -1026,6 +1220,7 @@ mod tests { fn create_test_visitor<'a>( seen_file_keys: &'a mut HashSet, seen_txns: &'a mut HashSet, + seen_domains: &'a mut HashSet, txn_expiration_timestamp: Option, ) -> ActionReconciliationVisitor<'a> { ActionReconciliationVisitor::new( @@ -1036,17 +1231,18 @@ mod tests { false, false, seen_txns, + seen_domains, txn_expiration_timestamp, ) } - /// Helper function to create 13 getters with one specific error getter at the given index + /// Helper function to create 14 getters with one specific error getter at the given index fn create_getters_with_error_at_index( error_index: usize, error_field: &'static str, error_type: &'static str, ) -> Vec { - (0..13) + (0..15) .map(|i| { if i == error_index { MockErrorGetData::new(error_field, error_type) @@ -1062,9 +1258,11 @@ mod tests { // Test 1: Wrong getter count validation let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = create_test_visitor(&mut seen_file_keys, &mut seen_txns, None); + let mut seen_domains = HashSet::new(); + let mut visitor = + create_test_visitor(&mut seen_file_keys, &mut seen_txns, &mut seen_domains, None); let getter = MockErrorGetData::default(); - let getters = vec![&getter as &dyn GetData<'_>; 5]; // Wrong count (should be 13)! + let getters = vec![&getter as &dyn GetData<'_>; 5]; // Wrong count (should be 15)! 
let result = visitor.visit(1, &getters); assert!(result.is_err()); assert!(result @@ -1088,7 +1286,9 @@ mod tests { for (getter_index, field_name, error_type, expected_error_text) in test_cases { let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = create_test_visitor(&mut seen_file_keys, &mut seen_txns, None); + let mut seen_domains = HashSet::new(); + let mut visitor = + create_test_visitor(&mut seen_file_keys, &mut seen_txns, &mut seen_domains, None); let getters = create_getters_with_error_at_index(getter_index, field_name, error_type); let getter_refs: Vec<&dyn GetData<'_>> = getters.iter().map(|g| g as &dyn GetData<'_>).collect(); @@ -1106,17 +1306,27 @@ mod tests { // Test txn.lastUpdated with retention enabled let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = create_test_visitor(&mut seen_file_keys, &mut seen_txns, Some(1000)); + let mut seen_domains = HashSet::new(); + let mut visitor = create_test_visitor( + &mut seen_file_keys, + &mut seen_txns, + &mut seen_domains, + Some(1000), + ); let defaults = (0..11) .map(|_| MockErrorGetData::default()) .collect::>(); let error_mock = FlexibleMock { error_field: "lastUpdated", }; + let domain_default = MockErrorGetData::default(); + let domain_removed_default = MockErrorGetData::default(); let mut getters: Vec<&dyn GetData<'_>> = defaults.iter().map(|g| g as &dyn GetData<'_>).collect(); getters.push(&error_mock); // txn fields getters.push(&error_mock); + getters.push(&domain_default); // domainMetadata.domain + getters.push(&domain_removed_default); // domainMetadata.removed let result = visitor.visit(1, &getters); assert!(result.is_err()); assert!(result @@ -1127,14 +1337,16 @@ mod tests { // Test remove.deletionTimestamp let mut seen_file_keys = HashSet::new(); let mut seen_txns = HashSet::new(); - let mut visitor = create_test_visitor(&mut seen_file_keys, &mut seen_txns, None); + let mut seen_domains = HashSet::new(); + let mut visitor = + create_test_visitor(&mut seen_file_keys, &mut seen_txns, &mut seen_domains, None); let defaults = (0..4) .map(|_| MockErrorGetData::default()) .collect::>(); let error_mock = FlexibleMock { error_field: "deletionTimestamp", }; - let defaults2 = (0..7) + let defaults2 = (0..9) .map(|_| MockErrorGetData::default()) .collect::>(); let mut getters: Vec<&dyn GetData<'_>> = diff --git a/kernel/src/action_reconciliation/mod.rs b/kernel/src/action_reconciliation/mod.rs index 6daf2de3d0..ccc824f1a0 100644 --- a/kernel/src/action_reconciliation/mod.rs +++ b/kernel/src/action_reconciliation/mod.rs @@ -23,6 +23,8 @@ use crate::{DeltaResult, Error}; pub(crate) mod log_replay; +pub use log_replay::{ActionReconciliationIterator, ActionReconciliationIteratorState}; + const SECONDS_PER_MINUTE: u64 = 60; const MINUTES_PER_HOUR: u64 = 60; const HOURS_PER_DAY: u64 = 24; diff --git a/kernel/src/actions/crc.rs b/kernel/src/actions/crc.rs deleted file mode 100644 index e9265b180a..0000000000 --- a/kernel/src/actions/crc.rs +++ /dev/null @@ -1,277 +0,0 @@ -//! 
CRC (version checksum) file -use std::sync::LazyLock; - -use super::visitors::{visit_metadata_at, visit_protocol_at}; -use super::{Add, DomainMetadata, Metadata, Protocol, SetTransaction}; -use crate::actions::PROTOCOL_NAME; -use crate::engine_data::GetData; -use crate::schema::ToSchema as _; -use crate::schema::{ColumnName, ColumnNamesAndTypes, DataType}; -use crate::utils::require; -use crate::{DeltaResult, Error, RowVisitor}; -use delta_kernel_derive::ToSchema; - -/// Though technically not an action, we include the CRC (version checksum) file here. A [CRC file] -/// must: -/// 1. Be named `{version}.crc` with version zero-padded to 20 digits: `00000000000000000001.crc` -/// 2. Be stored directly in the _delta_log directory alongside Delta log files -/// 3. Contain exactly one JSON object with the schema of this [`Crc`] struct. -/// -/// [CRC file]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#version-checksum-file -#[allow(unused)] // TODO: remove after we complete CRC support -#[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -pub(crate) struct Crc { - /// A unique identifier for the transaction that produced this commit. - pub(crate) txn_id: Option, - /// Total size of the table in bytes, calculated as the sum of the `size` field of all live - /// [`Add`] actions. - pub(crate) table_size_bytes: i64, - /// Number of live [`Add`] actions in this table version after action reconciliation. - pub(crate) num_files: i64, - /// Number of [`Metadata`] actions. Must be 1. - pub(crate) num_metadata: i64, - /// Number of [`Protocol`] actions. Must be 1. - pub(crate) num_protocol: i64, - /// The in-commit timestamp of this version. Present iff In-Commit Timestamps are enabled. - pub(crate) in_commit_timestamp_opt: Option, - /// Live transaction identifier ([`SetTransaction`]) actions at this version. - pub(crate) set_transactions: Option>, - /// Live [`DomainMetadata`] actions at this version, excluding tombstones. - pub(crate) domain_metadata: Option>, - /// The table [`Metadata`] at this version. - pub(crate) metadata: Metadata, - /// The table [`Protocol`] at this version. - pub(crate) protocol: Protocol, - /// Size distribution information of files remaining after action reconciliation. - pub(crate) file_size_histogram: Option, - /// All live [`Add`] file actions at this version. - pub(crate) all_files: Option>, - /// Number of records deleted through Deletion Vectors in this table version. - pub(crate) num_deleted_records_opt: Option, - /// Number of Deletion Vectors active in this table version. - pub(crate) num_deletion_vectors_opt: Option, - /// Distribution of deleted record counts across files. See this section for more details. - pub(crate) deleted_record_counts_histogram_opt: Option, -} - -/// The [FileSizeHistogram] object represents a histogram tracking file counts and total bytes -/// across different size ranges. -/// -/// [FileSizeHistogram]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-size-histogram-schema -#[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -pub(crate) struct FileSizeHistogram { - /// A sorted array of bin boundaries where each element represents the start of a bin - /// (inclusive) and the next element represents the end of the bin (exclusive). The first - /// element must be 0. - pub(crate) sorted_bin_boundaries: Vec, - /// Count of files in each bin. Length must match `sorted_bin_boundaries`. - pub(crate) file_counts: Vec, - /// Total bytes of files in each bin. Length must match `sorted_bin_boundaries`. 
- pub(crate) total_bytes: Vec, -} - -/// The [DeletedRecordCountsHistogram] object represents a histogram tracking the distribution of -/// deleted record counts across files in the table. Each bin in the histogram represents a range -/// of deletion counts and stores the number of files having that many deleted records. -/// -/// The histogram bins correspond to the following ranges: -/// Bin 0: [0, 0] (files with no deletions) -/// Bin 1: [1, 9] (files with 1-9 deleted records) -/// Bin 2: [10, 99] (files with 10-99 deleted records) -/// Bin 3: [100, 999] (files with 100-999 deleted records) -/// Bin 4: [1000, 9999] (files with 1,000-9,999 deleted records) -/// Bin 5: [10000, 99999] (files with 10,000-99,999 deleted records) -/// Bin 6: [100000, 999999] (files with 100,000-999,999 deleted records) -/// Bin 7: [1000000, 9999999] (files with 1,000,000-9,999,999 deleted records) -/// Bin 8: [10000000, 2147483646] (files with 10,000,000 to 2,147,483,646 deleted records) -/// Bin 9: [2147483647, ∞) (files with 2,147,483,647 or more deleted records) -/// -/// [DeletedRecordCountsHistogram]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#deleted-record-counts-histogram-schema -#[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -pub(crate) struct DeletedRecordCountsHistogram { - /// Array of size 10 where each element represents the count of files falling into a specific - /// deletion count range. - pub(crate) deleted_record_counts: Vec, -} - -/// For now we just define a visitor for Protocol and Metadata in CRC files since (for now) that's -/// the only optimization we implement. Since CRC files can contain lots of other data, we have a -/// specific visitor for only Protocol/Metadata here. -#[allow(unused)] // TODO: remove after we read CRCs -#[derive(Debug, Default)] -pub(crate) struct CrcProtocolMetadataVisitor { - pub(crate) protocol: Protocol, - pub(crate) metadata: Metadata, -} - -impl RowVisitor for CrcProtocolMetadataVisitor { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { - // annoyingly, the 'metadata' in CRC is under the name 'metadata', not 'metaData' - let mut cols = Metadata::to_schema().leaves("metadata"); - cols.extend(Protocol::to_schema().leaves(PROTOCOL_NAME)); - cols - }); - NAMES_AND_TYPES.as_ref() - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - // getters = sum of Protocol + Metadata - require!( - getters.len() == 13, - Error::InternalError(format!( - "Wrong number of CrcProtocolMetadataVisitor getters: {}", - getters.len() - )) - ); - if row_count != 1 { - return Err(Error::InternalError(format!( - "Expected 1 row for CRC file, but got {row_count}", - ))); - } - - self.metadata = visit_metadata_at(0, &getters[..9])? - .ok_or(Error::generic("Metadata not found in CRC file"))?; - self.protocol = visit_protocol_at(0, &getters[9..])? 
- .ok_or(Error::generic("Protocol not found in CRC file"))?; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::sync::Arc; - - use crate::arrow::array::StringArray; - - use crate::actions::{Format, Metadata, Protocol}; - use crate::engine::sync::SyncEngine; - use crate::schema::derive_macro_utils::ToDataType as _; - use crate::schema::{ArrayType, DataType, StructField, StructType}; - use crate::table_features::{ReaderFeature, WriterFeature}; - use crate::utils::test_utils::string_array_to_engine_data; - use crate::Engine; - - #[test] - fn test_file_size_histogram_schema() { - let schema = FileSizeHistogram::to_schema(); - let expected = StructType::new_unchecked([ - StructField::not_null("sortedBinBoundaries", ArrayType::new(DataType::LONG, false)), - StructField::not_null("fileCounts", ArrayType::new(DataType::LONG, false)), - StructField::not_null("totalBytes", ArrayType::new(DataType::LONG, false)), - ]); - assert_eq!(schema, expected); - } - - #[test] - fn test_deleted_record_counts_histogram_schema() { - let schema = DeletedRecordCountsHistogram::to_schema(); - let expected = StructType::new_unchecked([StructField::not_null( - "deletedRecordCounts", - ArrayType::new(DataType::LONG, false), - )]); - assert_eq!(schema, expected); - } - - #[test] - fn test_crc_schema() { - let schema = Crc::to_schema(); - let expected = StructType::new_unchecked([ - StructField::nullable("txnId", DataType::STRING), - StructField::not_null("tableSizeBytes", DataType::LONG), - StructField::not_null("numFiles", DataType::LONG), - StructField::not_null("numMetadata", DataType::LONG), - StructField::not_null("numProtocol", DataType::LONG), - StructField::nullable("inCommitTimestampOpt", DataType::LONG), - StructField::nullable( - "setTransactions", - ArrayType::new(SetTransaction::to_data_type(), false), - ), - StructField::nullable( - "domainMetadata", - ArrayType::new(DomainMetadata::to_data_type(), false), - ), - StructField::not_null("metadata", Metadata::to_data_type()), - StructField::not_null("protocol", Protocol::to_data_type()), - StructField::nullable("fileSizeHistogram", FileSizeHistogram::to_data_type()), - StructField::nullable("allFiles", ArrayType::new(Add::to_data_type(), false)), - StructField::nullable("numDeletedRecordsOpt", DataType::LONG), - StructField::nullable("numDeletionVectorsOpt", DataType::LONG), - StructField::nullable( - "deletedRecordCountsHistogramOpt", - DeletedRecordCountsHistogram::to_data_type(), - ), - ]); - assert_eq!(schema, expected); - } - - #[test] - fn test_crc_protocol_metadata_visitor() { - // create CRC to visit - let crc_json = serde_json::json!({ - "tableSizeBytes": 100, - "numFiles": 10, - "numMetadata": 1, - "numProtocol": 1, - "metadata": { - "id": "testId", - "format": { - "provider": "parquet", - "options": {} - }, - "schemaString": r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#, - "partitionColumns": [], - "configuration": { - "delta.columnMapping.mode": "none" - }, - "createdTime": 1677811175 - }, - "protocol": { - "minReaderVersion": 3, - "minWriterVersion": 7, - "readerFeatures": ["columnMapping"], - "writerFeatures": ["columnMapping"] - } - }); - - // convert JSON -> StringArray -> (string)EngineData -> actual CRC EngineData - let json_string = crc_json.to_string(); - let json_strings = StringArray::from(vec![json_string.as_str()]); - let engine_data = string_array_to_engine_data(json_strings); - let engine = SyncEngine::new(); - let json_handler = engine.json_handler(); - let 
output_schema = Arc::new(Crc::to_schema()); - let data = json_handler.parse_json(engine_data, output_schema).unwrap(); - - // run the visitor - let mut visitor = CrcProtocolMetadataVisitor::default(); - visitor.visit_rows_of(data.as_ref()).unwrap(); - - let expected_protocol = Protocol { - min_reader_version: 3, - min_writer_version: 7, - reader_features: Some(vec![ReaderFeature::ColumnMapping]), - writer_features: Some(vec![WriterFeature::ColumnMapping]), - }; - let expected_metadata = Metadata { - id: "testId".to_string(), - name: None, - description: None, - format: Format { - provider: "parquet".to_string(), - options: std::collections::HashMap::new(), - }, - schema_string: r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#.to_string(), - partition_columns: vec![], - created_time: Some(1677811175), - configuration: std::collections::HashMap::from([ - ("delta.columnMapping.mode".to_string(), "none".to_string()), - ]), - }; - - assert_eq!(visitor.protocol, expected_protocol); - assert_eq!(visitor.metadata, expected_metadata); - } -} diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index 2c269f0369..21f9287443 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -10,12 +10,24 @@ use delta_kernel_derive::ToSchema; use roaring::RoaringTreemap; use url::Url; +use crc::{Crc, CRC_32_ISO_HDLC}; + use crate::schema::DataType; use crate::utils::require; use crate::{DeltaResult, Error, StorageHandler}; +/// Magic number for portable RoaringBitmap serialization format. +/// This is the standard format defined in the RoaringBitmap Specification +/// and is used by Delta for deletion vector storage. +/// See: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#deletion-vector-format +const ROARING_BITMAP_PORTABLE_MAGIC: u32 = 1681511377; + +/// Magic number for native RoaringBitmap serialization format. +/// This format is reserved for future use and not currently supported. +const ROARING_BITMAP_NATIVE_MAGIC: u32 = 1681511376; + #[derive(Debug, PartialEq, Eq, Clone, Copy)] -#[cfg_attr(test, derive(serde::Serialize))] +#[cfg_attr(test, derive(serde::Serialize, serde::Deserialize))] pub enum DeletionVectorStorageType { #[cfg_attr(test, serde(rename = "u"))] PersistedRelative, @@ -34,8 +46,7 @@ impl FromStr for DeletionVectorStorageType { "i" => Ok(Self::Inline), "p" => Ok(Self::PersistedAbsolute), _ => Err(Error::internal_error(format!( - "Unsupported deletion vector format option: {}", - s + "Unsupported deletion vector format option: {s}" ))), } } @@ -57,8 +68,69 @@ impl ToDataType for DeletionVectorStorageType { } } +/// Represents an abstract path to a deletion vector file. +/// +/// This is used in the public API to construct the path to a deletion vector file and +/// has logic to convert [`crate::actions::deletion_vector_writer::DeletionVectorWriteResult`] +/// to a [`DeletionVectorDescriptor`] with appropriate storage type and path. +pub struct DeletionVectorPath { + /// The base URL path to the Delta table + table_path: Url, + /// Unique identifier for this deletion vector file + uuid: uuid::Uuid, + /// Optional directory prefix within the table path where the DV file will be located, + /// this is to allow for randomizing reads/writes to avoid object store throttling. 
+ prefix: String, +} + +impl DeletionVectorPath { + pub(crate) fn new(table_path: Url, prefix: String) -> Self { + Self { + table_path, + uuid: uuid::Uuid::new_v4(), + prefix, + } + } + + #[cfg(test)] + pub(crate) fn new_with_uuid(table_path: Url, prefix: String, uuid: uuid::Uuid) -> Self { + Self { + table_path, + uuid, + prefix, + } + } + + /// Helper method to construct the relative path to a deletion vector file + /// from the prefix and UUID suffix. + fn relative_path(prefix: &str, uuid: &uuid::Uuid) -> String { + if !prefix.is_empty() { + format!("{prefix}/deletion_vector_{uuid}.bin") + } else { + format!("deletion_vector_{uuid}.bin") + } + } + + /// Returns the absolute path to the deletion vector file. + pub fn absolute_path(&self) -> DeltaResult { + let dv_suffix = Self::relative_path(&self.prefix, &self.uuid); + self.table_path + .join(&dv_suffix) + .map_err(|_| Error::DeletionVector(format!("invalid path: {dv_suffix}"))) + } + + /// Returns the compressed encoded path for use in descriptor (prefix + z85 encoded UUID). + pub(crate) fn encoded_relative_path(&self) -> String { + format!("{}{}", self.prefix, z85::encode(self.uuid.as_bytes())) + } +} + #[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -#[cfg_attr(test, derive(serde::Serialize), serde(rename_all = "camelCase"))] +#[cfg_attr( + test, + derive(serde::Serialize, serde::Deserialize), + serde(rename_all = "camelCase") +)] pub struct DeletionVectorDescriptor { /// A single character to indicate how to access the DV. Legal options are: ['u', 'i', 'p']. pub storage_type: DeletionVectorStorageType, @@ -122,14 +194,8 @@ impl DeletionVectorDescriptor { .map_err(|_| Error::deletion_vector("Failed to decode DV uuid"))?; let uuid = uuid::Uuid::from_slice(&decoded) .map_err(|err| Error::DeletionVector(err.to_string()))?; - let dv_suffix = if prefix_len > 0 { - format!( - "{}/deletion_vector_{uuid}.bin", - &self.path_or_inline_dv[..prefix_len] - ) - } else { - format!("deletion_vector_{uuid}.bin") - }; + let dv_suffix = + DeletionVectorPath::relative_path(&self.path_or_inline_dv[..prefix_len], &uuid); let dv_path = parent .join(&dv_suffix) .map_err(|_| Error::DeletionVector(format!("invalid path: {dv_suffix}")))?; @@ -161,63 +227,129 @@ impl DeletionVectorDescriptor { .map_err(|_| Error::deletion_vector("Failed to decode DV"))?; let magic = slice_to_u32(&byte_slice[0..4], Endian::Little)?; match magic { - 1681511377 => RoaringTreemap::deserialize_from(&byte_slice[4..]) - .map_err(|err| Error::DeletionVector(err.to_string())), - 1681511376 => { - todo!("Don't support native serialization in inline bitmaps yet"); + ROARING_BITMAP_PORTABLE_MAGIC => { + RoaringTreemap::deserialize_from(&byte_slice[4..]) + .map_err(|err| Error::DeletionVector(err.to_string())) } + ROARING_BITMAP_NATIVE_MAGIC => Err(Error::deletion_vector( + "Native serialization in inline bitmaps is not yet supported", + )), _ => Err(Error::DeletionVector(format!("Invalid magic {magic}"))), } } Some(path) => { - let offset = self.offset; - let size_in_bytes = self.size_in_bytes; + let size_in_bytes: u32 = + self.size_in_bytes + .try_into() + .or(Err(Error::DeletionVector(format!( + "size_in_bytes doesn't fit in usize for {path}" + ))))?; let dv_data = storage - .read_files(vec![(path, None)])? + .read_files(vec![(path.clone(), None)])? 
.next() - .ok_or(Error::missing_data("No deletion vector data"))??; + .ok_or(Error::missing_data(format!( + "No deletion vector data for {path}" + )))??; + let dv_data_len = dv_data.len(); let mut cursor = Cursor::new(dv_data); let mut version_buf = [0; 1]; - cursor - .read(&mut version_buf) - .map_err(|err| Error::DeletionVector(err.to_string()))?; + cursor.read(&mut version_buf).map_err(|err| { + Error::DeletionVector(format!("Failed to read version from {path}: {err}")) + })?; let version = u8::from_be_bytes(version_buf); require!( version == 1, - Error::DeletionVector(format!("Invalid version: {version}")) + Error::DeletionVector(format!("Invalid version {version} for {path}")) ); - if let Some(offset) = offset { - cursor.set_position(offset as u64); - } + // Deletion vector file format: + // +---------------+-----------------+ + // | num bytes | value | + // +===============+=================+ + // | 1 byte | version | + // +---------------+-----------------+ + // | offset-1 | other dvs... | + // +---------------+-----------------+ <- this_dv_start + // | 4 bytes | dv_size | + // +---------------+-----------------+ + // | 4 bytes | magic value | + // +---------------+-----------------+ <- bitmap_start + // | dv_size - 4 | bitmap | + // +---------------+-----------------+ <- crc_start + // | 4 bytes | CRC | + // +---------------+-----------------+ + + let this_dv_start: usize = + self.offset + .unwrap_or(1) + .try_into() + .or(Err(Error::DeletionVector(format!( + "Offset {:?} doesn't fit in usize for {path}", + self.offset + ))))?; + let magic_start = this_dv_start + 4; + // bitmap_start = this_dv_start + 4 (dv_size field) + 4 (magic field) + let bitmap_start = this_dv_start + 8; + // crc_start = this_dv_start + 4 (dv_size field) + dv_size (magic field + bitmap) + // Safety: size_in_bytes is checked to fit in u32 which for all known platforms should + // fix in usize range. + let crc_start = this_dv_start + 4 + (size_in_bytes as usize); + require!( + this_dv_start < dv_data_len, + Error::DeletionVector(format!( + "This DV start is out of bounds for {path} (Offset: {this_dv_start} >= Size: {dv_data_len})" + )) + ); + + cursor.set_position(this_dv_start as u64); let dv_size = read_u32(&mut cursor, Endian::Big)?; require!( - dv_size == size_in_bytes as u32, + dv_size == size_in_bytes, Error::DeletionVector(format!( - "DV size mismatch. Log indicates {size_in_bytes}, file says: {dv_size}" + "DV size mismatch for {path}. Log indicates {size_in_bytes}, file says: {dv_size}" )) ); let magic = read_u32(&mut cursor, Endian::Little)?; require!( - magic == 1681511377, - Error::DeletionVector(format!("Invalid magic: {magic}")) + magic == ROARING_BITMAP_PORTABLE_MAGIC, + Error::DeletionVector(format!("Invalid magic {magic} for {path}")) ); - // get the Bytes back out and limit it to dv_size - let position = cursor.position(); - let mut bytes = cursor.into_inner(); - let truncate_pos = position + dv_size as u64; - assert!( - truncate_pos <= usize::MAX as u64, - "Can't truncate as truncate_pos is > usize::MAX" + let bytes = cursor.into_inner(); + + // +4 to account for CRC value + require!( + bytes.len() >= crc_start + 4, + Error::DeletionVector(format!( + "Can't read deletion vector for {path} as there are not enough bytes. 
Expected {}, but got {}", + crc_start + 4, + bytes.len() + )) + ); + + let mut crc_cursor: Cursor = + Cursor::new(bytes.slice(crc_start..crc_start + 4)); + let crc = read_u32(&mut crc_cursor, Endian::Big)?; + let crc32 = create_dv_crc32(); + // CRC is calculated from magic field through end of bitmap + // Safety: verified bytes is larger than crc_start + 4, above. + let expected_crc = crc32.checksum(&bytes.slice(magic_start..crc_start)); + require!( + crc == expected_crc, + Error::DeletionVector(format!( + "CRC32 checksum mismatch for {path}. Got: {crc}, expected: {expected_crc}" + )) ); - bytes.truncate(truncate_pos as usize); - let mut cursor = Cursor::new(bytes); - cursor.set_position(position); - RoaringTreemap::deserialize_from(cursor) - .map_err(|err| Error::DeletionVector(err.to_string())) + // Safety: verified bytes is larger than crc_start + 4, above. + let dv_bytes = bytes.slice(bitmap_start..crc_start); + let cursor = Cursor::new(dv_bytes); + RoaringTreemap::deserialize_from(cursor).map_err(|err| { + Error::DeletionVector(format!( + "Failed to deserialize deletion vector for {path}: {err}" + )) + }) } } } @@ -238,6 +370,12 @@ enum Endian { Little, } +/// Factory function to create a CRC-32 instance using the ISO HDLC algorithm. +/// This ensures consistent CRC algorithm usage for deletion vectors. +pub(crate) fn create_dv_crc32() -> Crc { + Crc::::new(&CRC_32_ISO_HDLC) +} + /// small helper to read a big or little endian u32 from a cursor fn read_u32(cursor: &mut Cursor, endian: Endian) -> DeltaResult { let mut buf = [0; 4]; @@ -412,6 +550,12 @@ mod tests { assert_eq!(dv_url, example.absolute_path(&parent).unwrap().unwrap()); } + #[test] + fn test_magic_number_constants() { + assert_eq!(ROARING_BITMAP_PORTABLE_MAGIC, 1681511377); + assert_eq!(ROARING_BITMAP_NATIVE_MAGIC, 1681511376); + } + #[test] fn test_inline_read() { let inline = dv_inline(); @@ -428,6 +572,36 @@ mod tests { } } + #[test] + fn test_inline_native_serialization_error() { + // Construct an inline DV payload whose first 4 bytes (little-endian) are the + // native serialization magic (1681511376). The `read` method should return + // a DeletionVector error indicating native serialization isn't supported. 
+ let sync_engine = SyncEngine::new(); + let storage = sync_engine.storage_handler(); + let parent = Url::parse("http://not.used").unwrap(); + + let mut bytes = Vec::new(); + // native serialization magic (little-endian) + bytes.extend_from_slice(&1681511376u32.to_le_bytes()); + // some trailing bytes (content not important for this test) + bytes.extend_from_slice(&[1u8, 2, 3, 4]); + + let encoded = z85::encode(&bytes); + + let inline = DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::Inline, + path_or_inline_dv: encoded, + offset: None, + size_in_bytes: bytes.len() as i32, + cardinality: 0, + }; + + let err = inline.read(storage, &parent).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("Native serialization in inline bitmaps is not yet supported")); + } + #[test] fn test_deletion_vector_read() { let path = @@ -437,7 +611,7 @@ mod tests { let storage = sync_engine.storage_handler(); let example = dv_example(); - let tree_map = example.read(storage, &parent).unwrap(); + let tree_map = example.read(storage.clone(), &parent).unwrap(); let expected: Vec = vec![0, 9]; let found = tree_map.iter().collect::>(); @@ -562,4 +736,81 @@ mod tests { assert_eq!(variant, parsed); } } + + #[test] + fn test_deletion_vector_path_uniqueness() { + // Verify that two DeletionVectorPath instances created with the same arguments + // produce different absolute paths due to unique UUIDs + let table_path = Url::parse("file:///tmp/test_table/").unwrap(); + let prefix = String::from("deletion_vectors"); + + let dv_path1 = DeletionVectorPath::new(table_path.clone(), prefix.clone()); + let dv_path2 = DeletionVectorPath::new(table_path.clone(), prefix.clone()); + + let abs_path1 = dv_path1.absolute_path().unwrap(); + let abs_path2 = dv_path2.absolute_path().unwrap(); + + // The absolute paths should be different because each DeletionVectorPath + // gets a unique UUID + assert_ne!(abs_path1, abs_path2); + assert_ne!( + dv_path1.encoded_relative_path(), + dv_path2.encoded_relative_path() + ); + } + + #[test] + fn test_deletion_vector_path_absolute_path_with_prefix() { + let table_path = Url::parse("file:///tmp/test_table/").unwrap(); + let prefix = String::from("dv"); + let known_uuid = uuid::Uuid::parse_str("abcdef01-2345-6789-abcd-ef0123456789").unwrap(); + + let dv_path = DeletionVectorPath::new_with_uuid(table_path.clone(), prefix, known_uuid); + let abs_path = dv_path.absolute_path().unwrap(); + + // Verify the exact path with known UUID + let expected = + "file:///tmp/test_table/dv/deletion_vector_abcdef01-2345-6789-abcd-ef0123456789.bin"; + assert_eq!(abs_path.as_str(), expected); + } + + #[test] + fn test_deletion_vector_path_absolute_path_with_known_uuid() { + // Test with a known UUID to verify exact path construction + let table_path = Url::parse("file:///tmp/test_table/").unwrap(); + let prefix = String::from("dv"); + let known_uuid = uuid::Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap(); + + let dv_path = DeletionVectorPath::new_with_uuid(table_path, prefix, known_uuid); + let abs_path = dv_path.absolute_path().unwrap(); + + // Verify the exact path is constructed correctly + let expected_path = + "file:///tmp/test_table/dv/deletion_vector_550e8400-e29b-41d4-a716-446655440000.bin"; + assert_eq!(abs_path.as_str(), expected_path); + + // Verify the encoded_relative_path is exactly as expected (prefix + z85 encoded UUID: 20 chars) + let encoded = dv_path.encoded_relative_path(); + assert_eq!(encoded, "dvrsTVZ&*Sl-RXRWjryu/!"); + } + + #[test] + fn 
test_deletion_vector_path_absolute_path_with_known_uuid_empty_prefix() { + // Test with a known UUID and empty prefix + let table_path = Url::parse("file:///tmp/test_table/").unwrap(); + let prefix = String::from(""); + let known_uuid = uuid::Uuid::parse_str("123e4567-e89b-12d3-a456-426614174000").unwrap(); + + let dv_path = DeletionVectorPath::new_with_uuid(table_path, prefix, known_uuid); + let abs_path = dv_path.absolute_path().unwrap(); + + // Verify the exact path is constructed correctly without prefix directory + let expected_path = + "file:///tmp/test_table/deletion_vector_123e4567-e89b-12d3-a456-426614174000.bin"; + assert_eq!(abs_path.as_str(), expected_path); + + // Verify the encoded_relative_path is exactly as expected (z85 encoded UUID: 20 chars) + let encoded = dv_path.encoded_relative_path(); + assert_eq!(encoded, "5:JjlQ/G/]6C<1m"); + } } diff --git a/kernel/src/actions/deletion_vector_writer.rs b/kernel/src/actions/deletion_vector_writer.rs new file mode 100644 index 0000000000..6121adfb81 --- /dev/null +++ b/kernel/src/actions/deletion_vector_writer.rs @@ -0,0 +1,788 @@ +//! Code for writing deletion vectors to object storage. +//! +//! This module provides APIs for engines to write deletion vectors as part of a Delta transaction. + +use std::borrow::Borrow; +use std::io::Write; + +use bytes::Bytes; +use roaring::RoaringTreemap; + +use crate::actions::deletion_vector::{ + create_dv_crc32, DeletionVectorDescriptor, DeletionVectorPath, DeletionVectorStorageType, +}; +use crate::{DeltaResult, Error}; + +/// A trait that allows engines to provide deletion vectors in various formats. +/// +/// Engines can implement this trait to provide their own deletion vector implementations, +/// or use the provided [`KernelDeletionVector`] implementation backed by RoaringTreemap. +/// +/// # Examples +/// +/// ```rust +/// use delta_kernel::actions::deletion_vector_writer::DeletionVector; +/// +/// struct MyDeletionVector { +/// deleted_indexes: Vec, +/// } +/// +/// impl DeletionVector for MyDeletionVector { +/// type IndexIterator = std::vec::IntoIter; +/// +/// fn into_iter(self) -> Self::IndexIterator { +/// self.deleted_indexes.into_iter() +/// } +/// +/// fn cardinality(&self) -> u64 { +/// self.deleted_indexes.len() as u64 +/// } +/// } +/// ``` +pub trait DeletionVector: Sized { + /// Iterator type that yields deleted row indexes. + type IndexIterator: Iterator; + + /// Return an iterator over deleted row indexes. + fn into_iter(self) -> Self::IndexIterator; + + /// Return the number of deleted rows in the deletion vector. + fn cardinality(&self) -> u64; + + /// Serialize the deletion vector into bytes. + /// + /// This serializes the deletion vector in the format expected by the Delta Lake protocol. + /// it may be overridden for more efficient serialization if the implementation already has the data in a suitable format. + /// But generally, only do this if you fully understand the the format requirements. + fn serialize(self) -> DeltaResult { + let treemap: RoaringTreemap = self.into_iter().collect(); + let mut serialized = Vec::new(); + treemap + .serialize_into(&mut serialized) + .map_err(|e| Error::generic(format!("Failed to serialize deletion vector: {e}")))?; + Ok(Bytes::from(serialized)) + } +} + +/// Metadata about a written deletion vector, excluding the storage path. +/// +/// This structure contains the information needed to construct a full +/// [`DeletionVectorDescriptor`] +/// after writing the DV to storage. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeletionVectorWriteResult { + /// Start of the data for this DV in number of bytes from the beginning of the file. + /// Does not include CRC length or size in bytes prefix. + pub offset: i32, + + /// Size of the serialized DV in bytes (raw data size). + pub size_in_bytes: i32, + + /// Number of rows the deletion vector logically removes from the file. + pub cardinality: i64, +} + +impl DeletionVectorWriteResult { + /// Convert the write result to a deletion vector descriptor. + /// + /// As an implementation detail, this method will always use the persisted relative storage type. + /// + /// # Arguments + /// + /// * `path` - The path to the deletion vector file. + pub fn to_descriptor(self, path: &DeletionVectorPath) -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::PersistedRelative, + path_or_inline_dv: path.encoded_relative_path(), + offset: Some(self.offset), + size_in_bytes: self.size_in_bytes, + cardinality: self.cardinality, + } + } +} + +/// A Kernel-provided deletion vector implementation backed by [`RoaringTreemap`]. +/// +/// This is the default implementation that engines can use. It provides memory-efficient +/// storage of deleted row indexes using compressed bitmaps. +/// +/// # Examples +/// +/// ```rust +/// use delta_kernel::actions::deletion_vector_writer::KernelDeletionVector; +/// +/// let mut dv = KernelDeletionVector::new(); +/// dv.add_deleted_row_indexes([0, 5, 10]); +/// ``` +#[derive(Debug, Clone)] +pub struct KernelDeletionVector { + dv: RoaringTreemap, +} + +impl Default for KernelDeletionVector { + fn default() -> Self { + Self::new() + } +} + +impl KernelDeletionVector { + /// Create a new empty deletion vector. + pub fn new() -> Self { + Self { + dv: RoaringTreemap::new(), + } + } + + /// Adds indexes to be deleted to this deletion vector. + pub fn add_deleted_row_indexes(&mut self, iter: I) + where + I: IntoIterator, + T: Borrow, + { + for index in iter { + self.dv.insert(*index.borrow()); + } + } + + /// Get the number of deleted rows in this deletion vector. + pub fn cardinality(&self) -> u64 { + self.dv.len() + } +} + +impl DeletionVector for KernelDeletionVector { + type IndexIterator = roaring::treemap::IntoIter; + + fn into_iter(self) -> Self::IndexIterator { + self.dv.into_iter() + } + + /// Optimized serialization that directly serializes the internal RoaringTreemap. + fn serialize(self) -> DeltaResult { + let mut serialized = Vec::new(); + self.dv + .serialize_into(&mut serialized) + .map_err(|e| Error::generic(format!("Failed to serialize deletion vector: {e}")))?; + Ok(Bytes::from(serialized)) + } + + fn cardinality(&self) -> u64 { + self.dv.len() + } +} + +/// A streaming writer for deletion vectors. +/// +/// This writer allows for writing multiple deletion vectors to a single file in a streaming +/// fashion, which is memory-efficient for distributed workloads where deletion vectors are +/// generated on executors. 
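The `# Format` section below spells out the on-disk layout; purely as a rough worked example of the byte accounting it implies (the `next_offsets` helper is hypothetical, not kernel code): the recorded offset points at the 4-byte size field, `size_in_bytes` covers the magic plus the serialized bitmap but excludes the trailing CRC, and each write advances the position by size field + magic + bitmap + CRC, with a single 1-byte version header written at the start of the file.

```rust
// Hypothetical sketch of the descriptor byte accounting for a streamed DV file.
fn next_offsets(mut current_offset: usize, serialized_len: usize) -> (usize, usize, usize) {
    if current_offset == 0 {
        current_offset = 1; // the 1-byte version header is written once per file
    }
    let offset = current_offset;               // where the 4-byte size field starts
    let size_in_bytes = serialized_len + 4;    // magic + bitmap, trailing CRC not counted
    let next = offset + 4 + size_in_bytes + 4; // size field + (magic + bitmap) + CRC
    (offset, size_in_bytes, next)
}

fn main() {
    // First DV with a 16-byte bitmap starts right after the version byte.
    assert_eq!(next_offsets(0, 16), (1, 20, 29));
    // A second DV with a 24-byte bitmap starts where the first one ended.
    assert_eq!(next_offsets(29, 24), (29, 28, 65));
}
```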
+/// +/// # Format +/// +/// The writer produces deletion vector files in the Delta Lake format: +/// - The first byte of the file is a version byte (currently 1) +/// - Each DV is prefixed with a 4-byte size (big-endian) of the serialized data +/// - Followed by a 4-byte magic number (0x64485871, little-endian) +/// - Followed by the serialized 64-bit Roaring Bitmap +/// - Followed by a 4-byte CRC32 checksum (big-endian) of the serialized data +/// +/// # Examples +/// +/// ```rust +/// use delta_kernel::actions::deletion_vector_writer::{StreamingDeletionVectorWriter, KernelDeletionVector}; +/// +/// let mut buffer = Vec::new(); +/// let mut writer = StreamingDeletionVectorWriter::new(&mut buffer); +/// +/// let mut dv = KernelDeletionVector::new(); +/// dv.add_deleted_row_indexes([1, 5, 10]); +/// +/// let descriptor = writer.write_deletion_vector(dv)?; +/// writer.finalize()?; +/// # Ok::<(), delta_kernel::Error>(()) +/// ``` +pub struct StreamingDeletionVectorWriter<'a, W: Write> { + writer: &'a mut W, + current_offset: usize, +} + +impl<'a, W: Write> StreamingDeletionVectorWriter<'a, W> { + /// Create a new streaming deletion vector writer. + /// + /// # Arguments + /// + /// * `writer` - A mutable reference to any type implementing [`std::io::Write`]. + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut buffer = Vec::new(); + /// let writer = StreamingDeletionVectorWriter::new(&mut buffer); + /// ``` + pub fn new(writer: &'a mut W) -> Self { + Self { + writer, + current_offset: 0, + } + } + + /// Write a deletion vector to the underlying writer. + /// + /// This method can be called multiple times to write multiple deletion vectors to the same + /// writer. The caller is responsible for keeping track of which deletion vector corresponds to + /// which data file. + /// + /// # Arguments + /// + /// * `deletion_vector` - The deletion vector to write + /// + /// # Returns + /// + /// A [`DeletionVectorWriteResult`] containing the offset, size, and cardinality + /// of the written deletion vector. + /// + /// # Errors + /// + /// Returns an error if: + /// - The writer fails to write data + /// - The deletion vector cannot be serialized + /// - The offset or size would overflow an i32 + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut dv = KernelDeletionVector::new(); + /// dv.add_deleted_row_indexes([1, 5, 10]); + /// + /// let descriptor = writer.write_deletion_vector(dv)?; + /// println!("Written DV at offset {} with size {}", descriptor.offset, descriptor.size_in_bytes); + /// # Ok::<(), delta_kernel::Error>(()) + /// ``` + pub fn write_deletion_vector( + &mut self, + deletion_vector: impl DeletionVector, + ) -> DeltaResult { + // Write version byte on first write + if self.current_offset == 0 { + // Write header. + self.writer + .write_all(&[1u8]) + .map_err(|e| Error::generic(format!("Failed to write version byte: {e}")))?; + self.current_offset = 1; + } + + let cardinality = deletion_vector.cardinality(); + // Serialize the deletion vector to bytes + let serialized = deletion_vector.serialize()?; + + // Calculate sizes + + // The size field contains the size of data + magic(4) (doesn't include CRC) + let dv_size = serialized.len() + 4; + // Use i32::MAX as the limit since Java implementations don't have unsigned integers. + // This ensures compatibility with the Scala/Java implementation [1]. 
+ // + // [1] https://github.com/delta-io/delta/blob/b388f280d083d4cf92c6434e4f7a549fc26cd1fa/spark/src/main/scala/org/apache/spark/sql/delta/deletionvectors/RoaringBitmapArray.scala#L311 + if dv_size > i32::MAX as usize { + return Err(Error::generic( + "Deletion vector size exceeds maximum allowed size", + )); + } + + // Record the offset where this DV size starts. + let dv_offset: i32 = self + .current_offset + .try_into() + .map_err(|_| Error::generic("Deletion vector offset doesn't fit in i32"))?; + + // Write size (big-endian, as per Delta spec) + let size_bytes = (dv_size as u32).to_be_bytes(); + self.writer + .write_all(&size_bytes) + .map_err(|e| Error::generic(format!("Failed to write size: {e}")))?; + + // Write magic number (little-endian) + // This is the RoaringBitmapArray format magic + let magic: u32 = 1681511377; + self.writer + .write_all(&magic.to_le_bytes()) + .map_err(|e| Error::generic(format!("Failed to write magic: {e}")))?; + + // Write the serialized treemap + self.writer + .write_all(&serialized) + .map_err(|e| Error::generic(format!("Failed to write deletion vector data: {e}")))?; + + // Calculate and write CRC32 checksum (big-endian) + // The CRC must include both the magic and the serialized data + let crc_instance = create_dv_crc32(); + let mut digest = crc_instance.digest(); + digest.update(&magic.to_le_bytes()); + digest.update(&serialized); + let checksum = digest.finalize(); + self.writer + .write_all(&checksum.to_be_bytes()) + .map_err(|e| Error::generic(format!("Failed to write CRC32 checksum: {e}")))?; + + // Update offset for next write (size_prefix + magic + data + crc) + let bytes_written = 4 + dv_size + 4; // size + (magic + data) + crc + self.current_offset += bytes_written; + + Ok(DeletionVectorWriteResult { + offset: dv_offset, + size_in_bytes: dv_size as i32, + cardinality: cardinality as i64, + }) + } + + /// Finalize all writes and flush the underlying writer. + /// + /// This method should be called after all deletion vectors have been written. + /// After calling this method, the writer should not be used anymore. + /// + /// # Errors + /// + /// Returns an error if flushing the writer fails. + /// + /// # Examples + /// + /// ```rust,ignore + /// writer.write_deletion_vector(dv1)?; + /// writer.write_deletion_vector(dv2)?; + /// writer.finalize()?; + /// # Ok::<(), delta_kernel::Error>(()) + /// ``` + pub fn finalize(self) -> DeltaResult<()> { + // Note: Currently this method only flushes the writer, but is kept as an explicit API + // for future-proofing. If we need to support formats that require footers (e.g., Puffin + // files or new DV file formats), this provides a consistent place to add that logic + // without breaking downstream code. 
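As a brief aside on the checksum written above, here is a minimal sketch of how the per-DV CRC can be recomputed outside the kernel, assuming the `crc` crate with the ISO-HDLC polynomial used here. The `dv_checksum` helper is hypothetical; it digests the little-endian magic bytes followed by the serialized bitmap and returns the big-endian byte form appended to the file.

```rust
use crc::{Crc, CRC_32_ISO_HDLC};

// Hypothetical helper: recompute the per-DV checksum (magic + bitmap, stored big-endian).
fn dv_checksum(serialized_bitmap: &[u8]) -> [u8; 4] {
    let crc32 = Crc::<u32>::new(&CRC_32_ISO_HDLC);
    let mut digest = crc32.digest();
    digest.update(&1681511377u32.to_le_bytes()); // portable RoaringBitmap magic, little-endian
    digest.update(serialized_bitmap);
    digest.finalize().to_be_bytes()
}
```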
+ // + + self.writer + .flush() + .map_err(|e| Error::generic(format!("Failed to flush writer: {e}"))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_kernel_deletion_vector_new() { + let dv = KernelDeletionVector::new(); + assert_eq!(dv.cardinality(), 0); + } + + #[test] + fn test_kernel_deletion_vector_add_indexes() { + let mut dv = KernelDeletionVector::new(); + dv.add_deleted_row_indexes([1u64, 5, 10]); + + assert_eq!(dv.cardinality(), 3); + assert_eq!( + dv.into_iter().collect::(), + RoaringTreemap::from_iter([1, 5, 10]) + ); + } + + #[test] + fn test_streaming_writer_single_dv() { + let mut buffer = Vec::new(); + let mut writer = StreamingDeletionVectorWriter::new(&mut buffer); + + let mut dv = KernelDeletionVector::new(); + dv.add_deleted_row_indexes([0u64, 9]); + + let descriptor = writer.write_deletion_vector(dv).unwrap(); + writer.finalize().unwrap(); + + // Check descriptor values + assert_eq!(descriptor.offset, 1); // After version byte + assert_eq!(descriptor.cardinality, 2); + assert!(descriptor.size_in_bytes > 0); + + // Check buffer contents + assert!(!buffer.is_empty()); + assert_eq!(buffer[0], 1); // Version byte + } + + #[test] + fn test_streaming_writer_multiple_dvs() { + let mut buffer = Vec::new(); + let mut writer = StreamingDeletionVectorWriter::new(&mut buffer); + + let mut dv1 = KernelDeletionVector::new(); + dv1.add_deleted_row_indexes([0u64, 9]); + + let mut dv2 = KernelDeletionVector::new(); + dv2.add_deleted_row_indexes([5u64, 15, 25]); + + let desc1 = writer.write_deletion_vector(dv1).unwrap(); + let desc2 = writer.write_deletion_vector(dv2).unwrap(); + writer.finalize().unwrap(); + + // Check that offsets are different and sequential + assert_eq!(desc1.offset, 1); + assert!(desc2.offset > desc1.offset); + assert_eq!(desc1.cardinality, 2); + assert_eq!(desc2.cardinality, 3); + } + + #[test] + fn test_streaming_writer_empty_dv() { + use crate::Engine; + use std::fs::File; + use tempfile::tempdir; + use url::Url; + + // Create a temporary directory and file + let temp_dir = tempdir().unwrap(); + let table_url = Url::from_directory_path(temp_dir.path()).unwrap(); + + let dv_path = DeletionVectorPath::new(table_url.clone(), String::from("test")); + let file_path = dv_path.absolute_path().unwrap().to_file_path().unwrap(); + + // Create parent directory if it doesn't exist + if let Some(parent) = file_path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + + let mut file = File::create(&file_path).unwrap(); + + // Create an empty deletion vector + let dv = KernelDeletionVector::new(); + + let mut writer = StreamingDeletionVectorWriter::new(&mut file); + let write_result = writer.write_deletion_vector(dv).unwrap(); + writer.finalize().unwrap(); + drop(file); // Ensure file is closed + + // Check descriptor values for empty DV + assert_eq!(write_result.offset, 1); // After version byte + assert_eq!(write_result.cardinality, 0); + assert!(write_result.size_in_bytes > 0); // Still has magic number + + // Read back using the descriptor to verify empty bitmap can be read + use crate::engine::sync::SyncEngine; + let engine = SyncEngine::new(); + let storage = engine.storage_handler(); + + let descriptor = write_result.to_descriptor(&dv_path); + let treemap = descriptor.read(storage, &table_url).unwrap(); + + // Verify the treemap is empty + assert_eq!(treemap.len(), 0); + assert!(treemap.is_empty()); + } + + #[test] + fn test_streaming_writer_roundtrip() { + // Write a deletion vector + let mut buffer = 
Vec::new(); + let mut writer = StreamingDeletionVectorWriter::new(&mut buffer); + + let mut dv = KernelDeletionVector::new(); + let test_indexes = vec![3, 4, 7, 11, 18, 29]; + dv.add_deleted_row_indexes(&test_indexes); + + let descriptor = writer.write_deletion_vector(dv).unwrap(); + writer.finalize().unwrap(); + + // Now try to read it back + let mut cursor = Cursor::new(buffer); + cursor.set_position(descriptor.offset as u64); + + // Read size + let mut size_buf = [0u8; 4]; + std::io::Read::read_exact(&mut cursor, &mut size_buf).unwrap(); + let size = u32::from_be_bytes(size_buf); + assert_eq!(size, descriptor.size_in_bytes as u32); + + // Read magic + let mut magic_buf = [0u8; 4]; + std::io::Read::read_exact(&mut cursor, &mut magic_buf).unwrap(); + let magic = u32::from_le_bytes(magic_buf); + assert_eq!(magic, 1681511377); + + // Read the serialized data (size includes magic, so actual data is size - 4) + let serialized_data_len = (size - 4) as usize; + let mut serialized_data = vec![0u8; serialized_data_len]; + std::io::Read::read_exact(&mut cursor, &mut serialized_data).unwrap(); + + // Read and verify CRC32 checksum + let mut crc_buf = [0u8; 4]; + std::io::Read::read_exact(&mut cursor, &mut crc_buf).unwrap(); + let stored_checksum = u32::from_be_bytes(crc_buf); + + // Calculate expected checksum (must include magic + serialized data) + let crc_instance = create_dv_crc32(); + let mut digest = crc_instance.digest(); + digest.update(&magic_buf); + digest.update(&serialized_data); + let expected_checksum = digest.finalize(); + assert_eq!( + stored_checksum, expected_checksum, + "CRC32 checksum mismatch" + ); + + // Deserialize the treemap + let treemap = RoaringTreemap::deserialize_from(&serialized_data[..]).unwrap(); + assert_eq!(treemap.len(), test_indexes.len() as u64); + for idx in test_indexes { + assert!(treemap.contains(idx)); + } + } + + #[test] + fn test_deletion_vector_trait() { + struct TestDV { + indexes: Vec, + } + + impl DeletionVector for TestDV { + type IndexIterator = std::vec::IntoIter; + + fn into_iter(self) -> Self::IndexIterator { + self.indexes.into_iter() + } + + fn cardinality(&self) -> u64 { + self.indexes.len() as u64 + } + } + + let test_dv = TestDV { + indexes: vec![1, 2, 3], + }; + + let mut buffer = Vec::new(); + let mut writer = StreamingDeletionVectorWriter::new(&mut buffer); + let descriptor = writer.write_deletion_vector(test_dv).unwrap(); + + assert_eq!(descriptor.cardinality, 3); + } + + #[test] + fn test_array_based_deletion_vector() { + use crate::Engine; + use std::fs::File; + use tempfile::tempdir; + use url::Url; + + // Custom DeletionVector implementation that wraps an array of u64 + struct ArrayDeletionVector { + deleted_rows: Vec, + } + + impl ArrayDeletionVector { + fn new(deleted_rows: Vec) -> Self { + Self { deleted_rows } + } + } + + impl DeletionVector for ArrayDeletionVector { + type IndexIterator = std::vec::IntoIter; + + fn into_iter(self) -> Self::IndexIterator { + self.deleted_rows.into_iter() + } + + fn cardinality(&self) -> u64 { + self.deleted_rows.len() as u64 + } + } + + // Create a temporary directory and file + let temp_dir = tempdir().unwrap(); + let table_url = Url::from_directory_path(temp_dir.path()).unwrap(); + + let dv_path = DeletionVectorPath::new(table_url.clone(), String::from("test")); + let file_path = dv_path.absolute_path().unwrap().to_file_path().unwrap(); + + // Create parent directory if it doesn't exist + if let Some(parent) = file_path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + + let mut 
file = File::create(&file_path).unwrap(); + + // Create an array-based deletion vector with specific deleted row indexes + let deleted_indexes = vec![5u64, 12, 23, 45, 67, 89, 100]; + let array_dv = ArrayDeletionVector::new(deleted_indexes.clone()); + + // Write using StreamingDeletionVectorWriter + let mut writer = StreamingDeletionVectorWriter::new(&mut file); + let write_result = writer.write_deletion_vector(array_dv).unwrap(); + writer.finalize().unwrap(); + drop(file); // Ensure file is closed + + // Verify the write result metadata + assert_eq!(write_result.cardinality, deleted_indexes.len() as i64); + assert_eq!(write_result.offset, 1); // After version byte + assert!(write_result.size_in_bytes > 0); + + // Read back using the descriptor to verify the data was written correctly + use crate::engine::sync::SyncEngine; + let engine = SyncEngine::new(); + let storage = engine.storage_handler(); + + let descriptor = write_result.to_descriptor(&dv_path); + let treemap = descriptor.read(storage, &table_url).unwrap(); + + // Verify the exact set of indexes matches + let read_indexes: Vec = treemap.into_iter().collect(); + assert_eq!(read_indexes, deleted_indexes); + } + + #[test] + fn test_to_descriptor_preserves_absolute_path() { + use url::Url; + + let table_path = Url::parse("file:///tmp/test_table/").unwrap(); + let prefix = String::from("deletion_vectors"); + + let dv_path = DeletionVectorPath::new(table_path.clone(), prefix); + + // Get the absolute path from DeletionVectorPath + let expected_absolute_path = dv_path.absolute_path().unwrap(); + + // Create a write result and convert to descriptor + let write_result = DeletionVectorWriteResult { + offset: 1, + size_in_bytes: 100, + cardinality: 42, + }; + + let descriptor = write_result.to_descriptor(&dv_path); + + // Get the absolute path from the descriptor + let actual_absolute_path = descriptor.absolute_path(&table_path).unwrap(); + + // Verify they match + assert_eq!(Some(expected_absolute_path), actual_absolute_path); + } + + #[test] + fn test_to_descriptor_preserves_absolute_path_empty_prefix() { + use url::Url; + + let table_path = Url::parse("file:///tmp/test_table/").unwrap(); + let prefix = String::from(""); + + let dv_path = DeletionVectorPath::new(table_path.clone(), prefix); + + // Get the absolute path from DeletionVectorPath + let expected_absolute_path = dv_path.absolute_path().unwrap(); + + // Create a write result and convert to descriptor + let write_result = DeletionVectorWriteResult { + offset: 10, + size_in_bytes: 50, + cardinality: 5, + }; + + let descriptor = write_result.to_descriptor(&dv_path); + + // Get the absolute path from the descriptor + let actual_absolute_path = descriptor.absolute_path(&table_path).unwrap(); + + // Verify they match + assert_eq!(Some(expected_absolute_path), actual_absolute_path); + } + + #[test] + fn test_to_descriptor_fields() { + use url::Url; + + let table_path = Url::parse("s3://my-bucket/delta_table/").unwrap(); + let prefix = String::from("dv"); + + let dv_path = DeletionVectorPath::new(table_path.clone(), prefix); + + let write_result = DeletionVectorWriteResult { + offset: 42, + size_in_bytes: 256, + cardinality: 100, + }; + + let descriptor = write_result.to_descriptor(&dv_path); + + // Verify descriptor fields match write result + assert_eq!(descriptor.offset, Some(42)); + assert_eq!(descriptor.size_in_bytes, 256); + assert_eq!(descriptor.cardinality, 100); + assert_eq!( + descriptor.storage_type, + DeletionVectorStorageType::PersistedRelative + ); + } + + #[test] + fn 
test_multiple_deletion_vectors_roundtrip_with_descriptor() { + use crate::Engine; + use std::fs::File; + use tempfile::tempdir; + use url::Url; + + // Create a temporary directory and file + let temp_dir = tempdir().unwrap(); + let table_url = Url::from_directory_path(temp_dir.path()).unwrap(); + + let dv_path = DeletionVectorPath::new(table_url.clone(), String::from("abc")); + let file_path = dv_path.absolute_path().unwrap().to_file_path().unwrap(); + + // Create parent directory if it doesn't exist + if let Some(parent) = file_path.parent() { + std::fs::create_dir_all(parent).unwrap(); + } + + let mut file = File::create(&file_path).unwrap(); + + // Create multiple deletion vectors with different data + let test_data = vec![ + vec![0u64, 5, 10, 15], + vec![1u64, 2, 3, 100, 200], + vec![50u64, 51, 52, 53, 54, 55], + ]; + + // Write all deletion vectors and collect their descriptors + let mut descriptors = Vec::new(); + let mut writer = StreamingDeletionVectorWriter::new(&mut file); + + for indexes in &test_data { + let mut dv = KernelDeletionVector::new(); + dv.add_deleted_row_indexes(indexes); + + let write_result = writer.write_deletion_vector(dv).unwrap(); + descriptors.push(write_result); + } + + writer.finalize().unwrap(); + drop(file); // Ensure file is closed + + // Create a storage handler using sync engine + use crate::engine::sync::SyncEngine; + let engine = SyncEngine::new(); + let storage = engine.storage_handler(); + + // Now read back each deletion vector using the descriptors + for (write_result, expected_indexes) in descriptors.iter().zip(&test_data) { + // Create a new DeletionVectorPath for each DV (they would have different UUIDs normally, + // but for this test we're writing multiple to the same file) + let descriptor = write_result.clone().to_descriptor(&dv_path); + + // Read the deletion vector back using the descriptor + let treemap = descriptor.read(storage.clone(), &table_url).unwrap(); + + // Verify the content matches + assert_eq!( + treemap, + expected_indexes.iter().collect::(), + "read {treemap:?} != expected {expected_indexes:?}" + ); + } + } +} diff --git a/kernel/src/actions/domain_metadata.rs b/kernel/src/actions/domain_metadata.rs deleted file mode 100644 index 5a94182566..0000000000 --- a/kernel/src/actions/domain_metadata.rs +++ /dev/null @@ -1,79 +0,0 @@ -//! This module includes support for reading DomainMetadata from the log. NB: it is similar to the -//! set_transaction module which reads SetTransaction actions from the log. -//! -//! For now, this module only exposes the ability to read a single domain at once from the log. In -//! the future this should allow for reading all domains from the log at once. - -use std::collections::HashMap; -use std::sync::{Arc, LazyLock}; - -use crate::actions::get_log_domain_metadata_schema; -use crate::actions::visitors::DomainMetadataVisitor; -use crate::actions::{DomainMetadata, DOMAIN_METADATA_NAME}; -use crate::log_replay::ActionsBatch; -use crate::log_segment::LogSegment; -use crate::{DeltaResult, Engine, Expression as Expr, PredicateRef, RowVisitor as _}; - -const DOMAIN_METADATA_DOMAIN_FIELD: &str = "domain"; - -pub(crate) type DomainMetadataMap = HashMap; - -/// Read the latest domain metadata for a given domain and return its `configuration`. This -/// accounts for 'removed' domain metadata: if the domain is removed, then the configuration is -/// `None`. Additionally, this includes 'internal' (delta.*) domains. The consumer must filter -/// these before returning domains to the user. 
-// TODO we should have some finer-grained unit tests here instead of relying on the top-level -// snapshot tests. -pub(crate) fn domain_metadata_configuration( - log_segment: &LogSegment, - domain: &str, - engine: &dyn Engine, -) -> DeltaResult> { - let mut domain_metadatas = scan_domain_metadatas(log_segment, Some(domain), engine)?; - Ok(domain_metadatas - .remove(domain) - .map(|domain_metadata| domain_metadata.configuration)) -} - -/// Scan the entire log for all domain metadata actions but terminate early if a specific domain -/// is provided. Note that this returns the latest domain metadata for each domain, accounting for -/// tombstones (removed=true) - that is, removed domain metadatas will _never_ be returned. -pub(crate) fn scan_domain_metadatas( - log_segment: &LogSegment, - domain: Option<&str>, - engine: &dyn Engine, -) -> DeltaResult { - let mut visitor = DomainMetadataVisitor::new(domain.map(|s| s.to_owned())); - // If a specific domain is requested then we can terminate log replay early as soon as it was - // found. If all domains are requested then we are forced to replay the entire log. - for actions in replay_for_domain_metadatas(log_segment, engine)? { - // throw away is_log_batch since we don't care - let domain_metadatas = actions?.actions; - visitor.visit_rows_of(domain_metadatas.as_ref())?; - // if a specific domain is requested and it was found, then return. note that we don't need - // to check if it was the one that was found since the visitor will only keep the requested - // domain - if visitor.filter_found() { - break; - } - } - - Ok(visitor.into_domain_metadatas()) -} - -fn replay_for_domain_metadatas( - log_segment: &LogSegment, - engine: &dyn Engine, -) -> DeltaResult> + Send> { - let schema = get_log_domain_metadata_schema(); - static META_PREDICATE: LazyLock> = LazyLock::new(|| { - Some(Arc::new( - Expr::column([DOMAIN_METADATA_NAME, DOMAIN_METADATA_DOMAIN_FIELD]).is_not_null(), - )) - }); - log_segment.read_actions( - engine, - schema.clone(), // Arc clone - META_PREDICATE.clone(), - ) -} diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index ae7669912b..c78f8b4158 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -2,18 +2,14 @@ //! 
specification](https://github.com/delta-io/delta/blob/master/PROTOCOL.md) use std::collections::HashMap; -use std::fmt::{Debug, Display}; -use std::hash::Hash; -use std::str::FromStr; use std::sync::{Arc, LazyLock}; use self::deletion_vector::DeletionVectorDescriptor; -use crate::expressions::{ArrayData, MapData, Scalar, StructData}; -use crate::schema::{ - ArrayType, DataType, MapType, SchemaRef, StructField, StructType, ToSchema as _, -}; +use crate::expressions::{MapData, Scalar, StructData}; +use crate::schema::{DataType, MapType, SchemaRef, StructField, StructType, ToSchema as _}; use crate::table_features::{ - ReaderFeature, WriterFeature, SUPPORTED_READER_FEATURES, SUPPORTED_WRITER_FEATURES, + FeatureType, IntoTableFeature, TableFeature, MIN_VALID_RW_VERSION, + TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION, }; use crate::table_properties::TableProperties; use crate::utils::require; @@ -26,18 +22,15 @@ use url::Url; use visitors::{MetadataVisitor, ProtocolVisitor}; use delta_kernel_derive::{internal_api, IntoEngineData, ToSchema}; -use itertools::Itertools; use serde::{Deserialize, Serialize}; const KERNEL_VERSION: &str = env!("CARGO_PKG_VERSION"); const UNKNOWN_OPERATION: &str = "UNKNOWN"; pub mod deletion_vector; +pub mod deletion_vector_writer; pub mod set_transaction; -pub(crate) mod crc; -pub(crate) mod domain_metadata; - // see comment in ../lib.rs for the path module for why we include this way #[cfg(feature = "internal-api")] pub mod visitors; @@ -89,6 +82,8 @@ static ALL_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { )) }); +/// Schema for Add actions in the Delta log. +/// Wraps the Add action schema in a top-level struct with "add" field name. static LOG_ADD_SCHEMA: LazyLock = LazyLock::new(|| { Arc::new(StructType::new_unchecked([StructField::nullable( ADD_NAME, @@ -96,6 +91,17 @@ static LOG_ADD_SCHEMA: LazyLock = LazyLock::new(|| { )])) }); +/// Schema for Remove actions in the Delta log. +/// Wraps the Remove action schema in a top-level struct with "remove" field name. +static LOG_REMOVE_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new_unchecked([StructField::nullable( + REMOVE_NAME, + Remove::to_schema(), + )])) +}); + +/// Schema for CommitInfo actions in the Delta log. +/// Wraps the CommitInfo schema in a top-level struct with "commitInfo" field name. static LOG_COMMIT_INFO_SCHEMA: LazyLock = LazyLock::new(|| { Arc::new(StructType::new_unchecked([StructField::nullable( COMMIT_INFO_NAME, @@ -103,6 +109,8 @@ static LOG_COMMIT_INFO_SCHEMA: LazyLock = LazyLock::new(|| { )])) }); +/// Schema for transaction (txn) actions in the Delta log. +/// Wraps the SetTransaction schema in a top-level struct with "txn" field name. static LOG_TXN_SCHEMA: LazyLock = LazyLock::new(|| { Arc::new(StructType::new_unchecked([StructField::nullable( SET_TRANSACTION_NAME, @@ -136,6 +144,10 @@ pub(crate) fn get_log_add_schema() -> &'static SchemaRef { &LOG_ADD_SCHEMA } +pub(crate) fn get_log_remove_schema() -> &'static SchemaRef { + &LOG_REMOVE_SCHEMA +} + pub(crate) fn get_log_commit_info_schema() -> &'static SchemaRef { &LOG_COMMIT_INFO_SCHEMA } @@ -165,12 +177,9 @@ pub(crate) fn as_log_add_schema(schema: SchemaRef) -> SchemaRef { )])) } -#[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -#[cfg_attr( - any(test, feature = "internal-api"), - derive(Serialize, Deserialize), - serde(rename_all = "camelCase") -)] +// Serde derives are needed for CRC file deserialization (see `crc::reader`). 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "camelCase")] #[internal_api] pub(crate) struct Format { /// Name of the encoding for files in this table @@ -205,12 +214,9 @@ impl TryFrom<Format> for Scalar { } } -#[derive(Debug, Default, Clone, PartialEq, Eq, ToSchema)] -#[cfg_attr( - any(test, feature = "internal-api"), - derive(Serialize, Deserialize), - serde(rename_all = "camelCase") -)] +// Serde derives are needed for CRC file deserialization (see `crc::reader`). +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "camelCase")] #[internal_api] pub(crate) struct Metadata { /// Unique identifier for this table @@ -243,7 +249,7 @@ impl Metadata { pub(crate) fn try_new( name: Option<String>, description: Option<String>, - schema: StructType, + schema: SchemaRef, partition_columns: Vec<String>, created_time: i64, configuration: HashMap<String, String>, @@ -307,19 +313,28 @@ impl Metadata { } #[internal_api] - #[allow(dead_code)] pub(crate) fn configuration(&self) -> &HashMap<String, String> { &self.configuration } + #[internal_api] + #[allow(dead_code)] + pub(crate) fn format_provider(&self) -> &str { + &self.format.provider + } + + #[internal_api] + pub(crate) fn schema_string(&self) -> &String { + &self.schema_string + } + #[internal_api] pub(crate) fn parse_schema(&self) -> DeltaResult<StructType> { Ok(serde_json::from_str(&self.schema_string)?) } #[internal_api] - #[allow(dead_code)] - pub(crate) fn partition_columns(&self) -> &Vec<String> { + pub(crate) fn partition_columns(&self) -> &[String] { &self.partition_columns } @@ -330,9 +345,34 @@ impl Metadata { pub(crate) fn parse_table_properties(&self) -> TableProperties { TableProperties::from(self.configuration.iter()) } + + #[cfg(test)] + #[allow(clippy::too_many_arguments)] + pub(crate) fn new_unchecked( + id: impl Into<String>, + name: Option<String>, + description: Option<String>, + format: Format, + schema_string: impl Into<String>, + partition_columns: Vec<String>, + created_time: Option<i64>, + configuration: HashMap<String, String>, + ) -> Self { + Self { + id: id.into(), + name, + description, + format, + schema_string: schema_string.into(), + partition_columns, + created_time, + configuration, + } + } } -// TODO: derive IntoEngineData instead (see issue #1083) +// NOTE: We can't derive IntoEngineData for Metadata because it has a nested Format struct, +// and create_one expects flattened values for nested schemas. 
impl IntoEngineData for Metadata { fn into_engine_data( self, @@ -356,7 +396,9 @@ impl IntoEngineData for Metadata { } } -#[derive(Default, Debug, Clone, PartialEq, Eq, ToSchema, Serialize, Deserialize)] +#[derive( + Default, Debug, Clone, PartialEq, Eq, ToSchema, Serialize, Deserialize, IntoEngineData, +)] #[serde(rename_all = "camelCase")] #[internal_api] // TODO move to another module so that we disallow constructing this struct without using the @@ -371,56 +413,168 @@ pub(crate) struct Protocol { /// A collection of features that a client must implement in order to correctly /// read this table (exist only when minReaderVersion is set to 3) #[serde(skip_serializing_if = "Option::is_none")] - reader_features: Option>, + reader_features: Option>, /// A collection of features that a client must implement in order to correctly /// write this table (exist only when minWriterVersion is set to 7) #[serde(skip_serializing_if = "Option::is_none")] - writer_features: Option>, + writer_features: Option>, } -fn parse_features(features: Option>) -> Option> -where - T: FromStr, - T::Err: Debug, -{ - features - .map(|fs| { - fs.into_iter() - .map(|f| T::from_str(&f.to_string())) - .collect() - }) - .transpose() - .ok()? +/// Parse a list of feature identifiers into TableFeatures. Returns `None` for `None` input; +/// otherwise infallible (unrecognized names become `TableFeature::Unknown`). +fn parse_features( + features: Option>, +) -> Option> { + let features = features?.into_iter().map(|f| f.into_table_feature()); + Some(features.collect()) } impl Protocol { - /// Try to create a new Protocol instance from reader/writer versions and table features. This - /// can fail if the protocol is invalid. + /// Try to create a new modern Protocol instance with the given table feature lists + pub(crate) fn try_new_modern( + reader_features: impl IntoIterator, + writer_features: impl IntoIterator, + ) -> DeltaResult { + Self::try_new( + TABLE_FEATURES_MIN_READER_VERSION, + TABLE_FEATURES_MIN_WRITER_VERSION, + Some(reader_features), + Some(writer_features), + ) + } + + /// Try to create a new legacy Protocol instance with the given reader/writer versions + #[cfg(test)] + pub(crate) fn try_new_legacy( + min_reader_version: i32, + min_writer_version: i32, + ) -> DeltaResult { + Self::try_new( + min_reader_version, + min_writer_version, + TableFeature::NO_LIST, + TableFeature::NO_LIST, + ) + } + + /// Try to create a new Protocol instance from reader/writer versions and table features. 
pub(crate) fn try_new( min_reader_version: i32, min_writer_version: i32, - reader_features: Option>, - writer_features: Option>, + reader_features: Option>, + writer_features: Option>, ) -> DeltaResult { - if min_reader_version == 3 { + require!( + min_reader_version >= MIN_VALID_RW_VERSION, + Error::InvalidProtocol(format!( + "min_reader_version must be >= {MIN_VALID_RW_VERSION}, got {min_reader_version}" + )) + ); + require!( + min_writer_version >= MIN_VALID_RW_VERSION, + Error::InvalidProtocol(format!( + "min_writer_version must be >= {MIN_VALID_RW_VERSION}, got {min_writer_version}" + )) + ); + + let reader_features = parse_features(reader_features); + let writer_features = parse_features(writer_features); + + // The protocol states that Reader features may be present if and only if the min_reader_version is 3 + if min_reader_version == TABLE_FEATURES_MIN_READER_VERSION { require!( reader_features.is_some(), Error::invalid_protocol( "Reader features must be present when minimum reader version = 3" ) ); + } else { + require!( + reader_features.is_none(), + Error::invalid_protocol( + "Reader features must not be present when minimum reader version != 3" + ) + ); } - if min_writer_version == 7 { + + // The protocol states that Writer features may be present if and only if the min_writer_version is 7 + if min_writer_version == TABLE_FEATURES_MIN_WRITER_VERSION { require!( writer_features.is_some(), Error::invalid_protocol( "Writer features must be present when minimum writer version = 7" ) ); + } else { + require!( + writer_features.is_none(), + Error::invalid_protocol( + "Writer features must not be present when minimum writer version != 7" + ) + ); } - let reader_features = parse_features(reader_features); - let writer_features = parse_features(writer_features); + // Self- and cross-validate the reader and writer feature lists. + match (&reader_features, &writer_features) { + (Some(reader_features), Some(writer_features)) => { + // Check all reader features are ReaderWriter and present in writer features. + // Unknown features are treated as potentially ReaderWriter for forward compatibility. + let check_r = reader_features.iter().all(|feature| { + matches!( + feature.feature_type(), + FeatureType::ReaderWriter | FeatureType::Unknown + ) && writer_features.contains(feature) + }); + require!( + check_r, + Error::invalid_protocol( + "Reader features must contain only ReaderWriter features that are also listed in writer features" + ) + ); + + // Check all writer features that are ReaderWriter must also be in reader features + // Unknown features are treated as potentially Writer-only for forward compatibility. + let check_w = writer_features + .iter() + .all(|feature| match feature.feature_type() { + FeatureType::WriterOnly | FeatureType::Unknown => true, + FeatureType::ReaderWriter => reader_features.contains(feature), + }); + require!( + check_w, + Error::invalid_protocol( + "Writer features must be Writer-only or also listed in reader features" + ) + ); + Ok(()) + } + (None, None) => Ok(()), + (None, Some(writer_features)) => { + // Special case: reader version 2 implies ColumnMapping support. + // All other ReaderWriter features require explicit reader_features list (reader version 3). + // Unknown features are treated as potentially Writer-only for forward compatibility. 
+ let is_valid = writer_features.iter().all(|feature| { + match feature.feature_type() { + FeatureType::WriterOnly | FeatureType::Unknown => true, + FeatureType::ReaderWriter => { + // ColumnMapping is allowed when reader version is 2 (implied support) + min_reader_version == 2 && feature == &TableFeature::ColumnMapping + } + } + }); + + require!( + is_valid, + Error::invalid_protocol( + "Writer features must be Writer-only or also listed in reader features" + ) + ); + Ok(()) + } + (Some(_), None) => Err(Error::invalid_protocol( + "Reader features should be present in writer features", + )), + }?; Ok(Protocol { min_reader_version, @@ -452,191 +606,40 @@ impl Protocol { /// Get the reader features for the protocol #[internal_api] - pub(crate) fn reader_features(&self) -> Option<&[ReaderFeature]> { + pub(crate) fn reader_features(&self) -> Option<&[TableFeature]> { self.reader_features.as_deref() } /// Get the writer features for the protocol #[internal_api] - pub(crate) fn writer_features(&self) -> Option<&[WriterFeature]> { + pub(crate) fn writer_features(&self) -> Option<&[TableFeature]> { self.writer_features.as_deref() } - /// True if this protocol has the requested reader feature - pub(crate) fn has_reader_feature(&self, feature: &ReaderFeature) -> bool { - self.reader_features() - .is_some_and(|features| features.contains(feature)) - } - - /// True if this protocol has the requested writer feature - pub(crate) fn has_writer_feature(&self, feature: &WriterFeature) -> bool { + /// True if this protocol has the requested feature + pub(crate) fn has_table_feature(&self, feature: &TableFeature) -> bool { + // Since each reader features is a subset of writer features, we only check writer feature self.writer_features() .is_some_and(|features| features.contains(feature)) } - /// Check if reading a table with this protocol is supported. That is: does the kernel support - /// the specified protocol reader version and all enabled reader features? If yes, returns unit - /// type, otherwise will return an error. - pub(crate) fn ensure_read_supported(&self) -> DeltaResult<()> { - match &self.reader_features { - // if min_reader_version = 3 and all reader features are subset of supported => OK - Some(reader_features) if self.min_reader_version == 3 => { - ensure_supported_features(reader_features, &SUPPORTED_READER_FEATURES) - } - // if min_reader_version = 3 and no reader features => ERROR - // NOTE this is caught by the protocol parsing. - None if self.min_reader_version == 3 => Err(Error::internal_error( - "Reader features must be present when minimum reader version = 3", - )), - // if min_reader_version = 1,2 and there are no reader features => OK - None if self.min_reader_version == 1 || self.min_reader_version == 2 => Ok(()), - // if min_reader_version = 1,2 and there are reader features => ERROR - // NOTE this is caught by the protocol parsing. - Some(_) if self.min_reader_version == 1 || self.min_reader_version == 2 => { - Err(Error::internal_error( - "Reader features must not be present when minimum reader version = 1 or 2", - )) - } - // any other min_reader_version is not supported - _ => Err(Error::Unsupported(format!( - "Unsupported minimum reader version {}", - self.min_reader_version - ))), - } - } - - /// Check if writing to a table with this protocol is supported. That is: does the kernel - /// support the specified protocol writer version and all enabled writer features? 
- pub(crate) fn ensure_write_supported(&self) -> DeltaResult<()> { - #[cfg(feature = "catalog-managed")] - require!( - !self.is_catalog_managed(), - Error::unsupported("Writes are not yet supported for catalog-managed tables") - ); - match &self.writer_features { - Some(writer_features) if self.min_writer_version == 7 => { - // if we're on version 7, make sure we support all the specified features - ensure_supported_features(writer_features, &SUPPORTED_WRITER_FEATURES)?; - - // ensure that there is no illegal combination of features - if writer_features.contains(&WriterFeature::RowTracking) - && !writer_features.contains(&WriterFeature::DomainMetadata) - { - Err(Error::invalid_protocol( - "rowTracking feature requires domainMetadata to also be enabled", - )) - } else { - Ok(()) - } - } - Some(_) => { - // there are features, but we're not on 7, so the protocol is actually broken - Err(Error::unsupported( - "Tables with min writer version != 7 should not have table features.", - )) - } - None => { - // no features, we currently only support version 1 or 2 in this case - require!( - self.min_writer_version == 1 || self.min_writer_version == 2, - Error::unsupported( - "Currently delta-kernel-rs can only write to tables with protocol.minWriterVersion = 1, 2, or 7" - ) - ); - Ok(()) - } - } - } - - #[cfg(feature = "catalog-managed")] - pub(crate) fn is_catalog_managed(&self) -> bool { - self.reader_features.as_ref().is_some_and(|fs| { - fs.contains(&ReaderFeature::CatalogManaged) - || fs.contains(&ReaderFeature::CatalogOwnedPreview) - }) || self.writer_features.as_ref().is_some_and(|fs| { - fs.contains(&WriterFeature::CatalogManaged) - || fs.contains(&WriterFeature::CatalogOwnedPreview) - }) - } -} - -// TODO: implement Scalar::From> so we can derive IntoEngineData using a macro (issue#1083) -impl IntoEngineData for Protocol { - fn into_engine_data( - self, - schema: SchemaRef, - engine: &dyn Engine, - ) -> DeltaResult> { - fn features_to_scalar( - features: Option>, - ) -> DeltaResult - where - T: Into, - { - match features { - Some(features) => { - let features: Vec = features.into_iter().map(Into::into).collect(); - Ok(Scalar::Array(ArrayData::try_new( - ArrayType::new(DataType::STRING, false), - features, - )?)) - } - None => Ok(Scalar::Null(DataType::Array(Box::new(ArrayType::new( - DataType::STRING, - false, - ))))), - } + #[cfg(test)] + pub(crate) fn new_unchecked( + min_reader_version: i32, + min_writer_version: i32, + reader_features: Option>, + writer_features: Option>, + ) -> Self { + Self { + min_reader_version, + min_writer_version, + reader_features, + writer_features, } - - let values = [ - self.min_reader_version.into(), - self.min_writer_version.into(), - features_to_scalar(self.reader_features)?, - features_to_scalar(self.writer_features)?, - ]; - - engine.evaluation_handler().create_one(schema, &values) } } -// given `table_features`, check if they are subset of `supported_features` -pub(crate) fn ensure_supported_features( - table_features: &[T], - supported_features: &[T], -) -> DeltaResult<()> -where - T: Display + FromStr + Hash + Eq, - ::Err: Display, -{ - // first check if all features are supported, else we proceed to craft an error message - if table_features - .iter() - .all(|feature| supported_features.contains(feature)) - { - return Ok(()); - } - - // we get the type name (ReaderFeature/WriterFeature) for better error messages - let features_type = std::any::type_name::() - .rsplit("::") - .next() - .unwrap_or("table feature"); - - // NB: we didn't do this above to 
avoid allocation in the common case - let mut unsupported = table_features - .iter() - .filter(|feature| !supported_features.contains(*feature)); - - Err(Error::Unsupported(format!( - "Unknown {}s: \"{}\". Supported {}s: \"{}\"", - features_type, - unsupported.join("\", \""), - features_type, - supported_features.iter().join("\", \""), - ))) -} - -#[derive(Debug, Clone, PartialEq, Eq, ToSchema)] +#[derive(Debug, Clone, PartialEq, Eq, ToSchema, IntoEngineData)] #[internal_api] #[cfg_attr(test, derive(Serialize, Default), serde(rename_all = "camelCase"))] pub(crate) struct CommitInfo { @@ -659,6 +662,8 @@ pub(crate) struct CommitInfo { /// write this field, but it is optional since many tables will not have this field (i.e. any /// tables not written by kernel). pub(crate) kernel_version: Option, + /// Whether this commit is a blind append. + pub(crate) is_blind_append: Option, /// A place for the engine to store additional metadata associated with this commit pub(crate) engine_info: Option, /// A unique transaction identifier for this commit. @@ -671,42 +676,27 @@ impl CommitInfo { in_commit_timestamp: Option, operation: Option, engine_info: Option, + is_blind_append: bool, ) -> Self { Self { timestamp: Some(timestamp), in_commit_timestamp, operation: Some(operation.unwrap_or_else(|| UNKNOWN_OPERATION.to_string())), - operation_parameters: None, + operation_parameters: Some(HashMap::new()), kernel_version: Some(format!("v{KERNEL_VERSION}")), + is_blind_append: is_blind_append.then_some(true), engine_info, txn_id: Some(uuid::Uuid::new_v4().to_string()), } } } -// TODO: implement Scalar::From> so we can derive IntoEngineData using a macro (issue#1083) -impl IntoEngineData for CommitInfo { - fn into_engine_data( - self, - schema: SchemaRef, - engine: &dyn Engine, - ) -> DeltaResult> { - let values = [ - self.timestamp.into(), - self.in_commit_timestamp.into(), - self.operation.into(), - self.operation_parameters.unwrap_or_default().try_into()?, - self.kernel_version.into(), - self.engine_info.into(), - self.txn_id.into(), - ]; - - engine.evaluation_handler().create_one(schema, &values) - } -} - #[derive(Debug, Clone, PartialEq, Eq, ToSchema)] -#[cfg_attr(test, derive(Serialize, Default), serde(rename_all = "camelCase"))] +#[cfg_attr( + test, + derive(Serialize, Deserialize, Default), + serde(rename_all = "camelCase") +)] #[internal_api] pub(crate) struct Add { /// A relative path to a data file from the root of the table or an absolute path to a file @@ -722,7 +712,7 @@ pub(crate) struct Add { /// null values. This means an engine can assume that if a partition is found in /// [`Metadata::partition_columns`] but not in this map, its value is null. /// - /// [`materialize`]: crate::engine_data::EngineMap::materialize + /// [`materialize`]: crate::engine_data::MapItem::materialize #[allow_null_container_values] pub(crate) partition_values: HashMap, @@ -743,8 +733,13 @@ pub(crate) struct Add { pub stats: Option, /// Map containing metadata about this logical file. + /// Note: map values can be null. + /// We don't use `#[allow_null_container_values]` here because [`MapItem::materialize`] + /// drops null values when that attribute is present. 
+ /// + /// [`MapItem::materialize`]: crate::engine_data::MapItem::materialize #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] - pub tags: Option>, + pub tags: Option>>, /// Information about deletion vector (DV) associated with this add action #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] @@ -796,7 +791,14 @@ pub(crate) struct Remove { #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] pub(crate) extended_file_metadata: Option, - /// A map from partition column to value for this logical file. + /// A map from partition column to value for this logical file. This map can contain null in + /// the values meaning a partition is null. We drop those values from this map, due to the + /// `allow_null_container_values` annotation allowing them and because [`materialize`] drops + /// null values. This means an engine can assume that if a partition is found in + /// [`Metadata::partition_columns`] but not in this map, its value is null. + /// + /// [`materialize`]: crate::engine_data::EngineMap::materialize + #[allow_null_container_values] #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] pub(crate) partition_values: Option>, @@ -804,7 +806,14 @@ pub(crate) struct Remove { #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] pub(crate) size: Option, - /// Map containing metadata about this logical file. + /// Contains [statistics] (e.g., count, min/max values for columns) about the data in this logical file encoded as a JSON string. + /// + /// [statistics]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Per-file-Statistics + #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] + pub stats: Option, + + /// Map containing metadata about this logical file. Values can be null. + #[allow_null_container_values] #[cfg_attr(test, serde(skip_serializing_if = "Option::is_none"))] pub(crate) tags: Option>, @@ -840,7 +849,7 @@ pub(crate) struct Cdc { /// null values. This means an engine can assume that if a partition is found in /// [`Metadata::partition_columns`] but not in this map, its value is null. /// - /// [`materialize`]: crate::engine_data::EngineMap::materialize + /// [`materialize`]: crate::engine_data::MapItem::materialize #[allow_null_container_values] pub partition_values: HashMap, @@ -854,11 +863,13 @@ pub(crate) struct Cdc { /// data of the table pub data_change: bool, - /// Map containing metadata about this logical file. + /// Map containing metadata about this logical file. Values can be null. + #[allow_null_container_values] pub tags: Option>, } -#[derive(Debug, Clone, PartialEq, Eq, ToSchema, IntoEngineData)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema, IntoEngineData)] +#[serde(rename_all = "camelCase")] #[internal_api] pub(crate) struct SetTransaction { /// A unique identifier for the application performing the transaction. @@ -903,7 +914,8 @@ pub(crate) struct Sidecar { /// The time this logical file was created, as milliseconds since the epoch. pub modification_time: i64, - /// A map containing any additional metadata about the logicial file. + /// A map containing any additional metadata about the logical file. Values can be null. + #[allow_null_container_values] pub tags: Option>, } @@ -940,7 +952,8 @@ pub(crate) struct CheckpointMetadata { /// See issue #786 for tracking progress. pub(crate) version: i64, - /// Map containing any additional metadata about the V2 spec checkpoint. 
+ /// Map containing any additional metadata about the V2 spec checkpoint. Values can be null. + #[allow_null_container_values] pub(crate) tags: Option>, } @@ -951,7 +964,7 @@ pub(crate) struct CheckpointMetadata { /// Note that the `delta.*` domain is reserved for internal use. /// /// [DomainMetadata]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#domain-metadata -#[derive(Debug, Clone, PartialEq, Eq, ToSchema, IntoEngineData)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema, IntoEngineData)] #[internal_api] pub(crate) struct DomainMetadata { domain: String, @@ -969,7 +982,7 @@ impl DomainMetadata { } } - // Create a new DomainMetadata action to remove a domain. + /// Create a new DomainMetadata action to remove a domain. pub(crate) fn remove(domain: String, configuration: String) -> Self { Self { domain, @@ -981,18 +994,22 @@ impl DomainMetadata { // returns true if the domain metadata is an system-controlled domain (all domains that start // with "delta.") #[allow(unused)] + #[internal_api] pub(crate) fn is_internal(&self) -> bool { self.domain.starts_with(INTERNAL_DOMAIN_PREFIX) } + #[internal_api] pub(crate) fn domain(&self) -> &str { &self.domain } + #[internal_api] pub(crate) fn configuration(&self) -> &str { &self.configuration } + /// Returns `true` if this action is a tombstone (marking domain removal). pub(crate) fn is_removed(&self) -> bool { self.removed } @@ -1000,18 +1017,23 @@ impl DomainMetadata { #[cfg(test)] mod tests { + use rstest::rstest; + + use super::set_transaction::is_set_txn_expired; use super::*; use crate::{ - arrow::array::{ - Array, BooleanArray, Int32Array, Int64Array, ListArray, ListBuilder, MapBuilder, - MapFieldNames, RecordBatch, StringArray, StringBuilder, StructArray, + arrow::{ + array::{ + Array, BooleanArray, Int32Array, Int64Array, ListArray, ListBuilder, MapBuilder, + MapFieldNames, RecordBatch, StringArray, StringBuilder, StructArray, + }, + datatypes::{DataType as ArrowDataType, Field, Schema}, + json::ReaderBuilder, }, - arrow::datatypes::{DataType as ArrowDataType, Field, Schema}, - arrow::json::ReaderBuilder, - engine::{arrow_data::ArrowEngineData, arrow_expression::ArrowEvaluationHandler}, + engine::{arrow_data::EngineDataArrowExt as _, arrow_expression::ArrowEvaluationHandler}, schema::{ArrayType, DataType, MapType, StructField}, utils::test_utils::assert_result_error_with_message, - Engine, EvaluationHandler, JsonHandler, ParquetHandler, StorageHandler, + Engine, EvaluationHandler, IntoEngineData, JsonHandler, ParquetHandler, StorageHandler, }; use serde_json::json; @@ -1061,6 +1083,24 @@ mod tests { )) } + #[rstest] + #[case::no_expiration_configured(None, Some(1000), false)] + #[case::null_last_updated_never_expires(Some(5000), None, false)] + #[case::both_none(None, None, false)] + #[case::last_updated_before_expiration(Some(2000), Some(1000), true)] + #[case::last_updated_at_expiration(Some(1000), Some(1000), true)] + #[case::last_updated_after_expiration(Some(2000), Some(3000), false)] + fn test_is_set_txn_expired( + #[case] expiration_timestamp: Option, + #[case] last_updated: Option, + #[case] expected: bool, + ) { + assert_eq!( + is_set_txn_expired(expiration_timestamp, last_updated), + expected + ); + } + #[test] fn test_metadata_schema() { let schema = get_commit_schema() @@ -1115,7 +1155,7 @@ mod tests { StructField::nullable("stats", DataType::STRING), StructField::nullable( "tags", - MapType::new(DataType::STRING, DataType::STRING, false), + MapType::new(DataType::STRING, DataType::STRING, 
true), ), deletion_vector_field(), StructField::nullable("baseRowId", DataType::LONG), @@ -1129,14 +1169,14 @@ mod tests { fn tags_field() -> StructField { StructField::nullable( "tags", - MapType::new(DataType::STRING, DataType::STRING, false), + MapType::new(DataType::STRING, DataType::STRING, true), ) } fn partition_values_field() -> StructField { StructField::nullable( "partitionValues", - MapType::new(DataType::STRING, DataType::STRING, false), + MapType::new(DataType::STRING, DataType::STRING, true), ) } @@ -1167,6 +1207,7 @@ mod tests { StructField::nullable("extendedFileMetadata", DataType::BOOLEAN), partition_values_field(), StructField::nullable("size", DataType::LONG), + StructField::nullable("stats", DataType::STRING), tags_field(), deletion_vector_field(), StructField::nullable("baseRowId", DataType::LONG), @@ -1258,6 +1299,7 @@ mod tests { MapType::new(DataType::STRING, DataType::STRING, false), ), StructField::nullable("kernelVersion", DataType::STRING), + StructField::nullable("isBlindAppend", DataType::BOOLEAN), StructField::nullable("engineInfo", DataType::STRING), StructField::nullable("txnId", DataType::STRING), ]), @@ -1322,188 +1364,207 @@ mod tests { } } + #[rstest] + #[case(0, 1)] + #[case(1, 0)] + #[case(-1, 2)] + #[case(1, -1)] + fn reject_protocol_version_below_minimum(#[case] rv: i32, #[case] wv: i32) { + let expected = if rv < 1 { + format!("Invalid protocol action in the delta log: min_reader_version must be >= 1, got {rv}") + } else { + format!("Invalid protocol action in the delta log: min_writer_version must be >= 1, got {wv}") + }; + assert_result_error_with_message( + Protocol::try_new(rv, wv, TableFeature::NO_LIST, TableFeature::NO_LIST), + &expected, + ); + } + #[test] - fn test_v2_checkpoint_supported() { - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::V2Checkpoint]), - Some([ReaderFeature::V2Checkpoint]), - ) - .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + fn accept_min_versions() { + let p = Protocol::try_new_legacy(1, 1).unwrap(); + assert_eq!(p.min_reader_version(), 1); + assert_eq!(p.min_writer_version(), 1); } #[test] - fn test_ensure_read_supported() { - let protocol = Protocol { - min_reader_version: 3, - min_writer_version: 7, - reader_features: Some(vec![]), - writer_features: Some(vec![]), - }; - assert!(protocol.ensure_read_supported().is_ok()); + fn test_validate_table_features_invalid() { + // (reader_feature, writer_feature) + let invalid_features = [ + // ReaderWriter feature not present in writer features + ( + vec![TableFeature::DeletionVectors], + vec![TableFeature::AppendOnly], + "Reader features must contain only ReaderWriter features that are also listed in writer features", + ), + ( + vec![TableFeature::DeletionVectors], + vec![], + "Reader features must contain only ReaderWriter features that are also listed in writer features", + ), + // ReaderWriter feature not present in reader features + ( + vec![], + vec![TableFeature::DeletionVectors], + "Writer features must be Writer-only or also listed in reader features", + ), + ( + vec![TableFeature::VariantType], + vec![ + TableFeature::VariantType, + TableFeature::DeletionVectors, + ], + "Writer features must be Writer-only or also listed in reader features", + ), + // WriterOnly feature present in reader features + ( + vec![TableFeature::AppendOnly], + vec![TableFeature::AppendOnly], + "Reader features must contain only ReaderWriter features that are also listed in writer features", + ), + ]; - let empty_features: [String; 0] = []; - let 
protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::V2Checkpoint]), - Some(&empty_features), - ) - .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + for (reader_features, writer_features, error_msg) in invalid_features { + let res = Protocol::try_new_modern(reader_features, writer_features); + assert!( + matches!( + &res, + Err(Error::InvalidProtocol(error)) if error.to_string().eq(error_msg) + ), + "Expected:\t{error_msg}\nBut got:{res:?}\n" + ); + } + } - let protocol = Protocol::try_new( - 3, - 7, - Some(&empty_features), - Some([WriterFeature::V2Checkpoint]), - ) - .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + #[test] + fn test_validate_table_features_unknown() { + // Unknown features are allowed during validation for forward compatibility, + // but will be rejected when trying to use the protocol (ensure_operation_supported) + + // Test unknown features in reader - validation passes + let protocol = Protocol::try_new_modern( + vec![TableFeature::Unknown("unknown_reader".to_string())], + vec![TableFeature::Unknown("unknown_reader".to_string())], + ); + assert!(protocol.is_ok()); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::V2Checkpoint]), - Some([WriterFeature::V2Checkpoint]), - ) - .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + // Test unknown features in writer - validation passes + let protocol = Protocol::try_new_modern( + TableFeature::EMPTY_LIST, + vec![TableFeature::Unknown("unknown_writer".to_string())], + ); + assert!(protocol.is_ok()); + } - let protocol = Protocol { - min_reader_version: 1, - min_writer_version: 7, - reader_features: None, - writer_features: None, - }; - assert!(protocol.ensure_read_supported().is_ok()); + #[test] + fn test_validate_table_features_valid() { + // (reader_feature, writer_feature) + let valid_features = [ + // ReaderWriter feature present in both reader/writer features, + // WriterOnly feature present in writer feature + ( + vec![TableFeature::DeletionVectors], + vec![TableFeature::DeletionVectors], + ), + (vec![], vec![TableFeature::AppendOnly]), + ( + vec![TableFeature::VariantType], + vec![TableFeature::VariantType, TableFeature::AppendOnly], + ), + // Unknown feature may be ReaderWriter or WriterOnly (for forward compatibility) + ( + vec![TableFeature::Unknown("rw".to_string())], + vec![ + TableFeature::Unknown("rw".to_string()), + TableFeature::Unknown("w".to_string()), + ], + ), + // Empty feature set is valid + (vec![], vec![]), + ]; - let protocol = Protocol { - min_reader_version: 2, - min_writer_version: 7, - reader_features: None, - writer_features: None, - }; - assert!(protocol.ensure_read_supported().is_ok()); + for (reader_features, writer_features) in valid_features { + assert!(Protocol::try_new_modern(reader_features, writer_features).is_ok()); + } } #[test] - fn test_ensure_write_supported() { + fn test_validate_legacy_column_mapping_valid() { + // Valid: ColumnMapping with reader v2 + // Reader version 2 implies columnMapping support (no explicit reader_features) + // Writer version 7 requires explicit writer_features list let protocol = Protocol::try_new( - 3, + 2, 7, - Some::>(vec![]), - Some(vec![ - WriterFeature::AppendOnly, - WriterFeature::DeletionVectors, - WriterFeature::DomainMetadata, - WriterFeature::Invariants, - WriterFeature::RowTracking, - ]), - ) - .unwrap(); - assert!(protocol.ensure_write_supported().is_ok()); + TableFeature::NO_LIST, + Some(vec![TableFeature::ColumnMapping]), + ); + assert!(protocol.is_ok()); + } - // 
Verify that unsupported writer features are rejected - // NOTE: Unsupported reader features should not cause an error here + #[test] + fn test_validate_legacy_writer_only_features_valid() { + // Valid: Writer-only features with reader v1 let protocol = Protocol::try_new( - 3, + 1, 7, - Some([ReaderFeature::Unknown("unsupported reader".to_string())]), - Some([WriterFeature::IdentityColumns]), - ) - .unwrap(); - assert_result_error_with_message( - protocol.ensure_write_supported(), - r#"Unsupported: Unknown WriterFeatures: "identityColumns". Supported WriterFeatures: "appendOnly", "deletionVectors", "domainMetadata", "inCommitTimestamp", "invariants", "rowTracking", "timestampNtz", "variantType", "variantType-preview", "variantShredding-preview""#, + TableFeature::NO_LIST, + Some(vec![TableFeature::AppendOnly]), ); + assert!(protocol.is_ok()); + } - // Unknown writer features should cause an error + #[test] + fn test_validate_legacy_column_mapping_with_writer_features_valid() { + // Valid: Mix of Writer-only and ColumnMapping with reader v2 let protocol = Protocol::try_new( - 3, + 2, 7, - Some([ReaderFeature::Unknown("unsupported reader".to_string())]), - Some([WriterFeature::Unknown("unsupported writer".to_string())]), - ) - .unwrap(); - assert_result_error_with_message( - protocol.ensure_write_supported(), - r#"Unsupported: Unknown WriterFeatures: "unsupported writer". Supported WriterFeatures: "appendOnly", "deletionVectors", "domainMetadata", "inCommitTimestamp", "invariants", "rowTracking", "timestampNtz", "variantType", "variantType-preview", "variantShredding-preview""#, + TableFeature::NO_LIST, + Some(vec![TableFeature::AppendOnly, TableFeature::ColumnMapping]), ); + assert!(protocol.is_ok()); } #[test] - fn test_illegal_writer_feature_combination() { + fn test_validate_column_mapping_reader_v1_invalid() { + // Invalid: ColumnMapping with reader v1 + // Reader v1 doesn't imply any ReaderWriter features let protocol = Protocol::try_new( - 3, + 1, 7, - Some::>(vec![]), - Some(vec![ - // No domain metadata even though that is required - WriterFeature::RowTracking, - ]), - ) - .unwrap(); - - assert_result_error_with_message( - protocol.ensure_write_supported(), - "rowTracking feature requires domainMetadata to also be enabled", + TableFeature::NO_LIST, + Some(vec![TableFeature::ColumnMapping]), ); + assert!(protocol.is_err()); } #[test] - fn test_ensure_supported_features() { - let supported_features = [ReaderFeature::ColumnMapping, ReaderFeature::DeletionVectors]; - let table_features = vec![ReaderFeature::ColumnMapping]; - ensure_supported_features(&table_features, &supported_features).unwrap(); - - // test unknown features - let table_features = vec![ReaderFeature::ColumnMapping, ReaderFeature::unknown("idk")]; - let error = ensure_supported_features(&table_features, &supported_features).unwrap_err(); - match error { - Error::Unsupported(e) if e == - "Unknown ReaderFeatures: \"idk\". 
Supported ReaderFeatures: \"columnMapping\", \"deletionVectors\"" - => {}, - _ => panic!("Expected unsupported error, got: {error}"), - } + fn test_validate_multiple_readerwriter_features_reader_v2_invalid() { + // Invalid: Multiple ReaderWriter features with reader v2 + // Only ColumnMapping alone is allowed with reader v2 + let protocol = Protocol::try_new( + 2, + 7, + TableFeature::NO_LIST, + Some(vec![ + TableFeature::ColumnMapping, + TableFeature::DeletionVectors, + ]), + ); + assert!(protocol.is_err()); } #[test] fn test_parse_table_feature_never_fails() { - // parse a non-str - let features = Some([5]); - let expected = Some(vec![ReaderFeature::unknown("5")]); - assert_eq!(parse_features::(features), expected); - // weird strs let features = Some(["", "absurD_)(+13%^⚙️"]); - let expected = Some(vec![ - ReaderFeature::unknown(""), - ReaderFeature::unknown("absurD_)(+13%^⚙️"), - ]); - assert_eq!(parse_features::(features), expected); - } - - #[test] - fn test_no_catalog_managed_writes() { - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::CatalogManaged]), - Some([WriterFeature::CatalogManaged]), - ) - .unwrap(); - assert!(protocol.ensure_write_supported().is_err()); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::CatalogOwnedPreview]), - Some([WriterFeature::CatalogOwnedPreview]), - ) - .unwrap(); - assert!(protocol.ensure_write_supported().is_err()); + let expected = Some(FromIterator::from_iter([ + TableFeature::unknown(""), + TableFeature::unknown("absurD_)(+13%^⚙️"), + ])); + assert_eq!(parse_features(features), expected); } #[test] @@ -1518,13 +1579,7 @@ mod tests { let engine_data = set_transaction.into_engine_data(SetTransaction::to_schema().into(), &engine); - - let record_batch: RecordBatch = engine_data - .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); let schema = Arc::new(Schema::new(vec![ Field::new("appId", ArrowDataType::Utf8, false), @@ -1549,17 +1604,11 @@ mod tests { fn test_commit_info_into_engine_data() { let engine = ExprEngine::new(); - let commit_info = CommitInfo::new(0, None, None, None); + let commit_info = CommitInfo::new(0, None, None, None, false); let commit_info_txn_id = commit_info.txn_id.clone(); let engine_data = commit_info.into_engine_data(CommitInfo::to_schema().into(), &engine); - - let record_batch: RecordBatch = engine_data - .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); let mut map_builder = create_string_map_builder(false); map_builder.append(true).unwrap(); @@ -1573,6 +1622,7 @@ mod tests { Arc::new(StringArray::from(vec![Some("UNKNOWN")])), operation_parameters, Arc::new(StringArray::from(vec![Some(format!("v{KERNEL_VERSION}"))])), + Arc::new(BooleanArray::from(vec![None::])), Arc::new(StringArray::from(vec![None::])), Arc::new(StringArray::from(vec![commit_info_txn_id])), ], @@ -1594,13 +1644,7 @@ mod tests { let engine_data = domain_metadata.into_engine_data(DomainMetadata::to_schema().into(), &engine); - - let record_batch: RecordBatch = engine_data - .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); let expected = RecordBatch::try_new( record_batch.schema(), @@ -1617,7 +1661,10 @@ mod tests { #[test] fn test_metadata_try_new() { - let schema = StructType::new_unchecked([StructField::not_null("id", DataType::INTEGER)]); + let schema = 
Arc::new(StructType::new_unchecked([StructField::not_null( + "id", + DataType::INTEGER, + )])); let config = HashMap::from([("key1".to_string(), "value1".to_string())]); let metadata = Metadata::try_new( @@ -1642,7 +1689,10 @@ mod tests { #[test] fn test_metadata_try_new_default() { - let schema = StructType::new_unchecked([StructField::not_null("id", DataType::INTEGER)]); + let schema = Arc::new(StructType::new_unchecked([StructField::not_null( + "id", + DataType::INTEGER, + )])); let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); assert!(!metadata.id.is_empty()); @@ -1652,7 +1702,10 @@ mod tests { #[test] fn test_metadata_unique_ids() { - let schema = StructType::new_unchecked([StructField::not_null("id", DataType::INTEGER)]); + let schema = Arc::new(StructType::new_unchecked([StructField::not_null( + "id", + DataType::INTEGER, + )])); let m1 = Metadata::try_new(None, None, schema.clone(), vec![], 0, HashMap::new()).unwrap(); let m2 = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); assert_ne!(m1.id, m2.id); @@ -1739,7 +1792,10 @@ mod tests { #[test] fn test_metadata_into_engine_data() { let engine = ExprEngine::new(); - let schema = StructType::new_unchecked([StructField::not_null("id", DataType::INTEGER)]); + let schema = Arc::new(StructType::new_unchecked([StructField::not_null( + "id", + DataType::INTEGER, + )])); let test_metadata = Metadata::try_new( Some("test".to_string()), @@ -1753,14 +1809,11 @@ mod tests { // have to get the id since it's random let test_id = test_metadata.id.clone(); - - let actual: RecordBatch = test_metadata + let actual = test_metadata .into_engine_data(Metadata::to_schema().into(), &engine) .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + .try_into_record_batch() + .unwrap(); let expected_json = json!({ "id": test_id, @@ -1790,7 +1843,10 @@ mod tests { #[test] fn test_metadata_with_log_schema() { let engine = ExprEngine::new(); - let schema = StructType::new_unchecked([StructField::not_null("id", DataType::INTEGER)]); + let schema = Arc::new(StructType::new_unchecked([StructField::not_null( + "id", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( Some("table".to_string()), @@ -1806,13 +1862,11 @@ mod tests { // test with the full log schema that wraps metadata in a "metaData" field let commit_schema = get_commit_schema().project(&[METADATA_NAME]).unwrap(); - let actual: RecordBatch = metadata + let actual = metadata .into_engine_data(commit_schema, &engine) .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + .try_into_record_batch() + .unwrap(); let expected_json = json!({ "metaData": { @@ -1841,23 +1895,16 @@ mod tests { #[test] fn test_protocol_into_engine_data() { let engine = ExprEngine::new(); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::ColumnMapping]), - Some([WriterFeature::DeletionVectors]), + let protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors, TableFeature::ColumnMapping], + [TableFeature::DeletionVectors, TableFeature::ColumnMapping], ) .unwrap(); let engine_data = protocol .clone() .into_engine_data(Protocol::to_schema().into(), &engine); - let record_batch: RecordBatch = engine_data - .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); let list_field = Arc::new(Field::new("element", ArrowDataType::Utf8, false)); let protocol_fields = vec![ @@ -1878,6 +1925,7 @@ mod tests { let string_builder = 
StringBuilder::new(); let mut list_builder = ListBuilder::new(string_builder).with_field(list_field.clone()); + list_builder.values().append_value("deletionVectors"); list_builder.values().append_value("columnMapping"); list_builder.append(true); let reader_features_array = list_builder.finish(); @@ -1885,6 +1933,7 @@ mod tests { let string_builder = StringBuilder::new(); let mut list_builder = ListBuilder::new(string_builder).with_field(list_field.clone()); list_builder.values().append_value("deletionVectors"); + list_builder.values().append_value("columnMapping"); list_builder.append(true); let writer_features_array = list_builder.finish(); @@ -1942,12 +1991,7 @@ mod tests { ) .unwrap(); - let record_batch: RecordBatch = engine_data - .unwrap() - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); assert_eq!(record_batch, expected); } @@ -1955,18 +1999,13 @@ mod tests { #[test] fn test_protocol_into_engine_data_empty_features() { let engine = ExprEngine::new(); - let empty_features: Vec = vec![]; let protocol = - Protocol::try_new(3, 7, Some(empty_features.clone()), Some(empty_features)).unwrap(); + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(); let engine_data = protocol .into_engine_data(Protocol::to_schema().into(), &engine) .unwrap(); - let record_batch: RecordBatch = engine_data - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); assert_eq!(record_batch.num_rows(), 1); assert_eq!(record_batch.num_columns(), 4); @@ -1991,16 +2030,12 @@ mod tests { #[test] fn test_protocol_into_engine_data_no_features() { let engine = ExprEngine::new(); - let protocol = Protocol::try_new(1, 2, None::>, None::>).unwrap(); + let protocol = Protocol::try_new_legacy(1, 2).unwrap(); let engine_data = protocol .into_engine_data(Protocol::to_schema().into(), &engine) .unwrap(); - let record_batch: RecordBatch = engine_data - .into_any() - .downcast::() - .unwrap() - .into(); + let record_batch = engine_data.try_into_record_batch().unwrap(); assert_eq!(record_batch.num_rows(), 1); assert_eq!(record_batch.num_columns(), 4); @@ -2053,4 +2088,42 @@ mod tests { let schema = Arc::new(StructType::new_unchecked([])); assert!(!schema_contains_file_actions(&schema)); } + + #[test] + fn test_add_tags_deserialization_null_case() { + let json1 = r#"{"path":"file1.parquet","partitionValues":{},"size":100,"modificationTime":1234567890,"dataChange":true,"tags":null}"#; + let add1: Add = serde_json::from_str(json1).unwrap(); + assert_eq!(add1.tags, None); + } + + #[test] + fn test_add_tags_deserialization_nullable_values_case() { + let json2 = r#"{"path":"file2.parquet","partitionValues":{},"size":200,"modificationTime":1234567890,"dataChange":true,"tags":{"INSERTION_TIME":"1677811178336000","NULLABLE_TAG":null}}"#; + let add2: Add = serde_json::from_str(json2).unwrap(); + assert!(add2.tags.is_some()); + let tags = add2.tags.unwrap(); + assert_eq!(tags.len(), 2); + assert_eq!( + tags.get("INSERTION_TIME"), + Some(&Some("1677811178336000".to_string())) + ); + assert_eq!(tags.get("NULLABLE_TAG"), Some(&None)); + } + + #[test] + fn test_add_tags_deserialization_non_null_values_case() { + let json3 = r#"{"path":"file3.parquet","partitionValues":{},"size":300,"modificationTime":1234567890,"dataChange":true,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000"}}"#; + let add3: Add = serde_json::from_str(json3).unwrap(); + 
assert!(add3.tags.is_some()); + let tags = add3.tags.unwrap(); + assert_eq!(tags.len(), 2); + assert_eq!( + tags.get("INSERTION_TIME"), + Some(&Some("1677811178336000".to_string())) + ); + assert_eq!( + tags.get("MIN_INSERTION_TIME"), + Some(&Some("1677811178336000".to_string())) + ); + } } diff --git a/kernel/src/actions/set_transaction.rs b/kernel/src/actions/set_transaction.rs index 4fb6ff0be6..a0c6e38359 100644 --- a/kernel/src/actions/set_transaction.rs +++ b/kernel/src/actions/set_transaction.rs @@ -1,14 +1,27 @@ -use std::sync::{Arc, LazyLock}; - use crate::actions::get_log_txn_schema; use crate::actions::visitors::SetTransactionVisitor; -use crate::actions::{SetTransaction, SET_TRANSACTION_NAME}; +use crate::actions::SetTransaction; use crate::log_replay::ActionsBatch; use crate::log_segment::LogSegment; -use crate::{DeltaResult, Engine, Expression as Expr, PredicateRef, RowVisitor as _}; +use crate::{DeltaResult, Engine, RowVisitor as _}; pub(crate) use crate::actions::visitors::SetTransactionMap; +/// Returns `true` if a set transaction is expired according to the given expiration and +/// last-updated timestamps. A transaction is expired when both values are present and +/// `last_updated <= expiration_timestamp`. Transactions without `last_updated` never +/// expire. A `None` expiration timestamp (no retention duration configured) means +/// nothing expires. +pub(crate) fn is_set_txn_expired( + expiration_timestamp: Option, + last_updated: Option, +) -> bool { + matches!( + (expiration_timestamp, last_updated), + (Some(exp_ts), Some(lu)) if lu <= exp_ts + ) +} + pub(crate) struct SetTransactionScanner {} impl SetTransactionScanner { @@ -76,20 +89,7 @@ fn replay_for_app_ids( engine: &dyn Engine, ) -> DeltaResult> + Send> { let txn_schema = get_log_txn_schema(); - // This meta-predicate should be effective because all the app ids end up in a single - // checkpoint part when patitioned by `add.path` like the Delta spec requires. There's no - // point filtering by a particular app id, even if we have one, because app ids are all in - // the a single checkpoint part having large min/max range (because they're usually uuids). - static META_PREDICATE: LazyLock> = LazyLock::new(|| { - Some(Arc::new( - Expr::column([SET_TRANSACTION_NAME, "appId"]).is_not_null(), - )) - }); - log_segment.read_actions( - engine, - txn_schema.clone(), // Arc clone - META_PREDICATE.clone(), - ) + log_segment.read_actions(engine, txn_schema.clone()) } #[cfg(test)] diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 390abcda95..126999026c 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -1,7 +1,8 @@ //! This module defines visitors that can be used to extract the various delta actions from //! [`crate::engine_data::EngineData`] types. 
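For reference, the retention rule factored into `is_set_txn_expired` (in the set_transaction.rs hunk above) is small enough to exercise in isolation. The following standalone sketch uses a hypothetical helper with the same shape, not the kernel API itself, to walk through the interesting cases:

```rust
// A minimal standalone sketch of the expiration rule implemented by
// `is_set_txn_expired` above (both arguments are optional timestamps).
fn is_expired(expiration_timestamp: Option<i64>, last_updated: Option<i64>) -> bool {
    matches!(
        (expiration_timestamp, last_updated),
        (Some(exp_ts), Some(lu)) if lu <= exp_ts
    )
}

fn main() {
    assert!(is_expired(Some(100), Some(100)));  // updated at the cutoff: expired
    assert!(!is_expired(Some(100), Some(101))); // updated after the cutoff: retained
    assert!(!is_expired(Some(100), None));      // no last_updated: never expires
    assert!(!is_expired(None, Some(50)));       // no retention configured: nothing expires
}
```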
-use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; use delta_kernel_derive::internal_api; @@ -12,8 +13,9 @@ use crate::utils::require; use crate::{DeltaResult, Error}; use super::deletion_vector::DeletionVectorDescriptor; -use super::domain_metadata::DomainMetadataMap; +use super::set_transaction::is_set_txn_expired; use super::*; +use crate::log_segment::DomainMetadataMap; #[derive(Default)] #[internal_api] @@ -42,6 +44,7 @@ impl RowVisitor for MetadataVisitor { #[derive(Default)] pub(crate) struct SelectionVectorVisitor { pub(crate) selection_vector: Vec, + pub(crate) num_filtered: u64, } /// A single non-nullable BOOL column @@ -60,8 +63,11 @@ impl RowVisitor for SelectionVectorVisitor { )) ); for i in 0..row_count { - self.selection_vector - .push(getters[0].get(i, "selectionvector.output")?); + let selected: bool = getters[0].get(i, "selectionvector.output")?; + if !selected { + self.num_filtered += 1; + } + self.selection_vector.push(selected); } Ok(()) } @@ -178,7 +184,7 @@ impl RemoveVisitor { getters: &[&'a dyn GetData<'a>], ) -> DeltaResult { require!( - getters.len() == 14, + getters.len() == 15, Error::InternalError(format!( "Wrong number of RemoveVisitor getters: {}", getters.len() @@ -194,14 +200,14 @@ impl RemoveVisitor { getters[4].get_opt(row_index, "remove.partitionValues")?; let size: Option = getters[5].get_opt(row_index, "remove.size")?; + let stats: Option = getters[6].get_opt(row_index, "remove.stats")?; + // TODO(nick) tags are skipped in getters[7] - // TODO(nick) tags are skipped in getters[6] - - let deletion_vector = visit_deletion_vector_at(row_index, &getters[7..])?; + let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..])?; - let base_row_id: Option = getters[12].get_opt(row_index, "remove.baseRowId")?; + let base_row_id: Option = getters[13].get_opt(row_index, "remove.baseRowId")?; let default_row_commit_version: Option = - getters[13].get_opt(row_index, "remove.defaultRowCommitVersion")?; + getters[14].get_opt(row_index, "remove.defaultRowCommitVersion")?; Ok(Remove { path, @@ -210,6 +216,7 @@ impl RemoveVisitor { extended_file_metadata, partition_values, size, + stats, tags: None, deletion_vector, base_row_id, @@ -357,13 +364,8 @@ impl RowVisitor for SetTransactionVisitor { .is_none_or(|requested| requested.eq(&app_id)) { let txn = SetTransactionVisitor::visit_txn(i, app_id, getters)?; - // Check retention: filter out transactions that are old - // If last_updated is None, the transaction never expires - match self.expiration_timestamp.zip(txn.last_updated) { - Some((expiration_ts, last_updated)) if last_updated <= expiration_ts => { - continue - } - _ => (), + if is_set_txn_expired(self.expiration_timestamp, txn.last_updated) { + continue; } if !self.set_transactions.contains_key(&txn.app_id) { self.set_transactions.insert(txn.app_id.clone(), txn); @@ -382,6 +384,7 @@ pub(crate) struct SidecarVisitor { } impl SidecarVisitor { + #[internal_api] fn visit_sidecar<'a>( row_index: usize, path: String, @@ -427,17 +430,18 @@ impl RowVisitor for SidecarVisitor { /// Note that this visitor requires that the log (each actions batch) is replayed in reverse order. /// /// This visitor maintains the first entry for each domain it encounters. A domain_filter may be -/// included to only retain the domain metadata for a specific domain (in order to bound memory -/// requirements). 
+/// included to only retain domain metadata for a specific set of domains (in order to bound memory +/// requirements and enable early termination once all requested domains are found). #[derive(Debug, Default)] pub(crate) struct DomainMetadataVisitor { domain_metadatas: DomainMetadataMap, - domain_filter: Option, + domain_filter: Option>, } impl DomainMetadataVisitor { - /// Create a new visitor. When domain_filter is set then we only retain - pub(crate) fn new(domain_filter: Option) -> Self { + /// Create a new visitor. When domain_filter is set then we only retain domain metadata for + /// domains in the provided set, enabling early termination once all requested domains are found. + pub(crate) fn new(domain_filter: Option>) -> Self { DomainMetadataVisitor { domain_filter, ..Default::default() @@ -465,8 +469,13 @@ impl DomainMetadataVisitor { }) } + /// Returns true if a domain filter is set and all requested domains have been found. + /// This is used to enable early termination of log replay once all N requested domains + /// have been discovered. pub(crate) fn filter_found(&self) -> bool { - self.domain_filter.is_some() && !self.domain_metadatas.is_empty() + self.domain_filter + .as_ref() + .is_some_and(|filter| self.domain_metadatas.len() == filter.len()) } pub(crate) fn into_domain_metadatas(mut self) -> DomainMetadataMap { @@ -488,14 +497,18 @@ impl RowVisitor for DomainMetadataVisitor { for i in 0..row_count { let domain: Option = getters[0].get_opt(i, "domainMetadata.domain")?; if let Some(domain) = domain { - // if caller requested a specific domain then only visit matches + // if caller requested specific domains then only visit matches let filter = self.domain_filter.as_ref(); - if filter.is_none_or(|requested| requested == &domain) { - let domain_metadata = - DomainMetadataVisitor::visit_domain_metadata(i, domain.clone(), getters)?; - self.domain_metadatas - .entry(domain) - .or_insert(domain_metadata); + if filter.is_none_or(|requested| requested.contains(&domain)) { + // Since batches are visited newest-first, a domain already present in + // domain_metadatas was found in a newer commit and takes precedence. + // Use Entry::Vacant so we only read configuration/removed when the + // slot is actually empty, avoiding unnecessary field access. 
+ if let Entry::Vacant(entry) = self.domain_metadatas.entry(domain.clone()) { + let domain_metadata = + DomainMetadataVisitor::visit_domain_metadata(i, domain, getters)?; + entry.insert(domain_metadata); + } } } } @@ -682,11 +695,14 @@ impl RowVisitor for InCommitTimestampVisitor { mod tests { use super::*; - use crate::arrow::array::StringArray; + use crate::arrow::array::{BooleanArray, StringArray}; + use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::arrow::record_batch::RecordBatch; + use crate::engine::arrow_data::ArrowEngineData; use crate::engine::sync::SyncEngine; use crate::expressions::{column_expr_ref, Expression}; - use crate::table_features::{ReaderFeature, WriterFeature}; + use crate::table_features::TableFeature; use crate::utils::test_utils::{action_batch, parse_json_batch}; use crate::Engine; @@ -697,8 +713,8 @@ mod tests { let expected = Protocol { min_reader_version: 3, min_writer_version: 7, - reader_features: Some(vec![ReaderFeature::DeletionVectors]), - writer_features: Some(vec![WriterFeature::DeletionVectors]), + reader_features: Some(vec![TableFeature::DeletionVectors]), + writer_features: Some(vec![TableFeature::DeletionVectors]), }; assert_eq!(parsed, expected); Ok(()) @@ -751,13 +767,14 @@ mod tests { let data = action_batch(); let parsed = Metadata::try_new_from_data(data.as_ref())?.unwrap(); + use crate::table_properties::{ + COLUMN_MAPPING_MODE, ENABLE_CHANGE_DATA_FEED, ENABLE_DELETION_VECTORS, + }; + let configuration = HashMap::from_iter([ - ( - "delta.enableDeletionVectors".to_string(), - "true".to_string(), - ), - ("delta.columnMapping.mode".to_string(), "none".to_string()), - ("delta.enableChangeDataFeed".to_string(), "true".to_string()), + (ENABLE_DELETION_VECTORS.to_string(), "true".to_string()), + (COLUMN_MAPPING_MODE.to_string(), "none".to_string()), + (ENABLE_CHANGE_DATA_FEED.to_string(), "true".to_string()), ]); let expected = Metadata { id: "testId".into(), @@ -834,7 +851,7 @@ mod tests { let json_strings: StringArray = vec![ r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, - r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452}}"#, + r#"{"remove":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","deletionTimestamp":1670892998135,"dataChange":true,"partitionValues":{"c1":"4","c2":"c"},"size":452,"stats":"{\"numRecords\":1}"}}"#, ] .into(); let batch = parse_json_batch(json_strings); @@ -850,6 +867,7 @@ mod tests { ("c2".to_string(), "c".to_string()), ])), size: Some(452), + stats: Some(r#"{"numRecords":1}"#.to_string()), ..Default::default() }; assert_eq!( @@ -863,6 +881,82 @@ mod tests { ); } + #[test] + fn test_parse_remove_all_fields_unique() { + // This test verifies that all fields in the Remove action are correctly parsed + // and that each field gets a unique value, ensuring no index collisions + let json_strings: StringArray = vec![ + 
r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"test-id","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"remove":{"path":"test-path.parquet","deletionTimestamp":1234567890,"dataChange":false,"extendedFileMetadata":true,"partitionValues":{"part":"value"},"size":9999,"stats":"{\"numRecords\":42}","deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":3},"baseRowId":100,"defaultRowCommitVersion":5}}"#, + ] + .into(); + let batch = parse_json_batch(json_strings); + let mut remove_visitor = RemoveVisitor::default(); + remove_visitor.visit_rows_of(batch.as_ref()).unwrap(); + + assert_eq!( + remove_visitor.removes.len(), + 1, + "Expected exactly one remove action" + ); + + let remove = &remove_visitor.removes[0]; + + // Verify each field has the expected unique value + assert_eq!(remove.path, "test-path.parquet", "path mismatch"); + assert_eq!( + remove.deletion_timestamp, + Some(1234567890), + "deletion_timestamp mismatch" + ); + assert!(!remove.data_change, "data_change mismatch"); + assert_eq!( + remove.extended_file_metadata, + Some(true), + "extended_file_metadata mismatch" + ); + assert_eq!( + remove.partition_values, + Some(HashMap::from([("part".to_string(), "value".to_string())])), + "partition_values mismatch" + ); + assert_eq!(remove.size, Some(9999), "size mismatch"); + assert_eq!( + remove.stats, + Some(r#"{"numRecords":42}"#.to_string()), + "stats mismatch" + ); + + // Verify deletion vector fields + let dv = remove + .deletion_vector + .as_ref() + .expect("deletion_vector should be present"); + assert_eq!( + dv.path_or_inline_dv, "vBn[lx{q8@P<9BNH/isA", + "deletion_vector.path_or_inline_dv mismatch" + ); + assert_eq!(dv.offset, Some(1), "deletion_vector.offset mismatch"); + assert_eq!( + dv.size_in_bytes, 36, + "deletion_vector.size_in_bytes mismatch" + ); + assert_eq!(dv.cardinality, 3, "deletion_vector.cardinality mismatch"); + + // Verify row tracking fields (these would have been incorrect with the bug) + assert_eq!( + remove.base_row_id, + Some(100), + "base_row_id mismatch - check getter index" + ); + assert_eq!( + remove.default_row_commit_version, + Some(5), + "default_row_commit_version mismatch - check getter index" + ); + } + #[test] fn test_parse_txn() { let json_strings: StringArray = vec![ @@ -1036,7 +1130,8 @@ mod tests { assert_eq!(domain_metadata_visitor.into_domain_metadatas(), expected); // test filtering - let mut domain_metadata_visitor = DomainMetadataVisitor::new(Some("zach3".to_string())); + let mut domain_metadata_visitor = + DomainMetadataVisitor::new(Some(HashSet::from(["zach3".to_string()]))); domain_metadata_visitor .visit_rows_of(commit_1.as_ref()) .unwrap(); @@ -1057,7 +1152,8 @@ mod tests { assert_eq!(domain_metadata_visitor.into_domain_metadatas(), expected); // test filtering for a domain that is not present - let mut domain_metadata_visitor = DomainMetadataVisitor::new(Some("notexist".to_string())); + let mut domain_metadata_visitor = + DomainMetadataVisitor::new(Some(HashSet::from(["notexist".to_string()]))); domain_metadata_visitor .visit_rows_of(commit_1.as_ref()) .unwrap(); @@ -1067,6 +1163,89 @@ mod tests { assert!(domain_metadata_visitor.domain_metadatas.is_empty()); } 
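The vacant-entry pattern adopted by `DomainMetadataVisitor` above deserves a tiny standalone illustration. This sketch (illustrative names only, not kernel code) shows why newest-first replay combined with insert-only-if-vacant yields newest-wins semantics per domain:

```rust
use std::collections::HashMap;

// Illustrative only: when commits are replayed newest-first and each key is
// inserted through the vacant-entry path, the first (newest) value wins and
// older values are ignored -- the newest-wins behaviour the visitor relies on.
fn main() {
    // (domain, configuration) pairs, newest commit first.
    let replay = [("d1", "new_cfg"), ("d2", "cfg2"), ("d1", "old_cfg")];
    let mut seen: HashMap<&str, &str> = HashMap::new();
    for (domain, cfg) in replay {
        // or_insert only fills an empty slot, mirroring the Entry::Vacant check.
        seen.entry(domain).or_insert(cfg);
    }
    assert_eq!(seen["d1"], "new_cfg"); // the older "old_cfg" never overwrites it
    assert_eq!(seen["d2"], "cfg2");
}
```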
+ #[test] + fn test_domain_metadata_visitor_multi_domain_filter() { + // Reuse the same two-commit setup from test_parse_domain_metadata. + // commit_1 (newer): zach1(removed), zach2, zach3(removed), zach4, zach5(removed), zach6 + // commit_0 (older): zach1(removed), zach2, zach3, zach4(removed), zach7(removed), zach8 + let commit_1: Box = parse_json_batch( + vec![ + r#"{"domainMetadata":{"domain":"zach1","configuration":"cfg1","removed":true}}"#, + r#"{"domainMetadata":{"domain":"zach2","configuration":"cfg2","removed":false}}"#, + r#"{"domainMetadata":{"domain":"zach3","configuration":"cfg3","removed":true}}"#, + r#"{"domainMetadata":{"domain":"zach4","configuration":"cfg4","removed":false}}"#, + r#"{"domainMetadata":{"domain":"zach5","configuration":"cfg5","removed":true}}"#, + r#"{"domainMetadata":{"domain":"zach6","configuration":"cfg6","removed":false}}"#, + ] + .into(), + ); + let commit_0: Box = parse_json_batch( + vec![ + r#"{"domainMetadata":{"domain":"zach1","configuration":"old_cfg1","removed":true}}"#, + r#"{"domainMetadata":{"domain":"zach2","configuration":"old_cfg2","removed":false}}"#, + r#"{"domainMetadata":{"domain":"zach3","configuration":"old_cfg3","removed":false}}"#, + r#"{"domainMetadata":{"domain":"zach4","configuration":"old_cfg4","removed":true}}"#, + r#"{"domainMetadata":{"domain":"zach7","configuration":"cfg7","removed":true}}"#, + r#"{"domainMetadata":{"domain":"zach8","configuration":"cfg8","removed":false}}"#, + ] + .into(), + ); + + // --- filter for two active domains both in commit_1 --- + let mut visitor = DomainMetadataVisitor::new(Some(HashSet::from([ + "zach2".to_string(), + "zach4".to_string(), + ]))); + assert!(!visitor.filter_found()); // nothing found yet + visitor.visit_rows_of(commit_1.as_ref()).unwrap(); + // both zach2 and zach4 appear in commit_1, so early termination should trigger + assert!(visitor.filter_found()); + // commit_0 would NOT be visited in a real replay (early termination), but even if it + // were the results should be the same since commit_1 entries take precedence + let result = visitor.into_domain_metadatas(); + assert_eq!(result.len(), 2); + assert_eq!(result["zach2"].configuration, "cfg2"); + assert_eq!(result["zach4"].configuration, "cfg4"); + + // --- filter spanning both commits (zach2 in commit_1, zach8 in commit_0) --- + let mut visitor = DomainMetadataVisitor::new(Some(HashSet::from([ + "zach2".to_string(), + "zach8".to_string(), + ]))); + visitor.visit_rows_of(commit_1.as_ref()).unwrap(); + // only zach2 found so far — should NOT terminate early yet + assert!(!visitor.filter_found()); + visitor.visit_rows_of(commit_0.as_ref()).unwrap(); + // now zach8 found too + assert!(visitor.filter_found()); + let result = visitor.into_domain_metadatas(); + assert_eq!(result.len(), 2); + assert_eq!(result["zach2"].configuration, "cfg2"); + assert_eq!(result["zach8"].configuration, "cfg8"); + + // --- filter where one domain is removed (tombstone) --- + // zach3 is removed in commit_1; only zach6 survives into_domain_metadatas + let mut visitor = DomainMetadataVisitor::new(Some(HashSet::from([ + "zach3".to_string(), + "zach6".to_string(), + ]))); + visitor.visit_rows_of(commit_1.as_ref()).unwrap(); + assert!(visitor.filter_found()); // both found in commit_1 + let result = visitor.into_domain_metadatas(); + assert_eq!(result.len(), 1); // zach3 is removed, filtered out + assert_eq!(result["zach6"].configuration, "cfg6"); + + // --- filter where no requested domains exist --- + let mut visitor = 
DomainMetadataVisitor::new(Some(HashSet::from([ + "ghost1".to_string(), + "ghost2".to_string(), + ]))); + visitor.visit_rows_of(commit_1.as_ref()).unwrap(); + visitor.visit_rows_of(commit_0.as_ref()).unwrap(); + assert!(!visitor.filter_found()); + assert!(visitor.into_domain_metadatas().is_empty()); + } + /************************************* * In-commit timestamp visitor tests * **************************************/ @@ -1081,7 +1260,7 @@ mod tests { fn transform_batch(batch: Box) -> Box { let engine = SyncEngine::new(); let expression = - Expression::Struct(vec![Arc::new(Expression::Struct(vec![column_expr_ref!( + Expression::struct_from([Arc::new(Expression::struct_from([column_expr_ref!( "commitInfo.inCommitTimestamp" )]))]); engine @@ -1091,6 +1270,7 @@ mod tests { expression.into(), InCommitTimestampVisitor::schema().into(), ) + .unwrap() .evaluate(batch.as_ref()) .unwrap() } @@ -1122,4 +1302,29 @@ mod tests { Some(1677811178585), // Retrieved ICT ); } + + // Helper to create a boolean batch for SelectionVectorVisitor tests + fn create_boolean_batch(values: Vec) -> Box { + let array = BooleanArray::from(values); + let arrow_schema = ArrowSchema::new(vec![Field::new("output", DataType::Boolean, false)]); + let batch = RecordBatch::try_new(Arc::new(arrow_schema), vec![Arc::new(array)]).unwrap(); + Box::new(ArrowEngineData::new(batch)) + } + + #[rstest::rstest] + #[case::empty_batch(vec![], 0, "empty batch should have no filtered rows")] + #[case::all_selected(vec![true, true, true, true], 0, "all selected should have no filtered rows")] + #[case::all_filtered(vec![false, false, false, false, false], 5, "all filtered should count all rows")] + #[case::mixed_selection(vec![true, false, true, false, false, true], 3, "mixed selection should count false values")] + fn selection_vector_visitor_counter_accuracy( + #[case] input: Vec, + #[case] expected_filtered: u64, + #[case] _description: &str, + ) { + let batch = create_boolean_batch(input.clone()); + let mut visitor = SelectionVectorVisitor::default(); + visitor.visit_rows_of(batch.as_ref()).unwrap(); + assert_eq!(visitor.selection_vector, input); + assert_eq!(visitor.num_filtered, expected_filtered); + } } diff --git a/kernel/src/arrow_compat.rs b/kernel/src/arrow_compat.rs index e57e634045..08e2d0e8ab 100644 --- a/kernel/src/arrow_compat.rs +++ b/kernel/src/arrow_compat.rs @@ -1,25 +1,28 @@ //! This module re-exports the different versions of arrow, parquet, and object_store we support. -#[cfg(feature = "arrow-56")] +#[cfg(feature = "arrow-57")] mod arrow_compat_shims { - pub use arrow_56 as arrow; - pub use parquet_56 as parquet; + pub use arrow_57 as arrow; + pub use object_store_12 as object_store; + pub use parquet_57 as parquet; } -#[cfg(all(feature = "arrow-55", not(feature = "arrow-56")))] +#[cfg(all(feature = "arrow-56", not(feature = "arrow-57"),))] mod arrow_compat_shims { - pub use arrow_55 as arrow; - pub use parquet_55 as parquet; + pub use arrow_56 as arrow; + pub use object_store_12 as object_store; + pub use parquet_56 as parquet; } // if nothing is enabled but we need arrow because of some other feature flag, throw compile-time // error #[cfg(all( feature = "need-arrow", - not(feature = "arrow-55"), - not(feature = "arrow-56") + not(feature = "arrow-56"), + not(feature = "arrow-57") ))] -compile_error!("Requested a feature that needs arrow without enabling arrow. Please enable the `arrow-55` or `arrow-56` feature"); +compile_error!("Requested a feature that needs arrow without enabling arrow. 
Please enable the `arrow-56`, or `arrow-57` feature"); -#[cfg(any(feature = "arrow-55", feature = "arrow-56"))] +#[cfg(any(feature = "arrow-56", feature = "arrow-57"))] +#[doc(hidden)] pub use arrow_compat_shims::*; diff --git a/kernel/src/checkpoint/checkpoint_transform.rs b/kernel/src/checkpoint/checkpoint_transform.rs new file mode 100644 index 0000000000..95c1db997a --- /dev/null +++ b/kernel/src/checkpoint/checkpoint_transform.rs @@ -0,0 +1,742 @@ +//! Transforms for populating checkpoint-specific fields in the Add action. +//! +//! This module ensures that Add actions in checkpoints have the correct format for: +//! +//! **Statistics** (controlled by `delta.checkpoint.writeStatsAsStruct` / `writeStatsAsJson`): +//! - `stats`: JSON string format (default: enabled) +//! - `stats_parsed`: Native struct format (default: disabled) +//! +//! **Partition values** (controlled by `delta.checkpoint.writeStatsAsStruct`): +//! - `partitionValues`: String-valued map (always present) +//! - `partitionValues_parsed`: Native typed struct (only when `writeStatsAsStruct=true`) +//! +//! This module provides transforms to populate these fields using COALESCE expressions, +//! ensuring that values are preserved regardless of the source format (commits vs checkpoints). + +use std::sync::{Arc, LazyLock}; + +use crate::actions::ADD_NAME; +use crate::expressions::{Expression, ExpressionRef, Transform, UnaryExpressionOp}; +use crate::schema::{DataType, SchemaRef, StructField, StructType}; +use crate::table_properties::TableProperties; +use crate::{DeltaResult, Error}; + +pub(crate) const STATS_FIELD: &str = "stats"; +pub(crate) const STATS_PARSED_FIELD: &str = "stats_parsed"; +pub(crate) const PARTITION_VALUES_FIELD: &str = "partitionValues"; +pub(crate) const PARTITION_VALUES_PARSED_FIELD: &str = "partitionValues_parsed"; + +/// Configuration for stats transformation based on table properties. +#[derive(Debug, Clone, Copy)] +pub(crate) struct StatsTransformConfig { + pub write_stats_as_json: bool, + pub write_stats_as_struct: bool, +} + +impl StatsTransformConfig { + pub(super) fn from_table_properties(properties: &TableProperties) -> Self { + Self { + write_stats_as_json: properties.should_write_stats_as_json(), + write_stats_as_struct: properties.should_write_stats_as_struct(), + } + } +} + +/// Builds a transform for the Add action to populate and/or drop stats and partition fields. +/// +/// The transform handles statistics based on table properties: +/// - When `writeStatsAsJson=true`: `stats = COALESCE(stats, ToJson(stats_parsed))` +/// - When `writeStatsAsJson=false`: drop `stats` field +/// - When `writeStatsAsStruct=true`: `stats_parsed = COALESCE(stats_parsed, ParseJson(stats))` +/// - When `writeStatsAsStruct=false`: drop `stats_parsed` field +/// +/// For partitioned tables when `writeStatsAsStruct=true`, it also populates: +/// - `partitionValues_parsed = COALESCE(partitionValues_parsed, MAP_TO_STRUCT(partitionValues))` +/// +/// Returns a top-level transform that wraps the nested Add transform, ensuring the +/// full checkpoint batch is produced with the modified Add action. +/// +/// # Arguments +/// +/// * `stats_schema` - The expected schema for parsed file statistics, typically generated +/// by [`expected_stats_schema`]. 
This schema has the following structure: +/// ```ignore +/// { +/// numRecords: long, +/// nullCount: , +/// minValues: , +/// maxValues: , +/// } +/// ``` +/// The schema is derived from the table's physical file schema and table properties +/// (`dataSkippingNumIndexedCols`, `dataSkippingStatsColumns`). Only columns eligible +/// for data skipping are included in `minValues`/`maxValues`. +/// +/// * `partition_schema` - The physical partition schema for `partitionValues_parsed`. `None` +/// for non-partitioned tables. +/// +/// [`expected_stats_schema`]: crate::scan::data_skipping::stats_schema::expected_stats_schema +pub(crate) fn build_checkpoint_transform( + config: &StatsTransformConfig, + stats_schema: &SchemaRef, + partition_schema: Option<&SchemaRef>, +) -> ExpressionRef { + let mut add_transform = Transform::new_nested([ADD_NAME]); + + // Handle stats field + if config.write_stats_as_json { + // Populate stats from stats_parsed if needed (for old checkpoints that only had stats_parsed) + add_transform = add_transform.with_replaced_field(STATS_FIELD, STATS_JSON_EXPR.clone()); + } else { + // Drop stats field when not writing as JSON + add_transform = add_transform.with_dropped_field(STATS_FIELD); + } + + // Handle stats_parsed field + // Note: stats_parsed was added to read schema (via build_checkpoint_read_schema), + // so we always need to either replace it (with COALESCE) or drop it. + if config.write_stats_as_struct { + // Populate stats_parsed from JSON stats (for commits that only have JSON stats) + let stats_parsed_expr = build_stats_parsed_expr(stats_schema); + add_transform = add_transform.with_replaced_field(STATS_PARSED_FIELD, stats_parsed_expr); + } else { + // Drop stats_parsed field when not writing as struct + add_transform = add_transform.with_dropped_field(STATS_PARSED_FIELD); + } + + // Handle partitionValues_parsed field (only for partitioned tables) + if partition_schema.is_some() { + if config.write_stats_as_struct { + let pv_parsed_expr = build_partition_values_parsed_expr(); + add_transform = + add_transform.with_replaced_field(PARTITION_VALUES_PARSED_FIELD, pv_parsed_expr); + } else { + // Drop partitionValues_parsed since it was added to read schema + add_transform = add_transform.with_dropped_field(PARTITION_VALUES_PARSED_FIELD); + } + } + + // Wrap the nested Add transform in a top-level transform that replaces the Add field + let add_transform_expr: ExpressionRef = Arc::new(Expression::transform(add_transform)); + let outer_transform = + Transform::new_top_level().with_replaced_field(ADD_NAME, add_transform_expr); + + Arc::new(Expression::transform(outer_transform)) +} + +/// Builds a read schema that includes `stats_parsed` and optionally `partitionValues_parsed` +/// in the Add action. +/// +/// The read schema must be union-compatible across all log segment files (checkpoints and +/// JSON commits). This means all reads use the same schema even though commits don't have +/// `stats_parsed` or `partitionValues_parsed` — those columns are read as nulls. This +/// union-compatible schema ensures log replay can process checkpoint and commit batches +/// uniformly, and COALESCE expressions can operate correctly across both sources. 
+/// +/// # Errors +/// +/// Returns an error if: +/// - The `add` field is not found or is not a struct type +/// - The `stats_parsed` or `partitionValues_parsed` field already exists in the Add schema +pub(crate) fn build_checkpoint_read_schema( + base_schema: &StructType, + stats_schema: &StructType, + partition_schema: Option<&StructType>, +) -> DeltaResult { + transform_add_schema(base_schema, |add_struct| { + // Validate fields aren't already present + if add_struct.field(STATS_PARSED_FIELD).is_some() { + return Err(Error::generic( + "stats_parsed field already exists in Add schema", + )); + } + if partition_schema.is_some() && add_struct.field(PARTITION_VALUES_PARSED_FIELD).is_some() { + return Err(Error::generic( + "partitionValues_parsed field already exists in Add schema", + )); + } + let mut result = add_struct.clone().with_field_inserted_after( + Some(STATS_FIELD), + StructField::nullable( + STATS_PARSED_FIELD, + DataType::Struct(Box::new(stats_schema.clone())), + ), + )?; + if let Some(pv_schema) = partition_schema { + result = result.with_field_inserted_after( + Some(PARTITION_VALUES_FIELD), + StructField::nullable( + PARTITION_VALUES_PARSED_FIELD, + DataType::Struct(Box::new(pv_schema.clone())), + ), + )?; + } + Ok(result) + }) +} + +/// Builds the output schema based on configuration. +/// +/// The output schema determines which fields are included in the checkpoint: +/// - If `writeStatsAsJson=false`: `stats` field is excluded +/// - If `writeStatsAsStruct=true`: `stats_parsed` and `partitionValues_parsed` fields are included +/// +/// # Errors +/// +/// Returns an error if the `add` field is not found or is not a struct type. +pub(crate) fn build_checkpoint_output_schema( + config: &StatsTransformConfig, + base_schema: &StructType, + stats_schema: &StructType, + partition_schema: Option<&StructType>, +) -> DeltaResult { + transform_add_schema(base_schema, |add_struct| { + build_add_output_schema(config, add_struct, stats_schema, partition_schema) + }) +} + +// ======================== +// Private helpers +// ======================== + +/// Builds expression: `stats_parsed = COALESCE(stats_parsed, ParseJson(stats, schema))` +/// +/// This expression prefers existing stats_parsed, falling back to parsing JSON stats. +/// If `stats_parsed` is non-null, the data originated from a checkpoint (commits only +/// contain JSON stats, so `stats_parsed` will be null for commit-sourced rows). +/// +/// Column paths are relative to the full batch (not the nested Add struct), so we use +/// ["add", "stats"] instead of just ["stats"]. +fn build_stats_parsed_expr(stats_schema: &SchemaRef) -> ExpressionRef { + Arc::new(Expression::coalesce([ + Expression::column([ADD_NAME, STATS_PARSED_FIELD]), + Expression::parse_json( + Expression::column([ADD_NAME, STATS_FIELD]), + stats_schema.clone(), + ), + ])) +} + +/// Builds expression: `partitionValues_parsed = COALESCE(partitionValues_parsed, +/// MAP_TO_STRUCT(partitionValues))` +/// +/// This expression prefers existing `partitionValues_parsed`, falling back to converting +/// the string-valued `partitionValues` map into a native typed struct. The target struct +/// type (field names and data types) is determined by the output schema — `MAP_TO_STRUCT` +/// itself carries no schema, so the expression evaluator uses the expected output type to +/// parse each string value into the correct native type. 
+/// +/// Column paths are relative to the full batch (not the nested Add struct), so we use +/// `["add", "partitionValues"]` instead of just `["partitionValues"]`. +fn build_partition_values_parsed_expr() -> ExpressionRef { + Arc::new(Expression::coalesce([ + Expression::column([ADD_NAME, PARTITION_VALUES_PARSED_FIELD]), + Expression::map_to_struct(Expression::column([ADD_NAME, PARTITION_VALUES_FIELD])), + ])) +} + +/// Static expression: `stats = COALESCE(stats, ToJson(stats_parsed))` +/// +/// This expression prefers existing JSON stats, falling back to converting stats_parsed. +/// Column paths are relative to the full batch (not the nested Add struct), so we use +/// ["add", "stats"] instead of just ["stats"]. +static STATS_JSON_EXPR: LazyLock = LazyLock::new(|| { + Arc::new(Expression::coalesce([ + Expression::column([ADD_NAME, STATS_FIELD]), + Expression::unary( + UnaryExpressionOp::ToJson, + Expression::column([ADD_NAME, STATS_PARSED_FIELD]), + ), + ])) +}); + +/// Transforms the Add action schema within a checkpoint schema. +/// +/// This helper applies a transformation function to the Add struct and returns +/// a new schema with the modified Add field. +/// +// TODO(https://github.com/delta-io/delta-kernel-rs/issues/1820): Replace manual field +// iteration with StructType helper methods (e.g., with_field_inserted, with_field_removed). +/// +/// # Errors +/// +/// Returns an error if: +/// - The `add` field is not found in the schema +/// - The `add` field is not a struct type +fn transform_add_schema( + base_schema: &StructType, + transform_fn: impl FnOnce(&StructType) -> DeltaResult, +) -> DeltaResult { + // Find and validate the add field + let add_field = base_schema + .field(ADD_NAME) + .ok_or_else(|| Error::generic("Expected 'add' field in checkpoint schema"))?; + + let DataType::Struct(add_struct) = &add_field.data_type else { + return Err(Error::generic(format!( + "Expected 'add' field to be a struct type, got {:?}", + add_field.data_type + ))); + }; + + let modified_add = transform_fn(add_struct)?; + let new_schema = base_schema.clone().with_field_replaced( + ADD_NAME, + StructField { + name: ADD_NAME.to_string(), + data_type: DataType::Struct(Box::new(modified_add)), + nullable: add_field.nullable, + metadata: add_field.metadata.clone(), + }, + )?; + + Ok(Arc::new(new_schema)) +} + +fn build_add_output_schema( + config: &StatsTransformConfig, + add_schema: &StructType, + stats_schema: &StructType, + partition_schema: Option<&StructType>, +) -> DeltaResult { + let mut new_schema = add_schema.clone(); + if config.write_stats_as_struct { + new_schema = new_schema.with_field_inserted_after( + Some(STATS_FIELD), + StructField::nullable( + STATS_PARSED_FIELD, + DataType::Struct(Box::new(stats_schema.clone())), + ), + )?; + if let Some(pv_schema) = partition_schema { + new_schema = new_schema.with_field_inserted_after( + Some(PARTITION_VALUES_FIELD), + StructField::nullable( + PARTITION_VALUES_PARSED_FIELD, + DataType::Struct(Box::new(pv_schema.clone())), + ), + )?; + } + } + + if config.write_stats_as_json { + Ok(new_schema) + } else { + Ok(new_schema.with_field_removed(STATS_FIELD)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_config_defaults() { + // Default: writeStatsAsJson=true, writeStatsAsStruct=false (per protocol) + let props = TableProperties::default(); + let config = StatsTransformConfig::from_table_properties(&props); + assert!(config.write_stats_as_json); + assert!(!config.write_stats_as_struct); + } + + #[test] + fn 
test_config_with_struct_enabled() { + let props = TableProperties { + checkpoint_write_stats_as_struct: Some(true), + ..Default::default() + }; + let config = StatsTransformConfig::from_table_properties(&props); + assert!(config.write_stats_as_json); + assert!(config.write_stats_as_struct); + } + + /// Helper to extract the outer and inner transforms from a stats transform expression. + /// Returns (outer_transform, inner_transform). + fn extract_transforms(expr: &Expression) -> (&Transform, &Transform) { + let Expression::Transform(outer) = expr else { + panic!("Expected outer Transform expression"); + }; + + // Outer should be top-level (no input path) + assert!( + outer.input_path.is_none(), + "Outer transform should be top-level" + ); + + // Outer should replace "add" field + let add_field_transform = outer + .field_transforms + .get(ADD_NAME) + .expect("Outer transform should have 'add' field transform"); + assert!(add_field_transform.is_replace, "Should replace 'add' field"); + assert_eq!( + add_field_transform.exprs.len(), + 1, + "Should have exactly one replacement expression" + ); + + // Extract inner transform + let Expression::Transform(inner) = add_field_transform.exprs[0].as_ref() else { + panic!("Expected inner Transform expression for 'add' field"); + }; + + // Inner should target "add" path + assert_eq!( + inner.input_path.as_ref().map(|p| p.to_string()), + Some("add".to_string()), + "Inner transform should target 'add' path" + ); + + (outer, inner) + } + + /// Helper to check if a field transform is a drop (replace with nothing). + fn is_drop(transform: &Transform, field: &str) -> bool { + transform + .field_transforms + .get(field) + .map(|ft| ft.is_replace && ft.exprs.is_empty()) + .unwrap_or(false) + } + + /// Helper to check if a field transform is a replacement with an expression. 
+ fn is_replacement(transform: &Transform, field: &str) -> bool { + transform + .field_transforms + .get(field) + .map(|ft| ft.is_replace && ft.exprs.len() == 1) + .unwrap_or(false) + } + + #[test] + fn test_build_transform_with_json_only() { + // writeStatsAsJson=true, writeStatsAsStruct=false (default) + // Inner transform: stats=COALESCE, stats_parsed=drop + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: false, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let transform_expr = build_checkpoint_transform(&config, &stats_schema, None); + + let (_, inner) = extract_transforms(&transform_expr); + + // stats should be replaced with COALESCE expression + assert!( + is_replacement(inner, STATS_FIELD), + "stats should be replaced" + ); + + // stats_parsed should be dropped + assert!( + is_drop(inner, STATS_PARSED_FIELD), + "stats_parsed should be dropped" + ); + } + + #[test] + fn test_build_transform_drops_both_when_false() { + // writeStatsAsJson=false, writeStatsAsStruct=false + // Inner transform: stats=drop, stats_parsed=drop + let config = StatsTransformConfig { + write_stats_as_json: false, + write_stats_as_struct: false, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let transform_expr = build_checkpoint_transform(&config, &stats_schema, None); + + let (_, inner) = extract_transforms(&transform_expr); + + // Both fields should be dropped + assert!(is_drop(inner, STATS_FIELD), "stats should be dropped"); + assert!( + is_drop(inner, STATS_PARSED_FIELD), + "stats_parsed should be dropped" + ); + } + + #[test] + fn test_build_transform_with_both_enabled() { + // writeStatsAsJson=true, writeStatsAsStruct=true + // Inner transform: stats=COALESCE, stats_parsed=COALESCE + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: true, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let transform_expr = build_checkpoint_transform(&config, &stats_schema, None); + + let (_, inner) = extract_transforms(&transform_expr); + + // Both fields should be replaced with COALESCE expressions + assert!( + is_replacement(inner, STATS_FIELD), + "stats should be replaced" + ); + assert!( + is_replacement(inner, STATS_PARSED_FIELD), + "stats_parsed should be replaced" + ); + } + + #[test] + fn test_build_transform_struct_only() { + // writeStatsAsJson=false, writeStatsAsStruct=true + // Inner transform: stats=drop, stats_parsed=COALESCE + let config = StatsTransformConfig { + write_stats_as_json: false, + write_stats_as_struct: true, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let transform_expr = build_checkpoint_transform(&config, &stats_schema, None); + + let (_, inner) = extract_transforms(&transform_expr); + + // stats should be dropped + assert!(is_drop(inner, STATS_FIELD), "stats should be dropped"); + + // stats_parsed should be replaced with COALESCE expression + assert!( + is_replacement(inner, STATS_PARSED_FIELD), + "stats_parsed should be replaced" + ); + } + + #[test] + fn test_build_transform_with_partition_values() { + // writeStatsAsStruct=true with partitioned table + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: true, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let pv_schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("year", DataType::INTEGER), + StructField::nullable("month", DataType::INTEGER), + ])); + let transform_expr = 
build_checkpoint_transform(&config, &stats_schema, Some(&pv_schema)); + + let (_, inner) = extract_transforms(&transform_expr); + + // partitionValues_parsed should be replaced with COALESCE expression + assert!( + is_replacement(inner, PARTITION_VALUES_PARSED_FIELD), + "partitionValues_parsed should be replaced" + ); + } + + #[test] + fn test_build_transform_no_partition_values_when_struct_disabled() { + // writeStatsAsStruct=false with partitioned table + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: false, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let pv_schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "year", + DataType::INTEGER, + )])); + let transform_expr = build_checkpoint_transform(&config, &stats_schema, Some(&pv_schema)); + + let (_, inner) = extract_transforms(&transform_expr); + + // partitionValues_parsed should be dropped + assert!( + is_drop(inner, PARTITION_VALUES_PARSED_FIELD), + "partitionValues_parsed should be dropped" + ); + } + + #[test] + fn test_build_transform_non_partitioned_table() { + // Non-partitioned table: no partitionValues_parsed handling at all + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: true, + }; + let stats_schema = Arc::new(StructType::new_unchecked([])); + let transform_expr = build_checkpoint_transform(&config, &stats_schema, None); + + let (_, inner) = extract_transforms(&transform_expr); + + // No partitionValues_parsed transform should exist + assert!( + !inner + .field_transforms + .contains_key(PARTITION_VALUES_PARSED_FIELD), + "non-partitioned table should not have partitionValues_parsed transform" + ); + } + + #[test] + fn test_field_inserted_after_in_add_schema() { + let add_schema = StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::nullable("stats", DataType::STRING), + StructField::nullable("tags", DataType::STRING), + ]); + + let injected_schema = + StructType::new_unchecked([StructField::nullable("numRecords", DataType::LONG)]); + + let result = add_schema + .with_field_inserted_after( + Some(STATS_FIELD), + StructField::nullable( + STATS_PARSED_FIELD, + DataType::Struct(Box::new(injected_schema)), + ), + ) + .expect("inserting stats_parsed should succeed"); + + // Should have 4 fields: path, stats, stats_parsed, tags + assert_eq!(result.fields().count(), 4); + + let field_names: Vec<&str> = result.fields().map(|f| f.name.as_str()).collect(); + assert_eq!(field_names, vec!["path", "stats", "stats_parsed", "tags"]); + } + + #[test] + fn test_build_add_output_schema_json_only() { + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: false, + }; + + let add_schema = StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::nullable("stats", DataType::STRING), + ]); + + let stats_schema = StructType::new_unchecked([]); + + let result = build_add_output_schema(&config, &add_schema, &stats_schema, None) + .expect("build add output schema should produce a valid schema"); + + // Should have path and stats, no stats_parsed + let field_names: Vec<&str> = result.fields().map(|f| f.name.as_str()).collect(); + assert_eq!(field_names, vec!["path", "stats"]); + } + + #[test] + fn test_build_add_output_schema_struct_only() { + let config = StatsTransformConfig { + write_stats_as_json: false, + write_stats_as_struct: true, + }; + + let add_schema = StructType::new_unchecked([ + 
StructField::not_null("path", DataType::STRING), + StructField::nullable("stats", DataType::STRING), + ]); + + let stats_schema = + StructType::new_unchecked([StructField::nullable("numRecords", DataType::LONG)]); + + let result = build_add_output_schema(&config, &add_schema, &stats_schema, None) + .expect("build add output schema should produce a valid schema"); + + // Should have path and stats_parsed (stats dropped) + let field_names: Vec<&str> = result.fields().map(|f| f.name.as_str()).collect(); + assert_eq!(field_names, vec!["path", "stats_parsed"]); + } + + #[test] + fn test_build_add_output_schema_both() { + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: true, + }; + + let add_schema = StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::nullable("stats", DataType::STRING), + ]); + + let stats_schema = + StructType::new_unchecked([StructField::nullable("numRecords", DataType::LONG)]); + + let result = build_add_output_schema(&config, &add_schema, &stats_schema, None) + .expect("build add output schema should produce a valid schema"); + + // Should have path, stats, and stats_parsed + let field_names: Vec<&str> = result.fields().map(|f| f.name.as_str()).collect(); + assert_eq!(field_names, vec!["path", "stats", "stats_parsed"]); + } + + #[test] + fn test_build_add_output_schema_with_partition_values() { + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: true, + }; + + let add_schema = StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::nullable( + "partitionValues", + DataType::Map(Box::new(crate::schema::MapType::new( + DataType::STRING, + DataType::STRING, + true, + ))), + ), + StructField::nullable("stats", DataType::STRING), + ]); + + let stats_schema = + StructType::new_unchecked([StructField::nullable("numRecords", DataType::LONG)]); + let pv_schema = StructType::new_unchecked([ + StructField::nullable("year", DataType::INTEGER), + StructField::nullable("month", DataType::INTEGER), + ]); + + let result = build_add_output_schema(&config, &add_schema, &stats_schema, Some(&pv_schema)) + .expect("build add output schema should produce a valid schema"); + + let field_names: Vec<&str> = result.fields().map(|f| f.name.as_str()).collect(); + assert_eq!( + field_names, + vec![ + "path", + "partitionValues", + "partitionValues_parsed", + "stats", + "stats_parsed" + ] + ); + } + + #[test] + fn test_build_add_output_schema_no_partition_values_when_struct_disabled() { + let config = StatsTransformConfig { + write_stats_as_json: true, + write_stats_as_struct: false, + }; + + let add_schema = StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::nullable( + "partitionValues", + DataType::Map(Box::new(crate::schema::MapType::new( + DataType::STRING, + DataType::STRING, + true, + ))), + ), + StructField::nullable("stats", DataType::STRING), + ]); + + let stats_schema = StructType::new_unchecked([]); + let pv_schema = + StructType::new_unchecked([StructField::nullable("year", DataType::INTEGER)]); + + let result = build_add_output_schema(&config, &add_schema, &stats_schema, Some(&pv_schema)) + .expect("build add output schema should produce a valid schema"); + + let field_names: Vec<&str> = result.fields().map(|f| f.name.as_str()).collect(); + // partitionValues_parsed should NOT be present when writeStatsAsStruct=false + assert_eq!(field_names, vec!["path", "partitionValues", "stats"]); + } 
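As a summary of the combinations exercised by the output-schema tests above, the following sketch (a hypothetical helper, not the kernel API) maps the two checkpoint table properties to the stats-related Add fields that survive into the checkpoint output:

```rust
// Hedged sketch: which stats-related Add fields the checkpoint output keeps
// for each combination of writeStatsAsJson / writeStatsAsStruct, per the
// transform rules exercised by the tests above.
fn kept_stats_fields(write_stats_as_json: bool, write_stats_as_struct: bool) -> Vec<&'static str> {
    let mut fields = Vec::new();
    if write_stats_as_json {
        fields.push("stats"); // JSON string statistics
    }
    if write_stats_as_struct {
        fields.push("stats_parsed"); // native struct statistics
        fields.push("partitionValues_parsed"); // typed partition values (partitioned tables only)
    }
    fields
}

fn main() {
    // Protocol defaults: writeStatsAsJson=true, writeStatsAsStruct=false.
    assert_eq!(kept_stats_fields(true, false), ["stats"]);
    assert_eq!(
        kept_stats_fields(true, true),
        ["stats", "stats_parsed", "partitionValues_parsed"]
    );
    assert_eq!(
        kept_stats_fields(false, true),
        ["stats_parsed", "partitionValues_parsed"]
    );
    assert!(kept_stats_fields(false, false).is_empty());
}
```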
+} diff --git a/kernel/src/checkpoint/mod.rs b/kernel/src/checkpoint/mod.rs index 2adbd3d30d..5888968a54 100644 --- a/kernel/src/checkpoint/mod.rs +++ b/kernel/src/checkpoint/mod.rs @@ -1,6 +1,6 @@ //! This module implements the API for writing single-file checkpoints. //! -//! The entry point for this API is [`Snapshot::checkpoint`]. +//! The entry point for this API is [`Snapshot::create_checkpoint_writer`]. //! //! ## Checkpoint Types and Selection Logic //! This API supports two checkpoint types, selected based on table features: @@ -16,13 +16,13 @@ //! ## Architecture //! //! - [`CheckpointWriter`] - Core component that manages the checkpoint creation workflow -//! - [`CheckpointDataIterator`] - Iterator over the checkpoint data to be written +//! - [`ActionReconciliationIterator`] - Iterator over the checkpoint data to be written //! //! ## Usage //! //! The following steps outline the process of creating a checkpoint: //! -//! 1. Create a [`CheckpointWriter`] using [`Snapshot::checkpoint`] +//! 1. Create a [`CheckpointWriter`] using [`Snapshot::create_checkpoint_writer`] //! 2. Get the checkpoint path from [`CheckpointWriter::checkpoint_path`] //! 2. Get the checkpoint data from [`CheckpointWriter::checkpoint_data`] //! 3. Write the data to the path in object storage (engine-specific) @@ -31,7 +31,7 @@ //! //! ```no_run //! # use std::sync::Arc; -//! # use delta_kernel::checkpoint::CheckpointDataIterator; +//! # use delta_kernel::ActionReconciliationIterator; //! # use delta_kernel::checkpoint::CheckpointWriter; //! # use delta_kernel::Engine; //! # use delta_kernel::Snapshot; @@ -40,7 +40,7 @@ //! # use delta_kernel::Error; //! # use delta_kernel::FileMeta; //! # use url::Url; -//! fn write_checkpoint_file(path: Url, data: &CheckpointDataIterator) -> DeltaResult { +//! fn write_checkpoint_file(path: Url, data: &mut ActionReconciliationIterator) -> DeltaResult { //! todo!() /* engine-specific logic to write data to object storage*/ //! } //! @@ -51,19 +51,22 @@ //! let snapshot = Snapshot::builder_for(url).build(engine)?; //! //! // Create a checkpoint writer from the snapshot -//! let mut writer = snapshot.checkpoint()?; +//! let mut writer = snapshot.create_checkpoint_writer()?; //! //! // Get the checkpoint path and data //! let checkpoint_path = writer.checkpoint_path()?; //! let checkpoint_data = writer.checkpoint_data(engine)?; //! +//! // Get the iterator state +//! let state = checkpoint_data.state(); +//! //! // Write the checkpoint data to the object store and collect metadata -//! let metadata: FileMeta = write_checkpoint_file(checkpoint_path, &checkpoint_data)?; +//! let metadata: FileMeta = write_checkpoint_file(checkpoint_path, &mut checkpoint_data)?; //! //! /* IMPORTANT: All data must be written before finalizing the checkpoint */ //! -//! // Finalize the checkpoint by passing the metadata and exhausted data iterator -//! writer.finalize(engine, &metadata, checkpoint_data)?; +//! // Finalize the checkpoint by passing the metadata and state handle +//! writer.finalize(engine, &metadata, &state)?; //! //! # Ok::<_, Error>(()) //! ``` @@ -80,32 +83,43 @@ //! //! [`CheckpointMetadata`]: crate::actions::CheckpointMetadata //! [`LastCheckpointHint`]: crate::last_checkpoint_hint::LastCheckpointHint -//! [`Snapshot::checkpoint`]: crate::Snapshot::checkpoint +//! [`Snapshot::create_checkpoint_writer`]: crate::Snapshot::create_checkpoint_writer // Future extensions: // - TODO(#837): Multi-file V2 checkpoints are not supported yet. 
The API is designed to be extensible for future // multi-file support, but the current implementation only supports single-file checkpoints. -use std::sync::{Arc, LazyLock}; +use std::sync::{Arc, LazyLock, OnceLock}; use crate::action_reconciliation::log_replay::{ ActionReconciliationBatch, ActionReconciliationProcessor, }; -use crate::action_reconciliation::RetentionCalculator; +use crate::action_reconciliation::{ + ActionReconciliationIterator, ActionReconciliationIteratorState, RetentionCalculator, +}; use crate::actions::{ - Add, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, CHECKPOINT_METADATA_NAME, - METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, + Add, DomainMetadata, Metadata, Protocol, Remove, SetTransaction, Sidecar, ADD_NAME, + CHECKPOINT_METADATA_NAME, DOMAIN_METADATA_NAME, METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, + SET_TRANSACTION_NAME, SIDECAR_NAME, }; use crate::engine_data::FilteredEngineData; -use crate::expressions::Scalar; +use crate::expressions::{Expression, Scalar, StructData, Transform}; use crate::last_checkpoint_hint::LastCheckpointHint; use crate::log_replay::LogReplayProcessor; use crate::path::ParsedLogPath; use crate::schema::{DataType, SchemaRef, StructField, StructType, ToSchema as _}; use crate::snapshot::SnapshotRef; +use crate::table_features::TableFeature; use crate::table_properties::TableProperties; use crate::{DeltaResult, Engine, EngineData, Error, EvaluationHandlerExtension, FileMeta}; use url::Url; +mod checkpoint_transform; + +use checkpoint_transform::{ + build_checkpoint_output_schema, build_checkpoint_read_schema, build_checkpoint_transform, + StatsTransformConfig, +}; + #[cfg(test)] mod tests; @@ -123,64 +137,39 @@ static LAST_CHECKPOINT_SCHEMA: LazyLock = LazyLock::new(|| { .into() }); -/// Schema for extracting relevant actions from log files for checkpoint creation -static CHECKPOINT_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { - Arc::new(StructType::new_unchecked([ +/// Action fields shared by V1 and V2 checkpoint schemas. +fn base_checkpoint_action_fields() -> Vec { + vec![ StructField::nullable(ADD_NAME, Add::to_schema()), StructField::nullable(REMOVE_NAME, Remove::to_schema()), StructField::nullable(METADATA_NAME, Metadata::to_schema()), StructField::nullable(PROTOCOL_NAME, Protocol::to_schema()), StructField::nullable(SET_TRANSACTION_NAME, SetTransaction::to_schema()), + StructField::nullable(DOMAIN_METADATA_NAME, DomainMetadata::to_schema()), StructField::nullable(SIDECAR_NAME, Sidecar::to_schema()), - ])) -}); + ] +} -// Schema of the [`CheckpointMetadata`] action that is included in V2 checkpoints -// We cannot use `CheckpointMetadata::to_schema()` as it would include the 'tags' field which -// we're not supporting yet due to the lack of map support TODO(#880). -static CHECKPOINT_METADATA_ACTION_SCHEMA: LazyLock = LazyLock::new(|| { - Arc::new(StructType::new_unchecked([StructField::nullable( +/// Schema for V1 checkpoints (without checkpointMetadata action) +static CHECKPOINT_ACTIONS_SCHEMA_V1: LazyLock = + LazyLock::new(|| Arc::new(StructType::new_unchecked(base_checkpoint_action_fields()))); + +/// Schema for the checkpointMetadata field in V2 checkpoints. +/// We cannot use `CheckpointMetadata::to_schema()` as it would include the 'tags' field which +/// we're not supporting yet due to the lack of map support TODO(#880). 
+fn checkpoint_metadata_field() -> StructField { + StructField::nullable( CHECKPOINT_METADATA_NAME, DataType::struct_type_unchecked([StructField::not_null("version", DataType::LONG)]), - )])) -}); - -/// An iterator over the checkpoint data to be written to the file. -/// -/// This iterator yields filtered checkpoint data batches ([`FilteredEngineData`]) and -/// tracks action statistics required for finalizing the checkpoint. -/// -/// # Warning -/// The [`CheckpointDataIterator`] must be fully consumed to ensure proper collection of statistics for -/// the checkpoint. Additionally, all yielded data must be written to the specified path before calling -/// [`CheckpointWriter::finalize`]. Failing to do so may result in data loss or corruption. -pub struct CheckpointDataIterator { - /// The nested iterator that yields checkpoint batches with action counts - checkpoint_batch_iterator: - Box> + Send>, - /// Running total of actions included in the checkpoint - actions_count: i64, - /// Running total of add actions included in the checkpoint - add_actions_count: i64, + ) } -impl Iterator for CheckpointDataIterator { - type Item = DeltaResult; - - /// Advances the iterator and returns the next value. - /// - /// This implementation transforms the `ActionReconciliationBatch` items from the nested iterator into - /// [`FilteredEngineData`] items for the engine to write, while accumulating action counts from - /// each batch. The [`CheckpointDataIterator`] is passed back to the kernel on call to - /// [`CheckpointWriter::finalize`] for counts to be read and written to the `_last_checkpoint` file - fn next(&mut self) -> Option { - Some(self.checkpoint_batch_iterator.next()?.map(|batch| { - self.actions_count += batch.actions_count; - self.add_actions_count += batch.add_actions_count; - batch.filtered_data - })) - } -} +/// Schema for V2 checkpoints (includes checkpointMetadata action) +static CHECKPOINT_ACTIONS_SCHEMA_V2: LazyLock = LazyLock::new(|| { + let mut fields = base_checkpoint_action_fields(); + fields.push(checkpoint_metadata_field()); + Arc::new(StructType::new_unchecked(fields)) +}); /// Orchestrates the process of creating a checkpoint for a table. /// @@ -203,6 +192,9 @@ pub struct CheckpointWriter { /// Note: Although the version is stored as a u64 in the snapshot, it is stored as an i64 /// field here to avoid multiple type conversions. version: i64, + + /// Cached checkpoint output schema. + checkpoint_output_schema: OnceLock, } impl RetentionCalculator for CheckpointWriter { @@ -222,12 +214,35 @@ impl CheckpointWriter { )) })?; - // We disallow checkpointing if the LogSegment contains any unpublished commits. (could - // create gaps in the version history, thereby breaking old readers) - snapshot.log_segment().validate_no_staged_commits()?; + // We disallow checkpointing if the Snapshot is not published. If we didn't, this could + // create gaps in the version history, thereby breaking old readers. + snapshot.log_segment().validate_published()?; - Ok(Self { snapshot, version }) + Ok(Self { + snapshot, + version, + checkpoint_output_schema: OnceLock::new(), + }) } + /// Returns the cached output schema, initializing it with `f` on first call. + /// + /// `OnceLock::get_or_try_init` is unstable, so we use a custom implementation. + /// (tracking issue: ). 
+ fn get_or_init_output_schema( + &self, + f: impl FnOnce() -> DeltaResult, + ) -> DeltaResult { + if let Some(schema) = self.checkpoint_output_schema.get() { + return Ok(schema.clone()); + } + let schema = f()?; + let _ = self.checkpoint_output_schema.set(schema); + self.checkpoint_output_schema + .get() + .cloned() + .ok_or_else(|| Error::internal_error("OnceLock should be initialized")) + } + /// Returns the URL where the checkpoint file should be written. /// /// This method generates the checkpoint path based on the table's root and the version @@ -247,49 +262,132 @@ impl CheckpointWriter { } /// Returns the checkpoint data to be written to the checkpoint file. /// - /// This method reads the actions from the log segment and processes them - /// to create the checkpoint data. + /// This method reads actions from the log segment, processes them for checkpoint creation, + /// and applies stats transforms based on table properties: + /// - `delta.checkpoint.writeStatsAsJson` (default: true) + /// - `delta.checkpoint.writeStatsAsStruct` (default: false) /// - /// # Parameters - /// - `engine`: Implementation of [`Engine`] APIs. + /// The returned [`ActionReconciliationIterator`] yields [`FilteredEngineData`] batches with + /// stats transforms already applied. Use [`ActionReconciliationIterator::state`] to get the + /// shared state for passing to [`CheckpointWriter::finalize`]. /// - /// # Returns: [`CheckpointDataIterator`] containing the checkpoint data - // This method is the core of the checkpoint generation process. It: - // 1. Determines whether to write a V1 or V2 checkpoint based on the table's - // `v2Checkpoints` feature support - // 2. Reads actions from the log segment using the checkpoint read schema - // 3. Filters and deduplicates actions for the checkpoint - // 4. Chains the checkpoint metadata action if writing a V2 spec checkpoint - // (i.e., if `v2Checkpoints` feature is supported by table) - // 5. Generates the appropriate checkpoint path - pub fn checkpoint_data(&self, engine: &dyn Engine) -> DeltaResult { + /// # Engine Usage + /// + /// ```ignore + /// let mut checkpoint_data = writer.checkpoint_data(&engine)?; + /// let state = checkpoint_data.state(); + /// while let Some(batch) = checkpoint_data.next() { + /// let data = batch?.apply_selection_vector()?; + /// parquet_writer.write(&data).await?; + /// } + /// writer.finalize(&engine, &metadata, &state)?; + /// ``` + // Implementation overview: + // 1. Determines whether to write a V1 or V2 checkpoint based on `v2Checkpoints` feature + // 2. Builds a read schema with stats_parsed for COALESCE expressions + // 3. Reads actions from the log segment and deduplicates via reconciliation + // 4. Applies stats transforms (COALESCE/drop) to each reconciled batch + // 5. Chains the checkpoint metadata action for V2 checkpoints + pub fn checkpoint_data( + &self, + engine: &dyn Engine, + ) -> DeltaResult { + let config = StatsTransformConfig::from_table_properties(self.snapshot.table_properties()); + + // Get clustering columns so they are always included in stats per the Delta protocol. + let tc = self.snapshot.table_configuration(); + let physical_clustering_columns = self.snapshot.get_physical_clustering_columns(engine)?; + + // Get stats schema from table configuration. + // This already excludes partition columns and applies column mapping. + let stats_schema = tc + .build_expected_stats_schemas(physical_clustering_columns.as_deref(), None)? 
+ .physical; + + // Select schema based on V2 checkpoint support let is_v2_checkpoints_supported = self .snapshot .table_configuration() - .is_v2_checkpoint_write_supported(); + .is_feature_supported(&TableFeature::V2Checkpoint); - let actions = self.snapshot.log_segment().read_actions( - engine, - CHECKPOINT_ACTIONS_SCHEMA.clone(), - None, - )?; + let base_schema = if is_v2_checkpoints_supported { + &CHECKPOINT_ACTIONS_SCHEMA_V2 + } else { + &CHECKPOINT_ACTIONS_SCHEMA_V1 + }; + + // Build partition schema for partitionValues_parsed (None for non-partitioned tables) + let partition_schema = self + .snapshot + .table_configuration() + .build_partition_values_parsed_schema(); + + // The read schema and output schema differ because the transform needs access to + // both stats formats as input, but may only write one format as output. + // + // read_schema: Always includes both `stats` and `stats_parsed` fields in the Add + // action, so COALESCE expressions can read from either source. For commit files, + // `stats_parsed` doesn't exist and is read as nulls. For partitioned tables, + // `partitionValues_parsed` is also included. + // + // output_schema: Only includes the stats fields that the table config requests + // (e.g., only `stats` if writeStatsAsJson=true and writeStatsAsStruct=false). + let read_schema = + build_checkpoint_read_schema(base_schema, &stats_schema, partition_schema.as_deref())?; + + // Read actions from log segment + let actions = self + .snapshot + .log_segment() + .read_actions(engine, read_schema.clone())?; - // Create iterator over actions for checkpoint data + // Process actions through reconciliation let checkpoint_data = ActionReconciliationProcessor::new( self.deleted_file_retention_timestamp()?, self.get_transaction_expiration_timestamp()?, ) .process_actions_iter(actions); - let checkpoint_metadata = - is_v2_checkpoints_supported.then(|| self.create_checkpoint_metadata_batch(engine)); + let output_schema = self.get_or_init_output_schema(|| { + build_checkpoint_output_schema( + &config, + base_schema, + &stats_schema, + partition_schema.as_deref(), + ) + })?; - // Wrap the iterator in a CheckpointDataIterator to track action counts - Ok(CheckpointDataIterator { - checkpoint_batch_iterator: Box::new(checkpoint_data.chain(checkpoint_metadata)), - actions_count: 0, - add_actions_count: 0, - }) + // Build transform expression and create expression evaluator. + // The transform is applied to reconciled action batches only (not checkpoint metadata). + let transform_expr = + build_checkpoint_transform(&config, &stats_schema, partition_schema.as_ref()); + let evaluator = engine.evaluation_handler().new_expression_evaluator( + read_schema, + transform_expr, + output_schema.clone().into(), + )?; + + // Apply stats transform to each reconciled batch + let transformed = checkpoint_data.map(move |batch_result| { + let batch = batch_result?; + let (data, sv) = batch.filtered_data.into_parts(); + let transformed = evaluator.evaluate(data.as_ref())?; + Ok(ActionReconciliationBatch { + filtered_data: FilteredEngineData::try_new(transformed, sv)?, + actions_count: batch.actions_count, + add_actions_count: batch.add_actions_count, + }) + }); + + // For V2 checkpoints, chain the checkpoint metadata batch after the transformed + // action stream. The metadata batch is created with the output schema directly, + // bypassing the stats transform (it has no add actions to transform). 
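+        // Illustrative shape of the final stream: with v2Checkpoints supported it is
+        //   [transformed batch 1, transformed batch 2, ..., checkpoint metadata batch],
+        // and without it the metadata batch is simply absent (`then` yields `None`).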
+ let checkpoint_metadata = is_v2_checkpoints_supported + .then(|| self.create_checkpoint_metadata_batch(engine, &output_schema)); + + Ok(ActionReconciliationIterator::new(Box::new( + transformed.chain(checkpoint_metadata), + ))) } /// Finalizes checkpoint creation by saving metadata about the checkpoint. @@ -302,7 +400,7 @@ impl CheckpointWriter { /// # Parameters /// - `engine`: Implementation of [`Engine`] apis. /// - `metadata`: The metadata of the written checkpoint file - /// - `checkpoint_data`: The exhausted checkpoint data iterator + /// - `checkpoint_iter_state`: The state of the checkpoint data iterator /// /// # Returns: `Ok` if the checkpoint was successfully finalized // Internally, this method: @@ -313,10 +411,10 @@ impl CheckpointWriter { self, engine: &dyn Engine, metadata: &FileMeta, - mut checkpoint_data: CheckpointDataIterator, + checkpoint_iter_state: &ActionReconciliationIteratorState, ) -> DeltaResult<()> { // Ensure the checkpoint data iterator is fully exhausted - if checkpoint_data.checkpoint_batch_iterator.next().is_some() { + if !checkpoint_iter_state.is_exhausted() { return Err(Error::checkpoint_write( "The checkpoint data iterator must be fully consumed and written to storage before calling finalize" )); @@ -332,8 +430,8 @@ impl CheckpointWriter { let data = create_last_checkpoint_data( engine, self.version, - checkpoint_data.actions_count, - checkpoint_data.add_actions_count, + checkpoint_iter_state.actions_count(), + checkpoint_iter_state.add_actions_count(), size_in_bytes, ); @@ -358,23 +456,46 @@ impl CheckpointWriter { /// /// # Implementation Details /// - /// The function creates a single-row [`EngineData`] batch containing only the - /// version field of the [`CheckpointMetadata`] action. Future implementations will - /// include the additional metadata field `tags` when map support is added. + /// The function creates a single-row [`EngineData`] batch using the output checkpoint + /// schema, with all action fields (add, remove, etc.) set to null except for the + /// `checkpointMetadata` field. This ensures the checkpoint metadata batch has the same + /// schema as other action batches, allowing them to be written to the same Parquet file. + /// + /// The batch is created directly with the output schema and does not go through the stats + /// transform pipeline, since it contains no `add` actions to transform. /// /// # Returns: - /// A [`ActionReconciliationBatch`] batch including the single-row [`EngineData`] batch along with + /// An [`ActionReconciliationBatch`] including the single-row [`EngineData`] batch along with /// an accompanying selection vector with a single `true` value, indicating the action in - /// batch should be included in the checkpoint. + /// the batch should be included in the checkpoint. 
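+    ///
+    /// Illustrative call, as exercised in the unit tests, using the static V2 actions schema:
+    ///
+    /// ```ignore
+    /// let batch = writer.create_checkpoint_metadata_batch(&engine, &CHECKPOINT_ACTIONS_SCHEMA_V2)?;
+    /// assert_eq!(batch.actions_count, 1);
+    /// assert_eq!(batch.add_actions_count, 0);
+    /// ```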
fn create_checkpoint_metadata_batch( &self, engine: &dyn Engine, + schema: &SchemaRef, ) -> DeltaResult { - let checkpoint_metadata_batch = engine.evaluation_handler().create_one( - CHECKPOINT_METADATA_ACTION_SCHEMA.clone(), - &[Scalar::from(self.version)], + // Start with an all-null row + let null_row = engine.evaluation_handler().null_row(schema.clone())?; + + // Build the checkpointMetadata struct value + let checkpoint_metadata_value = Scalar::Struct(StructData::try_new( + vec![StructField::not_null("version", DataType::LONG)], + vec![Scalar::from(self.version)], + )?); + + // Use a Transform to set just the checkpointMetadata field, keeping others null + let transform = Transform::new_top_level().with_replaced_field( + CHECKPOINT_METADATA_NAME, + Arc::new(Expression::literal(checkpoint_metadata_value)), + ); + + let evaluator = engine.evaluation_handler().new_expression_evaluator( + schema.clone(), + Arc::new(Expression::transform(transform)), + schema.clone().into(), )?; + let checkpoint_metadata_batch = evaluator.evaluate(null_row.as_ref())?; + let filtered_data = FilteredEngineData::with_all_rows_selected(checkpoint_metadata_batch); Ok(ActionReconciliationBatch { diff --git a/kernel/src/checkpoint/tests.rs b/kernel/src/checkpoint/tests.rs index 64fca3d2c6..bbe334e85c 100644 --- a/kernel/src/checkpoint/tests.rs +++ b/kernel/src/checkpoint/tests.rs @@ -4,66 +4,55 @@ use crate::action_reconciliation::{ deleted_file_retention_timestamp_with_time, DEFAULT_RETENTION_SECS, }; use crate::actions::{Add, Metadata, Protocol, Remove}; -use crate::arrow::array::{ArrayRef, StructArray}; -use crate::arrow::datatypes::{DataType, Schema}; -use crate::checkpoint::create_last_checkpoint_data; -use crate::engine::arrow_data::ArrowEngineData; -use crate::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; +use crate::arrow::datatypes::DataType; +use crate::arrow::{ + array::{create_array, Array, AsArray, RecordBatch, StructArray}, + datatypes::{Field, Schema}, +}; +use crate::checkpoint::{create_last_checkpoint_data, CHECKPOINT_ACTIONS_SCHEMA_V2}; +use crate::committer::FileSystemCommitter; +use crate::engine::arrow_data::{ArrowEngineData, EngineDataArrowExt}; +use crate::engine::default::executor::tokio::TokioMultiThreadExecutor; +use crate::engine::default::DefaultEngineBuilder; use crate::log_replay::HasSelectionVector; +use crate::object_store::local::LocalFileSystem; +use crate::object_store::{memory::InMemory, path::Path, ObjectStore}; use crate::schema::{DataType as KernelDataType, StructField, StructType}; +use crate::table_features::TableFeature; use crate::utils::test_utils::Action; use crate::{DeltaResult, FileMeta, LogPath, Snapshot}; - -use arrow_56::{ - array::{create_array, RecordBatch}, - datatypes::Field, -}; - -use object_store::{memory::InMemory, path::Path, ObjectStore}; use serde_json::{from_slice, json, Value}; +use tempfile::tempdir; use test_utils::delta_path_for_version; use url::Url; -#[test] -fn test_deleted_file_retention_timestamp() -> DeltaResult<()> { - const MILLIS_PER_SECOND: i64 = 1_000; - +#[rstest::rstest] +#[case::default_retention( + None, + 10_000_000 - (DEFAULT_RETENTION_SECS as i64 * 1_000) +)] +#[case::zero_retention(Some(Duration::from_secs(0)), 10_000_000)] +#[case::custom_retention(Some(Duration::from_secs(2_000)), 10_000_000 - 2_000_000)] +fn test_deleted_file_retention_timestamp( + #[case] retention: Option, + #[case] expected_timestamp: i64, +) -> DeltaResult<()> { let reference_time_secs = 10_000; let reference_time = 
Duration::from_secs(reference_time_secs); - let reference_time_millis = reference_time.as_millis() as i64; - - // Retention scenarios: - // ( retention duration , expected_timestamp ) - let test_cases = [ - // None = Default retention (7 days) - ( - None, - reference_time_millis - (DEFAULT_RETENTION_SECS as i64 * MILLIS_PER_SECOND), - ), - // Zero retention - (Some(Duration::from_secs(0)), reference_time_millis), - // Custom retention (e.g., 2000 seconds) - ( - Some(Duration::from_secs(2_000)), - reference_time_millis - (2_000 * MILLIS_PER_SECOND), - ), - ]; - for (retention, expected_timestamp) in test_cases { - let result = deleted_file_retention_timestamp_with_time(retention, reference_time)?; - assert_eq!(result, expected_timestamp); - } + let result = deleted_file_retention_timestamp_with_time(retention, reference_time)?; + assert_eq!(result, expected_timestamp); Ok(()) } -#[test] -fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { +#[tokio::test] +async fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // 1st commit (version 0) - metadata and protocol actions - // Protocol action does not include the v2Checkpoint reader/writer feature. + // Protocol action includes the v2Checkpoint reader/writer feature. write_commit_to_store( &store, vec![ @@ -71,38 +60,42 @@ fn test_create_checkpoint_metadata_batch() -> DeltaResult<()> { create_metadata_action(), ], 0, - )?; + ) + .await?; let table_root = Url::parse("memory:///")?; let snapshot = Snapshot::builder_for(table_root).build(&engine)?; - let writer = snapshot.checkpoint()?; + let writer = snapshot.create_checkpoint_writer()?; - let checkpoint_batch = writer.create_checkpoint_metadata_batch(&engine)?; + // Use V2 schema for the checkpoint metadata batch + let checkpoint_batch = + writer.create_checkpoint_metadata_batch(&engine, &CHECKPOINT_ACTIONS_SCHEMA_V2)?; assert!(checkpoint_batch.filtered_data.has_selected_rows()); - // Verify the underlying EngineData contains the expected CheckpointMetadata action + // Verify the underlying EngineData contains the expected fields let (underlying_data, _) = checkpoint_batch.filtered_data.into_parts(); let arrow_engine_data = ArrowEngineData::try_from_engine_data(underlying_data)?; let record_batch = arrow_engine_data.record_batch(); - // Build the expected RecordBatch - // Note: The schema is a struct with a single field "checkpointMetadata" of type struct - // containing a single field "version" of type long - let expected_schema = Arc::new(Schema::new(vec![Field::new( - "checkpointMetadata", - DataType::Struct(vec![Field::new("version", DataType::Int64, false)].into()), - true, - )])); - let expected = RecordBatch::try_new( - expected_schema, - vec![Arc::new(StructArray::from(vec![( - Arc::new(Field::new("version", DataType::Int64, false)), - create_array!(Int64, [0]) as ArrayRef, - )]))], - ) - .unwrap(); + // Verify the schema has the expected fields + let schema = record_batch.schema(); + assert!( + schema.field_with_name("checkpointMetadata").is_ok(), + "Schema should have checkpointMetadata field" + ); + assert!( + schema.field_with_name("add").is_ok(), + "Schema should have add field" + ); + assert!( + schema.field_with_name("remove").is_ok(), + "Schema should have remove field" + ); - assert_eq!(*record_batch, expected); + // Verify we have one row + 
assert_eq!(record_batch.num_rows(), 1); + + // Verify action counts assert_eq!(checkpoint_batch.actions_count, 1); assert_eq!(checkpoint_batch.add_actions_count, 0); @@ -116,7 +109,7 @@ fn test_create_last_checkpoint_data() -> DeltaResult<()> { let add_actions_counter = 75; let size_in_bytes: i64 = 1024 * 1024; // 1MB let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create last checkpoint metadata let last_checkpoint_batch = create_last_checkpoint_data( @@ -170,7 +163,7 @@ fn new_in_memory_store() -> (Arc, Url) { /// TODO(#855): Merge copies and move to `test_utils` /// Writes all actions to a _delta_log json commit file in the store. /// This function formats the provided filename into the _delta_log directory. -fn write_commit_to_store( +async fn write_commit_to_store( store: &Arc, actions: Vec, version: u64, @@ -180,28 +173,21 @@ fn write_commit_to_store( .map(|action| serde_json::to_string(&action).expect("action to string")) .collect(); let content = json_lines.join("\n"); - - let commit_path = format!("_delta_log/{}", delta_path_for_version(version, "json")); - - tokio::runtime::Runtime::new() - .expect("create tokio runtime") - .block_on(async { store.put(&Path::from(commit_path), content.into()).await })?; - + let commit_path = delta_path_for_version(version, "json"); + store.put(&commit_path, content.into()).await?; Ok(()) } /// Create a Protocol action without v2Checkpoint feature support fn create_basic_protocol_action() -> Action { Action::Protocol( - Protocol::try_new(3, 7, Some(Vec::::new()), Some(Vec::::new())).unwrap(), + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(), ) } /// Create a Protocol action with v2Checkpoint feature support fn create_v2_checkpoint_protocol_action() -> Action { - Action::Protocol( - Protocol::try_new(3, 7, Some(vec!["v2Checkpoint"]), Some(vec!["v2Checkpoint"])).unwrap(), - ) + Action::Protocol(Protocol::try_new_modern(vec!["v2Checkpoint"], vec!["v2Checkpoint"]).unwrap()) } /// Create a Metadata action @@ -210,7 +196,10 @@ fn create_metadata_action() -> Action { Metadata::try_new( Some("test-table".into()), None, - StructType::new_unchecked([StructField::nullable("value", KernelDataType::INTEGER)]), + Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + KernelDataType::INTEGER, + )])), vec![], 0, HashMap::new(), @@ -219,7 +208,7 @@ fn create_metadata_action() -> Action { ) } -/// Create an Add action with the specified path +/// Create a simple Add action with the specified path (no stats) fn create_add_action(path: &str) -> Action { Action::Add(Add { path: path.into(), @@ -242,14 +231,14 @@ fn create_remove_action(path: &str) -> Action { } /// Helper to verify the contents of the `_last_checkpoint` file -fn assert_last_checkpoint_contents( +async fn assert_last_checkpoint_contents( store: &Arc, expected_version: u64, expected_size: u64, expected_num_add_files: u64, expected_size_in_bytes: u64, ) -> DeltaResult<()> { - let last_checkpoint_data = read_last_checkpoint_file(store)?; + let last_checkpoint_data = read_last_checkpoint_file(store).await?; let expected_data = json!({ "version": expected_version, "size": expected_size, @@ -261,36 +250,39 @@ fn assert_last_checkpoint_contents( } /// Reads the `_last_checkpoint` file from storage -fn read_last_checkpoint_file(store: &Arc) -> DeltaResult { +async fn read_last_checkpoint_file(store: &Arc) 
-> DeltaResult { let path = Path::from("_delta_log/_last_checkpoint"); - let rt = tokio::runtime::Runtime::new().expect("create tokio runtime"); - let byte_data = rt.block_on(async { - let data = store.get(&path).await?; - data.bytes().await - })?; + let data = store.get(&path).await?; + let byte_data = data.bytes().await?; Ok(from_slice(&byte_data)?) } /// Tests the `checkpoint()` API with: /// - A table that does not support v2Checkpoint /// - No version specified (latest version is used) -#[test] -fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { +#[tokio::test] +async fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // 1st commit: adds `fake_path_1` - write_commit_to_store(&store, vec![create_add_action("fake_path_1")], 0)?; + write_commit_to_store( + &store, + vec![create_add_action_with_stats("fake_path_1", 10)], + 0, + ) + .await?; // 2nd commit: adds `fake_path_2` & removes `fake_path_1` write_commit_to_store( &store, vec![ - create_add_action("fake_path_2"), + create_add_action_with_stats("fake_path_2", 20), create_remove_action("fake_path_1"), ], 1, - )?; + ) + .await?; // 3rd commit: metadata & protocol actions // Protocol action does not include the v2Checkpoint reader/writer feature. @@ -298,11 +290,12 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { &store, vec![create_metadata_action(), create_basic_protocol_action()], 2, - )?; + ) + .await?; let table_root = Url::parse("memory:///")?; let snapshot = Snapshot::builder_for(table_root).build(&engine)?; - let writer = snapshot.checkpoint()?; + let writer = snapshot.create_checkpoint_writer()?; // Verify the checkpoint file path is the latest version by default. assert_eq!( @@ -310,7 +303,8 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { Url::parse("memory:///_delta_log/00000000000000000002.checkpoint.parquet")? ); - let mut data_iter = writer.checkpoint_data(&engine)?; + let result = writer.checkpoint_data(&engine)?; + let mut data_iter = result; // The first batch should be the metadata and protocol actions. 
let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector(), &[true, true]); @@ -330,13 +324,13 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { last_modified: 0, size: size_in_bytes, }; - writer.finalize(&engine, &metadata, data_iter)?; + writer.finalize(&engine, &metadata, &data_iter.state())?; // Asserts the checkpoint file contents: // - version: latest version (2) // - size: 1 metadata + 1 protocol + 1 add action + 1 remove action // - numOfAddFiles: 1 add file from 2nd commit (fake_path_2) // - sizeInBytes: passed to finalize (10) - assert_last_checkpoint_contents(&store, 2, 4, 1, size_in_bytes)?; + assert_last_checkpoint_contents(&store, 2, 4, 1, size_in_bytes).await?; Ok(()) } @@ -344,10 +338,10 @@ fn test_v1_checkpoint_latest_version_by_default() -> DeltaResult<()> { /// Tests the `checkpoint()` API with: /// - A table that does not support v2Checkpoint /// - A specific version specified (version 0) -#[test] -fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { +#[tokio::test] +async fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // 1st commit (version 0) - metadata and protocol actions // Protocol action does not include the v2Checkpoint reader/writer feature. @@ -355,24 +349,26 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { &store, vec![create_basic_protocol_action(), create_metadata_action()], 0, - )?; + ) + .await?; // 2nd commit (version 1) - add actions write_commit_to_store( &store, vec![ - create_add_action("file1.parquet"), - create_add_action("file2.parquet"), + create_add_action_with_stats("file1.parquet", 100), + create_add_action_with_stats("file2.parquet", 200), ], 1, - )?; + ) + .await?; let table_root = Url::parse("memory:///")?; // Specify version 0 for checkpoint let snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; - let writer = snapshot.checkpoint()?; + let writer = snapshot.create_checkpoint_writer()?; // Verify the checkpoint file path is the specified version. assert_eq!( @@ -380,7 +376,8 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { Url::parse("memory:///_delta_log/00000000000000000000.checkpoint.parquet")? ); - let mut data_iter = writer.checkpoint_data(&engine)?; + let result = writer.checkpoint_data(&engine)?; + let mut data_iter = result; // The first batch should be the metadata and protocol actions. 
let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector(), &[true, true]); @@ -395,34 +392,35 @@ fn test_v1_checkpoint_specific_version() -> DeltaResult<()> { last_modified: 0, size: size_in_bytes, }; - writer.finalize(&engine, &metadata, data_iter)?; + writer.finalize(&engine, &metadata, &data_iter.state())?; // Asserts the checkpoint file contents: // - version: specified version (0) // - size: 1 metadata + 1 protocol // - numOfAddFiles: no add files in version 0 // - sizeInBytes: passed to finalize (10) - assert_last_checkpoint_contents(&store, 0, 2, 0, size_in_bytes)?; + assert_last_checkpoint_contents(&store, 0, 2, 0, size_in_bytes).await?; Ok(()) } -#[test] -fn test_finalize_errors_if_checkpoint_data_iterator_is_not_exhausted() -> DeltaResult<()> { +#[tokio::test] +async fn test_finalize_errors_if_checkpoint_data_iterator_is_not_exhausted() -> DeltaResult<()> { let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // 1st commit (version 0) - metadata and protocol actions write_commit_to_store( &store, vec![create_basic_protocol_action(), create_metadata_action()], 0, - )?; + ) + .await?; let table_root = Url::parse("memory:///")?; let snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; - let writer = snapshot.checkpoint()?; + let writer = snapshot.create_checkpoint_writer()?; let data_iter = writer.checkpoint_data(&engine)?; /* The returned data iterator has batches that we do not consume */ @@ -436,7 +434,7 @@ fn test_finalize_errors_if_checkpoint_data_iterator_is_not_exhausted() -> DeltaR // Attempt to finalize the checkpoint with an iterator that has not been fully consumed let err = writer - .finalize(&engine, &metadata, data_iter) + .finalize(&engine, &metadata, &data_iter.state()) .expect_err("finalize should fail"); assert!( err.to_string().contains("Error writing checkpoint: The checkpoint data iterator must be fully consumed and written to storage before calling finalize") @@ -448,20 +446,21 @@ fn test_finalize_errors_if_checkpoint_data_iterator_is_not_exhausted() -> DeltaR /// Tests the `checkpoint()` API with: /// - A table that does supports v2Checkpoint /// - No version specified (latest version is used) -#[test] -fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { +#[tokio::test] +async fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // 1st commit: adds `fake_path_2` & removes `fake_path_1` write_commit_to_store( &store, vec![ - create_add_action("fake_path_2"), + create_add_action_with_stats("fake_path_2", 50), create_remove_action("fake_path_1"), ], 0, - )?; + ) + .await?; // 2nd commit: metadata & protocol actions // Protocol action includes the v2Checkpoint reader/writer feature. @@ -472,11 +471,12 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { create_v2_checkpoint_protocol_action(), ], 1, - )?; + ) + .await?; let table_root = Url::parse("memory:///")?; let snapshot = Snapshot::builder_for(table_root).build(&engine)?; - let writer = snapshot.checkpoint()?; + let writer = snapshot.create_checkpoint_writer()?; // Verify the checkpoint file path is the latest version by default. 
assert_eq!( @@ -484,7 +484,8 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { Url::parse("memory:///_delta_log/00000000000000000001.checkpoint.parquet")? ); - let mut data_iter = writer.checkpoint_data(&engine)?; + let result = writer.checkpoint_data(&engine)?; + let mut data_iter = result; // The first batch should be the metadata and protocol actions. let batch = data_iter.next().unwrap()?; assert_eq!(batch.selection_vector(), &[true, true]); @@ -509,43 +510,42 @@ fn test_v2_checkpoint_supported_table() -> DeltaResult<()> { last_modified: 0, size: size_in_bytes, }; - writer.finalize(&engine, &metadata, data_iter)?; + writer.finalize(&engine, &metadata, &data_iter.state())?; // Asserts the checkpoint file contents: // - version: latest version (1) // - size: 1 metadata + 1 protocol + 1 add action + 1 remove action + 1 checkpointMetadata // - numOfAddFiles: 1 add file from version 0 // - sizeInBytes: passed to finalize (10) - assert_last_checkpoint_contents(&store, 1, 5, 1, size_in_bytes)?; + assert_last_checkpoint_contents(&store, 1, 5, 1, size_in_bytes).await?; Ok(()) } -#[test] -fn test_no_checkpoint_staged_commits() -> DeltaResult<()> { +#[tokio::test] +async fn test_no_checkpoint_on_unpublished_snapshot() -> DeltaResult<()> { let (store, _) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // normal commit write_commit_to_store( &store, vec![create_metadata_action(), create_basic_protocol_action()], 0, - )?; + ) + .await?; // staged commit let staged_commit_path = Path::from( "_delta_log/_staged_commits/00000000000000000001.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json", ); - futures::executor::block_on(async { - let add_action = Action::Add(Add::default()); - store - .put( - &staged_commit_path, - serde_json::to_string(&add_action).unwrap().into(), - ) - .await - .unwrap() - }); + let add_action = Action::Add(Add::default()); + store + .put( + &staged_commit_path, + serde_json::to_string(&add_action).unwrap().into(), + ) + .await + .unwrap(); let table_root = Url::parse("memory:///")?; let staged_commit = FileMeta { @@ -558,8 +558,581 @@ fn test_no_checkpoint_staged_commits() -> DeltaResult<()> { .build(&engine)?; assert!(matches!( - snapshot.checkpoint().unwrap_err(), - crate::Error::Generic(e) if e == "Found staged commit file in log segment" + snapshot.create_checkpoint_writer().unwrap_err(), + crate::Error::Generic(e) if e == "Log segment is not published" )); Ok(()) } + +/// Create an Add action with JSON stats +fn create_add_action_with_stats(path: &str, num_records: i64) -> Action { + let stats = format!( + r#"{{"numRecords":{num_records},"minValues":{{"id":1,"name":"alice"}},"maxValues":{{"id":100,"name":"zoe"}},"nullCount":{{"id":0,"name":5}}}}"# + ); + Action::Add(Add { + path: path.into(), + data_change: true, + stats: Some(stats), + ..Default::default() + }) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_snapshot_checkpoint() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(); + + // Version 0: metadata & protocol + write_commit_to_store( + &store, + vec![create_metadata_action(), create_basic_protocol_action()], + 0, + ) + .await?; + + // Version 1: add 3 files + write_commit_to_store( + 
&store, + vec![ + create_add_action("file1.parquet"), + create_add_action("file2.parquet"), + create_add_action("file3.parquet"), + ], + 1, + ) + .await?; + + // Version 2: add 2 more files, remove 1 + write_commit_to_store( + &store, + vec![ + create_add_action("file4.parquet"), + create_add_action("file5.parquet"), + create_remove_action("file1.parquet"), + ], + 2, + ) + .await?; + + // Version 3: add 1 file, remove 2 + write_commit_to_store( + &store, + vec![ + create_add_action("file6.parquet"), + create_remove_action("file2.parquet"), + create_remove_action("file3.parquet"), + ], + 3, + ) + .await?; + + // Version 4: add 2 files + write_commit_to_store( + &store, + vec![ + create_add_action("file7.parquet"), + create_add_action("file8.parquet"), + ], + 4, + ) + .await?; + + let table_root = Url::parse("memory:///")?; + let snapshot = Snapshot::builder_for(table_root.clone()).build(&engine)?; + + snapshot.checkpoint(&engine)?; + + // First checkpoint: 1 metadata + 1 protocol + 5 add + 3 remove = 10, numOfAddFiles = 5 + let checkpoint_path = Path::from("_delta_log/00000000000000000004.checkpoint.parquet"); + let checkpoint_size = store.head(&checkpoint_path).await?.size; + assert_last_checkpoint_contents(&store, 4, 10, 5, checkpoint_size).await?; + + // Version 5: add 2 files, remove 1 + write_commit_to_store( + &store, + vec![ + create_add_action("file9.parquet"), + create_add_action("file10.parquet"), + create_remove_action("file4.parquet"), + ], + 5, + ) + .await?; + + // Version 6: add 1 file + write_commit_to_store(&store, vec![create_add_action("file11.parquet")], 6).await?; + + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; + + snapshot.checkpoint(&engine)?; + + // Second checkpoint: 1 metadata + 1 protocol + 7 add + 4 remove = 13, numOfAddFiles = 7 + let checkpoint_path = Path::from("_delta_log/00000000000000000006.checkpoint.parquet"); + let checkpoint_size = store.head(&checkpoint_path).await?.size; + assert_last_checkpoint_contents(&store, 6, 13, 7, checkpoint_size).await?; + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_preserves_domain_metadata() -> DeltaResult<()> { + // ===== Setup ===== + let tmp_dir = tempdir().unwrap(); + let table_path = tmp_dir.path(); + let table_url = Url::from_directory_path(table_path).unwrap(); + std::fs::create_dir_all(table_path.join("_delta_log")).unwrap(); + + // ===== Create Table ===== + let commit0 = [ + json!({ + "protocol": { + "minReaderVersion": 3, + "minWriterVersion": 7, + "readerFeatures": [], + "writerFeatures": ["domainMetadata"] + } + }), + json!({ + "metaData": { + "id": "test-table-id", + "format": { "provider": "parquet", "options": {} }, + "schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 1587968585495i64 + } + }), + ] + .map(|j| j.to_string()) + .join("\n"); + std::fs::write( + table_path.join("_delta_log/00000000000000000000.json"), + commit0, + ) + .unwrap(); + + // ===== Create Engine ===== + let store = Arc::new(LocalFileSystem::new()); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(); + + let commit_domain_metadata = |domain: &str, value: &str| -> DeltaResult<()> { + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + let txn = 
snapshot.transaction(Box::new(FileSystemCommitter::new()), &engine)?; + let result = txn + .with_domain_metadata(domain.to_string(), value.to_string()) + .commit(&engine)?; + assert!(result.is_committed()); + Ok(()) + }; + + // ===== Commit Domain Metadata ===== + commit_domain_metadata("foo", "bar1")?; + commit_domain_metadata("foo", "bar2")?; + + // ===== Case 1: Verify domain metadata is preserved *before* checkpoint ===== + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + assert_eq!(snapshot.version(), 2); + let domain_value = snapshot.get_domain_metadata("foo", &engine)?; + assert_eq!(domain_value, Some("bar2".to_string())); + + // Trigger checkpoint + snapshot.checkpoint(&engine)?; + + // ===== Case 2: Verify domain metadata is preserved *after* checkpoint ===== + let snapshot = Snapshot::builder_for(table_url) + .at_version(2) + .build(&engine)?; + let domain_value = snapshot.get_domain_metadata("foo", &engine)?; + assert_eq!(domain_value, Some("bar2".to_string())); + + Ok(()) +} + +// TODO: Add test that checkpoint does not contain tombstoned domain metadata. + +/// Helper to create metadata action with specific stats settings +fn create_metadata_with_stats_config( + write_stats_as_json: bool, + write_stats_as_struct: bool, +) -> Action { + create_metadata_with_stats_config_and_partitions( + write_stats_as_json, + write_stats_as_struct, + vec![], + ) +} + +/// Helper to create metadata action with stats settings and partition columns +fn create_metadata_with_stats_config_and_partitions( + write_stats_as_json: bool, + write_stats_as_struct: bool, + partition_columns: Vec, +) -> Action { + let config = HashMap::from([ + ( + "delta.checkpoint.writeStatsAsJson".to_string(), + write_stats_as_json.to_string(), + ), + ( + "delta.checkpoint.writeStatsAsStruct".to_string(), + write_stats_as_struct.to_string(), + ), + ]); + Action::Metadata( + Metadata::try_new( + Some("test-table".into()), + None, + StructType::new_unchecked([ + StructField::nullable("id", KernelDataType::LONG), + StructField::nullable("name", KernelDataType::STRING), + StructField::nullable("category", KernelDataType::STRING), + ]) + .into(), + partition_columns, + 0, + config, + ) + .unwrap(), + ) +} + +/// Verifies checkpoint schema has expected fields based on stats configuration. +/// Non-partitioned tables should never have `partitionValues_parsed`. +fn verify_checkpoint_schema( + schema: &Schema, + expect_stats: bool, + expect_stats_parsed: bool, +) -> DeltaResult<()> { + verify_checkpoint_schema_with_partitions(schema, expect_stats, expect_stats_parsed, false) +} + +/// Verifies checkpoint schema has expected fields based on stats and partition configuration. 
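+///
+/// Example call (matching the partitioned round-trip test below): for a partitioned table,
+/// `partitionValues_parsed` is expected iff `writeStatsAsStruct` is enabled.
+///
+/// ```ignore
+/// verify_checkpoint_schema_with_partitions(&record_batch.schema(), json2, struct2, struct2)?;
+/// ```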
+fn verify_checkpoint_schema_with_partitions( + schema: &Schema, + expect_stats: bool, + expect_stats_parsed: bool, + expect_partition_values_parsed: bool, +) -> DeltaResult<()> { + let add_field = schema + .field_with_name("add") + .expect("schema should have 'add' field"); + + if let DataType::Struct(add_fields) = add_field.data_type() { + let has_stats = add_fields.iter().any(|f| f.name() == "stats"); + let has_stats_parsed = add_fields.iter().any(|f| f.name() == "stats_parsed"); + let has_pv_parsed = add_fields + .iter() + .any(|f| f.name() == "partitionValues_parsed"); + + assert_eq!( + has_stats, expect_stats, + "stats field: expected={expect_stats}, actual={has_stats}" + ); + assert_eq!( + has_stats_parsed, expect_stats_parsed, + "stats_parsed field: expected={expect_stats_parsed}, actual={has_stats_parsed}" + ); + assert_eq!( + has_pv_parsed, expect_partition_values_parsed, + "partitionValues_parsed field: expected={expect_partition_values_parsed}, actual={has_pv_parsed}" + ); + } else { + panic!("add field should be a struct"); + } + Ok(()) +} + +/// Tests all 16 combinations of writeStatsAsJson and writeStatsAsStruct settings with a +/// full round-trip through parquet. +/// +/// For each combination (json1, struct1, json2, struct2): +/// 1. Writes checkpoint 1 to parquet with (json1, struct1) settings +/// 2. Changes config to (json2, struct2) +/// 3. Reads from checkpoint 1 to produce checkpoint 2 data, exercising COALESCE paths +/// (e.g., recovering stats from stats_parsed via ToJson, or vice versa) +/// 4. Verifies checkpoint 2 schema matches (json2, struct2) +#[rstest::rstest] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_stats_config_round_trip( + #[values(true, false)] json1: bool, + #[values(true, false)] struct1: bool, + #[values(true, false)] json2: bool, + #[values(true, false)] struct2: bool, +) -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(); + let table_root = Url::parse("memory:///")?; + + // Commit 0: protocol + metadata with initial settings + write_commit_to_store( + &store, + vec![ + create_basic_protocol_action(), + create_metadata_with_stats_config(json1, struct1), + ], + 0, + ) + .await?; + + // Commit 1: add action with JSON stats + write_commit_to_store( + &store, + vec![create_add_action_with_stats("file1.parquet", 100)], + 1, + ) + .await?; + + // Write checkpoint 1 to parquet with (json1, struct1) settings + let snapshot1 = Snapshot::builder_for(table_root.clone()).build(&engine)?; + snapshot1.checkpoint(&engine)?; + + // Commit 2: update metadata with new settings + write_commit_to_store( + &store, + vec![create_metadata_with_stats_config(json2, struct2)], + 2, + ) + .await?; + + // Build snapshot that reads from checkpoint 1 + commit 2. + // The add action for file1.parquet comes from checkpoint 1, so the COALESCE + // expressions must recover stats across format changes. 
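+    // For instance, with (json1=false, struct1=true) checkpoint 1 only carries `stats_parsed`;
+    // if json2=true, checkpoint 2 must rebuild the JSON `stats` column from it (and the
+    // reverse recovery happens when the settings are swapped).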
+ let snapshot2 = Snapshot::builder_for(table_root).build(&engine)?; + let writer2 = snapshot2.create_checkpoint_writer()?; + let mut result2 = writer2.checkpoint_data(&engine)?; + + // Verify checkpoint 2 schema matches new settings + let first_batch = result2.next().expect("should have at least one batch")?; + let data = first_batch.apply_selection_vector()?; + let record_batch = data.try_into_record_batch()?; + verify_checkpoint_schema(&record_batch.schema(), json2, struct2)?; + + // Consume remaining batches (verifies COALESCE doesn't error) + for batch in result2 { + let _ = batch?; + } + + Ok(()) +} + +/// Same as `test_stats_config_round_trip` but with a partitioned table. +/// Verifies that `partitionValues_parsed` is included in the checkpoint schema when +/// `writeStatsAsStruct` is true, and omitted when false. +#[rstest::rstest] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_stats_config_round_trip_partitioned( + #[values(true, false)] json1: bool, + #[values(true, false)] struct1: bool, + #[values(true, false)] json2: bool, + #[values(true, false)] struct2: bool, +) -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(); + let table_root = Url::parse("memory:///")?; + + // Commit 0: protocol + partitioned metadata with initial settings + write_commit_to_store( + &store, + vec![ + create_basic_protocol_action(), + create_metadata_with_stats_config_and_partitions( + json1, + struct1, + vec!["category".into()], + ), + ], + 0, + ) + .await?; + + // Commit 1: add action with partition values and JSON stats + let mut add = Add { + path: "category=books/file1.parquet".into(), + data_change: true, + stats: Some( + r#"{"numRecords":100,"minValues":{"id":1,"name":"alice"},"maxValues":{"id":100,"name":"zoe"},"nullCount":{"id":0,"name":5}}"#.into(), + ), + ..Default::default() + }; + add.partition_values + .insert("category".into(), "books".into()); + write_commit_to_store(&store, vec![Action::Add(add)], 1).await?; + + // Write checkpoint 1 with (json1, struct1) settings + let snapshot1 = Snapshot::builder_for(table_root.clone()).build(&engine)?; + snapshot1.checkpoint(&engine)?; + + // Commit 2: update metadata with new settings + write_commit_to_store( + &store, + vec![create_metadata_with_stats_config_and_partitions( + json2, + struct2, + vec!["category".into()], + )], + 2, + ) + .await?; + + // Build snapshot that reads from checkpoint 1 + commit 2 + let snapshot2 = Snapshot::builder_for(table_root).build(&engine)?; + let writer2 = snapshot2.create_checkpoint_writer()?; + let result2 = writer2.checkpoint_data(&engine)?; + + // Collect all checkpoint batches + let mut all_batches = Vec::new(); + for batch_result in result2 { + let batch = batch_result?; + let data = batch.apply_selection_vector()?; + all_batches.push(data.try_into_record_batch()?); + } + + // Verify checkpoint schema matches new settings + verify_checkpoint_schema_with_partitions( + &all_batches[0].schema(), + json2, + struct2, + struct2, // partitionValues_parsed present iff writeStatsAsStruct=true + )?; + + // When writeStatsAsStruct=true, verify partitionValues_parsed contains correct values + if struct2 { + let mut found_add = false; + for record_batch in &all_batches { + let add_col = record_batch + .column_by_name("add") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + 
for row in 0..record_batch.num_rows() { + if !add_col.is_valid(row) { + continue; + } + found_add = true; + let pv_parsed = add_col + .column_by_name("partitionValues_parsed") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let category_col = pv_parsed + .column_by_name("category") + .expect("partitionValues_parsed should have category field"); + assert_eq!(category_col.as_string::().value(row), "books"); + } + } + assert!(found_add, "should have found an add action"); + } + + Ok(()) +} + +// This tests that we can change the metadata of a schema field in between checkpoints and still +// manage to checkpoint, with parsed stats enabled. +// The checkpoint at version 0 is written with a schema without field metadata, so its +// stats_parsed nullCount fields are plain Int64. Then a new metadata action at version 1 +// adds `__CHAR_VARCHAR_TYPE_STRING` to the "name" field. When checkpointing version 1, +// the kernel builds a stats schema with that metadata on nullCount fields (via +// NullCountStatsTransform), but the stats_parsed data from the old checkpoint lacks it, +// causing an Arrow schema mismatch in the COALESCE expression. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_with_varchar_metadata_on_field() -> DeltaResult<()> { + let (store, _) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(); + + let config = HashMap::from([ + ("delta.checkpoint.writeStatsAsJson".into(), "true".into()), + ("delta.checkpoint.writeStatsAsStruct".into(), "true".into()), + ]); + + // Version 0: schema WITHOUT __CHAR_VARCHAR_TYPE_STRING + add with stats + let schema_v0 = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", KernelDataType::LONG), + StructField::nullable("name", KernelDataType::STRING), + ])); + write_commit_to_store( + &store, + vec![ + create_basic_protocol_action(), + Action::Metadata( + Metadata::try_new( + Some("test".into()), + None, + schema_v0, + vec![], + 0, + config.clone(), + ) + .unwrap(), + ), + Action::Add(Add { + path: "file1.parquet".into(), + data_change: true, + stats: Some( + r#"{"numRecords":10,"minValues":{"id":1,"name":"alice"},"maxValues":{"id":100,"name":"zoe"},"nullCount":{"id":0,"name":2}}"#.into(), + ), + ..Default::default() + }), + ], + 0, + ) + .await?; + + // Checkpoint version 0: stats_parsed nullCount fields are plain Int64 (no metadata) + let table_root = Url::parse("memory:///")?; + Snapshot::builder_for(table_root.clone()) + .build(&engine)? + .checkpoint(&engine)?; + + // Version 1: new metadata WITH __CHAR_VARCHAR_TYPE_STRING on the "name" field + let schema_v1 = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", KernelDataType::LONG), + StructField::nullable("name", KernelDataType::STRING).with_metadata([( + "__CHAR_VARCHAR_TYPE_STRING", + crate::schema::MetadataValue::String("varchar(255)".to_string()), + )]), + ])); + write_commit_to_store( + &store, + vec![Action::Metadata( + Metadata::try_new(Some("test".into()), None, schema_v1, vec![], 0, config).unwrap(), + )], + 1, + ) + .await?; + + // Checkpoint version 1: the add from checkpoint 0 has stats_parsed with nullCount fields + // lacking metadata. Ensure our checkpointing drops the new metadata for the stats fields and + // doesn't see a mismatch + Snapshot::builder_for(table_root) + .build(&engine)? 
+ .checkpoint(&engine)?; + + Ok(()) +} diff --git a/kernel/src/clustering.rs b/kernel/src/clustering.rs new file mode 100644 index 0000000000..ffd830c5a6 --- /dev/null +++ b/kernel/src/clustering.rs @@ -0,0 +1,429 @@ +//! Clustering column support for Delta tables. +//! +//! This module provides functionality for reading and writing clustering columns +//! via domain metadata. Per the Delta protocol, writers MUST write per-file statistics +//! for clustering columns. +//! +//! Clustering columns are stored in domain metadata under the `delta.clustering` domain +//! as a JSON object with a `clusteringColumns` field containing an array of column paths, +//! where each path is an array of field names (to handle nested columns). + +use serde::{Deserialize, Serialize}; + +use crate::actions::DomainMetadata; +use crate::expressions::ColumnName; +use crate::scan::data_skipping::stats_schema::is_skipping_eligible_datatype; +use crate::schema::{DataType, StructType}; +use crate::{DeltaResult, Error}; + +/// Domain metadata structure for clustering columns. +/// +/// This is deserialized from the JSON configuration stored in the +/// `delta.clustering` domain metadata. Each clustering column is represented +/// as an array of field names to support nested columns. +/// +/// The column names are physical names. If column mapping is enabled, these will be +/// the physical column identifiers (e.g., `col-uuid`); otherwise, they match the logical names. +/// +/// Example JSON: +/// ```json +/// {"clusteringColumns": [["col1"], ["user", "address", "city"]]} +/// ``` +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct ClusteringDomainMetadata { + clustering_columns: Vec>, +} + +/// The domain name for clustering metadata. +pub(crate) const CLUSTERING_DOMAIN_NAME: &str = "delta.clustering"; + +/// Validates clustering columns against the table schema. +/// +/// This function performs comprehensive validation of clustering columns: +/// +/// **Structural validations:** +/// 1. At least one column must be specified +/// 2. No duplicate columns +/// +/// **Schema validations:** +/// 3. Column paths must resolve in the schema (including nested paths through structs) +/// 4. Leaf field must have a data type eligible for statistics collection +/// +/// Both top-level and nested columns are supported. For nested columns, all intermediate +/// fields must be struct types and the leaf field must be a stats-eligible primitive. +/// +/// # Errors +/// +/// Returns an error if any validation fails. +pub(crate) fn validate_clustering_columns( + schema: &StructType, + columns: &[ColumnName], +) -> DeltaResult<()> { + use std::collections::HashSet; + + // Structural validation: at least one column required + if columns.is_empty() { + return Err(Error::generic("Clustering requires at least one column")); + } + + // Validate each column and check for duplicates + let mut seen = HashSet::new(); + for col in columns { + if !seen.insert(col) { + return Err(Error::generic(format!( + "Duplicate clustering column: '{col}'" + ))); + } + + // Walk the column path through nested structs and validate the leaf type. + // walk_column_fields validates: non-empty path, each field exists, intermediates are structs. + let fields = schema.walk_column_fields(col)?; + let leaf_type = fields + .last() + .ok_or_else(|| Error::generic(format!("Could not resolve column '{col}' in schema")))? 
+ .data_type(); + match leaf_type { + DataType::Primitive(ptype) if is_skipping_eligible_datatype(ptype) => {} + dt => { + return Err(Error::generic(format!( + "Clustering column '{col}' has unsupported type '{dt}'. \ + Supported types: Byte, Short, Integer, Long, Float, Double, \ + Decimal, Date, Timestamp, TimestampNtz, String" + ))); + } + } + } + Ok(()) +} + +/// Creates domain metadata for clustering configuration. +/// +/// Converts the given clustering columns into the JSON format required by the Delta protocol +/// and wraps it in a `DomainMetadata` action. +/// +/// # Format +/// +/// The JSON format is: `{"clusteringColumns": [["col1"], ["col2"]]}` +/// Each column is represented as an array of path components to support nested columns. +pub(crate) fn create_clustering_domain_metadata(columns: &[ColumnName]) -> DomainMetadata { + let metadata = ClusteringDomainMetadata { + clustering_columns: columns + .iter() + .map(|c| c.path().iter().map(|s| s.to_string()).collect()) + .collect(), + }; + // ClusteringDomainMetadata serialization cannot fail (only contains Vec>) + #[allow(clippy::unwrap_used)] + let config = serde_json::to_string(&metadata).unwrap(); + + DomainMetadata::new(CLUSTERING_DOMAIN_NAME.to_string(), config) +} + +/// Parses clustering columns from a JSON configuration string. +/// +/// Returns `Ok(columns)` if the configuration is valid, or an error if malformed. +pub(crate) fn parse_clustering_columns(json_str: &str) -> DeltaResult> { + let metadata: ClusteringDomainMetadata = serde_json::from_str(json_str)?; + Ok(metadata + .clustering_columns + .into_iter() + .map(ColumnName::new) + .collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::{DataType, StructField}; + + #[rstest::rstest] + #[case::simple( + r#"{"clusteringColumns": [["col1"], ["col2"]]}"#, + vec![vec!["col1"], vec!["col2"]] + )] + #[case::empty( + r#"{"clusteringColumns": []}"#, + vec![] + )] + #[case::nested( + r#"{"clusteringColumns": [["id"], ["user", "address", "city"], ["a", "b", "c", "d", "e"]]}"#, + vec![vec!["id"], vec!["user", "address", "city"], vec!["a", "b", "c", "d", "e"]] + )] + #[case::special_characters( + r#"{"clusteringColumns": [["col.with.dot"], ["`backticks`", "nested"]]}"#, + vec![vec!["col.with.dot"], vec!["`backticks`", "nested"]] + )] + #[case::tolerates_unknown_fields( + r#"{"clusteringColumns": [["col1"]], "foo": "bar", "futureField": 123}"#, + vec![vec!["col1"]] + )] + fn test_parse_clustering_columns(#[case] json: &str, #[case] expected: Vec>) { + let columns = parse_clustering_columns(json).unwrap(); + let expected_cols: Vec = expected.into_iter().map(ColumnName::new).collect(); + assert_eq!(columns, expected_cols); + } + + #[test] + fn test_validate_clustering_columns_valid() { + let schema = StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]); + let columns = vec![ColumnName::new(["id"])]; + assert!(validate_clustering_columns(&schema, &columns).is_ok()); + } + + #[test] + fn test_validate_clustering_columns_not_found() { + let schema = + StructType::new_unchecked(vec![StructField::new("id", DataType::INTEGER, false)]); + let columns = vec![ColumnName::new(["nonexistent"])]; + let result = validate_clustering_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("not found in schema")); + } + + #[test] + fn test_validate_clustering_columns_nested_valid() { + let address_struct = 
StructType::new_unchecked(vec![ + StructField::new("city", DataType::STRING, true), + StructField::new("zip", DataType::STRING, true), + ]); + let user_struct = StructType::new_unchecked(vec![ + StructField::new("name", DataType::STRING, true), + StructField::new("address", DataType::Struct(Box::new(address_struct)), true), + ]); + let schema = StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("user", DataType::Struct(Box::new(user_struct)), true), + ]); + + // Nested leaf column with eligible type should succeed + let columns = vec![ColumnName::new(["user", "address", "city"])]; + assert!(validate_clustering_columns(&schema, &columns).is_ok()); + } + + #[test] + fn test_validate_clustering_nested_struct_leaf_rejected() { + let inner_struct = + StructType::new_unchecked(vec![StructField::new("field", DataType::STRING, false)]); + let schema = StructType::new_unchecked(vec![StructField::new( + "parent", + DataType::Struct(Box::new(inner_struct)), + false, + )]); + + // Clustering on an entire struct (not a leaf primitive) should fail + let columns = vec![ColumnName::new(["parent"])]; + let result = validate_clustering_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("unsupported type")); + } + + #[test] + fn test_validate_clustering_nested_intermediate_not_struct() { + let schema = + StructType::new_unchecked(vec![StructField::new("flat_col", DataType::STRING, false)]); + + // Trying to traverse into a non-struct field should fail + let columns = vec![ColumnName::new(["flat_col", "child"])]; + let result = validate_clustering_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is not a struct type")); + } + + #[test] + fn test_validate_clustering_nested_path_not_found() { + let inner_struct = + StructType::new_unchecked(vec![StructField::new("field", DataType::STRING, false)]); + let schema = StructType::new_unchecked(vec![StructField::new( + "parent", + DataType::Struct(Box::new(inner_struct)), + false, + )]); + + // Nested field that doesn't exist should fail + let columns = vec![ColumnName::new(["parent", "nonexistent"])]; + let result = validate_clustering_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("not found in schema")); + } + + #[test] + fn test_create_clustering_domain_metadata() { + let columns = vec![ColumnName::new(["col1"]), ColumnName::new(["col2"])]; + let dm = create_clustering_domain_metadata(&columns); + + assert_eq!(dm.domain(), CLUSTERING_DOMAIN_NAME); + + // Verify roundtrip: the JSON we create should be parseable back + let parsed = parse_clustering_columns(dm.configuration()).unwrap(); + assert_eq!(parsed, columns); + } + + #[test] + fn test_create_and_parse_roundtrip() { + // Test that create and parse are inverses + let original = vec![ + ColumnName::new(["id"]), + ColumnName::new(["timestamp"]), + ColumnName::new(["region"]), + ]; + let dm = create_clustering_domain_metadata(&original); + let parsed = parse_clustering_columns(dm.configuration()).unwrap(); + assert_eq!(original, parsed); + } + + #[test] + fn test_validate_clustering_columns_supported_types() { + // All supported primitive types + let schema = StructType::new_unchecked(vec![ + StructField::new("byte_col", DataType::BYTE, false), + StructField::new("short_col", DataType::SHORT, false), + StructField::new("int_col", DataType::INTEGER, false), + 
StructField::new("long_col", DataType::LONG, false), + StructField::new("float_col", DataType::FLOAT, false), + StructField::new("double_col", DataType::DOUBLE, false), + StructField::new("date_col", DataType::DATE, false), + StructField::new("timestamp_col", DataType::TIMESTAMP, false), + StructField::new("timestamp_ntz_col", DataType::TIMESTAMP_NTZ, false), + StructField::new("string_col", DataType::STRING, false), + StructField::new("decimal_col", DataType::decimal(10, 2).unwrap(), false), + ]); + + // Each supported type should be valid for clustering + for field in schema.fields() { + let columns = vec![ColumnName::new([field.name()])]; + assert!( + validate_clustering_columns(&schema, &columns).is_ok(), + "Type {} should be supported for clustering", + field.data_type() + ); + } + } + + #[test] + fn test_validate_clustering_columns_unsupported_primitive_types() { + // Boolean and Binary are primitives but not supported for clustering + let schema = StructType::new_unchecked(vec![ + StructField::new("bool_col", DataType::BOOLEAN, false), + StructField::new("binary_col", DataType::BINARY, false), + ]); + + for field in schema.fields() { + let columns = vec![ColumnName::new([field.name()])]; + let result = validate_clustering_columns(&schema, &columns); + assert!( + result.is_err(), + "Type {} should NOT be supported for clustering", + field.data_type() + ); + assert!(result.unwrap_err().to_string().contains("unsupported type")); + } + } + + #[test] + fn test_validate_clustering_columns_complex_types_rejected() { + use crate::schema::{ArrayType, MapType}; + + let inner_struct = + StructType::new_unchecked(vec![StructField::new("inner", DataType::STRING, false)]); + + let schema = StructType::new_unchecked(vec![ + StructField::new( + "struct_col", + DataType::Struct(Box::new(inner_struct)), + false, + ), + StructField::new( + "array_col", + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + false, + ), + StructField::new( + "map_col", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::INTEGER, + false, + ))), + false, + ), + ]); + + for field in schema.fields() { + let columns = vec![ColumnName::new([field.name()])]; + let result = validate_clustering_columns(&schema, &columns); + assert!( + result.is_err(), + "Complex type {} should NOT be supported for clustering", + field.data_type() + ); + assert!(result.unwrap_err().to_string().contains("unsupported type")); + } + } + + // Structural validation tests - parameterized with rstest + + /// Test that any number of clustering columns is allowed (no protocol-imposed limit). + #[rstest::rstest] + #[case::four(4)] + #[case::five(5)] + #[case::ten(10)] + fn test_validate_clustering_column_count(#[case] num_columns: usize) { + let fields: Vec = (0..num_columns) + .map(|i| StructField::new(format!("col{i}"), DataType::INTEGER, false)) + .collect(); + let schema = StructType::new_unchecked(fields); + + let columns: Vec = (0..num_columns) + .map(|i| ColumnName::new([format!("col{i}")])) + .collect(); + + assert!(validate_clustering_columns(&schema, &columns).is_ok()); + } + + /// Test various structural validation error cases. 
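// A minimal sketch of the serde model assumed to sit behind the clustering domain metadata JSON
// handled above by `create_clustering_domain_metadata` / `parse_clustering_columns`
// ({"clusteringColumns": [["col1"], ["col2"]]}). The real `ClusteringDomainMetadata` struct is
// defined elsewhere in this file; the name, derives, and camelCase rename below are illustrative
// assumptions. Serde's default of ignoring unknown fields matches the `tolerates_unknown_fields`
// case above.
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
struct ClusteringDomainMetadataSketch {
    // One entry per clustering column; each entry is the column's path components,
    // which is how nested columns like ["user", "address", "city"] are represented.
    clustering_columns: Vec<Vec<String>>,
}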
+ #[rstest::rstest] + #[case::empty_columns(vec![], "at least one column")] + #[case::duplicate_columns(vec!["id", "id"], "Duplicate clustering column")] + fn test_validate_clustering_structural_errors( + #[case] column_names: Vec<&str>, + #[case] expected_error: &str, + ) { + let schema = + StructType::new_unchecked(vec![StructField::new("id", DataType::INTEGER, false)]); + let columns: Vec = column_names + .into_iter() + .map(|s| ColumnName::new([s])) + .collect(); + + let result = validate_clustering_columns(&schema, &columns); + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains(expected_error), + "Expected error containing '{expected_error}'" + ); + } + + #[test] + fn test_validate_clustering_columns_empty_name_rejected() { + let schema = + StructType::new_unchecked(vec![StructField::new("id", DataType::INTEGER, false)]); + // Create a ColumnName with empty path (can't easily express in rstest case) + let columns: Vec = vec![ColumnName::new(Vec::::new())]; + let result = validate_clustering_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("cannot be empty")); + } +} diff --git a/kernel/src/column_trie.rs b/kernel/src/column_trie.rs new file mode 100644 index 0000000000..4b8b5c2b78 --- /dev/null +++ b/kernel/src/column_trie.rs @@ -0,0 +1,147 @@ +//! A trie (prefix tree) for efficient column path matching. +//! +//! Used to quickly determine if a column path matches or is a descendant of any +//! user-specified column. This provides O(path_length) lookup instead of +//! O(num_specified_columns * path_length). + +use std::collections::HashMap; + +use crate::expressions::ColumnName; + +/// A trie (prefix tree) for efficient column path matching. +/// +/// The lifetime `'col` ties this trie to the column names it was built from, +/// allowing it to borrow string slices instead of cloning. +/// +/// The `Default` implementation creates an empty trie node with no children and +/// `is_terminal = false`. This is used both for creating a new root trie and for +/// creating intermediate nodes during insertion (via `or_default()`). +#[derive(Debug, Default)] +pub(crate) struct ColumnTrie<'col> { + children: HashMap<&'col str, ColumnTrie<'col>>, + /// True if this node represents the end of a specified column path. + /// Intermediate nodes have `is_terminal = false`; only the final node of + /// an inserted column path has `is_terminal = true`. + is_terminal: bool, +} + +impl<'col> ColumnTrie<'col> { + /// Creates an empty trie. + pub(crate) fn new() -> Self { + Self::default() + } + + /// Builds a trie from a list of column names. + /// + /// For example, `from_columns(&[column_name!("a.b"), column_name!("a.c")])` creates: + /// ```text + /// root (is_terminal=false) + /// └── "a" (is_terminal=false) + /// ├── "b" (is_terminal=true) + /// └── "c" (is_terminal=true) + /// ``` + pub(crate) fn from_columns(columns: &'col [ColumnName]) -> Self { + let mut trie = Self::new(); + for column in columns { + trie.insert(column); + } + trie + } + + /// Inserts a column path into the trie. + /// + /// Walks down the trie for each path component, creating nodes as needed via `or_default()` + /// (which initializes `is_terminal = false`). After the loop, only the final node is marked + /// as terminal. 
+ /// + /// For example, inserting `a.b.c` creates: + /// ```text + /// root (is_terminal=false) + /// └── "a" (is_terminal=false) + /// └── "b" (is_terminal=false) + /// └── "c" (is_terminal=true) + /// ``` + pub(crate) fn insert(&mut self, column: &'col ColumnName) { + let mut node = self; + for part in column.iter() { + node = node.children.entry(part.as_str()).or_default(); + } + node.is_terminal = true; + } + + /// Returns true if `path` equals or is a descendant of any inserted column. + /// + /// For example, if the trie contains `["a", "b"]`: + /// - `["a", "b"]` → true (exact match) + /// - `["a", "b", "c"]` → true (descendant) + /// - `["a"]` → false (ancestor, not descendant) + /// - `["a", "x"]` → false (divergent path) + pub(crate) fn contains_prefix_of(&self, path: &[String]) -> bool { + let mut node = self; + for part in path { + if node.is_terminal { + // We've matched a complete specified column, and path continues. + // So path is a descendant of this specified column. + return true; + } + match node.children.get(part.as_str()) { + Some(child) => node = child, + None => return false, // Path diverges from all specified columns + } + } + // We've consumed the entire path. Match only if we're at a terminal. + node.is_terminal + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_column_trie() { + // Build trie with specified column ["a", "b"] + let columns = [ColumnName::new(["a", "b"])]; + let trie = ColumnTrie::from_columns(&columns); + + // Exact match: path = ["a", "b"] → include + assert!(trie.contains_prefix_of(&["a".to_string(), "b".to_string()])); + + // Descendant of specified: path = ["a", "b", "c"] → include + assert!(trie.contains_prefix_of(&["a".to_string(), "b".to_string(), "c".to_string()])); + + // Ancestor of specified: path = ["a"] → NOT include + assert!(!trie.contains_prefix_of(&["a".to_string()])); + + // Unrelated paths → NOT include + assert!(!trie.contains_prefix_of(&["a".to_string(), "c".to_string()])); + assert!(!trie.contains_prefix_of(&["x".to_string(), "y".to_string()])); + + // Non-existent nested path: trie has ["a", "b", "c", "d"], path = ["a", "b"] + // User asked for a.b.c.d but a.b is a leaf → NOT include + let deep_columns = [ColumnName::new(["a", "b", "c", "d"])]; + let deep_trie = ColumnTrie::from_columns(&deep_columns); + assert!(!deep_trie.contains_prefix_of(&["a".to_string(), "b".to_string()])); + + // Multiple specified columns + let multi_columns = [ + ColumnName::new(["a", "b"]), + ColumnName::new(["x", "y", "z"]), + ]; + let multi_trie = ColumnTrie::from_columns(&multi_columns); + assert!(multi_trie.contains_prefix_of(&["a".to_string(), "b".to_string()])); + assert!(multi_trie.contains_prefix_of(&[ + "a".to_string(), + "b".to_string(), + "c".to_string() + ])); + assert!(multi_trie.contains_prefix_of(&[ + "x".to_string(), + "y".to_string(), + "z".to_string() + ])); + assert!(!multi_trie.contains_prefix_of(&["x".to_string(), "y".to_string()])); // ancestor + assert!(!multi_trie.contains_prefix_of(&["a".to_string(), "c".to_string()])); + // divergent + } +} diff --git a/kernel/src/committer.rs b/kernel/src/committer.rs deleted file mode 100644 index cd61ca7e66..0000000000 --- a/kernel/src/committer.rs +++ /dev/null @@ -1,243 +0,0 @@ -//! The `committer` module provides a [`Committer`] trait which allows different implementations to -//! define how to commit transactions to a catalog or filesystem. For catalog-managed tables, a -//! [`Committer`] specific to the managing catalog should be provided. 
For non-catalog-managed -//! tables, the [`FileSystemCommitter`] should be used to commit directly to the object store (via -//! put-if-absent call to storage to atomically write new commit files). -//! -//! By implementing the [`Committer`] trait, different catalogs can define what happens when the -//! kernel needs to commit a transaction to a table. The goal terminal state of every -//! [`Transaction`] is to be committed to the table. This means writing the changes (we call these -//! actions) in the transaction as a new version of the table. The [`Committer`] trait exposes a -//! single method, [`commit`] which takes an engine, an iterator of actions (as [`EngineData`] -//! batches), and [`CommitMetadata`] (which includes critical commit metadata like the version to -//! commit) to allow different catalogs to define what it means to 'commit' the actions to a table. -//! For some, this may mean writing staged commits to object storage and retaining an in-memory list -//! (server side) of commits. For others, this may mean writing new (version, actions) tuples to a -//! database. -//! -//! The implementation of [`commit`] must ensure that the actions are committed atomically to the -//! table at the given version and either (1) persisted directly to object storage as published -//! deltas as in non-catalog-managed tables or (2) persisted within the catalog and made available -//! to readers during snapshot contstruction via the [`log_tail`] API. -//! -//! [`Transaction`]: crate::transaction::Transaction -//! [`commit`]: crate::committer::Committer::commit -//! [`log_tail`]: crate::snapshot::SnapshotBuilder::with_log_tail -//! [`EngineData`]: crate::EngineData - -use crate::path::LogRoot; -use crate::{AsAny, DeltaResult, Engine, Error, FilteredEngineData, Version}; - -use url::Url; - -/// `CommitMetadata` bundles the metadata about a commit operation. This currently includes the -/// commit path and version but will expand to things like `Protocol`, `Metadata`, etc. to allow -/// for catalogs to understand/cache/persist more information about the table at commit time. -/// -/// Note that this struct cannot be constructed. It is handed to the [`Committer`] (in the -/// [`commit`] method) by the kernel when a transaction is being committed. -/// -/// See the [module-level documentation] for more details. -/// -/// [`commit`]: Committer::commit -/// [module-level documentation]: crate::committer -#[derive(Debug)] -pub struct CommitMetadata { - pub(crate) log_root: LogRoot, - pub(crate) version: Version, - // in the future this will include Protocol, Metadata, CommitInfo, Domain Metadata, etc. -} - -impl CommitMetadata { - pub(crate) fn new(log_root: LogRoot, version: Version) -> Self { - Self { log_root, version } - } - - /// The commit path is the absolute path (e.g. s3://bucket/table/_delta_log/{version}.json) to - /// the published delta file for this commit. - pub fn published_commit_path(&self) -> DeltaResult { - self.log_root - .new_commit_path(self.version) - .map(|p| p.location) - } - - /// The staged commit path is the absolute path (e.g. - /// s3://bucket/table/_delta_log/{version}.{uuid}.json) to the staged commit file. - pub fn staged_commit_path(&self) -> DeltaResult { - self.log_root - .new_staged_commit_path(self.version) - .map(|p| p.location) - } - - /// The version to which the transaction is being committed. - pub fn version(&self) -> Version { - self.version - } -} - -/// `CommitResponse` is the result of committing a transaction via a catalog. 
The committer uses -/// this type to indicate whether or not the commit was successful or conflicted. The kernel then -/// transforms the associated [`Transaction`] into the appropriate state. -/// -/// If the commit was successful, the committer returns `CommitResponse::Committed` with the commit -/// version set. If the commit conflicted (e.g. another writer committed to the same version), the -/// Committer returns `CommitResponse::Conflict` with the version that was attempted. -/// -/// [`Transaction`]: crate::transaction::Transaction -#[derive(Debug)] -pub enum CommitResponse { - Committed { version: Version }, - Conflict { version: Version }, -} - -/// A Committer is the system by which transactions are committed to a table. Transactions are -/// effectively a collection of actions performed on the table at a specific version. The kernel -/// exposes this trait so different catalogs can build their own commit implementations. For -/// example, different catalogs may: commit directly to a database, commit to an object store, or -/// use another system entirely. -/// -/// Critically, a Committer must implement [`commit`] which takes an engine and an iterator of -/// actions (as [`EngineData`] batches) to commit to the table at the given version -/// ([`CommitMetadata::version`]). -/// -/// [`commit`]: Committer::commit -/// [`EngineData`]: crate::EngineData -// -// Note: While we could omit the Send bound, we keep it here for simplicity - so usage can be -// Arc (instead of Arc). If there is a strong case for a !Send -// Committer then we can remove this bound and possibly just do an alias like CommitterRef = -// Arc. -pub trait Committer: Send + AsAny { - fn commit( - &self, - engine: &dyn Engine, - actions: Box> + Send + '_>, - commit_metadata: CommitMetadata, - ) -> DeltaResult; -} - -/// The `FileSystemCommitter` is an internal implementation of the `Committer` trait which -/// commits to a file system directly via `Engine::json_handler().write_json_file` for -/// non-catalog-managed tables. -/// -/// SAFETY: it is _incorrect_ to use this committer for catalog-managed tables. 
-#[derive(Debug, Default)] -pub struct FileSystemCommitter; - -impl FileSystemCommitter { - pub fn new() -> Self { - Self {} - } -} - -impl Committer for FileSystemCommitter { - fn commit( - &self, - engine: &dyn Engine, - actions: Box> + Send + '_>, - commit_metadata: CommitMetadata, - ) -> DeltaResult { - match engine.json_handler().write_json_file( - &commit_metadata.published_commit_path()?, - Box::new(actions), - false, - ) { - Ok(()) => Ok(CommitResponse::Committed { - version: commit_metadata.version, - }), - Err(Error::FileAlreadyExists(_)) => Ok(CommitResponse::Conflict { - version: commit_metadata.version, - }), - Err(e) => Err(e), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::sync::Arc; - - use crate::engine::default::executor::tokio::TokioBackgroundExecutor; - use crate::engine::default::DefaultEngine; - use crate::path::LogRoot; - - use object_store::memory::InMemory; - use object_store::ObjectStore as _; - use url::Url; - - #[test] - fn test_commit_metadata() { - let table_root = Url::parse("s3://my-bucket/path/to/table/").unwrap(); - let log_root = LogRoot::new(table_root).unwrap(); - let version = 42; - - let commit_metadata = CommitMetadata::new(log_root, version); - - // version - assert_eq!(commit_metadata.version(), 42); - - // published commit path - let published_path = commit_metadata.published_commit_path().unwrap(); - assert_eq!( - published_path.as_str(), - "s3://my-bucket/path/to/table/_delta_log/00000000000000000042.json" - ); - - // staged commit path - let staged_path = commit_metadata.staged_commit_path().unwrap(); - let staged_path_str = staged_path.as_str(); - - assert!( - staged_path_str - .starts_with("s3://my-bucket/path/to/table/_delta_log/00000000000000000042."), - "Staged path should start with the correct prefix, got: {}", - staged_path_str - ); - assert!( - staged_path_str.ends_with(".json"), - "Staged path should end with .json, got: {}", - staged_path_str - ); - let uuid_str = staged_path_str - .strip_prefix("s3://my-bucket/path/to/table/_delta_log/00000000000000000042.") - .and_then(|s| s.strip_suffix(".json")) - .expect("Staged path should have expected format"); - uuid::Uuid::parse_str(uuid_str).expect("Staged path should contain a valid UUID"); - } - - #[cfg(feature = "catalog-managed")] - #[tokio::test] - async fn catalog_managed_tables_block_transactions() { - let storage = Arc::new(InMemory::new()); - let table_root = Url::parse("memory:///").unwrap(); - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); - - let actions = [ - r#"{"commitInfo":{"timestamp":12345678900,"inCommitTimestamp":12345678900}}"#, - r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["catalogManaged"],"writerFeatures":["catalogManaged","inCommitTimestamp"]}}"#, - r#"{"metaData":{"id":"test-id","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{},"createdTime":1234567890}}"#, - ].join("\n"); - - let commit_path = object_store::path::Path::from("_delta_log/00000000000000000000.json"); - storage.put(&commit_path, actions.into()).await.unwrap(); - - let snapshot = crate::snapshot::SnapshotBuilder::new_for(table_root) - .build(&engine) - .unwrap(); - // Try to create a transaction with FileSystemCommitter - let committer = Box::new(FileSystemCommitter::new()); - let err = snapshot.transaction(committer).unwrap_err(); - assert!(matches!( - err, - crate::Error::Unsupported(e) if e.contains("Writes are 
not yet supported for catalog-managed tables") - )); - // after allowing writes, we will check that this disallows default committer for - // catalog-managed tables. - // assert!(matches!( - // err, - // crate::Error::Generic(e) if e.contains("Cannot use the default committer for a catalog-managed table") - // )); - } -} diff --git a/kernel/src/committer/commit_types.rs b/kernel/src/committer/commit_types.rs new file mode 100644 index 0000000000..0081c2a70b --- /dev/null +++ b/kernel/src/committer/commit_types.rs @@ -0,0 +1,398 @@ +//! Commit metadata types for the committer module. + +use std::collections::HashMap; +#[cfg(any(test, feature = "test-utils"))] +use std::sync::Arc; + +use url::Url; + +use crate::actions::{DomainMetadata, Metadata, Protocol}; +use crate::path::LogRoot; +use crate::{DeltaResult, Version}; + +/// The type of commit operation being performed. This communicates to the committer whether this +/// is a table creation or a write to an existing table, and whether the table is catalog-managed. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CommitType { + /// Creating a new table via filesystem (no catalog involvement). + PathBasedCreate, + /// Creating a new catalog-managed table. + CatalogManagedCreate, + /// Writing to an existing path-based table. + PathBasedWrite, + /// Writing to an existing catalog-managed table. + CatalogManagedWrite, + // TODO: Wire these up when ALTER TABLE SET TBLPROPERTIES is supported. + /// Upgrading an existing path-based table to catalog-managed. Not currently supported. + #[allow(dead_code)] + UpgradeToCatalogManaged, + /// Downgrading an existing catalog-managed table to path-based. Not currently supported. + #[allow(dead_code)] + DowngradeToPathBased, +} + +impl CommitType { + /// Returns `true` if this is a create-table commit (version 0). + pub fn is_create(&self) -> bool { + matches!(self, Self::PathBasedCreate | Self::CatalogManagedCreate) + } + + /// Returns `true` if this commit includes a catalog-managed operation, + /// including upgrade/downgrade. + pub fn requires_catalog_committer(&self) -> bool { + matches!( + self, + Self::CatalogManagedCreate + | Self::CatalogManagedWrite + | Self::UpgradeToCatalogManaged + | Self::DowngradeToPathBased + ) + } +} + +/// The protocol and metadata state for this commit. Groups the read snapshot state (if any) +/// and the new state being committed (if any). +#[derive(Debug)] +pub(crate) struct CommitProtocolMetadata { + /// Existing table protocol from read snapshot. `None` for create-table. + read_protocol: Option, + /// Existing table metadata from read snapshot. `None` for create-table. + read_metadata: Option, + /// New protocol being committed. `Some` for create-table and future ALTER TABLE. + new_protocol: Option, + /// New metadata being committed. `Some` for create-table and future ALTER TABLE. 
+ new_metadata: Option, +} + +impl CommitProtocolMetadata { + pub(crate) fn try_new( + read_protocol: Option, + read_metadata: Option, + new_protocol: Option, + new_metadata: Option, + ) -> DeltaResult { + if read_protocol.is_some() != read_metadata.is_some() { + return Err(crate::Error::generic( + "read_protocol and read_metadata must both be present or both be absent", + )); + } + if read_protocol.is_none() && new_protocol.is_none() { + return Err(crate::Error::generic( + "CommitProtocolMetadata requires at least one protocol (read or new)", + )); + } + if read_metadata.is_none() && new_metadata.is_none() { + return Err(crate::Error::generic( + "CommitProtocolMetadata requires at least one metadata (read or new)", + )); + } + Ok(Self { + read_protocol, + read_metadata, + new_protocol, + new_metadata, + }) + } +} + +/// `CommitMetadata` bundles the metadata about a commit operation. This includes the commit path, +/// version, and protocol/metadata state of the table being committed to. Catalog committers can +/// use the protocol and metadata getters to validate or inspect the commit. +/// +/// Note that this struct cannot be constructed. It is handed to the [`Committer`] (in the +/// [`commit`] method) by the kernel when a transaction is being committed. +/// +/// See the [module-level documentation] for more details. +/// +/// [`Committer`]: super::Committer +/// [`commit`]: super::Committer::commit +/// [module-level documentation]: crate::committer +#[derive(Debug)] +pub struct CommitMetadata { + pub(crate) log_root: LogRoot, + pub(crate) version: Version, + pub(crate) commit_type: CommitType, + pub(crate) in_commit_timestamp: i64, + pub(crate) max_published_version: Option, + /// Protocol and metadata state for this commit. + pub(crate) protocol_metadata: CommitProtocolMetadata, + /// Domain metadata actions in this commit (additions and removals). + pub(crate) domain_metadata_changes: Vec, +} + +impl CommitMetadata { + pub(crate) fn new( + log_root: LogRoot, + version: Version, + commit_type: CommitType, + in_commit_timestamp: i64, + max_published_version: Option, + protocol_metadata: CommitProtocolMetadata, + domain_metadata_changes: Vec, + ) -> Self { + Self { + log_root, + version, + commit_type, + in_commit_timestamp, + max_published_version, + protocol_metadata, + domain_metadata_changes, + } + } + + /// The commit path is the absolute path (e.g. s3://bucket/table/_delta_log/{version}.json) to + /// the published delta file for this commit. + pub fn published_commit_path(&self) -> DeltaResult { + self.log_root + .new_commit_path(self.version) + .map(|p| p.location) + } + + /// The staged commit path is the absolute path (e.g. + /// s3://bucket/table/_delta_log/{version}.{uuid}.json) to the staged commit file. + pub fn staged_commit_path(&self) -> DeltaResult { + self.log_root + .new_staged_commit_path(self.version) + .map(|p| p.location) + } + + /// The version to which the transaction is being committed. + pub fn version(&self) -> Version { + self.version + } + + /// The type of commit operation being performed. + pub fn commit_type(&self) -> CommitType { + self.commit_type + } + + /// The in-commit timestamp for the commit. Note that this may differ from the actual commit + /// file modification time. + pub fn in_commit_timestamp(&self) -> i64 { + self.in_commit_timestamp + } + + /// The maximum published version of the table. + pub fn max_published_version(&self) -> Option { + self.max_published_version + } + + /// The root URL of the table being committed to. 
+ pub fn table_root(&self) -> &Url { + self.log_root.table_root() + } + + /// Returns the effective protocol for this commit. Prefers new_protocol (create-table / ALTER + /// TABLE), falling back to the read snapshot's protocol. + pub(crate) fn effective_protocol(&self) -> DeltaResult<&Protocol> { + let pm = &self.protocol_metadata; + pm.new_protocol + .as_ref() + .or(pm.read_protocol.as_ref()) + .ok_or_else(|| { + crate::Error::internal_error( + "CommitProtocolMetadata should have at least one protocol", + ) + }) + } + + /// Returns the effective metadata for this commit. Prefers new_metadata (create-table / ALTER + /// TABLE), falling back to the read snapshot's metadata. + pub(crate) fn effective_metadata(&self) -> DeltaResult<&Metadata> { + let pm = &self.protocol_metadata; + pm.new_metadata + .as_ref() + .or(pm.read_metadata.as_ref()) + .ok_or_else(|| { + crate::Error::internal_error( + "CommitProtocolMetadata should have at least one metadata", + ) + }) + } + + /// Check if the effective protocol has a specific writer feature by name. + pub fn has_writer_feature(&self, feature_name: &str) -> bool { + self.effective_protocol() + .ok() + .and_then(|p| p.writer_features()) + .is_some_and(|features| features.iter().any(|f| f.as_ref() == feature_name)) + } + + /// Check if the effective protocol has a specific reader feature by name. + pub fn has_reader_feature(&self, feature_name: &str) -> bool { + self.effective_protocol() + .ok() + .and_then(|p| p.reader_features()) + .is_some_and(|features| features.iter().any(|f| f.as_ref() == feature_name)) + } + + /// Get the raw metadata configuration for the effective metadata. Returns `None` if no + /// metadata is set. + pub fn metadata_configuration(&self) -> Option<&HashMap> { + self.effective_metadata().ok().map(|m| m.configuration()) + } + + /// Returns `true` if this commit changes the table's protocol. + pub fn has_protocol_change(&self) -> bool { + self.protocol_metadata.new_protocol.is_some() + } + + /// Returns `true` if this commit changes the table's metadata. + pub fn has_metadata_change(&self) -> bool { + self.protocol_metadata.new_metadata.is_some() + } + + /// Returns `true` if this commit includes a domain metadata change for the given domain name. + pub fn has_domain_metadata_change(&self, domain: &str) -> bool { + self.domain_metadata_changes + .iter() + .any(|dm| dm.domain() == domain) + } + + /// Creates a new `CommitMetadata` for the given `table_root` and `version`. Test-only. + /// + /// Uses a default modern protocol (empty features) and empty metadata. + #[cfg(any(test, feature = "test-utils"))] + pub fn new_unchecked(table_root: Url, version: Version) -> DeltaResult { + Self::new_unchecked_with(table_root, version, vec![], vec![], HashMap::new()) + } + + /// Creates a new `CommitMetadata` with specific features and configuration. Test-only. 
+ #[cfg(any(test, feature = "test-utils"))] + pub fn new_unchecked_with( + table_root: Url, + version: Version, + reader_features: Vec<&str>, + writer_features: Vec<&str>, + configuration: HashMap<String, String>, + ) -> DeltaResult<Self> { + let log_root = crate::path::LogRoot::new(table_root)?; + let protocol = Protocol::try_new_modern(reader_features, writer_features)?; + let schema = Arc::new(crate::schema::StructType::new_unchecked(vec![])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, configuration)?; + Ok(Self::new( + log_root, + version, + CommitType::PathBasedWrite, + 0, + None, + CommitProtocolMetadata::try_new(Some(protocol), Some(metadata), None, None)?, + vec![], + )) + } + + /// Marks this `CommitMetadata` as having a protocol change. Test-only. + /// + /// Copies the existing protocol into the `new_protocol` field to simulate an ALTER TABLE + /// that changes the protocol. + #[cfg(any(test, feature = "test-utils"))] + pub fn with_protocol_change(mut self) -> Self { + let protocol = self.effective_protocol().ok().cloned(); + self.protocol_metadata.new_protocol = protocol; + self + } + + /// Marks this `CommitMetadata` as having a metadata change. Test-only. + /// + /// Copies the existing metadata into the `new_metadata` field to simulate an ALTER TABLE + /// that changes the metadata. + #[cfg(any(test, feature = "test-utils"))] + pub fn with_metadata_change(mut self) -> Self { + let metadata = self.effective_metadata().ok().cloned(); + self.protocol_metadata.new_metadata = metadata; + self + } + + /// Adds a domain metadata change for the given domain name. Test-only. + /// + /// Creates a synthetic domain metadata entry to simulate a domain metadata change + /// (e.g. clustering column change via ALTER TABLE CLUSTER BY). + #[cfg(any(test, feature = "test-utils"))] + pub fn with_domain_change(mut self, domain: &str) -> Self { + self.domain_metadata_changes + .push(DomainMetadata::new(domain.to_string(), String::new())); + self + } +} + +/// `CommitResponse` is the result of committing a transaction via a catalog. The committer uses +/// this type to indicate whether or not the commit was successful or conflicted. The kernel then +/// transforms the associated [`Transaction`] into the appropriate state. +/// +/// If the commit was successful, the committer returns `CommitResponse::Committed` with the commit +/// version set. If the commit conflicted (e.g. another writer committed to the same version), the +/// Committer returns `CommitResponse::Conflict` with the version that was attempted.
+/// +/// [`Transaction`]: crate::transaction::Transaction +#[derive(Debug)] +pub enum CommitResponse { + Committed { file_meta: crate::FileMeta }, + Conflict { version: Version }, +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::sync::Arc; + + use crate::path::LogRoot; + use url::Url; + + #[test] + fn test_commit_metadata() { + let table_root = Url::parse("s3://my-bucket/path/to/table/").unwrap(); + let log_root = LogRoot::new(table_root).unwrap(); + let version = 42; + let ts = 1234; + let max_published_version = Some(42); + let protocol = Protocol::try_new_modern(Vec::<&str>::new(), Vec::<&str>::new()).unwrap(); + let schema = Arc::new(crate::schema::StructType::new_unchecked(vec![])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + + let commit_metadata = CommitMetadata::new( + log_root, + version, + CommitType::PathBasedWrite, + ts, + max_published_version, + CommitProtocolMetadata::try_new(Some(protocol), Some(metadata), None, None).unwrap(), + vec![], + ); + + // version + assert_eq!(commit_metadata.version(), 42); + // in_commit_timestamp + assert_eq!(commit_metadata.in_commit_timestamp(), 1234); + assert_eq!(commit_metadata.max_published_version(), Some(42)); + + // published commit path + let published_path = commit_metadata.published_commit_path().unwrap(); + assert_eq!( + published_path.as_str(), + "s3://my-bucket/path/to/table/_delta_log/00000000000000000042.json" + ); + + // staged commit path + let staged_path = commit_metadata.staged_commit_path().unwrap(); + let staged_path_str = staged_path.as_str(); + + assert!( + staged_path_str.starts_with( + "s3://my-bucket/path/to/table/_delta_log/_staged_commits/00000000000000000042." + ), + "Staged path should start with the correct prefix, got: {staged_path_str}" + ); + assert!( + staged_path_str.ends_with(".json"), + "Staged path should end with .json, got: {staged_path_str}" + ); + let uuid_str = staged_path_str + .strip_prefix( + "s3://my-bucket/path/to/table/_delta_log/_staged_commits/00000000000000000042.", + ) + .and_then(|s| s.strip_suffix(".json")) + .expect("Staged path should have expected format"); + uuid::Uuid::parse_str(uuid_str).expect("Staged path should contain a valid UUID"); + } +} diff --git a/kernel/src/committer/filesystem.rs b/kernel/src/committer/filesystem.rs new file mode 100644 index 0000000000..f63d7346b3 --- /dev/null +++ b/kernel/src/committer/filesystem.rs @@ -0,0 +1,212 @@ +//! File system committer for non-catalog-managed tables. + +use crate::{DeltaResult, Engine, Error, FileMeta, FilteredEngineData}; +use tracing::{info, instrument}; + +use super::commit_types::{CommitMetadata, CommitResponse}; +use super::publish_types::PublishMetadata; +use super::Committer; + +/// The `FileSystemCommitter` is an internal implementation of the `Committer` trait which +/// commits to a file system directly via `Engine::json_handler().write_json_file` for +/// non-catalog-managed tables. +/// +/// SAFETY: it is _incorrect_ to use this committer for catalog-managed tables. 
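// A rough sketch, under stated assumptions, of what a catalog-backed counterpart to the
// `FileSystemCommitter` documented above might look like. It shows the intended flow for
// catalog-managed tables: write a staged commit file, hand it to the catalog for ratification,
// and report the outcome as a `CommitResponse`. `ExampleCatalogClient`, `ExampleCatalogCommitter`,
// `ratify`, and `publish_one` are invented names; real catalogs define their own ratification and
// publish semantics, and this is not the kernel's implementation.
use url::Url;

use crate::committer::{CommitMetadata, CommitResponse, Committer, PublishMetadata};
use crate::{DeltaResult, Engine, Error, FileMeta, FilteredEngineData, Version};

/// Hypothetical client for some catalog's commit API (invented for this sketch).
struct ExampleCatalogClient;

impl ExampleCatalogClient {
    /// Ask the catalog to ratify a staged commit at the given version. Invented signature.
    fn ratify(&self, _staged: &Url, _version: Version, _timestamp: i64) -> Result<(), String> {
        Ok(())
    }
    /// Copy one ratified staged commit to its published location. Invented signature.
    fn publish_one(&self, _src: &Url, _dst: &Url) -> Result<(), String> {
        Ok(())
    }
}

struct ExampleCatalogCommitter {
    client: ExampleCatalogClient,
}

impl Committer for ExampleCatalogCommitter {
    fn commit(
        &self,
        engine: &dyn Engine,
        actions: Box<dyn Iterator<Item = DeltaResult<FilteredEngineData>> + Send + '_>,
        commit_metadata: CommitMetadata,
    ) -> DeltaResult<CommitResponse> {
        // Catalog-managed tables commit via a uniquely named staged commit file rather than a
        // put-if-absent write to the published path.
        let staged_path = commit_metadata.staged_commit_path()?;
        engine
            .json_handler()
            .write_json_file(&staged_path, Box::new(actions), false)?;

        // Hand the staged commit to the catalog. A rejection maps to a conflict so the kernel
        // can surface it like any other commit conflict.
        match self.client.ratify(
            &staged_path,
            commit_metadata.version(),
            commit_metadata.in_commit_timestamp(),
        ) {
            // Size 0 mirrors the FileSystemCommitter simplification; pointing the FileMeta at
            // the staged file is also a simplification for this sketch.
            Ok(()) => Ok(CommitResponse::Committed {
                file_meta: FileMeta::new(staged_path, commit_metadata.in_commit_timestamp(), 0),
            }),
            Err(_) => Ok(CommitResponse::Conflict {
                version: commit_metadata.version(),
            }),
        }
    }

    fn is_catalog_committer(&self) -> bool {
        true
    }

    fn publish(&self, _engine: &dyn Engine, publish_metadata: PublishMetadata) -> DeltaResult<()> {
        // PublishMetadata guarantees contiguous, ascending versions ending at publish_version(),
        // so copying in iteration order satisfies the "publish V-1 before V" requirement.
        for commit in publish_metadata.commits_to_publish() {
            self.client
                .publish_one(commit.location(), commit.published_location())
                .map_err(Error::generic)?;
        }
        Ok(())
    }
}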
+#[derive(Debug, Default)] +pub struct FileSystemCommitter; + +impl FileSystemCommitter { + pub fn new() -> Self { + Self {} + } +} + +impl Committer for FileSystemCommitter { + #[instrument( + name = "fs_committer.commit", + skip_all, + fields(version = commit_metadata.version()), + err + )] + fn commit( + &self, + engine: &dyn Engine, + actions: Box> + Send + '_>, + commit_metadata: CommitMetadata, + ) -> DeltaResult { + let version = commit_metadata.version(); + let published_commit_path = commit_metadata.published_commit_path()?; + + match engine.json_handler().write_json_file( + &published_commit_path, + Box::new(actions), + false, + ) { + Ok(()) => { + info!( + committed_version = version, + "Committed delta file via filesystem committer" + ); + // For now, we don't need the real size of the written file, so we can use 0. + // If we need this in the future, we can get it from StorageHandler::head. + let file_meta = FileMeta::new( + published_commit_path, + commit_metadata.in_commit_timestamp(), + 0, + ); + Ok(CommitResponse::Committed { file_meta }) + } + Err(Error::FileAlreadyExists(_)) => { + info!( + conflicting_version = version, + "Filesystem commit conflict: target version already exists" + ); + Ok(CommitResponse::Conflict { version }) + } + Err(e) => Err(e), + } + } + + fn is_catalog_committer(&self) -> bool { + false + } + + /// The FileSystemCommitter should never be invoked to publish catalog commits. If it is, + /// something has gone wrong upstream. + fn publish(&self, _engine: &dyn Engine, publish_metadata: PublishMetadata) -> DeltaResult<()> { + if !publish_metadata.commits_to_publish().is_empty() { + return Err(Error::generic( + "The FilesystemCommitter does not support publishing catalog commits.", + )); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::collections::HashMap; + use std::sync::Arc; + + use crate::actions::{Metadata, Protocol}; + use crate::committer::{CommitProtocolMetadata, CommitType}; + use crate::engine::default::DefaultEngineBuilder; + use crate::object_store::memory::InMemory; + use crate::object_store::path::Path; + use crate::object_store::ObjectStore as _; + use crate::path::LogRoot; + use url::Url; + + #[tokio::test] + async fn disallow_filesystem_committer_for_catalog_managed_tables() { + let storage = Arc::new(InMemory::new()); + let table_root = Url::parse("memory:///").unwrap(); + let engine = DefaultEngineBuilder::new(storage.clone()).build(); + + let actions = [ + r#"{"commitInfo":{"timestamp":12345678900,"inCommitTimestamp":12345678900}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["catalogManaged"],"writerFeatures":["catalogManaged","inCommitTimestamp"]}}"#, + r#"{"metaData":{"id":"test-id","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{"delta.enableInCommitTimestamps":"true"},"createdTime":1234567890}}"#, + ].join("\n"); + + let commit_path = Path::from("_delta_log/00000000000000000000.json"); + storage.put(&commit_path, actions.into()).await.unwrap(); + + let snapshot = crate::snapshot::SnapshotBuilder::new_for(table_root) + .build(&engine) + .unwrap(); + // Try to commit a transaction with FileSystemCommitter + let committer = Box::new(FileSystemCommitter::new()); + let err = snapshot + .transaction(committer, &engine) + .unwrap() + .commit(&engine) + .unwrap_err(); + assert!(matches!( + err, + crate::Error::Generic(e) if e.contains("This table is catalog-managed and requires a 
catalog committer.") + )); + } + + #[tokio::test] + async fn test_filesystem_committer_returns_valid_commit_response() { + let storage = Arc::new(InMemory::new()); + let table_root = Url::parse("memory:///").unwrap(); + let engine = DefaultEngineBuilder::new(storage).build(); + + let committer = FileSystemCommitter::new(); + let log_root = LogRoot::new(table_root).unwrap(); + let protocol = Protocol::try_new_modern(Vec::<&str>::new(), Vec::<&str>::new()).unwrap(); + let schema = Arc::new(crate::schema::StructType::new_unchecked(vec![])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + let commit_metadata = CommitMetadata::new( + log_root, + 1, + CommitType::PathBasedWrite, + 12345, + Some(0), + CommitProtocolMetadata::try_new(Some(protocol), Some(metadata), None, None).unwrap(), + vec![], + ); + let actions = Box::new(std::iter::empty()); + + let result = committer.commit(&engine, actions, commit_metadata).unwrap(); + + match result { + CommitResponse::Committed { file_meta } => { + assert_eq!(file_meta.last_modified, 12345); + assert_eq!(file_meta.size, 0); + assert!(file_meta + .location + .as_str() + .ends_with("00000000000000000001.json")); + } + CommitResponse::Conflict { .. } => panic!("Expected Committed, got Conflict"), + } + } + + #[tokio::test] + async fn test_filesystem_committer_returns_conflict_for_existing_version() { + let storage = Arc::new(InMemory::new()); + let table_root = Url::parse("memory:///").unwrap(); + let engine = DefaultEngineBuilder::new(storage).build(); + + let committer = FileSystemCommitter::new(); + let protocol = Protocol::try_new_modern(Vec::<&str>::new(), Vec::<&str>::new()).unwrap(); + let schema = Arc::new(crate::schema::StructType::new_unchecked(vec![])); + let metadata1 = + Metadata::try_new(None, None, schema.clone(), vec![], 0, HashMap::new()).unwrap(); + let metadata2 = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + let first_metadata = CommitMetadata::new( + LogRoot::new(table_root.clone()).unwrap(), + 1, + CommitType::PathBasedWrite, + 12345, + Some(0), + CommitProtocolMetadata::try_new(Some(protocol.clone()), Some(metadata1), None, None) + .unwrap(), + vec![], + ); + let second_metadata = CommitMetadata::new( + LogRoot::new(table_root).unwrap(), + 1, + CommitType::PathBasedWrite, + 12346, + Some(0), + CommitProtocolMetadata::try_new(Some(protocol), Some(metadata2), None, None).unwrap(), + vec![], + ); + + let first = committer + .commit(&engine, Box::new(std::iter::empty()), first_metadata) + .unwrap(); + assert!(matches!(first, CommitResponse::Committed { .. })); + + let second = committer + .commit(&engine, Box::new(std::iter::empty()), second_metadata) + .unwrap(); + assert!(matches!(second, CommitResponse::Conflict { version: 1 })); + } +} diff --git a/kernel/src/committer/mod.rs b/kernel/src/committer/mod.rs new file mode 100644 index 0000000000..c447e925e6 --- /dev/null +++ b/kernel/src/committer/mod.rs @@ -0,0 +1,107 @@ +//! The `committer` module provides a [`Committer`] trait which allows different implementations to +//! define how to commit transactions to a catalog or filesystem. For catalog-managed tables, a +//! [`Committer`] specific to the managing catalog should be provided. For non-catalog-managed +//! tables, the [`FileSystemCommitter`] should be used to commit directly to the object store (via +//! put-if-absent call to storage to atomically write new commit files). +//! +//! 
By implementing the [`Committer`] trait, different catalogs can define what happens when the +//! kernel needs to commit a transaction to a table. The goal terminal state of every +//! [`Transaction`] is to be committed to the table. This means writing the changes (we call these +//! actions) in the transaction as a new version of the table. The [`Committer`] trait exposes a +//! single method, [`commit`] which takes an engine, an iterator of actions (as [`EngineData`] +//! batches), and [`CommitMetadata`] (which includes critical commit metadata like the version to +//! commit) to allow different catalogs to define what it means to 'commit' the actions to a table. +//! For some, this may mean writing staged commits to object storage and retaining an in-memory list +//! (server side) of commits. For others, this may mean writing new (version, actions) tuples to a +//! database. +//! +//! The implementation of [`commit`] must ensure that the actions are committed atomically to the +//! table at the given version and either (1) persisted directly to object storage as published +//! deltas as in non-catalog-managed tables or (2) persisted within the catalog and made available +//! to readers during snapshot construction via the [`log_tail`] API. +//! +//! [`Transaction`]: crate::transaction::Transaction +//! [`commit`]: crate::committer::Committer::commit +//! [`log_tail`]: crate::snapshot::SnapshotBuilder::with_log_tail +//! [`EngineData`]: crate::EngineData + +mod commit_types; +mod filesystem; +mod publish_types; + +pub(crate) use commit_types::CommitProtocolMetadata; +pub use commit_types::{CommitMetadata, CommitResponse, CommitType}; +pub use filesystem::FileSystemCommitter; +pub use publish_types::{CatalogCommit, PublishMetadata}; + +use crate::{DeltaResult, Engine, FilteredEngineData}; + +/// A Committer is the system by which transactions are committed to a table. Transactions are +/// effectively a collection of actions performed on the table at a specific version. The kernel +/// exposes this trait so different catalogs can build their own commit implementations. For +/// example, different catalogs may: commit directly to a database, commit to an object store, or +/// use another system entirely. +/// +/// Critically, a Committer must implement [`commit`] which takes an engine and an iterator of +/// actions (as [`EngineData`] batches) to commit to the table at the given version +/// ([`CommitMetadata::version`]). +/// +/// [`commit`]: Committer::commit +/// [`EngineData`]: crate::EngineData +// +// Note: While we could omit the Send bound, we keep it here for simplicity - so usage can be +// Arc<dyn Committer> (instead of Arc<dyn Committer + Send>). If there is a strong case for a !Send +// Committer then we can remove this bound and possibly just do an alias like CommitterRef = +// Arc<dyn Committer + Send>. +pub trait Committer: Send { + /// Commits actions to the table at the version specified in [`CommitMetadata`]. + /// + /// Implementations must ensure that actions are committed atomically and either: + /// 1. Persisted directly to object storage as published deltas (for filesystem-based tables), or + /// 2. Persisted as per the managing catalog's semantics (for catalog-managed tables) + fn commit( + &self, + engine: &dyn Engine, + actions: Box<dyn Iterator<Item = DeltaResult<FilteredEngineData>> + Send + '_>, + commit_metadata: CommitMetadata, + ) -> DeltaResult<CommitResponse>; + + /// Returns `true` if this committer is for a catalog-managed table, else `false`. + fn is_catalog_committer(&self) -> bool; + + /// Publishes catalog commits to the Delta log.
Applicable only to catalog-managed tables. + /// + /// Publishing is the act of copying ratified catalog commits to the Delta log as published + /// Delta files (e.g., `_delta_log/00000000000000000001.json`). + /// + /// # When to call + /// + /// This method should only be called on catalog committers (i.e., when [`is_catalog_committer`] + /// returns `true`). Filesystem committers will error if called with catalog commits to publish. + /// + /// # Benefits + /// + /// - Reduces the number of commits the catalog needs to store internally and serve to readers + /// - Enables table maintenance operations that must operate on published versions only, such + /// as checkpointing and log compaction + /// + /// # Requirements + /// + /// - This method must ensure that all catalog commits are published to the Delta log up to and + /// including the snapshot version specified in [`PublishMetadata`] + /// - Commits must be published in order: version V-1 must be published before version V + /// + /// # Catalog-specific semantics + /// + /// Each catalog implementation may specify its own rules and semantics for publishing, + /// including whether it expects to be notified immediately upon publishing success, whether + /// published commits must appear with PUT-if-absent semantics in the Delta log, and whether + /// publishing happens in the client-side or server-side catalog component. + /// + /// # Errors + /// + /// Returns an error if the publish operation fails. + /// + /// [`is_catalog_committer`]: Committer::is_catalog_committer + fn publish(&self, engine: &dyn Engine, publish_metadata: PublishMetadata) -> DeltaResult<()>; +} diff --git a/kernel/src/committer/publish_types.rs b/kernel/src/committer/publish_types.rs new file mode 100644 index 0000000000..fdc0a26415 --- /dev/null +++ b/kernel/src/committer/publish_types.rs @@ -0,0 +1,236 @@ +//! Types for publishing catalog commits to the Delta log. + +use url::Url; + +use crate::path::{LogPathFileType, ParsedLogPath}; +use crate::utils::require; +use crate::{DeltaResult, Error, FileMeta, Version}; + +/// A catalog commit that has been ratified by the catalog but not yet published to the Delta log. +/// +/// Catalog commits are staged commits stored in `_delta_log/_staged_commits/` that have been +/// ratified (accepted) by the catalog but not yet copied to the main delta log as published +/// commits. This struct provides the information needed to publish a catalog commit. +/// +/// See [`Committer::publish`] for details on the publish operation. +/// +/// [`Committer::publish`]: super::Committer::publish +#[derive(Debug, Clone)] +pub struct CatalogCommit { + version: Version, + location: Url, + published_location: Url, +} + +impl CatalogCommit { + #[allow(dead_code)] // pub(crate) constructor will be used in future PRs + pub(crate) fn try_new( + log_root: &Url, + catalog_commit: &ParsedLogPath, + ) -> DeltaResult { + require!( + catalog_commit.file_type == LogPathFileType::StagedCommit, + Error::Generic(format!( + "Cannot construct CatalogCommit. Expected a StagedCommit, got {:?}", + catalog_commit.file_type + )) + ); + Ok(Self { + version: catalog_commit.version, + location: catalog_commit.location.location.clone(), + published_location: log_root.join(&format!("{:020}.json", catalog_commit.version))?, + }) + } + + /// The version of this catalog commit. 
+ pub fn version(&self) -> Version { + self.version + } + + /// The location of the staged catalog commit file + /// (e.g., `s3://bucket/table/_delta_log/_staged_commits/00000000000000000001.uuid.json`). + pub fn location(&self) -> &Url { + &self.location + } + + /// The target location where this commit should be published + /// (e.g., `s3://bucket/table/_delta_log/00000000000000000001.json`). + pub fn published_location(&self) -> &Url { + &self.published_location + } +} + +#[cfg(any(test, feature = "test-utils"))] +impl CatalogCommit { + /// Creates a new `CatalogCommit` with explicit locations. Test-only. + pub fn new_unchecked(version: Version, location: Url, published_location: Url) -> Self { + Self { + version, + location, + published_location, + } + } +} + +/// Metadata required for publishing catalog commits to the Delta log. +/// +/// `PublishMetadata` bundles all the information needed to publish catalog commits: the version up +/// to which commits should be published, and the list of catalog commits themselves. +/// +/// # Invariants +/// +/// The following invariants are enforced at construction time: +/// - `commits_to_publish` must be non-empty +/// - `commits_to_publish` must be contiguous (no version gaps) in ascending order of version +/// - The last catalog commit version must equal `publish_to_version` +/// +/// See [`Committer::publish`] for details on the publish operation. +/// +/// [`Committer::publish`]: super::Committer::publish +pub struct PublishMetadata { + publish_to_version: Version, + commits_to_publish: Vec, +} + +impl PublishMetadata { + /// Creates a new `PublishMetadata` with the given publish to version and catalog commits. + #[allow(dead_code)] // constructor will be used in future PRs + pub fn try_new( + publish_to_version: Version, + commits_to_publish: Vec, + ) -> DeltaResult { + Self::validate_contiguous(&commits_to_publish)?; + Self::validate_end_version(&commits_to_publish, publish_to_version)?; + Ok(Self { + publish_to_version, + commits_to_publish, + }) + } + + /// The snapshot version up to which all catalog commits must be published. + pub fn publish_version(&self) -> Version { + self.publish_to_version + } + + /// The list of contiguous catalog commits to be published, in ascending order of version. 
+ pub fn commits_to_publish(&self) -> &[CatalogCommit] { + &self.commits_to_publish + } + + fn validate_contiguous(commits_to_publish: &[CatalogCommit]) -> DeltaResult<()> { + commits_to_publish + .windows(2) + .all(|c| c[0].version() + 1 == c[1].version()) + .then_some(()) + .ok_or_else(|| { + Error::Generic(format!( + "Catalog commits must be contiguous: got versions {:?}", + commits_to_publish + .iter() + .map(|c| c.version()) + .collect::>() + )) + }) + } + + fn validate_end_version( + commits_to_publish: &[CatalogCommit], + publish_to_version: Version, + ) -> DeltaResult<()> { + match commits_to_publish.last().map(|c| c.version()) { + Some(v) if v == publish_to_version => Ok(()), + Some(v) => Err(Error::Generic(format!( + "Catalog commits must end with snapshot version {publish_to_version}, but got {v}" + ))), + None => Err(Error::Generic(format!( + "Catalog commits are empty, expected snapshot version {publish_to_version}" + ))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::test_utils::assert_result_error_with_message; + + fn table_root() -> Url { + Url::parse("memory:///").unwrap() + } + + fn log_root() -> Url { + table_root().join("_delta_log/").unwrap() + } + + #[test] + fn test_catalog_commit_try_new_with_valid_staged_commit() { + let parsed_staged_commit = ParsedLogPath::create_parsed_staged_commit(&table_root(), 10); + let catalog_commit = CatalogCommit::try_new(&log_root(), &parsed_staged_commit).unwrap(); + assert_eq!(catalog_commit.version(), 10); + assert!(catalog_commit + .location() + .as_str() + .starts_with("memory:///_delta_log/_staged_commits/00000000000000000010")); + assert_eq!( + catalog_commit.published_location().as_str(), + "memory:///_delta_log/00000000000000000010.json" + ); + } + + #[test] + fn test_catalog_commit_try_new_rejects_non_staged_commit() { + let parsed_commit = ParsedLogPath::create_parsed_published_commit(&table_root(), 10); + + assert_result_error_with_message( + CatalogCommit::try_new(&log_root(), &parsed_commit), + "Cannot construct CatalogCommit. 
Expected a StagedCommit, got Commit", + ) + } + + fn create_catalog_commits(versions: &[Version]) -> Vec { + let table_root = table_root(); + let log_root = log_root(); + versions + .iter() + .map(|v| { + let parsed_staged_commit = + ParsedLogPath::create_parsed_staged_commit(&table_root, *v); + CatalogCommit::try_new(&log_root, &parsed_staged_commit).unwrap() + }) + .collect() + } + + #[test] + fn test_publish_metadata_construction_with_valid_commits() { + let catalog_commits = create_catalog_commits(&[10, 11, 12]); + let publish_metadata = PublishMetadata::try_new(12, catalog_commits).unwrap(); + assert_eq!(publish_metadata.publish_version(), 12); + assert_eq!(publish_metadata.commits_to_publish().len(), 3); + } + + #[test] + fn test_publish_metadata_construction_rejects_empty_commits() { + assert_result_error_with_message( + PublishMetadata::try_new(12, vec![]), + "Catalog commits are empty, expected snapshot version 12", + ) + } + + #[test] + fn test_publish_metadata_construction_rejects_non_contiguous_commits() { + let catalog_commits = create_catalog_commits(&[10, 12]); + assert_result_error_with_message( + PublishMetadata::try_new(12, catalog_commits), + "Catalog commits must be contiguous: got versions [10, 12]", + ) + } + + #[test] + fn test_publish_metadata_construction_rejects_commits_not_ending_with_publish_to_version() { + let catalog_commits = create_catalog_commits(&[10, 11]); + assert_result_error_with_message( + PublishMetadata::try_new(12, catalog_commits), + "Catalog commits must end with snapshot version 12, but got 11", + ) + } +} diff --git a/kernel/src/crc/delta.rs b/kernel/src/crc/delta.rs new file mode 100644 index 0000000000..0d3b60a12d --- /dev/null +++ b/kernel/src/crc/delta.rs @@ -0,0 +1,880 @@ +//! Incremental CRC state updates via commit deltas. +//! +//! A [`CrcDelta`] captures CRC-relevant changes from a single commit (produced by reading a +//! `.json` commit file during log replay, or from in-memory transaction state during writes). +//! [`Crc::apply`] advances a CRC forward one commit at a time by applying a delta. +//! +//! A CRC tracks two categories of fields, updated differently: +//! - **Metadata fields** (protocol, metadata, domain metadata, set transactions, in-commit +//! timestamp): always kept up-to-date -- every `apply` unconditionally merges these from +//! the delta. +//! - **File stats** (`num_files`, `table_size_bytes`): only updated when the current +//! [`FileStatsValidity`] is not terminal and the commit's operation is incremental-safe. +//! Once validity degrades (e.g. a non-incremental operation like ANALYZE STATS, or a +//! missing file size), file stats stop updating for the lifetime of that CRC. + +use tracing::warn; + +use crate::actions::{DomainMetadata, Metadata, Protocol, SetTransaction}; + +use super::file_stats::FileStatsDelta; +use super::{Crc, FileStatsValidity}; + +/// The CRC-relevant changes ("delta") from a single commit. Produced either by reading a +/// `.json` commit file during log replay, or from in-memory transaction state during writes. +#[derive(Debug, Clone, Default)] +pub(crate) struct CrcDelta { + /// Net file count, size changes and histograms. + pub(crate) file_stats: FileStatsDelta, + /// New protocol action, if this commit changed it. + pub(crate) protocol: Option, + /// New metadata action, if this commit changed it. + pub(crate) metadata: Option, + /// All DM actions in this commit (additions and removals). `apply()` only processes these + /// when the base CRC's `domain_metadata` is `Some` (tracked). 
+ pub(crate) domain_metadata_changes: Vec, + /// All SetTransaction actions in this commit. `apply()` only processes these when the base + /// CRC's `set_transactions` is `Some` (tracked). + pub(crate) set_transaction_changes: Vec, + /// In-commit timestamp, if present in this commit. + pub(crate) in_commit_timestamp: Option, + /// Must be `Some` with an incremental-safe value for file stats to update. `None` or + /// unrecognized values transition validity to `Indeterminate`. + pub(crate) operation: Option, + /// A file action in this commit had a missing `size` field, making byte-level file stats + /// impossible to compute. + pub(crate) has_missing_file_size: bool, +} + +impl CrcDelta { + /// Convert this delta into a fresh [`Crc`]. Used when the delta represents the entire table + /// state (e.g. CREATE TABLE or the first commit in a forward replay from version zero). + /// + /// Returns `None` if protocol or metadata are missing (both are required for a valid CRC). + pub(crate) fn into_crc_for_version_zero(self) -> Option { + let protocol = self.protocol?; + let metadata = self.metadata?; + // For CREATE TABLE we always know the full domain metadata state: the transaction + // either included domain metadata actions or it didn't. So this is always `Some` -- + // an empty map means "no domain metadata", not "unknown". + let domain_metadata = Some( + self.domain_metadata_changes + .into_iter() + .filter(|dm| !dm.is_removed()) + .map(|dm| (dm.domain().to_string(), dm)) + .collect(), + ); + // CREATE TABLE starts with a known-complete set of transactions (possibly empty), + // so we always track them. + let set_transactions = Some( + self.set_transaction_changes + .into_iter() + .map(|txn| (txn.app_id.clone(), txn)) + .collect(), + ); + // For version zero the delta IS the full table histogram. Validate that all bins + // are non-negative (a real table can't have negative file counts). If validation + // fails, drop the histogram. + let initial_histogram = self.file_stats.net_histogram.and_then(|delta| { + delta + .check_non_negative() + .inspect_err(|e| { + warn!("Non-negative file count check failed, dropping file size histogram for version zero: {e}"); + }) + .ok() + }); + Some(Crc { + table_size_bytes: self.file_stats.net_bytes, + num_files: self.file_stats.net_files, + num_metadata: 1, + num_protocol: 1, + protocol, + metadata, + domain_metadata, + set_transactions, + in_commit_timestamp_opt: self.in_commit_timestamp, + file_size_histogram: initial_histogram, + ..Default::default() + }) + } +} + +/// Commit delta application for [`Crc`]. See the [module-level docs](self) for details. +impl Crc { + /// Apply a commit delta, updating all CRC fields and adjusting file stats validity. + /// + /// Metadata fields are always updated. File stats are only updated when: + /// - Validity is not already terminal ([`Untrackable`](FileStatsValidity::Untrackable) or + /// [`Indeterminate`](FileStatsValidity::Indeterminate)) + /// - The delta has no missing file sizes + /// - The operation is incremental-safe + pub(crate) fn apply(&mut self, delta: CrcDelta) { + // Protocol and metadata: replace if present. + if let Some(p) = delta.protocol { + self.protocol = p; + } + if let Some(m) = delta.metadata { + self.metadata = m; + } + + // Domain metadata: insert or remove by domain name. Only update if the base CRC + // tracks domain metadata (Some). If None ("not tracked"), leave it as None -- + // applying partial changes would create an incomplete map. 
+ if !delta.domain_metadata_changes.is_empty() { + if let Some(map) = &mut self.domain_metadata { + for dm in delta.domain_metadata_changes { + if dm.is_removed() { + map.remove(dm.domain()); + } else { + let domain = dm.domain().to_string(); + map.insert(domain, dm); + } + } + } + } + + // Set transactions: upsert by app_id. Only update if the base CRC tracks set + // transactions (Some). If None ("not tracked"), leave it as None. + if let Some(map) = &mut self.set_transactions { + map.extend( + delta + .set_transaction_changes + .into_iter() + .map(|txn| (txn.app_id.clone(), txn)), + ); + } + + // In-commit timestamp: unconditional replace (not guarded by `if let Some`). + // If ICT was disabled after being enabled, the delta carries None, which correctly + // clears the previous value. + self.in_commit_timestamp_opt = delta.in_commit_timestamp; + + // Bail if already Untrackable -- nothing can recover missing file stats or histograms. + if self.file_stats_validity == FileStatsValidity::Untrackable { + return; + } + + // Missing file size poisons stats permanently. Checked after the Untrackable bail-out + // so that Untrackable can never transition to Indeterminate below. + if delta.has_missing_file_size { + self.file_stats_validity = FileStatsValidity::Untrackable; + self.file_size_histogram = None; + return; + } + + // Bail if already Indeterminate (theoretically recoverable via full replay). + if self.file_stats_validity == FileStatsValidity::Indeterminate { + return; + } + + let is_incremental_safe = delta + .operation + .as_deref() + .is_some_and(FileStatsDelta::is_incremental_safe); + if !is_incremental_safe { + self.file_stats_validity = FileStatsValidity::Indeterminate; + self.file_size_histogram = None; + return; + } + self.num_files += delta.file_stats.net_files; + self.table_size_bytes += delta.file_stats.net_bytes; + + // Histogram: merge base and delta. + // Only update if the base CRC has a histogram AND the delta provides one. + // If the merge fails (e.g. negative counts from corrupted data) or the delta is + // missing a histogram, drop it rather than leaving stale data. + if let (Some(base_hist), Some(delta_hist)) = ( + self.file_size_histogram.as_ref(), + &delta.file_stats.net_histogram, + ) { + match base_hist.try_apply_delta(delta_hist) { + Ok(merged) => self.file_size_histogram = Some(merged), + Err(e) => { + warn!("Histogram merge failed, dropping file size histogram: {e}"); + self.file_size_histogram = None; + } + } + } else if self.file_size_histogram.is_some() { + // The base had a histogram but the delta couldn't provide one. Drop it rather than + // leaving a stale value. 
+ self.file_size_histogram = None; + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use rstest::rstest; + + use super::*; + use crate::actions::{DomainMetadata, Metadata, Protocol}; + use crate::crc::FileSizeHistogram; + + fn base_crc() -> Crc { + Crc { + table_size_bytes: 1000, + num_files: 10, + num_metadata: 1, + num_protocol: 1, + ..Default::default() + } + } + + fn write_delta(net_files: i64, net_bytes: i64) -> CrcDelta { + CrcDelta { + file_stats: FileStatsDelta { + net_files, + net_bytes, + ..Default::default() + }, + operation: Some("WRITE".to_string()), + ..Default::default() + } + } + + // ===== is_incremental_safe tests ===== + + #[test] + fn test_incremental_safe_operations() { + for op in [ + "WRITE", + "MERGE", + "UPDATE", + "DELETE", + "OPTIMIZE", + "CREATE TABLE", + "REPLACE TABLE", + "CREATE TABLE AS SELECT", + "REPLACE TABLE AS SELECT", + "CREATE OR REPLACE TABLE AS SELECT", + ] { + assert!( + FileStatsDelta::is_incremental_safe(op), + "{op} should be incremental-safe" + ); + } + } + + #[test] + fn test_non_incremental_safe_operations() { + assert!(!FileStatsDelta::is_incremental_safe("ANALYZE STATS")); + assert!(!FileStatsDelta::is_incremental_safe("UNKNOWN")); + } + + // ===== Crc deserialized from CRC file (default validity) ===== + + #[test] + fn test_deserialized_crc_has_valid_stats() { + let crc = base_crc(); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Valid); + assert_eq!(crc.num_files, 10); + assert_eq!(crc.table_size_bytes, 1000); + } + + // ===== Crc::apply tests ===== + + #[test] + fn test_apply_updates_file_stats() { + let mut crc = base_crc(); + crc.apply(write_delta(3, 600)); + assert_eq!(crc.num_files, 13); // 10 + 3 + assert_eq!(crc.table_size_bytes, 1600); // 1000 + 600 + assert_eq!(crc.file_stats_validity, FileStatsValidity::Valid); + } + + /// Applies multiple commit deltas sequentially. + #[test] + fn test_apply_multiple_deltas() { + let mut crc = base_crc(); + crc.apply(write_delta(3, 600)); + crc.apply(write_delta(-2, -400)); + assert_eq!(crc.num_files, 11); // 10 + 3 - 2 + assert_eq!(crc.table_size_bytes, 1200); // 1000 + 600 - 400 + assert_eq!(crc.file_stats_validity, FileStatsValidity::Valid); + } + + #[test] + fn test_apply_unsafe_op_transitions_to_indeterminate() { + let mut crc = base_crc(); + let unsafe_change = CrcDelta { + operation: Some("ANALYZE STATS".to_string()), + ..write_delta(1, 100) + }; + crc.apply(unsafe_change); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Indeterminate); + } + + #[test] + fn test_apply_none_op_transitions_to_indeterminate() { + let mut crc = base_crc(); + let unknown_delta = CrcDelta { + operation: None, + ..write_delta(1, 100) + }; + crc.apply(unknown_delta); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Indeterminate); + } + + #[test] + fn test_indeterminate_stays_indeterminate() { + let mut crc = base_crc(); + let unsafe_change = CrcDelta { + operation: Some("ANALYZE STATS".to_string()), + ..write_delta(1, 100) + }; + crc.apply(unsafe_change); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Indeterminate); + + // Subsequent safe op doesn't recover validity. 
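+ // File stats freeze as well: apply() returns before touching num_files or table_size_bytes
+ // once validity is Indeterminate.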
+ crc.apply(write_delta(5, 500)); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Indeterminate); + } + + // ===== apply: Untrackable (missing file size) tests ===== + + #[test] + fn test_missing_file_size_transitions_to_untrackable() { + let mut crc = base_crc(); + let delta = CrcDelta { + has_missing_file_size: true, + ..write_delta(1, 100) + }; + crc.apply(delta); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Untrackable); + } + + #[test] + fn test_untrackable_stays_untrackable() { + let mut crc = base_crc(); + let delta = CrcDelta { + has_missing_file_size: true, + ..write_delta(1, 100) + }; + crc.apply(delta); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Untrackable); + + // Applying a safe delta does not recover from Untrackable. + crc.apply(write_delta(5, 500)); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Untrackable); + + // Applying an unsafe delta also stays Untrackable (does not downgrade to Indeterminate). + crc.apply(CrcDelta { + operation: None, + ..write_delta(1, 100) + }); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Untrackable); + } + + #[test] + fn test_indeterminate_transitions_to_untrackable_on_missing_size() { + let mut crc = base_crc(); + let unsafe_change = CrcDelta { + operation: Some("ANALYZE STATS".to_string()), + ..write_delta(1, 100) + }; + crc.apply(unsafe_change); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Indeterminate); + + // Missing size escalates Indeterminate to Untrackable. + let delta = CrcDelta { + has_missing_file_size: true, + ..write_delta(1, 100) + }; + crc.apply(delta); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Untrackable); + } + + // ===== apply: non-file-stats field updates ===== + + #[test] + fn test_apply_replaces_protocol() { + let mut crc = base_crc(); + let new_protocol = Protocol::try_new( + 2, + 5, + None::>, + None::>, + ) + .unwrap(); + let delta = CrcDelta { + protocol: Some(new_protocol.clone()), + ..write_delta(0, 0) + }; + crc.apply(delta); + assert_eq!(crc.protocol, new_protocol); + assert_eq!(crc.metadata, Metadata::default()); // unchanged + } + + #[test] + fn test_apply_adds_domain_metadata_to_tracked_map() { + let mut crc = base_crc(); + crc.domain_metadata = Some(HashMap::new()); + let dm = DomainMetadata::new("my.domain".to_string(), "config1".to_string()); + let delta = CrcDelta { + domain_metadata_changes: vec![dm], + ..write_delta(0, 0) + }; + crc.apply(delta); + + let map = crc.domain_metadata.as_ref().unwrap(); + assert_eq!(map.len(), 1); + assert_eq!(map["my.domain"].configuration(), "config1"); + } + + #[test] + fn test_apply_with_untracked_domain_metadata_skips_changes() { + let mut crc = base_crc(); + assert!(crc.domain_metadata.is_none()); // Not tracked (default) + let dm = DomainMetadata::new("my.domain".to_string(), "config1".to_string()); + let delta = CrcDelta { + domain_metadata_changes: vec![dm], + ..write_delta(0, 0) + }; + crc.apply(delta); + + // domain_metadata stays None -- apply() must not create a partial map. 
+ assert!(crc.domain_metadata.is_none()); + } + + #[test] + fn test_apply_upserts_domain_metadata() { + let mut crc = base_crc(); + crc.domain_metadata = Some(HashMap::from([( + "my.domain".to_string(), + DomainMetadata::new("my.domain".to_string(), "old_config".to_string()), + )])); + + let dm = DomainMetadata::new("my.domain".to_string(), "new_config".to_string()); + let delta = CrcDelta { + domain_metadata_changes: vec![dm], + ..write_delta(0, 0) + }; + crc.apply(delta); + + let map = crc.domain_metadata.as_ref().unwrap(); + assert_eq!(map.len(), 1); + assert_eq!(map["my.domain"].configuration(), "new_config"); + } + + #[test] + fn test_apply_removes_domain_metadata() { + let mut crc = base_crc(); + crc.domain_metadata = Some(HashMap::from([( + "my.domain".to_string(), + DomainMetadata::new("my.domain".to_string(), "config1".to_string()), + )])); + + let dm = DomainMetadata::remove("my.domain".to_string(), "config1".to_string()); + let delta = CrcDelta { + domain_metadata_changes: vec![dm], + ..write_delta(0, 0) + }; + crc.apply(delta); + + let map = crc.domain_metadata.as_ref().unwrap(); + assert!(map.is_empty()); + } + + #[test] + fn test_apply_replaces_in_commit_timestamp() { + let mut crc = base_crc(); + let delta = CrcDelta { + in_commit_timestamp: Some(9999), + ..write_delta(0, 0) + }; + crc.apply(delta); + assert_eq!(crc.in_commit_timestamp_opt, Some(9999)); + } + + #[test] + fn test_apply_clears_in_commit_timestamp_when_ict_disabled() { + let mut crc = base_crc(); + crc.in_commit_timestamp_opt = Some(1000); + + // Delta without ICT (e.g. ICT was disabled) clears the previous value. + let delta = CrcDelta { + in_commit_timestamp: None, + ..write_delta(0, 0) + }; + crc.apply(delta); + assert_eq!(crc.in_commit_timestamp_opt, None); + } + + // ===== CrcDelta::into_crc_for_version_zero tests ===== + + fn test_protocol() -> Protocol { + Protocol::try_new( + 1, + 2, + None::>, + None::>, + ) + .unwrap() + } + + #[test] + fn test_into_crc_for_version_zero_with_protocol_and_metadata() { + let protocol = test_protocol(); + let metadata = Metadata::default(); + let delta = CrcDelta { + protocol: Some(protocol.clone()), + metadata: Some(metadata.clone()), + ..write_delta(5, 1000) + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + assert_eq!(crc.protocol, protocol); + assert_eq!(crc.metadata, metadata); + assert_eq!(crc.num_files, 5); + assert_eq!(crc.table_size_bytes, 1000); + assert_eq!(crc.num_metadata, 1); + assert_eq!(crc.num_protocol, 1); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Valid); + assert_eq!(crc.domain_metadata, Some(HashMap::new())); + assert_eq!(crc.in_commit_timestamp_opt, None); + } + + #[test] + fn test_into_crc_for_version_zero_returns_none_without_protocol() { + let delta = CrcDelta { + metadata: Some(Metadata::default()), + ..write_delta(5, 1000) + }; + assert!(delta.into_crc_for_version_zero().is_none()); + } + + #[test] + fn test_into_crc_for_version_zero_returns_none_without_metadata() { + let delta = CrcDelta { + protocol: Some(test_protocol()), + ..write_delta(5, 1000) + }; + assert!(delta.into_crc_for_version_zero().is_none()); + } + + #[test] + fn test_into_crc_for_version_zero_with_domain_metadata() { + let dm = DomainMetadata::new("my.domain".to_string(), "config1".to_string()); + let delta = CrcDelta { + protocol: Some(test_protocol()), + metadata: Some(Metadata::default()), + domain_metadata_changes: vec![dm], + ..write_delta(0, 0) + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + let map = 
crc.domain_metadata.as_ref().unwrap(); + assert_eq!(map.len(), 1); + assert_eq!(map["my.domain"].configuration(), "config1"); + } + + #[test] + fn test_into_crc_for_version_zero_with_in_commit_timestamp() { + let delta = CrcDelta { + protocol: Some(test_protocol()), + metadata: Some(Metadata::default()), + in_commit_timestamp: Some(12345), + ..write_delta(0, 0) + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + assert_eq!(crc.in_commit_timestamp_opt, Some(12345)); + } + + // ===== apply: set transaction tests ===== + + #[test] + fn test_apply_adds_set_transaction_to_tracked_map() { + let mut crc = base_crc(); + crc.set_transactions = Some(HashMap::new()); + let txn = SetTransaction::new("my-app".to_string(), 1, Some(1000)); + let delta = CrcDelta { + set_transaction_changes: vec![txn], + ..write_delta(0, 0) + }; + crc.apply(delta); + + let map = crc.set_transactions.as_ref().unwrap(); + assert_eq!(map.len(), 1); + assert_eq!(map["my-app"].version, 1); + assert_eq!(map["my-app"].last_updated, Some(1000)); + } + + #[test] + fn test_apply_with_untracked_set_transactions_skips_changes() { + let mut crc = base_crc(); + assert!(crc.set_transactions.is_none()); // Not tracked (default) + let txn = SetTransaction::new("my-app".to_string(), 1, Some(1000)); + let delta = CrcDelta { + set_transaction_changes: vec![txn], + ..write_delta(0, 0) + }; + crc.apply(delta); + + // set_transactions stays None -- apply() must not create a partial map. + assert!(crc.set_transactions.is_none()); + } + + #[test] + fn test_apply_upserts_set_transaction() { + let mut crc = base_crc(); + crc.set_transactions = Some(HashMap::from([( + "my-app".to_string(), + SetTransaction::new("my-app".to_string(), 1, Some(1000)), + )])); + + let txn = SetTransaction::new("my-app".to_string(), 2, Some(2000)); + let delta = CrcDelta { + set_transaction_changes: vec![txn], + ..write_delta(0, 0) + }; + crc.apply(delta); + + let map = crc.set_transactions.as_ref().unwrap(); + assert_eq!(map.len(), 1); + assert_eq!(map["my-app"].version, 2); + assert_eq!(map["my-app"].last_updated, Some(2000)); + } + + // ===== into_crc_for_version_zero: set transaction tests ===== + + #[test] + fn test_into_crc_for_version_zero_with_set_transactions() { + let txn = SetTransaction::new("my-app".to_string(), 5, Some(3000)); + let delta = CrcDelta { + protocol: Some(test_protocol()), + metadata: Some(Metadata::default()), + set_transaction_changes: vec![txn], + ..write_delta(0, 0) + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + let map = crc.set_transactions.as_ref().unwrap(); + assert_eq!(map.len(), 1); + assert_eq!(map["my-app"].version, 5); + assert_eq!(map["my-app"].last_updated, Some(3000)); + } + + #[test] + fn test_into_crc_for_version_zero_with_no_set_transactions() { + let delta = CrcDelta { + protocol: Some(test_protocol()), + metadata: Some(Metadata::default()), + ..write_delta(0, 0) + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + // Empty map, not None -- we always know the full state at version zero. + assert_eq!(crc.set_transactions, Some(HashMap::new())); + } + + // ===== Histogram tests ===== + + /// Helper: creates a default-boundary histogram populated with the given file sizes. + fn histogram_from_sizes(sizes: &[i64]) -> FileSizeHistogram { + let mut hist = FileSizeHistogram::create_default(); + for &size in sizes { + hist.insert(size).unwrap(); + } + hist + } + + /// Helper: creates a CRC with a histogram containing the given file sizes. 
+ fn base_crc_with_histogram(file_sizes: &[i64]) -> Crc { + let hist = histogram_from_sizes(file_sizes); + Crc { + table_size_bytes: file_sizes.iter().sum(), + num_files: file_sizes.len() as i64, + num_metadata: 1, + num_protocol: 1, + file_size_histogram: Some(hist), + ..Default::default() + } + } + + /// Helper: creates a CrcDelta with a delta histogram built from adds and removes. + fn write_delta_with_histograms(add_sizes: &[i64], remove_sizes: &[i64]) -> CrcDelta { + let mut hist = FileSizeHistogram::create_default(); + for &s in add_sizes { + hist.insert(s).unwrap(); + } + for &s in remove_sizes { + hist.remove(s).unwrap(); + } + let net_files = add_sizes.len() as i64 - remove_sizes.len() as i64; + let net_bytes: i64 = add_sizes.iter().sum::() - remove_sizes.iter().sum::(); + CrcDelta { + file_stats: FileStatsDelta { + net_files, + net_bytes, + net_histogram: Some(hist), + }, + operation: Some("WRITE".to_string()), + ..Default::default() + } + } + + /// Histogram bins used in tests (default boundaries): + /// Bin 0: [0, 8KB) -- e.g. 100, 200, 300, 500 + /// Bin 1: [8KB, 16KB) -- e.g. 10_000 + /// Bin 2: [16KB, 32KB) -- e.g. 20_000 + /// Bin 10: [4MB, 8MB) -- e.g. 5_000_000 + #[rstest] + #[case::single_bin(&[100, 200, 300], &[500], &[200], &[(0, 3, 900)])] + #[case::adds_only(&[100], &[200, 300], &[], &[(0, 3, 600)])] + #[case::removes_only(&[100, 200, 300], &[], &[100, 200], &[(0, 1, 300)])] + #[case::empty_delta(&[100, 10_000], &[], &[], &[(0, 1, 100), (1, 1, 10_000)])] + #[case::multi_bin( + &[100, 10_000, 20_000], + &[200, 10_500], + &[100, 20_000], + &[(0, 1, 200), (1, 2, 20_500), (2, 0, 0)] + )] + #[case::large_files( + &[100, 5_000_000], + &[10_000, 5_500_000], + &[100], + &[(0, 0, 0), (1, 1, 10_000), (10, 2, 10_500_000)] + )] + fn apply_merges_histogram( + #[case] base: &[i64], + #[case] add: &[i64], + #[case] remove: &[i64], + #[case] expected_bins: &[(usize, i64, i64)], + ) { + let mut crc = base_crc_with_histogram(base); + let delta = write_delta_with_histograms(add, remove); + crc.apply(delta); + + let hist = crc.file_size_histogram.as_ref().unwrap(); + for &(bin, count, bytes) in expected_bins { + assert_eq!(hist.file_counts[bin], count, "file_counts[{bin}]"); + assert_eq!(hist.total_bytes[bin], bytes, "total_bytes[{bin}]"); + } + } + + #[rstest] + #[case::base_none_delta_none(None)] + #[case::base_some_delta_none(Some(vec![100i64, 200]))] + fn apply_drops_histogram_when_delta_missing_histogram(#[case] base_files: Option>) { + let mut crc = match &base_files { + Some(sizes) => base_crc_with_histogram(sizes), + None => base_crc(), + }; + let delta = CrcDelta { + file_stats: FileStatsDelta { + net_files: 1, + net_bytes: 100, + net_histogram: None, + }, + operation: Some("WRITE".to_string()), + ..Default::default() + }; + crc.apply(delta); + assert!( + crc.file_size_histogram.is_none(), + "histogram should be None when delta doesn't provide a histogram" + ); + } + + #[test] + fn apply_drops_histogram_on_indeterminate() { + let mut crc = base_crc_with_histogram(&[100, 200]); + let unsafe_delta = CrcDelta { + operation: Some("ANALYZE STATS".to_string()), + ..write_delta(1, 100) + }; + crc.apply(unsafe_delta); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Indeterminate); + assert!(crc.file_size_histogram.is_none()); + } + + #[test] + fn apply_drops_histogram_on_untrackable() { + let mut crc = base_crc_with_histogram(&[100, 200]); + // A missing file size makes byte-level stats impossible, so the histogram is dropped. 
+ let delta = CrcDelta { + has_missing_file_size: true, + ..write_delta(1, 100) + }; + crc.apply(delta); + assert_eq!(crc.file_stats_validity, FileStatsValidity::Untrackable); + assert!(crc.file_size_histogram.is_none()); + } + + #[test] + fn into_crc_for_version_zero_includes_histogram() { + let delta_hist = histogram_from_sizes(&[500, 1000]); + let delta = CrcDelta { + protocol: Some(test_protocol()), + metadata: Some(Metadata::default()), + file_stats: FileStatsDelta { + net_files: 2, + net_bytes: 1500, + net_histogram: Some(delta_hist), + }, + operation: Some("WRITE".to_string()), + ..Default::default() + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + let hist = crc.file_size_histogram.as_ref().unwrap(); + assert_eq!(hist.file_counts[0], 2); + assert_eq!(hist.total_bytes[0], 1500); + } + + #[test] + fn into_crc_for_version_zero_without_histogram() { + // write_delta() produces a CrcDelta with no histogram delta, so + // into_crc_for_version_zero cannot construct a file size histogram. + let delta = CrcDelta { + protocol: Some(test_protocol()), + metadata: Some(Metadata::default()), + ..write_delta(0, 0) + }; + let crc = delta.into_crc_for_version_zero().unwrap(); + assert!(crc.file_size_histogram.is_none()); + } + + #[test] + fn apply_merges_histogram_with_non_default_boundaries() { + // Base CRC with custom 3-bin histogram: [0, 200) [200, 1000) [1000, inf) + let boundaries = vec![0, 200, 1000]; + let base_hist = FileSizeHistogram::try_new( + boundaries.clone(), + vec![2, 1, 0], // 2 files in bin 0, 1 in bin 1 + vec![300, 500, 0], + ) + .unwrap(); + let mut crc = Crc { + table_size_bytes: 800, + num_files: 3, + num_metadata: 1, + num_protocol: 1, + file_size_histogram: Some(base_hist), + ..Default::default() + }; + + // Delta with matching non-default boundaries: +100 and +1500, -150 + let mut delta_hist = FileSizeHistogram::create_empty_with_boundaries(boundaries).unwrap(); + delta_hist.insert(100).unwrap(); // bin 0 + delta_hist.insert(1500).unwrap(); // bin 2 + delta_hist.remove(150).unwrap(); // bin 0 + + let delta = CrcDelta { + file_stats: FileStatsDelta { + net_files: 1, // +2 - 1 + net_bytes: 1450, // (100 + 1500) - 150 + net_histogram: Some(delta_hist), + }, + operation: Some("WRITE".to_string()), + ..Default::default() + }; + + crc.apply(delta); + + // Histogram should be preserved (boundaries match) + let hist = crc.file_size_histogram.as_ref().unwrap(); + assert_eq!(hist.sorted_bin_boundaries, vec![0, 200, 1000]); + assert_eq!(hist.file_counts, vec![2, 1, 1]); // (2+1-1), (1+0-0), (0+1-0) + assert_eq!(hist.total_bytes, vec![250, 500, 1500]); // (300+100-150), (500+0-0), (0+1500-0) + assert_eq!(crc.num_files, 4); + assert_eq!(crc.table_size_bytes, 2250); + } +} diff --git a/kernel/src/crc/file_size_histogram.rs b/kernel/src/crc/file_size_histogram.rs new file mode 100644 index 0000000000..70f37f6480 --- /dev/null +++ b/kernel/src/crc/file_size_histogram.rs @@ -0,0 +1,464 @@ +//! [`FileSizeHistogram`] tracks the distribution of file sizes across predefined bins. +//! +//! Used in CRC (version checksum) files to record the size distribution of active files in a +//! table version. Supports incremental updates via [`insert`](FileSizeHistogram::insert) and +//! [`remove`](FileSizeHistogram::remove), and delta merging via +//! [`try_apply_delta`](FileSizeHistogram::try_apply_delta). +//! +//! 
[FileSizeHistogram]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-size-histogram-schema + +use serde::{Deserialize, Serialize}; + +use crate::utils::require; +use crate::{DeltaResult, Error}; + +const KB: i64 = 1024; +const MB: i64 = KB * 1024; +const GB: i64 = MB * 1024; + +/// Default bin boundaries for file size categorization, matching Delta Kernel Java. +/// +/// 95 boundaries covering: +/// - 0 and powers of 2 from 8KB to 4MB +/// - 4MB steps from 8MB to 40MB +/// - 8MB steps from 48MB to 120MB +/// - 4MB steps from 124MB to 144MB (fine granularity around 128MB target file size) +/// - 16MB steps from 160MB to 576MB +/// - 64MB steps from 640MB to 1408MB +/// - 128MB steps from 1536MB to 2GB +/// - 256MB steps from 2304MB to 4GB +/// - Powers of 2 from 8GB to 256GB +#[rustfmt::skip] +const DEFAULT_BIN_BOUNDARIES: [i64; 95] = [ + 0, + // Powers of 2 from 8KB to 4MB + 8 * KB, 16 * KB, 32 * KB, 64 * KB, 128 * KB, 256 * KB, 512 * KB, + MB, 2 * MB, 4 * MB, + // 4MB steps from 8MB to 40MB + 8 * MB, 12 * MB, 16 * MB, 20 * MB, 24 * MB, 28 * MB, 32 * MB, 36 * MB, 40 * MB, + // 8MB steps from 48MB to 120MB + 48 * MB, 56 * MB, 64 * MB, 72 * MB, 80 * MB, 88 * MB, 96 * MB, 104 * MB, 112 * MB, 120 * MB, + // 4MB steps from 124MB to 144MB + 124 * MB, 128 * MB, 132 * MB, 136 * MB, 140 * MB, 144 * MB, + // 16MB steps from 160MB to 576MB + 160 * MB, 176 * MB, 192 * MB, 208 * MB, 224 * MB, 240 * MB, 256 * MB, 272 * MB, + 288 * MB, 304 * MB, 320 * MB, 336 * MB, 352 * MB, 368 * MB, 384 * MB, 400 * MB, + 416 * MB, 432 * MB, 448 * MB, 464 * MB, 480 * MB, 496 * MB, 512 * MB, 528 * MB, + 544 * MB, 560 * MB, 576 * MB, + // 64MB steps from 640MB to 1408MB + 640 * MB, 704 * MB, 768 * MB, 832 * MB, 896 * MB, 960 * MB, 1024 * MB, 1088 * MB, + 1152 * MB, 1216 * MB, 1280 * MB, 1344 * MB, 1408 * MB, + // 128MB steps from 1536MB to 2GB + 1536 * MB, 1664 * MB, 1792 * MB, 1920 * MB, 2048 * MB, + // 256MB steps from 2304MB to 4GB + 2304 * MB, 2560 * MB, 2816 * MB, 3072 * MB, 3328 * MB, 3584 * MB, 3840 * MB, 4096 * MB, + // Powers of 2 from 8GB to 256GB + 8 * GB, 16 * GB, 32 * GB, 64 * GB, 128 * GB, 256 * GB, +]; + +/// Tracks the distribution of file sizes across predefined bins. +/// +/// Each bin `i` covers the range `[sorted_bin_boundaries[i], sorted_bin_boundaries[i+1])`, +/// with the last bin extending to infinity. The histogram records both the count of files +/// and the total bytes in each bin. +/// +/// See the [Delta protocol spec] for the full schema definition. +/// +/// [Delta protocol spec]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#file-size-histogram-schema +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct FileSizeHistogram { + /// A sorted array of bin boundaries where each element represents the start of a bin + /// (inclusive) and the next element represents the end of the bin (exclusive). The first + /// element must be 0. + pub(crate) sorted_bin_boundaries: Vec, + /// Count of files in each bin. Length must match `sorted_bin_boundaries`. + pub(crate) file_counts: Vec, + /// Total bytes of files in each bin. Length must match `sorted_bin_boundaries`. + pub(crate) total_bytes: Vec, +} + +impl FileSizeHistogram { + /// Creates a new histogram with the given arrays, after validation. 
+ /// + /// Validates that: + /// - All arrays have the same length (>= 2) + /// - The first boundary is 0 + /// - Boundaries are sorted in ascending order + pub(crate) fn try_new( + sorted_bin_boundaries: Vec, + file_counts: Vec, + total_bytes: Vec, + ) -> DeltaResult { + require!( + sorted_bin_boundaries.len() >= 2, + Error::internal_error(format!( + "sorted_bin_boundaries must have at least 2 elements, got {}", + sorted_bin_boundaries.len() + )) + ); + require!( + sorted_bin_boundaries[0] == 0, + Error::internal_error(format!( + "First boundary must be 0, got {}", + sorted_bin_boundaries[0] + )) + ); + require!( + sorted_bin_boundaries.len() == file_counts.len() + && sorted_bin_boundaries.len() == total_bytes.len(), + Error::internal_error(format!( + "All arrays must have the same length: boundaries={}, file_counts={}, total_bytes={}", + sorted_bin_boundaries.len(), + file_counts.len(), + total_bytes.len() + )) + ); + require!( + sorted_bin_boundaries.windows(2).all(|w| w[0] < w[1]), + Error::internal_error( + "sorted_bin_boundaries must be sorted in strictly ascending order" + ) + ); + Ok(Self { + sorted_bin_boundaries, + file_counts, + total_bytes, + }) + } + + /// Creates an empty histogram with the given bin boundaries and zero counts/bytes. + /// + /// Used when a previous CRC has non-default boundaries and we need to build delta + /// histograms that match, so that `try_apply_delta` succeeds during merge. + pub(crate) fn create_empty_with_boundaries( + sorted_bin_boundaries: Vec, + ) -> DeltaResult { + let len = sorted_bin_boundaries.len(); + Self::try_new(sorted_bin_boundaries, vec![0; len], vec![0; len]) + } + + /// Creates a default histogram with the standard 95 bin boundaries and zero counts. + /// + /// Uses the same bin boundaries as Delta Kernel Java for cross-implementation compatibility. + pub(crate) fn create_default() -> Self { + let len = DEFAULT_BIN_BOUNDARIES.len(); + Self { + sorted_bin_boundaries: DEFAULT_BIN_BOUNDARIES.to_vec(), + file_counts: vec![0; len], + total_bytes: vec![0; len], + } + } + + /// Returns the bin index for the given file size via binary search. + /// + /// The bin index is the largest `i` such that `sorted_bin_boundaries[i] <= file_size`. + /// Files larger than the maximum boundary are placed in the last bin. + fn get_bin_index(&self, file_size: i64) -> usize { + debug_assert!(file_size >= 0); + match self.sorted_bin_boundaries.binary_search(&file_size) { + Ok(idx) => idx, + // binary_search returns Err(insertion_point) where insertion_point is where the + // value would be inserted. We want the bin before that. Since boundaries[0] = 0 + // and file_size >= 0, insertion_point >= 1, so subtraction is safe. + Err(insertion_point) => insertion_point - 1, + } + } + + /// Adds a file of the given size to the histogram, incrementing the appropriate bin's + /// file count and total bytes. + pub(crate) fn insert(&mut self, file_size: i64) -> DeltaResult<()> { + require!( + file_size >= 0, + Error::internal_error(format!("File size must be non-negative, got {}", file_size)) + ); + let idx = self.get_bin_index(file_size); + self.file_counts[idx] += 1; + self.total_bytes[idx] += file_size; + Ok(()) + } + + /// Removes a file of the given size from the histogram, decrementing the appropriate bin's + /// file count by 1 and total bytes by `file_size`. + /// + /// Does not validate that the bin remains non-negative, since this is used to build delta + /// histograms where removes may exceed adds in a given bin. 
+ pub(crate) fn remove(&mut self, file_size: i64) -> DeltaResult<()> { + require!( + file_size >= 0, + Error::internal_error(format!("File size must be non-negative, got {}", file_size)) + ); + let idx = self.get_bin_index(file_size); + self.file_counts[idx] -= 1; + self.total_bytes[idx] -= file_size; + Ok(()) + } + + /// Applies a delta histogram element-wise to this histogram. Both must have the same bin + /// boundaries. + /// + /// The delta may contain negative values (more files removed than added in a bin). + /// Returns a new histogram whose file counts and total bytes are the sum of the two inputs. + /// Returns an error if any resulting bin would have negative file counts or total bytes. + pub(crate) fn try_apply_delta( + &self, + delta: &FileSizeHistogram, + ) -> DeltaResult { + require!( + self.sorted_bin_boundaries == delta.sorted_bin_boundaries, + Error::internal_error("Cannot add histograms with different bin boundaries") + ); + let len = self.sorted_bin_boundaries.len(); + let mut file_counts = Vec::with_capacity(len); + let mut total_bytes = Vec::with_capacity(len); + for i in 0..len { + let count = self.file_counts[i] + delta.file_counts[i]; + let bytes = self.total_bytes[i] + delta.total_bytes[i]; + require!( + count >= 0 && bytes >= 0, + Error::internal_error(format!( + "Merge would result in negative counts or bytes at bin {}", + i + )) + ); + file_counts.push(count); + total_bytes.push(bytes); + } + Ok(FileSizeHistogram { + sorted_bin_boundaries: self.sorted_bin_boundaries.clone(), + file_counts, + total_bytes, + }) + } + + /// Checks that all bins have non-negative file counts and total bytes. + /// + /// Returns `Ok(self)` if valid, or an error indicating the first bin that is negative. + /// Used to validate a delta histogram before using it as an absolute histogram (e.g. for + /// version zero where the delta represents the full table state). 
+ pub(crate) fn check_non_negative(self) -> DeltaResult { + for i in 0..self.sorted_bin_boundaries.len() { + require!( + self.file_counts[i] >= 0 && self.total_bytes[i] >= 0, + Error::internal_error(format!( + "Histogram has negative counts or bytes at bin {}", + i + )) + ); + } + Ok(self) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use rstest::rstest; + use test_utils::assert_result_error_with_message; + + // ===== Construction ===== + + #[test] + fn create_default_has_95_bins_starting_at_zero() { + let hist = FileSizeHistogram::create_default(); + assert_eq!(hist.sorted_bin_boundaries.len(), 95); + assert_eq!(hist.file_counts.len(), 95); + assert_eq!(hist.total_bytes.len(), 95); + assert_eq!(hist.sorted_bin_boundaries[0], 0); + assert!(hist.file_counts.iter().all(|&c| c == 0)); + assert!(hist.total_bytes.iter().all(|&b| b == 0)); + } + + #[test] + fn create_empty_with_boundaries_produces_zeroed_histogram() { + let hist = FileSizeHistogram::create_empty_with_boundaries(vec![0, 100, 1000]).unwrap(); + assert_eq!(hist.sorted_bin_boundaries, vec![0, 100, 1000]); + assert_eq!(hist.file_counts, vec![0, 0, 0]); + assert_eq!(hist.total_bytes, vec![0, 0, 0]); + } + + #[test] + fn try_new_valid_histogram() { + let hist = FileSizeHistogram::try_new(vec![0, 100], vec![5, 3], vec![200, 900]).unwrap(); + assert_eq!(hist.sorted_bin_boundaries, vec![0, 100]); + assert_eq!(hist.file_counts, vec![5, 3]); + assert_eq!(hist.total_bytes, vec![200, 900]); + } + + #[rstest] + #[case::empty_boundaries(vec![], vec![], vec![], "at least 2 elements")] + #[case::single_boundary(vec![0], vec![0], vec![0], "at least 2 elements")] + #[case::nonzero_first_boundary(vec![1, 100], vec![0, 0], vec![0, 0], "First boundary must be 0")] + #[case::mismatched_array_lengths(vec![0, 100], vec![0], vec![0, 0], "same length")] + #[case::unsorted_boundaries(vec![0, 200, 100], vec![0, 0, 0], vec![0, 0, 0], "strictly ascending")] + #[case::duplicate_boundaries(vec![0, 100, 100], vec![0, 0, 0], vec![0, 0, 0], "strictly ascending")] + fn try_new_rejects_invalid_inputs( + #[case] boundaries: Vec, + #[case] file_counts: Vec, + #[case] total_bytes: Vec, + #[case] expected_msg: &str, + ) { + assert_result_error_with_message( + FileSizeHistogram::try_new(boundaries, file_counts, total_bytes), + expected_msg, + ); + } + + // ===== Binary search ===== + + #[test] + fn get_bin_index_exact_boundary_match() { + let hist = FileSizeHistogram::try_new(vec![0, 100, 200], vec![0; 3], vec![0; 3]).unwrap(); + assert_eq!(hist.get_bin_index(0), 0); + assert_eq!(hist.get_bin_index(100), 1); + assert_eq!(hist.get_bin_index(200), 2); + } + + #[test] + fn get_bin_index_between_boundaries() { + let hist = FileSizeHistogram::try_new(vec![0, 100, 200], vec![0; 3], vec![0; 3]).unwrap(); + assert_eq!(hist.get_bin_index(50), 0); + assert_eq!(hist.get_bin_index(99), 0); + assert_eq!(hist.get_bin_index(150), 1); + assert_eq!(hist.get_bin_index(199), 1); + } + + #[test] + fn get_bin_index_beyond_max_boundary_returns_last_bin() { + let hist = FileSizeHistogram::try_new(vec![0, 100, 200], vec![0; 3], vec![0; 3]).unwrap(); + assert_eq!(hist.get_bin_index(999), 2); + assert_eq!(hist.get_bin_index(i64::MAX), 2); + } + + // ===== Insert / Remove ===== + + #[test] + fn insert_increments_count_and_bytes() { + let mut hist = + FileSizeHistogram::try_new(vec![0, 100, 200], vec![0; 3], vec![0; 3]).unwrap(); + hist.insert(50).unwrap(); + hist.insert(75).unwrap(); + hist.insert(150).unwrap(); + assert_eq!(hist.file_counts, vec![2, 1, 0]); + assert_eq!(hist.total_bytes, 
vec![125, 150, 0]); + } + + #[test] + fn insert_negative_size_returns_error() { + let mut hist = FileSizeHistogram::create_default(); + assert_result_error_with_message(hist.insert(-1), "non-negative"); + } + + #[test] + fn remove_decrements_count_and_bytes() { + let mut hist = + FileSizeHistogram::try_new(vec![0, 100, 200], vec![2, 1, 0], vec![125, 150, 0]) + .unwrap(); + hist.remove(50).unwrap(); + assert_eq!(hist.file_counts, vec![1, 1, 0]); + assert_eq!(hist.total_bytes, vec![75, 150, 0]); + } + + #[test] + fn remove_allows_negative_bin_values() { + let mut hist = FileSizeHistogram::try_new(vec![0, 100], vec![0, 0], vec![0, 0]).unwrap(); + hist.remove(50).unwrap(); + assert_eq!(hist.file_counts, vec![-1, 0]); + assert_eq!(hist.total_bytes, vec![-50, 0]); + } + + #[test] + fn remove_negative_size_returns_error() { + let mut hist = FileSizeHistogram::create_default(); + assert_result_error_with_message(hist.remove(-1), "non-negative"); + } + + // ===== try_apply_delta ===== + + #[test] + fn try_apply_delta_combines_counts_and_bytes() { + let base = FileSizeHistogram::try_new(vec![0, 100], vec![2, 3], vec![50, 400]).unwrap(); + let delta = FileSizeHistogram::try_new(vec![0, 100], vec![1, 4], vec![30, 600]).unwrap(); + let result = base.try_apply_delta(&delta).unwrap(); + assert_eq!(result.file_counts, vec![3, 7]); + assert_eq!(result.total_bytes, vec![80, 1000]); + assert_eq!(result.sorted_bin_boundaries, vec![0, 100]); + } + + #[test] + fn try_apply_delta_with_negative_delta_succeeds() { + let base = FileSizeHistogram::try_new(vec![0, 100], vec![5, 8], vec![200, 900]).unwrap(); + let delta = + FileSizeHistogram::try_new(vec![0, 100], vec![-2, -3], vec![-50, -400]).unwrap(); + let result = base.try_apply_delta(&delta).unwrap(); + assert_eq!(result.file_counts, vec![3, 5]); + assert_eq!(result.total_bytes, vec![150, 500]); + } + + #[test] + fn try_apply_delta_mismatched_boundaries_returns_error() { + let a = FileSizeHistogram::try_new(vec![0, 100], vec![0; 2], vec![0; 2]).unwrap(); + let b = FileSizeHistogram::try_new(vec![0, 200], vec![0; 2], vec![0; 2]).unwrap(); + assert_result_error_with_message(a.try_apply_delta(&b), "different bin boundaries"); + } + + #[test] + fn try_apply_delta_rejects_negative_result_counts() { + let base = FileSizeHistogram::try_new(vec![0, 100], vec![1, 0], vec![50, 0]).unwrap(); + let delta = FileSizeHistogram::try_new(vec![0, 100], vec![-2, 0], vec![-50, 0]).unwrap(); + assert_result_error_with_message(base.try_apply_delta(&delta), "negative counts or bytes"); + } + + #[test] + fn try_apply_delta_rejects_negative_result_bytes() { + let base = FileSizeHistogram::try_new(vec![0, 100], vec![2, 0], vec![50, 0]).unwrap(); + let delta = FileSizeHistogram::try_new(vec![0, 100], vec![-1, 0], vec![-100, 0]).unwrap(); + assert_result_error_with_message(base.try_apply_delta(&delta), "negative counts or bytes"); + } + + // ===== Serde ===== + + #[test] + fn serde_round_trip_default_histogram() { + let hist = FileSizeHistogram::create_default(); + let json = serde_json::to_string(&hist).unwrap(); + let deserialized: FileSizeHistogram = serde_json::from_str(&json).unwrap(); + assert_eq!(hist, deserialized); + } + + #[test] + fn serde_round_trip_populated_histogram() { + let mut hist = FileSizeHistogram::create_default(); + hist.insert(500).unwrap(); + hist.insert(10_000_000).unwrap(); + let json = serde_json::to_string(&hist).unwrap(); + let deserialized: FileSizeHistogram = serde_json::from_str(&json).unwrap(); + assert_eq!(hist, deserialized); + } + + #[test] + fn 
serde_uses_camel_case_field_names() { + let hist = FileSizeHistogram::try_new(vec![0, 100], vec![1, 2], vec![10, 200]).unwrap(); + let json_value = serde_json::to_value(&hist).unwrap(); + assert!(json_value.get("sortedBinBoundaries").is_some()); + assert!(json_value.get("fileCounts").is_some()); + assert!(json_value.get("totalBytes").is_some()); + // Snake case should NOT be present + assert!(json_value.get("sorted_bin_boundaries").is_none()); + assert!(json_value.get("file_counts").is_none()); + assert!(json_value.get("total_bytes").is_none()); + } + + #[test] + fn serde_deserialize_from_crc_json_format() { + let json = r#"{ + "sortedBinBoundaries": [0, 8192, 16384], + "fileCounts": [10, 0, 0], + "totalBytes": [5259, 0, 0] + }"#; + let hist: FileSizeHistogram = serde_json::from_str(json).unwrap(); + assert_eq!(hist.sorted_bin_boundaries, vec![0, 8192, 16384]); + assert_eq!(hist.file_counts, vec![10, 0, 0]); + assert_eq!(hist.total_bytes, vec![5259, 0, 0]); + } +} diff --git a/kernel/src/crc/file_stats.rs b/kernel/src/crc/file_stats.rs new file mode 100644 index 0000000000..9328093d3f --- /dev/null +++ b/kernel/src/crc/file_stats.rs @@ -0,0 +1,365 @@ +//! File statistics and deltas for CRC tracking. +//! +//! [`FileStats`] represents absolute file-level statistics (count, size, histogram) for a table +//! version. [`FileStatsDelta`] captures the net changes from a single commit as a single delta +//! [`FileSizeHistogram`] (adds minus removes). +//! +//! [`FileStatsDelta`] captures how many files were added/removed and their total sizes. It can be +//! produced from either: +//! 1. In-memory transaction data via [`FileStatsDelta::try_compute_for_txn`] +//! 2. A parsed .json commit file + +use std::sync::LazyLock; + +use super::FileSizeHistogram; +use crate::engine_data::{FilteredEngineData, GetData, TypedGetData as _}; +use crate::schema::{ColumnName, ColumnNamesAndTypes, DataType}; +use crate::utils::require; +use crate::{DeltaResult, EngineData, Error, RowVisitor}; + +/// File-level statistics for a table version: total file count, size, and histogram. +/// +/// Obtained via [`Crc::file_stats()`](super::Crc::file_stats), which returns `None` when +/// the stats are not known to be valid. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileStats { + /// Number of active [`Add`](crate::actions::Add) file actions in this table version. + pub num_files: i64, + /// Total size of the table in bytes (sum of all active + /// [`Add`](crate::actions::Add) file sizes). + pub table_size_bytes: i64, + /// Size distribution of active files, if available. + pub file_size_histogram: Option, +} + +/// Net file count and size changes from a single commit, with an optional net histogram. +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub(crate) struct FileStatsDelta { + /// Net change in file count (files added minus files removed). + pub(crate) net_files: i64, + /// Net change in total bytes (bytes added minus bytes removed). + pub(crate) net_bytes: i64, + /// Net change in file size histogram (adds minus removes per bin). May contain negative + /// values in bins where more files were removed than added. `None` when the delta source + /// does not provide histogram data. + pub(crate) net_histogram: Option, +} + +impl FileStatsDelta { + /// Returns `true` if the given operation can be safely tracked by incremental file stats. + /// + /// Incremental-safe operations produce add/remove actions whose net counts give correct + /// file stats. Unknown or missing operations are treated as unsafe. 
For example, ANALYZE + /// STATS re-adds existing files with updated statistics -- if we naively counted those + /// adds, we'd double count file stats. + const INCREMENTAL_SAFE_OPS: &[&str] = &[ + "WRITE", + "MERGE", + "UPDATE", + "DELETE", + "OPTIMIZE", + "CREATE TABLE", + "REPLACE TABLE", + "CREATE TABLE AS SELECT", + "REPLACE TABLE AS SELECT", + "CREATE OR REPLACE TABLE AS SELECT", + ]; + + pub(crate) fn is_incremental_safe(operation: &str) -> bool { + Self::INCREMENTAL_SAFE_OPS.contains(&operation) + } + + /// Compute file stats and a delta histogram from a transaction's staged add and remove + /// metadata. + /// + /// A commit writes three kinds of file actions: + /// (1) Add actions (from `add_files_metadata`) + /// (2) Remove actions (from `remove_files_metadata`) + /// (3) DV update actions (which contain both a Remove and an Add for the same file at + /// the same size). + /// + /// Only the first two need visiting -- DV updates have a net-zero effect on file counts, + /// sizes, and histograms. + /// + /// `bin_boundaries` specifies the histogram bin boundaries to use. When `Some`, the + /// delta histogram is built with those boundaries (matching the previous CRC's histogram). + /// When `None`, the standard default boundaries are used. Callers should pass the previous + /// CRC's boundaries when available so that `try_apply_delta` in [`Crc::apply`] succeeds. + pub(crate) fn try_compute_for_txn( + add_files_metadata: &[Box], + remove_files_metadata: &[FilteredEngineData], + bin_boundaries: Option<&[i64]>, + ) -> DeltaResult { + let mut histogram = match bin_boundaries { + Some(b) => FileSizeHistogram::create_empty_with_boundaries(b.to_vec())?, + None => FileSizeHistogram::create_default(), + }; + let mut net_files = 0i64; + let mut net_bytes = 0i64; + + // Visit add files (insert into histogram). Every row is a file being added. + for batch in add_files_metadata { + let mut visitor = FileStatsVisitor::new(None, false, &mut histogram); + visitor.visit_rows_of(batch.as_ref())?; + net_files += visitor.count; + net_bytes += visitor.total_size; + } + + // Visit remove files (remove from histogram). Each FilteredEngineData has its own + // selection vector, so we create a visitor per batch. + for filtered_batch in remove_files_metadata { + let sv = filtered_batch.selection_vector(); + let sv_opt = if sv.is_empty() { None } else { Some(sv) }; + let mut visitor = FileStatsVisitor::new(sv_opt, true, &mut histogram); + visitor.visit_rows_of(filtered_batch.data())?; + net_files += visitor.count; + net_bytes += visitor.total_size; + } + + Ok(FileStatsDelta { + net_files, + net_bytes, + net_histogram: Some(histogram), + }) + } +} + +/// Visitor that extracts the `size` column from file metadata and updates a shared histogram. +/// +/// When `is_remove` is false (add files), each visited row increments the histogram bin's count +/// and bytes. When true (remove files), each row decrements them. This builds a single delta +/// histogram directly without needing separate add/remove histograms. +/// +/// Accepts an optional selection vector to filter which rows are visited. AddFiles pass `None` +/// (count every row); RemoveFiles may pass `Some(sv)` from [`FilteredEngineData`] to skip rows +/// that are not actually being removed. +struct FileStatsVisitor<'sv, 'h> { + /// Optional selection vector. When `Some`, only rows marked `true` are counted. Rows beyond + /// the SV length are implicitly selected. 
+ selection_vector: Option<&'sv [bool]>, + /// Offset into the selection vector, tracking position across multiple visit calls. + offset: usize, + /// Whether this visitor is processing remove files (decrements) vs add files (increments). + is_remove: bool, + /// Net file count contribution from this visitor. Negative for remove visitors. + count: i64, + /// Net byte size contribution from this visitor. Negative for remove visitors. + total_size: i64, + /// Shared histogram that all visitors (add and remove) write to. + histogram: &'h mut FileSizeHistogram, +} + +impl<'sv, 'h> FileStatsVisitor<'sv, 'h> { + fn new( + selection_vector: Option<&'sv [bool]>, + is_remove: bool, + histogram: &'h mut FileSizeHistogram, + ) -> Self { + Self { + selection_vector, + offset: 0, + is_remove, + count: 0, + total_size: 0, + histogram, + } + } +} + +impl RowVisitor for FileStatsVisitor<'_, '_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES_AND_TYPES: LazyLock = + LazyLock::new(|| (vec![ColumnName::new(["size"])], vec![DataType::LONG]).into()); + NAMES_AND_TYPES.as_ref() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 1, + Error::InternalError(format!( + "Wrong number of FileStatsVisitor getters: {}", + getters.len() + )) + ); + for i in 0..row_count { + let selected = match self.selection_vector { + Some(sv) => sv.get(self.offset + i).copied().unwrap_or(true), + None => true, + }; + if selected { + let size: i64 = getters[0].get(i, "size")?; + if self.is_remove { + self.count -= 1; + self.total_size -= size; + self.histogram.remove(size)?; + } else { + self.count += 1; + self.total_size += size; + self.histogram.insert(size)?; + } + } + } + self.offset += row_count; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::engine::arrow_data::ArrowEngineData; + use rstest::rstest; + use test_utils::{generate_batch, IntoArray}; + + fn size_batch(sizes: Vec) -> Box { + let batch = generate_batch(vec![("size", sizes.into_array())]).unwrap(); + Box::new(ArrowEngineData::new(batch)) + } + + struct TryComputeCase { + add_batches: Vec>, + remove_batches: Vec>, + expected_net_files: i64, + expected_net_bytes: i64, + } + + #[rstest] + #[case::empty(TryComputeCase { + add_batches: vec![], + remove_batches: vec![], + expected_net_files: 0, + expected_net_bytes: 0, + })] + #[case::adds_only(TryComputeCase { + add_batches: vec![vec![100, 200, 300]], + remove_batches: vec![], + expected_net_files: 3, + expected_net_bytes: 600, // 600 = 100 + 200 + 300 + })] + #[case::multiple_add_batches(TryComputeCase { + add_batches: vec![vec![100, 200], vec![300, 400, 500]], + remove_batches: vec![], + expected_net_files: 5, + expected_net_bytes: 1500, // 1500 = 100 + 200 + 300 + 400 + 500 + })] + #[case::removes_only(TryComputeCase { + add_batches: vec![], + remove_batches: vec![vec![500, 700]], + expected_net_files: -2, + expected_net_bytes: -1200, // -1200 = -(500 + 700) + })] + #[case::adds_and_removes(TryComputeCase { + add_batches: vec![vec![100, 200], vec![300, 400]], + remove_batches: vec![vec![500], vec![600, 700]], + expected_net_files: 1, + expected_net_bytes: -800, // -800 = (100 + 200 + 300 + 400) -(500 + 600 + 700) + })] + fn test_try_compute(#[case] case: TryComputeCase) { + let adds: Vec<_> = case.add_batches.into_iter().map(size_batch).collect(); + let removes: Vec<_> = case + .remove_batches + .into_iter() + .map(|sizes| 
FilteredEngineData::with_all_rows_selected(size_batch(sizes))) + .collect(); + let stats = FileStatsDelta::try_compute_for_txn(&adds, &removes, None).unwrap(); + assert_eq!(stats.net_files, case.expected_net_files); + assert_eq!(stats.net_bytes, case.expected_net_bytes); + } + + #[test] + fn test_with_selection_vectors() { + // Multiple add batches + multiple remove batches with mixed SV scenarios + let adds = vec![size_batch(vec![100, 200]), size_batch(vec![300])]; + let removes = vec![ + // First remove batch: all rows selected (no SV) + FilteredEngineData::with_all_rows_selected(size_batch(vec![400, 500])), + // Second remove batch: partial selection (600 skipped) + FilteredEngineData::try_new(size_batch(vec![600, 700, 800]), vec![false, true, true]) + .unwrap(), + ]; + let stats = FileStatsDelta::try_compute_for_txn(&adds, &removes, None).unwrap(); + // adds: 3 files, 600 bytes (100 + 200 + 300) + // removes: 4 files, 2400 bytes (400 + 500 + 700 + 800) + assert_eq!(stats.net_files, -1); // 3 - 4 + assert_eq!(stats.net_bytes, -1800); // 600 - 2400 + } + + #[test] + fn try_compute_builds_delta_histogram_from_add_and_remove_sizes() { + let adds = vec![size_batch(vec![100, 200, 300])]; + let removes = vec![FilteredEngineData::with_all_rows_selected(size_batch( + vec![500, 700], + ))]; + let stats = FileStatsDelta::try_compute_for_txn(&adds, &removes, None).unwrap(); + + // All sizes < 8KB so they all land in bin 0. Net: 3 adds - 2 removes = 1 file, + // 600 - 1200 = -600 bytes. + let delta = stats.net_histogram.unwrap(); + assert_eq!(delta.file_counts[0], 1); + assert_eq!(delta.total_bytes[0], -600); + } + + #[test] + fn try_compute_empty_batches_produce_zero_histogram() { + let stats = FileStatsDelta::try_compute_for_txn(&[], &[], None).unwrap(); + let delta = stats.net_histogram.unwrap(); + assert!(delta.file_counts.iter().all(|&c| c == 0)); + assert!(delta.total_bytes.iter().all(|&b| b == 0)); + } + + #[test] + fn try_compute_histogram_with_selection_vectors() { + let adds = vec![size_batch(vec![100, 200])]; + let removes = vec![FilteredEngineData::try_new( + size_batch(vec![300, 400, 500]), + vec![true, false, true], // 300 selected, 400 skipped, 500 selected + ) + .unwrap()]; + let stats = FileStatsDelta::try_compute_for_txn(&adds, &removes, None).unwrap(); + + // Net bin 0: 2 adds - 2 removes = 0 files, 300 - 800 = -500 bytes + let delta = stats.net_histogram.unwrap(); + assert_eq!(delta.file_counts[0], 0); + assert_eq!(delta.total_bytes[0], -500); + } + + #[test] + fn try_compute_with_custom_boundaries_uses_them() { + // Custom 3-bin histogram: [0, 200) [200, 1000) [1000, inf) + let boundaries: &[i64] = &[0, 200, 1000]; + let adds = vec![size_batch(vec![50, 300, 1500])]; + let removes = vec![FilteredEngineData::with_all_rows_selected(size_batch( + vec![100, 500], + ))]; + let stats = FileStatsDelta::try_compute_for_txn(&adds, &removes, Some(boundaries)).unwrap(); + + let delta = stats.net_histogram.unwrap(); + assert_eq!(delta.sorted_bin_boundaries, vec![0, 200, 1000]); + // Net per bin: (1-1, 1-1, 1-0) = (0, 0, 1) + assert_eq!(delta.file_counts, vec![0, 0, 1]); + // Net per bin: (50-100, 300-500, 1500-0) = (-50, -200, 1500) + assert_eq!(delta.total_bytes, vec![-50, -200, 1500]); + } + + #[test] + fn try_compute_with_custom_boundaries_produces_mergeable_histogram() { + // Build a base histogram with custom boundaries, then verify delta merges correctly. 
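+ // Bins: [0, 200), [200, 1000), [1000, inf).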
+ let boundaries = vec![0, 200, 1000]; + let mut base = FileSizeHistogram::create_empty_with_boundaries(boundaries.clone()).unwrap(); + base.insert(150).unwrap(); // bin 0 + base.insert(500).unwrap(); // bin 1 + + let adds = vec![size_batch(vec![100, 300])]; + let removes = vec![FilteredEngineData::with_all_rows_selected(size_batch( + vec![150], + ))]; + let stats = + FileStatsDelta::try_compute_for_txn(&adds, &removes, Some(&boundaries)).unwrap(); + + let delta = stats.net_histogram.unwrap(); + let merged = base.try_apply_delta(&delta).unwrap(); + assert_eq!(merged.file_counts, vec![1, 2, 0]); // (1+1-1), (1+1-0), (0+0-0) + assert_eq!(merged.total_bytes, vec![100, 800, 0]); // (150+100-150), (500+300-0) + } +} diff --git a/kernel/src/crc/lazy.rs b/kernel/src/crc/lazy.rs new file mode 100644 index 0000000000..3514b59965 --- /dev/null +++ b/kernel/src/crc/lazy.rs @@ -0,0 +1,306 @@ +//! Lazy CRC loading support. +//! +//! Provides thread-safe lazy loading of CRC files, ensuring they are read at most once and the +//! result is shared across all consumers. + +use std::sync::{Arc, OnceLock}; + +use tracing::warn; + +use super::{try_read_crc_file, Crc}; +use crate::path::ParsedLogPath; +use crate::{Engine, Version}; + +/// Result of attempting to load a CRC file. +/// +/// The "not yet loaded" state is represented by `OnceLock::get()` returning `None`, not as an enum +/// variant. +#[derive(Debug, Clone)] +pub(crate) enum CrcLoadResult { + /// No CRC file exists for this log segment. + DoesNotExist, + /// CRC file exists but failed to read/parse (corrupted or I/O error). + CorruptOrFailed, + /// CRC file was successfully loaded. + Loaded(Arc), +} + +impl CrcLoadResult { + /// Returns the CRC if successfully loaded. + #[allow(dead_code)] // Used in future phases (domain metadata, ICT) + pub(crate) fn get(&self) -> Option<&Arc> { + match self { + CrcLoadResult::Loaded(crc) => Some(crc), + _ => None, + } + } +} + +/// Lazy loader for CRC info that ensures it's only read once. +/// +/// Uses `OnceLock` to ensure thread-safe initialization that happens at most once. +/// Can also hold a precomputed CRC (e.g. from post-commit CRC merge) without a backing file. +#[derive(Debug)] +pub(crate) struct LazyCrc { + /// The CRC file path, if one exists in the log segment. + crc_file: Option, + /// Cached load result (loaded lazily, at most once). + pub(crate) cached: OnceLock, + /// Version of a precomputed CRC (set when CRC was computed rather than read from file). + /// When set, this takes priority over `crc_file` for version checks. + precomputed_version: Option, +} + +impl LazyCrc { + /// Create a new lazy CRC loader. + /// + /// If `crc_file` is `None`, the loader will immediately return `DoesNotExist` when accessed. + pub(crate) fn new(crc_file: Option) -> Self { + Self { + crc_file, + cached: OnceLock::new(), + precomputed_version: None, + } + } + + /// Create a `LazyCrc` with a precomputed CRC value (no backing file). + /// + /// The CRC is immediately available via `get_or_load` without any I/O. The `version` + /// parameter records which table version this CRC corresponds to, enabling + /// `get_if_loaded_at_version` to work for chained commits. 
+ pub(crate) fn new_precomputed(crc: Crc, version: Version) -> Self { + let cached = OnceLock::new(); + // OnceLock::set cannot fail here because we just created it + let _ = cached.set(CrcLoadResult::Loaded(Arc::new(crc))); + Self { + crc_file: None, + cached, + precomputed_version: Some(version), + } + } + + /// Returns the CRC load result, loading if necessary. + /// + /// The loading closure is only called once, even across threads. Subsequent calls return the + /// cached result. + pub(crate) fn get_or_load(&self, engine: &dyn Engine) -> &CrcLoadResult { + self.cached.get_or_init(|| match &self.crc_file { + None => CrcLoadResult::DoesNotExist, + Some(crc_path) => match try_read_crc_file(engine, crc_path) { + Ok(crc) => CrcLoadResult::Loaded(Arc::new(crc)), + Err(e) => { + warn!( + "Failed to read CRC file {:?}: {}.", + crc_path.location.location, e + ); + CrcLoadResult::CorruptOrFailed + } + }, + }) + } + + /// Returns the CRC only if the CRC file is at the given version, loading if necessary. + pub(crate) fn get_or_load_if_at_version( + &self, + engine: &dyn Engine, + version: Version, + ) -> Option<&Arc> { + if self.crc_version() != Some(version) { + return None; + } + self.get_or_load(engine).get() + } + + /// Returns the CRC only if it is already loaded (no I/O) and matches the given version. + /// + /// This is purely opportunistic: it returns `Some` only when the CRC was previously loaded + /// (via `get_or_load`) or precomputed (via `new_precomputed`) AND the version matches. + pub(crate) fn get_if_loaded_at_version(&self, version: Version) -> Option<&Arc> { + if self.crc_version() != Some(version) { + return None; + } + self.cached.get()?.get() + } + + /// Check if CRC has been loaded (without triggering loading). + #[allow(dead_code)] // Used in future phases (domain metadata, ICT) + pub(crate) fn is_loaded(&self) -> bool { + self.cached.get().is_some() + } + + /// Returns the CRC version, checking precomputed version first, then CRC file version. + /// + /// This enables chaining: a post-commit snapshot with a precomputed CRC at version N+1 + /// can serve as the read snapshot for a transaction targeting version N+2. 
+ pub(crate) fn crc_version(&self) -> Option { + self.precomputed_version + .or_else(|| self.crc_file.as_ref().map(|f| f.version)) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use rstest::rstest; + + use super::*; + use crate::engine::default::executor::tokio::TokioBackgroundExecutor; + use crate::engine::default::{DefaultEngine, DefaultEngineBuilder}; + use crate::object_store::memory::InMemory; + + fn table_root() -> url::Url { + url::Url::parse("memory:///").unwrap() + } + + fn test_engine() -> DefaultEngine { + DefaultEngineBuilder::new(Arc::new(InMemory::new())).build() + } + + // ===== CrcLoadResult Tests ===== + + #[test] + fn test_crc_load_result_loaded() { + let crc = Crc { + table_size_bytes: 100, + num_files: 10, + num_metadata: 1, + num_protocol: 1, + ..Default::default() + }; + let loaded = CrcLoadResult::Loaded(Arc::new(crc)); + assert!(loaded.get().is_some()); + assert_eq!(loaded.get().unwrap().table_size_bytes, 100); + } + + #[rstest] + #[case::does_not_exist(CrcLoadResult::DoesNotExist)] + #[case::corrupt(CrcLoadResult::CorruptOrFailed)] + fn test_crc_load_result(#[case] result: CrcLoadResult) { + assert!(result.get().is_none()); + } + + // ===== LazyCrc Tests ===== + + #[test] + fn test_lazy_crc_no_file() { + let engine = test_engine(); + + let lazy = LazyCrc::new(None); + assert!(!lazy.is_loaded()); + assert_eq!(lazy.crc_version(), None); + + let result = lazy.get_or_load(&engine); + assert!(matches!(result, CrcLoadResult::DoesNotExist)); + assert!(result.get().is_none()); + assert!(lazy.is_loaded()); + } + + #[test] + fn test_lazy_crc_missing_file() { + let engine = test_engine(); + + let lazy = LazyCrc::new(Some(ParsedLogPath::create_parsed_crc(&table_root(), 5))); + assert!(!lazy.is_loaded()); + assert_eq!(lazy.crc_version(), Some(5)); + + let result = lazy.get_or_load(&engine); + assert!(matches!(result, CrcLoadResult::CorruptOrFailed)); + assert!(result.get().is_none()); + assert!(lazy.is_loaded()); + } + + fn test_table_root(dir: &str) -> url::Url { + let path = std::fs::canonicalize(PathBuf::from(dir)).unwrap(); + url::Url::from_directory_path(path).unwrap() + } + + #[test] + fn test_lazy_crc_loads_real_file() { + let engine = crate::engine::sync::SyncEngine::new(); + let table_root = test_table_root("./tests/data/crc-full/"); + + let lazy = LazyCrc::new(Some(ParsedLogPath::create_parsed_crc(&table_root, 0))); + assert!(!lazy.is_loaded()); + assert_eq!(lazy.crc_version(), Some(0)); + + let result = lazy.get_or_load(&engine); + assert!(lazy.is_loaded()); + + let crc = result.get().unwrap(); + assert_eq!(crc.table_size_bytes, 5259); + } + + #[test] + fn test_lazy_crc_malformed_file() { + let engine = crate::engine::sync::SyncEngine::new(); + let table_root = test_table_root("./tests/data/crc-malformed/"); + + let lazy = LazyCrc::new(Some(ParsedLogPath::create_parsed_crc(&table_root, 0))); + assert!(!lazy.is_loaded()); + assert_eq!(lazy.crc_version(), Some(0)); + + let result = lazy.get_or_load(&engine); + assert!(matches!(result, CrcLoadResult::CorruptOrFailed)); + assert!(result.get().is_none()); + assert!(lazy.is_loaded()); + } + + // ===== Precomputed LazyCrc Tests ===== + + fn test_crc(table_size_bytes: i64) -> Crc { + Crc { + table_size_bytes, + num_files: 1, + num_metadata: 1, + num_protocol: 1, + ..Default::default() + } + } + + #[test] + fn test_lazy_crc_precomputed() { + let crc = test_crc(42); + let lazy = LazyCrc::new_precomputed(crc, 5); + + assert!(lazy.is_loaded()); + assert_eq!(lazy.crc_version(), Some(5)); + + // 
get_if_loaded_at_version should return the CRC at the correct version + let loaded = lazy.get_if_loaded_at_version(5); + assert!(loaded.is_some()); + assert_eq!(loaded.unwrap().table_size_bytes, 42); + + // Wrong version should return None + assert!(lazy.get_if_loaded_at_version(4).is_none()); + assert!(lazy.get_if_loaded_at_version(6).is_none()); + } + + #[test] + fn test_lazy_crc_precomputed_version_takes_priority() { + let crc = test_crc(100); + let lazy = LazyCrc::new_precomputed(crc, 3); + assert_eq!(lazy.crc_version(), Some(3)); + } + + #[test] + fn test_get_if_loaded_at_version_not_loaded() { + // CRC file exists but not yet loaded -> should return None (no I/O) + let lazy = LazyCrc::new(Some(ParsedLogPath::create_parsed_crc(&table_root(), 5))); + assert!(!lazy.is_loaded()); + assert!(lazy.get_if_loaded_at_version(5).is_none()); + } + + #[test] + fn test_get_if_loaded_at_version_wrong_version() { + let crc = test_crc(100); + let lazy = LazyCrc::new_precomputed(crc, 5); + assert!(lazy.get_if_loaded_at_version(3).is_none()); + } + + #[test] + fn test_get_if_loaded_at_version_no_crc() { + let lazy = LazyCrc::new(None); + assert!(lazy.get_if_loaded_at_version(0).is_none()); + } +} diff --git a/kernel/src/crc/mod.rs b/kernel/src/crc/mod.rs new file mode 100644 index 0000000000..4f379d90dd --- /dev/null +++ b/kernel/src/crc/mod.rs @@ -0,0 +1,567 @@ +//! CRC (version checksum) file support. +//! +//! A [CRC file] contains a snapshot of table state at a specific version, which can be used to +//! optimize log replay operations like reading Protocol/Metadata, domain metadata, and ICT. +//! +//! [CRC file]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#version-checksum-file + +// Allow unreachable_pub because this module is pub when test-utils is enabled +// but pub(crate) otherwise. The items need to be pub for integration tests. +#![allow(unreachable_pub)] + +mod delta; +mod file_size_histogram; +mod file_stats; +mod lazy; +mod reader; +mod writer; + +#[allow(unused)] +pub(crate) use delta::CrcDelta; +pub(crate) use file_size_histogram::FileSizeHistogram; +pub(crate) use file_stats::FileStats; +#[allow(unused)] +pub(crate) use file_stats::FileStatsDelta; +pub(crate) use lazy::{CrcLoadResult, LazyCrc}; +pub(crate) use reader::try_read_crc_file; +#[allow(unused)] +pub(crate) use writer::try_write_crc_file; + +use std::collections::HashMap; + +use serde::de::Deserializer; +use serde::ser::Serializer; +use serde::{Deserialize, Serialize}; + +use crate::actions::{Add, DomainMetadata, Metadata, Protocol, SetTransaction}; + +/// Tracks whether file stats (`num_files`, `table_size_bytes`) are trustworthy. +/// +/// Defaults to [`Valid`](Self::Valid), which is the correct state when deserializing a CRC file +/// from disk (a CRC file's stats are correct by definition). +#[allow(dead_code)] // Variants used in follow-up PRs (forward replay, transaction delta). +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub enum FileStatsValidity { + /// File stats are known-correct absolute totals. This is the case when seeded from a CRC + /// file (which contains `num_files` and `table_size_bytes`) or when replay starts from + /// version zero (where the initial state is trivially zero). Safe to write to disk. + #[default] + Valid, + /// File stats are relative deltas, not absolute totals. This happens when seeding from a + /// checkpoint: we extract metadata fields but not file counts (reading all add actions from + /// a checkpoint just for counts is too expensive). 
The accumulated deltas are correct, but + /// without a baseline they cannot produce final totals. Not safe to write to disk. + RequiresCheckpointRead, + /// A non-incremental operation was seen: file stats cannot be determined incrementally. + /// For example, ANALYZE STATS re-adds existing files with updated statistics but no + /// corresponding removes, so naively counting adds would double-count. + /// A full log replay from scratch could recover correct file stats. Not safe to write to disk. + Indeterminate, + /// A file action had a missing size field: correct file stats are impossible to compute. + /// For example, the Delta protocol allows `remove.size` to be null -- when encountered, + /// we can no longer track byte totals. Unlike [`Indeterminate`](Self::Indeterminate), no + /// amount of replay can recover the missing data. Not safe to write to disk. + Untrackable, +} + +/// Parsed content of a CRC (version checksum) file. +/// +/// A `Crc` is either (a) loaded from disk (deserialized from a `.crc` JSON file) or (b) computed +/// in memory (built incrementally via `Crc::apply`). +/// +/// A CRC file must: +/// 1. Be named `{version}.crc` with version zero-padded to 20 digits: `00000000000000000001.crc` +/// 2. Be stored directly in the _delta_log directory alongside Delta log files +/// 3. Contain exactly one JSON object with the schema of this struct. +/// +/// This struct and its fields are marked `pub`, but the `crc` module is only re-exported as `pub` +/// when the `test-utils` feature is enabled (otherwise `pub(crate)`). See `kernel/src/lib.rs`. +// Deserialized directly from JSON via serde. See `reader::try_read_crc_file`. +#[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct Crc { + // ===== Required fields ===== + /// Total size of the table in bytes, calculated as the sum of the `size` field of all live + /// [`Add`] actions. Private -- use [`Crc::file_stats()`] to access safely. + table_size_bytes: i64, + /// Number of live [`Add`] actions in this table version after action reconciliation. + /// Private -- use [`Crc::file_stats()`] to access safely. + num_files: i64, + /// Number of [`Metadata`] actions. Must be 1. + pub num_metadata: i64, + /// Number of [`Protocol`] actions. Must be 1. + pub num_protocol: i64, + /// The table [`Metadata`] at this version. + pub metadata: Metadata, + /// The table [`Protocol`] at this version. + pub protocol: Protocol, + /// Whether the file stats (`num_files`, `table_size_bytes`) in this CRC are trustworthy. + /// Not serialized -- this is an in-memory replay concern only. When deserialized from a CRC + /// file on disk, defaults to [`FileStatsValidity::Valid`] (a CRC file's stats are correct + /// by definition). A CRC is only safe to write to disk when validity is `Valid`. + #[serde(skip)] + pub file_stats_validity: FileStatsValidity, + + // ===== Optional fields ===== + /// A unique identifier for the transaction that produced this commit. + #[serde(skip)] + pub txn_id: Option, + /// The in-commit timestamp of this version. Present iff In-Commit Timestamps are enabled. + pub in_commit_timestamp_opt: Option, + /// Live transaction identifier ([`SetTransaction`]) actions at this version. `None` = not + /// tracked (field absent in CRC JSON or not computed). `Some(empty_map)` = tracked, no + /// active set transactions. `apply()` skips updates when `None`. + /// + /// Stored as a HashMap keyed by `app_id` for efficient lookup. 
The CRC JSON format uses + /// a Vec, which is converted via custom serde deserialization. + #[serde( + default, + deserialize_with = "de_opt_vec_to_opt_map", + serialize_with = "ser_opt_map_to_opt_vec" + )] + pub set_transactions: Option>, + /// Active (non-removed) [`DomainMetadata`] actions at this version. Tombstones + /// (`removed=true`) are never stored. `None` = not tracked (field absent in CRC JSON or not + /// computed). `Some(empty_map)` = tracked, no active domain metadata. `apply()` skips + /// updates when `None`. + /// + /// Stored as a HashMap keyed by domain name for efficient lookup. The CRC JSON format uses + /// a Vec, which is converted via custom serde deserialization. + #[serde( + default, + deserialize_with = "de_opt_vec_to_opt_map", + serialize_with = "ser_opt_map_to_opt_vec" + )] + pub domain_metadata: Option>, + /// Size distribution information of files remaining after action reconciliation. + #[serde( + default, + deserialize_with = "de_validated_file_size_histogram", + skip_serializing_if = "Option::is_none" + )] + pub file_size_histogram: Option, + /// All live [`Add`] file actions at this version. + #[serde(skip)] + pub all_files: Option>, + /// Number of records deleted through Deletion Vectors in this table version. + #[serde(skip)] + pub num_deleted_records_opt: Option, + /// Number of Deletion Vectors active in this table version. + #[serde(skip)] + pub num_deletion_vectors_opt: Option, + /// Distribution of deleted record counts across files. See this section for more details. + #[serde(skip)] + pub deleted_record_counts_histogram_opt: Option, +} + +impl Crc { + /// Returns file-level statistics only if they are known to be valid. + /// + /// Returns `None` when file stats cannot be trusted -- for example, when the CRC was + /// built from incremental replay that encountered a non-incremental operation or a + /// missing file size. + pub fn file_stats(&self) -> Option { + match self.file_stats_validity { + FileStatsValidity::Valid => Some(FileStats { + num_files: self.num_files, + table_size_bytes: self.table_size_bytes, + file_size_histogram: self.file_size_histogram.clone(), + }), + _ => None, + } + } +} + +/// Trait for types that can be stored in a HashMap keyed by a string identifier. +/// Used by CRC serde helpers to convert between Vec (JSON format) and HashMap (in-memory). +trait MapKey { + fn map_key(&self) -> &str; +} + +impl MapKey for DomainMetadata { + fn map_key(&self) -> &str { + self.domain() + } +} + +impl MapKey for SetTransaction { + fn map_key(&self) -> &str { + &self.app_id + } +} + +/// Deserialize an `Option>` from JSON into `Option>`, using +/// [`MapKey::map_key`] to derive the HashMap key for each element. +fn de_opt_vec_to_opt_map<'de, D, T>(deserializer: D) -> Result>, D::Error> +where + D: Deserializer<'de>, + T: Deserialize<'de> + MapKey, +{ + let opt_vec: Option> = Option::deserialize(deserializer)?; + Ok(opt_vec.map(|vec| { + vec.into_iter() + .map(|item| (item.map_key().to_string(), item)) + .collect() + })) +} + +/// Deserializes an `Option` from a CRC JSON file with validation. +/// +/// After serde deserializes the raw JSON fields, this validates the histogram invariants +/// (sorted boundaries, matching array lengths, etc.) via [`FileSizeHistogram::try_new`], +/// ensuring malformed CRC files are rejected rather than causing panics later. 
+fn de_validated_file_size_histogram<'de, D>( + deserializer: D, +) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let opt: Option = Option::deserialize(deserializer)?; + match opt { + Some(hist) => FileSizeHistogram::try_new( + hist.sorted_bin_boundaries, + hist.file_counts, + hist.total_bytes, + ) + .map(Some) + .map_err(serde::de::Error::custom), + None => Ok(None), + } +} + +/// Serialize `Option>` back to `Option>` so the CRC JSON format +/// uses an array (matching the Delta protocol spec). +fn ser_opt_map_to_opt_vec( + map: &Option>, + serializer: S, +) -> Result +where + S: Serializer, + T: Serialize, +{ + match map { + None => serializer.serialize_none(), + Some(m) => m.values().collect::>().serialize(serializer), + } +} + +/// The [DeletedRecordCountsHistogram] object represents a histogram tracking the distribution of +/// deleted record counts across files in the table. Each bin in the histogram represents a range +/// of deletion counts and stores the number of files having that many deleted records. +/// +/// The histogram bins correspond to the following ranges: +/// Bin 0: [0, 0] (files with no deletions) +/// Bin 1: [1, 9] (files with 1-9 deleted records) +/// Bin 2: [10, 99] (files with 10-99 deleted records) +/// Bin 3: [100, 999] (files with 100-999 deleted records) +/// Bin 4: [1000, 9999] (files with 1,000-9,999 deleted records) +/// Bin 5: [10000, 99999] (files with 10,000-99,999 deleted records) +/// Bin 6: [100000, 999999] (files with 100,000-999,999 deleted records) +/// Bin 7: [1000000, 9999999] (files with 1,000,000-9,999,999 deleted records) +/// Bin 8: [10000000, 2147483646] (files with 10,000,000 to 2,147,483,646 deleted records) +/// Bin 9: [2147483647, inf) (files with 2,147,483,647 or more deleted records) +/// +/// [DeletedRecordCountsHistogram]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#deleted-record-counts-histogram-schema +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeletedRecordCountsHistogram { + /// Array of size 10 where each element represents the count of files falling into a specific + /// deletion count range. + pub(crate) deleted_record_counts: Vec, +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::Crc; + use crate::actions::{DomainMetadata, SetTransaction}; + + /// Helper to create a minimal `Crc` with only set_transactions and domain_metadata populated. 
+ fn crc_with( + txns: Option>, + domains: Option>, + ) -> Crc { + Crc { + set_transactions: txns, + domain_metadata: domains, + ..Default::default() + } + } + + #[test] + fn de_vec_to_map_produces_correct_keys_and_values() { + let json = r#"{ + "tableSizeBytes": 0, + "numFiles": 0, + "numMetadata": 1, + "numProtocol": 1, + "metadata": { + "id": "test", + "format": {"provider": "parquet", "options": {}}, + "schemaString": "{\"type\":\"struct\",\"fields\":[]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 0 + }, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 1}, + "setTransactions": [ + {"appId": "app-1", "version": 3, "lastUpdated": 1000}, + {"appId": "app-2", "version": 7} + ], + "domainMetadata": [ + {"domain": "delta.rowTracking", "configuration": "{\"rowIdHighWaterMark\":1}", "removed": false}, + {"domain": "delta.clustering", "configuration": "{}", "removed": false} + ] + }"#; + + let crc: Crc = serde_json::from_str(json).unwrap(); + + let txns = crc.set_transactions.as_ref().unwrap(); + assert_eq!(txns.len(), 2); + + let txn1 = &txns["app-1"]; + assert_eq!(txn1.app_id, "app-1"); + assert_eq!(txn1.version, 3); + assert_eq!(txn1.last_updated, Some(1000)); + + let txn2 = &txns["app-2"]; + assert_eq!(txn2.app_id, "app-2"); + assert_eq!(txn2.version, 7); + assert_eq!(txn2.last_updated, None); + + let domains = crc.domain_metadata.as_ref().unwrap(); + assert_eq!(domains.len(), 2); + assert!(domains.contains_key("delta.rowTracking")); + assert!(domains.contains_key("delta.clustering")); + } + + #[test] + fn de_null_deserializes_to_none() { + let json = r#"{ + "tableSizeBytes": 0, + "numFiles": 0, + "numMetadata": 1, + "numProtocol": 1, + "metadata": { + "id": "test", + "format": {"provider": "parquet", "options": {}}, + "schemaString": "{\"type\":\"struct\",\"fields\":[]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 0 + }, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 1}, + "setTransactions": null, + "domainMetadata": null + }"#; + let crc: Crc = serde_json::from_str(json).unwrap(); + assert!(crc.set_transactions.is_none()); + assert!(crc.domain_metadata.is_none()); + } + + #[test] + fn de_missing_field_deserializes_to_none() { + let json = r#"{ + "tableSizeBytes": 0, + "numFiles": 0, + "numMetadata": 1, + "numProtocol": 1, + "metadata": { + "id": "test", + "format": {"provider": "parquet", "options": {}}, + "schemaString": "{\"type\":\"struct\",\"fields\":[]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 0 + }, + "protocol": {"minReaderVersion": 1, "minWriterVersion": 1} + }"#; + let crc: Crc = serde_json::from_str(json).unwrap(); + assert!(crc.set_transactions.is_none()); + assert!(crc.domain_metadata.is_none()); + } + + #[test] + fn ser_none_serializes_to_null() { + let crc = crc_with(None, None); + let json = serde_json::to_value(&crc).unwrap(); + assert!(json["setTransactions"].is_null()); + assert!(json["domainMetadata"].is_null()); + } + + #[test] + fn ser_map_round_trips_through_vec() { + let mut txns = HashMap::new(); + txns.insert( + "app-1".to_string(), + SetTransaction::new("app-1".to_string(), 5, Some(2000)), + ); + txns.insert( + "app-2".to_string(), + SetTransaction::new("app-2".to_string(), 10, None), + ); + + let mut domains = HashMap::new(); + domains.insert( + "delta.rowTracking".to_string(), + DomainMetadata::new("delta.rowTracking".to_string(), "{}".to_string()), + ); + + let original = crc_with(Some(txns), Some(domains)); + + let json_str = 
serde_json::to_string(&original).unwrap(); + let deserialized: Crc = serde_json::from_str(&json_str).unwrap(); + + assert_eq!(original, deserialized); + } + + #[test] + fn round_trip_empty_maps() { + let original = crc_with(Some(HashMap::new()), Some(HashMap::new())); + + let json_str = serde_json::to_string(&original).unwrap(); + let deserialized: Crc = serde_json::from_str(&json_str).unwrap(); + + assert_eq!(original, deserialized); + + // Verify the JSON has empty arrays (not null) + let json_value = serde_json::to_value(&original).unwrap(); + assert_eq!(json_value["setTransactions"], serde_json::json!([])); + assert_eq!(json_value["domainMetadata"], serde_json::json!([])); + } + + #[test] + fn test_crc_with_multiple_domain_metadatas_and_set_transactions() { + let mut txns = HashMap::new(); + txns.insert( + "streaming-app".to_string(), + SetTransaction::new("streaming-app".to_string(), 42, Some(1700000000)), + ); + txns.insert( + "batch-job".to_string(), + SetTransaction::new("batch-job".to_string(), 100, None), + ); + txns.insert( + "etl-pipeline".to_string(), + SetTransaction::new("etl-pipeline".to_string(), 7, Some(1700001000)), + ); + + let mut domains = HashMap::new(); + domains.insert( + "delta.rowTracking".to_string(), + DomainMetadata::new( + "delta.rowTracking".to_string(), + r#"{"rowIdHighWaterMark":500}"#.to_string(), + ), + ); + domains.insert( + "delta.clustering".to_string(), + DomainMetadata::new("delta.clustering".to_string(), "{}".to_string()), + ); + domains.insert( + "custom.app".to_string(), + DomainMetadata::new("custom.app".to_string(), r#"{"version":"2.0"}"#.to_string()), + ); + + let crc = Crc { + table_size_bytes: 1024 * 1024, + num_files: 10, + num_metadata: 1, + num_protocol: 1, + set_transactions: Some(txns), + domain_metadata: Some(domains), + ..Default::default() + }; + + // Round-trip through JSON + let json_str = serde_json::to_string(&crc).unwrap(); + let deserialized: Crc = serde_json::from_str(&json_str).unwrap(); + + // Verify scalar fields survive the round-trip + assert_eq!(deserialized.table_size_bytes, 1024 * 1024); + assert_eq!(deserialized.num_files, 10); + + // Verify all set transactions + let txns = deserialized.set_transactions.as_ref().unwrap(); + assert_eq!(txns.len(), 3); + assert_eq!(txns["streaming-app"].version, 42); + assert_eq!(txns["streaming-app"].last_updated, Some(1700000000)); + assert_eq!(txns["batch-job"].version, 100); + assert_eq!(txns["batch-job"].last_updated, None); + assert_eq!(txns["etl-pipeline"].version, 7); + + // Verify all domain metadatas + let domains = deserialized.domain_metadata.as_ref().unwrap(); + assert_eq!(domains.len(), 3); + assert!(domains.contains_key("delta.rowTracking")); + assert!(domains.contains_key("delta.clustering")); + assert!(domains.contains_key("custom.app")); + assert_eq!( + domains["custom.app"].configuration(), + r#"{"version":"2.0"}"# + ); + + // Verify the original and deserialized are equal + assert_eq!(crc, deserialized); + } + + // ===== File size histogram validation ===== + + /// Minimal CRC JSON with a file size histogram field spliced in. 
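+    /// The `histogram_json` argument is spliced in verbatim, so callers can pass either a full
+    /// histogram object or the literal string `"null"` (both cases are exercised below).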
+ fn crc_json_with_histogram(histogram_json: &str) -> String { + format!( + r#"{{ + "tableSizeBytes": 0, + "numFiles": 0, + "numMetadata": 1, + "numProtocol": 1, + "metadata": {{ + "id": "test", + "format": {{"provider": "parquet", "options": {{}}}}, + "schemaString": "{{\"type\":\"struct\",\"fields\":[]}}", + "partitionColumns": [], + "configuration": {{}}, + "createdTime": 0 + }}, + "protocol": {{"minReaderVersion": 1, "minWriterVersion": 1}}, + "fileSizeHistogram": {histogram_json} + }}"# + ) + } + + #[test] + fn de_valid_file_size_histogram_succeeds() { + let json = crc_json_with_histogram( + r#"{"sortedBinBoundaries": [0, 100, 200], "fileCounts": [1, 2, 3], "totalBytes": [10, 200, 300]}"#, + ); + let crc: Crc = serde_json::from_str(&json).unwrap(); + assert!(crc.file_size_histogram.is_some()); + } + + #[test] + fn de_null_file_size_histogram_deserializes_to_none() { + let json = crc_json_with_histogram("null"); + let crc: Crc = serde_json::from_str(&json).unwrap(); + assert!(crc.file_size_histogram.is_none()); + } + + use rstest::rstest; + + #[rstest] + #[case::unsorted_boundaries( + r#"{"sortedBinBoundaries": [0, 200, 100], "fileCounts": [0, 0, 0], "totalBytes": [0, 0, 0]}"# + )] + #[case::nonzero_first_boundary( + r#"{"sortedBinBoundaries": [1, 100], "fileCounts": [0, 0], "totalBytes": [0, 0]}"# + )] + #[case::mismatched_lengths( + r#"{"sortedBinBoundaries": [0, 100], "fileCounts": [0], "totalBytes": [0, 0]}"# + )] + #[case::single_boundary( + r#"{"sortedBinBoundaries": [0], "fileCounts": [0], "totalBytes": [0]}"# + )] + fn de_malformed_file_size_histogram_returns_error(#[case] histogram_json: &str) { + let json = crc_json_with_histogram(histogram_json); + assert!(serde_json::from_str::(&json).is_err()); + } +} diff --git a/kernel/src/crc/reader.rs b/kernel/src/crc/reader.rs new file mode 100644 index 0000000000..a7c4849789 --- /dev/null +++ b/kernel/src/crc/reader.rs @@ -0,0 +1,151 @@ +//! CRC file reading functionality. + +use super::Crc; +use crate::path::{AsUrl as _, ParsedLogPath}; +use crate::{DeltaResult, Engine, Error}; + +/// Attempt to read and parse a CRC file. +/// +/// Reads raw bytes via the storage handler and deserializes with serde_json. +/// +/// Returns `Ok(Crc)` on success, `Err` on any failure (file not readable, corrupt JSON, missing +/// required fields). The caller should handle errors gracefully by falling back to log replay. +pub(crate) fn try_read_crc_file(engine: &dyn Engine, crc_path: &ParsedLogPath) -> DeltaResult { + let storage = engine.storage_handler(); + let url = crc_path.location.as_url().clone(); + let data = storage + .read_files(vec![(url, None)])? 
+ .next() + .ok_or_else(|| Error::generic("CRC file read returned no data"))??; + let crc: Crc = serde_json::from_slice(&data)?; + Ok(crc) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::path::PathBuf; + + use super::*; + use crate::actions::{Format, Metadata, Protocol}; + use crate::engine::sync::SyncEngine; + use crate::path::ParsedLogPath; + use crate::table_features::TableFeature; + use test_utils::assert_result_error_with_message; + + fn test_table_root(dir: &str) -> url::Url { + let path = std::fs::canonicalize(PathBuf::from(dir)).unwrap(); + url::Url::from_directory_path(path).unwrap() + } + + #[test] + fn test_read_crc_file() { + let engine = SyncEngine::new(); + let table_root = test_table_root("./tests/data/crc-full/"); + let crc_path = ParsedLogPath::create_parsed_crc(&table_root, 0); + + // Read and parse the CRC file + let crc = try_read_crc_file(&engine, &crc_path).unwrap(); + + // Verify basic fields + assert_eq!(crc.table_size_bytes, 5259); + assert_eq!(crc.num_files, 10); + assert_eq!(crc.num_metadata, 1); + assert_eq!(crc.num_protocol, 1); + assert_eq!(crc.in_commit_timestamp_opt, Some(1694758257000)); + + // Verify protocol + let expected_protocol = Protocol::new_unchecked( + 3, + 7, + Some(vec![TableFeature::DeletionVectors]), + Some(vec![ + TableFeature::DomainMetadata, + TableFeature::ClusteredTable, + TableFeature::DeletionVectors, + TableFeature::RowTracking, + ]), + ); + assert_eq!(crc.protocol, expected_protocol); + + // Verify metadata + let expected_metadata = Metadata::new_unchecked( + "6ca3020b-3cd9-4048-82e3-1417a0abb98f", + None, + None, + Format::default(), + r#"{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}}]}"#, + vec![], + Some(1694758256009), + HashMap::from([ + ( + "delta.enableDeletionVectors".to_string(), + "true".to_string(), + ), + ( + "delta.checkpoint.writeStatsAsStruct".to_string(), + "true".to_string(), + ), + ("delta.enableRowTracking".to_string(), "true".to_string()), + ( + "delta.checkpoint.writeStatsAsJson".to_string(), + "false".to_string(), + ), + ( + "delta.rowTracking.materializedRowCommitVersionColumnName".to_string(), + "_row-commit-version-col-2f60dcc1-9e36-4424-95e7-799b707e4ddb".to_string(), + ), + ( + "delta.rowTracking.materializedRowIdColumnName".to_string(), + "_row-id-col-4cbc7924-f662-4db1-aa59-22c23f59eb5d".to_string(), + ), + ]), + ); + assert_eq!(crc.metadata, expected_metadata); + + // Verify domain metadatas + let dms = crc.domain_metadata.unwrap(); + assert_eq!(dms.len(), 3); + + assert!(dms["delta.clustering"] + .configuration() + .contains("clusteringColumns")); + assert!(dms["delta.rowTracking"] + .configuration() + .contains("rowIdHighWaterMark")); + assert!(dms["myApp.metadata"].configuration().contains("key")); + + // Verify set transactions + let txns = crc.set_transactions.unwrap(); + assert_eq!(txns.len(), 2); + assert_eq!(txns["spark-app-1"].version, 42); + assert_eq!(txns["spark-app-1"].last_updated, Some(1694758250000)); + assert_eq!(txns["streaming-job-abc"].version, 100); + assert_eq!(txns["streaming-job-abc"].last_updated, Some(1694758255000)); + + // Verify file size histogram was deserialized (all 10 files in bin 0, < 8KB) + let hist = crc.file_size_histogram.as_ref().unwrap(); + assert_eq!(hist.sorted_bin_boundaries.len(), 95); + assert_eq!(hist.file_counts[0], 10); + assert_eq!(hist.total_bytes[0], 5259); + // All other bins should be zero + assert!(hist.file_counts[1..].iter().all(|&c| c == 0)); + 
assert!(hist.total_bytes[1..].iter().all(|&b| b == 0)); + + // Remaining skipped fields are still None (pending serde support on their types) + assert!(crc.txn_id.is_none()); + assert!(crc.all_files.is_none()); + assert!(crc.num_deleted_records_opt.is_none()); + assert!(crc.num_deletion_vectors_opt.is_none()); + assert!(crc.deleted_record_counts_histogram_opt.is_none()); + } + + #[test] + fn test_read_malformed_crc_file_fails() { + let engine = SyncEngine::new(); + let table_root = test_table_root("./tests/data/crc-malformed/"); + let crc_path = ParsedLogPath::create_parsed_crc(&table_root, 0); + + assert_result_error_with_message(try_read_crc_file(&engine, &crc_path), "expected value"); + } +} diff --git a/kernel/src/crc/writer.rs b/kernel/src/crc/writer.rs new file mode 100644 index 0000000000..04416afaa6 --- /dev/null +++ b/kernel/src/crc/writer.rs @@ -0,0 +1,225 @@ +//! CRC file writing functionality. + +use url::Url; + +use super::{Crc, FileStatsValidity}; +use crate::utils::require; +use crate::{DeltaResult, Engine, Error}; + +/// Serialize and write a CRC file to storage. +/// +/// Serializes the [`Crc`] struct to JSON via serde and writes the raw bytes using the storage +/// handler. Returns [`Error::ChecksumWriteUnsupported`] if file stats are not valid (a CRC file +/// on disk must have correct stats). Per the Delta protocol, writers MUST NOT overwrite existing +/// CRC files, so this always writes with `overwrite = false`. If the file already exists, returns +/// `Err(Error::FileAlreadyExists)`. +pub(crate) fn try_write_crc_file(engine: &dyn Engine, path: &Url, crc: &Crc) -> DeltaResult<()> { + require!( + crc.file_stats_validity == FileStatsValidity::Valid, + Error::ChecksumWriteUnsupported(format!( + "Cannot write CRC file with {:?} file stats", + crc.file_stats_validity + )) + ); + let data = serde_json::to_vec(crc)?; + engine + .storage_handler() + .put(path, data.into(), false /* overwrite */) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use std::collections::HashMap; + + use super::*; + use crate::actions::{DomainMetadata, Protocol, SetTransaction}; + use crate::crc::reader::try_read_crc_file; + use crate::crc::{FileSizeHistogram, FileStatsValidity}; + use crate::engine::default::DefaultEngineBuilder; + use crate::object_store::memory::InMemory; + use crate::path::{AsUrl, ParsedLogPath}; + use crate::table_features::TableFeature; + + fn test_crc() -> Crc { + let protocol = Protocol::try_new_modern( + [TableFeature::ColumnMapping], + [ + TableFeature::ColumnMapping, + TableFeature::RowTracking, + TableFeature::DomainMetadata, + TableFeature::InCommitTimestamp, + ], + ) + .unwrap(); + // NOTE: Adding more entries here will break test_crc_serialized_json_content because + // domain_metadata is backed by an unsorted HashMap -- the serialized array order is + // non-deterministic. If you need multiple entries, either make the test order-independent + // (e.g. sort both sides by domain name) or switch to a BTreeMap. + let domain_metadata = HashMap::from([( + "delta.rowTracking".to_string(), + DomainMetadata::new( + "delta.rowTracking".to_string(), + r#"{"rowIdHighWaterMark":1048576}"#.to_string(), + ), + )]); + let ict = 1234567890; + let app_id = "testAppId".to_string(); + let set_transactions = + HashMap::from([(app_id.clone(), SetTransaction::new(app_id, 1, Some(ict)))]); + // Build a histogram with 5 files totaling 1024 bytes, all in the first bin (< 8KB). 
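+        // Note: create_default() yields the same 95-bin layout asserted in
+        // test_crc_serialized_json_content below, so every size under 8KB lands in bin 0.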
+ let mut histogram = FileSizeHistogram::create_default(); + for size in [100, 200, 300, 150, 274] { + histogram.insert(size).unwrap(); // 5 files, 1024 bytes total + } + Crc { + table_size_bytes: 1024, + num_files: 5, + num_metadata: 1, + num_protocol: 1, + protocol, + txn_id: None, + in_commit_timestamp_opt: Some(ict), + set_transactions: Some(set_transactions), + domain_metadata: Some(domain_metadata), + file_size_histogram: Some(histogram), + ..Default::default() + } + } + + #[test] + fn test_serde_round_trip() { + let crc = test_crc(); + let json_bytes = serde_json::to_vec(&crc).unwrap(); + let round_tripped: Crc = serde_json::from_slice(&json_bytes).unwrap(); + + assert_eq!(round_tripped, crc); + } + + #[test] + fn test_write_then_read_crc_file() { + let store = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(store).build(); + let table_root = url::Url::parse("memory:///test_table/").unwrap(); + let write_path = ParsedLogPath::create_parsed_crc(&table_root, 0); + let read_path = ParsedLogPath::create_parsed_crc(&table_root, 0); + let crc = test_crc(); + + try_write_crc_file(&engine, write_path.location.as_url(), &crc).unwrap(); + + let read_back = try_read_crc_file(&engine, &read_path).unwrap(); + assert_eq!(read_back, crc); + } + + /// Verify JSON content produced by CRC serialization via serde_json::Value comparison. + #[test] + fn test_crc_serialized_json_content() { + let crc = test_crc(); + let actual: serde_json::Value = serde_json::to_value(&crc).unwrap(); + + // Verify non-histogram fields match exactly. + let actual_obj = actual.as_object().unwrap(); + let expected_non_hist = serde_json::json!({ + "tableSizeBytes": 1024, + "numFiles": 5, + "numMetadata": 1, + "numProtocol": 1, + "metadata": { + "id": "", + "name": null, + "description": null, + "format": { + "provider": "parquet", + "options": {} + }, + "schemaString": "", + "partitionColumns": [], + "createdTime": null, + "configuration": {} + }, + "protocol": { + "minReaderVersion": 3, + "minWriterVersion": 7, + "readerFeatures": ["columnMapping"], + "writerFeatures": [ + "columnMapping", + "rowTracking", + "domainMetadata", + "inCommitTimestamp" + ] + }, + "inCommitTimestampOpt": 1234567890, + "domainMetadata": [ + { + "domain": "delta.rowTracking", + "configuration": "{\"rowIdHighWaterMark\":1048576}", + "removed": false + } + ], + "setTransactions": [ + { + "appId": "testAppId", + "version": 1, + "lastUpdated": 1234567890 + } + ] + }); + for (key, expected_val) in expected_non_hist.as_object().unwrap() { + assert_eq!( + actual_obj.get(key).unwrap(), + expected_val, + "Mismatch for key: {key}" + ); + } + + // Verify the histogram is present with correct camelCase keys and values. 
+ let hist = actual_obj.get("fileSizeHistogram").unwrap(); + let boundaries = hist.get("sortedBinBoundaries").unwrap().as_array().unwrap(); + let counts = hist.get("fileCounts").unwrap().as_array().unwrap(); + let bytes = hist.get("totalBytes").unwrap().as_array().unwrap(); + assert_eq!(boundaries.len(), 95); + assert_eq!(counts.len(), 95); + assert_eq!(bytes.len(), 95); + // All 5 files are in bin 0 (< 8KB) + assert_eq!(counts[0].as_i64().unwrap(), 5); + assert_eq!(bytes[0].as_i64().unwrap(), 1024); // 100+200+300+150+274 + } + + #[test] + fn test_write_crc_file_already_exists() { + let store = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(store).build(); + let table_root = url::Url::parse("memory:///test_table/").unwrap(); + let crc_path = ParsedLogPath::create_parsed_crc(&table_root, 0); + let crc = test_crc(); + + try_write_crc_file(&engine, crc_path.location.as_url(), &crc).unwrap(); + + // Second write should fail (never overwrites) + let result = try_write_crc_file(&engine, crc_path.location.as_url(), &crc); + assert!(result.is_err()); + } + + #[test] + fn test_write_rejects_invalid_file_stats_with_checksum_write_unsupported() { + let store = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(store).build(); + let table_root = url::Url::parse("memory:///test_table/").unwrap(); + let crc_path = ParsedLogPath::create_parsed_crc(&table_root, 0); + + for invalid_validity in [ + FileStatsValidity::RequiresCheckpointRead, + FileStatsValidity::Indeterminate, + FileStatsValidity::Untrackable, + ] { + let mut crc = test_crc(); + crc.file_stats_validity = invalid_validity; + let result = try_write_crc_file(&engine, crc_path.location.as_url(), &crc); + assert!( + matches!(result, Err(Error::ChecksumWriteUnsupported(_))), + "should reject {invalid_validity:?} with ChecksumWriteUnsupported" + ); + } + } +} diff --git a/kernel/src/doctests/into_engine_data.rs b/kernel/src/doctests/into_engine_data.rs new file mode 100644 index 0000000000..3085b4b4a1 --- /dev/null +++ b/kernel/src/doctests/into_engine_data.rs @@ -0,0 +1,39 @@ +//! Doctests for the `IntoEngineData` derive macro. +//! +//! `IntoEngineData` converts a Rust struct into the `EngineData` representation. +//! See the `IntoEngineData` trait for details. +//! `#[derive(IntoEngineData)]` implements the `IntoEngineData` trait for the struct. +//! +//! What is valid: +//! - A **named-field struct** (a regular `struct Foo { a: T, b: U }`) +//! +//! What is not valid (and should fail to compile): +//! - A **unit struct** (`struct Foo;`) — no fields to convert into engine data. +//! - A **tuple struct** (`struct Foo(T, U);`) — the macro expects named fields. 
+ +/// ``` +/// # use delta_kernel_derive::IntoEngineData; +/// #[derive(IntoEngineData)] +/// pub struct WithFields { +/// some_name: String, +/// count: i32, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithField; + +/// ```compile_fail +/// # use delta_kernel_derive::IntoEngineData; +/// #[derive(IntoEngineData)] +/// pub struct NoFields; +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithoutField; + +/// ```compile_fail +/// # use delta_kernel_derive::IntoEngineData; +/// #[derive(IntoEngineData)] +/// pub struct TupleStruct(String, i32); +/// ``` +#[cfg(doctest)] +pub struct MacroTestTupleStruct; diff --git a/kernel/src/doctests/mod.rs b/kernel/src/doctests/mod.rs new file mode 100644 index 0000000000..51658f5361 --- /dev/null +++ b/kernel/src/doctests/mod.rs @@ -0,0 +1,3 @@ +// doctests for macros +mod into_engine_data; +mod to_schema; diff --git a/kernel/src/doctests/to_schema.rs b/kernel/src/doctests/to_schema.rs new file mode 100644 index 0000000000..ae13a8c884 --- /dev/null +++ b/kernel/src/doctests/to_schema.rs @@ -0,0 +1,91 @@ +//! Doctests for ToSchema derive macro + +/// ``` +/// # use delta_kernel_derive::ToSchema; +/// #[derive(ToSchema)] +/// pub struct WithFields { +/// some_name: String, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithField; + +/// ```compile_fail +/// # use delta_kernel_derive::ToSchema; +/// #[derive(ToSchema)] +/// pub struct NoFields; +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithoutField; + +/// ``` +/// # use delta_kernel_derive::ToSchema; +/// # use std::collections::HashMap; +/// #[derive(ToSchema)] +/// pub struct WithAngleBracketPath { +/// map_field: HashMap, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithAngleBracketedPathField; + +/// ``` +/// # use delta_kernel_derive::ToSchema; +/// # use std::collections::HashMap; +/// #[derive(ToSchema)] +/// pub struct WithAttributedField { +/// #[allow_null_container_values] +/// map_field: HashMap, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithAttributedField; + +/// ```compile_fail +/// # use delta_kernel_derive::ToSchema; +/// #[derive(ToSchema)] +/// pub struct WithInvalidAttributeTarget { +/// #[allow_null_container_values] +/// some_name: String, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithInvalidAttributeTarget; + +/// Verify that `#[allow_null_container_values]` works on `Option>` fields. +/// This is needed for optional map fields like `Remove.partition_values` that can contain +/// null values. +/// ``` +/// # use delta_kernel_derive::ToSchema; +/// # use std::collections::HashMap; +/// #[derive(ToSchema)] +/// pub struct WithOptionalAttributedField { +/// #[allow_null_container_values] +/// map_field: Option>, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithOptionalAttributedField; + +/// Verify that `#[allow_null_container_values]` fails on `Option<_>` fields that are not maps. 
+/// ```compile_fail +/// # use delta_kernel_derive::ToSchema; +/// #[derive(ToSchema)] +/// pub struct WithInvalidOptionalAttributeTarget { +/// #[allow_null_container_values] +/// some_name: Option, +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithInvalidOptionalAttributeTarget; + +/// ```compile_fail +/// # use delta_kernel_derive::ToSchema; +/// # use syn::Token; +/// #[derive(ToSchema)] +/// pub struct WithInvalidFieldType { +/// token: Token![struct], +/// } +/// ``` +#[cfg(doctest)] +pub struct MacroTestStructWithInvalidFieldType; diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index 42049d26aa..419c0e2bbf 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -1,5 +1,6 @@ //! Conversions from kernel schema types to arrow schema types. +use std::collections::HashMap; use std::sync::Arc; use crate::arrow::datatypes::{ @@ -7,11 +8,13 @@ use crate::arrow::datatypes::{ SchemaRef as ArrowSchemaRef, TimeUnit, }; use crate::arrow::error::ArrowError; +use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY; use itertools::Itertools; use crate::error::Error; use crate::schema::{ - ArrayType, DataType, MapType, MetadataValue, PrimitiveType, StructField, StructType, + ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, PrimitiveType, StructField, + StructType, }; pub(crate) const LIST_ARRAY_ROOT: &str = "element"; @@ -19,6 +22,34 @@ pub(crate) const MAP_ROOT_DEFAULT: &str = "key_value"; pub(crate) const MAP_KEY_DEFAULT: &str = "key"; pub(crate) const MAP_VALUE_DEFAULT: &str = "value"; +/// Converts kernel [`StructField`] metadata to Arrow field metadata format. +/// +/// Specifically, this transforms the `"parquet.field.id"` key (used by kernel/delta-spark) to +/// `"PARQUET:field_id"` (the native Parquet/Arrow metadata key), enabling correct field ID +/// handling by the Arrow/Parquet writer. +pub(crate) fn kernel_metadata_to_arrow_metadata( + field: &StructField, +) -> Result, ArrowError> { + field + .metadata() + .iter() + .map(|(key, val)| { + let transformed_key = if key == ColumnMetadataKey::ParquetFieldId.as_ref() { + PARQUET_FIELD_ID_META_KEY.to_string() + } else { + key.clone() + }; + match val { + MetadataValue::String(s) => Ok((transformed_key, s.clone())), + _ => Ok(( + transformed_key, + serde_json::to_string(val).map_err(|e| ArrowError::JsonError(e.to_string()))?, + )), + } + }) + .collect() +} + /// Convert a kernel type into an arrow type (automatically implemented for all types that /// implement [`TryFromKernel`]) pub trait TryIntoArrow { @@ -61,25 +92,20 @@ where } } +/// Converts a kernel [`StructType`] to a `Vec`. 
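+/// Shared by the `ArrowSchema`, struct, and unshredded-variant conversions below.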
+fn try_kernel_struct_to_arrow_fields(s: &StructType) -> Result, ArrowError> { + s.fields().map(|f| f.try_into_arrow()).try_collect() +} + impl TryFromKernel<&StructType> for ArrowSchema { fn try_from_kernel(s: &StructType) -> Result { - let fields: Vec = s.fields().map(|f| f.try_into_arrow()).try_collect()?; - Ok(ArrowSchema::new(fields)) + Ok(ArrowSchema::new(try_kernel_struct_to_arrow_fields(s)?)) } } impl TryFromKernel<&StructField> for ArrowField { fn try_from_kernel(f: &StructField) -> Result { - let metadata = f - .metadata() - .iter() - .map(|(key, val)| match &val { - &MetadataValue::String(val) => Ok((key.clone(), val.clone())), - _ => Ok((key.clone(), serde_json::to_string(val)?)), - }) - .collect::>() - .map_err(|err| ArrowError::JsonError(err.to_string()))?; - + let metadata = kernel_metadata_to_arrow_metadata(f)?; let field = ArrowField::new(f.name(), f.data_type().try_into_arrow()?, f.is_nullable()) .with_metadata(metadata); @@ -151,10 +177,7 @@ impl TryFromKernel<&DataType> for ArrowDataType { } } DataType::Struct(s) => Ok(ArrowDataType::Struct( - s.fields() - .map(TryIntoArrow::try_into_arrow) - .collect::, ArrowError>>()? - .into(), + try_kernel_struct_to_arrow_fields(s)?.into(), )), DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(a.as_ref().try_into_arrow()?))), DataType::Map(m) => Ok(ArrowDataType::Map( @@ -164,10 +187,7 @@ impl TryFromKernel<&DataType> for ArrowDataType { DataType::Variant(s) => { if *t == DataType::unshredded_variant() { Ok(ArrowDataType::Struct( - s.fields() - .map(TryIntoArrow::try_into_arrow) - .collect::, ArrowError>>()? - .into(), + try_kernel_struct_to_arrow_fields(s)?.into(), )) } else { Err(ArrowError::SchemaError(format!( @@ -199,12 +219,39 @@ impl TryFromArrow for StructType { impl TryFromArrow<&ArrowField> for StructField { fn try_from_arrow(arrow_field: &ArrowField) -> Result { + let metadata = arrow_field.metadata(); + // If both the native Arrow key (PARQUET:field_id) and the kernel key (parquet.field.id) + // are present with different values, the translation below would silently overwrite one + // with the other. Detect and reject this up front. 
+ if let (Some(arrow_id), Some(kernel_id)) = ( + metadata.get(PARQUET_FIELD_ID_META_KEY), + metadata.get(ColumnMetadataKey::ParquetFieldId.as_ref()), + ) { + if arrow_id != kernel_id { + return Err(ArrowError::SchemaError(format!( + "Field '{}': conflicting parquet field IDs: '{}' ({}) vs '{}' ({})", + arrow_field.name(), + arrow_id, + PARQUET_FIELD_ID_META_KEY, + kernel_id, + ColumnMetadataKey::ParquetFieldId.as_ref(), + ))); + } + } Ok(StructField::new( arrow_field.name().clone(), DataType::try_from_arrow(arrow_field.data_type())?, arrow_field.is_nullable(), ) - .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) + .with_metadata(metadata.iter().map(|(k, v)| { + // Transform "PARQUET:field_id" to "parquet.field.id" when reading from Parquet + let transformed_key = if k == PARQUET_FIELD_ID_META_KEY { + ColumnMetadataKey::ParquetFieldId.as_ref().to_string() + } else { + k.clone() + }; + (transformed_key, v) + }))) } } @@ -246,6 +293,12 @@ impl TryFromArrow<&ArrowDataType> for DataType { { Ok(DataType::TIMESTAMP) } + ArrowDataType::Timestamp(TimeUnit::Nanosecond, None) => Ok(DataType::TIMESTAMP_NTZ), + ArrowDataType::Timestamp(TimeUnit::Nanosecond, Some(tz)) + if tz.eq_ignore_ascii_case("utc") => + { + Ok(DataType::TIMESTAMP) + } ArrowDataType::Struct(fields) => DataType::try_struct_type_from_results( fields.iter().map(|field| field.as_ref().try_into_kernel()), ) @@ -302,10 +355,12 @@ mod tests { use super::*; use crate::engine::arrow_conversion::ArrowField; use crate::engine::arrow_data::unshredded_variant_arrow_type; - use crate::{ - schema::{DataType, StructField}, - DeltaResult, + use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY; + use crate::schema::{ + ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, StructField, StructType, }; + use crate::transforms::SchemaTransform; + use crate::DeltaResult; use std::collections::HashMap; #[test] @@ -341,4 +396,244 @@ mod tests { .contains("Incorrect Variant Schema")); Ok(()) } + + /// Helper visitor to collect all field IDs from a kernel StructType + #[derive(Default)] + struct FieldIdCollector { + field_ids: Vec<(String, String)>, // (field_name, field_id) + } + + impl<'a> SchemaTransform<'a> for FieldIdCollector { + fn transform_struct_field( + &mut self, + field: &'a StructField, + ) -> Option> { + // Collect field ID if present + if let Some(field_id) = field + .metadata() + .get(ColumnMetadataKey::ParquetFieldId.as_ref()) + { + self.field_ids + .push((field.name().to_string(), field_id.to_string())); + } + // Recurse into nested types + self.recurse_into_struct_field(field) + } + } + + /// Helper function to recursively collect field IDs from an Arrow schema + fn collect_arrow_field_ids(schema: &ArrowSchema, metadata_key: &str) -> Vec<(String, String)> { + let mut field_ids = Vec::new(); + + fn collect_from_fields( + fields: &[std::sync::Arc], + metadata_key: &str, + field_ids: &mut Vec<(String, String)>, + ) { + for field in fields { + collect_from_field(field, metadata_key, field_ids); + } + } + + fn collect_from_field( + field: &ArrowField, + metadata_key: &str, + field_ids: &mut Vec<(String, String)>, + ) { + // Collect field ID from this field + if let Some(id) = field.metadata().get(metadata_key) { + field_ids.push((field.name().clone(), id.clone())); + } + + // Recurse into nested types + match field.data_type() { + ArrowDataType::Struct(fields) => { + collect_from_fields(fields, metadata_key, field_ids); + } + ArrowDataType::List(entry) + | ArrowDataType::LargeList(entry) + | 
ArrowDataType::FixedSizeList(entry, _) + | ArrowDataType::Map(entry, _) => { + collect_from_field(entry, metadata_key, field_ids); + } + _ => {} + } + } + + collect_from_fields(schema.fields(), metadata_key, &mut field_ids); + field_ids + } + + #[test] + fn test_recursive_field_id_transformation() -> DeltaResult<()> { + // Create a complex nested structure with field IDs at multiple levels: + // top_struct { + // simple_field: int (field_id=1) + // nested_struct: struct { (field_id=2) + // inner_field: string (field_id=3) + // } + // array_field: array + // map_field: map + // } + + // Build nested struct + let inner_struct_type = StructType::try_new(vec![StructField::new( + "inner_field", + DataType::STRING, + false, + ) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(3), + )])])?; + + // Build array element struct + let array_item_struct = StructType::try_new(vec![StructField::new( + "array_item", + DataType::INTEGER, + false, + ) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(5), + )])])?; + let array_type = ArrayType::new(DataType::Struct(Box::new(array_item_struct)), false); + + // Build map with struct key and struct value (both with field IDs) + let map_key_struct = StructType::try_new(vec![StructField::new( + "map_key_field", + DataType::STRING, + false, + ) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(7), + )])])?; + let map_value_struct = StructType::try_new(vec![StructField::new( + "map_value_field", + DataType::INTEGER, + false, + ) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(8), + )])])?; + let map_type = MapType::new( + DataType::Struct(Box::new(map_key_struct)), + DataType::Struct(Box::new(map_value_struct)), + false, + ); + + // Build top-level struct + let top_struct = StructType::try_new(vec![ + StructField::new("simple_field", DataType::INTEGER, false).with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(1), + )]), + StructField::new( + "nested_struct", + DataType::Struct(Box::new(inner_struct_type)), + false, + ) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(2), + )]), + StructField::new("array_field", DataType::Array(Box::new(array_type)), false) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(4), + )]), + StructField::new("map_field", DataType::Map(Box::new(map_type)), false).with_metadata( + [( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(6), + )], + ), + ])?; + + // Convert to Arrow schema + let arrow_schema = ArrowSchema::try_from_kernel(&top_struct)?; + + let expected_ids: HashMap = [ + ("simple_field", "1"), + ("nested_struct", "2"), + ("inner_field", "3"), + ("array_field", "4"), + ("array_item", "5"), + ("map_field", "6"), + ("map_key_field", "7"), + ("map_value_field", "8"), + ] + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + + // Verify field IDs are transformed to PARQUET:field_id at all levels + let arrow_field_ids: HashMap = + collect_arrow_field_ids(&arrow_schema, PARQUET_FIELD_ID_META_KEY) + .into_iter() + .collect(); + assert_eq!( + arrow_field_ids, expected_ids, + "All field IDs should be transformed to PARQUET:field_id" + ); + + // Test round-trip: Arrow -> Kernel, field IDs should be preserved unchanged + let kernel_struct = StructType::try_from_arrow(&arrow_schema)?; + let mut collector = 
FieldIdCollector::default(); + collector.transform_struct(&kernel_struct); + let kernel_field_ids: HashMap = collector.field_ids.into_iter().collect(); + assert_eq!( + kernel_field_ids, arrow_field_ids, + "Kernel field IDs should match Arrow field IDs after round-trip" + ); + + Ok(()) + } + + /// When an Arrow field carries both `PARQUET:field_id` and `parquet.field.id` with the same + /// value, the round-trip to kernel should succeed (one key is kept after translation). + #[test] + fn test_arrow_to_kernel_matching_field_ids_succeed() { + let arrow_field = ArrowField::new("a", ArrowDataType::Int32, false).with_metadata( + [ + (PARQUET_FIELD_ID_META_KEY.to_string(), "42".to_string()), + ( + ColumnMetadataKey::ParquetFieldId.as_ref().to_string(), + "42".to_string(), + ), + ] + .into(), + ); + let result = StructField::try_from_arrow(&arrow_field); + assert!(result.is_ok(), "Matching field IDs should succeed"); + } + + /// When an Arrow field carries both `PARQUET:field_id` and `parquet.field.id` with *different* + /// values, converting to kernel must fail rather than silently overwriting one ID with the + /// other. + #[test] + fn test_arrow_to_kernel_conflicting_field_ids_fail() { + let arrow_field = ArrowField::new("a", ArrowDataType::Int32, false).with_metadata( + [ + (PARQUET_FIELD_ID_META_KEY.to_string(), "1".to_string()), + ( + ColumnMetadataKey::ParquetFieldId.as_ref().to_string(), + "2".to_string(), + ), + ] + .into(), + ); + crate::utils::test_utils::assert_result_error_with_message( + StructField::try_from_arrow(&arrow_field), + "conflicting parquet field IDs", + ); + } } diff --git a/kernel/src/engine/arrow_data.rs b/kernel/src/engine/arrow_data.rs index 1ce2154f82..3ef89e6b28 100644 --- a/kernel/src/engine/arrow_data.rs +++ b/kernel/src/engine/arrow_data.rs @@ -1,21 +1,26 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::sync::Arc; use itertools::Itertools; use tracing::debug; use crate::arrow::array::cast::AsArray; -use crate::arrow::array::types::{Int32Type, Int64Type}; +use crate::arrow::array::types::{ + Date32Type, Decimal128Type, Float32Type, Float64Type, GenericStringType, Int32Type, Int64Type, + TimestampMicrosecondType, +}; use crate::arrow::array::{ - Array, ArrayRef, GenericListArray, MapArray, OffsetSizeTrait, RecordBatch, StructArray, + Array, ArrayRef, GenericByteArray, OffsetSizeTrait, RecordBatch, RunArray, StringViewArray, + StructArray, }; +use crate::arrow::compute::filter_record_batch; use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, FieldRef, Schema as ArrowSchema, }; use crate::engine::arrow_conversion::TryIntoArrow as _; -use crate::engine_data::{EngineData, EngineList, EngineMap, GetData, RowVisitor}; +use crate::engine_data::{EngineData, GetData, RowVisitor, StringArrayAccessor}; use crate::expressions::ArrayData; -use crate::schema::{ColumnName, DataType, SchemaRef}; +use crate::schema::{ColumnName, DataType, PrimitiveType, SchemaRef}; use crate::{DeltaResult, Error}; pub use crate::engine::arrow_utils::fix_nested_null_masks; @@ -29,6 +34,32 @@ pub struct ArrowEngineData { data: RecordBatch, } +/// A trait to allow easy conversion from [`EngineData`] to an arrow [``RecordBatch`]. Returns an +/// error if called on an `EngineData` that is not an `ArrowEngineData`. 
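+///
+/// Usage sketch (illustrative; assumes `data: Box<dyn EngineData>` was produced by an
+/// Arrow-backed engine, and that this trait is reachable at the path shown):
+/// ```ignore
+/// use delta_kernel::engine::arrow_data::EngineDataArrowExt as _;
+/// let batch: RecordBatch = data.try_into_record_batch()?;
+/// ```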
+pub trait EngineDataArrowExt { + fn try_into_record_batch(self) -> DeltaResult; +} + +impl EngineDataArrowExt for Box { + fn try_into_record_batch(self) -> DeltaResult { + Ok(self + .into_any() + .downcast::() + .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? + .into()) + } +} + +impl EngineDataArrowExt for DeltaResult> { + fn try_into_record_batch(self) -> DeltaResult { + Ok(self? + .into_any() + .downcast::() + .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? + .into()) + } +} + /// Helper function to extract a RecordBatch from EngineData, ensuring it's ArrowEngineData pub(crate) fn extract_record_batch(engine_data: &dyn EngineData) -> DeltaResult<&RecordBatch> { let Some(arrow_data) = engine_data.any_ref().downcast_ref::() else { @@ -90,58 +121,39 @@ impl From> for RecordBatch { } } -impl EngineList for GenericListArray -where - OffsetSize: OffsetSizeTrait, -{ - fn len(&self, row_index: usize) -> usize { - self.value(row_index).len() +impl StringArrayAccessor for GenericByteArray> { + fn len(&self) -> usize { + Array::len(self) } - - fn get(&self, row_index: usize, index: usize) -> String { - let arry = self.value(row_index); - let sarry = arry.as_string::(); - sarry.value(index).to_string() + fn value(&self, index: usize) -> &str { + self.value(index) } - - fn materialize(&self, row_index: usize) -> Vec { - let mut result = vec![]; - for i in 0..EngineList::len(self, row_index) { - result.push(self.get(row_index, i)); - } - result + fn is_valid(&self, index: usize) -> bool { + Array::is_valid(self, index) } } -impl EngineMap for MapArray { - fn get<'a>(&'a self, row_index: usize, key: &str) -> Option<&'a str> { - let offsets = self.offsets(); - let start_offset = offsets[row_index] as usize; - let count = offsets[row_index + 1] as usize - start_offset; - let keys = self.keys().as_string::(); - for (idx, map_key) in keys.iter().enumerate().skip(start_offset).take(count) { - if let Some(map_key) = map_key { - if key == map_key { - // found the item - let vals = self.values().as_string::(); - return Some(vals.value(idx)); - } - } - } - None +impl StringArrayAccessor for StringViewArray { + fn len(&self) -> usize { + Array::len(self) + } + fn value(&self, index: usize) -> &str { + self.value(index) } + fn is_valid(&self, index: usize) -> bool { + Array::is_valid(self, index) + } +} - fn materialize(&self, row_index: usize) -> HashMap { - let mut ret = HashMap::new(); - let map_val = self.value(row_index); - let keys = map_val.column(0).as_string::(); - let values = map_val.column(1).as_string::(); - for (key, value) in keys.iter().zip(values.iter()) { - if let (Some(key), Some(value)) = (key, value) { - ret.insert(key.into(), value.into()); - } - } - ret +/// Downcast an Arrow array to a [`StringArrayAccessor`], trying Utf8, LargeUtf8, and +/// Utf8View in order. Returns `None` if the array is not a string type. +pub(crate) fn as_string_accessor(array: &dyn Array) -> Option<&dyn StringArrayAccessor> { + if let Some(a) = array.as_string_opt::() { + Some(a) + } else if let Some(a) = array.as_string_opt::() { + Some(a) + } else { + Some(array.as_string_view_opt()?) 
} } @@ -170,6 +182,16 @@ impl ProvidesColumnsAndFields for StructArray { } } +/// Tracks the state of a column during extraction +enum ColumnState<'a> { + /// Parent path used for traversal into nested structs + Parent, + /// Leaf column awaiting a getter to be extracted + AwaitingGetter(&'a DataType), + /// Leaf column with getter successfully extracted + HasGetter(&'a dyn GetData<'a>), +} + impl EngineData for ArrowEngineData { fn len(&self) -> usize { self.data.num_rows() @@ -191,19 +213,47 @@ impl EngineData for ArrowEngineData { .with_backtrace()); } - // Collect the names of all leaf columns we want to extract, along with their parents, to - // guide our depth-first extraction. If the list contains any non-leaf, duplicate, or - // missing column references, the extracted column list will be too short (error out below). - let mut mask = HashSet::new(); + // Build a map tracking the state of each column path: + // - Parent: used for traversal into nested structs + // - AwaitingGetter: leaf column that needs a getter extracted + // - HasGetter: leaf column with getter successfully extracted (set during extraction) + // + // This is used to guide our depth-first extraction. If the list contains any non-leaf, + // duplicate, or missing column references, the extracted column list will be too + // short (error out below). + let mut column_map = HashMap::with_capacity(leaf_columns.len() * 2); + + for (column, data_type) in leaf_columns.iter().zip(leaf_types.iter()) { + column_map.insert(column.clone(), ColumnState::AwaitingGetter(data_type)); + let mut cur_parent = column.parent(); + while let Some(parent) = cur_parent { + column_map + .entry(parent.clone()) + .or_insert(ColumnState::Parent); + cur_parent = parent.parent(); + } + } + debug!( + "Column map for selected columns {leaf_columns:?} has {} entries", + column_map.len() + ); + + // Extract all columns, transitioning AwaitingGetter -> HasGetter + Self::extract_columns(&mut vec![], &mut column_map, &self.data)?; + + // Extract getters in the requested column order, verifying state transitions + let mut getters = Vec::with_capacity(leaf_columns.len()); for column in leaf_columns { - for i in 0..column.len() { - mask.insert(&column[..i + 1]); + match column_map.get(column.as_ref()) { + Some(ColumnState::HasGetter(getter)) => getters.push(*getter), + _ => { + return Err(Error::MissingColumn(format!( + "Column {column} not found in the data" + ))); + } } } - debug!("Column mask for selected columns {leaf_columns:?} is {mask:#?}"); - let mut getters = vec![]; - Self::extract_columns(&mut vec![], &mut getters, leaf_types, &mask, &self.data)?; if getters.len() != leaf_columns.len() { return Err(Error::MissingColumn(format!( "Visitor expected {} leaf columns, but only {} were found in the data", @@ -237,32 +287,55 @@ impl EngineData for ArrowEngineData { let data = RecordBatch::try_new(combined_schema, combined_columns)?; Ok(Box::new(ArrowEngineData { data })) } + + fn apply_selection_vector( + self: Box, + mut selection_vector: Vec, + ) -> DeltaResult> { + selection_vector.resize(self.len(), true); + let filtered = filter_record_batch(&self.data, &selection_vector.into())?; + Ok(Box::new(Self::new(filtered))) + } } impl ArrowEngineData { fn extract_columns<'a>( path: &mut Vec, - getters: &mut Vec<&'a dyn GetData<'a>>, - leaf_types: &[DataType], - column_mask: &HashSet<&[String]>, + column_map: &mut HashMap>, data: &'a dyn ProvidesColumnsAndFields, ) -> DeltaResult<()> { for (column, field) in data.columns().iter().zip(data.fields()) { 
path.push(field.name().to_string()); - if column_mask.contains(&path[..]) { - if let Some(struct_array) = column.as_struct_opt() { - debug!( - "Recurse into a struct array for {}", - ColumnName::new(path.iter()) - ); - Self::extract_columns(path, getters, leaf_types, column_mask, struct_array)?; - } else if column.data_type() == &ArrowDataType::Null { - debug!("Pushing a null array for {}", ColumnName::new(path.iter())); - getters.push(&()); - } else { - let data_type = &leaf_types[getters.len()]; - let getter = Self::extract_leaf_column(path, data_type, column)?; - getters.push(getter); + + // Check if this path is in our column map and mutate state if needed + if let Some(state) = column_map.get_mut(path.as_slice()) { + match state { + ColumnState::Parent => { + // Parent path - recurse if it's a struct + if let Some(struct_array) = column.as_struct_opt() { + debug!( + "Recurse into a struct array for {}", + ColumnName::new(path.iter()) + ); + Self::extract_columns(path, column_map, struct_array)?; + } + } + ColumnState::AwaitingGetter(data_type) => { + // Leaf column - extract and transition to HasGetter + let getter = if column.data_type() == &ArrowDataType::Null { + debug!("Pushing a null array for {}", ColumnName::new(path.iter())); + &() as &'a dyn GetData<'a> + } else { + Self::extract_leaf_column(path, data_type, column)? + }; + *state = ColumnState::HasGetter(getter); + } + ColumnState::HasGetter(_) => { + return Err(Error::internal_error(format!( + "Column {} already has a getter - duplicate column?", + ColumnName::new(path.iter()) + ))); + } } } else { debug!("Skipping unmasked path {}", ColumnName::new(path.iter())); @@ -272,47 +345,122 @@ impl ArrowEngineData { Ok(()) } + /// Helper function to extract a column, supporting both direct arrays and REE-encoded (RunEndEncoded) arrays. + /// This reduces boilerplate by handling the common pattern of trying direct access first, + /// then falling back to RunArray if the column is REE-encoded. 
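    // For example (the same pattern the round-trip test below uses): a RunArray with
    // run_ends [2, 4, 5] and values ["a", null, "b"] decodes logically to
    // ["a", "a", null, null, "b"].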
+ fn try_extract_with_ree<'a>(col: &'a dyn Array) -> Option<&'a dyn GetData<'a>> { + match col.data_type() { + ArrowDataType::RunEndEncoded(_, _) => col + .as_any() + .downcast_ref::>() + .map(|run_array| run_array as &'a dyn GetData<'a>), + _ => None, + } + } + fn extract_leaf_column<'a>( path: &[String], data_type: &DataType, col: &'a dyn Array, ) -> DeltaResult<&'a dyn GetData<'a>> { - use ArrowDataType::Utf8; - let col_as_list = || { - if let Some(array) = col.as_list_opt::() { - (array.value_type() == Utf8).then_some(array as _) - } else if let Some(array) = col.as_list_opt::() { - (array.value_type() == Utf8).then_some(array as _) - } else { - None + // TODO: Replace with `ArrowDataType::is_string()` once we bump arrow-schema past 57.2.0 + let is_string_type = |dt: &ArrowDataType| { + matches!( + dt, + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View + ) + }; + let col_as_list = || -> Option<&'a dyn GetData<'a>> { + match col.data_type() { + ArrowDataType::List(f) + | ArrowDataType::LargeList(f) + | ArrowDataType::ListView(f) + | ArrowDataType::LargeListView(f) + if is_string_type(f.data_type()) => {} + _ => return None, } + col.as_list_opt::() + .map(|a| a as _) + .or_else(|| col.as_list_opt::().map(|a| a as _)) + .or_else(|| col.as_list_view_opt::().map(|a| a as _)) + .or_else(|| col.as_list_view_opt::().map(|a| a as _)) }; let col_as_map = || { col.as_map_opt().and_then(|array| { - (array.key_type() == &Utf8 && array.value_type() == &Utf8).then_some(array as _) + (is_string_type(array.key_type()) && is_string_type(array.value_type())) + .then_some(array as _) }) }; let result: Result<&'a dyn GetData<'a>, _> = match data_type { &DataType::BOOLEAN => { debug!("Pushing boolean array for {}", ColumnName::new(path)); - col.as_boolean_opt().map(|a| a as _).ok_or("bool") + col.as_boolean_opt() + .map(|a| a as _) + .or_else(|| Self::try_extract_with_ree(col)) + .ok_or("bool") } &DataType::STRING => { debug!("Pushing string array for {}", ColumnName::new(path)); - col.as_string_opt().map(|a| a as _).ok_or("string") + col.as_string_opt::() + .map(|a| a as _) + .or_else(|| col.as_string_opt::().map(|a| a as _)) + .or_else(|| col.as_string_view_opt().map(|a| a as _)) + .or_else(|| Self::try_extract_with_ree(col)) + .ok_or("string") + } + &DataType::BINARY => { + debug!("Pushing binary array for {}", ColumnName::new(path)); + col.as_binary_opt::() + .map(|a| a as _) + .or_else(|| col.as_binary_opt::().map(|a| a as _)) + .or_else(|| col.as_binary_view_opt().map(|a| a as _)) + .or_else(|| Self::try_extract_with_ree(col)) + .ok_or("binary") } &DataType::INTEGER => { debug!("Pushing int32 array for {}", ColumnName::new(path)); col.as_primitive_opt::() .map(|a| a as _) + .or_else(|| Self::try_extract_with_ree(col)) .ok_or("int") } &DataType::LONG => { debug!("Pushing int64 array for {}", ColumnName::new(path)); col.as_primitive_opt::() .map(|a| a as _) + .or_else(|| Self::try_extract_with_ree(col)) .ok_or("long") } + &DataType::FLOAT => { + debug!("Pushing float array for {}", ColumnName::new(path)); + col.as_primitive_opt::() + .map(|a| a as _) + .ok_or("float") + } + &DataType::DOUBLE => { + debug!("Pushing double array for {}", ColumnName::new(path)); + col.as_primitive_opt::() + .map(|a| a as _) + .ok_or("double") + } + &DataType::DATE => { + debug!("Pushing date array for {}", ColumnName::new(path)); + col.as_primitive_opt::() + .map(|a| a as _) + .ok_or("date") + } + &DataType::TIMESTAMP | &DataType::TIMESTAMP_NTZ => { + debug!("Pushing timestamp array for {}", 
ColumnName::new(path)); + col.as_primitive_opt::() + .map(|a| a as _) + .ok_or("timestamp") + } + DataType::Primitive(PrimitiveType::Decimal(_)) => { + debug!("Pushing decimal array for {}", ColumnName::new(path)); + col.as_primitive_opt::() + .map(|a| a as _) + .ok_or("decimal") + } DataType::Array(_) => { debug!("Pushing list for {}", ColumnName::new(path)); col_as_list().ok_or("array") @@ -341,20 +489,27 @@ impl ArrowEngineData { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::sync::{Arc, LazyLock}; use crate::actions::{get_commit_schema, Metadata, Protocol}; - use crate::arrow::array::types::Int32Type; - use crate::arrow::array::{Array, AsArray, Int32Array, RecordBatch, StringArray}; + use crate::arrow::array::types::{Int32Type, Int64Type}; + use crate::arrow::array::{ + Array, ArrayRef, AsArray, BinaryArray, BooleanArray, Int32Array, Int64Array, + LargeBinaryArray, LargeStringArray, ListViewArray, MapArray, RecordBatch, RunArray, + StringArray, StringViewArray, StructArray, + }; + use crate::arrow::buffer::{OffsetBuffer, ScalarBuffer}; use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, }; use crate::engine::sync::SyncEngine; + use crate::engine_data::{GetData, ListItem, MapItem, RowVisitor, TypedGetData}; use crate::expressions::ArrayData; - use crate::schema::{ArrayType, DataType, StructField, StructType}; - use crate::table_features::{ReaderFeature, WriterFeature}; + use crate::schema::{ArrayType, ColumnName, DataType, StructField, StructType}; + use crate::table_features::TableFeature; use crate::utils::test_utils::{assert_result_error_with_message, string_array_to_engine_data}; use crate::{DeltaResult, Engine as _, EngineData as _}; + use rstest::rstest; use super::{extract_record_batch, ArrowEngineData}; @@ -394,11 +549,11 @@ mod tests { assert_eq!(protocol.min_writer_version(), 7); assert_eq!( protocol.reader_features(), - Some([ReaderFeature::unknown("rw1")].as_slice()) + Some([TableFeature::unknown("rw1")].as_slice()) ); assert_eq!( protocol.writer_features(), - Some([WriterFeature::unknown("rw1"), WriterFeature::unknown("w2")].as_slice()) + Some([TableFeature::unknown("rw1"), TableFeature::unknown("w2")].as_slice()) ); Ok(()) } @@ -724,4 +879,829 @@ mod tests { Ok(()) } + + #[test] + fn test_binary_column_extraction() -> DeltaResult<()> { + // Create a RecordBatch with binary data + let binary_data: Vec> = vec![ + Some(b"hello"), + Some(b"world"), + None, + Some(b"\x00\x01\x02\x03"), + ]; + let binary_array = BinaryArray::from(binary_data.clone()); + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "data", + ArrowDataType::Binary, + true, + )])); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(binary_array)])?; + let arrow_data = ArrowEngineData::new(batch); + + // Create a visitor to extract binary data + struct BinaryVisitor { + values: Vec>>, + } + + impl RowVisitor for BinaryVisitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["data"])]); + static TYPES: LazyLock> = LazyLock::new(|| vec![DataType::BINARY]); + (&NAMES, &TYPES) + } + + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + assert_eq!(getters.len(), 1); + let getter = getters[0]; + + for i in 0..row_count { + self.values + .push(getter.get_binary(i, "data")?.map(|b| b.to_vec())); + } + Ok(()) + } + } + + let mut visitor = BinaryVisitor { 
values: vec![] }; + arrow_data.visit_rows(&[ColumnName::new(["data"])], &mut visitor)?; + + // Verify the extracted values + assert_eq!(visitor.values.len(), 4); + assert_eq!(visitor.values[0].as_deref(), Some(b"hello".as_ref())); + assert_eq!(visitor.values[1].as_deref(), Some(b"world".as_ref())); + assert_eq!(visitor.values[2], None); + assert_eq!( + visitor.values[3].as_deref(), + Some(b"\x00\x01\x02\x03".as_ref()) + ); + + Ok(()) + } + + #[test] + fn test_binary_column_extraction_type_mismatch() -> DeltaResult<()> { + // Create a RecordBatch with Int32 data (not binary) + let data: Vec> = vec![Some(123)]; + let int_array = Int32Array::from(data); + + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "data", + ArrowDataType::Int32, + true, + )])); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(int_array)])?; + let arrow_data = ArrowEngineData::new(batch); + + // Create a visitor that tries to extract binary data from an int column + struct BinaryVisitor { + values: Vec>>, + } + + impl RowVisitor for BinaryVisitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["data"])]); + static TYPES: LazyLock> = LazyLock::new(|| vec![DataType::BINARY]); + (&NAMES, &TYPES) + } + + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + assert_eq!(getters.len(), 1); + let getter = getters[0]; + + for i in 0..row_count { + self.values + .push(getter.get_binary(i, "data")?.map(|b| b.to_vec())); + } + Ok(()) + } + } + + let mut visitor = BinaryVisitor { values: vec![] }; + let result = arrow_data.visit_rows(&[ColumnName::new(["data"])], &mut visitor); + + // Verify that we get a type mismatch error + assert_result_error_with_message( + result, + "Type mismatch on data: expected binary, got Int32", + ); + + Ok(()) + } + + #[test] + fn test_column_ordering_independence() -> DeltaResult<()> { + // Schema: field_a, field_b, nested.x, nested.y + let nested_fields = vec![ + ArrowField::new("x", ArrowDataType::Int32, false), + ArrowField::new("y", ArrowDataType::Int32, false), + ]; + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("field_a", ArrowDataType::Int32, false), + ArrowField::new("field_b", ArrowDataType::Int32, false), + ArrowField::new( + "nested", + ArrowDataType::Struct(nested_fields.clone().into()), + false, + ), + ])), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(Int32Array::from(vec![10, 20])), + Arc::new(StructArray::try_new( + nested_fields.into(), + vec![ + Arc::new(Int32Array::from(vec![100, 200])), + Arc::new(Int32Array::from(vec![1000, 2000])), + ], + None, + )?), + ], + )?; + + // Column names requested in reverse order (not schema order) + static REQUESTED_COLUMNS: LazyLock> = LazyLock::new(|| { + vec![ + ColumnName::new(["nested", "y"]), + ColumnName::new(["field_b"]), + ColumnName::new(["nested", "x"]), + ColumnName::new(["field_a"]), + ] + }); + + struct Visitor { + values: Vec<(i32, i32, i32, i32)>, + } + impl RowVisitor for Visitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static TYPES: LazyLock> = + LazyLock::new(|| vec![DataType::INTEGER; 4]); + (&REQUESTED_COLUMNS, &TYPES) + } + + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + for i in 0..row_count { + self.values.push(( + getters[0].get(i, "nested.y")?, + 
getters[1].get(i, "field_b")?, + getters[2].get(i, "nested.x")?, + getters[3].get(i, "field_a")?, + )); + } + Ok(()) + } + } + + let mut visitor = Visitor { values: vec![] }; + ArrowEngineData::new(batch).visit_rows(&REQUESTED_COLUMNS, &mut visitor)?; + + // Verify values match requested order, not schema order + assert_eq!(visitor.values, vec![(1000, 10, 100, 1), (2000, 20, 200, 2)]); + Ok(()) + } + + #[test] + fn test_visit_duplicate_column_error() -> DeltaResult<()> { + // Create batch with simple columns + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + ArrowField::new("field_a", ArrowDataType::Int32, false), + ArrowField::new("field_a", ArrowDataType::Int32, false), // Duplicate column name + ])), + vec![ + Arc::new(Int32Array::from(vec![1, 2])), + Arc::new(Int32Array::from(vec![10, 20])), + ], + )?; + + // Request the duplicate column + static REQUESTED_COLUMNS: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["field_a"])]); + + struct DummyVisitor; + impl RowVisitor for DummyVisitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static TYPES: LazyLock> = LazyLock::new(|| vec![DataType::INTEGER]); + (&REQUESTED_COLUMNS, &TYPES) + } + fn visit<'a>( + &mut self, + _row_count: usize, + _getters: &[&'a dyn crate::engine_data::GetData<'a>], + ) -> DeltaResult<()> { + Ok(()) + } + } + + let mut visitor = DummyVisitor; + let result = ArrowEngineData::new(batch).visit_rows(&REQUESTED_COLUMNS, &mut visitor); + + assert_result_error_with_message( + result, + "Column field_a already has a getter - duplicate column?", + ); + Ok(()) + } + + #[test] + fn test_run_array_out_of_bounds_errors() -> DeltaResult<()> { + // Test that out of bounds errors include field name for all types + let run_ends = Int64Array::from(vec![2]); + + // Test str + let str_array = + RunArray::::try_new(&run_ends, &StringArray::from(vec!["test"]))?; + let err_msg = str_array.get_str(2, "str_field").unwrap_err().to_string(); + assert!(err_msg.contains("out of bounds") && err_msg.contains("str_field")); + + // Test int + let int_array = RunArray::::try_new(&run_ends, &Int32Array::from(vec![42]))?; + let err_msg = int_array.get_int(5, "int_field").unwrap_err().to_string(); + assert!(err_msg.contains("out of bounds") && err_msg.contains("int_field")); + + // Test long + let long_array = + RunArray::::try_new(&run_ends, &Int64Array::from(vec![100i64]))?; + let err_msg = long_array + .get_long(3, "long_field") + .unwrap_err() + .to_string(); + assert!(err_msg.contains("out of bounds") && err_msg.contains("long_field")); + + // Test bool + let bool_array = + RunArray::::try_new(&run_ends, &BooleanArray::from(vec![true]))?; + let err_msg = bool_array + .get_bool(2, "bool_field") + .unwrap_err() + .to_string(); + assert!(err_msg.contains("out of bounds") && err_msg.contains("bool_field")); + + // Test binary + let binary_array = RunArray::::try_new( + &run_ends, + &BinaryArray::from(vec![Some(b"data".as_ref())]), + )?; + let err_msg = binary_array + .get_binary(4, "binary_field") + .unwrap_err() + .to_string(); + assert!(err_msg.contains("out of bounds") && err_msg.contains("binary_field")); + + Ok(()) + } + + #[test] + fn test_run_array_extraction_via_visitor() -> DeltaResult<()> { + // Create RunArray columns with pattern: [val1, val1, null, null, val2] + // Per Arrow spec: nulls are encoded as runs in the values child array + let run_ends = Int64Array::from(vec![2, 4, 5]); + let mk_field = |name, dt| { + ArrowField::new( + name, + 
ArrowDataType::RunEndEncoded( + Arc::new(ArrowField::new("run_ends", ArrowDataType::Int64, false)), + Arc::new(ArrowField::new("values", dt, true)), + ), + true, + ) + }; + + let columns: Vec> = vec![ + Arc::new(RunArray::::try_new( + &run_ends, + &StringArray::from(vec![Some("a"), None, Some("b")]), + )?), + Arc::new(RunArray::::try_new( + &run_ends, + &Int32Array::from(vec![Some(1), None, Some(2)]), + )?), + Arc::new(RunArray::::try_new( + &run_ends, + &Int64Array::from(vec![Some(10i64), None, Some(20)]), + )?), + Arc::new(RunArray::::try_new( + &run_ends, + &BooleanArray::from(vec![Some(true), None, Some(false)]), + )?), + Arc::new(RunArray::::try_new( + &run_ends, + &BinaryArray::from(vec![Some(b"x".as_ref()), None, Some(b"y".as_ref())]), + )?), + ]; + + let schema = Arc::new(ArrowSchema::new(vec![ + mk_field("s", ArrowDataType::Utf8), + mk_field("i", ArrowDataType::Int32), + mk_field("l", ArrowDataType::Int64), + mk_field("b", ArrowDataType::Boolean), + mk_field("bin", ArrowDataType::Binary), + ])); + + let arrow_data = ArrowEngineData::new(RecordBatch::try_new(schema, columns)?); + + type Row = ( + Option, + Option, + Option, + Option, + Option>, + ); + + struct TestVisitor { + data: Vec, + } + + impl RowVisitor for TestVisitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static COLUMNS: LazyLock<[ColumnName; 5]> = LazyLock::new(|| { + [ + ColumnName::new(["s"]), + ColumnName::new(["i"]), + ColumnName::new(["l"]), + ColumnName::new(["b"]), + ColumnName::new(["bin"]), + ] + }); + static TYPES: &[DataType] = &[ + DataType::STRING, + DataType::INTEGER, + DataType::LONG, + DataType::BOOLEAN, + DataType::BINARY, + ]; + (&*COLUMNS, TYPES) + } + + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + for i in 0..row_count { + self.data.push(( + getters[0].get_str(i, "s")?.map(|s| s.to_string()), + getters[1].get_int(i, "i")?, + getters[2].get_long(i, "l")?, + getters[3].get_bool(i, "b")?, + getters[4].get_binary(i, "bin")?.map(|b| b.to_vec()), + )); + } + Ok(()) + } + } + + let mut visitor = TestVisitor { data: vec![] }; + visitor.visit_rows_of(&arrow_data)?; + + // Verify decompression including nulls: [val1, val1, null, null, val2] + let expected = vec![ + ( + Some("a".into()), + Some(1), + Some(10), + Some(true), + Some(b"x".to_vec()), + ), + ( + Some("a".into()), + Some(1), + Some(10), + Some(true), + Some(b"x".to_vec()), + ), + (None, None, None, None, None), + (None, None, None, None, None), + ( + Some("b".into()), + Some(2), + Some(20), + Some(false), + Some(b"y".to_vec()), + ), + ]; + assert_eq!(visitor.data, expected); + + Ok(()) + } + + /// Helper to create a MapArray from key-value pairs for materialize tests + fn create_map_array(entries: Vec)>>) -> MapArray { + let mut all_keys = vec![]; + let mut all_values = vec![]; + let mut offsets = vec![0i32]; + + for entry_group in entries { + for (key, value) in entry_group { + all_keys.push(Some(key)); + all_values.push(value); + } + offsets.push(all_keys.len() as i32); + } + + let keys_array = + Arc::new(StringArray::from(all_keys)) as Arc; + let values_array = + Arc::new(StringArray::from(all_values)) as Arc; + + let entries_struct = StructArray::try_new( + vec![ + Arc::new(ArrowField::new("keys", ArrowDataType::Utf8, false)), + Arc::new(ArrowField::new("values", ArrowDataType::Utf8, true)), + ] + .into(), + vec![keys_array, values_array], + None, + ) + .unwrap(); + + let offsets_buffer = 
OffsetBuffer::new(offsets.into()); + MapArray::try_new( + Arc::new(ArrowField::new_struct( + "entries", + vec![ + Arc::new(ArrowField::new("keys", ArrowDataType::Utf8, false)), + Arc::new(ArrowField::new("values", ArrowDataType::Utf8, true)), + ], + false, + )), + offsets_buffer, + entries_struct, + None, + false, + ) + .unwrap() + } + + /// Helper to construct a MapItem from a MapArray for a given row. + fn map_item_from<'a>(map: &'a MapArray, row: usize) -> MapItem<'a> { + let keys = super::as_string_accessor(map.keys().as_ref()).unwrap(); + let values = super::as_string_accessor(map.values().as_ref()).unwrap(); + let start = map.offsets()[row] as usize; + let end = map.offsets()[row + 1] as usize; + MapItem::new(keys, values, start..end) + } + + #[test] + fn test_materialize_matches_get() -> DeltaResult<()> { + // Create MapArray with various keys + let map_array = create_map_array(vec![vec![ + ("key1", Some("value1")), + ("key2", Some("value2")), + ("key3", Some("value3")), + ]]); + + let item = map_item_from(&map_array, 0); + let materialized = item.materialize(); + + // Verify that get(key) matches materialize()[key] for all keys + for (key, value) in &materialized { + let get_result = item.get(key); + assert_eq!(get_result, Some(value.as_str())); + } + + // Verify count matches + assert_eq!(materialized.len(), 3); + Ok(()) + } + + #[test] + fn test_materialize_handles_nulls() -> DeltaResult<()> { + // Create MapArray with null values + let map_array = + create_map_array(vec![vec![("a", Some("1")), ("b", None), ("c", Some("3"))]]); + + let item = map_item_from(&map_array, 0); + let result = item.materialize(); + + // Null values should be excluded from materialized map + assert_eq!(result.len(), 2); + assert_eq!(result.get("a"), Some(&"1".to_string())); + assert_eq!(result.get("b"), None); + assert_eq!(result.get("c"), Some(&"3".to_string())); + Ok(()) + } + + #[test] + fn test_materialize_empty_map() -> DeltaResult<()> { + // Create MapArray with empty map + let map_array = create_map_array(vec![vec![]]); + + let item = map_item_from(&map_array, 0); + let result = item.materialize(); + + assert_eq!(result.len(), 0); + Ok(()) + } + + #[test] + fn test_materialize_multiple_rows() -> DeltaResult<()> { + // Create MapArray with multiple rows + let map_array = create_map_array(vec![ + vec![("a", Some("1")), ("b", Some("2"))], + vec![("x", Some("10")), ("y", Some("20"))], + ]); + + let item0 = map_item_from(&map_array, 0); + let result0 = item0.materialize(); + assert_eq!(result0.len(), 2); + assert_eq!(result0.get("a"), Some(&"1".to_string())); + assert_eq!(result0.get("b"), Some(&"2".to_string())); + + let item1 = map_item_from(&map_array, 1); + let result1 = item1.materialize(); + assert_eq!(result1.len(), 2); + assert_eq!(result1.get("x"), Some(&"10".to_string())); + assert_eq!(result1.get("y"), Some(&"20".to_string())); + Ok(()) + } + + #[test] + fn test_get_vs_materialize_consistency_with_duplicates() -> DeltaResult<()> { + // Test that materialize() handles duplicate keys correctly (last wins) + // and that get() returns the same value as materialize() for duplicate keys + let map_array = create_map_array(vec![vec![ + ("a", Some("1")), + ("b", Some("2")), + ("a", Some("3")), // Duplicate 'a' - should override first + ("c", Some("4")), + ("a", Some("5")), // Another duplicate 'a' - should be final value + ]]); + + let item = map_item_from(&map_array, 0); + let materialized = item.materialize(); + + // Verify materialize() handles duplicates correctly (last wins) + 
assert_eq!(materialized.len(), 3); // Only 3 unique keys + assert_eq!(materialized.get("a"), Some(&"5".to_string())); // Last 'a' wins + assert_eq!(materialized.get("b"), Some(&"2".to_string())); + assert_eq!(materialized.get("c"), Some(&"4".to_string())); + + // Verify get() and materialize() return same values + assert_eq!(item.get("a"), Some("5")); // Matches materialized + assert_eq!(item.get("b"), Some("2")); + assert_eq!(item.get("c"), Some("4")); + + Ok(()) + } + + #[test] + fn test_materialize_null_map() -> DeltaResult<()> { + // Create MapArray with 3 elements: 2 entries in first, 1 entry in second (null), 1 entry in third + let keys_array = Arc::new(StringArray::from(vec![ + Some("a"), + Some("b"), // First element (2 entries) + Some("c"), // Second element (1 entry, but element is null) + Some("d"), // Third element (1 entry) + ])) as Arc; + + let values_array = Arc::new(StringArray::from(vec![ + Some("1"), + Some("2"), // First element values + Some("3"), // Second element value (but element is null) + Some("4"), // Third element value + ])) as Arc; + + let entries_struct = StructArray::try_new( + vec![ + Arc::new(ArrowField::new("keys", ArrowDataType::Utf8, false)), + Arc::new(ArrowField::new("values", ArrowDataType::Utf8, true)), + ] + .into(), + vec![keys_array, values_array], + None, + ) + .unwrap(); + + // Offsets: [0, 2, 3, 4] - first has 2 entries, second has 1, third has 1 + let offsets_buffer = OffsetBuffer::new(vec![0i32, 2, 3, 4].into()); + + // Create null buffer with second element (index 1) null + let null_buffer = Some(crate::arrow::buffer::NullBuffer::from(vec![ + true, false, true, + ])); + + let map_array = MapArray::try_new( + Arc::new(ArrowField::new_struct( + "entries", + vec![ + Arc::new(ArrowField::new("keys", ArrowDataType::Utf8, false)), + Arc::new(ArrowField::new("values", ArrowDataType::Utf8, true)), + ], + false, + )), + offsets_buffer, + entries_struct, + null_buffer, + false, + ) + .unwrap(); + + // First element should have 2 entries + let item0 = map_item_from(&map_array, 0); + let result0 = item0.materialize(); + assert_eq!(result0.len(), 2); + assert_eq!(result0.get("a"), Some(&"1".to_string())); + assert_eq!(result0.get("b"), Some(&"2".to_string())); + + // Second element is null — GetData::get_map returns None for null elements + let map_item_1: Option> = map_array.get_map(1, "test")?; + assert!(map_item_1.is_none()); + + // Third element should have 1 entry + let item2 = map_item_from(&map_array, 2); + let result2 = item2.materialize(); + assert_eq!(result2.len(), 1); + assert_eq!(result2.get("d"), Some(&"4".to_string())); + + Ok(()) + } + + /// visit_rows must accept all Arrow string representations (Utf8/StringArray, + /// LargeUtf8/LargeStringArray, Utf8View/StringViewArray) when the visitor declares + /// DataType::STRING. 
+ #[rstest] + #[case::utf8(Arc::new(StringArray::from(vec![Some("alice"), None, Some("charlie")])) as ArrayRef)] + #[case::large_utf8(Arc::new(LargeStringArray::from(vec![Some("alice"), None, Some("charlie")])) as ArrayRef)] + #[case::utf8_view(Arc::new(StringViewArray::from(vec![Some("alice"), None, Some("charlie")])) as ArrayRef)] + fn test_visit_rows_string_types(#[case] values: ArrayRef) -> DeltaResult<()> { + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ArrowField::new( + "name", + values.data_type().clone(), + true, + )])), + vec![values], + )?; + let arrow_data = ArrowEngineData::new(batch); + + struct Visitor { + values: Vec>, + } + impl RowVisitor for Visitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["name"])]); + static TYPES: &[DataType] = &[DataType::STRING]; + (&NAMES, TYPES) + } + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + for i in 0..row_count { + self.values + .push(getters[0].get_str(i, "name")?.map(|s| s.to_string())); + } + Ok(()) + } + } + + let mut visitor = Visitor { values: vec![] }; + arrow_data.visit_rows(&[ColumnName::new(["name"])], &mut visitor)?; + assert_eq!( + visitor.values, + vec![Some("alice".into()), None, Some("charlie".into())] + ); + Ok(()) + } + + /// visit_rows must accept LargeBinary columns when the visitor declares DataType::BINARY. + #[test] + fn test_visit_rows_large_binary() -> DeltaResult<()> { + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ArrowField::new( + "data", + ArrowDataType::LargeBinary, + true, + )])), + vec![Arc::new(LargeBinaryArray::from(vec![ + Some(b"hello" as &[u8]), + None, + Some(b"\x00\x01"), + ]))], + )?; + let arrow_data = ArrowEngineData::new(batch); + + struct Visitor { + values: Vec>>, + } + impl RowVisitor for Visitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["data"])]); + static TYPES: &[DataType] = &[DataType::BINARY]; + (&NAMES, TYPES) + } + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + for i in 0..row_count { + self.values + .push(getters[0].get_binary(i, "data")?.map(|b| b.to_vec())); + } + Ok(()) + } + } + + let mut visitor = Visitor { values: vec![] }; + arrow_data.visit_rows(&[ColumnName::new(["data"])], &mut visitor)?; + assert_eq!( + visitor.values, + vec![Some(b"hello".to_vec()), None, Some(b"\x00\x01".to_vec())] + ); + Ok(()) + } + + /// visit_rows must accept ListView columns when the visitor declares a DataType::Array. 
+ #[test] + fn test_visit_rows_list_view() -> DeltaResult<()> { + // Build a ListViewArray with string values: [["a", "b"], ["c"]] + let values = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; + let field = Arc::new(ArrowField::new("item", ArrowDataType::Utf8, false)); + let offsets = ScalarBuffer::from(vec![0i32, 2]); + let sizes = ScalarBuffer::from(vec![2i32, 1]); + let list_view = ListViewArray::new(field.clone(), offsets, sizes, values, None); + + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ArrowField::new( + "tags", + list_view.data_type().clone(), + false, + )])), + vec![Arc::new(list_view)], + )?; + let arrow_data = ArrowEngineData::new(batch); + + struct Visitor { + values: Vec>, + } + impl RowVisitor for Visitor { + fn selected_column_names_and_types( + &self, + ) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES: LazyLock> = + LazyLock::new(|| vec![ColumnName::new(["tags"])]); + static TYPES: LazyLock> = + LazyLock::new(|| vec![ArrayType::new(DataType::STRING, false).into()]); + (&NAMES, &TYPES) + } + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + for i in 0..row_count { + let list: ListItem<'_> = getters[0].get(i, "tags")?; + self.values.push(list.materialize()); + } + Ok(()) + } + } + + let mut visitor = Visitor { values: vec![] }; + arrow_data.visit_rows(&[ColumnName::new(["tags"])], &mut visitor)?; + assert_eq!( + visitor.values, + vec![ + vec!["a".to_string(), "b".to_string()], + vec!["c".to_string()] + ] + ); + Ok(()) + } } diff --git a/kernel/src/engine/arrow_expression/apply_schema.rs b/kernel/src/engine/arrow_expression/apply_schema.rs index ac3cbf14c2..cd86085c72 100644 --- a/kernel/src/engine/arrow_expression/apply_schema.rs +++ b/kernel/src/engine/arrow_expression/apply_schema.rs @@ -10,14 +10,21 @@ use crate::arrow::array::{ use crate::arrow::datatypes::Schema as ArrowSchema; use crate::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField}; +use super::super::arrow_conversion::kernel_metadata_to_arrow_metadata; use super::super::arrow_utils::make_arrow_error; -use crate::engine::ensure_data_types::ensure_data_types; +use crate::engine::ensure_data_types::{ensure_data_types, ValidationMode}; use crate::error::{DeltaResult, Error}; +use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::schema::{ArrayType, DataType, MapType, Schema, StructField}; -// Apply a schema to an array. The array _must_ be a `StructArray`. Returns a `RecordBatch where the -// names of fields, nullable, and metadata in the struct have been transformed to match those in -// schema specified by `schema` +// Apply a schema to an array. The array _must_ be a `StructArray`. Returns a `RecordBatch` where +// the names of fields, nullable, and metadata in the struct have been transformed to match those +// in the schema specified by `schema`. +// +// Note: If the struct array has top-level nulls, the child columns are expected to already have +// those nulls propagated. Arrow's JSON reader does this automatically, and parquet data goes +// through `fix_nested_null_masks` which handles it. We decompose the struct and discard its null +// buffer since RecordBatch cannot have top-level nulls. 
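// Tiny model of the invariant relied on above, with row validity expressed as plain bools
// (illustrative sketch only, not part of the kernel API): dropping the struct-level null
// buffer is lossless exactly when every child is already null wherever the parent row is null.
fn children_cover_parent_nulls(parent_valid: &[bool], child_valid: &[bool]) -> bool {
    // For each row: either the parent is valid, or the child is null as well.
    parent_valid.iter().zip(child_valid).all(|(p, c)| *p || !*c)
}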
pub(crate) fn apply_schema(array: &dyn Array, schema: &DataType) -> DeltaResult { let DataType::Struct(struct_schema) = schema else { return Err(Error::generic( @@ -25,14 +32,8 @@ pub(crate) fn apply_schema(array: &dyn Array, schema: &DataType) -> DeltaResult< )); }; let applied = apply_schema_to_struct(array, struct_schema)?; - let (fields, columns, nulls) = applied.into_parts(); - if let Some(nulls) = nulls { - if nulls.null_count() != 0 { - return Err(Error::invalid_struct_data( - "Top-level nulls in struct are not supported", - )); - } - } + let (fields, columns, _nulls) = applied.into_parts(); + Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(fields)), columns, @@ -64,23 +65,37 @@ fn transform_struct( struct_array: &StructArray, target_fields: impl Iterator>, ) -> DeltaResult { - let (_, arrow_cols, nulls) = struct_array.clone().into_parts(); + let (input_fields, arrow_cols, nulls) = struct_array.clone().into_parts(); let input_col_count = arrow_cols.len(); - let result_iter = - arrow_cols - .into_iter() - .zip(target_fields) - .map(|(sa_col, target_field)| -> DeltaResult<_> { - let target_field = target_field.borrow(); - let transformed_col = apply_schema_to(&sa_col, target_field.data_type())?; - let transformed_field = new_field_with_metadata( - &target_field.name, - transformed_col.data_type(), - target_field.nullable, - Some(target_field.metadata_with_string_values()), - ); - Ok((transformed_field, transformed_col)) - }); + let result_iter = arrow_cols + .into_iter() + .zip(input_fields.iter()) + .zip(target_fields) + .map(|((sa_col, input_field), target_field)| -> DeltaResult<_> { + let target_field = target_field.borrow(); + let transformed_col = apply_schema_to(&sa_col, target_field.data_type())?; + let arrow_metadata = kernel_metadata_to_arrow_metadata(target_field)?; + // If both the input field and the target field carry a field ID they must agree, + // otherwise we would silently overwrite one field ID with another. 
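            // For example, an input field tagged PARQUET:field_id = "99" against a target of
            // "42" must error, while "42" on both sides passes through unchanged (see the
            // matching/conflicting field ID tests below).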
+ if let (Some(input_id), Some(target_id)) = ( + input_field.metadata().get(PARQUET_FIELD_ID_META_KEY), + arrow_metadata.get(PARQUET_FIELD_ID_META_KEY), + ) { + if input_id != target_id { + return Err(Error::generic(format!( + "Field '{}': input field ID {} conflicts with target field ID {}", + target_field.name, input_id, target_id + ))); + } + } + let transformed_field = new_field_with_metadata( + &target_field.name, + transformed_col.data_type(), + target_field.nullable, + Some(arrow_metadata), + ); + Ok((transformed_field, transformed_col)) + }); let (transformed_fields, transformed_cols): (Vec, Vec) = result_iter.process_results(|iter| iter.unzip())?; if transformed_cols.len() != input_col_count { @@ -177,7 +192,7 @@ pub(crate) fn apply_schema_to(array: &ArrayRef, schema: &DataType) -> DeltaResul Array(atype) => Arc::new(apply_schema_to_list(array, atype)?), Map(mtype) => Arc::new(apply_schema_to_map(array, mtype)?), _ => { - ensure_data_types(schema, array.data_type(), true)?; + ensure_data_types(schema, array.data_type(), ValidationMode::Full)?; array.clone() } }; @@ -191,10 +206,13 @@ mod apply_schema_validation_tests { use std::sync::Arc; use crate::arrow::array::{Int32Array, StructArray}; + use crate::arrow::buffer::{BooleanBuffer, NullBuffer}; use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, }; - use crate::schema::{DataType, StructField, StructType}; + use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY; + use crate::schema::{ColumnMetadataKey, DataType, MetadataValue, StructField, StructType}; + use crate::utils::test_utils::assert_result_error_with_message; #[test] fn test_apply_schema_basic_functionality() { @@ -238,4 +256,160 @@ mod apply_schema_validation_tests { StructField::new("b", DataType::INTEGER, false), ]) } + + /// Test that apply_schema handles structs with top-level nulls correctly. + /// + /// This simulates a Delta log scenario where each row is one action type (add, remove, etc.). + /// When extracting `add.stats_parsed`, rows where `add` is null (e.g., remove actions) should + /// have null child columns. The child columns are expected to already have nulls propagated + /// (Arrow's JSON reader does this, and parquet data goes through `fix_nested_null_masks`). + #[test] + fn test_apply_schema_handles_top_level_nulls() { + // Create a struct array with 4 rows where rows 1 and 3 have top-level nulls. + // This simulates: [add_action, remove_action, add_action, remove_action] + // where remove_action rows have null for the entire struct. + let field_a = ArrowField::new("a", ArrowDataType::Int32, true); + let field_b = ArrowField::new("b", ArrowDataType::Int32, true); + let schema = ArrowSchema::new(vec![field_a, field_b]); + + // Child columns with nulls already propagated (simulating what Arrow readers do). + // Rows 1 and 3 are null because the parent struct is null at those positions. 
+ let a_data = Int32Array::from(vec![Some(1), None, Some(3), None]); + let b_data = Int32Array::from(vec![Some(10), None, Some(30), None]); + + // Top-level struct nulls: rows 0 and 2 are valid, rows 1 and 3 are null + let null_buffer = NullBuffer::new(BooleanBuffer::from(vec![true, false, true, false])); + + let struct_array = StructArray::try_new( + schema.fields.clone(), + vec![Arc::new(a_data), Arc::new(b_data)], + Some(null_buffer), + ) + .unwrap(); + + // Target schema with nullable fields + let target_schema = DataType::Struct(Box::new(StructType::new_unchecked([ + StructField::new("a", DataType::INTEGER, true), + StructField::new("b", DataType::INTEGER, true), + ]))); + + // Apply schema - should successfully convert to RecordBatch + let result = apply_schema(&struct_array, &target_schema).unwrap(); + + assert_eq!(result.num_rows(), 4); + assert_eq!(result.num_columns(), 2); + + // Verify columns preserve nulls from child arrays + let col_a = result.column(0); + assert!(col_a.is_valid(0), "Row 0 should be valid"); + assert!(col_a.is_null(1), "Row 1 should be null"); + assert!(col_a.is_valid(2), "Row 2 should be valid"); + assert!(col_a.is_null(3), "Row 3 should be null"); + + let col_b = result.column(1); + assert!(col_b.is_valid(0), "Row 0 should be valid"); + assert!(col_b.is_null(1), "Row 1 should be null"); + assert!(col_b.is_valid(2), "Row 2 should be valid"); + assert!(col_b.is_null(3), "Row 3 should be null"); + + // Verify the actual values for valid rows + let col_a = col_a + .as_any() + .downcast_ref::() + .expect("column a should be Int32Array"); + let col_b = col_b + .as_any() + .downcast_ref::() + .expect("column b should be Int32Array"); + + assert_eq!(col_a.value(0), 1); + assert_eq!(col_a.value(2), 3); + assert_eq!(col_b.value(0), 10); + assert_eq!(col_b.value(2), 30); + } + + /// Test that apply_schema translates "parquet.field.id" kernel metadata to the Arrow-specific + /// "PARQUET:field_id" key. This ensures the same key translation applied during schema + /// conversion (`TryFromKernel<&StructField> for ArrowField`) is also applied when + /// `apply_schema` is used to map data onto an existing schema (e.g. in the arrow expression + /// evaluator). + #[test] + fn test_apply_schema_transforms_parquet_field_id_metadata() { + let field_id_key = ColumnMetadataKey::ParquetFieldId.as_ref(); + let target_schema = + StructType::new_unchecked([StructField::new("a", DataType::INTEGER, false) + .with_metadata([(field_id_key.to_string(), MetadataValue::Number(42))])]); + + let arrow_field = ArrowField::new("a", ArrowDataType::Int32, false); + let input_array = StructArray::try_new( + vec![arrow_field].into(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + None, + ) + .unwrap(); + + let result = apply_schema_to_struct(&input_array, &target_schema).unwrap(); + + let (_, output_field) = result.fields().find("a").unwrap(); + // "parquet.field.id" must be translated to the Arrow/Parquet native key + assert_eq!( + output_field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .map(String::as_str), + Some("42"), + "parquet.field.id should be translated to PARQUET:field_id" + ); + // The original key must not be present + assert!( + !output_field.metadata().contains_key(field_id_key), + "original parquet.field.id key should not be present after translation" + ); + } + + /// Test that apply_schema succeeds when the input Arrow field already carries the same field + /// ID as the target kernel schema field (no conflict). 
+ #[test] + fn test_apply_schema_matching_field_ids_succeed() { + let field_id_key = ColumnMetadataKey::ParquetFieldId.as_ref(); + let target_schema = + StructType::new_unchecked([StructField::new("a", DataType::INTEGER, false) + .with_metadata([(field_id_key.to_string(), MetadataValue::Number(42))])]); + + let arrow_field = ArrowField::new("a", ArrowDataType::Int32, false) + .with_metadata([(PARQUET_FIELD_ID_META_KEY.to_string(), "42".to_string())].into()); + let input_array = StructArray::try_new( + vec![arrow_field].into(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + None, + ) + .unwrap(); + + let result = apply_schema_to_struct(&input_array, &target_schema); + assert!(result.is_ok(), "Matching field IDs should succeed"); + } + + /// Test that apply_schema fails when the input Arrow field already carries a *different* field + /// ID than the target kernel schema field. + #[test] + fn test_apply_schema_conflicting_field_ids_fail() { + let field_id_key = ColumnMetadataKey::ParquetFieldId.as_ref(); + let target_schema = + StructType::new_unchecked([StructField::new("a", DataType::INTEGER, false) + .with_metadata([(field_id_key.to_string(), MetadataValue::Number(42))])]); + + let arrow_field = ArrowField::new("a", ArrowDataType::Int32, false) + .with_metadata([(PARQUET_FIELD_ID_META_KEY.to_string(), "99".to_string())].into()); + let input_array = StructArray::try_new( + vec![arrow_field].into(), + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + None, + ) + .unwrap(); + + assert_result_error_with_message( + apply_schema_to_struct(&input_array, &target_schema), + "conflicts with", + ); + } } diff --git a/kernel/src/engine/arrow_expression/evaluate_expression.rs b/kernel/src/engine/arrow_expression/evaluate_expression.rs index bda9ea06f4..2ae6ecefb3 100644 --- a/kernel/src/engine/arrow_expression/evaluate_expression.rs +++ b/kernel/src/engine/arrow_expression/evaluate_expression.rs @@ -1,30 +1,36 @@ //! Expression handling based on arrow-rs compute kernels. 
use std::borrow::Cow; +use std::collections::HashMap; use std::sync::Arc; use itertools::Itertools; +use tracing::warn; use crate::arrow::array::types::*; use crate::arrow::array::{ - make_array, Array, ArrayData, ArrayRef, AsArray, BooleanArray, Datum, MutableArrayData, - NullBufferBuilder, RecordBatch, StringArray, StructArray, + self as arrow_array, make_array, new_null_array, Array, ArrayBuilder, ArrayData, ArrayRef, + AsArray, BooleanArray, Datum, MapArray, MutableArrayData, NullBufferBuilder, RecordBatch, + StringArray, StructArray, }; -use crate::arrow::buffer::OffsetBuffer; +use crate::arrow::buffer::{NullBuffer, OffsetBuffer}; use crate::arrow::compute::kernels::cmp::{distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct}; use crate::arrow::compute::kernels::comparison::in_list_utf8; use crate::arrow::compute::kernels::numeric::{add, div, mul, sub}; -use crate::arrow::compute::{and_kleene, is_not_null, is_null, not, or_kleene}; +use crate::arrow::compute::{and_kleene, cast, is_not_null, is_null, not, or_kleene}; use crate::arrow::datatypes::{ - DataType as ArrowDataType, Field as ArrowField, Fields as ArrowFields, IntervalUnit, TimeUnit, + DataType as ArrowDataType, Field as ArrowField, Fields as ArrowFields, IntervalUnit, + Schema as ArrowSchema, TimeUnit, }; use crate::arrow::error::ArrowError; use crate::arrow::json::writer::{make_encoder, EncoderOptions}; use crate::arrow::json::StructMode; -use crate::engine::arrow_conversion::TryIntoArrow; +use crate::engine::arrow_conversion::{TryFromKernel, TryIntoArrow}; use crate::engine::arrow_expression::opaque::{ ArrowOpaqueExpressionOpAdaptor, ArrowOpaquePredicateOpAdaptor, }; +use crate::engine::arrow_utils::parse_json_impl; use crate::engine::arrow_utils::prim_array_cmp; +use crate::engine::ensure_data_types::{ensure_data_types, ValidationMode}; use crate::error::{DeltaResult, Error}; use crate::expressions::{ BinaryExpression, BinaryExpressionOp, BinaryPredicate, BinaryPredicateOp, Expression, @@ -32,7 +38,7 @@ use crate::expressions::{ Predicate, Scalar, Transform, UnaryExpression, UnaryExpressionOp, UnaryPredicate, UnaryPredicateOp, VariadicExpression, VariadicExpressionOp, }; -use crate::schema::{DataType, StructType}; +use crate::schema::{DataType, PrimitiveType, StructField, StructType}; pub(super) trait ProvidesColumnByName { fn schema_fields(&self) -> &ArrowFields; @@ -101,7 +107,16 @@ fn evaluate_struct_expression( fields: &[ExpressionRef], batch: &RecordBatch, output_schema: &StructType, + nullability_predicate: Option<&ExpressionRef>, ) -> DeltaResult { + if fields.len() != output_schema.num_fields() { + return Err(Error::generic(format!( + "Struct expression field count mismatch: {} fields in expression but {} in schema", + fields.len(), + output_schema.num_fields() + ))); + } + let output_cols: Vec = fields .iter() .zip(output_schema.fields()) @@ -114,11 +129,26 @@ fn evaluate_struct_expression( ArrowField::new( output_field.name(), output_col.data_type().clone(), - output_col.is_nullable(), + output_field.nullable, // Use schema's nullability; Arrow will validate any mismatch ) }) .collect(); - let data = StructArray::try_new(output_fields.into(), output_cols, None)?; + let null_buffer = if let Some(predicate_expr) = nullability_predicate { + let predicate_array = evaluate_expression(predicate_expr, batch, Some(&DataType::BOOLEAN))?; + let bool_array = predicate_array + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::generic("Nullability predicate must evaluate to boolean"))?; + let values = bool_array.values(); 
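            // Combine the predicate's values with its validity: a struct row stays non-null
            // only where the predicate evaluated to a valid `true`; both `false` and SQL NULL
            // results mark the row as null.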
+ let combined = match bool_array.nulls() { + Some(nulls) => values & nulls.inner(), + None => values.clone(), + }; + Some(NullBuffer::new(combined)) + } else { + None + }; + let data = StructArray::try_new(output_fields.into(), output_cols, null_buffer)?; Ok(Arc::new(data)) } @@ -131,7 +161,7 @@ fn evaluate_transform_expression( let mut used_field_transforms = 0; // Collect output columns directly to avoid creating intermediate Expr::Column instances. - let mut output_cols = Vec::new(); + let mut output_cols = Vec::with_capacity(output_schema.num_fields()); // Helper lambda to get the next output field type let mut output_schema_iter = output_schema.fields(); @@ -148,12 +178,19 @@ fn evaluate_transform_expression( } // Extract the input path, if any - let source_data = transform + let source_array = transform .input_path() .map(|path| extract_column(batch, path)) .transpose()?; - let source_data: &dyn ProvidesColumnByName = match source_data { + // For nested transforms, get the source struct's null bitmap to preserve null rows + let source_null_buffer = source_array.as_ref().and_then(|arr| { + arr.as_any() + .downcast_ref::() + .and_then(|s| s.nulls().cloned()) + }); + + let source_data: &dyn ProvidesColumnByName = match source_array { Some(ref array) => array .as_any() .downcast_ref::() @@ -181,10 +218,15 @@ fn evaluate_transform_expression( } } - // Verify that all field transforms were used - if used_field_transforms != transform.field_transforms.len() { + // Verify that all non-optional field transforms were used + let required_count = transform + .field_transforms + .values() + .filter(|ft| !ft.optional) + .count(); + if used_field_transforms < required_count { return Err(Error::generic( - "Some field transforms reference invalid input field names", + "Some non-optional field transforms reference invalid input field names", )); } @@ -193,7 +235,7 @@ fn evaluate_transform_expression( return Err(Error::generic("Too many fields in output schema")); } - // Build the final struct + // Build the final struct, preserving null bitmap for nested transforms let output_fields: Vec = output_cols .iter() .zip(output_schema.fields()) @@ -205,7 +247,7 @@ fn evaluate_transform_expression( ) }) .collect(); - let data = StructArray::try_new(output_fields.into(), output_cols, None)?; + let data = StructArray::try_new(output_fields.into(), output_cols, source_null_buffer)?; Ok(Arc::new(data)) } @@ -220,14 +262,24 @@ pub fn evaluate_expression( use UnaryExpressionOp::*; use VariadicExpressionOp::*; match (expression, result_type) { - (Literal(scalar), _) => Ok(scalar.to_array(batch.num_rows())?), - (Column(name), _) => extract_column(batch, name), - (Struct(fields), Some(DataType::Struct(output_schema))) => { - evaluate_struct_expression(fields, batch, output_schema) + (Literal(scalar), _) => { + validate_array_type(scalar.to_array(batch.num_rows())?, result_type) } - (Struct(_), _) => Err(Error::generic( - "Data type is required to evaluate struct expressions", - )), + (Column(name), _) => { + // Column extraction uses ordinal-based struct validation because column mapping + // can cause physical/logical name mismatches. apply_schema handles renaming. 
+ let arr = extract_column(batch, name)?; + if let Some(expected) = result_type { + ensure_data_types(expected, arr.data_type(), ValidationMode::TypesOnly)?; + } + Ok(arr) + } + (Struct(fields, nullability), Some(DataType::Struct(output_schema))) => { + evaluate_struct_expression(fields, batch, output_schema, nullability.as_ref()) + } + (Struct(..), dt) => Err(Error::Generic(format!( + "Struct expression expects a DataType::Struct result, but got {dt:?}" + ))), (Transform(transform), Some(DataType::Struct(output_schema))) => { evaluate_transform_expression(transform, batch, output_schema) } @@ -262,7 +314,7 @@ pub fn evaluate_expression( Divide => div, }; - Ok(eval(&left_arr, &right_arr)?) + validate_array_type(eval(&left_arr, &right_arr)?, result_type) } ( Variadic(VariadicExpression { @@ -271,10 +323,20 @@ pub fn evaluate_expression( }), result_type, ) => { - let arrays: Vec = exprs - .iter() - .map(|expr| evaluate_expression(expr, batch, None)) - .try_collect()?; + let mut arrays: Vec = Vec::with_capacity(exprs.len()); + + for expr in exprs { + let array = evaluate_expression(expr, batch, result_type)?; + let null_count = array.null_count(); + arrays.push(array); + // Short-circuit: if this array has no nulls, we can stop evaluating + // remaining expressions since no more values are needed. + if null_count == 0 { + break; + } + } + + // Coalesce accumulated arrays Ok(coalesce_arrays(&arrays, result_type)?) } (Opaque(OpaqueExpression { op, exprs }), _) => { @@ -288,10 +350,146 @@ pub fn evaluate_expression( ))), } } + (ParseJson(p), _) => { + // Evaluate the JSON string expression + let json_arr = evaluate_expression(&p.json_expr, batch, Some(&DataType::STRING))?; + let json_strings = + json_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::generic("ParseJson input must evaluate to a STRING column") + })?; + + // Convert kernel schema to Arrow schema and parse + let arrow_schema = Arc::new(ArrowSchema::try_from_kernel(p.output_schema.as_ref())?); + match parse_json_impl(json_strings, arrow_schema.clone()) { + Ok(batch) => Ok(Arc::new(StructArray::from(batch)) as ArrayRef), + Err(e) => { + warn!( + "Failed to parse JSON stats as {}: {e}. Using null stats.", + p.output_schema, + ); + Ok(new_null_array( + &ArrowDataType::Struct(arrow_schema.fields().clone()), + json_strings.len(), + )) + } + } + } + (MapToStruct(m), Some(DataType::Struct(output_schema))) => { + let map_arr = evaluate_expression(&m.map_expr, batch, None)?; + let result = evaluate_map_to_struct(&map_arr, output_schema)?; + Ok(Arc::new(result) as ArrayRef) + } + (MapToStruct(_), dt) => Err(Error::Generic(format!( + "MapToStruct expression requires a DataType::Struct result type, but got {dt:?}" + ))), (Unknown(name), _) => Err(Error::unsupported(format!("Unknown expression: {name:?}"))), } } +/// Direction for casting between Arrow view and non-view string/binary types. +#[derive(Clone, Copy)] +enum ViewCast { + ToView, + ToNonView, +} + +/// Casts list element types between view and non-view string/binary variants. +/// +/// When [`ViewCast::ToView`], non-view string/binary element types are converted to their view +/// equivalents (e.g. `List` -> `List`). View container types (`ListView`, +/// `LargeListView`) are preserved. +/// +/// When [`ViewCast::ToNonView`], view element types are converted to their non-view equivalents +/// (e.g. `List` -> `List`). Additionally, view container types are always +/// converted to their non-view equivalents (e.g. 
`ListView` -> `List`), even +/// when the element type does not change. +/// +/// Nested type conversion is not supported. +fn cast_list_elements( + vals: &Arc, + field: &Arc, + dir: ViewCast, +) -> DeltaResult> { + let to_type = match dir { + ViewCast::ToView => match field.data_type() { + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => ArrowDataType::Utf8View, + ArrowDataType::Binary | ArrowDataType::LargeBinary => ArrowDataType::BinaryView, + _ => return Ok(vals.clone()), + }, + ViewCast::ToNonView => match field.data_type() { + ArrowDataType::Utf8View => ArrowDataType::Utf8, + ArrowDataType::BinaryView => ArrowDataType::Binary, + other => { + if !matches!( + vals.data_type(), + ArrowDataType::ListView(_) | ArrowDataType::LargeListView(_) + ) { + return Ok(vals.clone()); + } + // Container is a view type but element is not -- preserve element type, + // cast only the container (ListView -> List, LargeListView -> LargeList). + other.clone() + } + }, + }; + let new_field = Arc::new(field.as_ref().clone().with_data_type(to_type)); + let container = match (vals.data_type(), dir) { + (ArrowDataType::List(_), _) => ArrowDataType::List(new_field), + (ArrowDataType::LargeList(_), _) => ArrowDataType::LargeList(new_field), + (ArrowDataType::ListView(_), ViewCast::ToView) => ArrowDataType::ListView(new_field), + (ArrowDataType::ListView(_), ViewCast::ToNonView) => ArrowDataType::List(new_field), + (ArrowDataType::LargeListView(_), ViewCast::ToView) => { + ArrowDataType::LargeListView(new_field) + } + (ArrowDataType::LargeListView(_), ViewCast::ToNonView) => { + ArrowDataType::LargeList(new_field) + } + (dt, _) => { + return Err(Error::generic(format!( + "cast_list_elements: expected a list type, got {dt:?}" + ))) + } + }; + Ok(cast(vals, &container)?) +} + +/// This function converts ArrowView types to their non-view type equivalents. This is used for [`evaluate_predicate`] conversion, +/// currently does not support nested conversion. This only supports limited conversions (see code for exactly which). +fn arrow_convert_to_non_view_type(vals: Arc) -> DeltaResult> { + match vals.data_type() { + ArrowDataType::List(field) => cast_list_elements(&vals, field, ViewCast::ToNonView), + ArrowDataType::LargeList(field) => cast_list_elements(&vals, field, ViewCast::ToNonView), + ArrowDataType::ListView(field) => cast_list_elements(&vals, field, ViewCast::ToNonView), + ArrowDataType::LargeListView(field) => { + cast_list_elements(&vals, field, ViewCast::ToNonView) + } + ArrowDataType::Utf8View => Ok(cast(&vals, &ArrowDataType::Utf8)?), + ArrowDataType::BinaryView => Ok(cast(&vals, &ArrowDataType::Binary)?), + _ => Ok(vals), + } +} + +/// This function converts Arrow types to their Arrow view type equivalents. This is used for [`evaluate_predicate`] conversion, +/// currently does not support nested conversion. This only supports limited conversions (see code for exactly which). +fn arrow_convert_to_view_type(vals: Arc) -> DeltaResult> { + match vals.data_type() { + ArrowDataType::List(field) => cast_list_elements(&vals, field, ViewCast::ToView), + ArrowDataType::LargeList(field) => cast_list_elements(&vals, field, ViewCast::ToView), + ArrowDataType::ListView(field) => cast_list_elements(&vals, field, ViewCast::ToView), + ArrowDataType::LargeListView(field) => cast_list_elements(&vals, field, ViewCast::ToView), + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 => { + Ok(cast(&vals, &ArrowDataType::Utf8View)?) 
+ } + ArrowDataType::Binary | ArrowDataType::LargeBinary => { + Ok(cast(&vals, &ArrowDataType::BinaryView)?) + } + _ => Ok(vals), + } +} + /// Evaluates a (possibly inverted) kernel predicate over a record batch pub fn evaluate_predicate( predicate: &Predicate, @@ -336,11 +534,16 @@ pub fn evaluate_predicate( let eval_in = || match (left, right) { (Expression::Literal(_), Expression::Column(_)) => { let left = evaluate_expression(left, batch, None)?; + let left = arrow_convert_to_non_view_type(left)?; + let right = evaluate_expression(right, batch, None)?; + let right = arrow_convert_to_non_view_type(right)?; if let Some(string_arr) = left.as_string_opt::() { if let Some(list_arr) = right.as_list_opt::() { - let result = in_list_utf8(string_arr, list_arr)?; - return Ok(result); + if list_arr.value_type() == ArrowDataType::Utf8 { + let result = in_list_utf8(string_arr, list_arr)?; + return Ok(result); + } } } @@ -380,7 +583,6 @@ pub fn evaluate_predicate( } } (Expression::Literal(lit), Expression::Literal(Scalar::Array(ad))) => { - #[allow(deprecated)] let exists = ad.array_elements().contains(lit); Ok(BooleanArray::from(vec![exists])) } @@ -403,6 +605,18 @@ pub fn evaluate_predicate( let left = evaluate_expression(left, batch, None)?; let right = evaluate_expression(right, batch, None)?; + + // If the types differ (e.g. one side is a view type and the other is not), + // normalize both to view types since benchamrking results show that casting from non-view to view type is faster + // than casting from view type to non-view type. + let (left, right) = if left.data_type() == right.data_type() { + (left, right) + } else { + ( + arrow_convert_to_view_type(left)?, + arrow_convert_to_view_type(right)?, + ) + }; Ok(eval_fn(&left, &right)?) } Junction(JunctionPredicate { op, preds }) => { @@ -580,16 +794,138 @@ pub fn coalesce_arrays( Ok(make_array(mutable.freeze())) } +/// Evaluates `MAP_TO_STRUCT(map_col, output_schema)`: extracts keys from a `Map` +/// and parses each value into its target type using Delta's partition value serialization rules, +/// producing a `StructArray`. +/// +/// - Missing keys produce null values +/// - Parse errors are propagated (indicating a broken table) +/// - Duplicate map keys are resolved by taking the rightmost entry +fn evaluate_map_to_struct( + map_arr: &ArrayRef, + output_schema: &StructType, +) -> DeltaResult { + let map_array = map_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::generic("MapToStruct requires a MapArray as input"))?; + + let map_keys = map_array + .keys() + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::generic("MapToStruct requires maps with string keys"))?; + let map_values = map_array + .values() + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::generic("MapToStruct requires maps with string values"))?; + + let num_rows = map_array.len(); + let fields: Vec<&StructField> = output_schema.fields().collect(); + + // Pre-build a builder and resolve the PrimitiveType for each output field. 
+ let mut builders: Vec> = Vec::with_capacity(fields.len()); + let mut target_types: Vec<&PrimitiveType> = Vec::with_capacity(fields.len()); + for field in &fields { + let prim = match field.data_type() { + DataType::Primitive(p) => p, + other => { + return Err(Error::generic(format!( + "MapToStruct only supports primitive target types, got {other:?}" + ))); + } + }; + target_types.push(prim); + let arrow_type = ArrowDataType::try_from_kernel(field.data_type())?; + builders.push(arrow_array::make_builder(&arrow_type, num_rows)); + } + + // Reverse lookup from field name to field index. Each map key is compared against this once + // per row, avoiding repeated string comparisons and storing only entries we care about. + let field_indices: HashMap<&str, usize> = HashMap::from_iter( + fields + .iter() + .enumerate() + .map(|(i, f)| (f.name().as_str(), i)), + ); + + // Per-field index into the flat `map_keys`/`map_values` arrays, tracking the most recently + // matched map entry for each output field. For a given row with entry range + // `[entry_start, entry_end)`, checking `matched_entry_idx[i] >= entry_start` tells us whether + // field `i` was found in that row's map. Because Arrow enforces monotonically increasing + // offsets, stale matches from earlier rows are naturally below the current row's `entry_start`, + // so we never need to clear or reinitialize this vector between rows. + let mut matched_entry_idx: Vec = vec![-1; fields.len()]; + + let offsets = map_array.value_offsets(); + let mut entry_end = offsets[0]; + + for row in 0..num_rows { + let entry_start = entry_end; + entry_end = offsets[row + 1]; + + // Scan this row's map entries (skipped entirely for null rows since offsets still + // increase monotonically — the empty range means no matches are recorded). + if map_array.is_valid(row) { + for entry_idx in entry_start..entry_end { + let key = map_keys.value(entry_idx as usize); + if let Some(&i) = field_indices.get(key) { + matched_entry_idx[i] = entry_idx; + } + } + } + + for (i, field) in fields.iter().enumerate() { + let entry_idx = matched_entry_idx[i]; + let builder = builders[i].as_mut(); + + // Only process values belonging to the current row (entry_idx >= entry_start) + // and where the value is non-null. + if entry_idx >= entry_start && map_values.is_valid(entry_idx as usize) { + let raw = map_values.value(entry_idx as usize); + let scalar = target_types[i].parse_scalar(raw)?; + scalar.append_to(builder, 1)?; + } else { + Scalar::append_null(builder, field.data_type(), 1)?; + } + } + } + + let output_columns: Vec = builders.iter_mut().map(|b| b.finish()).collect(); + let arrow_fields: Vec = fields + .iter() + .map(|f| ArrowField::try_from_kernel(*f)) + .try_collect()?; + + Ok(StructArray::try_new( + arrow_fields.into(), + output_columns, + None, + )?) 
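// Worked example of the matched_entry_idx bookkeeping above (hypothetical data, for
// orientation only): offsets = [0, 3, 5], output fields = ["date", "id"].
//   row 0 covers entries [0, 3) = {"date", "region", "id"} -> matched_entry_idx = [0, 2];
//     entry_start = 0, so 0 >= 0 and 2 >= 0 mean both fields were found in this row.
//   row 1 covers entries [3, 5) = {"id", "other"}          -> matched_entry_idx = [0, 3];
//     entry_start = 3, so 0 < 3 marks "date" as missing (a stale match from row 0),
//     while 3 >= 3 keeps "id" -- no per-row reset of the vector is ever needed.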
+} + +fn validate_array_type(array: ArrayRef, expected: Option<&DataType>) -> DeltaResult { + if let Some(expected) = expected { + ensure_data_types(expected, array.data_type(), ValidationMode::TypesAndNames)?; + } + Ok(array) +} + #[cfg(test)] mod tests { use super::*; - use crate::arrow::array::{ArrayRef, Int32Array, Int64Array, StringArray, StructArray}; + use crate::arrow::array::{ + ArrayRef, BooleanArray, Int32Array, Int64Array, StringArray, StructArray, + }; use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, }; - use crate::expressions::{column_expr_ref, Expression as Expr, Transform}; + use crate::expressions::column_expr; + use crate::expressions::{column_expr_ref, BinaryExpressionOp, Expression as Expr, Transform}; use crate::schema::{DataType, StructField, StructType}; use crate::utils::test_utils::assert_result_error_with_message; + use rstest::rstest; use std::sync::Arc; fn create_test_batch() -> RecordBatch { @@ -932,6 +1268,111 @@ mod tests { .contains("Data type is required")); } + #[test] + fn test_drop_field_if_exists_present() { + let batch = create_test_batch(); + let transform = Transform::new_top_level().with_dropped_field_if_exists("a"); + let output_schema = StructType::new_unchecked(vec![ + StructField::not_null("b", DataType::INTEGER), + StructField::not_null("c", DataType::INTEGER), + ]); + let expr = Expr::Transform(transform); + let result = evaluate_expression( + &expr, + &batch, + Some(&DataType::Struct(Box::new(output_schema))), + ) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + validate_i32_column(result, 0, &[10, 20, 30]); + validate_i32_column(result, 1, &[100, 200, 300]); + } + + #[test] + fn test_drop_field_if_exists_missing() { + let batch = create_test_batch(); + let transform = Transform::new_top_level().with_dropped_field_if_exists("nonexistent"); + let output_schema = StructType::new_unchecked(vec![ + StructField::not_null("a", DataType::INTEGER), + StructField::not_null("b", DataType::INTEGER), + StructField::not_null("c", DataType::INTEGER), + ]); + let expr = Expr::Transform(transform); + let result = evaluate_expression( + &expr, + &batch, + Some(&DataType::Struct(Box::new(output_schema))), + ) + .unwrap(); + let result = result.as_any().downcast_ref::().unwrap(); + validate_i32_column(result, 0, &[1, 2, 3]); + validate_i32_column(result, 1, &[10, 20, 30]); + validate_i32_column(result, 2, &[100, 200, 300]); + } + + #[test] + fn test_drop_field_non_optional_missing_still_errors() { + let batch = create_test_batch(); + let transform = Transform::new_top_level().with_dropped_field("nonexistent"); + let output_schema = StructType::new_unchecked(vec![ + StructField::not_null("a", DataType::INTEGER), + StructField::not_null("b", DataType::INTEGER), + StructField::not_null("c", DataType::INTEGER), + ]); + let expr = Expr::Transform(transform); + let result = evaluate_expression( + &expr, + &batch, + Some(&DataType::Struct(Box::new(output_schema))), + ); + assert!(result + .unwrap_err() + .to_string() + .contains("reference invalid input field names")); + } + + #[test] + fn test_struct_expression_schema_validation() { + let batch = create_test_batch(); + + let test_cases = vec![ + ( + "too many schema fields", + Expr::struct_from([column_expr_ref!("a"), column_expr_ref!("b")]), + StructType::new_unchecked(vec![ + StructField::not_null("a", DataType::INTEGER), + StructField::not_null("b", DataType::INTEGER), + StructField::not_null("c", DataType::INTEGER), + ]), + ), + ( + 
"too few schema fields", + Expr::struct_from([ + column_expr_ref!("a"), + column_expr_ref!("b"), + column_expr_ref!("c"), + ]), + StructType::new_unchecked(vec![ + StructField::not_null("a", DataType::INTEGER), + StructField::not_null("b", DataType::INTEGER), + ]), + ), + ]; + + for (name, expr, schema) in test_cases { + let result = + evaluate_expression(&expr, &batch, Some(&DataType::Struct(Box::new(schema)))); + assert!(result.is_err(), "Test case '{name}' should fail"); + assert!( + result + .unwrap_err() + .to_string() + .contains("field count mismatch"), + "Test case '{name}' should contain 'field count mismatch' error" + ); + } + } + #[test] fn test_coalesce_arrays_same_type() { // Test with Int32 arrays @@ -1045,6 +1486,106 @@ mod tests { ); } + #[test] + fn test_coalesce_arrays_first_no_nulls() { + // First array has no nulls - coalesce_arrays still works correctly + let arr1 = Arc::new(Int32Array::from(vec![1, 2, 3])); // No nulls + let arr2 = Arc::new(Int32Array::from(vec![10, 20, 30])); + + let result = coalesce_arrays(&[arr1.clone(), arr2], None).unwrap(); + let result_array = result.as_any().downcast_ref::().unwrap(); + + // Result should be arr1's values (first non-null for each row) + assert_eq!(result_array.len(), 3); + assert_eq!(result_array.value(0), 1); + assert_eq!(result_array.value(1), 2); + assert_eq!(result_array.value(2), 3); + } + + #[test] + fn test_coalesce_arrays_second_no_nulls() { + // First array has nulls, second has none + let arr1 = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])); + let arr2 = Arc::new(Int32Array::from(vec![10, 20, 30])); // No nulls + + let result = coalesce_arrays(&[arr1, arr2], None).unwrap(); + let result_array = result.as_any().downcast_ref::().unwrap(); + + // Row 0: 1 (from arr1), Row 1: 20 (from arr2), Row 2: 3 (from arr1) + assert_eq!(result_array.len(), 3); + assert_eq!(result_array.value(0), 1); + assert_eq!(result_array.value(1), 20); + assert_eq!(result_array.value(2), 3); + } + + #[test] + fn test_coalesce_expression_short_circuit_first() { + // Test the short-circuit optimization when first array has no nulls + let schema = ArrowSchema::new(vec![ArrowField::new("a", ArrowDataType::Int32, false)]); + let a_values = Int32Array::from(vec![1, 2, 3]); // No nulls + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a_values)]).unwrap(); + + // Create coalesce expression with column that has no nulls, followed by + // a reference to a non-existent column. If short-circuit works, the + // non-existent column is never evaluated and no error occurs. 
+ let expr = Expression::coalesce([ + Expression::column(["a"]), + Expression::column(["nonexistent"]), // Would fail if evaluated + ]); + + // Should return column "a" directly (short-circuit skips evaluating "nonexistent") + let result = evaluate_expression(&expr, &batch, Some(&DataType::INTEGER)).unwrap(); + let result_array = result.as_any().downcast_ref::().unwrap(); + assert_eq!(result_array.values(), &[1, 2, 3]); + } + + #[test] + fn test_coalesce_expression_short_circuit_second() { + // Test short-circuit when second array has no nulls (still needs coalesce) + let schema = ArrowSchema::new(vec![ + ArrowField::new("a", ArrowDataType::Int32, true), + ArrowField::new("b", ArrowDataType::Int32, false), + ]); + let a_values = Int32Array::from(vec![Some(1), None, Some(3)]); // Has nulls + let b_values = Int32Array::from(vec![10, 20, 30]); // No nulls + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(a_values), Arc::new(b_values)], + ) + .unwrap(); + + // Create coalesce expression: a has nulls, b has none, c doesn't exist. + // Short-circuit should stop after evaluating b. + let expr = Expression::coalesce([ + Expression::column(["a"]), + Expression::column(["b"]), + Expression::column(["nonexistent"]), // Would fail if evaluated + ]); + + // Should coalesce a and b, never evaluate "nonexistent" + let result = evaluate_expression(&expr, &batch, Some(&DataType::INTEGER)).unwrap(); + let result_array = result.as_any().downcast_ref::().unwrap(); + // Row 0: 1 (from a), Row 1: 20 (from b), Row 2: 3 (from a) + assert_eq!(result_array.len(), 3); + assert_eq!(result_array.value(0), 1); + assert_eq!(result_array.value(1), 20); + assert_eq!(result_array.value(2), 3); + } + + #[test] + fn test_coalesce_expression_short_circuit_type_mismatch() { + // Verify type validation works when short-circuiting + let schema = ArrowSchema::new(vec![ArrowField::new("a", ArrowDataType::Int32, false)]); + let a_values = Int32Array::from(vec![1, 2, 3]); // No nulls - would short-circuit + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a_values)]).unwrap(); + + let expr = Expression::coalesce([Expression::column(["a"])]); + + // Request STRING type but array is INT32 - should fail even with short-circuit + let result = evaluate_expression(&expr, &batch, Some(&DataType::STRING)); + assert!(result.is_err()); + } + #[test] fn test_nested_transforms() { let nested_batch = create_nested_test_batch(); @@ -1099,4 +1640,851 @@ mod tests { validate_i32_column(nested_struct_result, 0, &[1, 2, 3]); validate_i32_column(nested_struct_result, 1, &[10, 20, 30]); } + + #[test] + fn test_literal_type_validation() { + let batch = create_test_batch(); + + // Valid: literal matches expected type + let result = evaluate_expression(&Expr::literal(42), &batch, Some(&DataType::INTEGER)); + assert!(result.is_ok()); + + // Error: literal type mismatch + let result = evaluate_expression(&Expr::literal(42), &batch, Some(&DataType::STRING)); + assert_result_error_with_message(result, "Incorrect datatype"); + } + + #[test] + fn test_column_type_validation() { + let batch = create_test_batch(); + + // Valid: column matches expected type + let result = evaluate_expression(&column_expr_ref!("a"), &batch, Some(&DataType::INTEGER)); + assert!(result.is_ok()); + + // Error: column type mismatch + let result = evaluate_expression(&column_expr_ref!("a"), &batch, Some(&DataType::STRING)); + assert_result_error_with_message(result, "Incorrect datatype"); + } + + #[test] + fn test_binary_type_validation() { + let 
batch = create_test_batch(); + let add_expr = Expr::binary( + BinaryExpressionOp::Plus, + Expr::column(["a"]), + Expr::column(["b"]), + ); + + // Valid: binary result matches expected type + let result = evaluate_expression(&add_expr, &batch, Some(&DataType::INTEGER)); + assert!(result.is_ok()); + + // Error: binary result type mismatch + let result = evaluate_expression(&add_expr, &batch, Some(&DataType::STRING)); + assert_result_error_with_message(result, "Incorrect datatype"); + } + + fn create_json_batch() -> RecordBatch { + let schema = ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + let json_strings = StringArray::from(vec![ + Some(r#"{"a": 1, "b": "hello"}"#), + Some(r#"{"a": 2, "b": "world"}"#), + Some(r#"{"a": 3, "b": "test"}"#), + ]); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_strings)]).unwrap() + } + + #[test] + fn test_parse_json_basic() { + let batch = create_json_batch(); + + // Define the output schema for parsing + let output_schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("a", DataType::LONG, true), + StructField::new("b", DataType::STRING, true), + ])); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.num_columns(), 2); + assert_eq!(struct_result.len(), 3); + + // Verify 'a' column (Long values) + let a_col = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(a_col.values(), &[1, 2, 3]); + + // Verify 'b' column (String values) + let b_col = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(b_col.value(0), "hello"); + assert_eq!(b_col.value(1), "world"); + assert_eq!(b_col.value(2), "test"); + } + + #[test] + fn test_parse_json_nested_struct() { + let schema = ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + let json_strings = StringArray::from(vec![ + Some(r#"{"outer": 10, "inner": {"x": 1, "y": 2}}"#), + Some(r#"{"outer": 20, "inner": {"x": 3, "y": 4}}"#), + ]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_strings)]).unwrap(); + + // Define nested output schema + let inner_schema = StructType::new_unchecked(vec![ + StructField::new("x", DataType::LONG, true), + StructField::new("y", DataType::LONG, true), + ]); + let output_schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("outer", DataType::LONG, true), + StructField::new("inner", DataType::Struct(Box::new(inner_schema)), true), + ])); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.num_columns(), 2); + assert_eq!(struct_result.len(), 2); + + // Verify 'outer' column + let outer_col = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(outer_col.values(), &[10, 20]); + + // Verify nested 'inner' struct + let inner_struct = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let x_col = inner_struct + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let y_col = inner_struct + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(x_col.values(), &[1, 3]); + assert_eq!(y_col.values(), &[2, 4]); + } + + #[test] + fn test_parse_json_with_nulls() { + let schema = 
ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + // NULL JSON strings are treated as empty objects {} + let json_strings = StringArray::from(vec![Some(r#"{"a": 1}"#), None, Some(r#"{"a": 3}"#)]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_strings)]).unwrap(); + + let output_schema = Arc::new(StructType::new_unchecked(vec![StructField::new( + "a", + DataType::LONG, + true, + )])); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 3); + + let a_col = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + // Row 0 has value 1, row 1 is null (from empty {}), row 2 has value 3 + assert!(!a_col.is_null(0)); + assert_eq!(a_col.value(0), 1); + assert!(a_col.is_null(1)); // NULL JSON string -> empty object -> null field + assert!(!a_col.is_null(2)); + assert_eq!(a_col.value(2), 3); + } + + #[test] + fn test_parse_json_empty_batch() { + let schema = ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + let json_strings: StringArray = StringArray::from(Vec::>::new()); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_strings)]).unwrap(); + + let output_schema = Arc::new(StructType::new_unchecked(vec![StructField::new( + "a", + DataType::LONG, + true, + )])); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 0); + } + + #[test] + fn test_parse_json_missing_field() { + // JSON objects are missing field "b" that the schema expects + let schema = ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + let json_strings = StringArray::from(vec![ + Some(r#"{"a": 1}"#), // missing "b" + Some(r#"{"a": 2, "b": "hi"}"#), // has both + Some(r#"{"a": 3}"#), // missing "b" + ]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_strings)]).unwrap(); + + let output_schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("a", DataType::LONG, true), + StructField::new("b", DataType::STRING, true), + ])); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 3); + + // 'a' column should have all values + let a_col = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(a_col.values(), &[1, 2, 3]); + + // 'b' column should have NULLs where missing + let b_col = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(b_col.is_null(0)); // missing in JSON + assert_eq!(b_col.value(1), "hi"); + assert!(b_col.is_null(2)); // missing in JSON + } + + #[test] + fn test_parse_json_extra_field_ignored() { + // JSON has extra field "c" not in schema - should be ignored + let schema = ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + let json_strings = StringArray::from(vec![ + Some(r#"{"a": 1, "b": "x", "c": "extra"}"#), + Some(r#"{"a": 2, "b": "y", "ignored": 999}"#), + ]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_strings)]).unwrap(); + + // Schema only asks for "a" 
and "b" + let output_schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("a", DataType::LONG, true), + StructField::new("b", DataType::STRING, true), + ])); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.num_columns(), 2); // Only 2 columns, not 3 + assert_eq!(struct_result.len(), 2); + + let a_col = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(a_col.values(), &[1, 2]); + + let b_col = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(b_col.value(0), "x"); + assert_eq!(b_col.value(1), "y"); + } + + #[test] + fn test_parse_json_errors_return_nulls() { + // ParseJson is used for stats parsing. Corrupt or unparseable values should produce + // null output rather than failing the query -- files with null stats simply skip data + // skipping and are always included in scan results. + + fn assert_parse_json_result_all_nulls( + json_strings: Vec>, + output_schema: Arc, + ) { + let schema = + ArrowSchema::new(vec![ArrowField::new("json_col", ArrowDataType::Utf8, true)]); + let len = json_strings.len(); + let json_arr = StringArray::from(json_strings); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(json_arr)]).unwrap(); + + let expr = Expr::parse_json(column_expr!("json_col"), output_schema); + let result = evaluate_expression(&expr, &batch, None).unwrap(); + + assert_eq!(result.len(), len); + assert_eq!(result.null_count(), len); + } + + // Type mismatch: string value where integer is expected + assert_parse_json_result_all_nulls( + vec![Some(r#"{"a": "not_a_number"}"#)], + Arc::new(StructType::new_unchecked(vec![StructField::new( + "a", + DataType::LONG, + true, + )])), + ); + + // Value overflow: 99999 doesn't fit in decimal(4,2) (max 99.99) + assert_parse_json_result_all_nulls( + vec![Some(r#"{"a": 99999}"#)], + Arc::new(StructType::new_unchecked(vec![StructField::new( + "a", + DataType::decimal(4, 2).unwrap(), + true, + )])), + ); + } + + // ==================== MapToStruct Tests ==================== + + /// Helper: creates a RecordBatch with a `pv` column of type Map. 
+ fn create_partition_map_batch() -> RecordBatch { + use crate::arrow::array::{MapBuilder, StringBuilder}; + + let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + + // Row 0: {"date": "2024-01-15", "region": "us", "id": "42"} + builder.keys().append_value("date"); + builder.values().append_value("2024-01-15"); + builder.keys().append_value("region"); + builder.values().append_value("us"); + builder.keys().append_value("id"); + builder.values().append_value("42"); + builder.append(true).unwrap(); + + // Row 1: {"date": "", "region": "eu", "id": "-7"} + builder.keys().append_value("date"); + builder.values().append_value(""); + builder.keys().append_value("region"); + builder.values().append_value("eu"); + builder.keys().append_value("id"); + builder.values().append_value("-7"); + builder.append(true).unwrap(); + + // Row 2: null map + builder.append(false).unwrap(); + + let map_array = builder.finish(); + let schema = ArrowSchema::new(vec![ArrowField::new( + "pv", + map_array.data_type().clone(), + true, + )]); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]).unwrap() + } + + #[test] + fn test_map_to_struct_basic() { + use crate::arrow::array::Date32Array; + + let batch = create_partition_map_batch(); + let output_schema = StructType::new_unchecked(vec![ + StructField::nullable("region", DataType::STRING), + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("date", DataType::DATE), + ]); + let result_type = DataType::Struct(Box::new(output_schema)); + let expr = Expr::map_to_struct(column_expr!("pv")); + let result = evaluate_expression(&expr, &batch, Some(&result_type)).unwrap(); + let structs = result.as_any().downcast_ref::().unwrap(); + + let regions = structs + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let ids = structs + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let dates = structs + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + // Row 0: all values present and parseable + assert_eq!(regions.value(0), "us"); + assert_eq!(ids.value(0), 42); + assert_eq!(dates.value(0), 19737); // 2024-01-15 + + // Row 1: date is empty string → null, region and id are valid + assert_eq!(regions.value(1), "eu"); + assert_eq!(ids.value(1), -7); + assert!(dates.is_null(1)); + + // Row 2: null map → all null + assert!(regions.is_null(2)); + assert!(ids.is_null(2)); + assert!(dates.is_null(2)); + } + + #[test] + fn test_map_to_struct_missing_key() { + let batch = create_partition_map_batch(); + let output_schema = + StructType::new_unchecked(vec![StructField::nullable("nonexistent", DataType::STRING)]); + let result_type = DataType::Struct(Box::new(output_schema)); + let expr = Expr::map_to_struct(column_expr!("pv")); + let result = evaluate_expression(&expr, &batch, Some(&result_type)).unwrap(); + let structs = result.as_any().downcast_ref::().unwrap(); + let col = structs + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(col.is_null(0)); + assert!(col.is_null(1)); + assert!(col.is_null(2)); + } + + #[test] + fn test_map_to_struct_parse_error() { + use crate::arrow::array::{MapBuilder, StringBuilder}; + + let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + builder.keys().append_value("count"); + builder.values().append_value("not_a_number"); + builder.append(true).unwrap(); + + let map_array = builder.finish(); + let schema = ArrowSchema::new(vec![ArrowField::new( + "pv", + map_array.data_type().clone(), + true, + )]); + let batch = 
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]).unwrap(); + + let output_schema = + StructType::new_unchecked(vec![StructField::nullable("count", DataType::INTEGER)]); + let result_type = DataType::Struct(Box::new(output_schema)); + let expr = Expr::map_to_struct(column_expr!("pv")); + let result = evaluate_expression(&expr, &batch, Some(&result_type)); + assert!(result.is_err()); + } + + #[test] + fn test_map_to_struct_duplicate_keys() { + use crate::arrow::array::{MapBuilder, StringBuilder}; + + let mut builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + builder.keys().append_value("x"); + builder.values().append_value("first"); + builder.keys().append_value("x"); + builder.values().append_value("last"); + builder.append(true).unwrap(); + + let map_array = builder.finish(); + let schema = ArrowSchema::new(vec![ArrowField::new( + "pv", + map_array.data_type().clone(), + true, + )]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]).unwrap(); + + let output_schema = + StructType::new_unchecked(vec![StructField::nullable("x", DataType::STRING)]); + let result_type = DataType::Struct(Box::new(output_schema)); + let expr = Expr::map_to_struct(column_expr!("pv")); + let result = evaluate_expression(&expr, &batch, Some(&result_type)).unwrap(); + let structs = result.as_any().downcast_ref::().unwrap(); + let col = structs + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + // Rightmost entry wins + assert_eq!(col.value(0), "last"); + } + + #[test] + fn test_map_to_struct_non_map_input() { + let schema = ArrowSchema::new(vec![ArrowField::new("s", ArrowDataType::Utf8, true)]); + let strings = StringArray::from(vec![Some("hello")]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(strings)]).unwrap(); + + let output_schema = + StructType::new_unchecked(vec![StructField::nullable("x", DataType::STRING)]); + let result_type = DataType::Struct(Box::new(output_schema)); + let expr = Expr::map_to_struct(column_expr!("s")); + let result = evaluate_expression(&expr, &batch, Some(&result_type)); + assert!(result.is_err()); + } + + /// Helper to build a batch with Int32 column `a` and a Boolean column `is_valid`. + fn create_batch_with_bool_col( + a_vals: Vec>, + is_valid_vals: Vec>, + ) -> RecordBatch { + let schema = ArrowSchema::new(vec![ + ArrowField::new("a", ArrowDataType::Int32, true), + ArrowField::new("is_valid", ArrowDataType::Boolean, true), + ]); + let a_array: ArrayRef = Arc::new(Int32Array::from(a_vals)); + let is_valid_array: ArrayRef = Arc::new(BooleanArray::from(is_valid_vals)); + RecordBatch::try_new(Arc::new(schema), vec![a_array, is_valid_array]).unwrap() + } + + #[rstest] + // Fast path: no nulls in predicate array — values bitmap used directly. + #[case::fast_path( + vec![Some(1), Some(2), Some(3)], + vec![Some(true), Some(false), Some(true)], + vec![true, false, true], + )] + // Slow path: predicate has nulls — Kleene AND; both false and null → null struct. 
+ #[case::slow_path( + vec![Some(1), Some(2), Some(3), Some(4)], + vec![Some(true), Some(false), None, Some(true)], + vec![true, false, false, true], + )] + fn test_struct_with_nullability_predicate( + #[case] a_vals: Vec>, + #[case] pred_vals: Vec>, + #[case] expected_valid: Vec, + ) { + let batch = create_batch_with_bool_col(a_vals, pred_vals); + let schema = DataType::Struct(Box::new(StructType::new_unchecked(vec![StructField::new( + "a", + DataType::INTEGER, + true, + )]))); + let expr = Expr::struct_with_nullability_from( + [column_expr_ref!("a")], + column_expr_ref!("is_valid"), + ); + let result = evaluate_expression(&expr, &batch, Some(&schema)).unwrap(); + let struct_result = result.as_any().downcast_ref::().unwrap(); + for (i, valid) in expected_valid.iter().enumerate() { + assert_eq!(struct_result.is_valid(i), *valid, "row {i}"); + } + } + + #[test] + fn test_struct_with_nullability_predicate_nested_schema() { + // Nested struct as schema: outer struct has one field that is itself a struct. + let batch = create_batch_with_bool_col( + vec![Some(1), Some(2), Some(3)], + vec![Some(true), Some(false), Some(true)], + ); + let inner_schema = + StructType::new_unchecked(vec![StructField::new("a", DataType::INTEGER, true)]); + let schema = DataType::Struct(Box::new(StructType::new_unchecked(vec![StructField::new( + "nested", + DataType::Struct(Box::new(inner_schema)), + true, + )]))); + let inner_expr = Expr::struct_from([column_expr_ref!("a")]); + let expr = Expr::struct_with_nullability_from([inner_expr], column_expr_ref!("is_valid")); + let result = evaluate_expression(&expr, &batch, Some(&schema)).unwrap(); + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert!(struct_result.is_valid(0)); + assert!(struct_result.is_null(1)); + assert!(struct_result.is_valid(2)); + // The "nested" column should itself be a StructArray with 3 rows + let nested = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(nested.len(), 3); + } + + #[test] + fn test_struct_with_nullability_predicate_multiple_fields() { + // Multiple expressions: [column_expr_ref!("a"), column_expr_ref!("b")] with predicate. 
+ let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("a", ArrowDataType::Int32, true), + ArrowField::new("b", ArrowDataType::Int32, true), + ArrowField::new("is_valid", ArrowDataType::Boolean, true), + ]); + let batch = RecordBatch::try_new( + Arc::new(arrow_schema), + vec![ + Arc::new(Int32Array::from(vec![Some(1), Some(2), Some(3)])) as ArrayRef, + Arc::new(Int32Array::from(vec![Some(10), Some(20), Some(30)])) as ArrayRef, + Arc::new(BooleanArray::from(vec![ + Some(true), + Some(false), + Some(true), + ])) as ArrayRef, + ], + ) + .unwrap(); + let schema = DataType::Struct(Box::new(StructType::new_unchecked(vec![ + StructField::new("a", DataType::INTEGER, true), + StructField::new("b", DataType::INTEGER, true), + ]))); + let expr = Expr::struct_with_nullability_from( + [column_expr_ref!("a"), column_expr_ref!("b")], + column_expr_ref!("is_valid"), + ); + let result = evaluate_expression(&expr, &batch, Some(&schema)).unwrap(); + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert!(struct_result.is_valid(0), "row 0 should be valid"); + assert!(struct_result.is_null(1), "row 1 should be null"); + assert!(struct_result.is_valid(2), "row 2 should be valid"); + validate_i32_column(struct_result, 0, &[1, 2, 3]); + validate_i32_column(struct_result, 1, &[10, 20, 30]); + } + + #[test] + fn test_struct_nullability_non_boolean_predicate_errors() { + // Non-boolean expression (Int32 column) as nullability predicate should error. + let batch = create_batch_with_bool_col( + vec![Some(1), Some(2), Some(3)], + vec![Some(true), Some(false), Some(true)], + ); + let schema = DataType::Struct(Box::new(StructType::new_unchecked(vec![StructField::new( + "a", + DataType::INTEGER, + true, + )]))); + let expr = + Expr::struct_with_nullability_from([column_expr_ref!("a")], column_expr_ref!("a")); + let result = evaluate_expression(&expr, &batch, Some(&schema)); + assert_result_error_with_message(result, "Incorrect datatype"); + } + + #[test] + fn test_struct_no_result_type_errors() { + // struct_from with result_type = None should return an error + let batch = create_test_batch(); + let expr = Expr::struct_from([column_expr_ref!("a")]); + let result = evaluate_expression(&expr, &batch, None); + assert!(result.is_err()); + } + + /// Helper to build a batch with a single struct column named "stats". + fn make_struct_batch(arrow_fields: Vec, arrays: Vec) -> RecordBatch { + let stats_type = ArrowDataType::Struct(arrow_fields.clone().into()); + let schema = ArrowSchema::new(vec![ArrowField::new("stats", stats_type, true)]); + let stats_array = StructArray::try_new(arrow_fields.into(), arrays, None).unwrap(); + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(stats_array)]).unwrap() + } + + #[test] + fn column_extract_struct_with_mismatched_field_names() { + let batch = make_struct_batch( + vec![ + ArrowField::new("col-abc-001", ArrowDataType::Int64, true), + ArrowField::new("col-abc-002", ArrowDataType::Int64, true), + ], + vec![ + Arc::new(Int64Array::from(vec![Some(1), Some(2)])), + Arc::new(Int64Array::from(vec![Some(10), Some(20)])), + ], + ); + + // Logical names differ from physical names due to column mapping + let logical_type = DataType::try_struct_type([ + StructField::nullable("my_column", DataType::LONG), + StructField::nullable("other_column", DataType::LONG), + ]) + .unwrap(); + + let expr = column_expr!("stats"); + let result = evaluate_expression(&expr, &batch, Some(&logical_type)); + + // Ordinal-based validation passes: same field count and types by position. 
+ // The downstream apply_schema transformation handles renaming. + let arr = result.expect("should succeed with mismatched names but matching types"); + let struct_arr = arr.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_arr.num_columns(), 2); + assert_eq!(struct_arr.len(), 2); + } + + #[test] + fn column_extract_struct_rejects_mismatched_field_count() { + let batch = make_struct_batch( + vec![ArrowField::new("col-abc-001", ArrowDataType::Int64, true)], + vec![Arc::new(Int64Array::from(vec![Some(1), Some(2)]))], + ); + + let logical_type = DataType::try_struct_type([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG), + ]) + .unwrap(); + + let expr = column_expr!("stats"); + let result = evaluate_expression(&expr, &batch, Some(&logical_type)); + assert_result_error_with_message(result, "Struct field count mismatch"); + } + + #[test] + fn column_extract_struct_rejects_mismatched_child_types() { + let batch = make_struct_batch( + vec![ + ArrowField::new("col-abc-001", ArrowDataType::Int64, true), + ArrowField::new("col-abc-002", ArrowDataType::Utf8, true), + ], + vec![ + Arc::new(Int64Array::from(vec![Some(1)])), + Arc::new(StringArray::from(vec![Some("x")])), + ], + ); + + // Expect two LONG columns, but the second arrow field is Utf8 + let logical_type = DataType::try_struct_type([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG), + ]) + .unwrap(); + + let expr = column_expr!("stats"); + let result = evaluate_expression(&expr, &batch, Some(&logical_type)); + assert_result_error_with_message(result, "Incorrect datatype"); + } + + #[test] + fn column_extract_struct_with_matching_names_still_works() { + let batch = make_struct_batch( + vec![ + ArrowField::new("a", ArrowDataType::Int64, true), + ArrowField::new("b", ArrowDataType::Int64, true), + ], + vec![ + Arc::new(Int64Array::from(vec![Some(1)])), + Arc::new(Int64Array::from(vec![Some(2)])), + ], + ); + + let logical_type = DataType::try_struct_type([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG), + ]) + .unwrap(); + + let expr = column_expr!("stats"); + let result = evaluate_expression(&expr, &batch, Some(&logical_type)); + assert!(result.is_ok()); + } + + /// Exercises the exact code path from `get_add_transform_expr` where a `struct_from` + /// expression wraps `column_expr!("add.stats_parsed")`. When the checkpoint parquet has + /// stats_parsed with physical column names (e.g. `col-abc-001`) but the output schema + /// uses logical names (e.g. `id`), `evaluate_struct_expression` calls + /// `evaluate_expression(Column, struct_result_type)` with mismatched field names. + /// Without ordinal-based validation this fails with a name mismatch error. 
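// Sketch of the two shapes being reconciled (names taken from the test below):
//   physical (checkpoint parquet):  struct<col-abc-001: long, col-abc-002: long>
//   logical  (requested output)  :  struct<id: long,          value: long>
// ValidationMode::TypesOnly walks the fields by position and compares only the types, so
// the Column extraction succeeds; the later apply_schema step rewrites the names.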
+ #[test] + fn struct_from_with_column_tolerates_nested_name_mismatch() { + // Build a batch mimicking checkpoint data: add.stats_parsed uses physical names + let stats_fields: Vec = vec![ + ArrowField::new("col-abc-001", ArrowDataType::Int64, true), + ArrowField::new("col-abc-002", ArrowDataType::Int64, true), + ]; + let stats_array = StructArray::try_new( + stats_fields.clone().into(), + vec![ + Arc::new(Int64Array::from(vec![Some(1)])), + Arc::new(Int64Array::from(vec![Some(10)])), + ], + None, + ) + .unwrap(); + + let add_fields: Vec = vec![ + ArrowField::new("path", ArrowDataType::Utf8, true), + ArrowField::new( + "stats_parsed", + ArrowDataType::Struct(stats_fields.into()), + true, + ), + ]; + let add_struct = StructArray::try_new( + add_fields.clone().into(), + vec![ + Arc::new(StringArray::from(vec![Some("file.parquet")])), + Arc::new(stats_array), + ], + None, + ) + .unwrap(); + + let schema = ArrowSchema::new(vec![ArrowField::new( + "add", + ArrowDataType::Struct(add_fields.into()), + true, + )]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(add_struct)]).unwrap(); + + // struct_from mimicking get_add_transform_expr: wraps a Column referencing stats_parsed + let expr = Expr::struct_from([ + column_expr_ref!("add.path"), + column_expr_ref!("add.stats_parsed"), + ]); + + // Output schema uses logical names (differs from physical names in the batch) + let output_type = DataType::try_struct_type([ + StructField::nullable("path", DataType::STRING), + StructField::nullable( + "stats_parsed", + DataType::struct_type_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("value", DataType::LONG), + ]), + ), + ]) + .unwrap(); + + let result = evaluate_expression(&expr, &batch, Some(&output_type)); + result.expect("struct_from with Column sub-expression should tolerate field name mismatch"); + } + + #[test] + fn column_extract_nested_struct_with_mismatched_names() { + let inner_fields = vec![ArrowField::new("phys-inner", ArrowDataType::Int64, true)]; + let inner_struct = ArrowDataType::Struct(inner_fields.clone().into()); + let batch = make_struct_batch( + vec![ArrowField::new("phys-outer", inner_struct, true)], + vec![Arc::new( + StructArray::try_new( + inner_fields.into(), + vec![Arc::new(Int64Array::from(vec![Some(42)]))], + None, + ) + .unwrap(), + )], + ); + + let logical_type = DataType::try_struct_type([StructField::nullable( + "logical_outer", + DataType::struct_type_unchecked([StructField::nullable( + "logical_inner", + DataType::LONG, + )]), + )]) + .unwrap(); + + let expr = column_expr!("stats"); + let result = evaluate_expression(&expr, &batch, Some(&logical_type)); + assert!(result.is_ok()); + } } diff --git a/kernel/src/engine/arrow_expression/mod.rs b/kernel/src/engine/arrow_expression/mod.rs index 556347dd9c..8ed16db4c7 100644 --- a/kernel/src/engine/arrow_expression/mod.rs +++ b/kernel/src/engine/arrow_expression/mod.rs @@ -70,6 +70,16 @@ impl Scalar { }}; } + // Use append_value_n for primitive builders that support batch append + macro_rules! append_val_n_as { + ($t:ty, $val:expr) => {{ + let builder = builder_as!($t); + builder.append_value_n($val, num_rows); + }}; + } + + // Use append_value in a loop for builders without batch append (String, Binary) + // TODO: Remove after https://github.com/apache/arrow-rs/pull/9426 gets in macro_rules! 
append_val_as { ($t:ty, $val:expr) => {{ let builder = builder_as!($t); @@ -80,40 +90,40 @@ impl Scalar { } match self { - Integer(val) => append_val_as!(array::Int32Builder, *val), - Long(val) => append_val_as!(array::Int64Builder, *val), - Short(val) => append_val_as!(array::Int16Builder, *val), - Byte(val) => append_val_as!(array::Int8Builder, *val), - Float(val) => append_val_as!(array::Float32Builder, *val), - Double(val) => append_val_as!(array::Float64Builder, *val), + Integer(val) => append_val_n_as!(array::Int32Builder, *val), + Long(val) => append_val_n_as!(array::Int64Builder, *val), + Short(val) => append_val_n_as!(array::Int16Builder, *val), + Byte(val) => append_val_n_as!(array::Int8Builder, *val), + Float(val) => append_val_n_as!(array::Float32Builder, *val), + Double(val) => append_val_n_as!(array::Float64Builder, *val), String(val) => append_val_as!(array::StringBuilder, val), - Boolean(val) => append_val_as!(array::BooleanBuilder, *val), + Boolean(val) => builder_as!(array::BooleanBuilder).append_n(num_rows, *val), Timestamp(val) | TimestampNtz(val) => { // timezone was already set at builder construction time - append_val_as!(array::TimestampMicrosecondBuilder, *val) + append_val_n_as!(array::TimestampMicrosecondBuilder, *val) } - Date(val) => append_val_as!(array::Date32Builder, *val), + Date(val) => append_val_n_as!(array::Date32Builder, *val), Binary(val) => append_val_as!(array::BinaryBuilder, val), // precision and scale were already set at builder construction time - Decimal(val) => append_val_as!(array::Decimal128Builder, val.bits()), + Decimal(val) => append_val_n_as!(array::Decimal128Builder, val.bits()), Struct(data) => { let builder = builder_as!(array::StructBuilder); require!( builder.num_fields() == data.fields().len(), Error::generic("Struct builder has wrong number of fields") ); + let field_builders = builder.field_builders_mut().iter_mut(); + for (builder, value) in field_builders.zip(data.values()) { + value.append_to(builder, num_rows)?; + } + // TODO: Loop can be removed after: https://github.com/apache/arrow-rs/pull/9430 for _ in 0..num_rows { - let field_builders = builder.field_builders_mut().iter_mut(); - for (builder, value) in field_builders.zip(data.values()) { - value.append_to(builder, 1)?; - } builder.append(true); } } Array(data) => { let builder = builder_as!(array::ListBuilder>); for _ in 0..num_rows { - #[allow(deprecated)] for value in data.array_elements() { value.append_to(builder.values(), 1)?; } @@ -151,31 +161,29 @@ impl Scalar { }}; } - macro_rules! append_null_as { + macro_rules! 
append_nulls_as { ($t:ty) => {{ let builder = builder_as!($t); - for _ in 0..num_rows { - builder.append_null() - } + builder.append_nulls(num_rows); }}; } match *data_type { - DataType::INTEGER => append_null_as!(array::Int32Builder), - DataType::LONG => append_null_as!(array::Int64Builder), - DataType::SHORT => append_null_as!(array::Int16Builder), - DataType::BYTE => append_null_as!(array::Int8Builder), - DataType::FLOAT => append_null_as!(array::Float32Builder), - DataType::DOUBLE => append_null_as!(array::Float64Builder), - DataType::STRING => append_null_as!(array::StringBuilder), - DataType::BOOLEAN => append_null_as!(array::BooleanBuilder), + DataType::INTEGER => append_nulls_as!(array::Int32Builder), + DataType::LONG => append_nulls_as!(array::Int64Builder), + DataType::SHORT => append_nulls_as!(array::Int16Builder), + DataType::BYTE => append_nulls_as!(array::Int8Builder), + DataType::FLOAT => append_nulls_as!(array::Float32Builder), + DataType::DOUBLE => append_nulls_as!(array::Float64Builder), + DataType::STRING => append_nulls_as!(array::StringBuilder), + DataType::BOOLEAN => append_nulls_as!(array::BooleanBuilder), DataType::TIMESTAMP | DataType::TIMESTAMP_NTZ => { - append_null_as!(array::TimestampMicrosecondBuilder) + append_nulls_as!(array::TimestampMicrosecondBuilder) } - DataType::DATE => append_null_as!(array::Date32Builder), - DataType::BINARY => append_null_as!(array::BinaryBuilder), + DataType::DATE => append_nulls_as!(array::Date32Builder), + DataType::BINARY => append_nulls_as!(array::BinaryBuilder), DataType::Primitive(PrimitiveType::Decimal(_)) => { - append_null_as!(array::Decimal128Builder) + append_nulls_as!(array::Decimal128Builder) } DataType::Struct(ref stype) => { // WARNING: Unlike ArrayBuilder and MapBuilder, StructBuilder always requires us to @@ -185,20 +193,19 @@ impl Scalar { builder.num_fields() == stype.num_fields(), Error::generic("Struct builder has wrong number of fields") ); - for _ in 0..num_rows { - let field_builders = builder.field_builders_mut().iter_mut(); - for (builder, field) in field_builders.zip(stype.fields()) { - Self::append_null(builder, &field.data_type, 1)?; - } - builder.append(false); + let field_builders = builder.field_builders_mut().iter_mut(); + for (builder, field) in field_builders.zip(stype.fields()) { + Self::append_null(builder, &field.data_type, num_rows)?; } + builder.append_nulls(num_rows); } - DataType::Array(_) => append_null_as!(array::ListBuilder>), + DataType::Array(_) => append_nulls_as!(array::ListBuilder>), DataType::Map(_) => { // For some reason, there is no `MapBuilder::append_null` method -- even tho // StructBuilder and ListBuilder both provide it. 
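// For primitive builders the batched form is available directly, e.g. (minimal sketch
// using the public arrow builder API):
//   let mut b = arrow_array::builder::Int32Builder::new();
//   b.append_nulls(3);                      // replaces a 3-iteration append_null loop
//   assert_eq!(b.finish().null_count(), 3);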
let builder = builder_as!(array::MapBuilder, Box>); + // TODO: Can be removed after https://github.com/apache/arrow-rs/pull/9432 for _ in 0..num_rows { builder.append(false)?; } @@ -218,7 +225,6 @@ impl ArrayData { pub fn to_arrow(&self) -> DeltaResult { let arrow_data_type = ArrowDataType::try_from_kernel(self.array_type().element_type())?; - #[allow(deprecated)] let elements = self.array_elements(); let mut builder = array::make_builder(&arrow_data_type, elements.len()); for element in elements { @@ -235,26 +241,22 @@ pub struct ArrowEvaluationHandler; impl EvaluationHandler for ArrowEvaluationHandler { fn new_expression_evaluator( &self, - schema: SchemaRef, + _schema: SchemaRef, expression: ExpressionRef, output_type: DataType, - ) -> Arc { - Arc::new(DefaultExpressionEvaluator { - input_schema: schema, + ) -> DeltaResult> { + Ok(Arc::new(DefaultExpressionEvaluator { expression, output_type, - }) + })) } fn new_predicate_evaluator( &self, - schema: SchemaRef, + _schema: SchemaRef, predicate: PredicateRef, - ) -> Arc { - Arc::new(DefaultPredicateEvaluator { - input_schema: schema, - predicate, - }) + ) -> DeltaResult> { + Ok(Arc::new(DefaultPredicateEvaluator { predicate })) } /// Create a single-row array with all-null leaf values. Note that if a nested struct is @@ -269,11 +271,64 @@ impl EvaluationHandler for ArrowEvaluationHandler { RecordBatch::try_new(Arc::new(output_schema.as_ref().try_into_arrow()?), arrays)?; Ok(Box::new(ArrowEngineData::new(record_batch))) } + + fn create_many( + &self, + schema: SchemaRef, + rows: &[&[Scalar]], + ) -> DeltaResult> { + let arrow_schema: Arc = Arc::new(schema.as_ref().try_into_arrow()?); + if rows.is_empty() { + return Ok(Box::new(ArrowEngineData::new(RecordBatch::new_empty( + arrow_schema, + )))); + } + + let num_rows = rows.len(); + let num_fields = schema.fields().len(); + for (row_idx, row) in rows.iter().enumerate() { + if row.len() != num_fields { + return Err(Error::generic(format!( + "Row {} has {} scalars but schema has {} fields", + row_idx, + row.len(), + num_fields + ))); + } + } + + let mut builders: Vec> = arrow_schema + .fields() + .iter() + .map(|field| array::make_builder(field.data_type(), num_rows)) + .collect(); + + let fields: Vec<_> = schema.fields().collect(); + for (col_idx, builder) in builders.iter_mut().enumerate() { + let field_name = fields[col_idx].name(); + for (row_idx, row) in rows.iter().enumerate() { + row[col_idx].append_to(builder.as_mut(), 1).map_err(|e| { + Error::generic(format!( + "Row {row_idx}, field '{field_name}' \ + (expected type {}, got {}): {e}", + fields[col_idx].data_type(), + row[col_idx].data_type() + )) + })?; + } + } + + let arrays: Vec = builders.into_iter().map(|mut b| b.finish()).collect(); + + Ok(Box::new(ArrowEngineData::new(RecordBatch::try_new( + arrow_schema, + arrays, + )?))) + } } #[derive(Debug)] pub struct DefaultExpressionEvaluator { - input_schema: SchemaRef, expression: ExpressionRef, output_type: DataType, } @@ -282,16 +337,6 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator { fn evaluate(&self, batch: &dyn EngineData) -> DeltaResult> { debug!("Arrow evaluator evaluating: {:#?}", self.expression); let batch = extract_record_batch(batch)?; - let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into_arrow()?; - // TODO: make sure we have matching schemas for validation - // if batch.schema().as_ref() != &input_schema { - // return Err(Error::Generic(format!( - // "input schema does not match batch schema: {:?} != {:?}", - // input_schema, - // 
batch.schema() - // ))); - // }; - let batch = match (self.expression.as_ref(), &self.output_type) { (Expression::Transform(transform), DataType::Struct(_)) if transform.is_identity() => { // Empty transform optimization: Skip expression evaluation and directly apply the @@ -322,7 +367,6 @@ impl ExpressionEvaluator for DefaultExpressionEvaluator { #[derive(Debug)] pub struct DefaultPredicateEvaluator { - input_schema: SchemaRef, predicate: PredicateRef, } @@ -330,15 +374,6 @@ impl PredicateEvaluator for DefaultPredicateEvaluator { fn evaluate(&self, batch: &dyn EngineData) -> DeltaResult> { debug!("Arrow evaluator evaluating: {:#?}", self.predicate); let batch = extract_record_batch(batch)?; - let _input_schema: ArrowSchema = self.input_schema.as_ref().try_into_arrow()?; - // TODO: make sure we have matching schemas for validation - // if batch.schema().as_ref() != &input_schema { - // return Err(Error::Generic(format!( - // "input schema does not match batch schema: {:?} != {:?}", - // input_schema, - // batch.schema() - // ))); - // }; let array = evaluate_predicate(&self.predicate, batch, false)?; let schema = ArrowSchema::new(vec![ArrowField::new( "output", diff --git a/kernel/src/engine/arrow_expression/tests.rs b/kernel/src/engine/arrow_expression/tests.rs index e2a5aea24c..f139ba5bd8 100644 --- a/kernel/src/engine/arrow_expression/tests.rs +++ b/kernel/src/engine/arrow_expression/tests.rs @@ -1,12 +1,18 @@ use std::ops::{Add, Div, Mul, Sub}; +use rstest::rstest; + +#[cfg(not(feature = "arrow-56"))] +use crate::arrow::array::ListViewArray; use crate::arrow::array::{ - create_array, Array, ArrayRef, BooleanArray, GenericStringArray, Int32Array, Int32Builder, - ListArray, MapArray, MapBuilder, MapFieldNames, StringArray, StringBuilder, StructArray, + create_array, Array, ArrayRef, BinaryViewArray, BooleanArray, GenericStringArray, Int32Array, + Int32Builder, ListArray, MapArray, MapBuilder, MapFieldNames, StringArray, StringBuilder, + StringViewArray, StructArray, }; use crate::arrow::buffer::{BooleanBuffer, NullBuffer, OffsetBuffer, ScalarBuffer}; use crate::arrow::compute::kernels::cmp::{gt_eq, lt}; use crate::arrow::datatypes::{DataType, Field, Fields, Schema}; +use crate::engine::arrow_data::EngineDataArrowExt as _; use crate::engine::arrow_expression::evaluate_expression::to_json; use crate::engine::arrow_expression::opaque::{ ArrowOpaqueExpression as _, ArrowOpaqueExpressionOp, ArrowOpaquePredicate as _, @@ -19,7 +25,7 @@ use crate::kernel_predicates::{ }; use crate::schema::{ArrayType, DataType as KernelDataType, MapType, StructField, StructType}; use crate::utils::test_utils::assert_result_error_with_message; -use crate::EvaluationHandlerExtension as _; +use crate::{EvaluationHandler as _, EvaluationHandlerExtension as _}; use super::*; @@ -87,6 +93,158 @@ fn test_bad_right_type_array() { ); } +#[test] +fn test_in_predicate_with_utf8view_list_column() { + let values = StringViewArray::from(vec!["hello", "world", "foo", "bar", "hello", "baz"]); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 2, 3, 6])); + let item_field = Arc::new(Field::new("item", DataType::Utf8View, true)); + let list_field = Arc::new(Field::new( + "items", + DataType::List(item_field.clone()), + true, + )); + let schema = Schema::new([list_field]); + let list_array = ListArray::new(item_field, offsets, Arc::new(values), None); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(list_array)]).unwrap(); + + let in_pred = Pred::binary( + BinaryPredicateOp::In, + 
Expr::literal("hello"), + column_expr!("items"), + ); + + let expected = BooleanArray::from(vec![true, false, true]); + assert_eq!( + evaluate_predicate(&in_pred, &batch, false).unwrap(), + expected + ); +} + +#[test] +#[cfg(not(feature = "arrow-56"))] +// TODO: this test need arrow-57 to be run successfully. Please remove the cfg after "arrow-56" is deprecated. +fn test_in_predicate_with_list_view_column() { + // Three rows: [0,1,2], [3,4,5], [6,7,8] + let values = Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8]); + let offsets = ScalarBuffer::from(vec![0i32, 3, 6]); + let sizes = ScalarBuffer::from(vec![3i32, 3, 3]); + let item_field = Arc::new(Field::new("item", DataType::Int32, true)); + let list_field = Arc::new(Field::new( + "items", + DataType::ListView(item_field.clone()), + true, + )); + let schema = Schema::new([list_field]); + let list_view_array = ListViewArray::new(item_field, offsets, sizes, Arc::new(values), None); + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(list_view_array)]).unwrap(); + + let in_op = Pred::binary( + BinaryPredicateOp::In, + Expr::literal(5), + column_expr!("items"), + ); + let not_op = Pred::not(Pred::binary( + BinaryPredicateOp::In, + Expr::literal(5), + column_expr!("items"), + )); + + let result = evaluate_predicate(&in_op, &batch, false).unwrap(); + let expected_in = BooleanArray::from(vec![false, true, false]); + assert_eq!(result, expected_in); + + let result = evaluate_predicate(¬_op, &batch, false).unwrap(); + let expected_not_in = BooleanArray::from(vec![true, false, true]); + assert_eq!(result, expected_not_in); + + // Test inversion + let result = evaluate_predicate(&in_op, &batch, true).unwrap(); + assert_eq!(result, expected_not_in); + + let result = evaluate_predicate(¬_op, &batch, true).unwrap(); + assert_eq!(result, expected_in); +} + +#[rstest] +#[case::utf8view( + Arc::new(StringViewArray::from(vec![None, Some("apple"), Some("hello"), Some("zebra")])) as ArrayRef, + DataType::Utf8View, + Expr::literal("hello"), +)] +#[case::large_utf8( + Arc::new(GenericStringArray::::from(vec![None, Some("apple"), Some("hello"), Some("zebra")])) as ArrayRef, + DataType::LargeUtf8, + Expr::literal("hello"), +)] +#[case::binary_view( + Arc::new(BinaryViewArray::from(vec![None, Some(b"apple".as_ref()), Some(b"hello"), Some(b"zebra")])) as ArrayRef, + DataType::BinaryView, + Expr::literal(b"hello".as_ref()), +)] +fn test_binary_predicate_with_view_types( + #[case] array: ArrayRef, + #[case] dtype: DataType, + #[case] lit: Expr, +) { + let schema = Schema::new([Arc::new(Field::new("col", dtype, true))]); + let batch = RecordBatch::try_new(Arc::new(schema), vec![array]).unwrap(); + let column = column_expr!("col"); + + let predicate_lt = column.clone().lt(lit.clone()); + let results = evaluate_predicate(&predicate_lt, &batch, false).unwrap(); + let expected_lt = BooleanArray::from(vec![None, Some(true), Some(false), Some(false)]); + assert_eq!(results, expected_lt); + + let predicate_le = column.clone().le(lit.clone()); + let results = evaluate_predicate(&predicate_le, &batch, false).unwrap(); + let expected_le = BooleanArray::from(vec![None, Some(true), Some(true), Some(false)]); + assert_eq!(results, expected_le); + + let predicate_gt = column.clone().gt(lit.clone()); + let results = evaluate_predicate(&predicate_gt, &batch, false).unwrap(); + let expected_gt = BooleanArray::from(vec![None, Some(false), Some(false), Some(true)]); + assert_eq!(results, expected_gt); + + let predicate_ge = column.clone().ge(lit.clone()); + let results = 
evaluate_predicate(&predicate_ge, &batch, false).unwrap(); + let expected_ge = BooleanArray::from(vec![None, Some(false), Some(true), Some(true)]); + assert_eq!(results, expected_ge); + + let predicate_eq = column.clone().eq(lit.clone()); + let results = evaluate_predicate(&predicate_eq, &batch, false).unwrap(); + let expected_eq = BooleanArray::from(vec![None, Some(false), Some(true), Some(false)]); + assert_eq!(results, expected_eq); + + let predicate_ne = column.clone().ne(lit.clone()); + let results = evaluate_predicate(&predicate_ne, &batch, false).unwrap(); + let expected_ne = BooleanArray::from(vec![None, Some(true), Some(false), Some(true)]); + assert_eq!(results, expected_ne); + + let predicate_distinct = column.clone().distinct(lit.clone()); + let results = evaluate_predicate(&predicate_distinct, &batch, false).unwrap(); + let expected_distinct = + BooleanArray::from(vec![Some(true), Some(true), Some(false), Some(true)]); + assert_eq!(results, expected_distinct); + + // Test inversion (NOT pushdown): each inverted op equals the complement + let results = evaluate_predicate(&predicate_lt, &batch, true).unwrap(); + assert_eq!(results, expected_ge); + let results = evaluate_predicate(&predicate_le, &batch, true).unwrap(); + assert_eq!(results, expected_gt); + let results = evaluate_predicate(&predicate_gt, &batch, true).unwrap(); + assert_eq!(results, expected_le); + let results = evaluate_predicate(&predicate_ge, &batch, true).unwrap(); + assert_eq!(results, expected_lt); + let results = evaluate_predicate(&predicate_eq, &batch, true).unwrap(); + assert_eq!(results, expected_ne); + let results = evaluate_predicate(&predicate_ne, &batch, true).unwrap(); + assert_eq!(results, expected_eq); + let results = evaluate_predicate(&predicate_distinct, &batch, true).unwrap(); + let expected_not_distinct = + BooleanArray::from(vec![Some(false), Some(false), Some(true), Some(false)]); + assert_eq!(results, expected_not_distinct); +} + #[test] fn test_literal_type_array() { let field = Arc::new(Field::new("item", DataType::Int32, true)); @@ -535,7 +693,7 @@ impl OpaqueLessThanOp { panic!("Invalid arg count: {}", args.len()); }; - let eval = |arg| evaluate_expression(arg, batch, Some(&KernelDataType::BOOLEAN)); + let eval = |arg| evaluate_expression(arg, batch, Some(&KernelDataType::INTEGER)); Ok(op_fn(&eval(left)?, &eval(right)?)?) } } @@ -673,11 +831,8 @@ fn test_null_row() { ], ) .unwrap(); - let result: RecordBatch = result - .into_any() - .downcast::() - .unwrap() - .into(); + + let result = result.try_into_record_batch().unwrap(); assert_eq!(result, expected); } @@ -698,11 +853,7 @@ fn test_null_row_err() { fn assert_create_one(values: &[Scalar], schema: SchemaRef, expected: RecordBatch) { let handler = ArrowEvaluationHandler; let actual = handler.create_one(schema, values).unwrap(); - let actual_rb: RecordBatch = actual - .into_any() - .downcast::() - .unwrap() - .into(); + let actual_rb = actual.try_into_record_batch().unwrap(); assert_eq!(actual_rb, expected); } @@ -833,6 +984,8 @@ fn test_create_one_mismatching_scalar_types() { #[test] fn test_create_one_not_null_struct() { + // Creating a NOT NULL struct field with null values should error. + // The error comes from Arrow's RecordBatch validation (non-nullable column has nulls). 
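// --- Illustrative aside (not part of this diff) ---
// A minimal sketch of where that error text originates: Arrow's own RecordBatch
// validation rejects a non-nullable column that contains nulls. It relies on the
// arrow imports already in scope in this test module; the fn name is hypothetical.
fn _sketch_non_nullable_column_with_nulls() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let column: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None]));
    let result = RecordBatch::try_new(schema, vec![column]);
    // Arrow reports roughly "Column 'a' is declared as non-nullable but contains
    // null values", which is the message the tests below assert on.
    assert!(result.is_err());
}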
let values: &[Scalar] = &[ Scalar::Null(KernelDataType::INTEGER), Scalar::Null(KernelDataType::INTEGER), @@ -847,12 +1000,14 @@ fn test_create_one_not_null_struct() { let handler = ArrowEvaluationHandler; assert_result_error_with_message( handler.create_one(schema, values), - "Invalid struct data: Top-level nulls in struct are not supported", + "Column 'a' is declared as non-nullable but contains null values", ); } #[test] fn test_create_one_top_level_null() { + // Creating a NOT NULL field with null value should error. + // The error comes from Arrow's RecordBatch validation. let values = &[Scalar::Null(KernelDataType::INTEGER)]; let handler = ArrowEvaluationHandler; @@ -860,10 +1015,10 @@ fn test_create_one_top_level_null() { "col_1", KernelDataType::INTEGER, )])); - assert!(matches!( + assert_result_error_with_message( handler.create_one(schema, values), - Err(Error::InvalidStructData(_)) - )); + "Column 'col_1' is declared as non-nullable but contains null values", + ); } #[test] @@ -1108,3 +1263,164 @@ fn test_to_json_with_nested_struct() { r#"{"outer_int":200,"nested_struct":{"inner_string":"value"}}"# ); } + +// helper to build a RecordBatch via `create_many` and assert it equals `expected` +fn assert_create_many(rows: &[&[Scalar]], schema: SchemaRef, expected: RecordBatch) { + let handler = ArrowEvaluationHandler; + let actual = handler.create_many(schema, rows).unwrap(); + let actual_rb = actual.try_into_record_batch().unwrap(); + assert_eq!(actual_rb, expected); +} + +#[test] +fn test_create_many_multiple_rows() { + let row1: &[Scalar] = &[1.into(), "A".into()]; + let row2: &[Scalar] = &[2.into(), "B".into()]; + let row3: &[Scalar] = &[Scalar::Null(KernelDataType::INTEGER), "C".into()]; + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", KernelDataType::INTEGER), + StructField::nullable("name", KernelDataType::STRING), + ])); + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ])); + let expected = RecordBatch::try_new( + expected_schema, + vec![ + create_array!(Int32, [Some(1), Some(2), None]), + create_array!(Utf8, ["A", "B", "C"]), + ], + ) + .unwrap(); + assert_create_many(&[row1, row2, row3], schema, expected); +} + +#[test] +fn test_create_many_empty_rows_returns_zero_row_batch() { + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("a", KernelDataType::INTEGER), + StructField::nullable("b", KernelDataType::STRING), + ])); + let handler = ArrowEvaluationHandler; + let result = handler.create_many(schema.clone(), &[]).unwrap(); + assert_eq!(result.len(), 0); + let rb = result.try_into_record_batch().unwrap(); + assert_eq!(rb.num_rows(), 0); + assert_eq!(rb.num_columns(), 2); +} + +#[test] +fn test_create_many_wrong_field_count_returns_error() { + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("a", KernelDataType::INTEGER), + StructField::nullable("b", KernelDataType::STRING), + ])); + // Row has 3 scalars but schema has 2 fields + let bad_row: &[Scalar] = &[1.into(), "x".into(), 99.into()]; + let handler = ArrowEvaluationHandler; + assert_result_error_with_message( + handler.create_many(schema, &[bad_row]), + "Row 0 has 3 scalars but schema has 2 fields", + ); +} + +#[test] +fn test_create_many_wrong_field_type_returns_error() { + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("a", KernelDataType::INTEGER), + StructField::nullable("b", KernelDataType::STRING), + ])); + // Row 1 
passes a Long where an Integer is expected for field "a" + let good_row: &[Scalar] = &[1.into(), "x".into()]; + let bad_row: &[Scalar] = &[1i64.into(), "y".into()]; + let handler = ArrowEvaluationHandler; + assert_result_error_with_message( + handler.create_many(schema, &[good_row, bad_row]), + "Row 1, field 'a' (expected type integer, got long): Invalid expression evaluation: Invalid builder for long", + ); +} + +#[test] +fn test_create_many_single_row_matches_create_one() { + // create_many with one row should produce the same result as create_one + let values: &[Scalar] = &[ + 1.into(), + "hello".into(), + Scalar::Null(KernelDataType::INTEGER), + ]; + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("a", KernelDataType::INTEGER), + StructField::nullable("b", KernelDataType::STRING), + StructField::nullable("c", KernelDataType::INTEGER), + ])); + let handler = ArrowEvaluationHandler; + let from_one = handler + .create_one(schema.clone(), values) + .unwrap() + .try_into_record_batch() + .unwrap(); + let from_many = handler + .create_many(schema, &[values]) + .unwrap() + .try_into_record_batch() + .unwrap(); + assert_eq!(from_one, from_many); +} + +#[test] +fn test_create_many_nested_struct() { + // Schema: outer { inner: Struct { x: INT, y: STRING }, flag: BOOLEAN } + let inner_type = KernelDataType::struct_type_unchecked([ + StructField::nullable("x", KernelDataType::INTEGER), + StructField::nullable("y", KernelDataType::STRING), + ]); + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("inner", inner_type.clone()), + StructField::nullable("flag", KernelDataType::BOOLEAN), + ])); + + // Row 1: inner = Struct { x: 10, y: "hello" }, flag = true + let row1: &[Scalar] = &[ + Scalar::Struct( + crate::expressions::StructData::try_new( + vec![ + StructField::nullable("x", KernelDataType::INTEGER), + StructField::nullable("y", KernelDataType::STRING), + ], + vec![10.into(), "hello".into()], + ) + .unwrap(), + ), + true.into(), + ]; + // Row 2: inner = null struct, flag = false + let row2: &[Scalar] = &[Scalar::Null(inner_type), false.into()]; + + let arrow_inner_fields: Fields = vec![ + Field::new("x", DataType::Int32, true), + Field::new("y", DataType::Utf8, true), + ] + .into(); + let expected_schema = Arc::new(Schema::new(vec![ + Field::new("inner", DataType::Struct(arrow_inner_fields.clone()), true), + Field::new("flag", DataType::Boolean, true), + ])); + + // Build expected inner struct column: row 1 has values, row 2 is null + let inner_col = Arc::new(StructArray::new( + arrow_inner_fields.clone(), + vec![ + create_array!(Int32, [Some(10), None]) as ArrayRef, + create_array!(Utf8, [Some("hello"), None]) as ArrayRef, + ], + // null buffer: row 0 valid, row 1 null + Some(NullBuffer::from(BooleanBuffer::from(vec![true, false]))), + )); + let expected = RecordBatch::try_new( + expected_schema, + vec![inner_col, create_array!(Boolean, [true, false])], + ) + .unwrap(); + assert_create_many(&[row1, row2], schema, expected); +} diff --git a/kernel/src/engine/arrow_get_data.rs b/kernel/src/engine/arrow_get_data.rs index fbed64df10..4e8387da63 100644 --- a/kernel/src/engine/arrow_get_data.rs +++ b/kernel/src/engine/arrow_get_data.rs @@ -1,79 +1,450 @@ +use std::ops::Range; + +use crate::arrow::array::cast::AsArray; use crate::arrow::array::{ - types::{GenericStringType, Int32Type, Int64Type}, - Array, BooleanArray, GenericByteArray, GenericListArray, MapArray, OffsetSizeTrait, - PrimitiveArray, + types::{ + Date32Type, Decimal128Type, Float32Type, 
Float64Type, GenericBinaryType, GenericStringType, + Int32Type, Int64Type, TimestampMicrosecondType, + }, + Array, BinaryViewArray, BooleanArray, GenericByteArray, GenericListArray, GenericListViewArray, + MapArray, OffsetSizeTrait, PrimitiveArray, RunArray, StringViewArray, }; +use crate::engine::arrow_data::as_string_accessor; use crate::{ engine_data::{GetData, ListItem, MapItem}, - DeltaResult, + DeltaResult, Error, }; // actual impls (todo: could macro these) impl GetData<'_> for BooleanArray { fn get_bool(&self, row_index: usize, _field_name: &str) -> DeltaResult> { - if self.is_valid(row_index) { - Ok(Some(self.value(row_index))) - } else { - Ok(None) - } + Ok(self.is_valid(row_index).then(|| self.value(row_index))) } } impl GetData<'_> for PrimitiveArray { fn get_int(&self, row_index: usize, _field_name: &str) -> DeltaResult> { - if self.is_valid(row_index) { - Ok(Some(self.value(row_index))) - } else { - Ok(None) - } + Ok(self.is_valid(row_index).then(|| self.value(row_index))) } } impl GetData<'_> for PrimitiveArray { fn get_long(&self, row_index: usize, _field_name: &str) -> DeltaResult> { - if self.is_valid(row_index) { - Ok(Some(self.value(row_index))) - } else { - Ok(None) - } + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl GetData<'_> for PrimitiveArray { + fn get_float(&self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl GetData<'_> for PrimitiveArray { + fn get_double(&self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl GetData<'_> for PrimitiveArray { + fn get_date(&self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl GetData<'_> for PrimitiveArray { + fn get_timestamp(&self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl GetData<'_> for PrimitiveArray { + fn get_decimal(&self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) } } impl<'a> GetData<'a> for GenericByteArray> { fn get_str(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { - if self.is_valid(row_index) { - Ok(Some(self.value(row_index))) - } else { - Ok(None) - } + Ok(self.is_valid(row_index).then(|| self.value(row_index))) } } -impl<'a, OffsetSize> GetData<'a> for GenericListArray -where - OffsetSize: OffsetSizeTrait, -{ - fn get_list( - &'a self, - row_index: usize, - _field_name: &str, - ) -> DeltaResult>> { - if self.is_valid(row_index) { - Ok(Some(ListItem::new(self, row_index))) - } else { - Ok(None) - } +impl<'a> GetData<'a> for GenericByteArray> { + fn get_str(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl<'a> GetData<'a> for StringViewArray { + fn get_str(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl<'a> GetData<'a> for GenericByteArray> { + fn get_binary(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +impl<'a> GetData<'a> for GenericByteArray> { + fn get_binary(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| 
self.value(row_index))) + } +} + +impl<'a> GetData<'a> for BinaryViewArray { + fn get_binary(&'a self, row_index: usize, _field_name: &str) -> DeltaResult> { + Ok(self.is_valid(row_index).then(|| self.value(row_index))) + } +} + +/// Uniform access to list-like Arrow arrays (List, LargeList, ListView, LargeListView), +/// abstracting away differences in how each type computes per-row offsets. +trait ListLikeArray: Array { + fn list_values(&self) -> &dyn Array; + fn row_offsets(&self, row: usize) -> Range; +} + +impl ListLikeArray for GenericListArray { + fn list_values(&self) -> &dyn Array { + self.values().as_ref() + } + fn row_offsets(&self, row: usize) -> Range { + self.offsets()[row].as_usize()..self.offsets()[row + 1].as_usize() + } +} + +impl ListLikeArray for GenericListViewArray { + fn list_values(&self) -> &dyn Array { + self.values().as_ref() + } + fn row_offsets(&self, row: usize) -> Range { + let start = self.offsets()[row].as_usize(); + start..start + self.sizes()[row].as_usize() + } +} + +fn get_list_item<'a>( + list: &'a impl ListLikeArray, + row_index: usize, + field_name: &str, +) -> DeltaResult>> { + if !list.is_valid(row_index) { + return Ok(None); + } + let values = as_string_accessor(list.list_values()).ok_or_else(|| { + Error::unexpected_column_type(format!( + "{field_name}: list values are not a supported string type" + )) + })?; + Ok(Some(ListItem::new(values, list.row_offsets(row_index)))) +} + +impl<'a, OffsetSize: OffsetSizeTrait> GetData<'a> for GenericListArray { + fn get_list(&'a self, row_index: usize, field_name: &str) -> DeltaResult>> { + get_list_item(self, row_index, field_name) + } +} + +impl<'a, OffsetSize: OffsetSizeTrait> GetData<'a> for GenericListViewArray { + fn get_list(&'a self, row_index: usize, field_name: &str) -> DeltaResult>> { + get_list_item(self, row_index, field_name) } } impl<'a> GetData<'a> for MapArray { - fn get_map(&'a self, row_index: usize, _field_name: &str) -> DeltaResult>> { - if self.is_valid(row_index) { - Ok(Some(MapItem::new(self, row_index))) - } else { - Ok(None) + fn get_map(&'a self, row_index: usize, field_name: &str) -> DeltaResult>> { + if !self.is_valid(row_index) { + return Ok(None); } + let keys = as_string_accessor(self.keys().as_ref()).ok_or_else(|| { + Error::unexpected_column_type(format!( + "{field_name}: map keys are not a supported string type" + )) + })?; + let values = as_string_accessor(self.values().as_ref()).ok_or_else(|| { + Error::unexpected_column_type(format!( + "{field_name}: map values are not a supported string type" + )) + })?; + let start = self.offsets()[row_index] as usize; + let end = self.offsets()[row_index + 1] as usize; + Ok(Some(MapItem::new(keys, values, start..end))) + } +} + +/// Validates row index and returns physical index into the values array. +/// +/// Per Arrow spec, REE parent array has no validity bitmap (null_count = 0). +/// Nulls are encoded in the values child array, so null checking must be done +/// on the values array in each get_* method, not here on the parent array. +fn validate_and_get_physical_index( + run_array: &RunArray, + row_index: usize, + field_name: &str, +) -> DeltaResult { + if row_index >= run_array.len() { + return Err(Error::generic(format!( + "Row index {row_index} out of bounds for field '{field_name}'" + ))); + } + + let physical_idx = run_array.run_ends().get_physical_index(row_index); + Ok(physical_idx) +} + +/// Implement GetData for RunArray directly, so we can return it as a trait object +/// without needing a wrapper struct or Box::leak. 
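// --- Illustrative aside (not part of this diff) ---
// A minimal sketch of the run-end-encoding point made above, assuming the same
// arrow RunArray APIs used in this file; the fn name and literal values are
// hypothetical. The logical array ["a", "a", null, null, null, "b"] is stored as
// three runs; the parent carries no validity bitmap, and nullness is discovered by
// mapping the logical row to a physical index into the values child.
fn _sketch_run_end_encoded_nulls() {
    use crate::arrow::array::types::Int32Type;
    use crate::arrow::array::{Array as _, Int32Array, RunArray, StringArray};

    let run_ends = Int32Array::from(vec![2, 5, 6]);
    let values = StringArray::from(vec![Some("a"), None, Some("b")]);
    let ree: RunArray<Int32Type> = RunArray::try_new(&run_ends, &values).unwrap();
    assert_eq!(ree.null_count(), 0); // no validity bitmap on the parent array
    let physical = ree.run_ends().get_physical_index(3); // logical row 3 -> run 1
    assert!(values.is_null(physical)); // the null lives in the values child
}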
+/// +/// This implementation supports multiple value types (strings, integers, booleans, etc.) +/// by runtime downcasting of the values array. +impl<'a> GetData<'a> for RunArray { + fn get_str(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + let physical_idx = validate_and_get_physical_index(self, row_index, field_name)?; + let values = self + .values() + .as_any() + .downcast_ref::>>() + .ok_or_else(|| { + Error::generic(format!( + "Expected StringArray values in RunArray, got {:?}", + self.values().data_type() + )) + })?; + + Ok((!values.is_null(physical_idx)).then(|| values.value(physical_idx))) + } + + fn get_int(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + let physical_idx = validate_and_get_physical_index(self, row_index, field_name)?; + let values = self + .values() + .as_primitive_opt::() + .ok_or_else(|| { + Error::generic(format!( + "Expected Int32Array values in RunArray, got {:?}", + self.values().data_type() + )) + })?; + + Ok((!values.is_null(physical_idx)).then(|| values.value(physical_idx))) + } + + fn get_long(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + let physical_idx = validate_and_get_physical_index(self, row_index, field_name)?; + let values = self + .values() + .as_primitive_opt::() + .ok_or_else(|| { + Error::generic(format!( + "Expected Int64Array values in RunArray, got {:?}", + self.values().data_type() + )) + })?; + + Ok((!values.is_null(physical_idx)).then(|| values.value(physical_idx))) + } + + fn get_bool(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + let physical_idx = validate_and_get_physical_index(self, row_index, field_name)?; + let values = self.values().as_boolean_opt().ok_or_else(|| { + Error::generic(format!( + "Expected BooleanArray values in RunArray, got {:?}", + self.values().data_type() + )) + })?; + + Ok((!values.is_null(physical_idx)).then(|| values.value(physical_idx))) + } + + fn get_binary(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + let physical_idx = validate_and_get_physical_index(self, row_index, field_name)?; + let values = self + .values() + .as_any() + .downcast_ref::>>() + .ok_or_else(|| { + Error::generic(format!( + "Expected BinaryArray values in RunArray, got {:?}", + self.values().data_type() + )) + })?; + + Ok((!values.is_null(physical_idx)).then(|| values.value(physical_idx))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::array::{ + BooleanArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeStringArray, PrimitiveArray, + }; + use crate::engine_data::GetData; + + // ========================================================================= + // Existing type tests (bool, int, long, str) + // ========================================================================= + + #[test] + fn test_get_bool() { + let array = BooleanArray::from(vec![Some(true), Some(false), None]); + assert_eq!(array.get_bool(0, "f").unwrap(), Some(true)); + assert_eq!(array.get_bool(1, "f").unwrap(), Some(false)); + assert_eq!(array.get_bool(2, "f").unwrap(), None); + } + + #[test] + fn test_get_int() { + let array = Int32Array::from(vec![Some(42), Some(-1), None]); + assert_eq!(array.get_int(0, "f").unwrap(), Some(42)); + assert_eq!(array.get_int(1, "f").unwrap(), Some(-1)); + assert_eq!(array.get_int(2, "f").unwrap(), None); + } + + #[test] + fn test_get_long() { + let array = Int64Array::from(vec![Some(i64::MAX), Some(i64::MIN), None]); + assert_eq!(array.get_long(0, "f").unwrap(), Some(i64::MAX)); 
+ assert_eq!(array.get_long(1, "f").unwrap(), Some(i64::MIN)); + assert_eq!(array.get_long(2, "f").unwrap(), None); + } + + // ========================================================================= + // New type tests (float, double, date, timestamp, decimal) + // ========================================================================= + + #[test] + fn test_get_float() { + let array = Float32Array::from(vec![Some(1.5f32), Some(-0.0), None]); + assert_eq!(array.get_float(0, "f").unwrap(), Some(1.5f32)); + assert_eq!(array.get_float(1, "f").unwrap(), Some(-0.0f32)); + assert_eq!(array.get_float(2, "f").unwrap(), None); + } + + #[test] + fn test_get_float_special_values() { + let array = Float32Array::from(vec![ + Some(f32::NAN), + Some(f32::INFINITY), + Some(f32::NEG_INFINITY), + ]); + assert!(array.get_float(0, "f").unwrap().unwrap().is_nan()); + assert_eq!(array.get_float(1, "f").unwrap(), Some(f32::INFINITY)); + assert_eq!(array.get_float(2, "f").unwrap(), Some(f32::NEG_INFINITY)); + } + + #[test] + fn test_get_double() { + let array = Float64Array::from(vec![Some(1.23), Some(-4.56), None]); + assert_eq!(array.get_double(0, "f").unwrap(), Some(1.23)); + assert_eq!(array.get_double(1, "f").unwrap(), Some(-4.56)); + assert_eq!(array.get_double(2, "f").unwrap(), None); + } + + #[test] + fn test_get_double_special_values() { + let array = Float64Array::from(vec![ + Some(f64::NAN), + Some(f64::INFINITY), + Some(f64::NEG_INFINITY), + ]); + assert!(array.get_double(0, "f").unwrap().unwrap().is_nan()); + assert_eq!(array.get_double(1, "f").unwrap(), Some(f64::INFINITY)); + assert_eq!(array.get_double(2, "f").unwrap(), Some(f64::NEG_INFINITY)); + } + + #[test] + fn test_get_date() { + // Date32 stores days since epoch + let array = PrimitiveArray::::from(vec![Some(0), Some(19000), None]); + assert_eq!(array.get_date(0, "f").unwrap(), Some(0)); + assert_eq!(array.get_date(1, "f").unwrap(), Some(19000)); + assert_eq!(array.get_date(2, "f").unwrap(), None); + } + + #[test] + fn test_get_timestamp() { + // TimestampMicrosecond stores microseconds since epoch + let array = PrimitiveArray::::from(vec![ + Some(1_000_000), + Some(-1_000_000), + None, + ]); + assert_eq!(array.get_timestamp(0, "f").unwrap(), Some(1_000_000)); + assert_eq!(array.get_timestamp(1, "f").unwrap(), Some(-1_000_000)); + assert_eq!(array.get_timestamp(2, "f").unwrap(), None); + } + + #[test] + fn test_get_decimal() { + // Decimal128 stores as i128 + let array = + PrimitiveArray::::from(vec![Some(12345_i128), Some(-99999_i128), None]); + assert_eq!(array.get_decimal(0, "f").unwrap(), Some(12345)); + assert_eq!(array.get_decimal(1, "f").unwrap(), Some(-99999)); + assert_eq!(array.get_decimal(2, "f").unwrap(), None); + } + + // ========================================================================= + // Alternative Arrow representations for STRING and BINARY: + // STRING -> LargeUtf8 (LargeStringArray), Utf8View (StringViewArray) + // BINARY -> LargeBinary (LargeBinaryArray), BinaryView (BinaryViewArray) + // ========================================================================= + + #[test] + fn test_get_str_large_string() { + let array = LargeStringArray::from(vec![Some("hello"), Some("world"), None]); + assert_eq!(array.get_str(0, "f").unwrap(), Some("hello")); + assert_eq!(array.get_str(1, "f").unwrap(), Some("world")); + assert_eq!(array.get_str(2, "f").unwrap(), None); + } + + #[test] + fn test_get_str_string_view() { + let array = StringViewArray::from(vec![Some("hello"), Some("world"), None]); + 
assert_eq!(array.get_str(0, "f").unwrap(), Some("hello")); + assert_eq!(array.get_str(1, "f").unwrap(), Some("world")); + assert_eq!(array.get_str(2, "f").unwrap(), None); + } + + #[test] + fn test_get_binary_large_binary() { + let array = LargeBinaryArray::from(vec![Some(b"abc" as &[u8]), Some(b"xyz"), None]); + assert_eq!(array.get_binary(0, "f").unwrap(), Some(b"abc" as &[u8])); + assert_eq!(array.get_binary(1, "f").unwrap(), Some(b"xyz" as &[u8])); + assert_eq!(array.get_binary(2, "f").unwrap(), None); + } + + #[test] + fn test_get_binary_binary_view() { + let array = BinaryViewArray::from(vec![Some(b"abc" as &[u8]), Some(b"xyz"), None]); + assert_eq!(array.get_binary(0, "f").unwrap(), Some(b"abc" as &[u8])); + assert_eq!(array.get_binary(1, "f").unwrap(), Some(b"xyz" as &[u8])); + assert_eq!(array.get_binary(2, "f").unwrap(), None); + } + + // ========================================================================= + // Wrong-type error tests: calling the wrong getter returns an error + // ========================================================================= + + #[test] + fn test_wrong_type_returns_error() { + let int_array = Int32Array::from(vec![Some(42)]); + + // Calling get_float on an Int32Array should error + assert!(int_array.get_float(0, "f").is_err()); + assert!(int_array.get_double(0, "f").is_err()); + assert!(int_array.get_long(0, "f").is_err()); + assert!(int_array.get_decimal(0, "f").is_err()); + + let float_array = Float32Array::from(vec![Some(1.0f32)]); + assert!(float_array.get_int(0, "f").is_err()); + assert!(float_array.get_double(0, "f").is_err()); } } diff --git a/kernel/src/engine/arrow_utils.rs b/kernel/src/engine/arrow_utils.rs index 21cb246f37..404defba8b 100644 --- a/kernel/src/engine/arrow_utils.rs +++ b/kernel/src/engine/arrow_utils.rs @@ -1,33 +1,30 @@ //! 
Some utilities for working with arrow data types -use std::collections::{HashMap, HashSet}; -use std::io::{BufRead, BufReader}; -use std::ops::Range; -use std::sync::{Arc, OnceLock}; - use crate::engine::arrow_conversion::{TryFromKernel as _, TryIntoArrow as _}; use crate::engine::ensure_data_types::DataTypeCompat; use crate::engine_data::FilteredEngineData; use crate::schema::{ColumnMetadataKey, MetadataValue}; use crate::{ - engine::arrow_data::{extract_record_batch, ArrowEngineData}, + engine::arrow_data::ArrowEngineData, schema::{DataType, MetadataColumnSpec, Schema, SchemaRef, StructField, StructType}, utils::require, DeltaResult, EngineData, Error, }; +use std::collections::{HashMap, HashSet}; +use std::ops::Range; +use std::sync::{Arc, OnceLock}; use crate::arrow::array::{ - cast::AsArray, make_array, new_null_array, Array as ArrowArray, BooleanArray, GenericListArray, - MapArray, OffsetSizeTrait, PrimitiveArray, RecordBatch, StringArray, StructArray, + cast::AsArray, make_array, new_null_array, Array as ArrowArray, GenericListArray, MapArray, + OffsetSizeTrait, PrimitiveArray, RecordBatch, StringArray, StructArray, }; use crate::arrow::buffer::NullBuffer; -use crate::arrow::compute::concat_batches; -use crate::arrow::compute::filter_record_batch; use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Fields as ArrowFields, Int64Type, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; -use crate::arrow::json::{LineDelimitedWriter, ReaderBuilder}; +use crate::arrow::json::writer::{make_encoder, LineDelimited, NullableEncoder}; +use crate::arrow::json::{Encoder, EncoderFactory, EncoderOptions, ReaderBuilder, WriterBuilder}; use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY; use crate::parquet::file::metadata::RowGroupMetaData; use crate::parquet::{arrow::ProjectionMask, schema::types::SchemaDescriptor}; @@ -111,7 +108,7 @@ pub(crate) struct RowIndexBuilder { impl RowIndexBuilder { pub(crate) fn new(row_groups: &[RowGroupMetaData]) -> Self { - let mut row_group_row_index_ranges = vec![]; + let mut row_group_row_index_ranges = Vec::with_capacity(row_groups.len()); let mut offset = 0; for row_group in row_groups { let num_rows = row_group.num_rows(); @@ -140,7 +137,7 @@ impl RowIndexBuilder { pub(crate) fn build(self) -> DeltaResult> { let starting_offsets = match self.row_group_ordinals { Some(ordinals) => { - let mut seen_ordinals = HashSet::new(); + let mut seen_ordinals = HashSet::with_capacity(ordinals.len()); ordinals .iter() .map(|&i| { @@ -168,19 +165,272 @@ impl RowIndexBuilder { /// ensure schema compatibility, as well as `fix_nested_null_masks` to ensure that leaf columns have /// accurate null masks that row visitors rely on for correctness. /// `row_indexes` are passed through to `reorder_struct_array`. -pub(crate) fn fixup_parquet_read( +/// `file_location` is used to populate file metadata columns if requested. +/// If `target_schema` is provided, coerces the batch's field nullability to match it. 
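// --- Illustrative aside (not part of this diff) ---
// A minimal sketch of what "coercing nullability" means for a flat schema: only the
// Field's nullable flag changes, the column data is untouched. The fn name and
// values are hypothetical; `coerce_batch_nullability` below generalizes this to
// nested structs, maps, and lists.
fn _sketch_coerce_nullability() {
    use crate::arrow::array::{ArrayRef, Int32Array, RecordBatch};
    use crate::arrow::datatypes::{DataType, Field, Schema};
    use std::sync::Arc;

    // Source batch: `x` is declared nullable but happens to contain no nulls.
    let src_schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, true)]));
    let column: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(src_schema, vec![column]).unwrap();

    // Target schema: same shape, but `x` is non-nullable.
    let target = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, false)]));
    let coerced = coerce_batch_nullability(batch, &target, None).unwrap();
    assert_eq!(coerced.schema(), target); // only the nullability flag changed
}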
+pub(crate) fn fixup_parquet_read( batch: RecordBatch, requested_ordering: &[ReorderIndex], row_indexes: Option<&mut FlattenedRangeIterator>, -) -> DeltaResult -where - StructArray: Into, -{ - let data = reorder_struct_array(batch.into(), requested_ordering, row_indexes)?; + file_location: Option<&str>, + target_schema: Option<&ArrowSchemaRef>, +) -> DeltaResult { + let data = reorder_struct_array(batch.into(), requested_ordering, row_indexes, file_location)?; let data = fix_nested_null_masks(data); + let data = if let Some(schema) = target_schema { + let batch = RecordBatch::from(data); + // Type mismatches are already handled by `reorder_struct_array` above, + // we don't do anything more strict here. + let allow_all = |_: &ArrowFieldRef, _: &ArrowFieldRef| Ok(()); + coerce_batch_nullability(batch, schema, Some(&allow_all))?.into() + } else { + data + }; Ok(data.into()) } +/// Coerces a [`RecordBatch`]'s field nullability to match a target Arrow schema. +/// +/// For example, given a source batch whose schema is: +/// +/// ```text +/// x: Int64 (nullable) +/// a: Struct (non-null) +/// ├── b: Utf8 (nullable) +/// └── c: Struct (nullable) +/// └── d: Int32 (non-null) +/// ``` +/// +/// and a target schema: +/// +/// ```text +/// x: Int64 (non-null) ← was nullable +/// a: Struct (nullable) ← was non-null +/// ├── b: Utf8 (non-null) ← was nullable +/// └── c: Struct (non-null) ← was nullable +/// └── d: Int32 (nullable) ← was non-null +/// ``` +/// +/// this function returns a new `RecordBatch` whose schema matches the target exactly — every +/// field's nullability flag (including deeply nested ones like `a.c.d`) is updated to match, +/// recursing into structs and maps. The underlying array data is unchanged; only the nullability flag is adjusted. +/// +/// **Complexity:** O(F) time and space where F is the total number of fields (including nested) +/// in the schema. The actual row data (Arrow buffers) is shared via `Arc` and never copied. +/// When the source schema already matches the target, the original batch is returned immediately. +/// +/// If `type_mismatch_validator` is provided, it is called when a source field's data type differs +/// from the target field's data type. It should return `Ok(())` to allow the mismatch or an error +/// to reject it. When `None`, any type mismatch is rejected with an internal error. +type TypeMismatchValidator<'a> = + Option<&'a dyn Fn(&ArrowFieldRef, &ArrowFieldRef) -> DeltaResult<()>>; + +pub(crate) fn coerce_batch_nullability( + batch: RecordBatch, + target_schema: &ArrowSchemaRef, + type_mismatch_validator: TypeMismatchValidator<'_>, +) -> DeltaResult { + if *batch.schema() == **target_schema { + return Ok(batch); + } + + fn coerce_struct( + src_column: &Arc, + src_children: &ArrowFields, + target_children: &ArrowFields, + type_mismatch_validator: TypeMismatchValidator<'_>, + ) -> DeltaResult> { + let src_struct = src_column + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::generic("expected Struct array during nullability coercion"))?; + let (coerced_columns, coerced_fields): (Vec>, Vec) = + src_struct + .columns() + .iter() + .zip(src_children.iter()) + .zip(target_children.iter()) + .map(|((src_child_col, src_child), target_child)| { + coerce( + src_child_col.clone(), + src_child, + target_child, + type_mismatch_validator, + ) + }) + .collect::>>()? 
+ .into_iter() + .unzip(); + Ok(Arc::new(StructArray::try_new( + coerced_fields.into(), + coerced_columns, + src_struct.nulls().cloned(), + )?)) + } + + // Map type: recurse into entries struct to fix nested nullability + fn coerce_map( + src_column: &Arc, + src_entries_field: &ArrowFieldRef, + target_entries_field: &ArrowFieldRef, + type_mismatch_validator: TypeMismatchValidator<'_>, + ) -> DeltaResult> { + let src_map = src_column + .as_any() + .downcast_ref::() + .ok_or_else(|| Error::generic("expected Map array during nullability coercion"))?; + // Discard the source entries field; the recursive `coerce` call below + // produces `coerced_entries_field` with the target's nullability applied. + let (_, src_offsets, src_entries, src_nulls, src_ordered) = src_map.clone().into_parts(); + let (coerced_entries_col, coerced_entries_field) = coerce( + Arc::new(src_entries), + src_entries_field, + target_entries_field, + type_mismatch_validator, + )?; + let coerced_entries = coerced_entries_col + .as_any() + .downcast_ref::() + .ok_or_else(|| { + Error::generic("expected Struct array for Map entries during nullability coercion") + })? + .clone(); + Ok(Arc::new(MapArray::try_new( + coerced_entries_field, + src_offsets, + coerced_entries, + src_nulls, + src_ordered, + )?)) + } + + // List type: recurse into element to fix nested nullability + fn coerce_list( + src_column: &Arc, + src_element: &ArrowFieldRef, + target_element: &ArrowFieldRef, + type_mismatch_validator: TypeMismatchValidator<'_>, + ) -> DeltaResult> { + let src_list = src_column + .as_any() + .downcast_ref::>() + .ok_or_else(|| Error::generic("expected List array during nullability coercion"))?; + // Discard the source element field; the recursive `coerce` call below + // produces `coerced_element_field` with the target's nullability applied. + let (_, src_offsets, src_values, src_nulls) = src_list.clone().into_parts(); + let (coerced_values, coerced_element_field) = coerce( + src_values, + src_element, + target_element, + type_mismatch_validator, + )?; + Ok(Arc::new(GenericListArray::::try_new( + coerced_element_field, + src_offsets, + coerced_values, + src_nulls, + )?)) + } + + // Recursively coerces nullability for a column+field pair. For struct columns, recurses + // into children; for leaf columns, just adjusts the field's nullability flag. + fn coerce( + src_column: Arc, + src_field: &ArrowFieldRef, + target_field: &ArrowFieldRef, + type_mismatch_validator: TypeMismatchValidator<'_>, + ) -> DeltaResult<(Arc, ArrowFieldRef)> { + let coerced_array: Arc = + match (src_column.data_type(), target_field.data_type()) { + (ArrowDataType::Struct(src_children), ArrowDataType::Struct(target_children)) + if src_children != target_children => + { + coerce_struct( + &src_column, + src_children, + target_children, + type_mismatch_validator, + )? + } + ( + ArrowDataType::Map(src_entries_field, _), + ArrowDataType::Map(target_entries_field, _), + ) if src_entries_field != target_entries_field => coerce_map( + &src_column, + src_entries_field, + target_entries_field, + type_mismatch_validator, + )?, + (ArrowDataType::List(src_element), ArrowDataType::List(target_element)) + if src_element != target_element => + { + coerce_list( + &src_column, + src_element, + target_element, + type_mismatch_validator, + )? + } + + (src_type, target_type) => { + if src_type != target_type { + // Delegate to the caller-provided validator if present + // (some callers tolerate certain mismatches), otherwise reject. 
+ if let Some(validator) = type_mismatch_validator { + validator(src_field, target_field)?; + } else { + return Err(Error::internal_error(format!( + "data type mismatch for field '{}': \ + source has {src_type:?} but target has {target_type:?}", + src_field.name(), + ))); + } + } + let coerced_field = if src_field.is_nullable() == target_field.is_nullable() { + src_field.clone() + } else { + Arc::new( + src_field + .as_ref() + .clone() + .with_nullable(target_field.is_nullable()), + ) + }; + return Ok((src_column, coerced_field)); + } + }; + let coerced_field = Arc::new( + src_field + .as_ref() + .clone() + .with_data_type(coerced_array.data_type().clone()) + .with_nullable(target_field.is_nullable()), + ); + Ok((coerced_array, coerced_field)) + } + + let batch_schema = batch.schema(); + let batch_struct_array = StructArray::from(batch); + let (src_fields, src_columns, _) = batch_struct_array.into_parts(); + + let (coerced_columns, coerced_fields): (Vec>, Vec) = + src_columns + .into_iter() + .zip(src_fields.iter()) + .zip(target_schema.fields().iter()) + .map(|((src_column, src_field), target_field)| { + coerce(src_column, src_field, target_field, type_mismatch_validator) + }) + .collect::>>()? + .into_iter() + .unzip(); + + let coerced_schema = Arc::new(ArrowSchema::new_with_metadata( + coerced_fields, + batch_schema.metadata().clone(), + )); + Ok(RecordBatch::try_new(coerced_schema, coerced_columns)?) +} + /* * The code below implements proper pruning of columns when reading parquet, reordering of columns to * match the specified schema, and insertion of null columns if the requested schema includes a @@ -306,6 +556,8 @@ pub(crate) enum ReorderIndexTransform { Missing(ArrowFieldRef), /// Row index column requested, compute it RowIndex(ArrowFieldRef), + /// File path column requested, populate with file path + FilePath(ArrowFieldRef), } impl ReorderIndex { @@ -333,14 +585,19 @@ impl ReorderIndex { ReorderIndex::new(index, ReorderIndexTransform::RowIndex(field)) } + fn file_path(index: usize, field: ArrowFieldRef) -> Self { + ReorderIndex::new(index, ReorderIndexTransform::FilePath(field)) + } + /// Check if this reordering requires a transformation anywhere. See comment below on /// [`ordering_needs_transform`] to understand why this is needed. fn needs_transform(&self) -> bool { match self.transform { - // if we're casting, inserting null, or generating row index, we need to transform + // if we're casting, inserting null, or generating row index/file path, we need to transform ReorderIndexTransform::Cast(_) | ReorderIndexTransform::Missing(_) - | ReorderIndexTransform::RowIndex(_) => true, + | ReorderIndexTransform::RowIndex(_) + | ReorderIndexTransform::FilePath(_) => true, // if our nested ordering needs a transform, we need a transform ReorderIndexTransform::Nested(ref children) => ordering_needs_transform(children), // no transform needed @@ -408,6 +665,10 @@ fn get_indices( ) -> DeltaResult<(usize, Vec)> { let mut found_fields = HashSet::with_capacity(requested_schema.num_fields()); let mut reorder_indices = Vec::with_capacity(requested_schema.num_fields()); + // Missing entries for structs found in parquet but with no selected leaves. These must + // be appended after all input-consuming entries (Identity/Nested/Cast) because + // `reorder_struct_array` uses vec position as the index into the parquet reader output. 
+ let mut deferred_missing = Vec::new(); let mut parquet_offset = start_parquet_offset; // for each field, get its position in the parquet (via enumerate), a reference to the arrow // field, and info about where it appears in the requested_schema, or None if the field is not @@ -439,6 +700,7 @@ fn get_indices( if let DataType::Struct(ref requested_schema) | DataType::Variant(ref requested_schema) = requested_field.data_type { + let mask_before = mask_indices.len(); let (parquet_advance, children) = get_indices( parquet_index + parquet_offset, requested_schema.as_ref(), @@ -447,12 +709,27 @@ fn get_indices( )?; // advance the number of parquet fields, but subtract 1 because the // struct will be counted by the `enumerate` call but doesn't count as - // an actual index. - parquet_offset += parquet_advance - 1; - // note that we found this field + // an actual index. Use saturating_sub to handle empty structs (0 fields). + parquet_offset += parquet_advance.saturating_sub(1); + // If no leaf columns were selected (mask unchanged), the parquet + // reader will omit this struct entirely. We cannot create a Nested + // entry because it would index into a column that doesn't exist. + // The recursive call is still needed for the correct + // `parquet_advance` value. found_fields.insert(requested_field.name()); - // push the child reorder on - reorder_indices.push(ReorderIndex::nested(index, children)); + if mask_indices.len() > mask_before { + reorder_indices.push(ReorderIndex::nested(index, children)); + } else { + // The recursive call resolved all children (as nullable/missing + // or the struct is empty), but no parquet leaves were selected. + // Defer the Missing entry so it appears after all entries that + // consume parquet input columns. + debug_assert_eq!(children.len(), requested_schema.num_fields()); + deferred_missing.push(ReorderIndex::missing( + index, + Arc::new(requested_field.try_into_arrow()?), + )); + } } else { return Err(Error::unexpected_column_type(field.name())); } @@ -468,6 +745,7 @@ fn get_indices( array_type.element_type.clone(), array_type.contains_null, )]); + let mask_before = mask_indices.len(); let (parquet_advance, mut children) = get_indices( parquet_index + parquet_offset, &requested_schema, @@ -477,17 +755,24 @@ fn get_indices( // see comment above in struct match arm parquet_offset += parquet_advance - 1; found_fields.insert(requested_field.name()); - if children.len() != 1 { + if mask_indices.len() <= mask_before { + // No leaves selected inside this list. Defer a Missing entry. + deferred_missing.push(ReorderIndex::missing( + index, + Arc::new(requested_field.try_into_arrow()?), + )); + } else if children.len() != 1 { return Err(Error::generic( "List call should not have generated more than one reorder index", )); + } else { + // safety, checked that we have 1 element + let mut children = children.swap_remove(0); + // the index is wrong, as it's the index from the inner schema. + // Adjust it to be our index + children.index = index; + reorder_indices.push(children); } - // safety, checked that we have 1 element - let mut children = children.swap_remove(0); - // the index is wrong, as it's the index from the inner schema. 
Adjust - // it to be our index - children.index = index; - reorder_indices.push(children); } else { return Err(Error::unexpected_column_type(list_field.name())); } @@ -507,6 +792,7 @@ fn get_indices( return Err(Error::generic("map fields had more than 2 members")); } let inner_schema = map_type.as_struct_schema(key_name, val_name); + let mask_before = mask_indices.len(); let (parquet_advance, mut children) = get_indices( parquet_index + parquet_offset, &inner_schema, @@ -518,29 +804,34 @@ fn get_indices( // map will be counted by the `enumerate` call but doesn't count as // an actual index. parquet_offset += parquet_advance - 1; - // note that we found this field found_fields.insert(requested_field.name()); - - if children.len() != 2 { + if mask_indices.len() <= mask_before { + // No leaves selected inside this map. Defer a Missing entry. + deferred_missing.push(ReorderIndex::missing( + index, + Arc::new(requested_field.try_into_arrow()?), + )); + } else if children.len() != 2 { return Err(Error::generic( "Map call should have generated exactly two reorder indices", )); + } else { + // vec indexing is safe, we checked len above + let mut num_identity_transforms = 0; + if !children[0].needs_transform() { + children[0] = ReorderIndex::identity(0); + num_identity_transforms += 1; + } + if !children[1].needs_transform() { + children[1] = ReorderIndex::identity(1); + num_identity_transforms += 1; + } + let transform = match num_identity_transforms { + 2 => ReorderIndex::identity(index), + _ => ReorderIndex::nested(index, children), + }; + reorder_indices.push(transform); } - // vec indexing is safe, we checked len above - let mut num_identity_transforms = 0; - if !children[0].needs_transform() { - children[0] = ReorderIndex::identity(0); - num_identity_transforms += 1; - } - if !children[1].needs_transform() { - children[1] = ReorderIndex::identity(1); - num_identity_transforms += 1; - } - let transform = match num_identity_transforms { - 2 => ReorderIndex::identity(index), - _ => ReorderIndex::nested(index, children), - }; - reorder_indices.push(transform); } _ => { return Err(Error::unexpected_column_type(field.name())); @@ -548,14 +839,13 @@ fn get_indices( } } _ => { - // we don't care about matching on nullability or metadata here so pass `false` - // as the final argument. These can differ between the delta schema and the - // parquet schema without causing issues in reading the data. We fix them up in - // expression evaluation later. + // We don't care about matching on nullability or metadata here. These can + // differ between the delta schema and the parquet schema without causing + // issues in reading the data. We fix them up in expression evaluation later. match super::ensure_data_types::ensure_data_types( &requested_field.data_type, field.data_type(), - false, + super::ensure_data_types::ValidationMode::TypesAndNames, )? { DataTypeCompat::Identical => { reorder_indices.push(ReorderIndex::identity(index)) @@ -579,10 +869,13 @@ fn get_indices( debug!("Skipping over un-selected field: {}", field.name()); // offset by number of inner fields. subtract one, because the enumerate still // counts this logical "parent" field - parquet_offset += count_cols(field) - 1; + parquet_offset += count_cols(field).saturating_sub(1); } } + // Append deferred Missing entries after all input-consuming entries from the main loop. 
+ reorder_indices.extend(deferred_missing); + if found_fields.len() != requested_schema.num_fields() { // some fields are missing, but they might be nullable or metadata columns, need to insert them into the reorder_indices for (requested_position, field) in requested_schema.fields().enumerate() { @@ -595,6 +888,13 @@ fn get_indices( Arc::new(field.try_into_arrow()?), )); } + Some(MetadataColumnSpec::FilePath) => { + debug!("Inserting a file path column: {}", field.name()); + reorder_indices.push(ReorderIndex::file_path( + requested_position, + Arc::new(field.try_into_arrow()?), + )); + } Some(metadata_spec) => { return Err(Error::Generic(format!( "Metadata column {metadata_spec:?} is not supported by the default parquet reader" @@ -760,15 +1060,34 @@ pub(crate) fn ordering_needs_row_indexes(requested_ordering: &[ReorderIndex]) -> // of this type and then set elements of the Vec to Some(FieldArrayOpt) for each column type FieldArrayOpt = Option<(Arc, Arc)>; +/// Creates an array for a missing field. For non-nullable structs, produces a non-null struct +/// (no null buffer) with recursively missing children, preserving the non-null constraint at +/// every level. For all other types (or nullable structs), produces an all-null array. +fn new_missing_array(field: &ArrowField, num_rows: usize) -> Arc { + match (field.is_nullable(), field.data_type()) { + (false, ArrowDataType::Struct(child_fields)) => { + let child_arrays: Vec> = child_fields + .iter() + .map(|f| new_missing_array(f, num_rows)) + .collect(); + Arc::new(StructArray::new(child_fields.clone(), child_arrays, None)) + } + _ => new_null_array(field.data_type(), num_rows), + } +} + /// Reorder a RecordBatch to match `requested_ordering`. For each non-zero value in /// `requested_ordering`, the column at that index will be added in order to the returned batch. /// /// If the requested ordering contains a [`ReorderIndexTransform::RowIndex`], `row_indexes` /// must not be `None` to append a row index column to the output. +/// If the requested ordering contains a [`ReorderIndexTransform::FilePath`], `file_location` +/// must not be `None` to append a file path column to the output. 
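// --- Illustrative aside (not part of this diff) ---
// A sketch of a requested ordering that keeps a single parquet-backed column at
// output position 0 and appends a synthesized file path column at position 1.
// The field name "file_path" and the locals are hypothetical; entries that consume
// no input (Missing/RowIndex/FilePath) must come after the input-consuming ones,
// since vec position doubles as the index into the reader's output columns.
//
//     let ordering = [
//         ReorderIndex::identity(0),
//         ReorderIndex::file_path(
//             1,
//             Arc::new(ArrowField::new("file_path", ArrowDataType::Utf8, false)),
//         ),
//     ];
//     let reordered = reorder_struct_array(
//         one_column_struct,                            // StructArray from the reader
//         &ordering,
//         None,                                         // no row index column requested
//         Some("s3://bucket/table/part-0001.parquet"),  // required because FilePath is present
//     )?;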
pub(crate) fn reorder_struct_array( input_data: StructArray, requested_ordering: &[ReorderIndex], mut row_indexes: Option<&mut FlattenedRangeIterator>, + file_location: Option<&str>, ) -> DeltaResult { debug!("Reordering {input_data:?} with ordering: {requested_ordering:?}"); if !ordering_needs_transform(requested_ordering) { @@ -806,6 +1125,7 @@ pub(crate) fn reorder_struct_array( struct_array, children, None, // Nested structures don't need row indexes since metadata columns can't be nested + None, // No file_location passed since metadata columns can't be nested )?); // create the new field specifying the correct order for the struct let new_field = Arc::new(ArrowField::new_struct( @@ -845,9 +1165,8 @@ pub(crate) fn reorder_struct_array( )); } ReorderIndexTransform::Missing(field) => { - let null_array = Arc::new(new_null_array(field.data_type(), num_rows)); - let field = field.clone(); // cheap Arc clone - final_fields_cols[reorder_index.index] = Some((field, null_array)); + let array = new_missing_array(field, num_rows); + final_fields_cols[reorder_index.index] = Some((field.clone(), array)); } ReorderIndexTransform::RowIndex(field) => { let Some(ref mut row_index_iter) = row_indexes else { @@ -866,6 +1185,16 @@ pub(crate) fn reorder_struct_array( final_fields_cols[reorder_index.index] = Some((Arc::clone(field), Arc::new(row_index_array))); } + ReorderIndexTransform::FilePath(field) => { + let Some(file_path) = file_location else { + return Err(Error::generic( + "File path column requested but file location not provided", + )); + }; + let file_path_array = StringArray::from(vec![file_path; num_rows]); + final_fields_cols[reorder_index.index] = + Some((Arc::clone(field), Arc::new(file_path_array))); + } } } let num_cols = final_fields_cols.len(); @@ -895,6 +1224,7 @@ fn reorder_list( struct_array, children, None, // Nested structures don't need row indexes since metadata columns can't be nested + None, // No file_location passed since metadata columns can't be nested )?); let new_list_field = Arc::new(ArrowField::new_struct( list_field.name(), @@ -930,6 +1260,7 @@ fn reorder_map( struct_array, children, None, // Nested structures don't need row indexes since metadata columns can't be nested + None, // No file_location passed since metadata columns can't be nested )?; let result_fields = result_array.fields(); let new_map_field = Arc::new(ArrowField::new_struct( @@ -1004,8 +1335,10 @@ fn compute_nested_null_masks(sa: StructArray, parent_nulls: Option<&NullBuffer>) unsafe { StructArray::new_unchecked(fields, columns, nulls) } } -/// Arrow lacks the functionality to json-parse a string column into a struct column -- even tho the -/// JSON file reader does exactly the same thing. This function is a hack to work around that gap. +/// Arrow lacks the functionality to json-parse a string column into a struct column, so we +/// implement it here. This method is for json-parsing each string in a column of strings (add.stats +/// to be specific) to produce a nested column of strongly typed values. We require that N rows in +/// means N rows out. #[internal_api] pub(crate) fn parse_json( json_strings: Box, @@ -1025,53 +1358,110 @@ pub(crate) fn parse_json( } // Raw arrow implementation of the json parsing. Separate from the public function for testing. -// -// NOTE: This code is really inefficient because arrow lacks the native capability to perform robust -// StringArray -> StructArray JSON parsing. See https://github.com/apache/arrow-rs/issues/6522. 
If - that shortcoming gets fixed upstream, this method can simplify or hopefully even disappear. -fn parse_json_impl(json_strings: &StringArray, schema: ArrowSchemaRef) -> DeltaResult<RecordBatch> { +// Also used by ParseJson expression evaluation. +pub(crate) fn parse_json_impl( + json_strings: &StringArray, + schema: ArrowSchemaRef, +) -> DeltaResult<RecordBatch> { if json_strings.is_empty() { return Ok(RecordBatch::new_empty(schema)); } - // Use batch size of 1 to force one record per string input let mut decoder = ReaderBuilder::new(schema.clone()) - .with_batch_size(1) + .with_batch_size(json_strings.len()) + .with_coerce_primitive(true) .build_decoder()?; - let parse_one = |json_string: Option<&str>| -> DeltaResult<RecordBatch> { - let mut reader = BufReader::new(json_string.unwrap_or("{}").as_bytes()); - // loop to fill + empty the buffer until end of input. note that we can't just one-shot - // attempt to decode the entire thing since the buffer might only contain part of the JSON. - // see: https://github.com/delta-io/delta-kernel-rs/pull/1244 - loop { - let buf = reader.fill_buf()?; - if buf.is_empty() { - break; - } - // from `decode` docs: - // > Read JSON objects from `buf`, returning the number of bytes read - // > This method returns once `batch_size` objects have been parsed since the last call - // > to [`Self::flush`], or `buf` is exhausted. Any remaining bytes should be included - // > in the next call to [`Self::decode`] - // - // if we attempt a `parse_one` of e.g. "{}{}", we will parse the first "{}" successfully - // then decode will always return immediately sinee we have read `batch_size = 1`, - // leading to an infinite loop. Since we always just want to parse one record here, we - // detect this by checking if we always consume the entire buffer, and error if not. - let consumed = decoder.decode(buf)?; - if consumed != buf.len() { - return Err(Error::generic("Malformed JSON: Multiple JSON objects")); + + for (json, row_number) in json_strings.iter().zip(1..) { + let line = json.unwrap_or("{}"); + let consumed = decoder.decode(line.as_bytes())?; + // did we fail to decode the whole line, or was the line partial + if consumed != line.len() || decoder.has_partial_record() { + return Err(Error::Generic(format!( + "Malformed JSON: Multiple, partial, or 0 JSON objects on row {row_number}" + ))); + } + // did we decode exactly one record + if decoder.len() != row_number { + return Err(Error::Generic(format!( + "Malformed JSON: Multiple, partial, or 0 JSON objects on row {row_number}" + ))); + } + } + // Get the final batch out + if let Some(batch) = decoder.flush()? { + if batch.num_rows() != json_strings.len() { + return Err(Error::Generic(format!( + "Unexpected number of rows decoded. Got {}, expected {}", + batch.num_rows(), + json_strings.len() + ))); + } + return Ok(batch); + } + Err(Error::generic( + "Malformed JSON: exited parse_json_impl without deserializing anything useful", + )) +} + +pub(crate) fn filter_to_record_batch( + filtered_data: FilteredEngineData, +) -> DeltaResult<RecordBatch> { + let filtered = filtered_data.apply_selection_vector()?; + let arrow_data = ArrowEngineData::try_from_engine_data(filtered)?; + Ok((*arrow_data).into()) +} + +// we want to keep nulls in our partition map, so we end up with data in the log like: +// {partitionValues:{"foo": null}}, which is what is generally expected.
Without this we would +// get: {partitionValues:{}} +struct NullValueMapEncoder<'a> { + field: &'a ArrowFieldRef, + array: &'a MapArray, +} + +impl<'a> Encoder for NullValueMapEncoder<'a> { + fn encode(&mut self, idx: usize, out: &mut Vec) { + let options = EncoderOptions::default().with_explicit_nulls(true); + // this unwrap is technically unsafe, but we _know_ that the array is a MapArray, and that + // `make_encoder` won't return an error for that. It would still be nice if we could return + // a `Result`, but we cannot + #[allow(clippy::unwrap_used)] + let mut encoder = make_encoder(self.field, self.array, &options).unwrap(); + encoder.encode(idx, out); + } +} + +/// This is a special encoder factory that will use the default encoder for all array types except +/// MapArrays. For MapArrays, it will make a `NullValueMapEncoder` which encodes the map preserving +/// keys that have null values. +#[derive(Debug)] +struct NullValueMapEncoderFactory; + +impl EncoderFactory for NullValueMapEncoderFactory { + fn make_default_encoder<'a>( + &self, + field: &'a ArrowFieldRef, + array: &'a dyn ArrowArray, + _options: &'a EncoderOptions, + ) -> Result>, crate::arrow::error::ArrowError> { + // It would be tempting to use `make_encoder` below, but we can't because we have to create + // a new `EncoderOptions` in order to set `with_explicit_nulls`. Then the lifetime of the + // created encoder becomes tied to the lifetime of the `EncoderOptions`, and we cannot + // return it from this method as the options would be freed here. We _also_ can't put the + // options inside the NullValueMapEncoderFactory, because this method takes `&self` not + // `&'a self`, and we can't change that as it's part of the trait definition. + match array.data_type() { + ArrowDataType::Map(_, _) => { + let array = array.as_map(); + let encoder = NullValueMapEncoder { field, array }; + let array_encoder = Box::new(encoder) as Box; + let nulls = array.nulls().cloned(); + Ok(Some(NullableEncoder::new(array_encoder, nulls))) } - reader.consume(consumed); + _ => Ok(None), } - let Some(batch) = decoder.flush()? else { - return Err(Error::missing_data("Expected data")); - }; - require!(batch.num_rows() == 1, Error::generic("Expected one row")); - Ok(batch) - }; - let output: Vec<_> = json_strings.iter().map(parse_one).try_collect()?; - Ok(concat_batches(&schema, output.iter())?) + } } /// serialize an arrow RecordBatch to a JSON string by appending to a buffer. @@ -1080,40 +1470,91 @@ fn parse_json_impl(json_strings: &StringArray, schema: ArrowSchemaRef) -> DeltaR pub(crate) fn to_json_bytes( data: impl Iterator> + Send, ) -> DeltaResult> { - let mut writer = LineDelimitedWriter::new(Vec::new()); + let builder = WriterBuilder::new().with_encoder_factory(Arc::new(NullValueMapEncoderFactory)); + let mut writer = builder.build::<_, LineDelimited>(Vec::new()); for chunk in data { - let filtered_data = chunk?; - // Honor the new contract: if selection vector is shorter than the number of rows, - // then all rows not covered by the selection vector are assumed to be selected - let (underlying_data, mut selection_vector) = filtered_data.into_parts(); - let batch = extract_record_batch(&*underlying_data)?; - let num_rows = batch.num_rows(); - - if selection_vector.is_empty() { - // If selection vector is empty, write all rows per contract. 
- writer.write(batch)?; - } else { - // Extend the selection vector with `true` for uncovered rows - if selection_vector.len() < num_rows { - selection_vector.resize(num_rows, true); - } - - let filtered_batch = filter_record_batch(batch, &BooleanArray::from(selection_vector)) - .map_err(|e| Error::generic(format!("Failed to filter record batch: {e}")))?; - writer.write(&filtered_batch)? - }; + let batch = filter_to_record_batch(chunk?)?; + writer.write(&batch)?; } writer.finish()?; Ok(writer.into_inner()) } +/// Applies post-processing to data read from a JSON file. Inserts synthesized metadata columns +/// (e.g. [`MetadataColumnSpec::FilePath`]) at the positions specified by `reorder_indices`. +/// +/// `reorder_indices` should be built once per schema via [`build_json_reorder_indices`] and +/// reused for every batch from the same file. +pub(crate) fn fixup_json_read( + batch: RecordBatch, + reorder_indices: &[ReorderIndex], + file_location: &str, +) -> DeltaResult { + let data = reorder_struct_array(batch.into(), reorder_indices, None, Some(file_location))?; + Ok(data.into()) +} + +/// Builds the [`ReorderIndex`] vec for post-processing JSON read batches. +/// +/// The JSON reader is given a schema with metadata columns stripped (see [`json_arrow_schema`]). +/// Its output therefore has non-metadata columns at contiguous indices 0..N in schema order. +/// This function maps those source indices — and any metadata column specs — into a +/// `Vec` that [`reorder_struct_array`] can use to produce the final batch with +/// every column at its correct position. +/// +/// Build the index vec once per schema (e.g. once per file); apply it to every batch produced +/// by the reader via [`reorder_struct_array`]. +/// +/// # Companion function +/// - Use [`json_arrow_schema`] to strip metadata columns before passing the schema to the JSON +/// reader. +pub(crate) fn build_json_reorder_indices(schema: &StructType) -> DeltaResult> { + // Real columns: position in reorder_indices IS the source column index (0..N in schema + // order), and reorder_index.index carries the output position. + let mut reorder_indices = Vec::with_capacity(schema.num_fields()); + // Metadata columns are appended after all real columns. reorder_struct_array never reads + // source data for metadata transforms, so their vec position doesn't correspond to a source + // column. Unsupported specs use Missing (null fill); non-nullable violations surface + // naturally via StructArray::try_new. + let mut metadata_entries = Vec::new(); + + for (output_pos, field) in schema.fields().enumerate() { + match field.get_metadata_column_spec() { + None => reorder_indices.push(ReorderIndex::identity(output_pos)), + Some(spec) => metadata_entries.push((output_pos, field, spec)), + } + } + + for (output_pos, field, spec) in metadata_entries { + let field = Arc::new(field.try_into_arrow()?); + let rindex = match spec { + MetadataColumnSpec::FilePath => ReorderIndex::file_path(output_pos, field), + _ => ReorderIndex::missing(output_pos, field), + }; + reorder_indices.push(rindex); + } + + Ok(reorder_indices) +} + +/// Builds an Arrow [`ArrowSchema`] from `schema` containing only the "real" JSON columns, +/// omitting any fields annotated with [`MetadataColumnSpec`]. +/// +/// Pass the returned schema to Arrow's JSON reader; then call [`build_json_reorder_indices`] +/// once on the same schema and apply [`reorder_struct_array`] to each resulting batch to +/// insert the synthesized metadata columns at their correct positions. 
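// Editor's note: an illustrative sketch (not part of this change) of how the three helpers
// documented above are intended to compose when an engine reads a JSON file that requests
// metadata columns such as `_file`. The batch iterator is assumed to come from whatever JSON
// reader the engine drives with the stripped schema, and the function name is hypothetical.
fn example_read_json_with_metadata_columns(
    logical_schema: &StructType,
    file_location: &str,
    raw_batches: impl Iterator<Item = DeltaResult<RecordBatch>>,
) -> DeltaResult<Vec<RecordBatch>> {
    // 1. Strip metadata columns before handing the schema to the JSON reader
    //    (the reader itself is not shown here).
    let _reader_schema = json_arrow_schema(logical_schema)?;
    // 2. Build the reorder indices once per schema/file ...
    let reorder = build_json_reorder_indices(logical_schema)?;
    // 3. ... and apply them to every batch to splice the synthesized columns back in.
    raw_batches
        .map(|batch| fixup_json_read(batch?, &reorder, file_location))
        .collect()
}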
+pub(crate) fn json_arrow_schema(schema: &StructType) -> DeltaResult { + let json_fields = schema.with_fields_filtered(|f| f.get_metadata_column_spec().is_none())?; + Ok(ArrowSchema::try_from_kernel(&json_fields)?) +} + #[cfg(test)] mod tests { use std::sync::Arc; use crate::arrow::array::{ Array, ArrayRef as ArrowArrayRef, BooleanArray, GenericListArray, Int32Array, Int32Builder, - MapArray, MapBuilder, StructArray, StructBuilder, + Int64Array, MapArray, MapBuilder, StringArray, StringBuilder, StructArray, StructBuilder, }; use crate::arrow::datatypes::{ DataType as ArrowDataType, Field as ArrowField, Fields as ArrowFields, @@ -1123,9 +1564,10 @@ mod tests { array::AsArray, buffer::{OffsetBuffer, ScalarBuffer}, }; - + use crate::engine::arrow_conversion::TryIntoArrow; use crate::schema::{ - ArrayType, ColumnMetadataKey, DataType, MapType, MetadataValue, StructField, StructType, + ArrayType, ColumnMetadataKey, DataType, MapType, MetadataColumnSpec, MetadataValue, + StructField, StructType, }; use crate::table_features::ColumnMappingMode; use crate::utils::test_utils::assert_result_error_with_message; @@ -1162,8 +1604,15 @@ mod tests { } /// Generates the column mapping metadata for a logical struct field given the field id. - fn column_mapping_metadata(field_id: i64) -> HashMap { - kernel_fid_and_name(field_id, physical_name(field_id)) + /// Returns empty metadata for `None` mode, since no annotations should be present. + fn column_mapping_metadata( + field_id: i64, + mode: ColumnMappingMode, + ) -> HashMap { + match mode { + ColumnMappingMode::None => HashMap::new(), + _ => kernel_fid_and_name(field_id, physical_name(field_id)), + } } /// Generates metadata for a parquet field with id `field_id`. @@ -1225,6 +1674,21 @@ mod tests { #[test] fn test_json_parsing() { + static EXPECTED_JSON_ERR_STR: &str = "Generic delta kernel error: Malformed JSON: Multiple, partial, or 0 JSON objects on row"; + fn check_parse_fails( + input: Vec>, + schema: ArrowSchemaRef, + expected_start: &str, + ) { + let result = parse_json_impl(&input.into(), schema); + let err = result.expect_err("Expected an error"); + let msg = err.to_string(); + assert!( + msg.starts_with(expected_start), + "Error message was not what was expected" + ); + } + let requested_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("a", ArrowDataType::Int32, true), ArrowField::new("b", ArrowDataType::Utf8, true), @@ -1234,39 +1698,24 @@ mod tests { let result = parse_json_impl(&input.into(), requested_schema.clone()).unwrap(); assert_eq!(result.num_rows(), 0); - let input: Vec> = vec![Some("")]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - result.expect_err("empty string"); - - let input: Vec> = vec![Some(" \n\t")]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - result.expect_err("empty string"); - - let input: Vec> = vec![Some(r#""a""#)]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - result.expect_err("invalid string"); - - let input: Vec> = vec![Some(r#"{ "a": 1"#)]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - result.expect_err("incomplete object"); - - let input: Vec> = vec![Some("{}{}")]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - assert!(matches!( - result.unwrap_err(), - Error::Generic(s) if s == "Malformed JSON: Multiple JSON objects" - )); - - let input: Vec> = vec![Some(r#"{} { "a": 1"#)]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - 
assert!(matches!( - result.unwrap_err(), - Error::Generic(s) if s == "Malformed JSON: Multiple JSON objects" - )); + for input in [ + vec![Some("")], + vec![Some(" \n\t")], + vec![Some(r#"{ "a": 1"#)], + vec![Some("{}{}")], + vec![Some(r#"{} { "a": 1"#)], + vec![Some(r#"{} { "a": 1"#), Some("}")], + vec![Some(r#"{ "a": 1"#), Some(r#", "b": "b"}"#)], + ] { + check_parse_fails(input, requested_schema.clone(), EXPECTED_JSON_ERR_STR); + } - let input: Vec> = vec![Some(r#"{ "a": 1"#), Some(r#", "b"}"#)]; - let result = parse_json_impl(&input.into(), requested_schema.clone()); - result.expect_err("split object"); + // this one is an error from within the tape decoder, so has a different format + check_parse_fails( + vec![Some(r#""a""#)], + requested_schema.clone(), + "Json error: expected { got \"a\"", + ); let input: Vec> = vec![None, Some(r#"{"a": 1, "b": "2", "c": 3}"#), None]; let result = parse_json_impl(&input.into(), requested_schema.clone()).unwrap(); @@ -1294,18 +1743,43 @@ mod tests { assert_eq!(long_col.value(0), long_string); } + #[test] + fn test_parse_json_impl_propagates_type_errors() { + // Verify that parse_json_impl surfaces errors for values that don't match the schema, + // so the expression-level caller can catch them and return nulls. + + // Value overflow: 99999 doesn't fit in decimal(4,2) (max 99.99) + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "a", + ArrowDataType::Decimal128(4, 2), + true, + )])); + let input: Vec> = vec![Some(r#"{"a": 99999}"#)]; + assert!(parse_json_impl(&input.into(), schema).is_err()); + + // Type mismatch: string where integer expected + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "a", + ArrowDataType::Int64, + true, + )])); + let input: Vec> = vec![Some(r#"{"a": "not_a_number"}"#)]; + assert!(parse_json_impl(&input.into(), schema).is_err()); + } + #[test] fn simple_mask_indices() { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(0), DataType::INTEGER) - .with_metadata(column_mapping_metadata(0)), + .with_metadata(column_mapping_metadata(0, mode)), StructField::nullable(logical_name(1), DataType::STRING) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::nullable(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false) @@ -1476,11 +1950,12 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(0), DataType::INTEGER) - .with_metadata(column_mapping_metadata(0)), + .with_metadata(column_mapping_metadata(0, mode)), StructField::nullable(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false) @@ -1496,11 +1971,12 @@ mod tests { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(0), DataType::INTEGER) - .with_metadata(column_mapping_metadata(0)), + .with_metadata(column_mapping_metadata(0, mode)), StructField::nullable(logical_name(1), DataType::STRING) - 
.with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false), @@ -1521,8 +1997,9 @@ mod tests { logical_name(0), MapType::new(DataType::INTEGER, DataType::STRING, false), ) - .with_metadata(column_mapping_metadata(0))]) + .with_metadata(column_mapping_metadata(0, mode))]) .make_physical(mode) + .unwrap() .into(); // The key and value may have field ids not present in the delta schema @@ -1549,13 +2026,14 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(0), DataType::INTEGER) - .with_metadata(column_mapping_metadata(0)), + .with_metadata(column_mapping_metadata(0, mode)), StructField::nullable(logical_name(1), DataType::STRING) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::nullable(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(2, mode), ArrowDataType::Int32, true) @@ -1583,13 +2061,14 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(0), DataType::INTEGER) - .with_metadata(column_mapping_metadata(0)), + .with_metadata(column_mapping_metadata(0, mode)), StructField::nullable(logical_name(1), DataType::STRING) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::nullable(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false) @@ -1600,20 +2079,15 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; - let expected_arrow_metadata = requested_schema + let expected_arrow_field = requested_schema .field(parquet_name(1, mode)) .unwrap() - .metadata_with_string_values(); + .try_into_arrow() + .unwrap(); let expect_reorder = vec![ ReorderIndex::identity(0), ReorderIndex::identity(2), - ReorderIndex::missing( - 1, - Arc::new( - ArrowField::new(parquet_name(1, mode), ArrowDataType::Utf8, true) - .with_metadata(expected_arrow_metadata), - ), - ), + ReorderIndex::missing(1, Arc::new(expected_arrow_field)), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1631,6 +2105,7 @@ mod tests { .with_metadata(kernel_fid_and_name(3, "i2_physical")), ]) .make_physical(ColumnMappingMode::Id) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("not-i", ArrowDataType::Int32, false).with_metadata(arrow_fid(1)), @@ -1639,20 +2114,15 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; - let expected_arrow_metadata = requested_schema + let expected_arrow_field = requested_schema .field("s_physical") .unwrap() - .metadata_with_string_values(); + .try_into_arrow() + .unwrap(); let 
expect_reorder = vec![ ReorderIndex::identity(0), ReorderIndex::identity(2), - ReorderIndex::missing( - 1, - Arc::new( - ArrowField::new("s_physical", ArrowDataType::Utf8, true) - .with_metadata(expected_arrow_metadata), - ), - ), + ReorderIndex::missing(1, Arc::new(expected_arrow_field)), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1669,6 +2139,7 @@ mod tests { .with_metadata(kernel_fid_and_name(3, "i2_physical")), ]) .make_physical(ColumnMappingMode::Id) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new("i_logical", ArrowDataType::Int32, false).with_metadata(arrow_fid(1)), @@ -1677,20 +2148,15 @@ mod tests { let (mask_indices, reorder_indices) = get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask = vec![0, 1]; - let expected_arrow_metadata = requested_schema + let expected_arrow_field = requested_schema .field("s_physical") .unwrap() - .metadata_with_string_values(); + .try_into_arrow() + .unwrap(); let expect_reorder = vec![ ReorderIndex::identity(0), ReorderIndex::identity(2), - ReorderIndex::missing( - 1, - Arc::new( - ArrowField::new("s_physical", ArrowDataType::Utf8, true) - .with_metadata(expected_arrow_metadata), - ), - ), + ReorderIndex::missing(1, Arc::new(expected_arrow_field)), ]; assert_eq!(mask_indices, expect_mask); assert_eq!(reorder_indices, expect_reorder); @@ -1721,8 +2187,6 @@ mod tests { #[test] fn test_match_parquet_fields_filters_metadata_columns() { - use crate::schema::MetadataColumnSpec; - let kernel_schema = StructType::new_unchecked([ StructField::not_null("regular_field", DataType::INTEGER), StructField::create_metadata_column("row_index", MetadataColumnSpec::RowIndex), @@ -1793,7 +2257,7 @@ mod tests { ), ]; - let result = reorder_struct_array(arry, &reorder, None); + let result = reorder_struct_array(arry, &reorder, None, None); assert_result_error_with_message( result, "Row index column requested but row index iterator not provided", @@ -1816,7 +2280,7 @@ mod tests { #[allow(clippy::single_range_in_vec_init)] let mut row_indexes = vec![(0..4)].into_iter().flatten(); - let ordered = reorder_struct_array(arry, &reorder, Some(&mut row_indexes)).unwrap(); + let ordered = reorder_struct_array(arry, &reorder, Some(&mut row_indexes), None).unwrap(); assert_eq!(ordered.column_names(), vec!["b", "row_idx"]); // Verify the row index column contains the expected values @@ -1853,6 +2317,79 @@ mod tests { assert_eq!(reorder_indices, expect_reorder); } + #[test] + fn simple_file_path_field() { + let requested_schema = Arc::new(StructType::new_unchecked([ + StructField::not_null("i", DataType::INTEGER), + StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + StructField::nullable("i2", DataType::INTEGER), + ])); + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("i", ArrowDataType::Int32, false), + ArrowField::new("i2", ArrowDataType::Int32, true), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + let expect_mask = vec![0, 1]; + let mut arrow_file_path_field = ArrowField::new("_file", ArrowDataType::Utf8, false); + arrow_file_path_field.set_metadata(HashMap::from([( + "delta.metadataSpec".to_string(), + "_file".to_string(), + )])); + let expect_reorder = vec![ + ReorderIndex::identity(0), + ReorderIndex::identity(2), + ReorderIndex::file_path(1, Arc::new(arrow_file_path_field)), + ]; + assert_eq!(mask_indices, expect_mask); + 
assert_eq!(reorder_indices, expect_reorder); + } + + #[test] + fn test_reorder_struct_array_with_file_path() { + // Test that file paths work when properly provided + let arry = make_struct_array(); + let reorder = vec![ + ReorderIndex::identity(0), + ReorderIndex::file_path( + 1, + Arc::new(ArrowField::new("_file", ArrowDataType::Utf8, false)), + ), + ]; + + let file_location = "s3://bucket/path/to/file.parquet"; + let ordered = reorder_struct_array(arry, &reorder, None, Some(file_location)).unwrap(); + assert_eq!(ordered.column_names(), vec!["b", "_file"]); + + // Verify the file path column is a plain StringArray with the path repeated for each row. + let file_path_col = ordered.column(1); + let string_array = file_path_col + .as_any() + .downcast_ref::() + .expect("Expected StringArray"); + assert_eq!(string_array.len(), 4); + assert!(string_array.iter().all(|v| v == Some(file_location))); + } + + #[test] + fn test_reorder_struct_array_missing_file_path() { + // Test that error occurs when file path is requested but not provided + let arry = make_struct_array(); + let reorder = vec![ + ReorderIndex::identity(0), + ReorderIndex::file_path( + 1, + Arc::new(ArrowField::new("_file", ArrowDataType::Utf8, false)), + ), + ]; + + let result = reorder_struct_array(arry, &reorder, None, None); + assert_result_error_with_message( + result, + "File path column requested but file location not provided", + ); + } + #[test] fn test_row_index_builder_no_skipping() { let row_groups = vec![ @@ -1955,21 +2492,22 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null( logical_name(3), StructType::new_unchecked([ StructField::not_null(logical_name(4), DataType::INTEGER) - .with_metadata(column_mapping_metadata(4)), + .with_metadata(column_mapping_metadata(4, mode)), StructField::not_null(logical_name(5), DataType::STRING) - .with_metadata(column_mapping_metadata(5)), + .with_metadata(column_mapping_metadata(5, mode)), ]), ) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), StructField::not_null(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = nested_parquet_schema(mode); let (mask_indices, reorder_indices) = @@ -1995,18 +2533,19 @@ mod tests { logical_name(3), StructType::new_unchecked([ StructField::not_null(logical_name(5), DataType::STRING) - .with_metadata(column_mapping_metadata(5)), + .with_metadata(column_mapping_metadata(5, mode)), StructField::not_null(logical_name(4), DataType::INTEGER) - .with_metadata(column_mapping_metadata(4)), + .with_metadata(column_mapping_metadata(4, mode)), ]), ) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), StructField::not_null(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = nested_parquet_schema(mode); let (mask_indices, reorder_indices) = @@ -2030,20 +2569,21 @@ mod tests { 
column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null( logical_name(3), StructType::new_unchecked([StructField::not_null( logical_name(4), DataType::INTEGER, ) - .with_metadata(column_mapping_metadata(4))]), + .with_metadata(column_mapping_metadata(4, mode))]), ) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), StructField::not_null(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = nested_parquet_schema(mode); let (mask_indices, reorder_indices) = @@ -2059,18 +2599,63 @@ mod tests { }) } + #[test] + fn unmatched_struct_before_selected_leaf_ordering() { + // Regression: when a struct with no matching children appears BEFORE a selected + // leaf in parquet order, the Missing entry must be deferred so the leaf's + // Identity entry gets the correct parquet_position in reorder_struct_array. + let requested_schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable( + "stats", + StructType::new_unchecked([StructField::nullable("age", DataType::LONG)]), + ), + ])); + // Parquet has stats BEFORE a + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new( + "stats", + ArrowDataType::Struct( + vec![ArrowField::new("id", ArrowDataType::Int64, true)].into(), + ), + true, + ), + ArrowField::new("a", ArrowDataType::Int64, true), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + // Only "a" should be in the mask (leaf index 1, after stats.id at index 0) + assert_eq!(mask_indices, vec![1]); + let expected_stats_field = Arc::new( + requested_schema + .field("stats") + .unwrap() + .try_into_arrow() + .unwrap(), + ); + // Identity for "a" must come FIRST (parquet_position 0), then Missing for stats + assert_eq!( + reorder_indices, + vec![ + ReorderIndex::identity(0), + ReorderIndex::missing(1, expected_stats_field), + ] + ); + } + #[test] fn simple_list_mask() { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null(logical_name(2), ArrayType::new(DataType::INTEGER, false)) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), StructField::not_null(logical_name(3), DataType::INTEGER) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(1, mode), ArrowDataType::Int32, false) @@ -2108,8 +2693,9 @@ mod tests { logical_name(1), ArrayType::new(DataType::INTEGER, false), ) - .with_metadata(column_mapping_metadata(1))]) + .with_metadata(column_mapping_metadata(1, mode))]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false) @@ -2138,25 +2724,26 @@ mod tests { 
column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(0), DataType::INTEGER) - .with_metadata(column_mapping_metadata(0)), + .with_metadata(column_mapping_metadata(0, mode)), StructField::not_null( logical_name(1), ArrayType::new( StructType::new_unchecked([ StructField::not_null(logical_name(3), DataType::INTEGER) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), StructField::not_null(logical_name(4), DataType::STRING) - .with_metadata(column_mapping_metadata(4)), + .with_metadata(column_mapping_metadata(4, mode)), ]) .into(), false, ), ) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(0, mode), ArrowDataType::Int32, false) @@ -2203,11 +2790,12 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null(logical_name(3), DataType::INTEGER) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(1, mode), ArrowDataType::Int32, false) @@ -2247,7 +2835,7 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null( logical_name(2), ArrayType::new( @@ -2255,16 +2843,17 @@ mod tests { logical_name(4), DataType::INTEGER, ) - .with_metadata(column_mapping_metadata(4))]) + .with_metadata(column_mapping_metadata(4, mode))]) .into(), false, ), ) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), StructField::not_null(logical_name(3), DataType::INTEGER) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(1, mode), ArrowDataType::Int32, false) @@ -2308,25 +2897,26 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null( logical_name(2), ArrayType::new( StructType::new_unchecked([ StructField::not_null(logical_name(6), DataType::STRING) - .with_metadata(column_mapping_metadata(6)), + .with_metadata(column_mapping_metadata(6, mode)), StructField::not_null(logical_name(5), DataType::INTEGER) - .with_metadata(column_mapping_metadata(5)), + .with_metadata(column_mapping_metadata(5, mode)), ]) .into(), false, ), ) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), StructField::not_null(logical_name(3), DataType::INTEGER) - 
.with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new(parquet_name(1, mode), ArrowDataType::Int32, false) @@ -2375,21 +2965,22 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::not_null(logical_name(1), DataType::INTEGER) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::not_null( logical_name(2), StructType::new_unchecked([ StructField::not_null(logical_name(4), DataType::INTEGER) - .with_metadata(column_mapping_metadata(4)), + .with_metadata(column_mapping_metadata(4, mode)), StructField::not_null(logical_name(5), DataType::STRING) - .with_metadata(column_mapping_metadata(5)), + .with_metadata(column_mapping_metadata(5, mode)), ]), ) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), StructField::not_null(logical_name(3), DataType::INTEGER) - .with_metadata(column_mapping_metadata(3)), + .with_metadata(column_mapping_metadata(3, mode)), ]) .make_physical(mode) + .unwrap() .into(); let parquet_schema = Arc::new(ArrowSchema::new(vec![ ArrowField::new( @@ -2530,7 +3121,7 @@ mod tests { fn simple_reorder_struct() { let arry = make_struct_array(); let reorder = vec![ReorderIndex::identity(1), ReorderIndex::identity(0)]; - let ordered = reorder_struct_array(arry, &reorder, None).unwrap(); + let ordered = reorder_struct_array(arry, &reorder, None, None).unwrap(); assert_eq!(ordered.column_names(), vec!["c", "b"]); } @@ -2578,7 +3169,7 @@ mod tests { ], ), ]; - let ordered = reorder_struct_array(nested, &reorder, None).unwrap(); + let ordered = reorder_struct_array(nested, &reorder, None, None).unwrap(); assert_eq!(ordered.column_names(), vec!["struct2", "struct1"]); let ordered_s2 = ordered.column(0).as_struct(); assert_eq!(ordered_s2.column_names(), vec!["b", "c", "s"]); @@ -2625,7 +3216,7 @@ mod tests { 0, vec![ReorderIndex::identity(1), ReorderIndex::identity(0)], )]; - let ordered = reorder_struct_array(struct_array, &reorder, None).unwrap(); + let ordered = reorder_struct_array(struct_array, &reorder, None, None).unwrap(); let ordered_list_col = ordered.column(0).as_list::(); for i in 0..ordered_list_col.len() { let array_item = ordered_list_col.value(i); @@ -2691,7 +3282,7 @@ mod tests { ], ), ]; - let ordered = reorder_struct_array(struct_array, &reorder, None).unwrap(); + let ordered = reorder_struct_array(struct_array, &reorder, None, None).unwrap(); assert_eq!(ordered.column_names(), vec!["map", "i"]); if let ArrowDataType::Map(field, _) = ordered.column(0).data_type() { if let ArrowDataType::Struct(fields) = field.data_type() { @@ -2719,11 +3310,12 @@ mod tests { column_mapping_cases().into_iter().for_each(|mode| { let requested_schema = StructType::new_unchecked([ StructField::nullable(logical_name(1), DataType::STRING) - .with_metadata(column_mapping_metadata(1)), + .with_metadata(column_mapping_metadata(1, mode)), StructField::nullable(logical_name(2), DataType::INTEGER) - .with_metadata(column_mapping_metadata(2)), + .with_metadata(column_mapping_metadata(2, mode)), ]) .make_physical(mode) + .unwrap() .into(); let nots_field = ArrowField::new("NOTs", ArrowDataType::Utf8, true).with_metadata(arrow_fid(3)); @@ -2737,17 +3329,12 @@ mod tests { get_requested_indices(&requested_schema, &parquet_schema).unwrap(); let expect_mask: Vec = vec![]; - 
// Build expected arrow fields + // Build expected arrow fields using proper conversion let mut fields = requested_schema.fields(); - let metadata1 = fields.next().unwrap().metadata_with_string_values(); - let metadata2 = fields.next().unwrap().metadata_with_string_values(); - let expected_field1 = ArrowField::new(parquet_name(1, mode), ArrowDataType::Utf8, true) - .with_metadata(metadata1) - .into(); - let expected_field2 = - ArrowField::new(parquet_name(2, mode), ArrowDataType::Int32, true) - .with_metadata(metadata2) - .into(); + let expected_field1: Arc = + Arc::new(fields.next().unwrap().try_into_arrow().unwrap()); + let expected_field2: Arc = + Arc::new(fields.next().unwrap().try_into_arrow().unwrap()); let expect_reorder = vec![ ReorderIndex::missing(0, expected_field1), @@ -2962,4 +3549,895 @@ mod tests { let non_null_leaf_nullable_2 = inner_non_null_2.column(1); assert_eq!(non_null_leaf_nullable_2, non_null_leaf_nullable_1); } + + /// (src_field, target_field, column) where src and target differ only in nullability. + type CoerceTestCase = (Arc, Arc, Arc); + + /// Builds a (to_nullable, to_non_null) coerce test case pair from field/col constructors. + /// The `leaf_nullable` parameter controls the nullability of the innermost toggled field. + fn make_coerce_pair( + make_field: impl Fn(bool) -> Arc, + make_col: impl Fn(bool) -> Arc, + ) -> (CoerceTestCase, CoerceTestCase) { + ( + (make_field(false), make_field(true), make_col(false)), + (make_field(true), make_field(false), make_col(true)), + ) + } + + /// Builds a Map entries field: `entries: Struct { key: Utf8, value: }`. + fn make_map_entries_field(value_field: ArrowField) -> Arc { + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct(ArrowFields::from(vec![ + ArrowField::new("key", ArrowDataType::Utf8, false), + value_field, + ])), + false, + )) + } + + /// Builds a 2-row MapArray from an entries field, with keys ["a","b"] and given values. 
+ fn make_map_array( + entries_field: Arc, + values: Arc, + ) -> Arc { + let entry_fields = match entries_field.data_type() { + ArrowDataType::Struct(f) => f.clone(), + _ => unreachable!(), + }; + let keys: Arc = Arc::new(StringArray::from(vec!["a", "b"])); + let entries = StructArray::try_new(entry_fields, vec![keys, values], None).unwrap(); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 1, 2])); + Arc::new(MapArray::try_new(entries_field, offsets, entries, None, false).unwrap()) + } + + fn create_data_int() -> (CoerceTestCase, CoerceTestCase) { + let col: Arc = Arc::new(Int32Array::from(vec![1, 2])); + make_coerce_pair( + |leaf_nullable| Arc::new(ArrowField::new("col", ArrowDataType::Int32, leaf_nullable)), + |_leaf_nullable| col.clone(), + ) + } + + fn create_data_string() -> (CoerceTestCase, CoerceTestCase) { + let col: Arc = Arc::new(StringArray::from(vec!["a", "b"])); + make_coerce_pair( + |leaf_nullable| Arc::new(ArrowField::new("col", ArrowDataType::Utf8, leaf_nullable)), + |_leaf_nullable| col.clone(), + ) + } + + fn create_data_struct() -> (CoerceTestCase, CoerceTestCase) { + let inner_col: Arc = Arc::new(Int32Array::from(vec![1, 2])); + make_coerce_pair( + |leaf_nullable| { + Arc::new(ArrowField::new( + "c", + ArrowDataType::Struct(ArrowFields::from(vec![ArrowField::new( + "val", + ArrowDataType::Int32, + leaf_nullable, + )])), + false, + )) + }, + |leaf_nullable| { + let inner = ArrowField::new("val", ArrowDataType::Int32, leaf_nullable); + Arc::new( + StructArray::try_new( + ArrowFields::from(vec![inner]), + vec![inner_col.clone()], + None, + ) + .unwrap(), + ) + }, + ) + } + + fn create_data_list() -> (CoerceTestCase, CoerceTestCase) { + let values: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + make_coerce_pair( + |leaf_nullable| { + let elem = Arc::new(ArrowField::new("item", ArrowDataType::Int32, leaf_nullable)); + Arc::new(ArrowField::new("col", ArrowDataType::List(elem), false)) + }, + |leaf_nullable| { + let elem = Arc::new(ArrowField::new("item", ArrowDataType::Int32, leaf_nullable)); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 2, 3])); + Arc::new( + GenericListArray::::try_new(elem, offsets, values.clone(), None).unwrap(), + ) + }, + ) + } + + fn create_data_map() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::Int32, + leaf_nullable, + )); + Arc::new(ArrowField::new( + "c", + ArrowDataType::Map(entries_field, false), + false, + )) + }, + |leaf_nullable| { + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::Int32, + leaf_nullable, + )); + make_map_array(entries_field, Arc::new(Int32Array::from(vec![1, 2]))) + }, + ) + } + + /// List> — toggle nullability of the Int32 leaf inside the struct element. 
+ fn create_data_struct_in_list() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let leaf = ArrowField::new("val", ArrowDataType::Int32, leaf_nullable); + let elem = Arc::new(ArrowField::new( + "item", + ArrowDataType::Struct(ArrowFields::from(vec![leaf])), + false, + )); + Arc::new(ArrowField::new("col", ArrowDataType::List(elem), false)) + }, + |leaf_nullable| { + let leaf = ArrowField::new("val", ArrowDataType::Int32, leaf_nullable); + let inner_col: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + let elem_field = Arc::new(ArrowField::new( + "item", + ArrowDataType::Struct(ArrowFields::from(vec![leaf.clone()])), + false, + )); + let structs = + StructArray::try_new(ArrowFields::from(vec![leaf]), vec![inner_col], None) + .unwrap(); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 2, 3])); + Arc::new( + GenericListArray::::try_new(elem_field, offsets, Arc::new(structs), None) + .unwrap(), + ) + }, + ) + } + + /// Struct> — toggle nullability of the Int32 element inside the list child. + fn create_data_list_in_struct() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let elem = Arc::new(ArrowField::new("item", ArrowDataType::Int32, leaf_nullable)); + let list_field = ArrowField::new("vals", ArrowDataType::List(elem), false); + Arc::new(ArrowField::new( + "c", + ArrowDataType::Struct(ArrowFields::from(vec![list_field])), + false, + )) + }, + |leaf_nullable| { + let elem = Arc::new(ArrowField::new("item", ArrowDataType::Int32, leaf_nullable)); + let values: Arc = Arc::new(Int32Array::from(vec![1, 2, 3])); + let offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 2, 3])); + let list_col: Arc = Arc::new( + GenericListArray::::try_new(elem, offsets, values, None).unwrap(), + ); + let list_field = ArrowField::new("vals", list_col.data_type().clone(), false); + Arc::new( + StructArray::try_new(ArrowFields::from(vec![list_field]), vec![list_col], None) + .unwrap(), + ) + }, + ) + } + + /// Map> — toggle nullability of the Int32 element inside the list value. + fn create_data_list_in_map() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let elem = Arc::new(ArrowField::new("item", ArrowDataType::Int32, leaf_nullable)); + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::List(elem), + true, + )); + Arc::new(ArrowField::new( + "c", + ArrowDataType::Map(entries_field, false), + false, + )) + }, + |leaf_nullable| { + let elem = Arc::new(ArrowField::new("item", ArrowDataType::Int32, leaf_nullable)); + let list_values: Arc = Arc::new(Int32Array::from(vec![1, 2])); + let list_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 1, 2])); + let list_col: Arc = Arc::new( + GenericListArray::::try_new(elem, list_offsets, list_values, None) + .unwrap(), + ); + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::List(Arc::new(ArrowField::new( + "item", + ArrowDataType::Int32, + leaf_nullable, + ))), + true, + )); + make_map_array(entries_field, list_col) + }, + ) + } + + /// Map> — toggle nullability of the Int32 leaf inside the struct value. 
+ fn create_data_struct_in_map() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let leaf = ArrowField::new("val", ArrowDataType::Int32, leaf_nullable); + let value_field = ArrowField::new( + "value", + ArrowDataType::Struct(ArrowFields::from(vec![leaf])), + true, + ); + let entries_field = make_map_entries_field(value_field); + Arc::new(ArrowField::new( + "c", + ArrowDataType::Map(entries_field, false), + false, + )) + }, + |leaf_nullable| { + let leaf = ArrowField::new("val", ArrowDataType::Int32, leaf_nullable); + let inner: Arc = Arc::new(Int32Array::from(vec![1, 2])); + let struct_col: Arc = Arc::new( + StructArray::try_new(ArrowFields::from(vec![leaf.clone()]), vec![inner], None) + .unwrap(), + ); + let value_field = ArrowField::new( + "value", + ArrowDataType::Struct(ArrowFields::from(vec![leaf])), + true, + ); + let entries_field = make_map_entries_field(value_field); + make_map_array(entries_field, struct_col) + }, + ) + } + + /// Struct> — toggle nullability of the Int32 value inside the map child. + fn create_data_map_in_struct() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::Int32, + leaf_nullable, + )); + let map_field = + ArrowField::new("map_child", ArrowDataType::Map(entries_field, false), false); + Arc::new(ArrowField::new( + "c", + ArrowDataType::Struct(ArrowFields::from(vec![map_field])), + false, + )) + }, + |leaf_nullable| { + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::Int32, + leaf_nullable, + )); + let map_col = make_map_array(entries_field, Arc::new(Int32Array::from(vec![1, 2]))); + let map_child_field = + ArrowField::new("map_child", map_col.data_type().clone(), false); + Arc::new( + StructArray::try_new( + ArrowFields::from(vec![map_child_field]), + vec![map_col], + None, + ) + .unwrap(), + ) + }, + ) + } + + /// List> — toggle nullability of the Int32 value inside the map element. + fn create_data_map_in_list() -> (CoerceTestCase, CoerceTestCase) { + make_coerce_pair( + |leaf_nullable| { + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::Int32, + leaf_nullable, + )); + let map_elem = Arc::new(ArrowField::new( + "item", + ArrowDataType::Map(entries_field, false), + false, + )); + Arc::new(ArrowField::new("col", ArrowDataType::List(map_elem), false)) + }, + |leaf_nullable| { + let entries_field = make_map_entries_field(ArrowField::new( + "value", + ArrowDataType::Int32, + leaf_nullable, + )); + let map_col = make_map_array(entries_field, Arc::new(Int32Array::from(vec![1, 2]))); + let map_elem = + Arc::new(ArrowField::new("item", map_col.data_type().clone(), false)); + let list_offsets = OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 1, 2])); + Arc::new( + GenericListArray::::try_new(map_elem, list_offsets, map_col, None) + .unwrap(), + ) + }, + ) + } + + use rstest::rstest; + + /// Rename fields in each case tuple to `c{idx}` so multi-column batches have unique names. 
+ fn allocate_name_to_field(idx: usize, (src, tgt, col): CoerceTestCase) -> CoerceTestCase { + let name = format!("c{idx}"); + let src = Arc::new(ArrowField::new( + &name, + src.data_type().clone(), + src.is_nullable(), + )); + let tgt = Arc::new(ArrowField::new( + &name, + tgt.data_type().clone(), + tgt.is_nullable(), + )); + (src, tgt, col) + } + + #[rstest] + // Each basic type with a nested type: covers all leaf/container recursion paths + #[case::int_and_struct_in_list(vec![ + create_data_int(), create_data_struct_in_list(), + ], true)] + #[case::string_and_list_in_struct(vec![ + create_data_string(), create_data_list_in_struct(), + ], false)] + // All simple types together + #[case::all_simple_types(vec![ + create_data_int(), create_data_string(), create_data_struct(), + create_data_list(), create_data_map(), + ], true)] + // All nested types together + #[case::all_nested_types(vec![ + create_data_struct_in_list(), create_data_list_in_struct(), + create_data_list_in_map(), create_data_struct_in_map(), + create_data_map_in_struct(), create_data_map_in_list(), + ], false)] + // Mix of simple and nested, to_nullable + #[case::mixed_to_nullable(vec![ + create_data_int(), create_data_struct(), create_data_map_in_list(), + create_data_struct_in_map(), + ], true)] + // Mix of simple and nested, to_non_null + #[case::mixed_to_non_null(vec![ + create_data_list(), create_data_map(), create_data_list_in_map(), + create_data_map_in_struct(), + ], false)] + fn test_coerce_batch_nullability( + #[case] data: Vec<(CoerceTestCase, CoerceTestCase)>, + #[case] to_nullable: bool, + ) { + let (src_fields, tgt_fields, cols): (Vec<_>, Vec<_>, Vec<_>) = data + .into_iter() + .enumerate() + .map(|(i, (to_nullable_case, to_non_null_case))| { + let case = if to_nullable { + to_nullable_case + } else { + to_non_null_case + }; + allocate_name_to_field(i, case) + }) + .multiunzip(); + let src_schema = Arc::new(ArrowSchema::new(src_fields)); + let target_schema = Arc::new(ArrowSchema::new(tgt_fields)); + assert_ne!(src_schema, target_schema); + let batch = RecordBatch::try_new(src_schema, cols).unwrap(); + let result = coerce_batch_nullability(batch, &target_schema, None).unwrap(); + assert_eq!(*result.schema(), *target_schema); + } + + #[test] + fn test_coerce_batch_nullability_schema_already_matches() { + let field = ArrowField::new("a", ArrowDataType::Int32, false); + let col: Arc = Arc::new(Int32Array::from(vec![1, 2])); + let schema = Arc::new(ArrowSchema::new(vec![field])); + let batch = RecordBatch::try_new(schema.clone(), vec![col]).unwrap(); + let result = coerce_batch_nullability(batch.clone(), &schema, None).unwrap(); + assert_eq!(result, batch); + } + + #[test] + fn test_coerce_batch_nullability_type_mismatch_rejected_without_validator() { + let ((int_src_field, _, int_col), _) = create_data_int(); + let (_, (_, string_tgt_field, _)) = create_data_string(); + let src_schema = Arc::new(ArrowSchema::new(vec![int_src_field.as_ref().clone()])); + let target_schema = Arc::new(ArrowSchema::new(vec![string_tgt_field.as_ref().clone()])); + let batch = RecordBatch::try_new(src_schema, vec![int_col]).unwrap(); + let result = coerce_batch_nullability(batch, &target_schema, None); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("data type mismatch"), + "unexpected error: {err}" + ); + } + + #[test] + fn test_coerce_batch_nullability_type_mismatch_allowed_with_validator() { + let ((int_src_field, _, int_col), _) = create_data_int(); + let ((_, string_tgt_field, _), 
_) = create_data_string(); + let src_schema = Arc::new(ArrowSchema::new(vec![int_src_field.as_ref().clone()])); + let target_schema = Arc::new(ArrowSchema::new(vec![string_tgt_field.as_ref().clone()])); + let batch = RecordBatch::try_new(src_schema, vec![int_col]).unwrap(); + let allow_all = |_: &ArrowFieldRef, _: &ArrowFieldRef| Ok(()); + let result = coerce_batch_nullability(batch, &target_schema, Some(&allow_all)).unwrap(); + assert_eq!(result.schema().field(0).data_type(), &ArrowDataType::Int32); + assert!(result.schema().field(0).is_nullable()); + } + + /// Verifies metadata is preserved at every nesting level (struct, list, map) after coercion. + /// Schema: s: Struct { lst: List[Int32], mp: Map }, all non-null → nullable. + #[test] + fn test_coerce_batch_nullability_preserves_field_metadata() { + use std::collections::HashMap; + + let meta = |key: &str| HashMap::from([(key.to_string(), "val".to_string())]); + let offsets_1 = || OffsetBuffer::new(ScalarBuffer::from(vec![0i32, 1])); + + /// Recursively sets all fields to nullable, except map entries and map keys + /// which Arrow requires to be non-null. + fn make_all_nullable(field: &ArrowField) -> ArrowField { + let dt = match field.data_type() { + ArrowDataType::Struct(children) => ArrowDataType::Struct( + children + .iter() + .map(|f| Arc::new(make_all_nullable(f))) + .collect(), + ), + ArrowDataType::List(elem) => ArrowDataType::List(Arc::new(make_all_nullable(elem))), + ArrowDataType::Map(entries, ordered) => { + // Recurse into entries struct but keep entries itself non-null + let inner = make_all_nullable(entries).with_nullable(false); + ArrowDataType::Map(Arc::new(inner), *ordered) + } + other => other.clone(), + }; + field.clone().with_data_type(dt).with_nullable(true) + } + + // Source schema (all non-null, each field carries metadata): + // s: Struct { + // lst: List [ item: Int32 ], + // mp: Map { key: Utf8, value: Int32 } + // } + let src_item = Arc::new( + ArrowField::new("item", ArrowDataType::Int32, false).with_metadata(meta("item")), + ); + let src_entries = Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct(ArrowFields::from(vec![ + ArrowField::new("key", ArrowDataType::Utf8, false), + ArrowField::new("value", ArrowDataType::Int32, false).with_metadata(meta("value")), + ])), + false, + )); + let src_list = ArrowField::new("lst", ArrowDataType::List(src_item.clone()), false) + .with_metadata(meta("list")); + let src_map = ArrowField::new("mp", ArrowDataType::Map(src_entries.clone(), false), false) + .with_metadata(meta("map")); + let src_struct = ArrowField::new( + "s", + ArrowDataType::Struct(ArrowFields::from(vec![src_list.clone(), src_map.clone()])), + false, + ) + .with_metadata(meta("struct")); + + // Target: same structure but all fields nullable + let tgt_struct = make_all_nullable(&src_struct); + + // Build 1-row arrays + let entry_fields = match src_entries.data_type() { + ArrowDataType::Struct(f) => f.clone(), + _ => unreachable!(), + }; + let list_col: Arc = Arc::new( + GenericListArray::::try_new( + src_item, + offsets_1(), + Arc::new(Int32Array::from(vec![1])), + None, + ) + .unwrap(), + ); + let entries = StructArray::try_new( + entry_fields, + vec![ + Arc::new(StringArray::from(vec!["a"])) as _, + Arc::new(Int32Array::from(vec![10])) as _, + ], + None, + ) + .unwrap(); + let map_col: Arc = + Arc::new(MapArray::try_new(src_entries, offsets_1(), entries, None, false).unwrap()); + let struct_col: Arc = Arc::new( + StructArray::try_new( + ArrowFields::from(vec![src_list, src_map]), + 
vec![list_col, map_col], + None, + ) + .unwrap(), + ); + + let src_schema = Arc::new(ArrowSchema::new(vec![src_struct])); + let target_schema = Arc::new(ArrowSchema::new(vec![tgt_struct])); + let batch = RecordBatch::try_new(src_schema, vec![struct_col]).unwrap(); + let result = coerce_batch_nullability(batch, &target_schema, None).unwrap(); + + // Walk the result schema and assert metadata + nullability at every level + let schema = result.schema(); + let s = schema.field(0); + assert_eq!(s.metadata(), &meta("struct")); + assert!(s.is_nullable()); + + let fields = match s.data_type() { + ArrowDataType::Struct(f) => f, + other => panic!("expected Struct, got {other:?}"), + }; + assert_eq!(fields[0].metadata(), &meta("list")); + assert!(fields[0].is_nullable()); + + let list_item = match fields[0].data_type() { + ArrowDataType::List(e) => e, + other => panic!("expected List, got {other:?}"), + }; + assert_eq!(list_item.metadata(), &meta("item")); + assert!(list_item.is_nullable()); + + assert_eq!(fields[1].metadata(), &meta("map")); + assert!(fields[1].is_nullable()); + + let map_val = match fields[1].data_type() { + ArrowDataType::Map(e, _) => match e.data_type() { + ArrowDataType::Struct(f) => &f[1], + other => panic!("expected Struct entries, got {other:?}"), + }, + other => panic!("expected Map, got {other:?}"), + }; + assert_eq!(map_val.metadata(), &meta("value")); + assert!(map_val.is_nullable()); + } + + // --- Tests for build_json_reorder_indices and json_arrow_schema --- + + const FILE_PATH: &str = "s3://bucket/test.json"; + + struct JsonInsertCase { + /// Full schema; may include a `FilePath` metadata column at any position. + schema: StructType, + /// Field names that [`json_arrow_schema`] should expose (metadata columns stripped). + expected_json_names: &'static [&'static str], + /// Column names in the final output after [`reorder_struct_array`]. + expected_output_names: &'static [&'static str], + /// Index of the `_file` column in the output, or `None` when the schema has no FilePath. + file_path_col: Option, + } + + /// Verifies that `json_arrow_schema` + `build_json_reorder_indices` + `reorder_struct_array` + /// correctly insert (or omit) the `_file` column at the position declared in the schema. 
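// Editor's note: a concrete instance of what the parameterized cases below verify, shown as a
// standalone sketch (not part of this change; it reuses only helpers and imports already in
// scope in this test module). For a schema (a, _file, b) the JSON reader sees only (a, b);
// build_json_reorder_indices then produces indices that let fixup_json_read splice the `_file`
// column back into its declared middle position.
fn example_file_path_roundtrip() -> DeltaResult<()> {
    let schema = StructType::new_unchecked([
        StructField::not_null("a", DataType::INTEGER),
        StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath),
        StructField::nullable("b", DataType::INTEGER),
    ]);
    // The reader-facing schema contains only the real columns "a" and "b".
    let reader_schema = Arc::new(json_arrow_schema(&schema)?);
    let batch = RecordBatch::try_new(
        reader_schema,
        vec![
            Arc::new(Int32Array::from(vec![1, 2])) as _,
            Arc::new(Int32Array::from(vec![3, 4])) as _,
        ],
    )?;
    let indices = build_json_reorder_indices(&schema)?;
    let fixed = fixup_json_read(batch, &indices, "s3://bucket/file.json")?;
    let out_schema = fixed.schema();
    let names: Vec<_> = out_schema.fields().iter().map(|f| f.name().as_str()).collect();
    assert_eq!(names, ["a", "_file", "b"]);
    Ok(())
}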
+ #[rstest] + #[case::no_file_path(JsonInsertCase { + schema: StructType::new_unchecked([ + StructField::not_null("a", DataType::INTEGER), + StructField::nullable("b", DataType::INTEGER), + ]), + expected_json_names: &["a", "b"], + expected_output_names: &["a", "b"], + file_path_col: None, + })] + #[case::file_path_at_start(JsonInsertCase { + schema: StructType::new_unchecked([ + StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + StructField::not_null("a", DataType::INTEGER), + StructField::nullable("b", DataType::INTEGER), + ]), + expected_json_names: &["a", "b"], + expected_output_names: &["_file", "a", "b"], + file_path_col: Some(0), + })] + #[case::file_path_in_middle(JsonInsertCase { + schema: StructType::new_unchecked([ + StructField::not_null("a", DataType::INTEGER), + StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + StructField::nullable("b", DataType::INTEGER), + ]), + expected_json_names: &["a", "b"], + expected_output_names: &["a", "_file", "b"], + file_path_col: Some(1), + })] + #[case::file_path_at_end(JsonInsertCase { + schema: StructType::new_unchecked([ + StructField::not_null("a", DataType::INTEGER), + StructField::nullable("b", DataType::INTEGER), + StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + ]), + expected_json_names: &["a", "b"], + expected_output_names: &["a", "b", "_file"], + file_path_col: Some(2), + })] + fn test_json_file_path_insertion(#[case] case: JsonInsertCase) { + // json_arrow_schema exposes only the non-metadata fields. + let json_schema = json_arrow_schema(&case.schema).unwrap(); + let json_names: Vec<_> = json_schema + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect(); + assert_eq!(json_names, case.expected_json_names); + + // Build an input batch with the JSON schema (real columns only, each an Int32Array). + let arrow_schema = Arc::new(json_schema); + let cols: Vec = (0..arrow_schema.fields().len()) + .map(|_| Arc::new(Int32Array::from(vec![1i32, 2, 3])) as _) + .collect(); + let batch = RecordBatch::try_new(arrow_schema, cols).unwrap(); + + // build_json_reorder_indices + reorder_struct_array inserts the _file column. + let indices = build_json_reorder_indices(&case.schema).unwrap(); + let result = RecordBatch::from( + reorder_struct_array(batch.into(), &indices, None, Some(FILE_PATH)).unwrap(), + ); + + // Verify output column order and row count. + let schema = result.schema(); + let output_names: Vec<_> = schema.fields().iter().map(|f| f.name().as_str()).collect(); + assert_eq!(output_names, case.expected_output_names); + assert_eq!(result.num_rows(), 3); + + // When FilePath is in the schema, verify a plain StringArray with the path for every row. + // When absent, verify no _file column leaked into the output. + if let Some(idx) = case.file_path_col { + let arr = result + .column(idx) + .as_any() + .downcast_ref::() + .expect("_file column should be a StringArray"); + assert!(arr.iter().all(|v| v == Some(FILE_PATH))); + } else { + assert!( + result.schema().fields().iter().all(|f| f.name() != "_file"), + "_file should not appear when not declared in the schema" + ); + } + } + + #[test] + fn test_build_json_reorder_indices_unsupported_metadata_column_errors() { + // RowIndex is not supported for JSON reads. All metadata column specs are non-nullable, + // so the Missing transform inserts a null array — reorder_struct_array errors because + // the field is declared non-nullable. 
+ let schema = StructType::new_unchecked([ + StructField::not_null("a", DataType::INTEGER), + StructField::create_metadata_column("row_index", MetadataColumnSpec::RowIndex), + ]); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "a", + ArrowDataType::Int32, + false, + )])); + let batch = RecordBatch::try_new( + arrow_schema, + vec![Arc::new(Int32Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let indices = build_json_reorder_indices(&schema).unwrap(); + assert!(reorder_struct_array(batch.into(), &indices, None, None).is_err()); + } + + #[test] + fn ensure_we_encode_maps_with_null_values() { + let schema = ArrowSchema::new(vec![ + ArrowField::new("str_col", ArrowDataType::Utf8, false), + ArrowField::new( + "map_col", + ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("keys", ArrowDataType::Utf8, false), + ArrowField::new("values", ArrowDataType::Utf8, true), + ] + .into(), + ), + false, + )), + false, // sorted + ), + false, + ), + ]); + let s_array = StringArray::from(vec!["foo"]); + + let string_builder = StringBuilder::new(); + let string_builder2 = StringBuilder::new(); + let mut map_builder = MapBuilder::new(None, string_builder, string_builder2); + + // Append one entry: "bar" -> null + map_builder.keys().append_value("bar"); + map_builder.values().append_null(); + map_builder.append(true).unwrap(); // finish the map row + + let map_array: MapArray = map_builder.finish(); + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(s_array), Arc::new(map_array)], + ) + .unwrap(); + + let data: Box = Box::new(ArrowEngineData::new(batch)); + let filtered_data = FilteredEngineData::with_all_rows_selected(data); + let json = to_json_bytes(Box::new(std::iter::once(Ok(filtered_data)))).unwrap(); + assert_eq!( + json, + "{\"str_col\":\"foo\",\"map_col\":{\"bar\":null}}\n".as_bytes() + ); + } + + #[rstest] + fn struct_with_all_nullable_children_unmatched_is_missing( + #[values(true, false)] struct_nullable: bool, + ) { + // When a struct exists in parquet but none of its children match the requested + // schema, the struct should be treated as missing regardless of its nullability. 
+ let info_field = if struct_nullable { + StructField::nullable( + "info", + StructType::new_unchecked([StructField::nullable("z", DataType::LONG)]), + ) + } else { + StructField::not_null( + "info", + StructType::new_unchecked([StructField::nullable("z", DataType::LONG)]), + ) + }; + let requested_schema = Arc::new(StructType::new_unchecked([ + StructField::not_null("a", DataType::LONG), + info_field, + ])); + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("a", ArrowDataType::Int64, true), + ArrowField::new( + "info", + ArrowDataType::Struct( + vec![ + ArrowField::new("x", ArrowDataType::Int64, true), + ArrowField::new("y", ArrowDataType::Utf8, true), + ] + .into(), + ), + !struct_nullable, + ), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + assert_eq!(mask_indices, vec![0]); + let expected_info_field = Arc::new( + requested_schema + .field("info") + .unwrap() + .try_into_arrow() + .unwrap(), + ); + assert_eq!( + reorder_indices, + vec![ + ReorderIndex::identity(0), + ReorderIndex::missing(1, expected_info_field), + ] + ); + } + + #[test] + fn reorder_non_nullable_missing_struct_produces_non_null_struct() { + // The Missing transform for a non-nullable struct should produce a struct with + // null_count == 0 and all-null children. + let a_array: Arc = Arc::new(Int64Array::from(vec![1, 2, 3])); + let input = StructArray::from(vec![( + Arc::new(ArrowField::new("a", ArrowDataType::Int64, false)), + a_array, + )]); + let missing_field = Arc::new(ArrowField::new( + "info", + ArrowDataType::Struct(vec![ArrowField::new("z", ArrowDataType::Int64, true)].into()), + false, + )); + let reorder = vec![ + ReorderIndex::identity(0), + ReorderIndex::missing(1, missing_field), + ]; + let ordered = reorder_struct_array(input, &reorder, None, None).unwrap(); + assert_eq!(ordered.column_names(), vec!["a", "info"]); + let info = ordered.column(1).as_struct(); + assert_eq!(info.null_count(), 0); + assert_eq!(info.column(0).null_count(), 3); + } + + #[test] + fn reorder_nested_non_nullable_missing_struct_recurses() { + // Non-nullable struct containing a non-nullable struct child: both levels should + // have null_count == 0, with the leaf nullable child being all-null. + let a_array: Arc = Arc::new(Int64Array::from(vec![1, 2])); + let input = StructArray::from(vec![( + Arc::new(ArrowField::new("a", ArrowDataType::Int64, false)), + a_array, + )]); + let inner_struct = + ArrowDataType::Struct(vec![ArrowField::new("leaf", ArrowDataType::Int64, true)].into()); + let missing_field = Arc::new(ArrowField::new( + "outer", + ArrowDataType::Struct(vec![ArrowField::new("inner", inner_struct, false)].into()), + false, + )); + let reorder = vec![ + ReorderIndex::identity(0), + ReorderIndex::missing(1, missing_field), + ]; + let ordered = reorder_struct_array(input, &reorder, None, None).unwrap(); + let outer = ordered.column(1).as_struct(); + assert_eq!(outer.null_count(), 0); + let inner = outer.column(0).as_struct(); + assert_eq!(inner.null_count(), 0); + assert_eq!(inner.column(0).null_count(), 2); + } + + #[test] + fn empty_struct_is_matched() { + // Delta protocol allows empty structs. An empty struct has no children, so no + // leaf columns are selected, but the struct itself should still be matched. 
+ let requested_schema = Arc::new(StructType::new_unchecked([ + StructField::not_null("a", DataType::LONG), + StructField::not_null("empty", StructType::new_unchecked([])), + ])); + let parquet_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("a", ArrowDataType::Int64, true), + ArrowField::new("empty", ArrowDataType::Struct(ArrowFields::empty()), false), + ])); + let (mask_indices, reorder_indices) = + get_requested_indices(&requested_schema, &parquet_schema).unwrap(); + assert_eq!(mask_indices, vec![0]); + let expected_empty_field = Arc::new( + requested_schema + .field("empty") + .unwrap() + .try_into_arrow() + .unwrap(), + ); + assert_eq!( + reorder_indices, + vec![ + ReorderIndex::identity(0), + ReorderIndex::missing(1, expected_empty_field), + ] + ); + } } diff --git a/kernel/src/engine/cross_engine_tests.rs b/kernel/src/engine/cross_engine_tests.rs new file mode 100644 index 0000000000..1732f56f40 --- /dev/null +++ b/kernel/src/engine/cross_engine_tests.rs @@ -0,0 +1,168 @@ +//! Cross-engine tests: verifies that both the default (Arrow/Tokio) and sync engines exhibit +//! consistent behavior for [`JsonHandler`] and [`ParquetHandler`]. +//! +//! Contract tests (things any [`ParquetHandler`] implementation must satisfy) call into +//! [`super::tests`]. Internal implementation tests (Arrow-specific behavior that both kernel +//! engines share) are defined as local helpers here. + +use std::fs::File; +use std::sync::Arc; + +use rstest::rstest; +use tempfile::tempdir; +use url::Url; + +use crate::arrow::array::{Array, Int64Array, RecordBatch}; +use crate::engine::arrow_conversion::TryIntoKernel as _; +use crate::engine::arrow_data::ArrowEngineData; +use crate::engine::default::executor::tokio::TokioBackgroundExecutor; +use crate::engine::default::json::DefaultJsonHandler; +use crate::engine::default::parquet::DefaultParquetHandler; +use crate::engine::sync::json::SyncJsonHandler; +use crate::engine::sync::SyncParquetHandler; +use crate::object_store::local::LocalFileSystem; +use crate::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use crate::parquet::arrow::arrow_writer::ArrowWriter; +use crate::parquet::arrow::ARROW_SCHEMA_META_KEY; +use crate::{EngineData, JsonHandler, ParquetHandler}; + +fn default_parquet_handler() -> Box { + Box::new(DefaultParquetHandler::new( + Arc::new(LocalFileSystem::new()), + Arc::new(TokioBackgroundExecutor::new()), + )) +} + +fn sync_parquet_handler() -> Box { + Box::new(SyncParquetHandler) +} + +fn default_json_handler() -> Box { + Box::new(DefaultJsonHandler::new( + Arc::new(LocalFileSystem::new()), + Arc::new(TokioBackgroundExecutor::new()), + )) +} + +fn sync_json_handler() -> Box { + Box::new(SyncJsonHandler) +} + +#[rstest] +#[case::default_engine(default_parquet_handler())] +#[case::sync_engine(sync_parquet_handler())] +fn test_reads_footer(#[case] handler: Box) { + super::tests::test_parquet_handler_reads_footer(handler.as_ref()); +} + +#[rstest] +#[case::default_engine(default_parquet_handler())] +#[case::sync_engine(sync_parquet_handler())] +fn test_footer_errors_on_missing_file(#[case] handler: Box) { + super::tests::test_parquet_handler_footer_errors_on_missing_file(handler.as_ref()); +} + +#[rstest] +#[case::default_engine(default_parquet_handler())] +#[case::sync_engine(sync_parquet_handler())] +fn test_footer_preserves_field_ids(#[case] handler: Box) { + super::tests::test_parquet_handler_footer_preserves_field_ids(handler.as_ref()); +} + +#[rstest] +#[case::default_engine(default_parquet_handler())] 
+#[case::sync_engine(sync_parquet_handler())] +fn test_write_always_overwrites(#[case] handler: Box) { + super::tests::test_parquet_handler_write_always_overwrites(handler.as_ref()); +} + +// Both kernel engines configure their parquet readers and writers to skip the Arrow IPC schema +// (ARROW:schema) in file metadata. The following tests verify this shared behavior. + +fn assert_no_arrow_schema(handler: &dyn ParquetHandler) { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("no_arrow_schema.parquet"); + let url = Url::from_file_path(&file_path).unwrap(); + + let data: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![( + "id", + Arc::new(Int64Array::from(vec![1, 2])) as Arc, + )]) + .unwrap(), + )); + handler + .write_parquet_file(url, Box::new(std::iter::once(Ok(data)))) + .unwrap(); + + let builder = + ParquetRecordBatchReaderBuilder::try_new(File::open(&file_path).unwrap()).unwrap(); + let kv = builder.metadata().file_metadata().key_value_metadata(); + let has = kv + .map(|kv| kv.iter().any(|e| e.key == ARROW_SCHEMA_META_KEY)) + .unwrap_or(false); + assert!( + !has, + "Parquet file should not contain embedded Arrow schema metadata" + ); +} + +fn assert_reads_file_with_arrow_schema_metadata(handler: &dyn ParquetHandler) { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("with_arrow_schema.parquet"); + + let batch = RecordBatch::try_from_iter(vec![( + "value", + Arc::new(Int64Array::from(vec![10, 20, 30])) as Arc, + )]) + .unwrap(); + let mut writer = + ArrowWriter::try_new(File::create(&file_path).unwrap(), batch.schema(), None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let file_meta = super::tests::file_meta_for(&file_path); + let schema = Arc::new(batch.schema().as_ref().try_into_kernel().unwrap()); + let batches: Vec = handler + .read_parquet_files(&[file_meta], schema, None) + .unwrap() + .map(|r| { + ArrowEngineData::try_from_engine_data(r.unwrap()) + .unwrap() + .into() + }) + .collect(); + + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 3); + assert_eq!( + batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values(), + &[10, 20, 30] + ); +} + +#[rstest] +#[case::default_engine(default_parquet_handler())] +#[case::sync_engine(sync_parquet_handler())] +fn test_write_file_omits_arrow_schema(#[case] handler: Box) { + assert_no_arrow_schema(handler.as_ref()); +} + +#[rstest] +#[case::default_engine(default_parquet_handler())] +#[case::sync_engine(sync_parquet_handler())] +fn test_reads_file_with_arrow_schema_metadata(#[case] handler: Box) { + assert_reads_file_with_arrow_schema_metadata(handler.as_ref()); +} + +#[rstest] +#[case::default_engine(default_json_handler())] +#[case::sync_engine(sync_json_handler())] +fn test_json_file_path_contract(#[case] handler: Box) { + super::tests::test_json_handler_file_path_contract(handler.as_ref()); +} diff --git a/kernel/src/engine/default/executor.rs b/kernel/src/engine/default/executor.rs index 86831bc11e..e65bbc9e5e 100644 --- a/kernel/src/engine/default/executor.rs +++ b/kernel/src/engine/default/executor.rs @@ -21,6 +21,11 @@ use crate::DeltaResult; /// on another thread. This could be a multi-threaded runtime, like Tokio's or /// could be a single-threaded runtime on a background thread. pub trait TaskExecutor: Send + Sync + 'static { + /// The type of guard returned for `enter` + type Guard<'a> + where + Self: 'a; + /// Block on the given future, returning its output. 
/// /// This should NOT panic if called within an async context. Thus it can't @@ -39,6 +44,9 @@ pub trait TaskExecutor: Send + Sync + 'static { where T: FnOnce() -> R + Send + 'static, R: Send + 'static; + + /// Enter the runtime context of this executor. + fn enter(&self) -> Self::Guard<'_>; } #[cfg(any(feature = "tokio", test))] @@ -46,17 +54,38 @@ pub mod tokio { use super::TaskExecutor; use futures::TryFutureExt; use futures::{future::BoxFuture, Future}; + use std::mem::ManuallyDrop; use std::sync::mpsc::channel; - use tokio::runtime::RuntimeFlavor; + use tokio::runtime::{EnterGuard, Handle, RuntimeFlavor}; - use crate::DeltaResult; + use crate::{DeltaResult, Error}; /// A [`TaskExecutor`] that uses the tokio single-threaded runtime in a /// background thread to service tasks. + /// + /// On drop, the background thread is joined to ensure the runtime is fully + /// shut down before the executor is destroyed. #[derive(Debug)] pub struct TokioBackgroundExecutor { - sender: tokio::sync::mpsc::Sender>, - _thread: std::thread::JoinHandle<()>, + sender: ManuallyDrop>>, + handle: Handle, + /// `Option` because `join` takes ownership; we `take` it in `Drop` to move the + /// handle out. Never `None` outside of `Drop`. + thread: Option>, + } + + impl Drop for TokioBackgroundExecutor { + fn drop(&mut self) { + // SAFETY: The inner `Sender` has not been dropped yet because this is + // the only drop site and `Drop::drop` runs exactly once. + // Drop sender first to close the channel, signaling the background + // thread to exit its recv loop. + unsafe { ManuallyDrop::drop(&mut self.sender) }; + // Join the thread so that runtime shutdown completes before we return. + if let Some(thread) = self.thread.take() { + let _ = thread.join(); + } + } } impl Default for TokioBackgroundExecutor { @@ -67,21 +96,26 @@ pub mod tokio { impl TokioBackgroundExecutor { pub fn new() -> Self { + let (handle_sender, handle_receiver) = std::sync::mpsc::channel::(); let (sender, mut receiver) = tokio::sync::mpsc::channel::>(50); let thread = std::thread::spawn(move || { let rt = tokio::runtime::Builder::new_current_thread() .enable_all() .build() .unwrap(); + let handle = rt.handle().clone(); + handle_sender.send(handle).unwrap(); rt.block_on(async move { while let Some(task) = receiver.recv().await { tokio::task::spawn(task); } }); }); + let handle = handle_receiver.recv().unwrap(); Self { - sender, - _thread: thread, + sender: ManuallyDrop::new(sender), + handle, + thread: Some(thread), } } } @@ -107,6 +141,8 @@ pub mod tokio { } impl TaskExecutor for TokioBackgroundExecutor { + type Guard<'a> = EnterGuard<'a>; + fn block_on(&self, task: T) -> T::Output where T: Future + Send + 'static, @@ -145,30 +181,83 @@ pub mod tokio { T: FnOnce() -> R + Send + 'static, R: Send + 'static, { - Box::pin(tokio::task::spawn_blocking(task).map_err(crate::Error::join_failure)) + Box::pin(tokio::task::spawn_blocking(task).map_err(Error::join_failure)) + } + + fn enter(&self) -> EnterGuard<'_> { + self.handle.enter() } } - /// A [`TaskExecutor`] that uses the tokio multi-threaded runtime. You can - /// create one based on a handle to an existing runtime, so it can share - /// the runtime with other parts of your application. + /// A [`TaskExecutor`] that uses the tokio multi-threaded runtime. + /// + /// You can create one based on a handle to an existing runtime (to share + /// the runtime with other parts of your application), or create one that + /// owns its own runtime. 
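// For illustration only (not part of this change): a minimal sketch of the two ways to
// construct this executor and of the new `enter` API, kept inside a comment so it does not
// alter the surrounding code. Everything referenced exists in this module or in tokio; the
// function name itself is made up.
//
//     fn executor_usage_sketch() -> DeltaResult<()> {
//         // Borrow an existing multi-threaded runtime through its handle...
//         let rt = tokio::runtime::Runtime::new().expect("runtime");
//         let shared = TokioMultiThreadExecutor::new(rt.handle().clone());
//         // `block_on` drives a future to completion from synchronous code.
//         assert_eq!(shared.block_on(async { 21 * 2 }), 42);
//
//         // ...or let the executor build and own a runtime itself (two workers here).
//         let owned = TokioMultiThreadExecutor::new_owned_runtime(Some(2), None)?;
//         // `enter` makes the owned runtime's context current on this thread, so
//         // runtime-dependent calls like `Handle::current()` succeed from sync code.
//         let _guard = owned.enter();
//         assert!(tokio::runtime::Handle::try_current().is_ok());
//         Ok(())
//     }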
#[derive(Debug)] pub struct TokioMultiThreadExecutor { handle: tokio::runtime::Handle, + /// If Some, this executor owns the runtime and will keep it alive. + /// If None, the executor borrows an external runtime via `handle`. + _runtime: Option, } impl TokioMultiThreadExecutor { + /// Create a new executor that uses an existing runtime's handle. pub fn new(handle: tokio::runtime::Handle) -> Self { assert_eq!( handle.runtime_flavor(), RuntimeFlavor::MultiThread, "TokioExecutor must be created with a multi-threaded runtime" ); - Self { handle } + Self { + handle, + _runtime: None, + } + } + + /// Create a new executor that owns its own multi-threaded Tokio runtime. + /// + /// # Parameters + /// - `worker_threads`: Number of worker threads. If `None`, uses Tokio's default. + /// See [`tokio::runtime::Builder::worker_threads`]. + /// - `max_blocking_threads`: Maximum number of threads for blocking operations. + /// If `None`, uses Tokio's default. See [`tokio::runtime::Builder::max_blocking_threads`]. + /// + /// # Errors + /// Returns an error if the runtime cannot be created. + pub fn new_owned_runtime( + worker_threads: Option, + max_blocking_threads: Option, + ) -> DeltaResult { + let mut builder = tokio::runtime::Builder::new_multi_thread(); + builder.enable_all(); + + if let Some(threads) = worker_threads { + builder.worker_threads(threads); + } + if let Some(max_blocking) = max_blocking_threads { + builder.max_blocking_threads(max_blocking); + } + + let runtime = builder + .build() + .map_err(|e| Error::generic(format!("Failed to create Tokio runtime: {e}")))?; + + let handle = runtime.handle().clone(); + Ok(Self { + handle, + _runtime: Some(runtime), + }) } } impl TaskExecutor for TokioMultiThreadExecutor { + type Guard<'a> = EnterGuard<'a>; + + // `block_on` uses `block_in_place`; If concurrent `block_on` calls exceed Tokio's `max_blocking_threads`, this can deadlock + // See: + // https://docs.rs/tokio/latest/tokio/runtime/struct.Builder.html#method.max_blocking_threads fn block_on(&self, task: T) -> T::Output where T: Future + Send + 'static, @@ -191,9 +280,23 @@ pub mod tokio { // We throw away the handle, but it should continue on. self.handle.spawn(fut); - receiver - .recv() - .expect("TokioMultiThreadExecutor has crashed") + let recv = || { + receiver + .recv() + .expect("TokioMultiThreadExecutor has crashed") + }; + + if tokio::runtime::Handle::try_current().is_ok() { + // Use block_in_place to tell Tokio we're about to block - this allows + // the runtime to move tasks off this worker's local queue so they can + // be stolen by other workers. Only use block_in_place if we're inside + // a Tokio runtime. + tokio::task::block_in_place(recv) + } else { + // If we're not inside a Tokio runtime, we can't use block_in_place, + // so we just block on the receiver. 
+ recv() + } } fn spawn(&self, task: F) @@ -208,7 +311,11 @@ pub mod tokio { T: FnOnce() -> R + Send + 'static, R: Send + 'static, { - Box::pin(tokio::task::spawn_blocking(task).map_err(crate::Error::join_failure)) + Box::pin(tokio::task::spawn_blocking(task).map_err(Error::join_failure)) + } + + fn enter(&self) -> EnterGuard<'_> { + self.handle.enter() } } @@ -246,5 +353,150 @@ pub mod tokio { let executor = TokioMultiThreadExecutor::new(tokio::runtime::Handle::current()); test_executor(executor).await; } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_nested_block_on_does_not_deadlock() { + use std::sync::Arc; + use std::time::Duration; + + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let executor_clone = executor.clone(); + + let (tx, rx) = channel::(); + + let handle = std::thread::spawn(move || { + // Outer block_on + let result = executor.block_on(async move { + // Inner block_on + let inner_result = executor_clone.block_on(async { + tokio::time::sleep(Duration::from_millis(1)).await; + 42 + }); + inner_result + 1 + }); + tx.send(result).ok(); + }); + + // Wait with timeout - if this times out, we have a deadlock + let timeout = Duration::from_secs(5); + let result = rx + .recv_timeout(timeout) + .expect("Timeout - likely deadlock in TokioMultiThreadExecutor::block_on"); + assert_eq!(result, 43); + handle.join().expect("thread panicked"); + } + + #[test] + fn test_tokio_multi_thread_executor_owned_runtime() { + let executor = TokioMultiThreadExecutor::new_owned_runtime(None, None) + .expect("Failed to create executor"); + + // Test block_on works + let result = executor.block_on(async { + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + 2 + 2 + }); + assert_eq!(result, 4, "block_on should return the correct result"); + + // Test spawn works + let (sender, receiver) = channel::(); + executor.spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + sender.send(2 + 2).unwrap(); + }); + let result = receiver.recv().expect("spawn task should send result"); + assert_eq!(result, 4, "spawned task should compute correct result"); + } + + #[test] + fn test_owned_runtime_small_pool_nested_block_on_deadlocks() { + use std::sync::Arc; + use std::time::Duration; + + // Create a small pool + let executor = Arc::new( + TokioMultiThreadExecutor::new_owned_runtime(Some(1), Some(1)) + .expect("Failed to create executor"), + ); + let e1 = executor.clone(); + let e2 = executor.clone(); + let e3 = executor.clone(); + + let (tx, rx) = channel::(); + + // Spawn a thread to do deeply nested block_on calls + std::thread::spawn(move || { + let result = executor.block_on(async move { + e1.block_on(async move { + e2.block_on(async move { + e3.block_on(async { + tokio::time::sleep(Duration::from_millis(1)).await; + 42 + }) + }) + }) + }); + tx.send(result).ok(); + }); + + // With 1 worker thread, 1 blocking thread and 4 nested block_on calls, this should deadlock + let timeout = Duration::from_millis(500); + let result = rx.recv_timeout(timeout); + + // Test passes if we got a timeout (deadlock occurred as expected) + // Test fails if we got a result (no deadlock - unexpected) + assert!( + result.is_err(), + "Expected deadlock with 1 worker thread, 1 blocking thread and 4 nested block_on calls", + ); + } + + #[test] + fn test_block_on_works_outside_tokio_runtime() { + let executor = TokioMultiThreadExecutor::new_owned_runtime(None, None) + .expect("Failed to create executor"); + + 
// Verify we're not inside a Tokio runtime + assert!( + tokio::runtime::Handle::try_current().is_err(), + "Test must run outside of a Tokio runtime" + ); + + // block_on should work even though we're not inside a Tokio runtime + let result = executor.block_on(async { + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + 42 + }); + assert_eq!(result, 42); + } + + #[rstest::rstest] + #[case::multithreaded( + TokioMultiThreadExecutor::new_owned_runtime(None, None).expect("Couldn't create multithreaded executor") + )] + #[case::background(TokioBackgroundExecutor::new())] + fn can_enter_a_runtime(#[case] executor: T) { + // Verify we're not inside a Tokio runtime + assert!( + tokio::runtime::Handle::try_current().is_err(), + "Test must run outside of a Tokio runtime" + ); + + let guard = executor.enter(); + + assert!( + tokio::runtime::Handle::try_current().is_ok(), + "Should have entered runtime" + ); + + drop(guard); + + assert!( + tokio::runtime::Handle::try_current().is_err(), + "Should have exited runtime" + ); + } } } diff --git a/kernel/src/engine/default/file_stream.rs b/kernel/src/engine/default/file_stream.rs index ab8c4b95e1..ad93fcf087 100644 --- a/kernel/src/engine/default/file_stream.rs +++ b/kernel/src/engine/default/file_stream.rs @@ -2,7 +2,6 @@ use std::collections::VecDeque; use std::mem; use std::ops::Range; use std::pin::Pin; -use std::sync::Arc; use std::task::{ready, Context, Poll}; use crate::arrow::array::RecordBatch; @@ -10,11 +9,8 @@ use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; use futures::future::BoxFuture; use futures::stream::{BoxStream, Stream, StreamExt}; use futures::FutureExt; -use tracing::error; -use super::executor::TaskExecutor; -use crate::engine::arrow_data::ArrowEngineData; -use crate::{DeltaResult, FileDataReadResultIterator, FileMeta}; +use crate::{DeltaResult, FileMeta}; /// A fallible future that resolves to a stream of [`RecordBatch`] /// cbindgen:ignore @@ -24,7 +20,7 @@ pub type FileOpenFuture = /// Generic API for opening a file using an [`ObjectStore`] and resolving to a /// stream of [`RecordBatch`] /// -/// [`ObjectStore`]: object_store::ObjectStore +/// [`ObjectStore`]: crate::object_store::ObjectStore pub trait FileOpener: Send + Unpin { /// Asynchronously open the specified file and return a stream /// of [`RecordBatch`] @@ -95,52 +91,6 @@ pub struct FileStream { } impl FileStream { - /// Creates a new `FileStream` from a given schema, `FileOpener`, and files list; the files are - /// processed asynchronously by the provided `TaskExecutor`. Returns an `Iterator` that consumes - /// the results. - pub fn new_async_read_iterator( - task_executor: Arc, - schema: ArrowSchemaRef, - file_opener: Box, - files: &[FileMeta], - readahead: usize, - ) -> DeltaResult { - let mut stream = FileStream::new(files.to_vec(), schema, file_opener)?; - - // This channel will become the output iterator - // The stream will execute in the background, and we allow up to `readahead` - // batches to be buffered in the channel. 
- let (sender, receiver) = std::sync::mpsc::sync_channel(readahead); - - let executor_for_block = task_executor.clone(); - task_executor.spawn(async move { - while let Some(res) = stream.next().await { - let sender_clone = sender.clone(); - let join_res = executor_for_block - .spawn_blocking(move || sender_clone.send(res)) - .await; - match join_res { - Ok(send_res) => match send_res { - Ok(()) => continue, - Err(_) => break, - }, - Err(je) => { - error!("Couldn't join spawned task, runtime is likely in bad state: {je}"); - // Send an error through the channel to be handled by the receiver - let _ = sender.send(Err(crate::Error::JoinFailure(format!( - "Failed to join spawned task: {je}", - )))); - break; - } - } - } - }); - - Ok(Box::new(receiver.into_iter().map(|rbr| { - rbr.map(|rb| Box::new(ArrowEngineData::new(rb)) as _) - }))) - } - /// Create a new `FileStream` using the given `FileOpener` to scan underlying files pub fn new( files: impl IntoIterator, diff --git a/kernel/src/engine/default/filesystem.rs b/kernel/src/engine/default/filesystem.rs index 73d06088c0..513ba6bb55 100644 --- a/kernel/src/engine/default/filesystem.rs +++ b/kernel/src/engine/default/filesystem.rs @@ -1,30 +1,135 @@ +use std::pin::Pin; use std::sync::Arc; +use std::task::{Context, Poll}; +use std::time::{Duration, Instant}; use bytes::Bytes; use delta_kernel_derive::internal_api; -use futures::stream::StreamExt; +use futures::stream::{self, BoxStream, Stream, StreamExt, TryStreamExt}; use itertools::Itertools; -use object_store::path::Path; -use object_store::{DynObjectStore, ObjectStore, PutMode}; use url::Url; use super::UrlExt; use crate::engine::default::executor::TaskExecutor; +use crate::metrics::{MetricEvent, MetricsReporter}; +use crate::object_store::path::Path; +use crate::object_store::{self, DynObjectStore, ObjectStore, PutMode}; use crate::{DeltaResult, Error, FileMeta, FileSlice, StorageHandler}; +/// Iterator wrapper that emits metrics when exhausted +/// +/// Generic over the inner iterator type and item type. +/// The `event_fn` receives (duration, num_files, bytes_read) to construct the appropriate MetricEvent. +/// Metrics are emitted either when the iterator is exhausted or when dropped. 
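// For illustration only (not part of this change): a sketch of a reporter that could consume
// the events emitted below. It assumes `MetricsReporter` needs only a `report` method taking a
// `MetricEvent` by value, and that the variants match the ones constructed in this file; treat
// the exact trait shape as an assumption.
//
//     #[derive(Debug)]
//     struct LoggingReporter;
//
//     impl MetricsReporter for LoggingReporter {
//         fn report(&self, event: MetricEvent) {
//             match event {
//                 MetricEvent::StorageListCompleted { duration, num_files } => {
//                     tracing::info!("listed {num_files} files in {duration:?}");
//                 }
//                 MetricEvent::StorageReadCompleted { duration, num_files, bytes_read } => {
//                     tracing::info!("read {bytes_read} bytes from {num_files} files in {duration:?}");
//                 }
//                 MetricEvent::StorageCopyCompleted { duration } => {
//                     tracing::info!("copy completed in {duration:?}");
//                 }
//                 // Other variants (e.g. JsonReadCompleted) elided here.
//                 _ => {}
//             }
//         }
//     }
//
//     // Hypothetical wiring: pass `Some(Arc::new(LoggingReporter))` as the third argument to
//     // `ObjectStoreStorageHandler::new`.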
+struct MetricsIterator { + inner: I, + reporter: Option>, + start: Instant, + num_files: u64, + bytes_read: u64, + event_fn: fn(Duration, u64, u64) -> MetricEvent, + _phantom: std::marker::PhantomData, +} + +impl MetricsIterator { + fn new( + inner: I, + reporter: Option>, + start: Instant, + event_fn: fn(Duration, u64, u64) -> MetricEvent, + ) -> Self { + Self { + inner, + reporter, + start, + num_files: 0, + bytes_read: 0, + event_fn, + _phantom: std::marker::PhantomData, + } + } + + fn emit_metrics_once(&mut self) { + if let Some(r) = self.reporter.take() { + r.report((self.event_fn)( + self.start.elapsed(), + self.num_files, + self.bytes_read, + )); + } + } +} + +impl Drop for MetricsIterator { + fn drop(&mut self) { + self.emit_metrics_once(); + } +} + +impl Stream for MetricsIterator +where + I: Stream> + Unpin, +{ + type Item = I::Item; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match futures::ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(item) => { + if item.is_ok() { + self.num_files += 1; + } + Poll::Ready(Some(item)) + } + None => { + self.emit_metrics_once(); + Poll::Ready(None) + } + } + } +} + +impl Stream for MetricsIterator +where + I: Stream> + Unpin, +{ + type Item = I::Item; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match futures::ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(item) => { + if let Ok(ref bytes) = item { + self.num_files += 1; + self.bytes_read += bytes.len() as u64; + } + Poll::Ready(Some(item)) + } + None => { + self.emit_metrics_once(); + Poll::Ready(None) + } + } + } +} + #[derive(Debug)] pub struct ObjectStoreStorageHandler { inner: Arc, task_executor: Arc, + reporter: Option>, readahead: usize, } impl ObjectStoreStorageHandler { #[internal_api] - pub(crate) fn new(store: Arc, task_executor: Arc) -> Self { + pub(crate) fn new( + store: Arc, + task_executor: Arc, + reporter: Option>, + ) -> Self { Self { inner: store, task_executor, + reporter, readahead: 10, } } @@ -36,84 +141,189 @@ impl ObjectStoreStorageHandler { } } +/// Native async implementation for list_from +async fn list_from_impl( + store: Arc, + path: Url, + reporter: Option>, +) -> DeltaResult>> { + let start = Instant::now(); + + // The offset is used for list-after; the prefix is used to restrict the listing to a specific directory. + // Unfortunately, `Path` provides no easy way to check whether a name is directory-like, + // because it strips trailing /, so we're reduced to manually checking the original URL. + let offset = Path::from_url_path(path.path())?; + let prefix = if path.path().ends_with('/') { + offset.clone() + } else { + let mut parts = offset.parts().collect_vec(); + if parts.pop().is_none() { + return Err(Error::Generic(format!( + "Offset path must not be a root directory. 
Got: '{path}'", + ))); + } + Path::from_iter(parts) + }; + + let has_ordered_listing = supports_ordered_listing(&path); + + let stream = store + .list_with_offset(Some(&prefix), &offset) + .map(move |meta| { + let meta = meta?; + let mut location = path.clone(); + location.set_path(&format!("/{}", meta.location.as_ref())); + Ok(FileMeta { + location, + last_modified: meta.last_modified.timestamp_millis(), + size: meta.size, + }) + }); + + if !has_ordered_listing { + // Local filesystem doesn't return sorted list - need to collect and sort + let mut items: Vec<_> = stream.try_collect().await?; + items.sort_unstable(); + + if let Some(r) = reporter { + r.report(MetricEvent::StorageListCompleted { + duration: start.elapsed(), + num_files: items.len() as u64, + }); + } + Ok(Box::pin(stream::iter(items.into_iter().map(Ok)))) + } else { + let stream = MetricsIterator::new( + stream, + reporter, + start, + |duration, num_files, _bytes_read| MetricEvent::StorageListCompleted { + duration, + num_files, + }, + ); + Ok(Box::pin(stream)) + } +} + +/// Native async implementation for read_files +async fn read_files_impl( + store: Arc, + files: Vec, + readahead: usize, + reporter: Option>, +) -> DeltaResult>> { + let start = Instant::now(); + let files = stream::iter(files).map(move |(url, range)| { + let store = store.clone(); + async move { + // Wasn't checking the scheme before calling to_file_path causing the url path to + // be eaten in a strange way. Now, if not a file scheme, just blindly convert to a path. + // https://docs.rs/url/latest/url/struct.Url.html#method.to_file_path has more + // details about why this check is necessary + let path = if url.scheme() == "file" { + let file_path = url + .to_file_path() + .map_err(|_| Error::InvalidTableLocation(format!("Invalid file URL: {url}")))?; + Path::from_absolute_path(file_path) + .map_err(|e| Error::InvalidTableLocation(format!("Invalid file path: {e}")))? + } else { + Path::from(url.path()) + }; + if url.is_presigned() { + // have to annotate type here or rustc can't figure it out + Ok::(reqwest::get(url).await?.bytes().await?) + } else if let Some(rng) = range { + Ok(store.get_range(&path, rng).await?) + } else { + let result = store.get(&path).await?; + Ok(result.bytes().await?) + } + } + }); + + // We allow executing up to `readahead` futures concurrently and + // buffer the results. This allows us to achieve async concurrency. + Ok(Box::pin(MetricsIterator::new( + files.buffered(readahead), + reporter, + start, + |duration, num_files, bytes_read| MetricEvent::StorageReadCompleted { + duration, + num_files, + bytes_read, + }, + ))) +} + +/// Native async implementation for copy_atomic +async fn copy_atomic_impl( + store: Arc, + src_path: Path, + dest_path: Path, + reporter: Option>, +) -> DeltaResult<()> { + let start = Instant::now(); + + // Read source file then write atomically with PutMode::Create. Note that a GET/PUT is not + // necessarily atomic, but since the source file is immutable, we aren't exposed to the + // possibility of source file changing while we do the PUT. + let data = store.get(&src_path).await?.bytes().await?; + let result = store + .put_opts(&dest_path, data.into(), PutMode::Create.into()) + .await; + + if let Some(r) = reporter { + r.report(MetricEvent::StorageCopyCompleted { + duration: start.elapsed(), + }); + } + + result.map_err(|e| match e { + object_store::Error::AlreadyExists { .. 
} => Error::FileAlreadyExists(dest_path.into()), + e => e.into(), + })?; + Ok(()) +} + +/// Native async implementation for put +async fn put_impl( + store: Arc, + path: Path, + data: Bytes, + overwrite: bool, +) -> DeltaResult<()> { + let put_mode = if overwrite { + PutMode::Overwrite + } else { + PutMode::Create + }; + let result = store.put_opts(&path, data.into(), put_mode.into()).await; + result.map_err(|e| match e { + object_store::Error::AlreadyExists { .. } => Error::FileAlreadyExists(path.into()), + e => e.into(), + })?; + Ok(()) +} + +/// Native async implementation for head +async fn head_impl(store: Arc, url: Url) -> DeltaResult { + let meta = store.head(&Path::from_url_path(url.path())?).await?; + Ok(FileMeta { + location: url, + last_modified: meta.last_modified.timestamp_millis(), + size: meta.size, + }) +} + impl StorageHandler for ObjectStoreStorageHandler { fn list_from( &self, path: &Url, ) -> DeltaResult>>> { - // The offset is used for list-after; the prefix is used to restrict the listing to a specific directory. - // Unfortunately, `Path` provides no easy way to check whether a name is directory-like, - // because it strips trailing /, so we're reduced to manually checking the original URL. - let offset = Path::from_url_path(path.path())?; - let prefix = if path.path().ends_with('/') { - offset.clone() - } else { - let mut parts = offset.parts().collect_vec(); - if parts.pop().is_none() { - return Err(Error::Generic(format!( - "Offset path must not be a root directory. Got: '{}'", - path.as_str() - ))); - } - Path::from_iter(parts) - }; - - let store = self.inner.clone(); - - // HACK to check if we're using a LocalFileSystem from ObjectStore. We need this because - // local filesystem doesn't return a sorted list by default. Although the `object_store` - // crate explicitly says it _does not_ return a sorted listing, in practice all the cloud - // implementations actually do: - // - AWS: - // [`ListObjectsV2`](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html) - // states: "For general purpose buckets, ListObjectsV2 returns objects in lexicographical - // order based on their key names." (Directory buckets are out of scope for now) - // - Azure: Docs state - // [here](https://learn.microsoft.com/en-us/rest/api/storageservices/enumerating-blob-resources): - // "A listing operation returns an XML response that contains all or part of the requested - // list. The operation returns entities in alphabetical order." - // - GCP: The [main](https://cloud.google.com/storage/docs/xml-api/get-bucket-list) doc - // doesn't indicate order, but [this - // page](https://cloud.google.com/storage/docs/xml-api/get-bucket-list) does say: "This page - // shows you how to list the [objects](https://cloud.google.com/storage/docs/objects) stored - // in your Cloud Storage buckets, which are ordered in the list lexicographically by name." 
- // So we just need to know if we're local and then if so, we sort the returned file list - let has_ordered_listing = path.scheme() != "file"; - - // This channel will become the iterator - let (sender, receiver) = std::sync::mpsc::sync_channel(4_000); - let url = path.clone(); - self.task_executor.spawn(async move { - let mut stream = store.list_with_offset(Some(&prefix), &offset); - - while let Some(meta) = stream.next().await { - match meta { - Ok(meta) => { - let mut location = url.clone(); - location.set_path(&format!("/{}", meta.location.as_ref())); - sender - .send(Ok(FileMeta { - location, - last_modified: meta.last_modified.timestamp_millis(), - size: meta.size, - })) - .ok(); - } - Err(e) => { - sender.send(Err(e.into())).ok(); - } - } - } - }); - - if !has_ordered_listing { - // This FS doesn't return things in the order we require - let mut fms: Vec = receiver.into_iter().try_collect()?; - fms.sort_unstable(); - Ok(Box::new(fms.into_iter().map(Ok))) - } else { - Ok(Box::new(receiver.into_iter())) - } + let future = list_from_impl(self.inner.clone(), path.clone(), self.reporter.clone()); + let iter = super::stream_future_to_iter(self.task_executor.clone(), future)?; + Ok(iter) // type coercion drops the unneeded Send bound } /// Read data specified by the start and end offset from the file. @@ -126,100 +336,121 @@ impl StorageHandler for ObjectStoreStorageHandler { &self, files: Vec, ) -> DeltaResult>>> { - let store = self.inner.clone(); - - // This channel will become the output iterator. - // Because there will already be buffering in the stream, we set the - // buffer size to 0. - let (sender, receiver) = std::sync::mpsc::sync_channel(0); - - self.task_executor.spawn( - futures::stream::iter(files) - .map(move |(url, range)| { - let store = store.clone(); - async move { - // Wasn't checking the scheme before calling to_file_path causing the url path to - // be eaten in a strange way. Now, if not a file scheme, just blindly convert to a path. - // https://docs.rs/url/latest/url/struct.Url.html#method.to_file_path has more - // details about why this check is necessary - let path = if url.scheme() == "file" { - let file_path = url.to_file_path().map_err(|_| { - Error::InvalidTableLocation(format!("Invalid file URL: {url}")) - })?; - Path::from_absolute_path(file_path).map_err(|e| { - Error::InvalidTableLocation(format!("Invalid file path: {e}")) - })? - } else { - Path::from(url.path()) - }; - if url.is_presigned() { - // have to annotate type here or rustc can't figure it out - Ok::(reqwest::get(url).await?.bytes().await?) - } else if let Some(rng) = range { - Ok(store.get_range(&path, rng).await?) - } else { - let result = store.get(&path).await?; - Ok(result.bytes().await?) - } - } - }) - // We allow executing up to `readahead` futures concurrently and - // buffer the results. This allows us to achieve async concurrency - // within a synchronous method. 
- .buffered(self.readahead) - .for_each(move |res| { - sender.send(res).ok(); - futures::future::ready(()) - }), + let future = read_files_impl( + self.inner.clone(), + files, + self.readahead, + self.reporter.clone(), ); + let iter = super::stream_future_to_iter(self.task_executor.clone(), future)?; + Ok(iter) // type coercion drops the unneeded Send bound + } - Ok(Box::new(receiver.into_iter())) + fn put(&self, path: &Url, data: Bytes, overwrite: bool) -> DeltaResult<()> { + let path = Path::from_url_path(path.path())?; + self.task_executor + .block_on(put_impl(self.inner.clone(), path, data, overwrite)) } fn copy_atomic(&self, src: &Url, dest: &Url) -> DeltaResult<()> { let src_path = Path::from_url_path(src.path())?; let dest_path = Path::from_url_path(dest.path())?; - let dest_path_str = dest_path.to_string(); - let store = self.inner.clone(); - - // Read source file then write atomically with PutMode::Create. Note that a GET/PUT is not - // necessarily atomic, but since the source file is immutable, we aren't exposed to the - // possiblilty of source file changing while we do the PUT. - self.task_executor.block_on(async move { - let data = store.get(&src_path).await?.bytes().await?; - - store - .put_opts(&dest_path, data.into(), PutMode::Create.into()) - .await - .map_err(|e| match e { - object_store::Error::AlreadyExists { .. } => { - Error::FileAlreadyExists(dest_path_str) - } - e => e.into(), - })?; - Ok(()) - }) + let future = copy_atomic_impl( + self.inner.clone(), + src_path, + dest_path, + self.reporter.clone(), + ); + self.task_executor.block_on(future) + } + + fn head(&self, path: &Url) -> DeltaResult { + let future = head_impl(self.inner.clone(), path.clone()); + self.task_executor.block_on(future) } } +/// Returns whether or not the [Url] can support ordered listing. +/// +/// When this returns false the default engine will need to collect a stream before returning, +/// which has a performance impact +/// +/// The current known situations where there are unordered listings are with filesystems and AWS S3 +/// Express One Zone directory buckets +/// +/// Although the `object_store` crate explicitly says it _does not_ return a sorted listing, in +/// practice many implementations actually do: +/// - AWS: +/// [`ListObjectsV2`](https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html) +/// states: "For general purpose buckets, ListObjectsV2 returns objects in lexicographical +/// order based on their key names." +/// - Azure: Docs state +/// [here](https://learn.microsoft.com/en-us/rest/api/storageservices/enumerating-blob-resources): +/// "A listing operation returns an XML response that contains all or part of the requested +/// list. The operation returns entities in alphabetical order." +/// - GCP: The [main](https://cloud.google.com/storage/docs/xml-api/get-bucket-list) doc +/// doesn't indicate order, but [this +/// page](https://cloud.google.com/storage/docs/xml-api/get-bucket-list) does say: "This page +/// shows you how to list the [objects](https://cloud.google.com/storage/docs/objects) stored +/// in your Cloud Storage buckets, which are ordered in the list lexicographically by name." 
+fn supports_ordered_listing(url: &Url) -> bool { + !((url.scheme() == "file") + // S3 Directory Buckets + || url.domain().map(|d| d.contains("--x-s3")).unwrap_or(false) + // S3 Directory Bucket Access Points + || url.domain().map(|d| d.contains("-xa-s3")).unwrap_or(false)) +} + #[cfg(test)] mod tests { use std::ops::Range; use std::time::Duration; use itertools::Itertools; - use object_store::memory::InMemory; - use object_store::{local::LocalFileSystem, ObjectStore}; use test_utils::delta_path_for_version; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; - use crate::engine::default::DefaultEngine; + use crate::engine::default::DefaultEngineBuilder; + use crate::object_store::memory::InMemory; + use crate::object_store::{local::LocalFileSystem, ObjectStore}; use crate::utils::current_time_duration; use crate::Engine as _; use super::*; + fn setup_test() -> ( + tempfile::TempDir, + Arc, + ObjectStoreStorageHandler, + ) { + let tmp = tempfile::tempdir().unwrap(); + let store = Arc::new(LocalFileSystem::new()); + let executor = Arc::new(TokioBackgroundExecutor::new()); + let handler = ObjectStoreStorageHandler::new(store.clone(), executor, None); + (tmp, store, handler) + } + + #[test] + fn test_ordered_listing_for_url() { + for (u, expected) in &[ + (Url::parse("file:///dev/null").unwrap(), false), + (Url::parse("s3://robbert").unwrap(), true), + (Url::parse("s3://robbert/likes/paths").unwrap(), true), + (Url::parse("s3://robbie-one-zone--x-s3").unwrap(), false), + ( + Url::parse("https://robbie-one-zone-xa-s3.us-east-2.amazonaws.biz").unwrap(), + false, + ), + ] { + assert_eq!( + *expected, + supports_ordered_listing(u), + "expected {expected} on {u:?}" + ); + } + } + #[tokio::test] async fn test_read_files() { let tmp = tempfile::tempdir().unwrap(); @@ -243,7 +474,7 @@ mod tests { let store = Arc::new(LocalFileSystem::new()); let executor = Arc::new(TokioBackgroundExecutor::new()); - let storage = ObjectStoreStorageHandler::new(store, executor); + let storage = ObjectStoreStorageHandler::new(store, executor, None); let mut slices: Vec = Vec::new(); @@ -274,7 +505,7 @@ mod tests { store.put(&name, data.clone().into()).await.unwrap(); let table_root = Url::parse("memory:///").expect("valid url"); - let engine = DefaultEngine::new(store, Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store).build(); let files: Vec<_> = engine .storage_handler() .list_from(&table_root.join("_delta_log").unwrap().join("0").unwrap()) @@ -304,7 +535,7 @@ mod tests { let url = Url::from_directory_path(tmp.path()).unwrap(); let store = Arc::new(LocalFileSystem::new()); - let engine = DefaultEngine::new(store, Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store).build(); let files = engine .storage_handler() .list_from(&url.join("_delta_log").unwrap().join("0").unwrap()) @@ -328,10 +559,7 @@ mod tests { #[tokio::test] async fn test_copy() { - let tmp = tempfile::tempdir().unwrap(); - let store = Arc::new(LocalFileSystem::new()); - let executor = Arc::new(TokioBackgroundExecutor::new()); - let handler = ObjectStoreStorageHandler::new(store.clone(), executor); + let (tmp, store, handler) = setup_test(); // basic let data = Bytes::from("test-data"); @@ -357,4 +585,85 @@ mod tests { let new_dest_url = Url::from_file_path(tmp.path().join("new_dest.txt")).unwrap(); assert!(handler.copy_atomic(&missing_url, &new_dest_url).is_err()); } + + #[tokio::test] + async fn test_head() { + let (tmp, store, handler) = setup_test(); + + 
let data = Bytes::from("test-content"); + let file_path = Path::from_absolute_path(tmp.path().join("test.txt")).unwrap(); + let write_time = current_time_duration().unwrap(); + store.put(&file_path, data.clone().into()).await.unwrap(); + + let file_url = Url::from_file_path(tmp.path().join("test.txt")).unwrap(); + let file_meta = handler.head(&file_url).unwrap(); + + assert_eq!(file_meta.location, file_url); + assert_eq!(file_meta.size, data.len() as u64); + + // Verify timestamp is within the expected range + let meta_time = Duration::from_millis(file_meta.last_modified as u64); + assert!( + meta_time.abs_diff(write_time) < Duration::from_millis(100), + "last_modified timestamp should be around {} ms, but was {} ms", + write_time.as_millis(), + meta_time.as_millis() + ); + } + + #[tokio::test] + async fn test_head_non_existent() { + let (tmp, _store, handler) = setup_test(); + + let missing_url = Url::from_file_path(tmp.path().join("missing.txt")).unwrap(); + let result = handler.head(&missing_url); + + assert!(matches!(result, Err(Error::FileNotFound(_)))); + } + + #[test] + fn test_put() { + let (tmp, _store, handler) = setup_test(); + + let data = Bytes::from("put-test-data"); + let file_url = Url::from_file_path(tmp.path().join("put.txt")).unwrap(); + handler.put(&file_url, data.clone(), false).unwrap(); + + // Read back via read_files and verify content + let read_back: Vec = handler + .read_files(vec![(file_url, None)]) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert_eq!(read_back.len(), 1); + assert_eq!(read_back[0], data); + } + + #[test] + fn test_put_already_exists() { + let (tmp, _store, handler) = setup_test(); + + let data = Bytes::from("original"); + let file_url = Url::from_file_path(tmp.path().join("put.txt")).unwrap(); + handler.put(&file_url, data, false).unwrap(); + + // Second put with overwrite=false should fail + let new_data = Bytes::from("updated"); + assert!(matches!( + handler.put(&file_url, new_data.clone(), false), + Err(Error::FileAlreadyExists(_)) + )); + + // Put with overwrite=true should succeed + handler.put(&file_url, new_data.clone(), true).unwrap(); + + // Verify the content was overwritten + let read_back: Vec = handler + .read_files(vec![(file_url, None)]) + .unwrap() + .map(|r| r.unwrap()) + .collect(); + assert_eq!(read_back.len(), 1); + assert_eq!(read_back[0], new_data); + } } diff --git a/kernel/src/engine/default/json.rs b/kernel/src/engine/default/json.rs index 5237b67c7e..9dfbb331bb 100644 --- a/kernel/src/engine/default/json.rs +++ b/kernel/src/engine/default/json.rs @@ -1,35 +1,31 @@ //! 
Default Json handler implementation use std::io::BufReader; -use std::ops::Range; -use std::sync::{mpsc, Arc}; +use std::sync::Arc; use std::task::Poll; -use crate::arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; use crate::arrow::json::ReaderBuilder; use crate::arrow::record_batch::RecordBatch; +use crate::object_store::path::Path; +use crate::object_store::{self, DynObjectStore, GetResultPayload, PutMode}; use bytes::{Buf, Bytes}; use futures::stream::{self, BoxStream}; use futures::{ready, StreamExt, TryStreamExt}; -use object_store::path::Path; -use object_store::{self, DynObjectStore, GetResultPayload, PutMode}; -use tracing::warn; use url::Url; use super::executor::TaskExecutor; -use crate::engine::arrow_conversion::TryFromKernel as _; -use crate::engine::arrow_data::ArrowEngineData; -use crate::engine::arrow_utils::parse_json as arrow_parse_json; -use crate::engine::arrow_utils::to_json_bytes; +use crate::engine::arrow_utils::{ + build_json_reorder_indices, fixup_json_read, json_arrow_schema, parse_json as arrow_parse_json, + to_json_bytes, +}; use crate::engine_data::FilteredEngineData; +use crate::metrics::{MetricEvent, MetricsReporter}; use crate::schema::SchemaRef; use crate::{ DeltaResult, EngineData, Error, FileDataReadResultIterator, FileMeta, JsonHandler, PredicateRef, }; -const DEFAULT_BUFFER_SIZE: usize = 1000; -const DEFAULT_BATCH_SIZE: usize = 1000; - #[derive(Debug)] pub struct DefaultJsonHandler { /// The object store to read files from @@ -43,6 +39,8 @@ pub struct DefaultJsonHandler { /// Limit the number of rows per batch. That is, for batch_size = N, then each RecordBatch /// yielded by the stream will have at most N rows. batch_size: usize, + /// Optional reporter for emitting [`MetricEvent::JsonReadCompleted`] events. + reporter: Option>, } impl DefaultJsonHandler { @@ -50,11 +48,18 @@ impl DefaultJsonHandler { Self { store, task_executor, - buffer_size: DEFAULT_BUFFER_SIZE, - batch_size: DEFAULT_BATCH_SIZE, + buffer_size: super::DEFAULT_BUFFER_SIZE, + batch_size: super::DEFAULT_BATCH_SIZE, + reporter: None, } } + /// Set a metrics reporter to receive [`MetricEvent::JsonReadCompleted`] events. + pub fn with_reporter(mut self, reporter: Option>) -> Self { + self.reporter = reporter; + self + } + /// Set the maximum number read requests to buffer in memory at once in /// [Self::read_json_files()]. /// @@ -84,6 +89,73 @@ impl DefaultJsonHandler { } } +/// Internal async implementation of read_json_files +async fn read_json_files_impl( + store: Arc, + files: Vec, + physical_schema: SchemaRef, + _predicate: Option, + batch_size: usize, + buffer_size: usize, +) -> DeltaResult>>> { + if files.is_empty() { + return Ok(Box::pin(stream::empty())); + } + + // Build Arrow schema from only the real JSON columns, omitting any metadata columns + // (e.g. FilePath) that the JSON reader cannot populate from the file content. + let json_arrow_schema = Arc::new(json_arrow_schema(&physical_schema)?); + // Build the reorder index vec once; apply it to every batch via reorder_struct_array. + let reorder_indices: Arc<[_]> = build_json_reorder_indices(&physical_schema)?.into(); + + // An iterator of futures that open each file and post-process each resulting batch. 
+ let file_futures = files.into_iter().map(move |file| { + let store = store.clone(); + let json_arrow_schema = json_arrow_schema.clone(); + let reorder_indices = reorder_indices.clone(); + async move { + let file_path = file.location.to_string(); + let batch_stream = open_json_file(store, json_arrow_schema, batch_size, file).await?; + // Re-insert synthesized metadata columns (e.g. file path) at their schema positions. + let tagged = batch_stream + .map(move |result| fixup_json_read(result?, &reorder_indices, &file_path)) + .boxed(); + Ok::<_, Error>(tagged) + } + }); + + // Create a stream from that iterator which buffers up to `buffer_size` futures at a time. + let result_stream = stream::iter(file_futures) + .buffered(buffer_size) + .try_flatten() + .map_ok(|e| -> Box { Box::new(e) }); + + Ok(Box::pin(result_stream)) +} + +/// Internal async implementation of write_json_file +/// Note: for now we just buffer all the data and write it out all at once +async fn write_json_file_impl( + store: Arc, + path: Url, + buffer: Vec, + overwrite: bool, +) -> DeltaResult<()> { + let put_mode = if overwrite { + PutMode::Overwrite + } else { + PutMode::Create + }; + + let path = Path::from_url_path(path.path())?; + let result = store.put_opts(&path, buffer.into(), put_mode.into()).await; + result.map_err(|e| match e { + object_store::Error::AlreadyExists { .. } => Error::FileAlreadyExists(path.to_string()), + e => e.into(), + })?; + Ok(()) +} + impl JsonHandler for DefaultJsonHandler { fn parse_json( &self, @@ -97,40 +169,33 @@ impl JsonHandler for DefaultJsonHandler { &self, files: &[FileMeta], physical_schema: SchemaRef, - _predicate: Option, + predicate: Option, ) -> DeltaResult { - if files.is_empty() { - return Ok(Box::new(std::iter::empty())); + let future = read_json_files_impl( + self.store.clone(), + files.to_vec(), + physical_schema, + predicate, + self.batch_size, + self.buffer_size, + ); + let inner = super::stream_future_to_iter(self.task_executor.clone(), future)?; + if let Some(reporter) = &self.reporter { + let num_files = files.len() as u64; + let bytes_read = files.iter().map(|f| f.size).sum(); + Ok(Box::new(super::ReadMetricsIterator::new( + inner, + reporter.clone(), + num_files, + bytes_read, + |num_files, bytes_read| MetricEvent::JsonReadCompleted { + num_files, + bytes_read, + }, + ))) + } else { + Ok(inner) } - - let schema = Arc::new(ArrowSchema::try_from_kernel(physical_schema.as_ref())?); - let file_opener = JsonOpener::new(self.batch_size, schema.clone(), self.store.clone()); - - let (tx, rx) = mpsc::sync_channel(self.buffer_size); - let files = files.to_vec(); - let buffer_size = self.buffer_size; - - self.task_executor.spawn(async move { - // an iterator of futures that open each file - let file_futures = files.into_iter().map(|file| file_opener.open(file, None)); - - // create a stream from that iterator which buffers up to `buffer_size` futures at a time - let mut stream = stream::iter(file_futures) - .buffered(buffer_size) - .try_flatten() - .map_ok(|record_batch| -> Box { - Box::new(ArrowEngineData::new(record_batch)) - }); - - // send each record batch over the channel - while let Some(item) = stream.next().await { - if tx.send(item).is_err() { - warn!("read_json receiver end of channel dropped before sending completed"); - } - } - }); - - Ok(Box::new(rx.into_iter())) } // note: for now we just buffer all the data and write it out all at once @@ -140,107 +205,80 @@ impl JsonHandler for DefaultJsonHandler { data: Box> + Send + '_>, overwrite: bool, ) -> 
DeltaResult<()> { - let buffer = to_json_bytes(data)?; - let put_mode = if overwrite { - PutMode::Overwrite - } else { - PutMode::Create - }; - - let store = self.store.clone(); // cheap Arc - let path = Path::from_url_path(path.path())?; - let path_str = path.to_string(); - self.task_executor - .block_on(async move { store.put_opts(&path, buffer.into(), put_mode.into()).await }) - .map_err(|e| match e { - object_store::Error::AlreadyExists { .. } => Error::FileAlreadyExists(path_str), - e => e.into(), - })?; - Ok(()) + self.task_executor.block_on(write_json_file_impl( + self.store.clone(), + path.clone(), + to_json_bytes(data)?, + overwrite, + )) } } -/// Opens JSON files and returns a stream of record batches -#[allow(missing_debug_implementations)] -pub struct JsonOpener { +/// Opens a JSON file and returns a stream of record batches +async fn open_json_file( + store: Arc, + schema: ArrowSchemaRef, batch_size: usize, - projected_schema: ArrowSchemaRef, - object_store: Arc, -} - -impl JsonOpener { - /// Returns a [`JsonOpener`] - pub fn new( - batch_size: usize, - projected_schema: ArrowSchemaRef, - object_store: Arc, - ) -> Self { - Self { - batch_size, - projected_schema, - object_store, + file_meta: FileMeta, +) -> DeltaResult>> { + let path = Path::from_url_path(file_meta.location.path())?; + let result = store.get(&path).await?; + let builder = ReaderBuilder::new(schema) + .with_batch_size(batch_size) + .with_coerce_primitive(true); + match result.payload { + GetResultPayload::File(file, _) => { + let reader = builder.build(BufReader::new(file))?; + let reader = futures::stream::iter(reader).map_err(Error::from); + + // Emit exactly one error, then stop the stream. We check seen_error BEFORE + // updating it so the first error passes through, but subsequent items don't. + // This is necessary because Arrow's Reader loops the same error indefinitely. + let mut seen_error = false; + let reader = reader.take_while(move |result| { + let return_this = !seen_error; + if result.is_err() { + seen_error = true; + } + futures::future::ready(return_this) + }); + Ok(reader.boxed()) } - } -} - -impl JsonOpener { - pub async fn open( - &self, - file_meta: FileMeta, - _: Option>, - ) -> DeltaResult>> { - let store = self.object_store.clone(); - let schema = self.projected_schema.clone(); - let batch_size = self.batch_size; - - let path = Path::from_url_path(file_meta.location.path())?; - match store.get(&path).await?.payload { - GetResultPayload::File(file, _) => { - let reader = ReaderBuilder::new(schema) - .with_batch_size(batch_size) - .build(BufReader::new(file))?; - Ok(futures::stream::iter(reader).map_err(Error::from).boxed()) - } - GetResultPayload::Stream(s) => { - let mut decoder = ReaderBuilder::new(schema) - .with_batch_size(batch_size) - .build_decoder()?; - - let mut input = s.map_err(Error::from); - let mut buffered = Bytes::new(); - - let s = futures::stream::poll_fn(move |cx| { - loop { - if buffered.is_empty() { - buffered = match ready!(input.poll_next_unpin(cx)) { - Some(Ok(b)) => b, - Some(Err(e)) => return Poll::Ready(Some(Err(e))), - None => break, - }; - } - let read = buffered.len(); - - // NB (from Decoder::decode docs): - // Read JSON objects from `buf` (param), returning the number of bytes read - // - // This method returns once `batch_size` objects have been parsed since the - // last call to [`Self::flush`], or `buf` is exhausted. 
Any remaining bytes - // should be included in the next call to [`Self::decode`] - let decoded = match decoder.decode(buffered.as_ref()) { - Ok(decoded) => decoded, - Err(e) => return Poll::Ready(Some(Err(e.into()))), + GetResultPayload::Stream(s) => { + let mut decoder = builder.build_decoder()?; + let mut input = s.map_err(Error::from); + let mut buffered = Bytes::new(); + let s = futures::stream::poll_fn(move |cx| { + loop { + if buffered.is_empty() { + buffered = match ready!(input.poll_next_unpin(cx)) { + Some(Ok(b)) => b, + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => break, }; + } - buffered.advance(decoded); - if decoded != read { - break; - } + // NB (from Decoder::decode docs): + // Read JSON objects from `buf` (param), returning the number of bytes read + // + // This method returns once `batch_size` objects have been parsed since the + // last call to [`Self::flush`], or `buf` is exhausted. Any remaining bytes + // should be included in the next call to [`Self::decode`] + let decoded = match decoder.decode(buffered.as_ref()) { + Ok(decoded) => decoded, + Err(e) => return Poll::Ready(Some(Err(e.into()))), + }; + + let read = buffered.len(); + buffered.advance(decoded); + if decoded != read { + break; } + } - Poll::Ready(decoder.flush().map_err(Error::from).transpose()) - }); - Ok(s.map_err(Error::from).boxed()) - } + Poll::Ready(decoder.flush().map_err(Error::from).transpose()) + }); + Ok(s.boxed()) } } } @@ -248,29 +286,31 @@ impl JsonOpener { #[cfg(test)] mod tests { use std::collections::{HashMap, HashSet, VecDeque}; + use std::ops::Range; use std::path::PathBuf; use std::sync::{mpsc, Arc, Mutex}; use std::task::Waker; use crate::actions::get_commit_schema; - use crate::arrow::array::{AsArray, Int32Array, RecordBatch, StringArray}; + use crate::arrow::array::{Array, AsArray, Int32Array, RecordBatch, StringArray}; use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; - use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::arrow_data::{ArrowEngineData, EngineDataArrowExt as _}; use crate::engine::default::executor::tokio::{ TokioBackgroundExecutor, TokioMultiThreadExecutor, }; + use crate::object_store::local::LocalFileSystem; + use crate::object_store::memory::InMemory; + use crate::object_store::PutMultipartOptions; + use crate::object_store::{ + GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, + PutPayload, PutResult, Result, + }; use crate::schema::{DataType as DeltaDataType, Schema, StructField}; use crate::utils::test_utils::string_array_to_engine_data; use futures::future; use itertools::Itertools; - use object_store::local::LocalFileSystem; - use object_store::memory::InMemory; - use object_store::PutMultipartOptions; - use object_store::{ - GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, PutOptions, - PutPayload, PutResult, Result, - }; use serde_json::json; + use tracing::info; // TODO: should just use the one from test_utils, but running into dependency issues fn into_record_batch(engine_data: Box) -> RecordBatch { @@ -484,7 +524,7 @@ mod tests { let json_strings = StringArray::from(vec![ 
r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, - r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, + r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, ]); @@ -496,6 +536,55 @@ mod tests { assert_eq!(batch.len(), 4); } + // Test that operationParameters with boolean/numeric primitives are coerced to strings. + // Some delta logs contain values like `"statsOnLoad": false` instead of `"statsOnLoad": "false"`. 
+ // Without `with_coerce_primitive(true)`, this would fail with: + // "whilst decoding field 'commitInfo': whilst decoding field 'operationParameters': expected string got false" + #[test] + fn test_parse_json_coerce_operation_parameters() { + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + + // JSON with operationParameters containing boolean and numeric primitives (not strings) + let json_strings = StringArray::from(vec![ + r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","statsOnLoad":false,"numRetries":5},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, + ]); + let output_schema = get_commit_schema().clone(); + + let batch: RecordBatch = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap() + .try_into_record_batch() + .unwrap(); + + assert_eq!(batch.num_rows(), 1); + + // Verify the operationParameters were parsed correctly with primitives coerced to strings + let commit_info = batch.column_by_name("commitInfo").unwrap().as_struct(); + let op_params = commit_info + .column_by_name("operationParameters") + .unwrap() + .as_map(); + + // The map should have 3 entries: mode, statsOnLoad, numRetries + let map_entries = op_params.value(0); + assert_eq!(map_entries.len(), 3); + + // Extract keys and values from the map + let keys = map_entries.column(0).as_string::(); + let values = map_entries.column(1).as_string::(); + + // Build a HashMap for easier lookup + let params: std::collections::HashMap<_, _> = (0..keys.len()) + .map(|i| (keys.value(i), values.value(i))) + .collect(); + + // Verify coerced primitive values: boolean false -> "false", integer 5 -> "5" + assert_eq!(params.get("statsOnLoad"), Some(&"false")); + assert_eq!(params.get("numRetries"), Some(&"5")); + assert_eq!(params.get("mode"), Some(&"ErrorIfExists")); + } + #[test] fn test_parse_json_drop_field() { let store = Arc::new(LocalFileSystem::new()); @@ -508,9 +597,7 @@ mod tests { let batch: RecordBatch = handler .parse_json(string_array_to_engine_data(json_strings), output_schema) .unwrap() - .into_any() - .downcast::() - .map(|sd| sd.into()) + .try_into_record_batch() .unwrap(); assert_eq!(batch.column(0).len(), 1); let add_array = batch.column_by_name("add").unwrap().as_struct(); @@ -623,6 +710,67 @@ mod tests { ); } + use crate::engine::default::DefaultEngineBuilder; + use crate::schema::StructType; + use crate::Engine; + use std::io::Write; + use tempfile::NamedTempFile; + + fn make_invalid_named_temp() -> (NamedTempFile, Url) { + let mut temp_file = NamedTempFile::new().expect("Failed to create temp file"); + write!(temp_file, r#"this is not valid json"#).expect("Failed to write to temp file"); + let path = temp_file.path(); + let file_url = Url::from_file_path(path).expect("Failed to create file URL"); + + info!("Created temporary malformed file at: {file_url}"); + (temp_file, file_url) + } + + #[test] + fn test_read_invalid_json() -> Result<(), Box> { + let _ = tracing_subscriber::fmt().try_init(); + let (_temp_file1, file_url1) = make_invalid_named_temp(); + let (_temp_file2, file_url2) = make_invalid_named_temp(); + let field = StructField::nullable("name", crate::schema::DataType::BOOLEAN); + let schema = Arc::new(StructType::try_new(vec![field]).unwrap()); + let 
default_engine = DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())).build(); + + // Helper to check that we get expected number of errors then stream ends + let check_errors = |file_urls: Vec<_>, expected_errors: usize| { + let file_vec: Vec<_> = file_urls + .into_iter() + .map(|url| FileMeta::new(url, 1, 1)) + .collect(); + + let mut iter = default_engine + .json_handler() + .read_json_files(&file_vec, schema.clone(), None) + .unwrap(); + + for _ in 0..expected_errors { + assert!( + iter.next().unwrap().is_err(), + "Read succeeded unexpectedly. The JSON should have been invalid." + ); + } + + assert!( + iter.next().is_none(), + "The stream should end once the read result fails" + ); + }; + + // CASE 1: Single failing file + info!("\nAttempting to read single malformed JSON file..."); + check_errors(vec![file_url1.clone()], 1); + + // CASE 2: Two failing files + info!("\nAttempting to read two malformed JSON files..."); + check_errors(vec![file_url1, file_url2], 2); + + Ok(()) + } + #[tokio::test(flavor = "multi_thread", worker_threads = 3)] async fn test_read_json_files_ordering() { // this test checks that the read_json_files method returns the files in order in the diff --git a/kernel/src/engine/default/mod.rs b/kernel/src/engine/default/mod.rs index 73331fdf9a..39c55f413c 100644 --- a/kernel/src/engine/default/mod.rs +++ b/kernel/src/engine/default/mod.rs @@ -7,10 +7,11 @@ //! the [executor] module. use std::collections::HashMap; +use std::future::Future; use std::sync::Arc; -use self::storage::parse_url_opts; -use object_store::DynObjectStore; +use futures::stream::{BoxStream, StreamExt as _}; +use itertools::Itertools as _; use url::Url; use self::executor::TaskExecutor; @@ -20,10 +21,13 @@ use self::parquet::DefaultParquetHandler; use super::arrow_conversion::TryFromArrow as _; use super::arrow_data::ArrowEngineData; use super::arrow_expression::ArrowEvaluationHandler; +use crate::metrics::MetricsReporter; +use crate::object_store::DynObjectStore; use crate::schema::Schema; use crate::transaction::WriteContext; use crate::{ - DeltaResult, Engine, EngineData, EvaluationHandler, JsonHandler, ParquetHandler, StorageHandler, + DeltaResult, Engine, EngineData, Error, EvaluationHandler, JsonHandler, ParquetHandler, + StorageHandler, }; pub mod executor; @@ -31,85 +35,290 @@ pub mod file_stream; pub mod filesystem; pub mod json; pub mod parquet; +pub mod stats; pub mod storage; +/// Converts a Stream-producing future to a synchronous iterator. +/// +/// This method performs the initial blocking call to extract the stream from the future, and each +/// subsequent call to `next` on the iterator translates to a blocking `stream.next()` call, using +/// the provided `task_executor`. Buffered streams allow concurrency in the form of prefetching, +/// because that initial call will attempt to populate the N buffer slots; every call to +/// `stream.next()` leaves an empty slot (out of N buffer slots) that the stream immediately +/// attempts to fill by launching another future that can make progress in the background while we +/// block on and consume each of the N-1 entries that precede it. +/// +/// This is an internal utility for bridging object_store's async API to +/// Delta Kernel's synchronous handler traits. 
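// --- Illustrative sketch (not part of this patch) ---------------------------------------
// The bridge described above, reduced to a plain `tokio` Runtime instead of the kernel's
// `TaskExecutor` (an assumption made only for this example; all names here are made up).
// Each `next()` blocks on a single item while `buffered(2)` keeps up to two of the
// remaining futures in flight, which is the prefetching behavior the doc comment describes.
use futures::stream::{self, BoxStream, StreamExt};

struct BlockingIter<T> {
    // `None` once the stream is exhausted; a finished stream must not be polled again.
    stream: Option<BoxStream<'static, T>>,
    rt: tokio::runtime::Runtime,
}

impl<T> Iterator for BlockingIter<T> {
    type Item = T;
    fn next(&mut self) -> Option<T> {
        let mut stream = self.stream.take()?;
        // Move the stream into the future, block for one item, then take the stream back.
        let (item, stream) = self.rt.block_on(async move { (stream.next().await, stream) });
        if item.is_some() {
            self.stream = Some(stream);
        }
        item
    }
}

fn main() {
    let rt = tokio::runtime::Runtime::new().unwrap();
    // Three "downloads"; with `buffered(2)`, up to two run ahead of the blocking consumer.
    let downloads = (0..3).map(|i| async move { i * 10 });
    let stream = stream::iter(downloads).buffered(2).boxed();
    let iter = BlockingIter { stream: Some(stream), rt };
    assert_eq!(iter.collect::<Vec<_>>(), vec![0, 10, 20]);
}
// --- end sketch --------------------------------------------------------------------------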
+pub(crate) fn stream_future_to_iter( + task_executor: Arc, + stream_future: impl Future>> + Send + 'static, +) -> DeltaResult + Send>> { + Ok(Box::new(BlockingStreamIterator { + stream: Some(task_executor.block_on(stream_future)?), + task_executor, + })) +} + +struct BlockingStreamIterator { + stream: Option>, + task_executor: Arc, +} + +impl Iterator for BlockingStreamIterator { + type Item = T; + + fn next(&mut self) -> Option { + // Move the stream into the future so we can block on it. + let mut stream = self.stream.take()?; + let (item, stream) = self + .task_executor + .block_on(async move { (stream.next().await, stream) }); + + // We must not poll an exhausted stream after it returned None. + if item.is_some() { + self.stream = Some(stream); + } + + item + } +} + +const DEFAULT_BUFFER_SIZE: usize = 1000; +const DEFAULT_BATCH_SIZE: usize = 1000; + +/// Wraps a [`FileDataReadResultIterator`] to emit a [`MetricEvent`] exactly once when the +/// iterator is either exhausted or dropped. Used by JSON and Parquet handlers to report +/// the number of files and bytes requested per `read_*_files` call. +pub(super) struct ReadMetricsIterator { + inner: crate::FileDataReadResultIterator, + reporter: Arc, + num_files: u64, + bytes_read: u64, + emitted: bool, + make_event: fn(u64, u64) -> crate::metrics::MetricEvent, +} + +impl ReadMetricsIterator { + pub(super) fn new( + inner: crate::FileDataReadResultIterator, + reporter: Arc, + num_files: u64, + bytes_read: u64, + make_event: fn(u64, u64) -> crate::metrics::MetricEvent, + ) -> Self { + Self { + inner, + reporter, + num_files, + bytes_read, + emitted: false, + make_event, + } + } + + fn emit_once(&mut self) { + if !self.emitted { + self.emitted = true; + self.reporter + .report((self.make_event)(self.num_files, self.bytes_read)); + } + } +} + +impl Iterator for ReadMetricsIterator { + type Item = crate::DeltaResult>; + + fn next(&mut self) -> Option { + let item = self.inner.next(); + if item.is_none() { + self.emit_once(); + } + item + } +} + +impl Drop for ReadMetricsIterator { + fn drop(&mut self) { + self.emit_once(); + } +} + #[derive(Debug)] pub struct DefaultEngine { object_store: Arc, + task_executor: Arc, storage: Arc>, json: Arc>, parquet: Arc>, evaluation: Arc, + metrics_reporter: Option>, } -impl DefaultEngine { - /// Create a new [`DefaultEngine`] instance - /// - /// # Parameters +/// Builder for creating [`DefaultEngine`] instances. +/// +/// # Example +/// +/// ```no_run +/// # use std::sync::Arc; +/// # use delta_kernel::engine::default::DefaultEngineBuilder; +/// # use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; +/// # use delta_kernel::object_store::local::LocalFileSystem; +/// // Build a DefaultEngine with default executor +/// let engine = DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())) +/// .build(); +/// +/// // Build with a custom executor +/// let engine = DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())) +/// .with_task_executor(Arc::new(TokioBackgroundExecutor::new())) +/// .build(); +/// ``` +#[derive(Debug)] +pub struct DefaultEngineBuilder { + object_store: Arc, + task_executor: Arc, + metrics_reporter: Option>, +} + +impl DefaultEngineBuilder { + /// Create a new [`DefaultEngineBuilder`] instance with the default executor. 
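// --- Illustrative sketch (not part of this patch) ---------------------------------------
// The "report exactly once, whether the iterator is drained or dropped early" pattern used
// by `ReadMetricsIterator` above, reduced to a closure-based wrapper. All names here are
// hypothetical.
use std::cell::Cell;

struct EmitOnce<I, F: FnMut()> {
    inner: I,
    emitted: bool,
    on_done: F,
}

impl<I, F: FnMut()> EmitOnce<I, F> {
    fn emit_once(&mut self) {
        if !self.emitted {
            self.emitted = true;
            (self.on_done)();
        }
    }
}

impl<I: Iterator, F: FnMut()> Iterator for EmitOnce<I, F> {
    type Item = I::Item;
    fn next(&mut self) -> Option<Self::Item> {
        let item = self.inner.next();
        if item.is_none() {
            self.emit_once(); // normal exhaustion
        }
        item
    }
}

impl<I, F: FnMut()> Drop for EmitOnce<I, F> {
    fn drop(&mut self) {
        self.emit_once(); // consumer stopped early: still report, but never twice
    }
}

fn main() {
    let reports = Cell::new(0);
    let wrapped = EmitOnce {
        inner: 0..3,
        emitted: false,
        on_done: || reports.set(reports.get() + 1),
    };
    // Stop after one item; the report still fires exactly once, via Drop.
    let _ = wrapped.take(1).count();
    assert_eq!(reports.get(), 1);
}
// --- end sketch --------------------------------------------------------------------------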
+ pub fn new(object_store: Arc) -> Self { + Self { + object_store, + task_executor: Arc::new(executor::tokio::TokioBackgroundExecutor::new()), + metrics_reporter: None, + } + } +} + +impl DefaultEngineBuilder { + /// Set the metrics reporter for the engine. + pub fn with_metrics_reporter(mut self, reporter: Arc) -> Self { + self.metrics_reporter = Some(reporter); + self + } + + /// Set a custom task executor for the engine. /// - /// - `table_root`: The URL of the table within storage. - /// - `options`: key/value pairs of options to pass to the object store. - /// - `task_executor`: Used to spawn async IO tasks. See [executor::TaskExecutor]. - pub fn try_new( - table_root: &Url, - options: impl IntoIterator, - task_executor: Arc, - ) -> DeltaResult - where - K: AsRef, - V: Into, - { - // table root is the path of the table in the ObjectStore - let (object_store, _table_root) = parse_url_opts(table_root, options)?; - Ok(Self::new(Arc::new(object_store), task_executor)) + /// See [`executor::TaskExecutor`] for more details. + pub fn with_task_executor( + self, + task_executor: Arc, + ) -> DefaultEngineBuilder { + DefaultEngineBuilder { + object_store: self.object_store, + task_executor, + metrics_reporter: self.metrics_reporter, + } } - /// Create a new [`DefaultEngine`] instance + /// Build the [`DefaultEngine`] instance. + pub fn build(self) -> DefaultEngine { + DefaultEngine::new_with_opts(self.object_store, self.task_executor, self.metrics_reporter) + } +} + +impl DefaultEngine { + /// Create a [`DefaultEngineBuilder`] for constructing a [`DefaultEngine`] with custom options. /// /// # Parameters /// /// - `object_store`: The object store to use. - /// - `task_executor`: Used to spawn async IO tasks. See [executor::TaskExecutor]. - pub fn new(object_store: Arc, task_executor: Arc) -> Self { + pub fn builder( + object_store: Arc, + ) -> DefaultEngineBuilder { + DefaultEngineBuilder::new(object_store) + } +} + +impl DefaultEngine { + fn new_with_opts( + object_store: Arc, + task_executor: Arc, + metrics_reporter: Option>, + ) -> Self { Self { storage: Arc::new(ObjectStoreStorageHandler::new( object_store.clone(), task_executor.clone(), + metrics_reporter.clone(), )), - json: Arc::new(DefaultJsonHandler::new( - object_store.clone(), - task_executor.clone(), - )), - parquet: Arc::new(DefaultParquetHandler::new( - object_store.clone(), - task_executor, - )), + json: Arc::new( + DefaultJsonHandler::new(object_store.clone(), task_executor.clone()) + .with_reporter(metrics_reporter.clone()), + ), + parquet: Arc::new( + DefaultParquetHandler::new(object_store.clone(), task_executor.clone()) + .with_reporter(metrics_reporter.clone()), + ), object_store, + task_executor, evaluation: Arc::new(ArrowEvaluationHandler {}), + metrics_reporter, } } + /// Enter the runtime context of the executor associated with this engine. + /// + /// # Panics + /// + /// When calling `enter` multiple times, the returned guards **must** be dropped in the reverse + /// order that they were acquired. Failure to do so will result in a panic and possible memory + /// leaks. + pub fn enter(&self) -> ::Guard<'_> { + self.task_executor.enter() + } + pub fn get_object_store_for_url(&self, _url: &Url) -> Option> { Some(self.object_store.clone()) } + /// Write `data` as a parquet file using the provided `write_context`. + /// + /// The `partition_values` keys should use **logical** column names. They will be + /// automatically translated to physical names using the column mapping mode from + /// `write_context`. 
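// --- Illustrative sketch (not part of this patch) ---------------------------------------
// What the logical-to-physical partition-name translation below amounts to, isolated from
// WriteContext. The lookup closure and the physical name "col-a7f3" are made-up stand-ins
// for the table schema's column-mapping metadata; with mapping mode `none` the two names
// are identical.
use std::collections::HashMap;

fn to_physical(
    partition_values: HashMap<String, String>,
    physical_name_of: impl Fn(&str) -> Option<String>, // stand-in for the schema lookup
) -> Result<HashMap<String, String>, String> {
    partition_values
        .into_iter()
        .map(|(logical, value)| {
            let physical = physical_name_of(&logical).ok_or_else(|| {
                format!("Partition column '{logical}' not found in table schema")
            })?;
            Ok((physical, value))
        })
        .collect()
}

fn main() {
    let values = HashMap::from([("event_date".to_string(), "2024-01-01".to_string())]);
    let mapped = to_physical(values, |name| {
        (name == "event_date").then(|| "col-a7f3".to_string())
    })
    .unwrap();
    assert_eq!(mapped["col-a7f3"], "2024-01-01");
}
// --- end sketch --------------------------------------------------------------------------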
pub async fn write_parquet( &self, data: &ArrowEngineData, write_context: &WriteContext, partition_values: HashMap, ) -> DeltaResult> { + // Validate partition columns exist in the schema and translate logical names to physical names. + let physical_partition_values: HashMap = partition_values + .into_iter() + .map(|(logical_name, value)| -> DeltaResult<(String, String)> { + let field = write_context + .logical_schema() + .field(&logical_name) + .ok_or_else(|| { + Error::generic(format!( + "Partition column '{logical_name}' not found in table schema" + )) + })?; + let physical_name = field + .physical_name(write_context.column_mapping_mode()) + .to_string(); + Ok((physical_name, value)) + }) + .try_collect()?; + let transform = write_context.logical_to_physical(); let input_schema = Schema::try_from_arrow(data.record_batch().schema())?; - let output_schema = write_context.schema(); + let output_schema = write_context.physical_schema(); let logical_to_physical_expr = self.evaluation_handler().new_expression_evaluator( input_schema.into(), transform.clone(), output_schema.clone().into(), - ); + )?; let physical_data = logical_to_physical_expr.evaluate(data)?; self.parquet - .write_parquet_file(write_context.target_dir(), physical_data, partition_values) + .write_parquet_file( + write_context.target_dir(), + physical_data, + physical_partition_values, + Some(write_context.stats_columns()), + ) .await } } @@ -130,6 +339,10 @@ impl Engine for DefaultEngine { fn parquet_handler(&self) -> Arc { self.parquet.clone() } + + fn get_metrics_reporter(&self) -> Option> { + self.metrics_reporter.clone() + } } trait UrlExt { @@ -163,17 +376,88 @@ impl UrlExt for Url { #[cfg(test)] mod tests { - use super::executor::tokio::TokioBackgroundExecutor; use super::*; use crate::engine::tests::test_arrow_engine; - use object_store::local::LocalFileSystem; + use crate::metrics::MetricEvent; + use crate::object_store::local::LocalFileSystem; + + #[derive(Debug)] + struct TestMetricsReporter; + + impl MetricsReporter for TestMetricsReporter { + fn report(&self, _event: MetricEvent) {} + } #[test] fn test_default_engine() { let tmp = tempfile::tempdir().unwrap(); let url = Url::from_directory_path(tmp.path()).unwrap(); let object_store = Arc::new(LocalFileSystem::new()); - let engine = DefaultEngine::new(object_store, Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(object_store).build(); + test_arrow_engine(&engine, &url); + } + + #[test] + fn test_default_engine_builder_new_and_build() { + let tmp = tempfile::tempdir().unwrap(); + let url = Url::from_directory_path(tmp.path()).unwrap(); + let object_store = Arc::new(LocalFileSystem::new()); + let engine = DefaultEngineBuilder::new(object_store).build(); + test_arrow_engine(&engine, &url); + } + + #[test] + fn test_default_engine_builder_with_metrics_reporter() { + let tmp = tempfile::tempdir().unwrap(); + let url = Url::from_directory_path(tmp.path()).unwrap(); + let object_store = Arc::new(LocalFileSystem::new()); + let reporter = Arc::new(TestMetricsReporter); + let engine = DefaultEngineBuilder::new(object_store) + .with_metrics_reporter(reporter) + .build(); + assert!(engine.get_metrics_reporter().is_some()); + test_arrow_engine(&engine, &url); + } + + #[test] + fn test_default_engine_builder_with_custom_executor() { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .unwrap(); + let tmp = tempfile::tempdir().unwrap(); + let url = Url::from_directory_path(tmp.path()).unwrap(); + let 
object_store = Arc::new(LocalFileSystem::new()); + let executor = Arc::new(executor::tokio::TokioMultiThreadExecutor::new( + rt.handle().clone(), + )); + let engine = DefaultEngineBuilder::new(object_store) + .with_task_executor(executor) + .build(); + test_arrow_engine(&engine, &url); + } + + #[test] + fn test_default_engine_builder_method() { + let tmp = tempfile::tempdir().unwrap(); + let url = Url::from_directory_path(tmp.path()).unwrap(); + let object_store = Arc::new(LocalFileSystem::new()); + let engine = DefaultEngine::builder(object_store).build(); + test_arrow_engine(&engine, &url); + } + + #[test] + fn test_default_engine_builder_all_options() { + let tmp = tempfile::tempdir().unwrap(); + let url = Url::from_directory_path(tmp.path()).unwrap(); + let object_store = Arc::new(LocalFileSystem::new()); + let reporter = Arc::new(TestMetricsReporter); + let executor = Arc::new(executor::tokio::TokioBackgroundExecutor::new()); + let engine = DefaultEngineBuilder::new(object_store) + .with_metrics_reporter(reporter) + .with_task_executor(executor) + .build(); + assert!(engine.get_metrics_reporter().is_some()); test_arrow_engine(&engine, &url); } diff --git a/kernel/src/engine/default/parquet.rs b/kernel/src/engine/default/parquet.rs index ac63877ec8..5bec187ac5 100644 --- a/kernel/src/engine/default/parquet.rs +++ b/kernel/src/engine/default/parquet.rs @@ -4,22 +4,28 @@ use std::collections::HashMap; use std::ops::Range; use std::sync::Arc; +use delta_kernel_derive::internal_api; + use crate::arrow::array::builder::{MapBuilder, MapFieldNames, StringBuilder}; -use crate::arrow::array::{Int64Array, RecordBatch, StringArray, StructArray}; -use crate::arrow::datatypes::{DataType, Field}; +use crate::arrow::array::{Array, Int64Array, RecordBatch, StringArray, StructArray}; +use crate::arrow::datatypes::{DataType, Field, Schema}; +use crate::object_store::path::Path; +use crate::object_store::{DynObjectStore, ObjectStore}; use crate::parquet::arrow::arrow_reader::{ ArrowReaderMetadata, ArrowReaderOptions, ParquetRecordBatchReaderBuilder, }; -use crate::parquet::arrow::arrow_writer::ArrowWriter; +use crate::parquet::arrow::arrow_writer::{ArrowWriter, ArrowWriterOptions}; use crate::parquet::arrow::async_reader::{ParquetObjectReader, ParquetRecordBatchStreamBuilder}; -use futures::StreamExt; -use object_store::path::Path; -use object_store::DynObjectStore; +use crate::parquet::arrow::async_writer::AsyncArrowWriter; +use crate::parquet::arrow::async_writer::ParquetObjectWriter; +use futures::stream::{self, BoxStream}; +use futures::{StreamExt, TryStreamExt}; use uuid::Uuid; use super::file_stream::{FileOpenFuture, FileOpener, FileStream}; +use super::stats::collect_stats; use super::UrlExt; -use crate::engine::arrow_conversion::TryIntoArrow as _; +use crate::engine::arrow_conversion::{TryFromArrow as _, TryIntoArrow as _}; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::arrow_utils::{ fixup_parquet_read, generate_mask, get_requested_indices, ordering_needs_row_indexes, @@ -27,43 +33,58 @@ use crate::engine::arrow_utils::{ }; use crate::engine::default::executor::TaskExecutor; use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; -use crate::schema::SchemaRef; +use crate::expressions::ColumnName; +use crate::metrics::{MetricEvent, MetricsReporter}; +use crate::schema::{SchemaRef, StructType}; use crate::{ - DeltaResult, EngineData, Error, FileDataReadResultIterator, FileMeta, ParquetHandler, - PredicateRef, + DeltaResult, EngineData, Error, 
FileDataReadResultIterator, FileMeta, ParquetFooter, + ParquetHandler, PredicateRef, }; +/// Returns the standard [`ArrowReaderOptions`] for all kernel parquet reads. +/// +/// Skipping the embedded Arrow IPC schema avoids dependence on Arrow-specific metadata and +/// ensures that type resolution is driven by the kernel schema rather than the file's schema. +pub(in crate::engine) fn reader_options() -> ArrowReaderOptions { + ArrowReaderOptions::new().with_skip_arrow_metadata(true) +} + +/// Returns the standard [`ArrowWriterOptions`] for all kernel parquet writes. +/// +/// Omitting the Arrow IPC schema from the file metadata keeps Delta files interoperable with +/// non-Arrow readers and avoids encoding Arrow-specific type information. +pub(in crate::engine) fn writer_options() -> ArrowWriterOptions { + ArrowWriterOptions::new().with_skip_arrow_metadata(true) +} + #[derive(Debug)] pub struct DefaultParquetHandler { store: Arc, task_executor: Arc, readahead: usize, + /// Optional reporter for emitting [`MetricEvent::ParquetReadCompleted`] events. + reporter: Option>, } /// Metadata of a data file (typically a parquet file). -/// -/// Currently just includes the the number of records as statistics, but will expand to include -/// more statistics and other metadata in the future. #[derive(Debug)] pub struct DataFileMetadata { file_meta: FileMeta, - // NB: We use usize instead of u64 since arrow uses usize for record batch sizes - num_records: usize, + /// Collected statistics for this file (includes numRecords, tightBounds, etc.). + stats: StructArray, } impl DataFileMetadata { - pub fn new(file_meta: FileMeta, num_records: usize) -> Self { - Self { - file_meta, - num_records, - } + pub fn new(file_meta: FileMeta, stats: StructArray) -> Self { + Self { file_meta, stats } } /// Convert DataFileMetadata into a record batch which matches the schema returned by /// [`add_files_schema`]. /// - /// [`add_files_schema`]: crate::transaction::add_files_schema - fn as_record_batch( + /// [`add_files_schema`]: crate::transaction::Transaction::add_files_schema + #[internal_api] + pub(crate) fn as_record_batch( &self, partition_values: &HashMap, ) -> DeltaResult> { @@ -74,7 +95,8 @@ impl DataFileMetadata { last_modified, size, }, - num_records, + stats, + .. 
} = self; // create the record batch of the write metadata let path = Arc::new(StringArray::from(vec![location.to_string()])); @@ -88,7 +110,12 @@ impl DataFileMetadata { let mut builder = MapBuilder::new(Some(names), key_builder, val_builder); for (k, v) in partition_values { builder.keys().append_value(k); - builder.values().append_value(v); + if v.is_empty() { + // convert empty string to null as per the Delta Spec + builder.values().append_null(); + } else { + builder.values().append_value(v); + } } builder.append(true)?; let partitions = Arc::new(builder.finish()); @@ -98,20 +125,35 @@ impl DataFileMetadata { .map_err(|_| Error::generic("Failed to convert parquet metadata 'size' to i64"))?; let size = Arc::new(Int64Array::from(vec![size])); let modification_time = Arc::new(Int64Array::from(vec![*last_modified])); - let stats = Arc::new(StructArray::try_new_with_length( - vec![Field::new("numRecords", DataType::Int64, true)].into(), - vec![Arc::new(Int64Array::from(vec![*num_records as i64]))], - None, - 1, - )?); - Ok(Box::new(ArrowEngineData::new(RecordBatch::try_new( - Arc::new( - crate::transaction::BASE_ADD_FILES_SCHEMA - .as_ref() - .try_into_arrow()?, + let stats_array = Arc::new(stats.clone()); + + // Build schema dynamically based on stats (stats schema varies based on collected statistics) + let key_value_struct = DataType::Struct( + vec![ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ] + .into(), + ); + let schema = Schema::new(vec![ + Field::new("path", DataType::Utf8, false), + Field::new( + "partitionValues", + DataType::Map( + Arc::new(Field::new("key_value", key_value_struct, false)), + false, + ), + false, ), - vec![path, partitions, size, modification_time, stats], + Field::new("size", DataType::Int64, false), + Field::new("modificationTime", DataType::Int64, false), + Field::new("stats", stats_array.data_type().clone(), true), + ]); + + Ok(Box::new(ArrowEngineData::new(RecordBatch::try_new( + Arc::new(schema), + vec![path, partitions, size, modification_time, stats_array], )?))) } } @@ -122,6 +164,7 @@ impl DefaultParquetHandler { store, task_executor, readahead: 10, + reporter: None, } } @@ -133,6 +176,12 @@ impl DefaultParquetHandler { self } + /// Set a metrics reporter to receive [`MetricEvent::ParquetReadCompleted`] events. + pub fn with_reporter(mut self, reporter: Option>) -> Self { + self.reporter = reporter; + self + } + // Write `data` to `{path}/.parquet` as parquet using ArrowWriter and return the parquet // metadata (where `` is a generated UUIDv4). 
// @@ -142,13 +191,20 @@ impl DefaultParquetHandler { &self, path: &url::Url, data: Box, + stats_columns: &[ColumnName], ) -> DeltaResult { let batch: Box<_> = ArrowEngineData::try_from_engine_data(data)?; let record_batch = batch.record_batch(); - let num_records = record_batch.num_rows(); + + // Collect statistics before writing (includes numRecords) + let stats = collect_stats(record_batch, stats_columns)?; let mut buffer = vec![]; - let mut writer = ArrowWriter::try_new(&mut buffer, record_batch.schema(), None)?; + let mut writer = ArrowWriter::try_new_with_options( + &mut buffer, + record_batch.schema(), + writer_options(), + )?; writer.write(record_batch)?; writer.close()?; // writer must be closed to write footer @@ -179,7 +235,7 @@ impl DefaultParquetHandler { } let file_meta = FileMeta::new(path, modification_time, size); - Ok(DataFileMetadata::new(file_meta, num_records)) + Ok(DataFileMetadata::new(file_meta, stats)) } /// Write `data` to `{path}/.parquet` as parquet using ArrowWriter and return the parquet @@ -195,12 +251,76 @@ impl DefaultParquetHandler { path: &url::Url, data: Box, partition_values: HashMap, + stats_columns: Option<&[ColumnName]>, ) -> DeltaResult> { - let parquet_metadata = self.write_parquet(path, data).await?; + let parquet_metadata = self + .write_parquet(path, data, stats_columns.unwrap_or(&[])) + .await?; parquet_metadata.as_record_batch(&partition_values) } } +/// Internal async implementation of read_parquet_files +async fn read_parquet_files_impl( + store: Arc, + files: Vec, + physical_schema: SchemaRef, + predicate: Option, +) -> DeltaResult>>> { + if files.is_empty() { + return Ok(Box::pin(stream::empty())); + } + + let arrow_schema = Arc::new(physical_schema.as_ref().try_into_arrow()?); + + // get the first FileMeta to decide how to fetch the file. + // NB: This means that every file in `FileMeta` _must_ have the same scheme or things will break + // s3:// -> aws (ParquetOpener) + // nothing -> local (ParquetOpener) + // https:// -> assume presigned URL (and fetch without object_store) + // -> reqwest to get data + // -> parse to parquet + // SAFETY: we did is_empty check above, this is ok. + if files[0].location.is_presigned() { + let file_opener = Box::new(PresignedUrlOpener::new( + 1024, + physical_schema.clone(), + predicate, + )); + let stream = FileStream::new(files, arrow_schema, file_opener)?.map_ok( + |record_batch| -> Box { Box::new(ArrowEngineData::new(record_batch)) }, + ); + return Ok(Box::pin(stream)); + } + + // an iterator of futures that open each file + let file_futures = files.into_iter().map(move |file| { + let store = store.clone(); + let schema = physical_schema.clone(); + let predicate = predicate.clone(); + async move { + open_parquet_file( + store, + schema, + predicate, + None, + super::DEFAULT_BATCH_SIZE, + file, + ) + .await + } + }); + // create a stream from that iterator which buffers up to `buffer_size` futures at a time + let result_stream = stream::iter(file_futures) + .buffered(super::DEFAULT_BUFFER_SIZE) + .try_flatten() + .map_ok(|record_batch| -> Box { + Box::new(ArrowEngineData::new(record_batch)) + }); + + Ok(Box::pin(result_stream)) +} + impl ParquetHandler for DefaultParquetHandler { fn read_parquet_files( &self, @@ -208,141 +328,195 @@ impl ParquetHandler for DefaultParquetHandler { physical_schema: SchemaRef, predicate: Option, ) -> DeltaResult { - if files.is_empty() { - return Ok(Box::new(std::iter::empty())); - } - - // get the first FileMeta to decide how to fetch the file. 
- // NB: This means that every file in `FileMeta` _must_ have the same scheme or things will break - // s3:// -> aws (ParquetOpener) - // nothing -> local (ParquetOpener) - // https:// -> assume presigned URL (and fetch without object_store) - // -> reqwest to get data - // -> parse to parquet - // SAFETY: we did is_empty check above, this is ok. - let file_opener: Box = if files[0].location.is_presigned() { - Box::new(PresignedUrlOpener::new( - 1024, - physical_schema.clone(), - predicate, - )) - } else { - Box::new(ParquetOpener::new( - 1024, - physical_schema.clone(), - predicate, - self.store.clone(), - )) - }; - FileStream::new_async_read_iterator( - self.task_executor.clone(), - Arc::new(physical_schema.as_ref().try_into_arrow()?), - file_opener, - files, - self.readahead, - ) - } -} - -/// Implements [`FileOpener`] for a parquet file -struct ParquetOpener { - // projection: Arc<[usize]>, - batch_size: usize, - table_schema: SchemaRef, - predicate: Option, - limit: Option, - store: Arc, -} - -impl ParquetOpener { - pub(crate) fn new( - batch_size: usize, - table_schema: SchemaRef, - predicate: Option, - store: Arc, - ) -> Self { - Self { - batch_size, - table_schema, + let future = read_parquet_files_impl( + self.store.clone(), + files.to_vec(), + physical_schema, predicate, - limit: None, - store, + ); + let inner = super::stream_future_to_iter(self.task_executor.clone(), future)?; + if let Some(reporter) = &self.reporter { + let num_files = files.len() as u64; + let bytes_read = files.iter().map(|f| f.size).sum(); + Ok(Box::new(super::ReadMetricsIterator::new( + inner, + reporter.clone(), + num_files, + bytes_read, + |num_files, bytes_read| MetricEvent::ParquetReadCompleted { + num_files, + bytes_read, + }, + ))) + } else { + Ok(inner) } } -} -impl FileOpener for ParquetOpener { - fn open(&self, file_meta: FileMeta, _range: Option>) -> DeltaResult { - let path = Path::from_url_path(file_meta.location.path())?; + /// Writes engine data to a Parquet file at the specified location. + /// + /// This implementation uses asynchronous file I/O with object_store to write the Parquet file. + /// If a file already exists at the given location, it will be overwritten. + /// + /// # Parameters + /// + /// - `location` - The full URL path where the Parquet file should be written + /// (e.g., `s3://bucket/path/file.parquet`, `file:///path/to/file.parquet`). + /// - `data` - An iterator of engine data to be written to the Parquet file. + /// + /// # Returns + /// + /// A [`DeltaResult`] indicating success or failure. 
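// --- Illustrative usage sketch (not part of this patch) ---------------------------------
// Writing a single batch through the trait method below. The in-memory store and target
// URL are assumptions made for this example; `test_parquet_handler_trait_write` further
// down exercises the same path end to end.
use std::sync::Arc;
use url::Url;
use crate::arrow::array::{Array, Int64Array, RecordBatch};
use crate::engine::arrow_data::ArrowEngineData;
use crate::engine::default::executor::tokio::TokioBackgroundExecutor;
use crate::object_store::memory::InMemory;
use crate::{DeltaResult, EngineData, ParquetHandler};

fn write_one_batch() -> DeltaResult<()> {
    let handler = DefaultParquetHandler::new(
        Arc::new(InMemory::new()),
        Arc::new(TokioBackgroundExecutor::new()),
    );
    let batch = RecordBatch::try_from_iter(vec![(
        "id",
        Arc::new(Int64Array::from(vec![1, 2, 3])) as Arc<dyn Array>,
    )])
    .unwrap();
    let data: Box<dyn Iterator<Item = DeltaResult<Box<dyn EngineData>>> + Send> =
        Box::new(std::iter::once(
            Ok(Box::new(ArrowEngineData::new(batch)) as Box<dyn EngineData>),
        ));
    handler.write_parquet_file(Url::parse("memory:///out/data.parquet").unwrap(), data)
}
// --- end sketch --------------------------------------------------------------------------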
+ fn write_parquet_file( + &self, + location: url::Url, + mut data: Box>> + Send>, + ) -> DeltaResult<()> { let store = self.store.clone(); - let batch_size = self.batch_size; - // let projection = self.projection.clone(); - let table_schema = self.table_schema.clone(); - let predicate = self.predicate.clone(); - let limit = self.limit; + self.task_executor.block_on(async move { + let path = Path::from_url_path(location.path())?; + + // Get first batch to initialize writer with schema + let first_batch = data.next().ok_or_else(|| { + Error::generic("Cannot write parquet file with empty data iterator") + })??; + let first_arrow = ArrowEngineData::try_from_engine_data(first_batch)?; + let first_record_batch: RecordBatch = (*first_arrow).into(); + + let object_writer = ParquetObjectWriter::new(store, path); + let schema = first_record_batch.schema(); + let mut writer = + AsyncArrowWriter::try_new_with_options(object_writer, schema, writer_options())?; + + // Write the first batch + writer.write(&first_record_batch).await?; + + // Write remaining batches + for result in data { + let engine_data = result?; + let arrow_data = ArrowEngineData::try_from_engine_data(engine_data)?; + let batch: RecordBatch = (*arrow_data).into(); + writer.write(&batch).await?; + } - Ok(Box::pin(async move { - let mut reader = { - use object_store::ObjectStoreScheme; - // HACK: unfortunately, `ParquetObjectReader` under the hood does a suffix range - // request which isn't supported by Azure. For now we just detect if the URL is - // pointing to azure and if so, do a HEAD request so we can pass in file size to the - // reader which will cause the reader to avoid a suffix range request. - // see also: https://github.com/delta-io/delta-kernel-rs/issues/968 - // - // TODO(#1010): Note that we don't need this at all and can actually just _always_ - // do the `with_file_size` but need to (1) update our unit tests which often - // hardcode size=0 and (2) update CDF execute which also hardcodes size=0. - if let Ok((ObjectStoreScheme::MicrosoftAzure, _)) = - ObjectStoreScheme::parse(&file_meta.location) - { - // also note doing HEAD then actual GET isn't atomic, and leaves us vulnerable - // to file changing between the two calls. 
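// --- Illustrative usage sketch (not part of this patch) ---------------------------------
// Discovering a file's schema from its footer alone, without decoding any row groups, via
// the `read_parquet_footer` method added above. The local path and file size are
// assumptions made for this example; see `test_parquet_footer_read_with_field_id` below.
use std::sync::Arc;
use url::Url;
use crate::engine::default::executor::tokio::TokioBackgroundExecutor;
use crate::object_store::local::LocalFileSystem;
use crate::{DeltaResult, FileMeta, ParquetHandler};

fn print_footer_schema(path: &std::path::Path, file_size: u64) -> DeltaResult<()> {
    let handler = DefaultParquetHandler::new(
        Arc::new(LocalFileSystem::new()),
        Arc::new(TokioBackgroundExecutor::new()),
    );
    // Assumes `path` is absolute so it converts cleanly to a file:// URL.
    let file_meta = FileMeta::new(Url::from_file_path(path).unwrap(), 0, file_size);
    let footer = handler.read_parquet_footer(&file_meta)?;
    for field in footer.schema.fields() {
        println!("{}: {:?}", field.name(), field.data_type());
    }
    Ok(())
}
// --- end sketch --------------------------------------------------------------------------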
- let meta = store.head(&path).await?; - ParquetObjectReader::new(store, path).with_file_size(meta.size) - } else { - ParquetObjectReader::new(store, path) - } - }; + writer.finish().await?; - let metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()).await?; - let parquet_schema = metadata.schema(); - let (indices, requested_ordering) = - get_requested_indices(&table_schema, parquet_schema)?; - let options = ArrowReaderOptions::new(); //.with_page_index(enable_page_index); - let mut builder = - ParquetRecordBatchStreamBuilder::new_with_options(reader, options).await?; - if let Some(mask) = generate_mask( - &table_schema, - parquet_schema, - builder.parquet_schema(), - &indices, - ) { - builder = builder.with_projection(mask) - } + Ok(()) + }) + } - // Only create RowIndexBuilder if row indexes are actually needed - let mut row_indexes = ordering_needs_row_indexes(&requested_ordering) - .then(|| RowIndexBuilder::new(builder.metadata().row_groups())); + fn read_parquet_footer(&self, file: &FileMeta) -> DeltaResult { + let store = self.store.clone(); + let location = file.location.clone(); + let file_size = file.size; + + self.task_executor.block_on(async move { + let metadata = if location.is_presigned() { + let client = reqwest::Client::new(); + let response = + client.get(location.as_str()).send().await.map_err(|e| { + Error::generic(format!("Failed to fetch presigned URL: {e}")) + })?; + let bytes = response + .bytes() + .await + .map_err(|e| Error::generic(format!("Failed to read response bytes: {e}")))?; + ArrowReaderMetadata::load(&bytes, reader_options())? + } else { + let path = Path::from_url_path(location.path())?; + let mut reader = ParquetObjectReader::new(store, path).with_file_size(file_size); + ArrowReaderMetadata::load_async(&mut reader, reader_options()).await? + }; - // Filter row groups and row indexes if a predicate is provided - if let Some(ref predicate) = predicate { - builder = builder.with_row_group_filter(predicate, row_indexes.as_mut()); - } - if let Some(limit) = limit { - builder = builder.with_limit(limit) - } + let schema = StructType::try_from_arrow(metadata.schema().as_ref()) + .map(Arc::new) + .map_err(Error::Arrow)?; + Ok(ParquetFooter { schema }) + }) + } +} - let mut row_indexes = row_indexes.map(|rb| rb.build()).transpose()?; - let stream = builder.with_batch_size(batch_size).build()?; +/// Opens a Parquet file and returns a stream of record batches +async fn open_parquet_file( + store: Arc, + table_schema: SchemaRef, + predicate: Option, + limit: Option, + batch_size: usize, + file_meta: FileMeta, +) -> DeltaResult>> { + let file_location = file_meta.location.to_string(); + let path = Path::from_url_path(file_meta.location.path())?; + + let mut reader = { + use crate::object_store::ObjectStoreScheme; + // HACK: unfortunately, `ParquetObjectReader` under the hood does a suffix range + // request which isn't supported by Azure. For now we just detect if the URL is + // pointing to azure and if so, do a HEAD request so we can pass in file size to the + // reader which will cause the reader to avoid a suffix range request. + // see also: https://github.com/delta-io/delta-kernel-rs/issues/968 + + // Since the `Remove` action's size value is optional as specified in the delta protocol + // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#add-file-and-remove-file, + // the extracted size will be zero in this case. Thus, this function + // need to handle the case of zero file_meta.size. 
+ if file_meta.size != 0 { + ParquetObjectReader::new(store, path).with_file_size(file_meta.size) + } else if let Ok((ObjectStoreScheme::MicrosoftAzure, _)) = + ObjectStoreScheme::parse(&file_meta.location) + { + // also note doing HEAD then actual GET isn't atomic, and leaves us vulnerable + // to file changing between the two calls. + let meta = store.head(&path).await?; + ParquetObjectReader::new(store, path).with_file_size(meta.size) + } else { + ParquetObjectReader::new(store, path) + } + }; + + let reader_options = reader_options(); + let metadata = ArrowReaderMetadata::load_async(&mut reader, reader_options.clone()).await?; + let parquet_schema = metadata.schema(); + let (indices, requested_ordering) = get_requested_indices(&table_schema, parquet_schema)?; + let mut builder = + ParquetRecordBatchStreamBuilder::new_with_options(reader, reader_options).await?; + if let Some(mask) = generate_mask( + &table_schema, + parquet_schema, + builder.parquet_schema(), + &indices, + ) { + builder = builder.with_projection(mask) + } - let stream = stream.map(move |rbr| { - fixup_parquet_read(rbr?, &requested_ordering, row_indexes.as_mut()) - }); - Ok(stream.boxed()) - })) + // Only create RowIndexBuilder if row indexes are actually needed + let mut row_indexes = ordering_needs_row_indexes(&requested_ordering) + .then(|| RowIndexBuilder::new(builder.metadata().row_groups())); + + // Filter row groups and row indexes if a predicate is provided + if let Some(ref predicate) = predicate { + builder = builder.with_row_group_filter(predicate, row_indexes.as_mut()); } + if let Some(limit) = limit { + builder = builder.with_limit(limit) + } + + let mut row_indexes = row_indexes.map(|rb| rb.build()).transpose()?; + let stream = builder.with_batch_size(batch_size).build()?; + + let arrow_schema: Arc = Arc::new(table_schema.as_ref().try_into_arrow()?); + let stream = stream.map(move |rbr| { + fixup_parquet_read( + rbr?, + &requested_ordering, + row_indexes.as_mut(), + Some(&file_location), + Some(&arrow_schema), + ) + .map(Into::into) + }); + Ok(stream.boxed()) } /// Implements [`FileOpener`] for a opening a parquet file from a presigned URL @@ -377,18 +551,19 @@ impl FileOpener for PresignedUrlOpener { let predicate = self.predicate.clone(); let limit = self.limit; let client = self.client.clone(); // uses Arc internally according to reqwest docs + let file_location = file_meta.location.to_string(); Ok(Box::pin(async move { // fetch the file from the interweb - let reader = client.get(file_meta.location).send().await?.bytes().await?; - let metadata = ArrowReaderMetadata::load(&reader, Default::default())?; + let reader = client.get(&file_location).send().await?.bytes().await?; + let reader_options = reader_options(); + let metadata = ArrowReaderMetadata::load(&reader, reader_options.clone())?; let parquet_schema = metadata.schema(); let (indices, requested_ordering) = get_requested_indices(&table_schema, parquet_schema)?; - let options = ArrowReaderOptions::new(); let mut builder = - ParquetRecordBatchReaderBuilder::try_new_with_options(reader, options)?; + ParquetRecordBatchReaderBuilder::try_new_with_options(reader, reader_options)?; if let Some(mask) = generate_mask( &table_schema, parquet_schema, @@ -413,9 +588,17 @@ impl FileOpener for PresignedUrlOpener { let reader = builder.with_batch_size(batch_size).build()?; let mut row_indexes = row_indexes.map(|rb| rb.build()).transpose()?; + let arrow_schema: Arc = Arc::new(table_schema.as_ref().try_into_arrow()?); let stream = futures::stream::iter(reader); 
let stream = stream.map(move |rbr| { - fixup_parquet_read(rbr?, &requested_ordering, row_indexes.as_mut()) + fixup_parquet_read( + rbr?, + &requested_ordering, + row_indexes.as_mut(), + Some(&file_location), + Some(&arrow_schema), + ) + .map(Into::into) }); Ok(stream.boxed()) })) @@ -424,18 +607,26 @@ impl FileOpener for PresignedUrlOpener { #[cfg(test)] mod tests { + use std::collections::HashMap; use std::path::PathBuf; use std::slice; - use crate::arrow::array::{Array, RecordBatch}; - + use crate::arrow::array::{ + Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, Float64Array, + Int16Array, Int32Array, Int64Array, Int8Array, RecordBatch, StringArray, + TimestampMicrosecondArray, + }; + use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use crate::engine::arrow_conversion::TryIntoKernel as _; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; + use crate::engine::default::DEFAULT_BATCH_SIZE; + use crate::object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; + use crate::parquet::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY}; + use crate::schema::ColumnMetadataKey; use crate::EngineData; use itertools::Itertools; - use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; use url::Url; use crate::utils::current_time_ms; @@ -451,6 +642,65 @@ mod tests { .map(Into::into) } + async fn read_all_rows_helper(file_meta: FileMeta) -> DeltaResult> { + let store = Arc::new(LocalFileSystem::new()); + let path = Path::from_url_path(file_meta.location.path()).unwrap(); + let reader = ParquetObjectReader::new(store.clone(), path); + let physical_schema = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .schema() + .clone(); + let stream = open_parquet_file( + store, + Arc::new(physical_schema.try_into_kernel().unwrap()), + None, + None, + DEFAULT_BATCH_SIZE, + file_meta, + ) + .await + .unwrap(); + + let batches: Vec = stream.try_collect().await.unwrap(); + Ok(batches) + } + + #[tokio::test] + async fn test_open_parquet_file_with_size() { + let path = std::fs::canonicalize(PathBuf::from( + "./tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet" + )).unwrap(); + let file_size = std::fs::metadata(&path).unwrap().len(); + let url = Url::from_file_path(path).unwrap(); + let file_meta = FileMeta { + location: url, + last_modified: 0, + size: file_size, + }; + let data = read_all_rows_helper(file_meta).await.unwrap(); + + assert_eq!(data.len(), 1); + assert_eq!(data[0].num_rows(), 10); + } + + #[tokio::test] + async fn test_open_parquet_file_without_size() { + let path = std::fs::canonicalize(PathBuf::from( + "./tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet" + )).unwrap(); + let url = Url::from_file_path(path).unwrap(); + let file_meta = FileMeta { + location: url, + last_modified: 0, + size: 0, + }; + let data = read_all_rows_helper(file_meta).await.unwrap(); + + assert_eq!(data.len(), 1); + assert_eq!(data[0].num_rows(), 10); + } + #[tokio::test] async fn test_read_parquet_files() { let store = Arc::new(LocalFileSystem::new()); @@ -491,26 +741,38 @@ mod tests { assert_eq!(data[0].num_rows(), 10); } - #[test] - fn test_as_record_batch() { + #[rstest::rstest] + fn test_as_record_batch(#[values(true, false)] test_empty_str: bool) { let location = Url::parse("file:///test_url").unwrap(); let size = 1_000_000; let 
last_modified = 10000000000; let num_records = 10; let file_metadata = FileMeta::new(location.clone(), last_modified, size); - let data_file_metadata = DataFileMetadata::new(file_metadata, num_records); - let partition_values = HashMap::from([("partition1".to_string(), "a".to_string())]); + let stats = StructArray::try_new( + vec![ + Field::new("numRecords", ArrowDataType::Int64, true), + Field::new("tightBounds", ArrowDataType::Boolean, true), + ] + .into(), + vec![ + Arc::new(Int64Array::from(vec![num_records as i64])), + Arc::new(BooleanArray::from(vec![true])), + ], + None, + ) + .unwrap(); + let data_file_metadata = DataFileMetadata::new(file_metadata, stats.clone()); + let partition_value = if test_empty_str { + "".to_string() + } else { + "a".to_string() + }; + let partition_values = HashMap::from([("partition1".to_string(), partition_value)]); let actual = data_file_metadata .as_record_batch(&partition_values) .unwrap(); let actual = ArrowEngineData::try_from_engine_data(actual).unwrap(); - let schema = Arc::new( - crate::transaction::BASE_ADD_FILES_SCHEMA - .as_ref() - .try_into_arrow() - .unwrap(), - ); let mut partition_values_builder = MapBuilder::new( Some(MapFieldNames { entry: "key_value".to_string(), @@ -520,17 +782,42 @@ mod tests { StringBuilder::new(), StringBuilder::new(), ); + partition_values_builder.keys().append_value("partition1"); - partition_values_builder.values().append_value("a"); + if test_empty_str { + partition_values_builder.values().append_null(); // empty string should go to null + } else { + partition_values_builder.values().append_value("a"); + } partition_values_builder.append(true).unwrap(); let partition_values = partition_values_builder.finish(); - let stats_struct = StructArray::try_new_with_length( - vec![Field::new("numRecords", DataType::Int64, true)].into(), - vec![Arc::new(Int64Array::from(vec![num_records as i64]))], - None, - 1, - ) - .unwrap(); + + // Build expected schema dynamically based on stats + let stats_field = Field::new("stats", stats.data_type().clone(), true); + let schema = Arc::new(crate::arrow::datatypes::Schema::new(vec![ + Field::new("path", ArrowDataType::Utf8, false), + Field::new( + "partitionValues", + ArrowDataType::Map( + Arc::new(Field::new( + "key_value", + ArrowDataType::Struct( + vec![ + Field::new("key", ArrowDataType::Utf8, false), + Field::new("value", ArrowDataType::Utf8, true), + ] + .into(), + ), + false, + )), + false, + ), + false, + ), + Field::new("size", ArrowDataType::Int64, false), + Field::new("modificationTime", ArrowDataType::Int64, false), + stats_field, + ])); let expected = RecordBatch::try_new( schema, @@ -539,7 +826,7 @@ mod tests { Arc::new(partition_values), Arc::new(Int64Array::from(vec![size as i64])), Arc::new(Int64Array::from(vec![last_modified])), - Arc::new(stats_struct), + Arc::new(stats), ], ) .unwrap(); @@ -562,7 +849,7 @@ mod tests { )); let write_metadata = parquet_handler - .write_parquet(&Url::parse("memory:///data/").unwrap(), data) + .write_parquet(&Url::parse("memory:///data/").unwrap(), data, &[]) .await .unwrap(); @@ -573,7 +860,7 @@ mod tests { last_modified, size, }, - num_records, + ref stats, } = write_metadata; let expected_location = Url::parse("memory:///data/").unwrap(); @@ -591,6 +878,15 @@ mod tests { assert_eq!(&expected_location.join(filename).unwrap(), location); assert_eq!(expected_size, size); assert!(now - last_modified < 10_000); + + // Check numRecords from stats + let num_records = stats + .column_by_name("numRecords") + .unwrap() + .as_any() + 
.downcast_ref::() + .unwrap() + .value(0); assert_eq!(num_records, 3); // check we can read back @@ -633,9 +929,543 @@ mod tests { assert_result_error_with_message( parquet_handler - .write_parquet(&Url::parse("memory:///data").unwrap(), data) + .write_parquet(&Url::parse("memory:///data").unwrap(), data, &[]) .await, "Generic delta kernel error: Path must end with a trailing slash: memory:///data", ); } + + #[tokio::test] + async fn test_parquet_handler_trait_write() { + let store = Arc::new(InMemory::new()); + let parquet_handler: Arc = Arc::new(DefaultParquetHandler::new( + store.clone(), + Arc::new(TokioBackgroundExecutor::new()), + )); + + let engine_data: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![ + ( + "x", + Arc::new(Int64Array::from(vec![10, 20, 30])) as Arc, + ), + ( + "y", + Arc::new(Int64Array::from(vec![100, 200, 300])) as Arc, + ), + ]) + .unwrap(), + )); + + // Create iterator with single batch + let data_iter: Box>> + Send> = + Box::new(std::iter::once(Ok(engine_data))); + + // Test writing through the trait method + let file_url = Url::parse("memory:///test/data.parquet").unwrap(); + parquet_handler + .write_parquet_file(file_url.clone(), data_iter) + .unwrap(); + + // Verify we can read the file back + let path = Path::from_url_path(file_url.path()).unwrap(); + let metadata = store.head(&path).await.unwrap(); + let reader = ParquetObjectReader::new(store.clone(), path); + let physical_schema = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .schema() + .clone(); + + let file_meta = FileMeta { + location: file_url, + last_modified: 0, + size: metadata.size, + }; + + let data: Vec = parquet_handler + .read_parquet_files( + slice::from_ref(&file_meta), + Arc::new(physical_schema.try_into_kernel().unwrap()), + None, + ) + .unwrap() + .map(into_record_batch) + .try_collect() + .unwrap(); + + assert_eq!(data.len(), 1); + assert_eq!(data[0].num_rows(), 3); + assert_eq!(data[0].num_columns(), 2); + } + + #[tokio::test] + async fn test_parquet_handler_trait_write_and_read_roundtrip() { + let store = Arc::new(InMemory::new()); + let parquet_handler: Arc = Arc::new(DefaultParquetHandler::new( + store.clone(), + Arc::new(TokioBackgroundExecutor::new()), + )); + + // Create test data with all Delta-supported primitive types + let engine_data: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![ + // Byte (i8) + ( + "byte_col", + Arc::new(Int8Array::from(vec![1i8, 2, 3, 4, 5])) as Arc, + ), + // Short (i16) + ( + "short_col", + Arc::new(Int16Array::from(vec![100i16, 200, 300, 400, 500])) as Arc, + ), + // Integer (i32) + ( + "int_col", + Arc::new(Int32Array::from(vec![1000i32, 2000, 3000, 4000, 5000])) + as Arc, + ), + // Long (i64) + ( + "long_col", + Arc::new(Int64Array::from(vec![10000i64, 20000, 30000, 40000, 50000])) + as Arc, + ), + // Float (f32) + ( + "float_col", + Arc::new(Float32Array::from(vec![1.1f32, 2.2, 3.3, 4.4, 5.5])) + as Arc, + ), + // Double (f64) + ( + "double_col", + Arc::new(Float64Array::from(vec![1.11f64, 2.22, 3.33, 4.44, 5.55])) + as Arc, + ), + // Boolean + ( + "bool_col", + Arc::new(BooleanArray::from(vec![true, false, true, false, true])) + as Arc, + ), + // String + ( + "string_col", + Arc::new(StringArray::from(vec!["a", "b", "c", "d", "e"])) as Arc, + ), + // Binary + ( + "binary_col", + Arc::new(BinaryArray::from_vec(vec![ + b"bin1", b"bin2", b"bin3", b"bin4", b"bin5", + ])) as Arc, + ), + // Date + ( + "date_col", + Arc::new(Date32Array::from(vec![18262, 18263, 18264, 18265, 18266])) 
+ as Arc, // Days since epoch (2020-01-01 onwards) + ), + // Timestamp (with UTC timezone) + ( + "timestamp_col", + Arc::new( + TimestampMicrosecondArray::from(vec![ + 1609459200000000i64, // 2021-01-01 00:00:00 UTC + 1609545600000000i64, + 1609632000000000i64, + 1609718400000000i64, + 1609804800000000i64, + ]) + .with_timezone("UTC"), + ) as Arc, + ), + // TimestampNtz (without timezone) + ( + "timestamp_ntz_col", + Arc::new(TimestampMicrosecondArray::from(vec![ + 1609459200000000i64, // 2021-01-01 00:00:00 + 1609545600000000i64, + 1609632000000000i64, + 1609718400000000i64, + 1609804800000000i64, + ])) as Arc, + ), + // Decimal (precision 10, scale 2) + ( + "decimal_col", + Arc::new( + Decimal128Array::from(vec![12345i128, 23456, 34567, 45678, 56789]) + .with_precision_and_scale(10, 2) + .unwrap(), + ) as Arc, + ), + ]) + .unwrap(), + )); + + // Create iterator with single batch + let data_iter: Box>> + Send> = + Box::new(std::iter::once(Ok(engine_data))); + + // Write the data + let file_url = Url::parse("memory:///roundtrip/test.parquet").unwrap(); + parquet_handler + .write_parquet_file(file_url.clone(), data_iter) + .unwrap(); + + // Read it back + let path = Path::from_url_path(file_url.path()).unwrap(); + let metadata = store.head(&path).await.unwrap(); + let reader = ParquetObjectReader::new(store.clone(), path); + let physical_schema = ParquetRecordBatchStreamBuilder::new(reader) + .await + .unwrap() + .schema() + .clone(); + + let file_meta = FileMeta { + location: file_url.clone(), + last_modified: 0, + size: metadata.size, + }; + + let data: Vec = parquet_handler + .read_parquet_files( + slice::from_ref(&file_meta), + Arc::new(physical_schema.try_into_kernel().unwrap()), + None, + ) + .unwrap() + .map(into_record_batch) + .try_collect() + .unwrap(); + + // Verify the data + assert_eq!(data.len(), 1); + assert_eq!(data[0].num_rows(), 5); + assert_eq!(data[0].num_columns(), 13); + + let mut col_idx = 0; + + // Verify byte column + let byte_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(byte_col.values(), &[1i8, 2, 3, 4, 5]); + col_idx += 1; + + // Verify short column + let short_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(short_col.values(), &[100i16, 200, 300, 400, 500]); + col_idx += 1; + + // Verify int column + let int_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(int_col.values(), &[1000i32, 2000, 3000, 4000, 5000]); + col_idx += 1; + + // Verify long column + let long_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(long_col.values(), &[10000i64, 20000, 30000, 40000, 50000]); + col_idx += 1; + + // Verify float column + let float_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(float_col.values(), &[1.1f32, 2.2, 3.3, 4.4, 5.5]); + col_idx += 1; + + // Verify double column + let double_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(double_col.values(), &[1.11f64, 2.22, 3.33, 4.44, 5.55]); + col_idx += 1; + + // Verify bool column + let bool_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert!(bool_col.value(0)); + assert!(!bool_col.value(1)); + col_idx += 1; + + // Verify string column + let string_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(string_col.value(0), "a"); + assert_eq!(string_col.value(4), "e"); + col_idx += 1; + + // 
Verify binary column + let binary_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(binary_col.value(0), b"bin1"); + assert_eq!(binary_col.value(4), b"bin5"); + col_idx += 1; + + // Verify date column + let date_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(date_col.values(), &[18262, 18263, 18264, 18265, 18266]); + col_idx += 1; + + // Verify timestamp column (with UTC timezone) + let timestamp_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(timestamp_col.value(0), 1609459200000000i64); + assert_eq!(timestamp_col.value(4), 1609804800000000i64); + assert!(timestamp_col + .timezone() + .is_some_and(|tz| tz.eq_ignore_ascii_case("utc"))); + col_idx += 1; + + // Verify timestamp_ntz column (without timezone) + let timestamp_ntz_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(timestamp_ntz_col.value(0), 1609459200000000i64); + assert_eq!(timestamp_ntz_col.value(4), 1609804800000000i64); + assert!(timestamp_ntz_col.timezone().is_none()); + col_idx += 1; + + // Verify decimal column + let decimal_col = data[0] + .column(col_idx) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(decimal_col.value(0), 12345i128); + assert_eq!(decimal_col.value(4), 56789i128); + assert_eq!(decimal_col.precision(), 10); + assert_eq!(decimal_col.scale(), 2); + } + + /// Test that field IDs are accessible via ColumnMetadataKey::ParquetFieldId as documented. + /// + /// Per trait definitions in lib.rs, field IDs should be accessible via StructField::get_config_value + /// with ColumnMetadataKey::ParquetFieldId. + #[test] + fn test_parquet_footer_read_with_field_id() { + // Write parquet file with field ID + let field = Field::new("value", ArrowDataType::Int64, false).with_metadata(HashMap::from( + [(PARQUET_FIELD_ID_META_KEY.to_string(), "42".to_string())], + )); + let arrow_schema = Arc::new(ArrowSchema::new(vec![field])); + + let temp_dir = tempfile::tempdir().unwrap(); + let file_path = temp_dir.path().join("field_id_test.parquet"); + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![Arc::new(Int64Array::from(vec![1, 2, 3]))], + ) + .unwrap(); + + let file = std::fs::File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Read footer and verify field ID accessibility + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + let file_size = std::fs::metadata(&file_path).unwrap().len(); + let file_meta = FileMeta { + location: Url::from_file_path(&file_path).unwrap(), + last_modified: 0, + size: file_size, + }; + + let footer = handler.read_parquet_footer(&file_meta).unwrap(); + let field = footer + .schema + .fields() + .find(|f| f.name() == "value") + .unwrap(); + + // Field ID is transformed to kernel key when reading + assert_eq!( + field + .metadata() + .get(ColumnMetadataKey::ParquetFieldId.as_ref()), + Some(&"42".into()) + ); + + // Field ID should be accessible via documented API + let field_id = field.get_config_value(&ColumnMetadataKey::ParquetFieldId) + .expect("Field ID should be accessible via ColumnMetadataKey::ParquetFieldId per lib.rs:836-837"); + + match field_id { + crate::schema::MetadataValue::String(id) => assert_eq!(id, "42"), + crate::schema::MetadataValue::Number(id) => assert_eq!(*id, 42), + other => 
panic!("Expected String or Number, got {other:?}"), + } + } + + /// Test that columns are matched by field ID when column names differ. + /// + /// Per lib.rs:676-680, field IDs (via [`ColumnMetadataKey::ParquetFieldId`]) should take + /// precedence over field names for column matching. + /// + /// [`ColumnMetadataKey::ParquetFieldId`]: crate::schema::ColumnMetadataKey::ParquetFieldId + #[test] + fn test_read_parquet_with_field_id_matching() { + use crate::schema::{ColumnMetadataKey, MetadataValue, StructField, StructType}; + + // Write parquet with field IDs using PARQUET_FIELD_ID_META_KEY (Parquet's native key) + // The kernel will transform these to parquet.field.id when reading + let fields = vec![ + Field::new("id", ArrowDataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("name", ArrowDataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ]; + let arrow_schema = Arc::new(ArrowSchema::new(fields)); + + let temp_dir = tempfile::tempdir().unwrap(); + let file_path = temp_dir.path().join("field_id_matching.parquet"); + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["alice", "bob", "charlie"])), + ], + ) + .unwrap(); + + let file = std::fs::File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + // Create kernel schema with DIFFERENT names but SAME field IDs + let kernel_schema = Arc::new( + StructType::try_new(vec![ + StructField::new("user_id", crate::schema::DataType::LONG, false).with_metadata([ + ( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(1), + ), + ]), + StructField::new("user_name", crate::schema::DataType::STRING, false) + .with_metadata([( + ColumnMetadataKey::ParquetFieldId.as_ref(), + MetadataValue::Number(2), + )]), + ]) + .unwrap(), + ); + + // Read using kernel schema with different column names + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultParquetHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + let file_meta = FileMeta { + location: Url::from_file_path(&file_path).unwrap(), + last_modified: 0, + size: std::fs::metadata(&file_path).unwrap().len(), + }; + + // Should successfully match by field ID despite different names + let data: Vec = handler + .read_parquet_files(slice::from_ref(&file_meta), kernel_schema, None) + .unwrap() + .map(into_record_batch) + .try_collect() + .unwrap(); + + // Verify data was correctly matched by field ID + assert_eq!(data.len(), 1); + let batch = &data[0]; + + let id_col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_col.values(), &[1, 2, 3], "Should match by field ID 1"); + + let name_col = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_col.value(0), "alice", "Should match by field ID 2"); + assert_eq!(name_col.value(1), "bob"); + assert_eq!(name_col.value(2), "charlie"); + } + + // Verifies that write_parquet (the internal stats-collecting path) does not embed the Arrow + // IPC schema in the Parquet file metadata. 
+ #[tokio::test] + async fn write_parquet_omits_arrow_schema_metadata() { + let store = Arc::new(InMemory::new()); + let parquet_handler = + DefaultParquetHandler::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + + let data = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![( + "a", + Arc::new(Int64Array::from(vec![1, 2, 3])) as Arc, + )]) + .unwrap(), + )); + let metadata = parquet_handler + .write_parquet(&Url::parse("memory:///data/").unwrap(), data, &[]) + .await + .unwrap(); + + let path = Path::from_url_path(metadata.file_meta.location.path()).unwrap(); + let reader = ParquetObjectReader::new(store, path); + let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); + let kv = builder.metadata().file_metadata().key_value_metadata(); + let has = kv + .map(|kv| kv.iter().any(|e| e.key == ARROW_SCHEMA_META_KEY)) + .unwrap_or(false); + assert!( + !has, + "Parquet file should not contain embedded Arrow schema metadata" + ); + } } diff --git a/kernel/src/engine/default/stats.rs b/kernel/src/engine/default/stats.rs new file mode 100644 index 0000000000..42398db371 --- /dev/null +++ b/kernel/src/engine/default/stats.rs @@ -0,0 +1,1538 @@ +//! Statistics collection for Delta Lake file writes. +//! +//! Provides `collect_stats` to compute min, max, and null count statistics +//! for a single RecordBatch during file writes. + +use std::borrow::Cow; +use std::sync::Arc; + +use delta_kernel_derive::internal_api; + +use crate::arrow::array::{ + new_null_array, Array, ArrayRef, AsArray, BooleanArray, Decimal128Array, Int64Array, + LargeStringArray, PrimitiveArray, RecordBatch, StringArray, StringViewArray, StructArray, +}; +use crate::arrow::compute::kernels::aggregate::{max, max_string, min, min_string}; +use crate::arrow::datatypes::{ + ArrowPrimitiveType, DataType, Date32Type, Date64Type, Decimal128Type, Field, Float32Type, + Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, TimestampMicrosecondType, + TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, + UInt64Type, UInt8Type, +}; +use crate::column_trie::ColumnTrie; +use crate::engine::arrow_utils::fix_nested_null_masks; +use crate::expressions::ColumnName; +use crate::{DeltaResult, Error}; + +/// Maximum prefix length for string statistics (Delta protocol requirement). +const STRING_PREFIX_LENGTH: usize = 32; + +/// Maximum expansion when searching for a valid max truncation point. +const STRING_EXPANSION_LIMIT: usize = STRING_PREFIX_LENGTH * 2; + +/// ASCII DEL character (0x7F) - used as tie-breaker for max values when truncated char is ASCII. +const ASCII_MAX_CHAR: char = '\x7F'; + +/// Maximum Unicode code point - used as tie-breaker for max values when truncated char is non-ASCII. +const UTF8_MAX_CHAR: char = '\u{10FFFF}'; + +// ============================================================================ +// String truncation for Delta statistics +// ============================================================================ + +/// Truncate a string for min statistics. +/// +/// For min values, we simply truncate at the prefix length. The truncated value will always +/// be <= the original, which is correct for min statistics. +/// +/// Returns the original string if it's already within the limit. 
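+///
+/// A quick sketch of the behavior (illustrative only, not a doctest):
+///
+/// ```ignore
+/// assert_eq!(truncate_min_string("short"), "short"); // within the limit, unchanged
+/// let long = "a".repeat(40);
+/// assert_eq!(truncate_min_string(&long), "a".repeat(32)); // cut to the 32-char prefix
+/// ```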
+fn truncate_min_string(s: &str) -> &str { + if s.len() <= STRING_PREFIX_LENGTH { + return s; + } + // Find char boundary at or before STRING_PREFIX_LENGTH + let end = s + .char_indices() + .take(STRING_PREFIX_LENGTH + 1) + .last() + .map(|(i, _)| i) + .unwrap_or(s.len()); + + // Take exactly STRING_PREFIX_LENGTH chars + let truncated_end = s + .char_indices() + .nth(STRING_PREFIX_LENGTH) + .map(|(i, _)| i) + .unwrap_or(end); + + &s[..truncated_end] +} + +/// Truncate a string for max statistics. +/// +/// For max values, we need to ensure the truncated value is >= all actual values in the column. +/// We do this by appending a "tie-breaker" character after truncation: +/// - ASCII_MAX_CHAR (0x7F) if the character at the truncation point is ASCII (< 0x7F) +/// - UTF8_MAX_CHAR (U+10FFFF) otherwise +/// +/// This ensures correct data skipping behavior: any string starting with the truncated prefix +/// will compare <= the truncated max + tie-breaker. +/// +/// Returns `Cow::Borrowed` if no truncation needed (avoiding allocation), `Cow::Owned` when +/// truncation is performed, or `None` if the string is too long to truncate safely. +fn truncate_max_string(s: &str) -> Option> { + if s.len() <= STRING_PREFIX_LENGTH { + return Some(Cow::Borrowed(s)); + } + + // Start at STRING_PREFIX_LENGTH chars + let char_indices: Vec<(usize, char)> = s.char_indices().collect(); + + // We can expand up to STRING_EXPANSION_LIMIT chars looking for a valid truncation point + let max_chars = char_indices.len().min(STRING_EXPANSION_LIMIT); + + // Start from STRING_PREFIX_LENGTH and look for a valid truncation point + for len in STRING_PREFIX_LENGTH..=max_chars { + if len >= char_indices.len() { + // Reached end of string - return original + return Some(Cow::Borrowed(s)); + } + + let (_, next_char) = char_indices[len]; + + // If the character being truncated is U+10FFFF (max Unicode code point), we cannot + // use this position. The tie-breaker must be >= the truncated char, but nothing is + // greater than U+10FFFF. Include it in the prefix and check the next character. + // (In Scala/Java this is a surrogate pair requiring substring check; in Rust it's one char) + if next_char == UTF8_MAX_CHAR { + continue; + } + + let truncation_byte_idx = char_indices[len].0; + let truncated = &s[..truncation_byte_idx]; + + // Choose tie-breaker based on the character being truncated + let tie_breaker = if next_char < ASCII_MAX_CHAR { + ASCII_MAX_CHAR + } else { + UTF8_MAX_CHAR + }; + + return Some(Cow::Owned(format!("{truncated}{tie_breaker}"))); + } + + // Could not find a valid truncation point within expansion limit + None +} + +// ============================================================================ +// Min/Max computation using Arrow compute kernels +// ============================================================================ + +/// Aggregation type selector. +#[derive(Clone, Copy)] +enum Agg { + Min, + Max, +} + +/// Compute aggregation for a primitive array. 
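+///
+/// For example (illustrative call; the helper is internal), `agg_primitive::<Int64Type>(&col, Agg::Min)`
+/// yields a one-element `Int64Array` holding the smallest non-null value, or `Ok(None)` when the
+/// column is entirely null.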
+fn agg_primitive(column: &ArrayRef, agg: Agg) -> DeltaResult> +where + T: ArrowPrimitiveType, + T::Native: PartialOrd, + PrimitiveArray: From>>, +{ + let array = column.as_primitive_opt::().ok_or_else(|| { + Error::generic(format!( + "Failed to downcast column to PrimitiveArray<{}>", + std::any::type_name::() + )) + })?; + let result = match agg { + Agg::Min => min(array), + Agg::Max => max(array), + }; + Ok(result.map(|v| Arc::new(PrimitiveArray::::from(vec![Some(v)])) as ArrayRef)) +} + +/// Compute aggregation for a timestamp array, preserving timezone. +fn agg_timestamp( + column: &ArrayRef, + tz: Option>, + agg: Agg, +) -> DeltaResult> +where + T: crate::arrow::datatypes::ArrowTimestampType, + PrimitiveArray: From>>, +{ + let array = column.as_primitive_opt::().ok_or_else(|| { + Error::generic(format!( + "Failed to downcast column to PrimitiveArray<{}>", + std::any::type_name::() + )) + })?; + let result = match agg { + Agg::Min => min(array), + Agg::Max => max(array), + }; + Ok(result.map(|v| { + Arc::new(PrimitiveArray::::from(vec![Some(v)]).with_timezone_opt(tz)) as ArrayRef + })) +} + +/// Compute aggregation for a decimal128 array, preserving precision and scale. +fn agg_decimal( + column: &ArrayRef, + precision: u8, + scale: i8, + agg: Agg, +) -> DeltaResult> { + let array = column + .as_primitive_opt::() + .ok_or_else(|| Error::generic("Failed to downcast column to Decimal128Array"))?; + let result = match agg { + Agg::Min => min(array), + Agg::Max => max(array), + }; + result + .map(|v| { + Decimal128Array::from(vec![Some(v)]) + .with_precision_and_scale(precision, scale) + .map(|arr| Arc::new(arr) as ArrayRef) + }) + .transpose() + .map_err(|e| Error::generic(format!("Invalid decimal precision/scale: {e}"))) +} + +/// Compute aggregation for a string array with truncation. +fn agg_string(column: &ArrayRef, agg: Agg) -> DeltaResult> { + let array = column + .as_string_opt::() + .ok_or_else(|| Error::generic("Failed to downcast column to StringArray"))?; + let result = match agg { + Agg::Min => min_string(array), + Agg::Max => max_string(array), + }; + match (result, agg) { + (Some(s), Agg::Min) => { + let truncated = truncate_min_string(s); + Ok(Some( + Arc::new(StringArray::from(vec![Some(truncated)])) as ArrayRef + )) + } + (Some(s), Agg::Max) => Ok(truncate_max_string(s) + .map(|t| Arc::new(StringArray::from(vec![Some(&*t)])) as ArrayRef)), + (None, _) => Ok(None), + } +} + +/// Compute aggregation for a large string array with truncation. +/// +/// Unlike StringArray, Arrow's compute kernels don't provide min/max for LargeStringArray, +/// so we iterate manually. `iter()` yields `Option<&str>` per element (None for nulls), +/// and `flatten()` filters out nulls so we only compare non-null values. +fn agg_large_string(column: &ArrayRef, agg: Agg) -> DeltaResult> { + let array = column + .as_string_opt::() + .ok_or_else(|| Error::generic("Failed to downcast column to LargeStringArray"))?; + let result = match agg { + Agg::Min => array.iter().flatten().min(), + Agg::Max => array.iter().flatten().max(), + }; + match (result, agg) { + (Some(s), Agg::Min) => { + let truncated = truncate_min_string(s); + Ok(Some( + Arc::new(LargeStringArray::from(vec![Some(truncated)])) as ArrayRef, + )) + } + (Some(s), Agg::Max) => Ok(truncate_max_string(s) + .map(|t| Arc::new(LargeStringArray::from(vec![Some(&*t)])) as ArrayRef)), + (None, _) => Ok(None), + } +} + +/// Compute aggregation for a string view array with truncation. 
+/// +/// Like LargeStringArray, Arrow's compute kernels don't provide min/max for StringViewArray. +/// See `agg_large_string` for explanation of `iter().flatten()`. +fn agg_string_view(column: &ArrayRef, agg: Agg) -> DeltaResult> { + let array = column + .as_string_view_opt() + .ok_or_else(|| Error::generic("Failed to downcast column to StringViewArray"))?; + let result: Option<&str> = match agg { + Agg::Min => array.iter().flatten().min(), + Agg::Max => array.iter().flatten().max(), + }; + match (result, agg) { + (Some(s), Agg::Min) => { + let truncated = truncate_min_string(s); + Ok(Some( + Arc::new(StringViewArray::from(vec![Some(truncated)])) as ArrayRef + )) + } + (Some(s), Agg::Max) => Ok(truncate_max_string(s) + .map(|t| Arc::new(StringViewArray::from(vec![Some(&*t)])) as ArrayRef)), + (None, _) => Ok(None), + } +} + +/// Compute min or max for a leaf column based on its data type. +fn compute_leaf_agg(column: &ArrayRef, agg: Agg) -> DeltaResult> { + match column.data_type() { + // Integer types + DataType::Int8 => agg_primitive::(column, agg), + DataType::Int16 => agg_primitive::(column, agg), + DataType::Int32 => agg_primitive::(column, agg), + DataType::Int64 => agg_primitive::(column, agg), + DataType::UInt8 => agg_primitive::(column, agg), + DataType::UInt16 => agg_primitive::(column, agg), + DataType::UInt32 => agg_primitive::(column, agg), + DataType::UInt64 => agg_primitive::(column, agg), + + // Float types + DataType::Float32 => agg_primitive::(column, agg), + DataType::Float64 => agg_primitive::(column, agg), + + // Date types + DataType::Date32 => agg_primitive::(column, agg), + DataType::Date64 => agg_primitive::(column, agg), + + // Timestamp types (preserve timezone) + DataType::Timestamp(TimeUnit::Second, tz) => { + agg_timestamp::(column, tz.clone(), agg) + } + DataType::Timestamp(TimeUnit::Millisecond, tz) => { + agg_timestamp::(column, tz.clone(), agg) + } + DataType::Timestamp(TimeUnit::Microsecond, tz) => { + agg_timestamp::(column, tz.clone(), agg) + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => { + agg_timestamp::(column, tz.clone(), agg) + } + + // Decimal type (preserve precision/scale) + DataType::Decimal128(p, s) => agg_decimal(column, *p, *s, agg), + + // String types (with truncation) + DataType::Utf8 => agg_string(column, agg), + DataType::LargeUtf8 => agg_large_string(column, agg), + DataType::Utf8View => agg_string_view(column, agg), + + // Unsupported types (structs handled separately, others return no min/max) + _ => Ok(None), + } +} + +// ============================================================================ +// Combined stats computation (single traversal) +// ============================================================================ + +/// Statistics computed for a column (leaf or nested struct). +#[derive(Default)] +struct ColumnStats { + null_count: Option, + min_value: Option, + max_value: Option, +} + +/// Compute all statistics for a column in a single traversal. +/// +/// Returns `ColumnStats` containing null_count, min, and max for this column. +/// For struct columns, these are nested StructArrays. For leaf columns, these are scalar arrays. +/// Map, List, and other complex types are skipped (returns default empty stats). 
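+///
+/// Illustrative shapes, assuming both columns are selected by the filter: an `Int64` leaf
+/// column with one null yields `null_count = [1]` plus single-element min/max arrays, while a
+/// `List` column yields only a `null_count`, with `min_value`/`max_value` left as `None`.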
+fn compute_column_stats( + column: &ArrayRef, + path: &mut Vec, + filter: &ColumnTrie<'_>, +) -> DeltaResult { + match column.data_type() { + DataType::Struct(fields) => { + let struct_array = column + .as_struct_opt() + .ok_or_else(|| Error::generic("Failed to downcast column to StructArray"))?; + + // Propagate struct-level nulls to all descendants + let fixed_struct = fix_nested_null_masks(struct_array.clone()); + + // Accumulators for each stat type + let mut null_fields: Vec = Vec::new(); + let mut null_arrays: Vec = Vec::new(); + let mut min_fields: Vec = Vec::new(); + let mut min_arrays: Vec = Vec::new(); + let mut max_fields: Vec = Vec::new(); + let mut max_arrays: Vec = Vec::new(); + + for (i, field) in fields.iter().enumerate() { + path.push(field.name().to_string()); + + let child_stats = compute_column_stats(fixed_struct.column(i), path, filter)?; + + if let Some(arr) = child_stats.null_count { + null_fields.push(Field::new(field.name(), arr.data_type().clone(), true)); + null_arrays.push(arr); + } + if let Some(arr) = child_stats.min_value { + min_fields.push(Field::new(field.name(), arr.data_type().clone(), true)); + min_arrays.push(arr); + } + if let Some(arr) = child_stats.max_value { + max_fields.push(Field::new(field.name(), arr.data_type().clone(), true)); + max_arrays.push(arr); + } + + path.pop(); + } + + // Build result structs (None if empty) + let build_struct = + |fields: Vec, arrays: Vec| -> DeltaResult> { + if fields.is_empty() { + Ok(None) + } else { + Ok(Some(Arc::new( + StructArray::try_new(fields.into(), arrays, None) + .map_err(|e| Error::generic(format!("stats struct: {e}")))?, + ) as ArrayRef)) + } + }; + + Ok(ColumnStats { + null_count: build_struct(null_fields, null_arrays)?, + min_value: build_struct(min_fields, min_arrays)?, + max_value: build_struct(max_fields, max_arrays)?, + }) + } + // Complex types: collect nullCount only (no min/max) + DataType::Map(_, _) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) + | DataType::ListView(_) + | DataType::LargeListView(_) => { + if !filter.contains_prefix_of(path) { + return Ok(ColumnStats::default()); + } + Ok(ColumnStats { + null_count: Some(Arc::new(Int64Array::from(vec![column.null_count() as i64]))), + min_value: None, + max_value: None, + }) + } + _ => { + // Leaf: check filter, compute all stats together + if !filter.contains_prefix_of(path) { + return Ok(ColumnStats::default()); + } + + // When min/max is None (all nulls or unsupported type), emit a null-valued + // single-element array to keep the field present in the stats struct. This + // allows downstream consumers (like StatsVerifier) to find the column and + // check nullCount == numRecords. The JSON serializer omits null fields, so + // the on-disk format still matches Spark's ignoreNullFields behavior. + let null_fallback = || -> ArrayRef { Arc::new(new_null_array(column.data_type(), 1)) }; + Ok(ColumnStats { + null_count: Some(Arc::new(Int64Array::from(vec![column.null_count() as i64]))), + min_value: Some(compute_leaf_agg(column, Agg::Min)?.unwrap_or_else(&null_fallback)), + max_value: Some(compute_leaf_agg(column, Agg::Max)?.unwrap_or_else(null_fallback)), + }) + } + } +} + +/// Accumulates (field_name, array) pairs for building a stats struct. 
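+///
+/// Usage sketch (illustrative only, not a doctest):
+///
+/// ```ignore
+/// let mut acc = StatsAccumulator::new("minValues");
+/// acc.push("id", Arc::new(Int64Array::from(vec![1i64])) as ArrayRef);
+/// // `build` wraps the accumulated columns into a "minValues" struct field/array pair,
+/// // or returns Ok(None) when nothing was pushed.
+/// let built = acc.build()?;
+/// ```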
+struct StatsAccumulator { + name: &'static str, + fields: Vec, + arrays: Vec, +} + +impl StatsAccumulator { + fn new(name: &'static str) -> Self { + Self { + name, + fields: Vec::new(), + arrays: Vec::new(), + } + } + + fn push(&mut self, field_name: &str, array: ArrayRef) { + self.fields + .push(Field::new(field_name, array.data_type().clone(), true)); + self.arrays.push(array); + } + + fn build(self) -> DeltaResult)>> { + if self.fields.is_empty() { + return Ok(None); + } + let struct_arr = StructArray::try_new(self.fields.into(), self.arrays, None) + .map_err(|e| Error::generic(format!("Failed to create {}: {e}", self.name)))?; + let field = Field::new(self.name, struct_arr.data_type().clone(), true); + Ok(Some((field, Arc::new(struct_arr) as Arc))) + } +} + +/// Collect statistics from a RecordBatch for Delta Lake file statistics. +/// +/// Returns a StructArray with the following fields: +/// - `numRecords`: total row count +/// - `nullCount`: nested struct with null counts per column +/// - `minValues`: nested struct with min values per column (null when all values are null) +/// - `maxValues`: nested struct with max values per column (null when all values are null) +/// - `tightBounds`: always true for new file writes +/// +/// String min/max values are truncated to a 32-character prefix with appropriate tie-breaker +/// characters for max values. See the `stats_schema` module documentation for the full stats +/// value rules. +/// +/// # Arguments +/// * `batch` - The RecordBatch to collect statistics from +/// * `stats_columns` - Column names that should have statistics collected (allowlist). +/// Only these columns will appear in nullCount/minValues/maxValues. +#[internal_api] +pub(crate) fn collect_stats( + batch: &RecordBatch, + stats_columns: &[ColumnName], +) -> DeltaResult { + let filter = ColumnTrie::from_columns(stats_columns); + let schema = batch.schema(); + + // Collect all stats in a single traversal + let mut null_counts = StatsAccumulator::new("nullCount"); + let mut min_values = StatsAccumulator::new("minValues"); + let mut max_values = StatsAccumulator::new("maxValues"); + + for (col_idx, field) in schema.fields().iter().enumerate() { + let mut path = vec![field.name().to_string()]; + let column = batch.column(col_idx); + + // Single traversal computes all three stats + let stats = compute_column_stats(column, &mut path, &filter)?; + + if let Some(arr) = stats.null_count { + null_counts.push(field.name(), arr); + } + if let Some(arr) = stats.min_value { + min_values.push(field.name(), arr); + } + if let Some(arr) = stats.max_value { + max_values.push(field.name(), arr); + } + } + + // Build output struct + let mut fields = vec![Field::new("numRecords", DataType::Int64, true)]; + let mut arrays: Vec> = + vec![Arc::new(Int64Array::from(vec![batch.num_rows() as i64]))]; + + for acc in [null_counts, min_values, max_values] { + if let Some((field, array)) = acc.build()? 
{ + fields.push(field); + arrays.push(array); + } + } + + // tightBounds + fields.push(Field::new("tightBounds", DataType::Boolean, true)); + arrays.push(Arc::new(BooleanArray::from(vec![true]))); + + StructArray::try_new(fields.into(), arrays, None) + .map_err(|e| Error::generic(format!("Failed to create stats struct: {e}"))) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::array::{Array, AsArray, Int32Array, Int64Array, StringArray}; + use crate::arrow::buffer::NullBuffer; + use crate::arrow::compute::concat_batches; + use crate::arrow::datatypes::{Fields, Schema}; + use crate::arrow::datatypes::{Int32Type, Int64Type}; + use crate::engine::arrow_expression::evaluate_expression::to_json; + use crate::expressions::column_name; + use crate::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + + #[test] + fn test_collect_stats_single_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); + + let batch = + RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![1, 2, 3]))]).unwrap(); + + let stats = collect_stats(&batch, &[column_name!("id")]).unwrap(); + + assert_eq!(stats.len(), 1); + let num_records = stats + .column_by_name("numRecords") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(num_records.value(0), 3); + } + + #[test] + fn test_collect_stats_null_counts() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("value", DataType::Utf8, true), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])), + ], + ) + .unwrap(); + + let stats = collect_stats(&batch, &[column_name!("id"), column_name!("value")]).unwrap(); + + // Check nullCount struct + let null_count = stats + .column_by_name("nullCount") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // id has 0 nulls + let id_null_count = null_count + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_null_count.value(0), 0); + + // value has 1 null + let value_null_count = null_count + .column_by_name("value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(value_null_count.value(0), 1); + } + + #[test] + fn test_collect_stats_respects_stats_columns() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new("value", DataType::Utf8, true), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec![Some("a"), None, Some("c")])), + ], + ) + .unwrap(); + + // Only collect stats for "id", not "value" + let stats = collect_stats(&batch, &[column_name!("id")]).unwrap(); + + let null_count = stats + .column_by_name("nullCount") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Only id should be present + assert!(null_count.column_by_name("id").is_some()); + assert!(null_count.column_by_name("value").is_none()); + } + + #[test] + fn test_collect_stats_min_max() { + let schema = Arc::new(Schema::new(vec![ + Field::new("number", DataType::Int64, false), + Field::new("name", DataType::Utf8, true), + ])); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![5, 1, 9, 3])), + Arc::new(StringArray::from(vec![ + Some("banana"), + Some("apple"), + Some("cherry"), + None, + ])), + ], + ) + .unwrap(); + + let stats = collect_stats(&batch, 
&[column_name!("number"), column_name!("name")]).unwrap(); + + // Check minValues + let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let number_min = min_values + .column_by_name("number") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(number_min.value(0), 1); + + let name_min = min_values + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_min.value(0), "apple"); + + // Check maxValues + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let number_max = max_values + .column_by_name("number") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(number_max.value(0), 9); + + let name_max = max_values + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_max.value(0), "cherry"); + } + + #[test] + fn test_collect_stats_all_nulls() { + let schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int64, + true, + )])); + + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(Int64Array::from(vec![ + None as Option, + None, + None, + ]))], + ) + .unwrap(); + + let stats = collect_stats(&batch, &[column_name!("value")]).unwrap(); + + // numRecords should be 3 + let num_records = stats + .column_by_name("numRecords") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(num_records.value(0), 3); + + // nullCount should be 3 + let null_count = stats + .column_by_name("nullCount") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let value_null_count = null_count + .column_by_name("value") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(value_null_count.value(0), 3); + + // All-null columns are present in minValues/maxValues but with null values. + // The field must exist so that StatsVerifier can find it via visit_rows and + // check nullCount == numRecords. The JSON serializer omits null fields, so + // the on-disk format still matches Spark's ignoreNullFields behavior. 
+ let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let min_col = min_values.column_by_name("value").unwrap(); + assert!(min_col.is_null(0)); + + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let max_col = max_values.column_by_name("value").unwrap(); + assert!(max_col.is_null(0)); + } + + #[test] + fn test_collect_stats_empty_stats_columns() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)])); + + let batch = + RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(vec![1, 2, 3]))]).unwrap(); + + // No stats columns requested + let stats = collect_stats(&batch, &[]).unwrap(); + + // Should still have numRecords and tightBounds + assert!(stats.column_by_name("numRecords").is_some()); + assert!(stats.column_by_name("tightBounds").is_some()); + + // Should not have nullCount, minValues, maxValues + assert!(stats.column_by_name("nullCount").is_none()); + assert!(stats.column_by_name("minValues").is_none()); + assert!(stats.column_by_name("maxValues").is_none()); + } + + #[test] + fn test_collect_stats_string_truncation_ascii() { + let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)])); + + // Create an ASCII string longer than 32 characters + let long_string = "a".repeat(50); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(StringArray::from(vec![long_string.as_str()]))], + ) + .unwrap(); + + let stats = collect_stats(&batch, &[column_name!("text")]).unwrap(); + + // Check minValues - should be truncated to exactly 32 chars + let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let text_min = min_values + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(text_min.value(0).len(), 32); + assert_eq!(text_min.value(0), "a".repeat(32)); + + // Check maxValues - should be 32 chars + 0x7F tie-breaker (since 'a' < 0x7F) + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let text_max = max_values + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let expected_max = format!("{}\x7F", "a".repeat(32)); + assert_eq!(text_max.value(0), expected_max); + } + + #[test] + fn test_collect_stats_string_truncation_non_ascii() { + let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)])); + + // Create a string where the character BEING TRUNCATED (at position 32) is non-ASCII. + // The tie-breaker is chosen based on the first char being removed, not the last kept. 
+ // 32 'a's followed by 'À' (>= 0x7F) followed by more chars + let long_string = format!("{}À{}", "a".repeat(32), "b".repeat(20)); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(StringArray::from(vec![long_string.as_str()]))], + ) + .unwrap(); + + let stats = collect_stats(&batch, &[column_name!("text")]).unwrap(); + + // Check maxValues - should use UTF8_MAX_CHAR since 'À' (the truncated char) >= 0x7F + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let text_max = max_values + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Should be 32 'a's + U+10FFFF (tie-breaker for non-ASCII truncated char) + let expected_max = format!("{}\u{10FFFF}", "a".repeat(32)); + assert_eq!(text_max.value(0), expected_max); + } + + #[test] + fn test_collect_stats_string_no_truncation_needed() { + let schema = Arc::new(Schema::new(vec![Field::new("text", DataType::Utf8, false)])); + + // String within 32 chars - should not be truncated + let short_string = "hello world"; + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(StringArray::from(vec![short_string]))], + ) + .unwrap(); + + let stats = collect_stats(&batch, &[column_name!("text")]).unwrap(); + + let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let text_min = min_values + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(text_min.value(0), short_string); + + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let text_max = max_values + .column_by_name("text") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(text_max.value(0), short_string); + } + + #[test] + fn test_truncate_min_string() { + // Short string - no truncation + assert_eq!(truncate_min_string("hello"), "hello"); + + // Exactly 32 chars - no truncation + let s32 = "a".repeat(32); + assert_eq!(truncate_min_string(&s32), s32); + + // Long string - truncated to 32 chars + let s50 = "a".repeat(50); + assert_eq!(truncate_min_string(&s50), "a".repeat(32)); + + // Multi-byte characters + let multi = format!("{}À", "a".repeat(35)); // 'À' is 2 bytes in UTF-8 + assert_eq!(truncate_min_string(&multi).chars().count(), 32); + } + + #[test] + fn test_truncate_max_string() { + // Short string - no truncation, returns Cow::Borrowed + assert_eq!(truncate_max_string("hello").as_deref(), Some("hello")); + + // Exactly 32 chars - no truncation + let s32 = "a".repeat(32); + assert_eq!(truncate_max_string(&s32).as_deref(), Some(s32.as_str())); + + // Long ASCII string - truncated with 0x7F tie-breaker + // The 33rd char ('a') is < 0x7F, so we use 0x7F + let s50 = "a".repeat(50); + let expected = format!("{}\x7F", "a".repeat(32)); + assert_eq!( + truncate_max_string(&s50).as_deref(), + Some(expected.as_str()) + ); + + // Non-ASCII at truncation point - uses UTF8_MAX_CHAR + // 32 'a's then 'À' (which is >= 0x7F), so we use UTF8_MAX + let non_ascii = format!("{}À{}", "a".repeat(32), "b".repeat(20)); + let expected = format!("{}\u{10FFFF}", "a".repeat(32)); + assert_eq!( + truncate_max_string(&non_ascii).as_deref(), + Some(expected.as_str()) + ); + + // U+10FFFF at truncation point - must skip past it + // 32 'a's then U+10FFFF then 'b' - we can't truncate at U+10FFFF (no tie-breaker > it) + // so we include U+10FFFF in prefix and use 'b' to determine tie-breaker + let with_max_char = 
format!("{}\u{10FFFF}b{}", "a".repeat(32), "c".repeat(10)); + let expected = format!("{}\u{10FFFF}\x7F", "a".repeat(32)); // 'b' < 0x7F + assert_eq!( + truncate_max_string(&with_max_char).as_deref(), + Some(expected.as_str()) + ); + } + + #[test] + fn test_collect_stats_nested_struct() { + // Schema: { nested: { a: int64, b: string } } + let nested_fields = Fields::from(vec![ + Field::new("a", DataType::Int64, false), + Field::new("b", DataType::Utf8, true), + ]); + let schema = Arc::new(Schema::new(vec![Field::new( + "nested", + DataType::Struct(nested_fields.clone()), + false, + )])); + + // Build nested struct data + let a_array = Arc::new(Int64Array::from(vec![10, 5, 20])); + let b_array = Arc::new(StringArray::from(vec![Some("zebra"), Some("apple"), None])); + let nested_struct = StructArray::try_new( + nested_fields, + vec![a_array as ArrayRef, b_array as ArrayRef], + None, + ) + .unwrap(); + + let batch = + RecordBatch::try_new(schema, vec![Arc::new(nested_struct) as ArrayRef]).unwrap(); + + let stats = collect_stats(&batch, &[column_name!("nested")]).unwrap(); + + // Check nullCount.nested.a = 0, nullCount.nested.b = 1 + let null_count = stats + .column_by_name("nullCount") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let nested_null = null_count + .column_by_name("nested") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let a_null = nested_null + .column_by_name("a") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(a_null.value(0), 0); + + let b_null = nested_null + .column_by_name("b") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(b_null.value(0), 1); + + // Check minValues.nested.a = 5, minValues.nested.b = "apple" + let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let nested_min = min_values + .column_by_name("nested") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let a_min = nested_min + .column_by_name("a") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(a_min.value(0), 5); + + let b_min = nested_min + .column_by_name("b") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(b_min.value(0), "apple"); + + // Check maxValues.nested.a = 20, maxValues.nested.b = "zebra" + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let nested_max = max_values + .column_by_name("nested") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let a_max = nested_max + .column_by_name("a") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(a_max.value(0), 20); + + let b_max = nested_max + .column_by_name("b") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(b_max.value(0), "zebra"); + } + + #[test] + fn test_collect_stats_complex_types_null_count_only() { + use crate::arrow::array::ListArray; + use crate::arrow::buffer::OffsetBuffer; + + // Schema with list column - should have nullCount but no min/max + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int64, false), + Field::new( + "list_col", + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))), + true, + ), + ])); + + // Build list array: [[1, 2], null, [4, 5, 6]] + let values = Int64Array::from(vec![1, 2, 4, 5, 6]); + let offsets = OffsetBuffer::new(vec![0, 2, 2, 5].into()); + let list_array = ListArray::new( + Arc::new(Field::new("item", DataType::Int64, true)), + offsets, + Arc::new(values), + 
Some(vec![true, false, true].into()), // second element is null + ); + + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])), + Arc::new(list_array), + ], + ) + .unwrap(); + + // Request stats for both columns + let stats = collect_stats(&batch, &[column_name!("id"), column_name!("list_col")]).unwrap(); + + let null_count = stats + .column_by_name("nullCount") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // id should have null count = 0 + let id_nulls = null_count + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_nulls.value(0), 0); + + // list_col should have null count = 1 + let list_nulls = null_count + .column_by_name("list_col") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(list_nulls.value(0), 1); + + // minValues should have id but NOT list_col + let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!(min_values.column_by_name("id").is_some()); + assert!(min_values.column_by_name("list_col").is_none()); + + // maxValues should have id but NOT list_col + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!(max_values.column_by_name("id").is_some()); + assert!(max_values.column_by_name("list_col").is_none()); + } + + #[test] + fn test_collect_stats_struct_with_nulls_at_struct_level() { + // Schema: { my_struct: { a: int32, b: int32 (nullable) } } + // Test both struct-level nulls and field-level nulls + let child_fields = Fields::from(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, true), + ]); + + let a_values = Int32Array::from(vec![1, 2, 3, 4]); + // b has field-level nulls at rows 0 and 2 + let b_values = Int32Array::from(vec![None, Some(20), None, Some(40)]); + + // Struct validity: [false, true, true, false] + // In Arrow: false = null, true = valid + // So rows 0 and 3 have null structs (entire struct is null) + let nulls = NullBuffer::from(vec![false, true, true, false]); + + let struct_array = StructArray::new( + child_fields.clone(), + vec![Arc::new(a_values), Arc::new(b_values)], + Some(nulls), + ); + + let schema = Schema::new(vec![Field::new( + "my_struct", + DataType::Struct(child_fields), + true, + )]); + + let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(struct_array)]).unwrap(); + + let stats = collect_stats(&batch, &[column_name!("my_struct")]).unwrap(); + + // Visualizing the data: + // Row 0: struct=NULL, (a=1, b=None are "invisible") + // Row 1: struct=VALID, a=2, b=20 + // Row 2: struct=VALID, a=3, b=None + // Row 3: struct=NULL, (a=4, b=40 are "invisible") + // + // Expected behavior (struct nulls propagate to children): + // - a: visible values are [2, 3], nullCount = 2 (rows 0, 3 are struct-null) + // - b: visible values are [20, None], nullCount = 3 (rows 0, 3 struct-null + row 2 field-null) + // - a: min=2, max=3 + // - b: min=20, max=20 + + // nullCount includes struct-level nulls + assert_eq!( + get_stat::(&stats, "nullCount", "my_struct", "a"), + 2 + ); + assert_eq!( + get_stat::(&stats, "nullCount", "my_struct", "b"), + 3 + ); + + // minValues excludes values from null struct rows + assert_eq!( + get_stat::(&stats, "minValues", "my_struct", "a"), + 2 + ); + assert_eq!( + get_stat::(&stats, "minValues", "my_struct", "b"), + 20 + ); + + // maxValues excludes values from null struct rows + assert_eq!( + get_stat::(&stats, "maxValues", "my_struct", 
"a"), + 3 + ); + assert_eq!( + get_stat::(&stats, "maxValues", "my_struct", "b"), + 20 + ); + } + + // Generic helper to extract and downcast nested columns from stats + fn get_stat( + stats: &StructArray, + stat_name: &str, + struct_name: &str, + field_name: &str, + ) -> T::Native + where + T: crate::arrow::datatypes::ArrowPrimitiveType, + { + stats + .column_by_name(stat_name) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name(struct_name) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .column_by_name(field_name) + .unwrap() + .as_primitive::() + .value(0) + } + + /// Recursively extracts leaf column names from an Arrow schema for stats collection. + fn extract_leaf_columns(fields: &Fields, prefix: &[String]) -> Vec { + let mut columns = Vec::new(); + for field in fields.iter() { + let mut path = prefix.to_vec(); + path.push(field.name().clone()); + match field.data_type() { + DataType::Struct(sub_fields) => { + columns.extend(extract_leaf_columns(sub_fields, &path)); + } + _ => { + columns.push(ColumnName::new(path)); + } + } + } + columns + } + + /// Recursively compares Spark stats JSON against kernel stats JSON. + /// Only checks keys present in `spark_val`; kernel may have extra keys. + fn assert_stats_match( + spark_val: &serde_json::Value, + kernel_val: &serde_json::Value, + path: &str, + ) { + match (spark_val, kernel_val) { + (serde_json::Value::Object(spark_map), serde_json::Value::Object(kernel_map)) => { + for (key, spark_child) in spark_map { + let child_path = if path.is_empty() { + key.clone() + } else { + format!("{path}.{key}") + }; + let kernel_child = kernel_map + .get(key) + .unwrap_or_else(|| panic!("Kernel stats missing key: {child_path}")); + assert_stats_match(spark_child, kernel_child, &child_path); + } + } + (serde_json::Value::Number(s), serde_json::Value::Number(k)) => { + let sv = s.as_f64().unwrap(); + let kv = k.as_f64().unwrap(); + assert!( + (sv - kv).abs() < 1e-6, + "Numeric mismatch at {path}: spark={sv}, kernel={kv}" + ); + } + (serde_json::Value::String(s), serde_json::Value::String(k)) => { + // Spark uses "Z" suffix for TZ timestamps and no suffix for NTZ. + let s_normalized = s.trim_end_matches('Z').trim_end_matches("+00:00"); + let k_normalized = k.trim_end_matches('Z').trim_end_matches("+00:00"); + + // Spark (Jackson) always includes fractional seconds (e.g., ".000") while Arrow's + // JSON encoder omits them when they are zero. Strip zero-only fractional parts + // so both formats compare equal. + if s_normalized.contains('T') && k_normalized.contains('T') { + let normalize_ts = |ts: &str| -> String { + if let Some(dot_pos) = ts.rfind('.') { + let frac = &ts[dot_pos + 1..]; + if frac.chars().all(|c| c == '0') { + return ts[..dot_pos].to_string(); + } + let trimmed = frac.trim_end_matches('0'); + return format!("{}.{trimmed}", &ts[..dot_pos]); + } + ts.to_string() + }; + let s_norm = normalize_ts(s_normalized); + let k_norm = normalize_ts(k_normalized); + assert_eq!( + s_norm, k_norm, + "Timestamp mismatch at {path}: spark={s}, kernel={k}" + ); + } else { + assert_eq!(s, k, "String mismatch at {path}: spark={s}, kernel={k}"); + } + } + _ => { + assert_eq!( + spark_val, kernel_val, + "Value mismatch at {path}: spark={spark_val}, kernel={kernel_val}" + ); + } + } + } + + // Verify that the `assert_stats_match` test helper correctly accepts equivalent values. 
+ #[test] + fn test_assert_stats_match_accepts_equivalent_values() { + // Extra kernel keys are ignored + let spark = serde_json::json!({"a": 1, "b": "hello"}); + let kernel = serde_json::json!({"a": 1, "b": "hello", "extra": true}); + assert_stats_match(&spark, &kernel, ""); + + // Nested objects with extra kernel keys + let spark = serde_json::json!({"outer": {"inner": 42}}); + let kernel = serde_json::json!({"outer": {"inner": 42, "extra": 0}}); + assert_stats_match(&spark, &kernel, ""); + + // Timestamp with trailing ".000Z" vs no fractional part + let spark = serde_json::json!({"ts": "2023-06-15T12:30:00.000Z"}); + let kernel = serde_json::json!({"ts": "2023-06-15T12:30:00Z"}); + assert_stats_match(&spark, &kernel, ""); + + // Timestamp NTZ (no Z suffix) with trailing ".000" + let spark = serde_json::json!({"ts": "2023-06-15T12:30:00.000"}); + let kernel = serde_json::json!({"ts": "2023-06-15T12:30:00"}); + assert_stats_match(&spark, &kernel, ""); + + // Non-zero fractional seconds with different trailing zeros + let spark = serde_json::json!({"ts": "2023-06-15T12:30:00.500Z"}); + let kernel = serde_json::json!({"ts": "2023-06-15T12:30:00.5Z"}); + assert_stats_match(&spark, &kernel, ""); + } + + // Verify that the `assert_stats_match` test helper correctly rejects mismatched values. + #[test] + fn test_assert_stats_match_rejects_mismatches() { + let result = std::panic::catch_unwind(|| { + let spark = serde_json::json!({"a": 1}); + let kernel = serde_json::json!({"b": 1}); + assert_stats_match(&spark, &kernel, ""); + }); + assert!(result.is_err(), "should panic on missing key"); + + let result = std::panic::catch_unwind(|| { + let spark = serde_json::json!({"val": 1.0}); + let kernel = serde_json::json!({"val": 2.0}); + assert_stats_match(&spark, &kernel, ""); + }); + assert!(result.is_err(), "should panic on numeric mismatch"); + + let result = std::panic::catch_unwind(|| { + let spark = serde_json::json!({"s": "alpha"}); + let kernel = serde_json::json!({"s": "beta"}); + assert_stats_match(&spark, &kernel, ""); + }); + assert!(result.is_err(), "should panic on string mismatch"); + } + + /// Validates that kernel's `collect_stats()` produces file statistics matching Spark's output. + /// + /// Uses test data generated by PySpark containing all supported stat types: integers, floats, + /// date, timestamp, timestamp_ntz, string, decimal, boolean, binary, array, map, and nested + /// structs. Reads the parquet data, recomputes stats with kernel, and compares + /// numRecords/nullCount/minValues/maxValues against Spark's stats from the delta log. + #[test] + fn test_collect_stats_matches_spark() { + // ===== GIVEN ===== + // Load a PySpark-generated Delta table containing all supported stat types + // and extract Spark's reference stats from the commit log. 
+ let test_path = + std::fs::canonicalize("./tests/data/stats-writing-all-types/delta").unwrap(); + + let commit_path = test_path + .join("_delta_log") + .join("00000000000000000001.json"); + let commit_data = std::fs::read_to_string(&commit_path).expect("read commit 1 json"); + + let mut spark_stats_json = None; + let mut parquet_path = None; + + for line in commit_data.lines() { + let action: serde_json::Value = serde_json::from_str(line).expect("parse JSON line"); + if let Some(add) = action.get("add") { + spark_stats_json = Some( + add["stats"] + .as_str() + .expect("stats should be a string") + .to_string(), + ); + parquet_path = Some( + add["path"] + .as_str() + .expect("path should be a string") + .to_string(), + ); + break; + } + } + + let spark_stats_json = spark_stats_json.expect("should find add action with stats"); + let parquet_path = parquet_path.expect("should find add action with path"); + let spark_stats: serde_json::Value = + serde_json::from_str(&spark_stats_json).expect("parse Spark stats JSON"); + + // ===== WHEN ===== + // Read the same parquet file and compute stats using kernel's collect_stats. + let parquet_file_path = test_path.join(&parquet_path); + let file = std::fs::File::open(&parquet_file_path).expect("open parquet file"); + let builder = + ParquetRecordBatchReaderBuilder::try_new(file).expect("create parquet reader builder"); + let schema = builder.schema().clone(); + let reader = builder.build().expect("build parquet reader"); + + let batches: Vec = reader.map(|b| b.expect("read batch")).collect(); + let record_batch = concat_batches(&schema, &batches).expect("concat batches"); + + // Build stats_columns from all leaf columns in the parquet schema + let stats_columns = extract_leaf_columns(schema.fields(), &[]); + let stats_struct = collect_stats(&record_batch, &stats_columns).expect("collect stats"); + + // Convert kernel stats to JSON + let json_array = to_json(&stats_struct).expect("convert stats to JSON"); + let json_strings = json_array.as_string::(); + assert_eq!(json_strings.len(), 1, "should have exactly one stats row"); + let kernel_stats_json_str = json_strings.value(0); + let kernel_stats: serde_json::Value = + serde_json::from_str(kernel_stats_json_str).expect("parse kernel stats JSON"); + + // ===== THEN ===== + // Kernel stats must match Spark's numRecords, nullCount, minValues, and maxValues. 
+ assert_eq!( + spark_stats["numRecords"], kernel_stats["numRecords"], + "numRecords mismatch" + ); + + // Compare nullCount, minValues, maxValues (only keys present in Spark's stats) + for section in &["nullCount", "minValues", "maxValues"] { + if let Some(spark_section) = spark_stats.get(*section) { + let kernel_section = kernel_stats + .get(*section) + .unwrap_or_else(|| panic!("Kernel stats missing {section}")); + assert_stats_match(spark_section, kernel_section, section); + } + } + } +} diff --git a/kernel/src/engine/default/storage.rs b/kernel/src/engine/default/storage.rs index d5fd932389..3c17c463d2 100644 --- a/kernel/src/engine/default/storage.rs +++ b/kernel/src/engine/default/storage.rs @@ -1,18 +1,18 @@ -use object_store::parse_url_opts as parse_url_opts_object_store; -use object_store::path::Path; -use object_store::{Error, ObjectStore}; +use std::collections::HashMap; +use std::sync::{Arc, LazyLock, RwLock}; + use url::Url; +use crate::object_store::path::Path; +use crate::object_store::{self, Error, ObjectStore}; use crate::Error as DeltaError; -use std::collections::HashMap; -use std::sync::{Arc, LazyLock, RwLock}; /// Alias for convenience type ClosureReturn = Result<(Box, Path), Error>; /// This type alias makes it easier to reference the handler closure(s) /// -/// It uses a HashMap which _must_ be converted in our [parse_url_opts] because we -/// cannot use generics in this scenario. +/// It uses a HashMap which _must_ be converted in [store_from_url_opts] +/// because we cannot use generics in this scenario. type HandlerClosure = Arc) -> ClosureReturn + Send + Sync>; /// hashmap containing scheme => handler fn mappings to allow consumers of delta-kernel-rs provide /// their own url opts parsers for different scemes @@ -20,9 +20,9 @@ type Handlers = HashMap; /// The URL_REGISTRY contains the custom URL scheme handlers that will parse URL options static URL_REGISTRY: LazyLock> = LazyLock::new(|| RwLock::new(HashMap::default())); -/// Insert a new URL handler for [parse_url_opts] with the given `scheme`. This allows users to -/// provide their own custom URL handler to plug new [object_store::ObjectStore] instances into -/// delta-kernel +/// Insert a new URL handler for [store_from_url_opts] with the given `scheme`. This allows +/// users to provide their own custom URL handler to plug new [crate::object_store::ObjectStore] +/// instances into delta-kernel, which is used by [store_from_url_opts] to parse the URL. pub fn insert_url_handler( scheme: impl AsRef, handler_closure: HandlerClosure, @@ -36,36 +36,84 @@ pub fn insert_url_handler( Ok(()) } -/// Parse the given URL options to produce a valid and configured [ObjectStore] +/// Create an [`ObjectStore`] from a URL. +/// +/// Returns an `Arc` ready to use with [`crate::engine::default::DefaultEngine`]. +/// +/// This function checks for custom URL handlers registered via [`insert_url_handler`] +/// before falling back to [`object_store`]'s default behavior. +/// +/// # Example +/// +/// ```rust +/// # use url::Url; +/// # use delta_kernel::engine::default::storage::store_from_url; +/// # use delta_kernel::DeltaResult; +/// # fn example() -> DeltaResult<()> { +/// let url = Url::parse("file:///path/to/table")?; +/// let store = store_from_url(&url)?; +/// # Ok(()) +/// # } +/// ``` +pub fn store_from_url(url: &Url) -> crate::DeltaResult> { + store_from_url_opts(url, std::iter::empty::<(&str, &str)>()) +} + +/// Create an [`ObjectStore`] from a URL with custom options. 
/// -/// This function will first attempt to use any schemes registered via [insert_url_handler], -/// falling back to the default behavior of [object_store::parse_url_opts] -pub fn parse_url_opts(url: &Url, options: I) -> Result<(Box, Path), Error> +/// Returns an `Arc` ready to use with [`crate::engine::default::DefaultEngine`]. +/// +/// This function checks for custom URL handlers registered via [`insert_url_handler`] +/// before falling back to [`object_store`]'s default behavior. +/// +/// # Example +/// +/// ```rust +/// # use url::Url; +/// # use std::collections::HashMap; +/// # use delta_kernel::engine::default::storage::store_from_url_opts; +/// # use delta_kernel::DeltaResult; +/// # fn example() -> DeltaResult<()> { +/// let url = Url::parse("s3://my-bucket/path/to/table")?; +/// let options = HashMap::from([("region", "us-west-2")]); +/// let store = store_from_url_opts(&url, options)?; +/// # Ok(()) +/// # } +/// ``` +pub fn store_from_url_opts( + url: &Url, + options: I, +) -> crate::DeltaResult> where I: IntoIterator, K: AsRef, V: Into, { - if let Ok(handlers) = URL_REGISTRY.read() { + // First attempt to use any schemes registered via insert_url_handler, + // falling back to the default behavior of crate::object_store::parse_url_opts + let (store, _path) = if let Ok(handlers) = URL_REGISTRY.read() { if let Some(handler) = handlers.get(url.scheme()) { - let options: HashMap = HashMap::from_iter( - options - .into_iter() - .map(|(k, v)| (k.as_ref().to_string(), v.into())), - ); - - return handler(url, options); + let options = options + .into_iter() + .map(|(k, v)| (k.as_ref().to_string(), v.into())) + .collect(); + handler(url, options)? + } else { + object_store::parse_url_opts(url, options)? } - } - parse_url_opts_object_store(url, options) + } else { + object_store::parse_url_opts(url, options)? + }; + + Ok(Arc::new(store)) } #[cfg(test)] mod tests { use super::*; + use crate::object_store::{self, path::Path}; use hdfs_native_object_store::HdfsObjectStoreBuilder; - use object_store::{self, path::Path}; /// Example funciton of doing testing of a custom [HdfsObjectStore] construction fn parse_url_opts_hdfs_native( @@ -110,15 +158,14 @@ mod tests { // Currently constructing an [HdfsObjectStore] won't work if there isn't an actual HDFS // to connect to, so the only way to really verify that we got the object store we // expected is to inspect the `store` on the error v_v - if let Err(store_error) = parse_url_opts(&url, options) { - match store_error { - object_store::Error::Generic { store, source: _ } => { - assert_eq!(store, "HdfsObjectStore"); - } - unexpected => panic!("Unexpected error happened: {unexpected:?}"), + match store_from_url_opts(&url, options) { + Err(crate::Error::ObjectStore(object_store::Error::Generic { store, source: _ })) => { + assert_eq!(store, "HdfsObjectStore"); + } + Err(unexpected) => panic!("Unexpected error happened: {unexpected:?}"), + Ok(_) => { + panic!("Expected to get an error when constructing an HdfsObjectStore, but something didn't work as expected! Either the parse_url_opts_hdfs_native function didn't get called, or the hdfs-native-object-store no longer errors when it cannot connect to HDFS"); } - } else { - panic!("Expected to get an error when constructing an HdfsObjectStore, but something didn't work as expected! 
Either the parse_url_opts_hdfs_native function didn't get called, or the hdfs-native-object-store no longer errors when it cannot connect to HDFS"); } } } diff --git a/kernel/src/engine/ensure_data_types.rs b/kernel/src/engine/ensure_data_types.rs index 2d5a660c36..60b908357f 100644 --- a/kernel/src/engine/ensure_data_types.rs +++ b/kernel/src/engine/ensure_data_types.rs @@ -5,7 +5,7 @@ use std::{ ops::Deref, }; -use crate::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField}; +use crate::arrow::datatypes::{DataType as ArrowDataType, Field as ArrowField, TimeUnit}; use itertools::Itertools; use super::arrow_conversion::TryIntoArrow as _; @@ -16,31 +16,43 @@ use crate::{ DeltaResult, Error, }; +/// Controls how `ensure_data_types` validates struct fields and metadata. +#[derive(Clone, Copy)] +pub(crate) enum ValidationMode { + /// Check types only. Struct fields are matched by ordinal position, not by name. + /// Nullability and metadata are not checked. Used by the expression evaluator where + /// column mapping can cause physical/logical name mismatches. + TypesOnly, + /// Check types and match struct fields by name, but skip nullability and metadata. + /// Used by the parquet reader where fields are already resolved by name upstream. + TypesAndNames, + /// Check types, names, nullability, and metadata. + Full, +} + /// Ensure a kernel data type matches an arrow data type. This only ensures that the actual "type" /// is the same, but does so recursively into structs, and ensures lists and maps have the correct /// associated types as well. /// -/// If `check_nullability_and_metadata` is true, this will also return an error if it finds a struct -/// field that differs in nullability or metadata between the kernel and arrow schema. If it is -/// false, no checks on nullability or metadata are performed. +/// The `mode` parameter controls how struct fields are matched and whether nullability/metadata +/// are checked. See [`ValidationMode`] for details. /// /// This returns an `Ok(DataTypeCompat)` if the types are compatible, and /// will indicate what kind of compatibility they have, or an error if the types do not match. If -/// there is a `struct` type included, we only ensure that the named fields that the kernel is -/// asking for exist, and that for those fields the types match. Un-selected fields are ignored. +/// there is a `struct` type included and the mode uses name-based matching, we only ensure that +/// the named fields that the kernel is asking for exist, and that for those fields the types +/// match. Un-selected fields are ignored. 
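// A minimal sketch (not part of the diff above) of how a caller inside the crate might
// drive the new `mode` parameter. The function name `validate_column_type` and the
// variable names are hypothetical; `ensure_data_types`, `ValidationMode`, and
// `DataTypeCompat` are the items defined/used in this file.
fn validate_column_type(
    kernel_type: &DataType,
    arrow_type: &ArrowDataType,
) -> DeltaResult<DataTypeCompat> {
    // TypesAndNames: match struct fields by name, but skip nullability/metadata checks.
    let compat = ensure_data_types(kernel_type, arrow_type, ValidationMode::TypesAndNames)?;
    match &compat {
        DataTypeCompat::Identical => { /* column can be read as-is */ }
        DataTypeCompat::NeedsCast(_target) => { /* caller should cast the Arrow column first */ }
        DataTypeCompat::Nested => { /* nested struct/list/map children were validated recursively */ }
    }
    Ok(compat)
}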
pub(crate) fn ensure_data_types( kernel_type: &DataType, arrow_type: &ArrowDataType, - check_nullability_and_metadata: bool, + mode: ValidationMode, ) -> DeltaResult { - let check = EnsureDataTypes { - check_nullability_and_metadata, - }; + let check = EnsureDataTypes { mode }; check.ensure_data_types(kernel_type, arrow_type) } struct EnsureDataTypes { - check_nullability_and_metadata: bool, + mode: ValidationMode, } /// Capture the compatibility between two data-types, as passed to [`ensure_data_types`] @@ -107,33 +119,59 @@ impl EnsureDataTypes { Ok(DataTypeCompat::Nested) } (DataType::Struct(kernel_fields), ArrowDataType::Struct(arrow_fields)) => { - // build a list of kernel fields that matches the order of the arrow fields - let mapped_fields = arrow_fields - .iter() - .filter_map(|f| kernel_fields.field(f.name())); - - // keep track of how many fields we matched up - let mut found_fields = 0; - // ensure that for the fields that we found, the types match - for (kernel_field, arrow_field) in mapped_fields.zip(arrow_fields) { - self.ensure_nullability_and_metadata(kernel_field, arrow_field)?; - self.ensure_data_types(&kernel_field.data_type, arrow_field.data_type())?; - found_fields += 1; + match self.mode { + ValidationMode::TypesOnly => { + // Ordinal matching: check field count and types by position, + // ignore names. Column mapping can cause name mismatches between + // physical (arrow) and logical (kernel) field names. + require!(kernel_fields.num_fields() == arrow_fields.len(), { + make_arrow_error(format!( + "Struct field count mismatch: expected {}, got {}", + kernel_fields.num_fields(), + arrow_fields.len() + )) + }); + for (kernel_field, arrow_field) in + kernel_fields.fields().zip(arrow_fields.iter()) + { + self.ensure_data_types( + &kernel_field.data_type, + arrow_field.data_type(), + )?; + } + } + ValidationMode::TypesAndNames | ValidationMode::Full => { + // Name-based matching: look up kernel fields by arrow field name. + // Full mode additionally checks nullability and metadata. + let mapped_fields = arrow_fields + .iter() + .filter_map(|f| kernel_fields.field(f.name())); + + let mut found_fields = 0; + for (kernel_field, arrow_field) in mapped_fields.zip(arrow_fields) { + self.ensure_nullability_and_metadata(kernel_field, arrow_field)?; + self.ensure_data_types( + &kernel_field.data_type, + arrow_field.data_type(), + )?; + found_fields += 1; + } + + require!(kernel_fields.num_fields() == found_fields, { + let arrow_field_map: HashSet<&String> = + HashSet::from_iter(arrow_fields.iter().map(|f| f.name())); + let missing_field_names = kernel_fields + .field_names() + .filter(|kernel_field| !arrow_field_map.contains(kernel_field)) + .take(5) + .join(", "); + make_arrow_error(format!( + "Missing Struct fields {missing_field_names} \ + (Up to five missing fields shown)" + )) + }); + } } - - // require that we found the number of fields that we requested. 
- require!(kernel_fields.num_fields() == found_fields, { - let arrow_field_map: HashSet<&String> = - HashSet::from_iter(arrow_fields.iter().map(|f| f.name())); - let missing_field_names = kernel_fields - .field_names() - .filter(|kernel_field| !arrow_field_map.contains(kernel_field)) - .take(5) - .join(", "); - make_arrow_error(format!( - "Missing Struct fields {missing_field_names} (Up to five missing fields shown)" - )) - }); Ok(DataTypeCompat::Nested) } _ => Err(make_arrow_error(format!( @@ -148,7 +186,7 @@ impl EnsureDataTypes { kernel_field_is_nullable: bool, arrow_field_is_nullable: bool, ) -> DeltaResult<()> { - if self.check_nullability_and_metadata + if matches!(self.mode, ValidationMode::Full) && kernel_field_is_nullable != arrow_field_is_nullable { Err(Error::Generic(format!( @@ -169,7 +207,7 @@ impl EnsureDataTypes { kernel_field.nullable, arrow_field.is_nullable(), )?; - if self.check_nullability_and_metadata + if matches!(self.mode, ValidationMode::Full) && !metadata_eq(&kernel_field.metadata, arrow_field.metadata()) { Err(Error::Generic(format!( @@ -206,6 +244,15 @@ fn check_cast_compat( Ok(DataTypeCompat::NeedsCast(target_type)) } (Date32, Timestamp(_, None)) => Ok(DataTypeCompat::NeedsCast(target_type)), + // Physical type reinterpretation: some checkpoint writers store date/timestamp columns + // as plain integers without Parquet logical type annotations. The Delta protocol + // guarantees data files conform to the table schema, so a schema mismatch (e.g. Int32 + // where Date32 is expected) would not occur in normal data files. + // + // NOTE: The kernel-type equivalent lives in `PrimitiveType::is_stats_type_compatible_with` + // in `schema/mod.rs`. Changes here must be mirrored there. + (Int32, Date32) => Ok(DataTypeCompat::NeedsCast(target_type)), + (Int64, Timestamp(TimeUnit::Microsecond, _)) => Ok(DataTypeCompat::NeedsCast(target_type)), _ => Err(make_arrow_error(format!( "Incorrect datatype. Expected {target_type}, got {source_type}" ))), @@ -343,16 +390,16 @@ mod tests { assert!(ensure_data_types( &DataType::unshredded_variant(), &unshredded_variant_arrow_type(), - true + ValidationMode::Full ) .is_ok()); assert_result_error_with_message( ensure_data_types( &DataType::unshredded_variant(), &incorrect_variant_arrow_type(), - true, + ValidationMode::Full, ), - "Invalid argument error: Incorrect datatype. Expected Struct(metadata Binary, value Binary), got Struct(field_1 Binary, field_2 Binary)", + "Invalid argument error: Incorrect datatype", ) } @@ -361,14 +408,14 @@ mod tests { assert!(ensure_data_types( &DataType::decimal(5, 2).unwrap(), &ArrowDataType::Decimal128(5, 2), - false + ValidationMode::TypesAndNames ) .is_ok()); assert_result_error_with_message( ensure_data_types( &DataType::decimal(5, 2).unwrap(), &ArrowDataType::Decimal128(5, 3), - false, + ValidationMode::TypesAndNames, ), "Invalid argument error: Incorrect datatype. Expected Decimal128(5, 2), got Decimal128(5, 3)", ) @@ -391,7 +438,7 @@ mod tests { true ))), arrow_field.data_type(), - false + ValidationMode::TypesAndNames ) .is_ok()); @@ -403,7 +450,7 @@ mod tests { false, ))), arrow_field.data_type(), - true, + ValidationMode::Full, ), "Generic delta kernel error: Map has nullablily false in kernel and true in arrow", ); @@ -411,7 +458,7 @@ mod tests { ensure_data_types( &DataType::Map(Box::new(MapType::new(DataType::LONG, DataType::LONG, true))), arrow_field.data_type(), - false, + ValidationMode::TypesAndNames, ), "Invalid argument error: Incorrect datatype. 
Expected long, got Utf8", ); @@ -422,20 +469,20 @@ mod tests { assert!(ensure_data_types( &DataType::Array(Box::new(ArrayType::new(DataType::LONG, true))), &ArrowDataType::new_list(ArrowDataType::Int64, true), - false + ValidationMode::TypesAndNames ) .is_ok()); assert!(ensure_data_types( &DataType::Array(Box::new(ArrayType::new(DataType::LONG, true))), &ArrowDataType::new_large_list(ArrowDataType::Int64, true), - false + ValidationMode::TypesAndNames ) .is_ok()); assert_result_error_with_message( ensure_data_types( &DataType::Array(Box::new(ArrayType::new(DataType::STRING, true))), &ArrowDataType::new_list(ArrowDataType::Int64, true), - false, + ValidationMode::TypesAndNames, ), "Invalid argument error: Incorrect datatype. Expected Utf8, got Int64", ); @@ -443,7 +490,7 @@ mod tests { ensure_data_types( &DataType::Array(Box::new(ArrayType::new(DataType::LONG, true))), &ArrowDataType::new_list(ArrowDataType::Int64, false), - true, + ValidationMode::Full, ), "Generic delta kernel error: List has nullablily true in kernel and false in arrow", ); @@ -473,7 +520,7 @@ mod tests { ), )]); let arrow_struct = ArrowDataType::try_from_kernel(&schema).unwrap(); - assert!(ensure_data_types(&schema, &arrow_struct, true).is_ok()); + assert!(ensure_data_types(&schema, &arrow_struct, ValidationMode::Full).is_ok()); let kernel_simple = DataType::struct_type_unchecked([ StructField::nullable("w", DataType::LONG), @@ -488,7 +535,12 @@ mod tests { ]), true, ); - assert!(ensure_data_types(&kernel_simple, arrow_simple_ok.data_type(), true).is_ok()); + assert!(ensure_data_types( + &kernel_simple, + arrow_simple_ok.data_type(), + ValidationMode::Full + ) + .is_ok()); let arrow_missing_simple = ArrowField::new_struct( "arrow_struct", @@ -496,7 +548,11 @@ mod tests { true, ); assert_result_error_with_message( - ensure_data_types(&kernel_simple, arrow_missing_simple.data_type(), true), + ensure_data_types( + &kernel_simple, + arrow_missing_simple.data_type(), + ValidationMode::Full, + ), "Invalid argument error: Missing Struct fields x (Up to five missing fields shown)", ); @@ -512,7 +568,7 @@ mod tests { ensure_data_types( &kernel_simple, arrow_nullable_mismatch_simple.data_type(), - true, + ValidationMode::Full, ), "Generic delta kernel error: w has nullablily true in kernel and false in arrow", ); @@ -521,11 +577,21 @@ mod tests { #[test] fn ensure_views() { assert_eq!( - ensure_data_types(&DataType::STRING, &ArrowDataType::Utf8View, true).unwrap(), + ensure_data_types( + &DataType::STRING, + &ArrowDataType::Utf8View, + ValidationMode::Full + ) + .unwrap(), DataTypeCompat::Identical ); assert_eq!( - ensure_data_types(&DataType::BINARY, &ArrowDataType::BinaryView, true).unwrap(), + ensure_data_types( + &DataType::BINARY, + &ArrowDataType::BinaryView, + ValidationMode::Full + ) + .unwrap(), DataTypeCompat::Identical ); assert_eq!( @@ -535,7 +601,7 @@ mod tests { ArrowDataType::Int64, true ))), - true + ValidationMode::Full ) .unwrap(), DataTypeCompat::Identical @@ -547,7 +613,7 @@ mod tests { ArrowDataType::Int64, true ))), - true + ValidationMode::Full ) .unwrap(), DataTypeCompat::Identical @@ -557,12 +623,163 @@ mod tests { #[test] fn ensure_large_strings_and_binary() { assert_eq!( - ensure_data_types(&DataType::STRING, &ArrowDataType::LargeUtf8, true).unwrap(), + ensure_data_types( + &DataType::STRING, + &ArrowDataType::LargeUtf8, + ValidationMode::Full + ) + .unwrap(), DataTypeCompat::Identical ); assert_eq!( - ensure_data_types(&DataType::BINARY, &ArrowDataType::LargeBinary, true).unwrap(), + ensure_data_types( 
+ &DataType::BINARY, + &ArrowDataType::LargeBinary, + ValidationMode::Full + ) + .unwrap(), DataTypeCompat::Identical ); } + + #[test] + fn ensure_int32_to_date_reinterpretation() { + // Int32 -> Date32: checkpoint writers may omit the DATE logical type annotation + assert_eq!( + ensure_data_types( + &DataType::DATE, + &ArrowDataType::Int32, + ValidationMode::TypesAndNames + ) + .unwrap(), + DataTypeCompat::NeedsCast(ArrowDataType::Date32) + ); + // Reverse is not supported: Date32 -> Int32 + assert_result_error_with_message( + ensure_data_types( + &DataType::INTEGER, + &ArrowDataType::Date32, + ValidationMode::TypesAndNames, + ), + "Incorrect datatype", + ); + } + + #[test] + fn ensure_int64_to_timestamp_reinterpretation() { + // Int64 -> Timestamp (with UTC timezone, i.e. kernel `timestamp`) + let ts_utc = ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())); + assert_eq!( + ensure_data_types( + &DataType::TIMESTAMP, + &ArrowDataType::Int64, + ValidationMode::TypesAndNames + ) + .unwrap(), + DataTypeCompat::NeedsCast(ts_utc) + ); + // Int64 -> TimestampNtz (no timezone, i.e. kernel `timestamp_ntz`) + let ts_ntz = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); + assert_eq!( + ensure_data_types( + &DataType::TIMESTAMP_NTZ, + &ArrowDataType::Int64, + ValidationMode::TypesAndNames + ) + .unwrap(), + DataTypeCompat::NeedsCast(ts_ntz) + ); + // Reverse is not supported + assert_result_error_with_message( + ensure_data_types( + &DataType::LONG, + &ArrowDataType::Timestamp(TimeUnit::Microsecond, None), + ValidationMode::TypesAndNames, + ), + "Incorrect datatype", + ); + } + + /// Ensures that every kernel-level checkpoint reinterpretation rule in + /// `PrimitiveType::is_checkpoint_cast_compatible` has a corresponding Arrow cast + /// in `check_cast_compat`. If one side is updated without the other, this test fails. 
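// A minimal sketch (not part of the diff above) of what a NeedsCast(Date32) result implies
// downstream: epoch-day integers reinterpreted as dates. This uses the arrow crate's generic
// cast kernel directly, not the kernel's own cast path.
fn example_int32_days_to_date32() {
    use crate::arrow::array::{Array as _, Int32Array};
    use crate::arrow::compute::cast;
    use crate::arrow::datatypes::DataType as ArrowDataType;

    let days = Int32Array::from(vec![0, 19_000]); // days since 1970-01-01
    let dates = cast(&days, &ArrowDataType::Date32).expect("Int32 -> Date32 cast");
    assert_eq!(dates.data_type(), &ArrowDataType::Date32);
}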
+ #[test] + fn checkpoint_reinterpretation_rules_match_arrow_cast_rules() { + use crate::schema::PrimitiveType; + + let reinterpretation_pairs: &[(PrimitiveType, PrimitiveType)] = &[ + (PrimitiveType::Integer, PrimitiveType::Date), + (PrimitiveType::Long, PrimitiveType::Timestamp), + (PrimitiveType::Long, PrimitiveType::TimestampNtz), + ]; + for (source_kernel, target_kernel) in reinterpretation_pairs { + // Verify the kernel-level rule holds + assert!( + source_kernel.is_checkpoint_cast_compatible(target_kernel), + "Kernel should allow {source_kernel:?} -> {target_kernel:?}" + ); + + // Verify the Arrow-level rule holds + let source_arrow = + ArrowDataType::try_from_kernel(&DataType::Primitive(source_kernel.clone())) + .unwrap(); + let target_arrow = + ArrowDataType::try_from_kernel(&DataType::Primitive(target_kernel.clone())) + .unwrap(); + assert!( + check_cast_compat(target_arrow.clone(), &source_arrow).is_ok(), + "Arrow check_cast_compat should allow {source_arrow:?} -> {target_arrow:?} \ + to match kernel rule {source_kernel:?} -> {target_kernel:?}" + ); + } + } + + #[test] + fn types_only_matches_struct_by_ordinal_ignoring_names() { + let kernel = DataType::struct_type_unchecked([ + StructField::nullable("logical_a", DataType::LONG), + StructField::nullable("logical_b", DataType::STRING), + ]); + let arrow = ArrowDataType::Struct( + vec![ + ArrowField::new("physical_x", ArrowDataType::Int64, true), + ArrowField::new("physical_y", ArrowDataType::Utf8, true), + ] + .into(), + ); + assert!(ensure_data_types(&kernel, &arrow, ValidationMode::TypesOnly).is_ok()); + } + + #[test] + fn types_only_rejects_struct_field_count_mismatch() { + let kernel = DataType::struct_type_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG), + ]); + let arrow = + ArrowDataType::Struct(vec![ArrowField::new("x", ArrowDataType::Int64, true)].into()); + assert_result_error_with_message( + ensure_data_types(&kernel, &arrow, ValidationMode::TypesOnly), + "Struct field count mismatch", + ); + } + + #[test] + fn types_only_rejects_struct_type_mismatch_by_ordinal() { + let kernel = DataType::struct_type_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG), + ]); + let arrow = ArrowDataType::Struct( + vec![ + ArrowField::new("x", ArrowDataType::Int64, true), + ArrowField::new("y", ArrowDataType::Utf8, true), + ] + .into(), + ); + assert_result_error_with_message( + ensure_data_types(&kernel, &arrow, ValidationMode::TypesOnly), + "Incorrect datatype", + ); + } } diff --git a/kernel/src/engine/mod.rs b/kernel/src/engine/mod.rs index 8453b6560e..b0c1927b25 100644 --- a/kernel/src/engine/mod.rs +++ b/kernel/src/engine/mod.rs @@ -28,83 +28,7 @@ pub(crate) mod ensure_data_types; pub mod parquet_row_group_skipping; #[cfg(test)] -mod tests { - use itertools::Itertools; - use object_store::path::Path; - use std::sync::Arc; - use url::Url; +pub(crate) mod tests; - use crate::arrow::array::{RecordBatch, StringArray}; - use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; - use crate::engine::arrow_data::ArrowEngineData; - use crate::engine_data::FilteredEngineData; - use crate::{Engine, EngineData}; - - use test_utils::delta_path_for_version; - - fn test_list_from_should_sort_and_filter( - engine: &dyn Engine, - base_url: &Url, - engine_data: impl Fn() -> Box, - ) { - let json = engine.json_handler(); - let get_data = || { - let data = engine_data(); - let filtered_data = 
FilteredEngineData::with_all_rows_selected(data); - Box::new(std::iter::once(Ok(filtered_data))) - }; - - let expected_names: Vec = (1..4) - .map(|i| delta_path_for_version(i, "json")) - .collect_vec(); - - for i in expected_names.iter().rev() { - let path = base_url.join(i.as_ref()).unwrap(); - json.write_json_file(&path, get_data(), false).unwrap(); - } - let path = base_url.join("other").unwrap(); - json.write_json_file(&path, get_data(), false).unwrap(); - - let storage = engine.storage_handler(); - - // list files after an offset - let test_url = base_url.join(expected_names[0].as_ref()).unwrap(); - let files: Vec<_> = storage.list_from(&test_url).unwrap().try_collect().unwrap(); - assert_eq!(files.len(), expected_names.len() - 1); - for (file, expected) in files.iter().zip(expected_names.iter().skip(1)) { - assert_eq!(file.location, base_url.join(expected.as_ref()).unwrap()); - } - - let test_url = base_url - .join(delta_path_for_version(0, "json").as_ref()) - .unwrap(); - let files: Vec<_> = storage.list_from(&test_url).unwrap().try_collect().unwrap(); - assert_eq!(files.len(), expected_names.len()); - - // list files inside a directory / key prefix - let test_url = base_url.join("_delta_log/").unwrap(); - let files: Vec<_> = storage.list_from(&test_url).unwrap().try_collect().unwrap(); - assert_eq!(files.len(), expected_names.len()); - for (file, expected) in files.iter().zip(expected_names.iter()) { - assert_eq!(file.location, base_url.join(expected.as_ref()).unwrap()); - } - } - - fn get_arrow_data() -> Box { - let schema = Arc::new(ArrowSchema::new(vec![Field::new( - "dog", - ArrowDataType::Utf8, - true, - )])); - let data = RecordBatch::try_new( - schema.clone(), - vec![Arc::new(StringArray::from(vec!["remi", "wilson"]))], - ) - .unwrap(); - Box::new(ArrowEngineData::new(data)) - } - - pub(crate) fn test_arrow_engine(engine: &dyn Engine, base_url: &Url) { - test_list_from_should_sort_and_filter(engine, base_url, get_arrow_data); - } -} +#[cfg(test)] +mod cross_engine_tests; diff --git a/kernel/src/engine/parquet_row_group_skipping.rs b/kernel/src/engine/parquet_row_group_skipping.rs index bdef0e13f2..8c80e698c9 100644 --- a/kernel/src/engine/parquet_row_group_skipping.rs +++ b/kernel/src/engine/parquet_row_group_skipping.rs @@ -51,7 +51,7 @@ impl ParquetRowGroupSkipping for ArrowReaderBuilder { } } -/// A ParquetStatsSkippingFilter for row group skipping. It obtains stats from a parquet +/// A [`ParquetStatsProvider`] for data file row group skipping. It obtains stats from a parquet /// [`RowGroupMetaData`] and pre-computes the mapping of each referenced column path to its /// corresponding field index, for O(1) stats lookups. struct RowGroupFilter<'a> { @@ -80,122 +80,15 @@ impl<'a> RowGroupFilter<'a> { .get(col) .map(|&i| self.row_group.column(i).statistics()) } - - fn decimal_from_bytes(bytes: Option<&[u8]>, dtype: DecimalType) -> Option { - // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad to 16 bytes. 
- let bytes = bytes.filter(|b| b.len() <= 16)?; - let mut bytes = Vec::from(bytes); - bytes.reverse(); - bytes.resize(16, 0u8); - let bytes: [u8; 16] = bytes.try_into().ok()?; - let value = DecimalData::try_new(i128::from_le_bytes(bytes), dtype).ok()?; - Some(value.into()) - } - - fn timestamp_from_date(days: Option<&i32>) -> Option { - let days = u64::try_from(*days?).ok()?; - let timestamp = DateTime::UNIX_EPOCH.checked_add_days(Days::new(days))?; - let timestamp = timestamp.signed_duration_since(DateTime::UNIX_EPOCH); - Some(Scalar::TimestampNtz(timestamp.num_microseconds()?)) - } } impl ParquetStatsProvider for RowGroupFilter<'_> { - // Extracts a stat value, converting from its physical type to the requested logical type. - // - // NOTE: This code is highly redundant with [`get_max_stat_value`] below, but parquet - // ValueStatistics requires T to impl a private trait, so we can't factor out any kind of - // helper method. And macros are hard enough to read that it's not worth defining one. fn get_parquet_min_stat(&self, col: &ColumnName, data_type: &DataType) -> Option { - use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) { - (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), - (String, _) => return None, - (Long, Statistics::Int64(s)) => s.min_opt()?.into(), - (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), - (Long, _) => return None, - (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), - (Integer, _) => return None, - (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), - (Short, _) => return None, - (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), - (Byte, _) => return None, - (Float, Statistics::Float(s)) => s.min_opt()?.into(), - (Float, _) => return None, - (Double, Statistics::Double(s)) => s.min_opt()?.into(), - (Double, Statistics::Float(s)) => (*s.min_opt()? as f64).into(), - (Double, _) => return None, - (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), - (Boolean, _) => return None, - (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), - (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), - (Binary, _) => return None, - (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), - (Date, _) => return None, - (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), - (Timestamp, _) => return None, // TODO: Int96 timestamps - (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), - (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.min_opt())?, - (TimestampNtz, _) => return None, // TODO: Int96 timestamps - (Decimal(d), Statistics::Int32(i)) => { - DecimalData::try_new(*i.min_opt()?, *d).ok()?.into() - } - (Decimal(d), Statistics::Int64(i)) => { - DecimalData::try_new(*i.min_opt()?, *d).ok()?.into() - } - (Decimal(d), Statistics::FixedLenByteArray(b)) => { - Self::decimal_from_bytes(b.min_bytes_opt(), *d)? - } - (Decimal(..), _) => return None, - }; - Some(value) + extract_min_scalar(data_type, self.get_stats(col)??) } fn get_parquet_max_stat(&self, col: &ColumnName, data_type: &DataType) -> Option { - use PrimitiveType::*; - let value = match (data_type.as_primitive_opt()?, self.get_stats(col)??) 
{ - (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), - (String, _) => return None, - (Long, Statistics::Int64(s)) => s.max_opt()?.into(), - (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), - (Long, _) => return None, - (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), - (Integer, _) => return None, - (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), - (Short, _) => return None, - (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(), - (Byte, _) => return None, - (Float, Statistics::Float(s)) => s.max_opt()?.into(), - (Float, _) => return None, - (Double, Statistics::Double(s)) => s.max_opt()?.into(), - (Double, Statistics::Float(s)) => (*s.max_opt()? as f64).into(), - (Double, _) => return None, - (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), - (Boolean, _) => return None, - (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), - (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), - (Binary, _) => return None, - (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), - (Date, _) => return None, - (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), - (Timestamp, _) => return None, // TODO: Int96 timestamps - (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), - (TimestampNtz, Statistics::Int32(s)) => Self::timestamp_from_date(s.max_opt())?, - (TimestampNtz, _) => return None, // TODO: Int96 timestamps - (Decimal(d), Statistics::Int32(i)) => { - DecimalData::try_new(*i.max_opt()?, *d).ok()?.into() - } - (Decimal(d), Statistics::Int64(i)) => { - DecimalData::try_new(*i.max_opt()?, *d).ok()?.into() - } - (Decimal(d), Statistics::FixedLenByteArray(b)) => { - Self::decimal_from_bytes(b.max_bytes_opt(), *d)? - } - (Decimal(..), _) => return None, - }; - Some(value) + extract_max_scalar(data_type, self.get_stats(col)??) } fn get_parquet_nullcount_stat(&self, col: &ColumnName) -> Option { @@ -208,34 +101,136 @@ impl ParquetStatsProvider for RowGroupFilter<'_> { // physical name mapping has been performed. Because we currently lack both the // validation and the name mapping support, we must disable this optimization for the // time being. See https://github.com/delta-io/delta-kernel-rs/issues/434. - return Some(self.get_parquet_rowcount_stat()).filter(|_| false); - }; - - // WARNING: [`Statistics::null_count_opt`] returns Some(0) when the underlying stat is - // missing, causing an IS NULL predicate to wrongly skip the file if it contains any NULL - // values. Manually drill into each arm's [`ValueStatistics`] for the stat's true. - let nullcount = match stats? { - Statistics::Boolean(s) => s.null_count_opt(), - Statistics::Int32(s) => s.null_count_opt(), - Statistics::Int64(s) => s.null_count_opt(), - Statistics::Int96(s) => s.null_count_opt(), - Statistics::Float(s) => s.null_count_opt(), - Statistics::Double(s) => s.null_count_opt(), - Statistics::ByteArray(s) => s.null_count_opt(), - Statistics::FixedLenByteArray(s) => s.null_count_opt(), + return self.get_parquet_rowcount_stat().filter(|_| false); }; - // Parquet nullcount stats are always u64, so we can directly return the value instead of - // wrapping it in a Scalar. We can safely cast it from u64 to i64 because the nullcount can - // never be larger than the rowcount and the parquet rowcount stat is i64. - Some(nullcount? 
as i64) + extract_nullcount(stats) } - fn get_parquet_rowcount_stat(&self) -> i64 { - self.row_group.num_rows() + fn get_parquet_rowcount_stat(&self) -> Option { + Some(self.row_group.num_rows()) } } +/// Extracts the minimum stat value from parquet footer statistics, converting from the physical +/// parquet type to the requested logical Delta type. +fn extract_min_scalar(data_type: &DataType, stats: &Statistics) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, stats) { + (String, Statistics::ByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.min_opt()?.as_utf8().ok()?.into(), + (String, _) => return None, + (Long, Statistics::Int64(s)) => s.min_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.min_opt()? as i64).into(), + (Long, _) => return None, + (Integer, Statistics::Int32(s)) => s.min_opt()?.into(), + (Integer, _) => return None, + (Short, Statistics::Int32(s)) => (*s.min_opt()? as i16).into(), + (Short, _) => return None, + (Byte, Statistics::Int32(s)) => (*s.min_opt()? as i8).into(), + (Byte, _) => return None, + (Float, Statistics::Float(s)) => s.min_opt()?.into(), + (Float, _) => return None, + (Double, Statistics::Double(s)) => s.min_opt()?.into(), + (Double, Statistics::Float(s)) => (*s.min_opt()? as f64).into(), + (Double, _) => return None, + (Boolean, Statistics::Boolean(s)) => s.min_opt()?.into(), + (Boolean, _) => return None, + (Binary, Statistics::ByteArray(s)) => s.min_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.min_opt()?.data().into(), + (Binary, _) => return None, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.min_opt()?), + (Date, _) => return None, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.min_opt()?), + (Timestamp, _) => return None, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.min_opt()?), + (TimestampNtz, Statistics::Int32(s)) => timestamp_from_date(s.min_opt())?, + (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (Decimal(d), Statistics::Int32(i)) => DecimalData::try_new(*i.min_opt()?, *d).ok()?.into(), + (Decimal(d), Statistics::Int64(i)) => DecimalData::try_new(*i.min_opt()?, *d).ok()?.into(), + (Decimal(d), Statistics::FixedLenByteArray(b)) => { + decimal_from_bytes(b.min_bytes_opt(), *d)? + } + (Decimal(..), _) => return None, + }; + Some(value) +} + +/// Extracts the maximum stat value from parquet footer statistics, converting from the physical +/// parquet type to the requested logical Delta type. +fn extract_max_scalar(data_type: &DataType, stats: &Statistics) -> Option { + use PrimitiveType::*; + let value = match (data_type.as_primitive_opt()?, stats) { + (String, Statistics::ByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, Statistics::FixedLenByteArray(s)) => s.max_opt()?.as_utf8().ok()?.into(), + (String, _) => return None, + (Long, Statistics::Int64(s)) => s.max_opt()?.into(), + (Long, Statistics::Int32(s)) => (*s.max_opt()? as i64).into(), + (Long, _) => return None, + (Integer, Statistics::Int32(s)) => s.max_opt()?.into(), + (Integer, _) => return None, + (Short, Statistics::Int32(s)) => (*s.max_opt()? as i16).into(), + (Short, _) => return None, + (Byte, Statistics::Int32(s)) => (*s.max_opt()? as i8).into(), + (Byte, _) => return None, + (Float, Statistics::Float(s)) => s.max_opt()?.into(), + (Float, _) => return None, + (Double, Statistics::Double(s)) => s.max_opt()?.into(), + (Double, Statistics::Float(s)) => (*s.max_opt()? 
as f64).into(), + (Double, _) => return None, + (Boolean, Statistics::Boolean(s)) => s.max_opt()?.into(), + (Boolean, _) => return None, + (Binary, Statistics::ByteArray(s)) => s.max_opt()?.data().into(), + (Binary, Statistics::FixedLenByteArray(s)) => s.max_opt()?.data().into(), + (Binary, _) => return None, + (Date, Statistics::Int32(s)) => Scalar::Date(*s.max_opt()?), + (Date, _) => return None, + (Timestamp, Statistics::Int64(s)) => Scalar::Timestamp(*s.max_opt()?), + (Timestamp, _) => return None, // TODO: Int96 timestamps + (TimestampNtz, Statistics::Int64(s)) => Scalar::TimestampNtz(*s.max_opt()?), + (TimestampNtz, Statistics::Int32(s)) => timestamp_from_date(s.max_opt())?, + (TimestampNtz, _) => return None, // TODO: Int96 timestamps + (Decimal(d), Statistics::Int32(i)) => DecimalData::try_new(*i.max_opt()?, *d).ok()?.into(), + (Decimal(d), Statistics::Int64(i)) => DecimalData::try_new(*i.max_opt()?, *d).ok()?.into(), + (Decimal(d), Statistics::FixedLenByteArray(b)) => { + decimal_from_bytes(b.max_bytes_opt(), *d)? + } + (Decimal(..), _) => return None, + }; + Some(value) +} + +/// Extracts the null count from parquet footer statistics for a column. +fn extract_nullcount(stats: Option<&Statistics>) -> Option { + // WARNING: The parquet footer decoding forces missing stats to Some(0), which would cause + // an IS NULL predicate to wrongly skip the file if it contains any NULL values. To be safe, + // we must never return Some(0). See https://github.com/apache/arrow-rs/issues/9451. + let nullcount = stats?.null_count_opt().filter(|n| *n > 0); + + // Parquet nullcount stats are always u64, so we can directly return the value instead of + // wrapping it in a Scalar. We can safely cast it from u64 to i64 because the nullcount can + // never be larger than the rowcount and the parquet rowcount stat is i64. + Some(nullcount? as i64) +} + +fn decimal_from_bytes(bytes: Option<&[u8]>, dtype: DecimalType) -> Option { + // WARNING: The bytes are stored in big-endian order; reverse and then 0-pad to 16 bytes. + let bytes = bytes.filter(|b| b.len() <= 16)?; + let mut bytes = Vec::from(bytes); + bytes.reverse(); + bytes.resize(16, 0u8); + let bytes: [u8; 16] = bytes.try_into().ok()?; + let value = DecimalData::try_new(i128::from_le_bytes(bytes), dtype).ok()?; + Some(value.into()) +} + +fn timestamp_from_date(days: Option<&i32>) -> Option { + let days = u64::try_from(*days?).ok()?; + let timestamp = DateTime::UNIX_EPOCH.checked_add_days(Days::new(days))?; + let timestamp = timestamp.signed_duration_since(DateTime::UNIX_EPOCH); + Some(Scalar::TimestampNtz(timestamp.num_microseconds()?)) +} + /// Given a predicate of interest and a set of parquet column descriptors, build a column -> /// index mapping for columns the predicate references. This ensures O(1) lookup times, for an /// overall O(n) cost to evaluate a predicate tree with n nodes. 
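// A minimal sketch (not part of the diff above) of the byte-order handling that
// decimal_from_bytes performs, using a made-up positive value: 0x04D2 big-endian is the
// unscaled integer 1234, i.e. 12.34 for a decimal(4, 2) column.
fn example_big_endian_decimal_bytes() {
    let be_bytes = [0x04u8, 0xD2]; // parquet stores the unscaled value in big-endian order
    let mut le = be_bytes.to_vec();
    le.reverse(); // reverse into little-endian order
    le.resize(16, 0u8); // zero-pad to the 16-byte i128 width (positive values only)
    let unscaled = i128::from_le_bytes(le.try_into().unwrap());
    assert_eq!(unscaled, 1234);
}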
diff --git a/kernel/src/engine/parquet_row_group_skipping/tests.rs b/kernel/src/engine/parquet_row_group_skipping/tests.rs index d14764eb16..20707f697b 100644 --- a/kernel/src/engine/parquet_row_group_skipping/tests.rs +++ b/kernel/src/engine/parquet_row_group_skipping/tests.rs @@ -66,9 +66,11 @@ fn test_get_stat_values() { filter.get_nullcount_stat(&column_name!("bool")), Some(3i64.into()) ); + + // Should be Some(0), but https://github.com/apache/arrow-rs/issues/9451 assert_eq!( filter.get_nullcount_stat(&column_name!("varlen.utf8")), - Some(0i64.into()) + None // Some(0i64.into()) ); assert_eq!( diff --git a/kernel/src/engine/sync/json.rs b/kernel/src/engine/sync/json.rs index f6a50f9689..cc4b2965f9 100644 --- a/kernel/src/engine/sync/json.rs +++ b/kernel/src/engine/sync/json.rs @@ -1,14 +1,16 @@ +use std::sync::Arc; use std::{fs::File, io::BufReader, io::Write}; -use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; use crate::arrow::json::ReaderBuilder; use tempfile::NamedTempFile; use url::Url; use super::read_files; use crate::engine::arrow_data::ArrowEngineData; -use crate::engine::arrow_utils::parse_json as arrow_parse_json; -use crate::engine::arrow_utils::to_json_bytes; +use crate::engine::arrow_utils::{ + build_json_reorder_indices, fixup_json_read, json_arrow_schema, parse_json as arrow_parse_json, + to_json_bytes, +}; use crate::engine_data::FilteredEngineData; use crate::schema::SchemaRef; use crate::{ @@ -19,13 +21,20 @@ pub(crate) struct SyncJsonHandler; fn try_create_from_json( file: File, - _schema: SchemaRef, - arrow_schema: ArrowSchemaRef, + schema: SchemaRef, _predicate: Option, + file_location: String, ) -> DeltaResult>> { - let json = ReaderBuilder::new(arrow_schema) + // Build Arrow schema from only the real JSON columns, omitting any metadata columns + // (e.g. FilePath) that the JSON reader cannot populate from the file content. + let json_schema = Arc::new(json_arrow_schema(&schema)?); + // Build the reorder index vec once; apply it to every batch to re-insert synthesized metadata + // columns (e.g. file path) at their schema positions. + let reorder_indices = build_json_reorder_indices(&schema)?; + let json = ReaderBuilder::new(json_schema) + .with_coerce_primitive(true) .build(BufReader::new(file))? - .map(|data| Ok(ArrowEngineData::new(data?))); + .map(move |data| fixup_json_read(data?, &reorder_indices, &file_location)); Ok(json) } diff --git a/kernel/src/engine/sync/mod.rs b/kernel/src/engine/sync/mod.rs index 10e3cc6b14..15a7e8a1b7 100644 --- a/kernel/src/engine/sync/mod.rs +++ b/kernel/src/engine/sync/mod.rs @@ -7,8 +7,6 @@ use crate::{ JsonHandler, ParquetHandler, PredicateRef, SchemaRef, StorageHandler, }; -use crate::arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; -use crate::engine::arrow_conversion::TryFromKernel as _; use itertools::Itertools; use std::fs::File; use std::sync::Arc; @@ -16,6 +14,7 @@ use tracing::debug; pub(crate) mod json; mod parquet; +pub(crate) use parquet::SyncParquetHandler; mod storage; /// This is a simple (test-only) implementation of [`Engine`]. 
It only supports reading data from @@ -65,20 +64,18 @@ fn read_files( ) -> DeltaResult where I: Iterator> + Send + 'static, - F: FnMut(File, SchemaRef, ArrowSchemaRef, Option) -> DeltaResult - + Send - + 'static, + F: FnMut(File, SchemaRef, Option, String) -> DeltaResult + Send + 'static, { debug!("Reading files: {files:#?} with schema {schema:#?} and predicate {predicate:#?}"); if files.is_empty() { return Ok(Box::new(std::iter::empty())); } - let arrow_schema = Arc::new(ArrowSchema::try_from_kernel(schema.as_ref())?); let files = files.to_vec(); let result = files .into_iter() // Produces Iterator>>> .map(move |file| { + let location_string = file.location.to_string(); let location = file.location; debug!("Reading {location:#?} with schema {schema:#?} and predicate {predicate:#?}"); let path = location @@ -87,8 +84,8 @@ where try_create_from_file( File::open(path)?, schema.clone(), - arrow_schema.clone(), predicate.clone(), + location_string, ) }) // Flatten to Iterator>> diff --git a/kernel/src/engine/sync/parquet.rs b/kernel/src/engine/sync/parquet.rs index d170a23343..2d183be81c 100644 --- a/kernel/src/engine/sync/parquet.rs +++ b/kernel/src/engine/sync/parquet.rs @@ -1,29 +1,39 @@ use std::fs::File; +use std::sync::Arc; -use crate::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use crate::engine::default::parquet::{reader_options, writer_options}; use crate::parquet::arrow::arrow_reader::{ArrowReaderMetadata, ParquetRecordBatchReaderBuilder}; use super::read_files; +use crate::engine::arrow_conversion::{TryFromArrow as _, TryIntoArrow as _}; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::arrow_utils::{ fixup_parquet_read, generate_mask, get_requested_indices, ordering_needs_row_indexes, RowIndexBuilder, }; use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; -use crate::schema::SchemaRef; -use crate::{DeltaResult, FileDataReadResultIterator, FileMeta, ParquetHandler, PredicateRef}; +use crate::parquet::arrow::arrow_writer::ArrowWriter; +use crate::schema::{SchemaRef, StructType}; +use crate::{ + DeltaResult, Error, FileDataReadResultIterator, FileMeta, ParquetFooter, ParquetHandler, + PredicateRef, +}; + +use url::Url; pub(crate) struct SyncParquetHandler; fn try_create_from_parquet( file: File, schema: SchemaRef, - _arrow_schema: ArrowSchemaRef, predicate: Option, + file_location: String, ) -> DeltaResult>> { - let metadata = ArrowReaderMetadata::load(&file, Default::default())?; + let arrow_schema = Arc::new(schema.as_ref().try_into_arrow()?); + let reader_options = reader_options(); + let metadata = ArrowReaderMetadata::load(&file, reader_options.clone())?; let parquet_schema = metadata.schema(); - let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?; + let mut builder = ParquetRecordBatchReaderBuilder::try_new_with_options(file, reader_options)?; let (indices, requested_ordering) = get_requested_indices(&schema, parquet_schema)?; if let Some(mask) = generate_mask(&schema, parquet_schema, builder.parquet_schema(), &indices) { builder = builder.with_projection(mask); @@ -40,7 +50,15 @@ fn try_create_from_parquet( let mut row_indexes = row_indexes.map(|rb| rb.build()).transpose()?; let stream = builder.build()?; - Ok(stream.map(move |rbr| fixup_parquet_read(rbr?, &requested_ordering, row_indexes.as_mut()))) + Ok(stream.map(move |rbr| { + fixup_parquet_read( + rbr?, + &requested_ordering, + row_indexes.as_mut(), + Some(&file_location), + Some(&arrow_schema), + ) + })) } impl ParquetHandler for SyncParquetHandler { @@ -52,4 +70,325 
@@ impl ParquetHandler for SyncParquetHandler { ) -> DeltaResult { read_files(files, schema, predicate, try_create_from_parquet) } + + /// Writes engine data to a Parquet file at the specified location. + /// + /// This implementation uses synchronous file I/O to write the Parquet file. + /// If a file already exists at the given location, it will be overwritten. + /// + /// # Parameters + /// + /// - `location` - The full URL path where the Parquet file should be written + /// (e.g., `file:///path/to/file.parquet`). + /// - `data` - An iterator of engine data to be written to the Parquet file. + /// + /// # Returns + /// + /// A [`DeltaResult`] indicating success or failure. + fn write_parquet_file( + &self, + location: Url, + mut data: Box>> + Send>, + ) -> DeltaResult<()> { + // Convert URL to file path + let path = location + .to_file_path() + .map_err(|_| crate::Error::generic(format!("Invalid file URL: {location}")))?; + + let mut file = File::create(&path)?; + + // Get first batch to initialize writer with schema + let first_batch = data.next().ok_or_else(|| { + crate::Error::generic("Cannot write parquet file with empty data iterator") + })??; + let first_arrow = ArrowEngineData::try_from_engine_data(first_batch)?; + let first_record_batch: crate::arrow::array::RecordBatch = (*first_arrow).into(); + + let mut writer = ArrowWriter::try_new_with_options( + &mut file, + first_record_batch.schema(), + writer_options(), + )?; + writer.write(&first_record_batch)?; + + // Write remaining batches + for result in data { + let engine_data = result?; + let arrow_data = ArrowEngineData::try_from_engine_data(engine_data)?; + let batch: crate::arrow::array::RecordBatch = (*arrow_data).into(); + writer.write(&batch)?; + } + + writer.close()?; // writer must be closed to write footer + + Ok(()) + } + + fn read_parquet_footer(&self, file: &FileMeta) -> DeltaResult { + let path = file + .location + .to_file_path() + .map_err(|_| Error::generic("SyncEngine can only read local files"))?; + let file = File::open(path)?; + let metadata = ArrowReaderMetadata::load(&file, reader_options())?; + let schema = StructType::try_from_arrow(metadata.schema().as_ref()) + .map(Arc::new) + .map_err(Error::Arrow)?; + Ok(ParquetFooter { schema }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::array::{Array, Int64Array, RecordBatch, StringArray}; + use crate::engine::arrow_conversion::TryIntoKernel as _; + use std::sync::Arc; + use tempfile::tempdir; + use url::Url; + + #[test] + fn test_sync_write_parquet_file() { + let handler = SyncParquetHandler; + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test.parquet"); + let url = Url::from_file_path(&file_path).unwrap(); + + // Create test data + let engine_data: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![ + ( + "id", + Arc::new(Int64Array::from(vec![1, 2, 3])) as Arc, + ), + ( + "name", + Arc::new(StringArray::from(vec!["a", "b", "c"])) as Arc, + ), + ]) + .unwrap(), + )); + + // Create iterator with single batch + let data_iter: Box< + dyn Iterator>> + Send, + > = Box::new(std::iter::once(Ok(engine_data))); + + // Write the file + handler.write_parquet_file(url.clone(), data_iter).unwrap(); + + // Verify the file exists + assert!(file_path.exists()); + + // Read it back to verify + let file = File::open(&file_path).unwrap(); + let reader = + crate::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap(); + let schema = reader.schema().clone(); + let file_size 
= std::fs::metadata(&file_path).unwrap().len(); + let file_meta = FileMeta { + location: url, + last_modified: 0, + size: file_size, + }; + + let mut result = handler + .read_parquet_files( + &[file_meta], + Arc::new(schema.try_into_kernel().unwrap()), + None, + ) + .unwrap(); + + let engine_data = result.next().unwrap().unwrap(); + let batch = ArrowEngineData::try_from_engine_data(engine_data).unwrap(); + let record_batch = batch.record_batch(); + + // Verify shape + assert_eq!(record_batch.num_rows(), 3); + assert_eq!(record_batch.num_columns(), 2); + + // Verify content - id column + let id_col = record_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_col.values(), &[1, 2, 3]); + + // Verify content - name column + let name_col = record_batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_col.value(0), "a"); + assert_eq!(name_col.value(1), "b"); + assert_eq!(name_col.value(2), "c"); + + assert!(result.next().is_none()); + } + + #[test] + fn test_sync_write_parquet_file_with_filter() { + let handler = SyncParquetHandler; + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test_filtered.parquet"); + let url = Url::from_file_path(&file_path).unwrap(); + + // Create test data with only filtered rows: 1, 3, 5 + let engine_data: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![ + ( + "id", + Arc::new(Int64Array::from(vec![1, 3, 5])) as Arc, + ), + ( + "name", + Arc::new(StringArray::from(vec!["a", "c", "e"])) as Arc, + ), + ]) + .unwrap(), + )); + + // Create iterator with single batch + let data_iter: Box< + dyn Iterator>> + Send, + > = Box::new(std::iter::once(Ok(engine_data))); + + // Write the file + handler.write_parquet_file(url.clone(), data_iter).unwrap(); + + // Verify the file exists + assert!(file_path.exists()); + + // Read it back to verify only filtered rows are present + let file = File::open(&file_path).unwrap(); + let reader = + crate::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap(); + let schema = reader.schema().clone(); + let file_size = std::fs::metadata(&file_path).unwrap().len(); + let file_meta = FileMeta { + location: url, + last_modified: 0, + size: file_size, + }; + + let mut result = handler + .read_parquet_files( + &[file_meta], + Arc::new(schema.try_into_kernel().unwrap()), + None, + ) + .unwrap(); + + let engine_data = result.next().unwrap().unwrap(); + let batch = ArrowEngineData::try_from_engine_data(engine_data).unwrap(); + let record_batch = batch.record_batch(); + + // Verify shape - should only have 3 rows (filtered from 5) + assert_eq!(record_batch.num_rows(), 3); + assert_eq!(record_batch.num_columns(), 2); + + // Verify content - id column should have values 1, 3, 5 + let id_col = record_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_col.values(), &[1, 3, 5]); + + // Verify content - name column should have values "a", "c", "e" + let name_col = record_batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(name_col.value(0), "a"); + assert_eq!(name_col.value(1), "c"); + assert_eq!(name_col.value(2), "e"); + + assert!(result.next().is_none()); + } + + #[test] + fn test_sync_write_parquet_file_multiple_batches() { + let handler = SyncParquetHandler; + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("test_multi_batch.parquet"); + let url = Url::from_file_path(&file_path).unwrap(); + + // Create multiple batches + let batch1: Box = 
Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![( + "value", + Arc::new(Int64Array::from(vec![1, 2, 3])) as Arc, + )]) + .unwrap(), + )); + let batch2: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![( + "value", + Arc::new(Int64Array::from(vec![4, 5, 6])) as Arc, + )]) + .unwrap(), + )); + let batch3: Box = Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![( + "value", + Arc::new(Int64Array::from(vec![7, 8, 9])) as Arc, + )]) + .unwrap(), + )); + + // Create iterator with multiple batches + let batches = vec![Ok(batch1), Ok(batch2), Ok(batch3)]; + let data_iter: Box< + dyn Iterator>> + Send, + > = Box::new(batches.into_iter()); + + // Write the file + handler.write_parquet_file(url.clone(), data_iter).unwrap(); + + // Verify the file exists + assert!(file_path.exists()); + + // Read it back to verify all batches were written + let file = File::open(&file_path).unwrap(); + let reader = + crate::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap(); + let schema = reader.schema().clone(); + let file_size = std::fs::metadata(&file_path).unwrap().len(); + let file_meta = FileMeta { + location: url, + last_modified: 0, + size: file_size, + }; + + let mut result = handler + .read_parquet_files( + &[file_meta], + Arc::new(schema.try_into_kernel().unwrap()), + None, + ) + .unwrap(); + + let engine_data = result.next().unwrap().unwrap(); + let batch = ArrowEngineData::try_from_engine_data(engine_data).unwrap(); + let record_batch = batch.record_batch(); + + // Verify we have all 9 rows from 3 batches + assert_eq!(record_batch.num_rows(), 9); + let value_col = record_batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(value_col.values(), &[1, 2, 3, 4, 5, 6, 7, 8, 9]); + + assert!(result.next().is_none()); + } } diff --git a/kernel/src/engine/sync/storage.rs b/kernel/src/engine/sync/storage.rs index 9729b119e4..bf876a629e 100644 --- a/kernel/src/engine/sync/storage.rs +++ b/kernel/src/engine/sync/storage.rs @@ -71,9 +71,27 @@ impl StorageHandler for SyncStorageHandler { Ok(Box::new(iter)) } + fn put(&self, path: &Url, data: Bytes, overwrite: bool) -> DeltaResult<()> { + if path.scheme() != "file" { + return Err(Error::generic("Can only write to local filesystem")); + } + let file_path = path + .to_file_path() + .map_err(|_| Error::generic(format!("Invalid path for put: {path:?}")))?; + if !overwrite && file_path.exists() { + return Err(Error::FileAlreadyExists(file_path.to_string_lossy().into())); + } + std::fs::write(&file_path, &data) + .map_err(|e| Error::generic(format!("Failed to write {}: {e}", file_path.display()))) + } + fn copy_atomic(&self, _src: &Url, _dest: &Url) -> DeltaResult<()> { unimplemented!("SyncStorageHandler does not implement copy"); } + + fn head(&self, _path: &Url) -> DeltaResult { + unimplemented!("head is not implemented for SyncStorageHandler") + } } #[cfg(test)] diff --git a/kernel/src/engine/tests.rs b/kernel/src/engine/tests.rs new file mode 100644 index 0000000000..b534608c90 --- /dev/null +++ b/kernel/src/engine/tests.rs @@ -0,0 +1,333 @@ +//! Shared contract tests for engine handler traits ([`JsonHandler`], [`ParquetHandler`]). +//! +//! Each function here tests a specific piece of the handler contract using only the public +//! trait API plus [`GetData`]/[`RowVisitor`] for result inspection — no engine-specific +//! downcasting. Individual engine test modules call these to verify their implementation +//! 
satisfies the contract, then add any engine-specific assertions (e.g. Arrow encoding +//! details) in their own tests. + +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::{Arc, LazyLock}; + +use tempfile::{tempdir, NamedTempFile}; +use url::Url; + +use crate::arrow::array::{Array, Int64Array, RecordBatch, StringArray}; +use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; +use crate::engine::arrow_data::ArrowEngineData; +use crate::engine_data::{FilteredEngineData, GetData, RowVisitor, TypedGetData as _}; +use crate::object_store::path::Path; +use crate::parquet::arrow::arrow_writer::ArrowWriter; +use crate::parquet::arrow::PARQUET_FIELD_ID_META_KEY; +use crate::schema::{ + column_name, ColumnMetadataKey, ColumnName, ColumnNamesAndTypes, DataType, MetadataColumnSpec, + StructField, StructType, +}; +use crate::{DeltaResult, Engine, EngineData, FileMeta, JsonHandler, ParquetHandler}; +use itertools::Itertools; + +use test_utils::delta_path_for_version; + +// --------------------------------------------------------------------------- +// Shared file-setup helpers +// --------------------------------------------------------------------------- + +/// Writes `lines` as newline-delimited JSON to a [`NamedTempFile`] and returns the file +/// together with a [`FileMeta`] pointing at it. The temp file must be kept alive for as +/// long as the `FileMeta` is in use. +pub(crate) fn make_temp_json_file(lines: &[&str]) -> (NamedTempFile, FileMeta) { + let mut temp_file = NamedTempFile::new().expect("Failed to create temp file"); + for line in lines { + use std::io::Write as _; + writeln!(temp_file, "{line}").expect("Failed to write to temp file"); + } + let path = temp_file.path(); + let file_url = Url::from_file_path(path).expect("Failed to create file URL"); + let size = std::fs::metadata(path) + .expect("Failed to stat temp file") + .len(); + let file_meta = FileMeta { + location: file_url, + last_modified: 0, + size, + }; + (temp_file, file_meta) +} + +/// Builds a [`FileMeta`] for a local file path, reading the actual size from the filesystem. +pub(crate) fn file_meta_for(path: &std::path::Path) -> FileMeta { + let url = Url::from_file_path(path).unwrap(); + let size = std::fs::metadata(path).unwrap().len(); + FileMeta { + location: url, + last_modified: 0, + size, + } +} + +// --------------------------------------------------------------------------- +// JsonHandler contract tests +// --------------------------------------------------------------------------- + +/// Contract: any [`JsonHandler`] that receives a schema with a [`MetadataColumnSpec::FilePath`] +/// column must populate it with the file URL for every row, readable via [`GetData`] without +/// any Arrow downcasting. 
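// A minimal sketch (not part of the diff above) of how an engine's own test module might
// invoke one of these shared contract tests. The test name is hypothetical; SyncJsonHandler
// is just one example implementor of JsonHandler.
#[test]
fn example_sync_json_handler_satisfies_file_path_contract() {
    use crate::engine::sync::json::SyncJsonHandler;
    crate::engine::tests::test_json_handler_file_path_contract(&SyncJsonHandler);
}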
+pub(crate) fn test_json_handler_file_path_contract(handler: &dyn JsonHandler) { + let (_temp, file_meta) = make_temp_json_file(&[r#"{"x": 1}"#, r#"{"x": 2}"#]); + let expected_url = file_meta.location.to_string(); + + let schema = Arc::new( + StructType::try_new([ + StructField::not_null("x", DataType::INTEGER), + StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + ]) + .unwrap(), + ); + + let engine_data = handler + .read_json_files(&[file_meta], schema, None) + .unwrap() + .next() + .expect("expected at least one batch") + .unwrap(); + + struct FilePathCollector { + paths: Vec, + } + impl RowVisitor for FilePathCollector { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + static NAT: LazyLock = + LazyLock::new(|| (vec![column_name!("_file")], vec![DataType::STRING]).into()); + NAT.as_ref() + } + fn visit<'a>( + &mut self, + row_count: usize, + getters: &[&'a dyn GetData<'a>], + ) -> DeltaResult<()> { + for i in 0..row_count { + self.paths.push(getters[0].get(i, "_file")?); + } + Ok(()) + } + } + + let mut collector = FilePathCollector { paths: vec![] }; + collector.visit_rows_of(engine_data.as_ref()).unwrap(); + + assert_eq!(collector.paths.len(), 2, "expected 2 rows"); + assert!( + collector.paths.iter().all(|p| p == &expected_url), + "_file values should equal the file URL" + ); +} + +// --------------------------------------------------------------------------- +// ParquetHandler contract tests +// --------------------------------------------------------------------------- + +/// Contract: [`ParquetHandler::read_parquet_footer`] must correctly parse the schema +/// from a real Delta checkpoint file. +pub(crate) fn test_parquet_handler_reads_footer(handler: &dyn ParquetHandler) { + let path = std::fs::canonicalize(PathBuf::from( + "./tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.checkpoint.parquet", + )) + .unwrap(); + let file_meta = file_meta_for(&path); + let footer = handler.read_parquet_footer(&file_meta).unwrap(); + crate::utils::test_utils::validate_checkpoint_schema(&footer.schema); +} + +/// Contract: [`ParquetHandler::read_parquet_footer`] must return an error for a +/// non-existent file. +pub(crate) fn test_parquet_handler_footer_errors_on_missing_file(handler: &dyn ParquetHandler) { + let mut temp_path = std::env::temp_dir(); + temp_path.push("non_existent_kernel_test_file.parquet"); + let file_meta = FileMeta { + location: Url::from_file_path(&temp_path).unwrap(), + last_modified: 0, + size: 0, + }; + assert!( + handler.read_parquet_footer(&file_meta).is_err(), + "expected error for non-existent file" + ); +} + +/// Contract: [`ParquetHandler::read_parquet_footer`] must preserve Arrow field IDs, +/// accessible via [`ColumnMetadataKey::ParquetFieldId`]. 
+pub(crate) fn test_parquet_handler_footer_preserves_field_ids(handler: &dyn ParquetHandler) { + let make_field_with_id = |name: &str, ty: ArrowDataType, nullable: bool, id: &str| { + Field::new(name, ty, nullable).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + id.to_string(), + )])) + }; + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + make_field_with_id("id", ArrowDataType::Int64, false, "1"), + make_field_with_id("name", ArrowDataType::Utf8, true, "2"), + ])); + + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("field_ids.parquet"); + let batch = RecordBatch::try_new( + arrow_schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![1i64, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + let file = std::fs::File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, None).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let footer = handler + .read_parquet_footer(&file_meta_for(&file_path)) + .unwrap(); + + for (name, expected_id) in [("id", "1"), ("name", "2")] { + let field = footer.schema.fields().find(|f| f.name() == name).unwrap(); + assert_eq!( + field + .get_config_value(&ColumnMetadataKey::ParquetFieldId) + .map(|v| v.to_string()) + .as_deref(), + Some(expected_id), + "field '{name}' should have field ID {expected_id}" + ); + } +} + +/// Contract: [`ParquetHandler::write_parquet_file`] always overwrites an existing file. +/// Writes `[1, 2, 3]`, then overwrites with `[10, 20]`, and verifies only `[10, 20]` +/// is present. +pub(crate) fn test_parquet_handler_write_always_overwrites(handler: &dyn ParquetHandler) { + let temp_dir = tempdir().unwrap(); + let file_path = temp_dir.path().join("overwrite_test.parquet"); + let url = Url::from_file_path(&file_path).unwrap(); + + let make_data = |values: Vec| -> Box { + Box::new(ArrowEngineData::new( + RecordBatch::try_from_iter(vec![( + "value", + Arc::new(Int64Array::from(values)) as Arc, + )]) + .unwrap(), + )) + }; + + handler + .write_parquet_file( + url.clone(), + Box::new(std::iter::once(Ok(make_data(vec![1, 2, 3])))), + ) + .unwrap(); + handler + .write_parquet_file( + url.clone(), + Box::new(std::iter::once(Ok(make_data(vec![10, 20])))), + ) + .unwrap(); + + let file_meta = file_meta_for(&file_path); + let schema = Arc::new( + handler + .read_parquet_footer(&file_meta) + .unwrap() + .schema + .as_ref() + .clone(), + ); + let batches: Vec = handler + .read_parquet_files(&[file_meta], schema, None) + .unwrap() + .map(|r| { + ArrowEngineData::try_from_engine_data(r.unwrap()) + .unwrap() + .into() + }) + .collect(); + + assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 2, "expected 2 rows after overwrite"); + assert_eq!( + batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap() + .values(), + &[10, 20] + ); +} + +// --------------------------------------------------------------------------- +// Storage / Engine helpers (used by the engine-level tests) +// --------------------------------------------------------------------------- + +pub(crate) fn test_arrow_engine(engine: &dyn Engine, base_url: &Url) { + test_list_from_should_sort_and_filter(engine, base_url, get_arrow_data); +} + +fn test_list_from_should_sort_and_filter( + engine: &dyn Engine, + base_url: &Url, + engine_data: impl Fn() -> Box, +) { + let json = engine.json_handler(); + let get_data = || { + let data = engine_data(); + let filtered_data = FilteredEngineData::with_all_rows_selected(data); 
+ Box::new(std::iter::once(Ok(filtered_data))) + }; + + let expected_names: Vec = (1..4) + .map(|i| delta_path_for_version(i, "json")) + .collect_vec(); + + for i in expected_names.iter().rev() { + let path = base_url.join(i.as_ref()).unwrap(); + json.write_json_file(&path, get_data(), false).unwrap(); + } + let path = base_url.join("other").unwrap(); + json.write_json_file(&path, get_data(), false).unwrap(); + + let storage = engine.storage_handler(); + + let test_url = base_url.join(expected_names[0].as_ref()).unwrap(); + let files: Vec<_> = storage.list_from(&test_url).unwrap().try_collect().unwrap(); + assert_eq!(files.len(), expected_names.len() - 1); + for (file, expected) in files.iter().zip(expected_names.iter().skip(1)) { + assert_eq!(file.location, base_url.join(expected.as_ref()).unwrap()); + } + + let test_url = base_url + .join(delta_path_for_version(0, "json").as_ref()) + .unwrap(); + let files: Vec<_> = storage.list_from(&test_url).unwrap().try_collect().unwrap(); + assert_eq!(files.len(), expected_names.len()); + + let test_url = base_url.join("_delta_log/").unwrap(); + let files: Vec<_> = storage.list_from(&test_url).unwrap().try_collect().unwrap(); + assert_eq!(files.len(), expected_names.len()); + for (file, expected) in files.iter().zip(expected_names.iter()) { + assert_eq!(file.location, base_url.join(expected.as_ref()).unwrap()); + } +} + +fn get_arrow_data() -> Box { + let schema = Arc::new(ArrowSchema::new(vec![Field::new( + "dog", + ArrowDataType::Utf8, + true, + )])); + let data = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(StringArray::from(vec!["remi", "wilson"]))], + ) + .unwrap(); + Box::new(ArrowEngineData::new(data)) +} diff --git a/kernel/src/engine_data.rs b/kernel/src/engine_data.rs index 7edb70b7c9..4ce0770e9e 100644 --- a/kernel/src/engine_data.rs +++ b/kernel/src/engine_data.rs @@ -1,6 +1,7 @@ //! Traits that engines need to implement in order to pass data between themselves and kernel. use std::collections::HashMap; +use std::ops::Range; use tracing::debug; @@ -13,7 +14,7 @@ use crate::{AsAny, DeltaResult, Error}; /// /// A value of `true` in the selection vector means the corresponding row is selected (i.e., not deleted), /// while `false` means the row is logically deleted and should be ignored. If the selection vector is shorter -/// then the number of rows in `data` then all rows not covered by the selection vector are assumed to be selected. +/// than the number of rows in `data` then all rows not covered by the selection vector are assumed to be selected. /// /// Interpreting unselected (`false`) rows will result in incorrect/undefined behavior. pub struct FilteredEngineData { @@ -64,6 +65,13 @@ impl FilteredEngineData { selection_vector: vec![], } } + + /// Apply the contained selection vector and return an engine data with only the valid rows + /// included. 
This consumes the `FilteredEngineData` + pub fn apply_selection_vector(self) -> DeltaResult> { + self.data + .apply_selection_vector(self.selection_vector.clone()) + } } impl HasSelectionVector for FilteredEngineData { @@ -78,75 +86,123 @@ impl HasSelectionVector for FilteredEngineData { } } -/// a trait that an engine exposes to give access to a list -pub trait EngineList { - /// Return the length of the list at the specified row_index in the raw data - fn len(&self, row_index: usize) -> usize; - /// Get the item at `list_index` from the list at `row_index` in the raw data, and return it as a [`String`] - fn get(&self, row_index: usize, list_index: usize) -> String; - /// Materialize the entire list at row_index in the raw data into a `Vec` - fn materialize(&self, row_index: usize) -> Vec; +impl From> for FilteredEngineData { + /// Converts `EngineData` into `FilteredEngineData` with all rows selected. + /// + /// This is a convenience conversion that wraps the provided engine data + /// in a `FilteredEngineData` with an empty selection vector, meaning all + /// rows are logically selected. + /// + /// # Example + /// ```rust,ignore + /// let engine_data: Box = ...; + /// let filtered: FilteredEngineData = engine_data.into(); + /// ``` + fn from(data: Box) -> Self { + Self::with_all_rows_selected(data) + } +} + +/// Uniform read access to a string array, abstracting over the various string representations +/// that list and map columns may use (e.g. Utf8, LargeUtf8, Utf8View). Engines implement this +/// for their string array types so that [`ListItem`] and [`MapItem`] can resolve the concrete +/// type once at construction and access elements via virtual dispatch thereafter. +pub trait StringArrayAccessor { + /// Returns the number of elements in the array. + fn len(&self) -> usize; + /// Returns whether the array has no elements. + fn is_empty(&self) -> bool { + self.len() == 0 + } + /// Returns the string value at the given index. The caller must ensure `index < len()`. + fn value(&self, index: usize) -> &str; + /// Returns whether the value at the given index is non-null. + fn is_valid(&self, index: usize) -> bool; } -/// A list item is useful if the Engine needs to know what row of raw data it needs to access to -/// implement the [`EngineList`] trait. It simply wraps such a list, and the row. +/// A pre-resolved view into a single row's list of strings. The string array type is resolved +/// once at construction, so subsequent element accesses use virtual dispatch rather than +/// repeated downcasting. 
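To make the division of labor concrete, a toy accessor over a `Vec<String>` is enough to drive the `ListItem` defined just below. This is a sketch: `VecStrings` is illustrative, and a real engine would wrap its native string array type instead.

```rust
// Toy accessor: no nulls, values owned in a Vec<String>.
struct VecStrings(Vec<String>);

impl StringArrayAccessor for VecStrings {
    fn len(&self) -> usize {
        self.0.len()
    }
    fn value(&self, index: usize) -> &str {
        &self.0[index]
    }
    fn is_valid(&self, _index: usize) -> bool {
        true
    }
}

// A ListItem is "these values, restricted to one row's offset range": with values
// ["a", "b", "c", "d"] and offsets 1..3 it exposes exactly ["b", "c"].
fn list_item_demo() {
    let values = VecStrings(vec!["a".into(), "b".into(), "c".into(), "d".into()]);
    let item = ListItem::new(&values, 1..3);
    assert_eq!(item.len(), 2);
    assert_eq!(item.materialize(), vec!["b".to_string(), "c".to_string()]);
}
```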
pub struct ListItem<'a> { - list: &'a dyn EngineList, - row: usize, + values: &'a dyn StringArrayAccessor, + offsets: Range, } impl<'a> ListItem<'a> { - pub fn new(list: &'a dyn EngineList, row: usize) -> ListItem<'a> { - ListItem { list, row } + pub fn new(values: &'a dyn StringArrayAccessor, offsets: Range) -> ListItem<'a> { + ListItem { values, offsets } } pub fn len(&self) -> usize { - self.list.len(self.row) + self.offsets.len() } pub fn is_empty(&self) -> bool { - self.len() == 0 + self.offsets.is_empty() } pub fn get(&self, list_index: usize) -> String { - self.list.get(self.row, list_index) + self.values + .value(self.offsets.start + list_index) + .to_string() } pub fn materialize(&self) -> Vec { - self.list.materialize(self.row) + self.offsets + .clone() + .map(|i| self.values.value(i).to_string()) + .collect() } } -/// a trait that an engine exposes to give access to a map -pub trait EngineMap { - /// Get the item with the specified key from the map at `row_index` in the raw data, and return it as an `Option<&'a str>` - fn get<'a>(&'a self, row_index: usize, key: &str) -> Option<&'a str>; - /// Materialize the entire map at `row_index` in the raw data into a `HashMap`. Note that in - /// conjunction with the `allow_null_container_values` attribute, `materialize` _drops_ any - /// (key, value) pairs where the underlying value was `null`. If preserving `null` values is - /// important, use the `allow_null_container_values` attribute, and manually materialize the map - /// using [`Self::get`]. - fn materialize(&self, row_index: usize) -> HashMap; -} - -/// A map item is useful if the Engine needs to know what row of raw data it needs to access to -/// implement the [`EngineMap`] trait. It simply wraps such a map, and the row. +/// A pre-resolved view into a single row's map of string keys to string values. Like +/// [`ListItem`], the string array types are resolved once at construction. +/// +/// Note: in conjunction with the `allow_null_container_values` attribute, [`materialize`] +/// _drops_ any (key, value) pairs where the underlying value was null. If preserving null +/// values is important, use the `allow_null_container_values` attribute and manually +/// materialize the map using [`MapItem::get`]. 
+/// +/// [`materialize`]: MapItem::materialize pub struct MapItem<'a> { - map: &'a dyn EngineMap, - row: usize, + keys: &'a dyn StringArrayAccessor, + values: &'a dyn StringArrayAccessor, + offsets: Range, } impl<'a> MapItem<'a> { - pub fn new(map: &'a dyn EngineMap, row: usize) -> MapItem<'a> { - MapItem { map, row } + pub fn new( + keys: &'a dyn StringArrayAccessor, + values: &'a dyn StringArrayAccessor, + offsets: Range, + ) -> MapItem<'a> { + MapItem { + keys, + values, + offsets, + } } pub fn get(&self, key: &str) -> Option<&'a str> { - self.map.get(self.row, key) + let idx = self + .offsets + .clone() + .rev() + .find(|&idx| self.keys.value(idx) == key)?; + self.values.is_valid(idx).then(|| self.values.value(idx)) } pub fn materialize(&self) -> HashMap { - self.map.materialize(self.row) + let mut ret = HashMap::with_capacity(self.offsets.len()); + for idx in self.offsets.clone() { + if self.values.is_valid(idx) { + ret.insert( + self.keys.value(idx).to_string(), + self.values.value(idx).to_string(), + ); + } + } + ret } } @@ -171,7 +227,13 @@ pub trait GetData<'a> { (get_bool, bool), (get_int, i32), (get_long, i64), + (get_float, f32), + (get_double, f64), + (get_date, i32), + (get_timestamp, i64), + (get_decimal, i128), (get_str, &'a str), + (get_binary, &'a [u8]), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -192,7 +254,13 @@ impl<'a> GetData<'a> for () { (get_bool, bool), (get_int, i32), (get_long, i64), + (get_float, f32), + (get_double, f64), + (get_date, i32), + (get_timestamp, i64), + (get_decimal, i128), (get_str, &'a str), + (get_binary, &'a [u8]), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -222,11 +290,18 @@ macro_rules! impl_typed_get_data { }; } +// Note: get_date and get_timestamp are intentionally excluded because their return types (i32 and +// i64) collide with get_int and get_long, which would produce conflicting TypedGetData impls. +// Use get_date/get_timestamp directly instead of through TypedGetData. impl_typed_get_data!( (get_bool, bool), (get_int, i32), (get_long, i64), + (get_float, f32), + (get_double, f64), + (get_decimal, i128), (get_str, &'a str), + (get_binary, &'a [u8]), (get_list, ListItem<'a>), (get_map, MapItem<'a>) ); @@ -260,6 +335,105 @@ impl<'a> TypedGetData<'a, HashMap> for dyn GetData<'a> + '_ { } } +/// An iterator over the indices of selected rows in an engine-data batch. +/// +/// Each call to [`Iterator::next`] returns the index of the next selected row. +/// +/// Constructed internally and passed (alongside the column getters) to +/// [`FilteredRowVisitor::visit_filtered`]. +pub struct RowIndexIterator<'sv> { + sv_pos: usize, + selection_vector: &'sv [bool], + row_count: usize, +} + +impl<'sv> RowIndexIterator<'sv> { + pub(crate) fn new(row_count: usize, selection_vector: &'sv [bool]) -> Self { + Self { + sv_pos: 0, + selection_vector, + row_count, + } + } + + /// Returns the total number of rows in the batch (selected and deselected). + pub fn num_rows(&self) -> usize { + self.row_count + } +} + +impl<'sv> Iterator for RowIndexIterator<'sv> { + type Item = usize; + + fn next(&mut self) -> Option { + while self.sv_pos < self.row_count { + let pos = self.sv_pos; + self.sv_pos += 1; + if pos >= self.selection_vector.len() || self.selection_vector[pos] { + return Some(pos); + } + } + None + } +} + +/// A visitor that processes [`FilteredEngineData`] with automatic row filtering. 
+/// +/// Implementors provide [`visit_filtered`] which receives the column getters and a +/// [`RowIndexIterator`] that yields the index of each selected row. +/// The default [`visit_rows_of`] method handles all the plumbing: extracting the selection +/// vector, building the bridge, and calling [`EngineData::visit_rows`]. +/// +/// [`visit_filtered`]: FilteredRowVisitor::visit_filtered +/// [`visit_rows_of`]: FilteredRowVisitor::visit_rows_of +pub trait FilteredRowVisitor { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]); + + /// Process this batch. `getters` contains one [`GetData`] item per requested column. + /// Iterate `rows` to receive the index of each selected row. Use + /// [`RowIndexIterator::num_rows`] to get the total row count (for padding output + /// vectors with null values for deselected rows). + fn visit_filtered<'a>( + &mut self, + getters: &[&'a dyn GetData<'a>], + rows: RowIndexIterator<'_>, + ) -> DeltaResult<()>; + + /// Visit the rows of a [`FilteredEngineData`], automatically respecting the selection vector. + /// + /// Extracts the selection vector and passes a [`RowIndexIterator`] of selected row indices + /// to [`FilteredRowVisitor::visit_filtered`]. + fn visit_rows_of(&mut self, data: &FilteredEngineData) -> DeltaResult<()> + where + Self: Sized, + { + // column_names is 'static so this borrow ends immediately, before bridge borrows self + let column_names = self.selected_column_names_and_types().0; + let mut bridge = FilteredVisitorBridge { + visitor: self, + selection_vector: data.selection_vector(), + }; + data.data().visit_rows(column_names, &mut bridge) + } +} + +/// Private bridge that implements [`RowVisitor`] and forwards to a [`FilteredRowVisitor`]. +struct FilteredVisitorBridge<'bridge, V: FilteredRowVisitor> { + visitor: &'bridge mut V, + selection_vector: &'bridge [bool], +} + +impl RowVisitor for FilteredVisitorBridge<'_, V> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + self.visitor.selected_column_names_and_types() + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + let rows = RowIndexIterator::new(row_count, self.selection_vector); + self.visitor.visit_filtered(getters, rows) + } +} + /// A `RowVisitor` can be called back to visit extracted data. Aside from calling /// [`RowVisitor::visit`] on the visitor passed to [`EngineData::visit_rows`], engines do /// not need to worry about this trait. @@ -318,6 +492,9 @@ pub trait RowVisitor { /// fn append_columns(&self, schema: SchemaRef, columns: Vec) -> DeltaResult> { /// todo!() // convert `SchemaRef` and `ArrayData` into local representation and append them /// } +/// fn apply_selection_vector(self: Box, selection_vector: Vec) -> DeltaResult> { +/// todo!() // filter out unselected rows and return the new set of data +/// } /// } /// ``` pub trait EngineData: AsAny { @@ -363,6 +540,14 @@ pub trait EngineData: AsAny { schema: SchemaRef, columns: Vec, ) -> DeltaResult>; + + /// Apply a selection vector to the data and return a data where only the valid rows are + /// included. 
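A minimal sketch of a `FilteredRowVisitor` implementor, collecting one LONG column from only the selected rows. The column name is made up, and the `LazyLock` / `ColumnNamesAndTypes` imports are assumed here (they follow the same pattern the existing `RowVisitor` implementations use).

```rust
// Collects "id" values from selected rows only; deselected rows never show up
// because RowIndexIterator skips indices the selection vector marks false.
struct SelectedIdCollector {
    ids: Vec<i64>,
}

impl FilteredRowVisitor for SelectedIdCollector {
    fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) {
        static NAMES_AND_TYPES: LazyLock<ColumnNamesAndTypes> =
            LazyLock::new(|| (vec![ColumnName::new(["id"])], vec![DataType::LONG]).into());
        NAMES_AND_TYPES.as_ref()
    }

    fn visit_filtered<'a>(
        &mut self,
        getters: &[&'a dyn GetData<'a>],
        rows: RowIndexIterator<'_>,
    ) -> DeltaResult<()> {
        for row in rows {
            self.ids.push(getters[0].get(row, "id")?);
        }
        Ok(())
    }
}

// Driving it: `collector.visit_rows_of(&filtered_engine_data)?` handles the rest.
```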
This consumes the EngineData, allowing engines to implement this "in place" if + /// desired + fn apply_selection_vector( + self: Box, + selection_vector: Vec, + ) -> DeltaResult>; } #[cfg(test)] @@ -373,23 +558,25 @@ mod tests { DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, }; use crate::engine::arrow_data::ArrowEngineData; + use rstest::rstest; use std::sync::Arc; - #[test] - fn test_with_all_rows_selected_empty_data() { - // Test with empty data + fn get_engine_data(rows: usize) -> Box { let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( "value", ArrowDataType::Utf8, true, )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(Vec::::new()))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); + let data: Vec = (0..rows).map(|i| format!("row{i}")).collect(); + Box::new(ArrowEngineData::new( + RecordBatch::try_new(schema, vec![Arc::new(StringArray::from(data))]).unwrap(), + )) + } + #[test] + fn test_with_all_rows_selected_empty_data() { + // Test with empty data + let data = get_engine_data(0); let filtered_data = FilteredEngineData::with_all_rows_selected(data); assert_eq!(filtered_data.selection_vector().len(), 0); @@ -400,18 +587,7 @@ mod tests { #[test] fn test_with_all_rows_selected_single_row() { // Test with single row - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(vec!["single_row"]))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = get_engine_data(1); let filtered_data = FilteredEngineData::with_all_rows_selected(data); // According to the new contract, empty selection vector means all rows are selected @@ -423,20 +599,7 @@ mod tests { #[test] fn test_with_all_rows_selected_multiple_rows() { // Test with multiple rows - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(vec![ - "row1", "row2", "row3", "row4", - ]))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = get_engine_data(4); let filtered_data = FilteredEngineData::with_all_rows_selected(data); // According to the new contract, empty selection vector means all rows are selected @@ -448,18 +611,7 @@ mod tests { #[test] fn test_has_selected_rows_empty_data() { // Test with empty data - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(Vec::::new()))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = get_engine_data(0); let filtered_data = FilteredEngineData::try_new(data, vec![]).unwrap(); // Empty data should return false even with empty selection vector @@ -469,18 +621,7 @@ mod tests { #[test] fn test_has_selected_rows_selection_vector_shorter_than_data() { // Test with selection vector shorter than data length - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(vec!["row1", "row2", "row3"]))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = 
get_engine_data(3); // Selection vector with only 2 elements for 3 rows of data let filtered_data = FilteredEngineData::try_new(data, vec![false, false]).unwrap(); @@ -490,19 +631,7 @@ mod tests { #[test] fn test_has_selected_rows_selection_vector_same_length_all_false() { - // Test with selection vector same length as data, all false - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(vec!["row1", "row2"]))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = get_engine_data(2); let filtered_data = FilteredEngineData::try_new(data, vec![false, false]).unwrap(); // Should return false because no rows are selected @@ -511,19 +640,7 @@ mod tests { #[test] fn test_has_selected_rows_selection_vector_same_length_some_true() { - // Test with selection vector same length as data, some true - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(vec!["row1", "row2", "row3"]))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = get_engine_data(3); let filtered_data = FilteredEngineData::try_new(data, vec![true, false, true]).unwrap(); // Should return true because some rows are selected @@ -533,18 +650,7 @@ mod tests { #[test] fn test_try_new_selection_vector_larger_than_data() { // Test with selection vector larger than data length - should return error - let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( - "value", - ArrowDataType::Utf8, - true, - )])); - let record_batch = RecordBatch::try_new( - schema, - vec![Arc::new(StringArray::from(vec!["row1", "row2"]))], - ) - .unwrap(); - let data: Box = Box::new(ArrowEngineData::new(record_batch)); - + let data = get_engine_data(2); // Selection vector with 3 elements for 2 rows of data - should fail let result = FilteredEngineData::try_new(data, vec![true, false, true]); @@ -557,4 +663,132 @@ mod tests { assert!(e.to_string().contains("3 > 2")); } } + + #[test] + fn test_get_binary_some_value() { + use crate::arrow::array::BinaryArray; + + // Use Arrow's BinaryArray implementation + let binary_data: Vec> = vec![Some(b"hello"), Some(b"world"), None]; + let binary_array = BinaryArray::from(binary_data); + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &binary_array; + + // Test getting first row + let result: Option<&[u8]> = getter.get_opt(0, "binary_field").unwrap(); + assert_eq!(result, Some(b"hello".as_ref())); + + // Test getting second row + let result: Option<&[u8]> = getter.get_opt(1, "binary_field").unwrap(); + assert_eq!(result, Some(b"world".as_ref())); + + // Test getting None value + let result: Option<&[u8]> = getter.get_opt(2, "binary_field").unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_get_binary_required() { + use crate::arrow::array::BinaryArray; + + let binary_data: Vec> = vec![Some(b"hello")]; + let binary_array = BinaryArray::from(binary_data); + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &binary_array; + + // Test using get() for required field + let result: &[u8] = getter.get(0, "binary_field").unwrap(); + assert_eq!(result, b"hello"); + } + + #[test] + fn test_get_binary_required_missing() { + use crate::arrow::array::BinaryArray; + + let 
binary_data: Vec> = vec![None]; + let binary_array = BinaryArray::from(binary_data); + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &binary_array; + + // Test using get() for missing required field should error + let result: DeltaResult<&[u8]> = getter.get(0, "binary_field"); + assert!(result.is_err()); + if let Err(e) = result { + assert!(e.to_string().contains("Data missing for field")); + } + } + + #[test] + fn test_get_binary_empty_bytes() { + use crate::arrow::array::BinaryArray; + + let binary_data: Vec> = vec![Some(b"")]; + let binary_array = BinaryArray::from(binary_data); + + // Cast to dyn GetData to use TypedGetData trait + let getter: &dyn GetData<'_> = &binary_array; + + // Test getting empty bytes + let result: Option<&[u8]> = getter.get_opt(0, "binary_field").unwrap(); + assert_eq!(result, Some([].as_ref())); + assert_eq!(result.unwrap().len(), 0); + } + + #[test] + fn test_from_engine_data() { + let data = get_engine_data(3); + let data_len = data.len(); // Save length before move + + // Use the From trait to convert + let filtered_data: FilteredEngineData = data.into(); + + // Verify all rows are selected (empty selection vector) + assert!(filtered_data.selection_vector().is_empty()); + assert_eq!(filtered_data.data().len(), data_len); + assert_eq!(filtered_data.data().len(), 3); + assert!(filtered_data.has_selected_rows()); + } + + #[test] + fn filtered_apply_seclection_vector_full() { + let data = get_engine_data(4); + let filtered = FilteredEngineData::try_new(data, vec![true, false, true, false]).unwrap(); + let data = filtered.apply_selection_vector().unwrap(); + assert_eq!(data.len(), 2); + } + + #[test] + fn filtered_apply_seclection_vector_partial() { + let data = get_engine_data(4); + let filtered = FilteredEngineData::try_new(data, vec![true, false]).unwrap(); + let data = filtered.apply_selection_vector().unwrap(); + assert_eq!(data.len(), 3); + } + + fn collect_indices(row_count: usize, selection: &[bool]) -> Vec { + RowIndexIterator::new(row_count, selection).collect() + } + + #[rstest] + #[case(0, &[], vec![])] + #[case(3, &[], vec![0, 1, 2])] + #[case(3, &[true, true, true], vec![0, 1, 2])] + #[case(3, &[false, false, false], vec![])] + #[case(5, &[true, false, false, true, true], vec![0, 3, 4])] + #[case(4, &[false, false, true, true], vec![2, 3])] + #[case(3, &[true, false, false], vec![0])] + // sv shorter than row_count: tail rows implicitly selected + #[case(4, &[false, true], vec![1, 2, 3])] + #[case(4, &[true, false], vec![0, 2, 3])] + #[case(4, &[false, true, false, true], vec![1, 3])] + fn row_index_iter( + #[case] row_count: usize, + #[case] selection: &[bool], + #[case] expected: Vec, + ) { + assert_eq!(collect_indices(row_count, selection), expected); + } } diff --git a/kernel/src/error.rs b/kernel/src/error.rs index d42cd6de46..8e92ff445b 100644 --- a/kernel/src/error.rs +++ b/kernel/src/error.rs @@ -2,6 +2,7 @@ use std::{ backtrace::{Backtrace, BacktraceStatus}, + convert::Infallible, num::ParseIntError, str::Utf8Error, }; @@ -13,7 +14,7 @@ use crate::Version; #[cfg(feature = "default-engine-base")] use crate::arrow::error::ArrowError; #[cfg(feature = "default-engine-base")] -use object_store; +use crate::object_store; /// A [`std::result::Result`] that has the kernel [`Error`] as the error variant pub type DeltaResult = std::result::Result; @@ -116,6 +117,10 @@ pub enum Error { #[error("Selection vector is larger than data length: {0}")] InvalidSelectionVector(String), + /// Transaction state is invalid 
for the requested operation + #[error("Invalid transaction state: {0}")] + InvalidTransactionState(String), + /// A specified URL was invalid #[error("Invalid url: {0}")] InvalidUrl(#[from] url::ParseError), @@ -187,6 +192,10 @@ pub enum Error { #[error("Unsupported: {0}")] Unsupported(String), + /// Cannot write a version checksum (CRC) file for this snapshot + #[error("Checksum write unsupported: {0}")] + ChecksumWriteUnsupported(String), + /// Parsing error when attempting to deserialize an interval #[error(transparent)] ParseIntervalError(#[from] ParseIntervalError), @@ -210,6 +219,10 @@ pub enum Error { /// Schema mismatch has occurred or invalid schema used somewhere #[error("Schema error: {0}")] Schema(String), + + /// Validation error for file statistics (e.g., missing required clustering column stats) + #[error("Stats validation error: {0}")] + StatsValidation(String), } // Convenience constructors for Error types that take a String argument @@ -274,6 +287,10 @@ impl Error { Self::InvalidProtocol(msg.to_string()) } + pub fn invalid_transaction_state(msg: impl ToString) -> Self { + Self::InvalidTransactionState(msg.to_string()) + } + pub fn unsupported(msg: impl ToString) -> Self { Self::Unsupported(msg.to_string()) } @@ -291,10 +308,14 @@ impl Error { Self::InvalidCheckpoint(msg.to_string()) } - pub(crate) fn schema(msg: impl ToString) -> Self { + pub fn schema(msg: impl ToString) -> Self { Self::Schema(msg.to_string()) } + pub fn stats_validation(msg: impl ToString) -> Self { + Self::StatsValidation(msg.to_string()) + } + // Capture a backtrace when the error is constructed. #[must_use] pub fn with_backtrace(self) -> Self { @@ -342,3 +363,13 @@ impl From for Error { } } } + +/// This impl is needed so the `?` operator can auto-convert `Result` to +/// `DeltaResult`. For example, `TryFrom` impls for infallible conversions use `Infallible` as +/// their error type, and this allows those results to be propagated with `?` in functions +/// returning `DeltaResult`. The match is unreachable since `Infallible` has no variants. +impl From for Error { + fn from(value: Infallible) -> Self { + match value {} + } +} diff --git a/kernel/src/expressions/column_names.rs b/kernel/src/expressions/column_names.rs index 27b6160001..6565f8e9d1 100644 --- a/kernel/src/expressions/column_names.rs +++ b/kernel/src/expressions/column_names.rs @@ -7,7 +7,7 @@ use std::iter::Peekable; use std::ops::Deref; /// A (possibly nested) column name. -#[derive(Debug, Clone, Default, PartialEq, PartialOrd, Eq, Ord)] +#[derive(Debug, Clone, Default, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize)] pub struct ColumnName { path: Vec, } @@ -97,6 +97,26 @@ impl ColumnName { pub fn into_inner(self) -> Vec { self.path } + + /// Returns the parent of this column name, or `None` if this is a top-level column. + /// + /// # Examples + /// + /// ``` + /// # use delta_kernel::expressions::ColumnName; + /// let path = ColumnName::new(["user", "address", "street"]); + /// assert_eq!(path.parent(), Some(ColumnName::new(["user", "address"]))); + /// + /// let path = ColumnName::new(["user"]); + /// assert_eq!(path.parent(), None); + /// ``` + pub fn parent(&self) -> Option { + if self.path.len() > 1 { + Some(ColumnName::new(&self.path[..self.path.len() - 1])) + } else { + None + } + } } /// Creates a new column name from a path of field names. Each field name is taken as-is, and may @@ -449,6 +469,7 @@ macro_rules! 
__joined_column_expr { } #[doc(inline)] pub use __joined_column_expr as joined_column_expr; +use serde::{Deserialize, Serialize}; #[cfg(test)] mod test { @@ -537,6 +558,18 @@ mod test { let name = column_name!("x.y.z"); let name = ColumnName::new(name); assert_eq!(name, column_name!("x.y.z")); + + // parent() + let simple_for_parent = column_name!("x"); + let nested_for_parent = column_name!("x.y"); + assert_eq!(simple_for_parent.parent(), None); + assert_eq!(nested_for_parent.parent(), Some(column_name!("x"))); + + let deep = column_name!("user.address.street"); + assert_eq!(deep.parent(), Some(column_name!("user.address"))); + + let single = ColumnName::new(["field"]); + assert_eq!(single.parent(), None); } #[test] diff --git a/kernel/src/expressions/literal_expression_transform.rs b/kernel/src/expressions/literal_expression_transform.rs index 82d4dc47df..04edbfc199 100644 --- a/kernel/src/expressions/literal_expression_transform.rs +++ b/kernel/src/expressions/literal_expression_transform.rs @@ -7,14 +7,13 @@ use std::ops::Deref as _; use tracing::debug; use crate::expressions::{Expression, Scalar}; -use crate::schema::{ - ArrayType, DataType, MapType, PrimitiveType, SchemaTransform, StructField, StructType, -}; +use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; +use crate::transforms::SchemaTransform; +use crate::DeltaResult; /// [`SchemaTransform`] that will transform a [`Schema`] and an ordered list of leaf values /// (Scalars) into an Expression with a [`Literal`] expr for each leaf. -#[derive(Debug)] -pub(crate) struct LiteralExpressionTransform<'a, T: Iterator> { +struct LiteralExpressionTransform<'a, T: Iterator> { /// Leaf values to insert in schema order. scalars: T, /// A stack of built Expressions. After visiting children, we pop them off to @@ -48,27 +47,27 @@ pub enum Error { Unsupported(String), } -impl<'a, I: Iterator> LiteralExpressionTransform<'a, I> { - pub(crate) fn new(scalars: impl IntoIterator) -> Self { - Self { - scalars: scalars.into_iter(), - stack: Vec::new(), - error: Ok(()), - } +/// Transforms the schema and leaf values into a literal row expression. +pub(crate) fn literal_expression_transform<'a>( + schema: &'a StructType, + scalars: impl IntoIterator, +) -> DeltaResult { + let mut transform = LiteralExpressionTransform { + scalars: scalars.into_iter(), + stack: Vec::new(), + error: Ok(()), + }; + let _ = transform.transform_struct(schema); + transform.error?; + + if let Some(s) = transform.scalars.next() { + return Err(Error::ExcessScalars(s.clone()).into()); } - /// return the Expression we just built (or propagate Error). 
the top of `stack` should be our - /// final Expression - pub(crate) fn try_into_expr(mut self) -> Result { - self.error?; - - if let Some(s) = self.scalars.next() { - return Err(Error::ExcessScalars(s.clone())); - } - - self.stack.pop().ok_or(Error::EmptyStack) - } + transform.stack.pop().ok_or(Error::EmptyStack.into()) +} +impl<'a, I: Iterator> LiteralExpressionTransform<'a, I> { fn set_error(&mut self, error: Error) { // Only set when the error not yet set if let Err(ref existing_error) = self.error { @@ -208,19 +207,10 @@ mod tests { schema: SchemaRef, expected: Result, ) { - let mut schema_transform = LiteralExpressionTransform::new(values); - let datatype = schema.into(); - let _transformed = schema_transform.transform(&datatype); + let transformed = literal_expression_transform(&schema, values); match expected { - Ok(expected_expr) => { - let actual_expr = schema_transform.try_into_expr().unwrap(); - // TODO: we can't compare NULLs so we convert with .to_string to workaround - // see: https://github.com/delta-io/delta-kernel-rs/pull/677 - assert_eq!(expected_expr.to_string(), actual_expr.to_string()); - } - Err(()) => { - assert!(schema_transform.try_into_expr().is_err()); - } + Ok(expected_expr) => assert_eq!(expected_expr, transformed.unwrap()), + Err(()) => assert!(transformed.is_err()), } } diff --git a/kernel/src/expressions/mod.rs b/kernel/src/expressions/mod.rs index ccba3d5c89..6b72ab20c4 100644 --- a/kernel/src/expressions/mod.rs +++ b/kernel/src/expressions/mod.rs @@ -1,27 +1,30 @@ //! Definitions and functions to create and manipulate kernel expressions +use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::fmt::{Display, Formatter}; use std::sync::Arc; use itertools::Itertools; +use serde::{de, ser, Deserialize, Deserializer, Serialize, Serializer}; pub use self::column_names::{ column_expr, column_expr_ref, column_name, column_pred, joined_column_expr, joined_column_name, ColumnName, }; pub use self::scalars::{ArrayData, DecimalData, MapData, Scalar, StructData}; -use self::transforms::{ExpressionTransform as _, GetColumnReferences}; use crate::kernel_predicates::{ DirectDataSkippingPredicateEvaluator, DirectPredicateEvaluator, IndirectDataSkippingPredicateEvaluator, }; +use crate::schema::SchemaRef; +use crate::transforms::ExpressionTransform; use crate::{DataType, DeltaResult, DynPartialEq}; mod column_names; pub(crate) mod literal_expression_transform; +pub(crate) use literal_expression_transform::literal_expression_transform; mod scalars; -pub mod transforms; pub type ExpressionRef = std::sync::Arc; pub type PredicateRef = std::sync::Arc; @@ -31,14 +34,14 @@ pub type PredicateRef = std::sync::Arc; //////////////////////////////////////////////////////////////////////// /// A unary predicate operator. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] pub enum UnaryPredicateOp { /// Unary Is Null IsNull, } /// A binary predicate operator. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum BinaryPredicateOp { /// Comparison Less Than LessThan, @@ -53,14 +56,14 @@ pub enum BinaryPredicateOp { } /// A unary expression operator. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum UnaryExpressionOp { /// Convert struct data to JSON-encoded strings ToJson, } /// A binary expression operator. 
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum BinaryExpressionOp { /// Arithmetic Plus Plus, @@ -73,14 +76,14 @@ pub enum BinaryExpressionOp { } /// A variadic expression operator. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum VariadicExpressionOp { /// Collapse multiple values into one by taking the first non-null value Coalesce, } /// A junction (AND/OR) predicate operator. -#[derive(Debug, Clone, Copy, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] pub enum JunctionPredicateOp { /// Conjunction And, @@ -190,7 +193,7 @@ pub type OpaquePredicateOpRef = Arc; // Expressions and predicates //////////////////////////////////////////////////////////////////////// -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct UnaryPredicate { /// The operator. pub op: UnaryPredicateOp, @@ -198,7 +201,7 @@ pub struct UnaryPredicate { pub expr: Box, } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct BinaryPredicate { /// The operator. pub op: BinaryPredicateOp, @@ -208,7 +211,7 @@ pub struct BinaryPredicate { pub right: Box, } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct UnaryExpression { /// The operator. pub op: UnaryExpressionOp, @@ -216,7 +219,7 @@ pub struct UnaryExpression { pub expr: Box, } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct BinaryExpression { /// The operator. pub op: BinaryExpressionOp, @@ -226,7 +229,7 @@ pub struct BinaryExpression { pub right: Box, } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct VariadicExpression { /// The operator. pub op: VariadicExpressionOp, @@ -234,7 +237,18 @@ pub struct VariadicExpression { pub exprs: Vec, } -#[derive(Clone, Debug, PartialEq)] +/// An expression that parses a JSON string into a struct with the given schema. +/// This is the inverse of `ToJson` - it converts a JSON-encoded string column into a +/// struct column. +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct ParseJsonExpression { + /// The expression that evaluates to a STRING column containing JSON objects. + pub json_expr: Box, + /// The schema defining the structure to parse the JSON into. + pub output_schema: SchemaRef, +} + +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct JunctionPredicate { /// The operator. pub op: JunctionPredicateOp, @@ -245,14 +259,36 @@ pub struct JunctionPredicate { // NOTE: We have to use `Arc` instead of `Box` because // we cannot require `OpaquePredicateOp: Clone` (not a dyn-compatible trait). Instead, we must rely // on cheap `Arc` clone, which does not duplicate the inner object. +// +// TODO(#1564): OpaquePredicate currently does not support serialization or deserialization. In the +// future, the [`OpaquePredicateOp`] trait can be extended to support ser/de. 
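As a quick illustration of what these derives enable, a sketch mirroring the round-trip test helper added further below (`serde_json` is the same crate those tests rely on):

```rust
// Any non-opaque expression can now round-trip through JSON.
let expr = Expression::binary(
    BinaryExpressionOp::Plus,
    column_expr!("a"),
    Expression::literal(1i64),
);
let json = serde_json::to_string(&expr).unwrap();
let back: Expression = serde_json::from_str(&json).unwrap();
assert_eq!(expr, back);

// Expression::Opaque and Predicate::Opaque are the exception: their
// serialize_with / deserialize_with hooks return a serde error instead.
```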
#[derive(Clone, Debug)] pub struct OpaquePredicate { pub op: OpaquePredicateOpRef, pub exprs: Vec, } +fn fail_serialize_opaque_predicate( + _value: &OpaquePredicate, + _serializer: S, +) -> Result +where + S: Serializer, +{ + Err(ser::Error::custom("Cannot serialize an Opaque Predicate")) +} + +fn fail_deserialize_opaque_predicate<'de, D>(_deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + Err(de::Error::custom("Cannot deserialize an Opaque Predicate")) +} impl OpaquePredicate { - fn new(op: OpaquePredicateOpRef, exprs: impl IntoIterator) -> Self { + pub(crate) fn new( + op: OpaquePredicateOpRef, + exprs: impl IntoIterator, + ) -> Self { let exprs = exprs.into_iter().collect(); Self { op, exprs } } @@ -261,6 +297,9 @@ impl OpaquePredicate { // NOTE: We have to use `Arc` instead of `Box` // because we cannot require `OpaqueExpressionOp: Clone` (not a dyn-compatible trait). Instead, we // must rely on cheap `Arc` clone, which does not duplicate the inner object. +// +// TODO(#1564): OpaqueExpression currently does not support serialization or deserialization. In the +// future, the [`OpaqueExpressionOp`] trait can be extended to support ser/de. #[derive(Clone, Debug)] pub struct OpaqueExpression { pub op: OpaqueExpressionOpRef, @@ -268,20 +307,45 @@ pub struct OpaqueExpression { } impl OpaqueExpression { - fn new(op: OpaqueExpressionOpRef, exprs: impl IntoIterator) -> Self { + pub(crate) fn new( + op: OpaqueExpressionOpRef, + exprs: impl IntoIterator, + ) -> Self { let exprs = exprs.into_iter().collect(); Self { op, exprs } } } +fn fail_serialize_opaque_expression( + _value: &OpaqueExpression, + _serializer: S, +) -> Result +where + S: Serializer, +{ + Err(ser::Error::custom("Cannot serialize an Opaque Expression")) +} + +fn fail_deserialize_opaque_expression<'de, D>( + _deserializer: D, +) -> Result +where + D: Deserializer<'de>, +{ + Err(de::Error::custom("Cannot deserialize an Opaque Expression")) +} + /// A transformation affecting a single field (one pieces of a [`Transform`]). The transformation /// could insert 0+ new fields after the target, or could replace the target with 0+ a new fields). -#[derive(Debug, Clone, PartialEq, Default)] +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] pub struct FieldTransform { /// The list of expressions this field transform emits at the target location. pub exprs: Vec, /// If true, the output expressions replace the input field instead of following after it. pub is_replace: bool, + /// If true, this transform is silently ignored when the target field does not exist in the + /// input. Otherwise, a missing target field produces an error. + pub optional: bool, } /// A transformation that efficiently represents sparse modifications to struct schemas. @@ -291,7 +355,7 @@ pub struct FieldTransform { /// not specifically mentioned by the transform is passed through, unmodified and with the same /// relative field ordering. This is particularly useful for wide schemas where only a few columns /// need to be modified and/or dropped, or where a small number of columns need to be injected. -#[derive(Debug, Clone, PartialEq, Default)] +#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] pub struct Transform { /// The path to the nested input struct this transform operates on (if any). If no path is /// given, the transform operates directly on top-level columns. @@ -328,6 +392,14 @@ impl Transform { self } + /// Like [`Self::with_dropped_field`], but silently ignored if the field does not exist. 
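For example, a sketch of the builder in use (the column names are placeholders):

```rust
// Drop "tmp_col" only if the input struct actually contains it, and replace
// "status" with a literal; every other field passes through unchanged.
let transform = Transform::new_top_level()
    .with_dropped_field_if_exists("tmp_col")
    .with_replaced_field("status", Arc::new(Expression::literal("active")));
let expr = Expression::transform(transform);
```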
+ pub fn with_dropped_field_if_exists(mut self, name: impl Into) -> Self { + let field_transform = self.field_transform(name); + field_transform.is_replace = true; + field_transform.optional = true; + self + } + /// Specifies an expression to replace a field with. pub fn with_replaced_field(mut self, name: impl Into, expr: ExpressionRef) -> Self { let field_transform = self.field_transform(name); @@ -373,7 +445,7 @@ impl Transform { /// These expressions do not track or validate data types, other than the type /// of literals. It is up to the expression evaluator to validate the /// expression against a schema and add appropriate casts as required. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum Expression { /// A literal value. Literal(Scalar), @@ -381,8 +453,9 @@ pub enum Expression { Column(ColumnName), /// A predicate treated as a boolean expression Predicate(Box), // should this be Arc? - /// A struct computed from a Vec of expressions - Struct(Vec), + /// A struct computed from a Vec of expressions. + /// The optional nullability predicate, if provided and evaluates to false/null, makes the entire struct null. + Struct(Vec, Option), /// A sparse transformation of a struct schema. More efficient than `Struct` for wide schemas /// where only a few fields change, achieving O(changes) instead of O(schema_width) complexity. Transform(Transform), @@ -394,6 +467,8 @@ pub enum Expression { Variadic(VariadicExpression), /// An expression that the engine defines and implements. Kernel interacts with the expression /// only through methods provided by the [`OpaqueExpressionOp`] trait. + #[serde(serialize_with = "fail_serialize_opaque_expression")] + #[serde(deserialize_with = "fail_deserialize_opaque_expression")] Opaque(OpaqueExpression), /// An unknown expression (i.e. one that neither kernel nor engine attempts to evaluate). For /// data skipping purposes, kernel treats unknown expressions as if they were literal NULL @@ -404,6 +479,11 @@ pub enum Expression { /// all rows -- almost certainly NOT what the query author intended. Use `Expression::Opaque` /// for expressions kernel doesn't understand but which engine can still evaluate. Unknown(String), + /// Parse a JSON string expression into a struct with the given schema. + ParseJson(ParseJsonExpression), + /// Extract keys from a `Map` and parse values into a typed struct using + /// Delta's partition value serialization rules. + MapToStruct(MapToStructExpression), } /// A SQL predicate. @@ -411,7 +491,7 @@ pub enum Expression { /// These predicates do not track or validate data types, other than the type /// of literals. It is up to the predicate evaluator to validate the /// predicate against a schema and add appropriate casts as required. -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum Predicate { /// A boolean-valued expression, useful for e.g. `AND(, )`. BooleanExpression(Expression), @@ -430,6 +510,8 @@ pub enum Predicate { Junction(JunctionPredicate), /// A predicate that the engine defines and implements. Kernel interacts with the predicate /// only through methods provided by the [`OpaquePredicateOp`] trait. + #[serde(serialize_with = "fail_serialize_opaque_predicate")] + #[serde(deserialize_with = "fail_deserialize_opaque_predicate")] Opaque(OpaquePredicate), /// An unknown predicate (i.e. one that neither kernel nor engine attempts to evaluate). 
For /// data skipping purposes, kernel treats unknown predicates as if they were literal NULL values @@ -468,21 +550,21 @@ impl JunctionPredicateOp { } impl UnaryExpression { - fn new(op: UnaryExpressionOp, expr: impl Into) -> Self { + pub(crate) fn new(op: UnaryExpressionOp, expr: impl Into) -> Self { let expr = Box::new(expr.into()); Self { op, expr } } } impl UnaryPredicate { - fn new(op: UnaryPredicateOp, expr: impl Into) -> Self { + pub(crate) fn new(op: UnaryPredicateOp, expr: impl Into) -> Self { let expr = Box::new(expr.into()); Self { op, expr } } } impl BinaryExpression { - fn new( + pub(crate) fn new( op: BinaryExpressionOp, left: impl Into, right: impl Into, @@ -494,7 +576,7 @@ impl BinaryExpression { } impl BinaryPredicate { - fn new( + pub(crate) fn new( op: BinaryPredicateOp, left: impl Into, right: impl Into, @@ -506,7 +588,7 @@ impl BinaryPredicate { } impl VariadicExpression { - fn new( + pub(crate) fn new( op: VariadicExpressionOp, exprs: impl IntoIterator>, ) -> Self { @@ -515,8 +597,42 @@ impl VariadicExpression { } } +impl ParseJsonExpression { + pub(crate) fn new(json_expr: impl Into, output_schema: SchemaRef) -> Self { + Self { + json_expr: Box::new(json_expr.into()), + output_schema, + } + } +} + +/// Transforms a `Map` column into a struct whose schema is provided by the +/// evaluator's output type (via `result_type`). Each row in the map column becomes one row in +/// the output struct column: a `key` -> `value` mapping in the map means the struct field named +/// `key` receives `value`, parsed into the field's target type using Delta's partition value +/// serialization rules ([`PrimitiveType::parse_scalar`]). +/// +/// - Missing keys produce null values +/// - Parse errors are propagated (indicating a broken table) +/// - Duplicate map keys are resolved by taking the rightmost entry +/// +/// [`PrimitiveType::parse_scalar`]: crate::schema::PrimitiveType::parse_scalar +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] +pub struct MapToStructExpression { + /// The expression that evaluates to a `Map` column. + pub map_expr: Box, +} + +impl MapToStructExpression { + pub(crate) fn new(map_expr: impl Into) -> Self { + Self { + map_expr: Box::new(map_expr.into()), + } + } +} + impl JunctionPredicate { - fn new(op: JunctionPredicateOp, preds: Vec) -> Self { + pub(crate) fn new(op: JunctionPredicateOp, preds: Vec) -> Self { Self { op, preds } } } @@ -526,7 +642,7 @@ impl Expression { pub fn references(&self) -> HashSet<&ColumnName> { let mut references = GetColumnReferences::default(); let _ = references.transform_expr(self); - references.into_inner() + references.0 } /// Create a new column name expression from input satisfying `FromIterator for ColumnName`. @@ -555,9 +671,28 @@ impl Expression { } } - /// Create a new struct expression + /// Create a new struct expression. + /// + /// The field names and types are supplied by the caller at evaluation time via the + /// `result_type` parameter of the expression evaluator. Use this when the schema is + /// always available from external context (e.g. the expression is the top-level output + /// of [`crate::ExpressionEvaluator`]). pub fn struct_from(exprs: impl IntoIterator>>) -> Self { - Self::Struct(exprs.into_iter().map(Into::into).collect()) + Self::Struct(exprs.into_iter().map(Into::into).collect(), None) + } + + /// Create a new struct expression with a nullability predicate. + /// + /// When the predicate evaluates to false or null for a row, the entire struct is null + /// for that row. 
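A minimal sketch of the call shape (column names are placeholders; the arguments lean on the usual `Into` conversions):

```rust
// Build Struct(a, b), but make the whole struct NULL on rows where the boolean
// column "is_valid" evaluates to false or null.
let expr = Expression::struct_with_nullability_from(
    [column_expr!("a"), column_expr!("b")],
    column_expr!("is_valid"),
);
```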
+ pub fn struct_with_nullability_from( + exprs: impl IntoIterator>>, + nullability_predicate: impl Into>, + ) -> Self { + Self::Struct( + exprs.into_iter().map(Into::into).collect(), + Some(nullability_predicate.into()), + ) } /// Create a new transform expression @@ -632,6 +767,14 @@ impl Expression { Self::Variadic(VariadicExpression::new(op, exprs)) } + /// Creates a new COALESCE expression that returns the first non-null value. + /// + /// COALESCE evaluates expressions in order and returns the first non-null result. + /// If all expressions evaluate to null, the result is null. + pub fn coalesce(exprs: impl IntoIterator>) -> Self { + Self::variadic(VariadicExpressionOp::Coalesce, exprs) + } + /// Creates a new opaque expression pub fn opaque( op: impl OpaqueExpressionOp, @@ -644,6 +787,19 @@ impl Expression { pub fn unknown(name: impl Into) -> Self { Self::Unknown(name.into()) } + + /// Creates a new ParseJson expression that parses a JSON string column into a struct. + /// This is the inverse of `ToJson` - it converts a JSON-encoded string into a struct. + pub fn parse_json(json_expr: impl Into, output_schema: SchemaRef) -> Self { + Self::ParseJson(ParseJsonExpression::new(json_expr, output_schema)) + } + + /// Extracts keys from a `Map` and parses values into a typed struct using + /// Delta's partition value serialization rules. The output struct schema is determined by the + /// evaluator's `result_type`. + pub fn map_to_struct(map_expr: impl Into) -> Self { + Self::MapToStruct(MapToStructExpression::new(map_expr)) + } } impl Predicate { @@ -651,7 +807,7 @@ impl Predicate { pub fn references(&self) -> HashSet<&ColumnName> { let mut references = GetColumnReferences::default(); let _ = references.transform_pred(self); - references.into_inner() + references.0 } /// Creates a new boolean column reference. See also [`Expression::column`]. @@ -740,12 +896,14 @@ impl Predicate { Self::or_from([a.into(), b.into()]) } - /// Creates a new predicate AND(preds...) + /// Creates a new predicate AND(preds...). See [`Self::junction`] for normalization of + /// empty and single-element inputs. pub fn and_from(preds: impl IntoIterator) -> Self { Self::junction(JunctionPredicateOp::And, preds) } - /// Creates a new predicate OR(preds...) + /// Creates a new predicate OR(preds...). See [`Self::junction`] for normalization of + /// empty and single-element inputs. pub fn or_from(preds: impl IntoIterator) -> Self { Self::junction(JunctionPredicateOp::Or, preds) } @@ -769,10 +927,24 @@ impl Predicate { }) } - /// Creates a new junction predicate OP(preds...) + /// Creates a new junction predicate OP(preds...). Normalizes degenerate cases: + /// + /// - Empty junction returns the identity element (the value that has no effect when + /// combined with other predicates under the same operator): + /// - `AND()` -> `true`, because `true AND p` == `p` for any predicate `p`. + /// - `OR()` -> `false`, because `false OR p` == `p` for any predicate `p`. + /// - Single-element junction unwraps the element: `AND(p)` / `OR(p)` -> `p`. pub fn junction(op: JunctionPredicateOp, preds: impl IntoIterator) -> Self { - let preds = preds.into_iter().collect(); - Self::Junction(JunctionPredicate { op, preds }) + let mut preds: Vec<_> = preds.into_iter().collect(); + match preds.len() { + 0 => match op { + JunctionPredicateOp::And => Self::literal(true), + JunctionPredicateOp::Or => Self::literal(false), + }, + // A junction of one predicate is just that predicate. 
+ 1 => preds.remove(0), + _ => Self::Junction(JunctionPredicate { op, preds }), + } } /// Creates a new opaque predicate @@ -860,7 +1032,7 @@ impl Display for Expression { Literal(l) => write!(f, "{l}"), Column(name) => write!(f, "Column({name})"), Predicate(p) => write!(f, "{p}"), - Struct(exprs) => write!(f, "Struct({})", format_child_list(exprs)), + Struct(exprs, _) => write!(f, "Struct({})", format_child_list(exprs)), Transform(transform) => { write!(f, "Transform(")?; let mut sep = ""; @@ -898,6 +1070,15 @@ impl Display for Expression { write!(f, "{op:?}({})", format_child_list(exprs)) } Unknown(name) => write!(f, ""), + ParseJson(p) => { + write!( + f, + "PARSE_JSON({}, )", + p.json_expr, + p.output_schema.fields().len() + ) + } + MapToStruct(m) => write!(f, "MAP_TO_STRUCT({})", m.map_expr), } } } @@ -988,10 +1169,33 @@ impl> std::ops::Div for Expression { } } +/// Retrieves the set of column names referenced by an expression. +#[derive(Default)] +struct GetColumnReferences<'a>(HashSet<&'a ColumnName>); + +impl<'a> ExpressionTransform<'a> for GetColumnReferences<'a> { + fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option> { + self.0.insert(name); + Some(Cow::Borrowed(name)) + } +} + #[cfg(test)] mod tests { + use std::fmt::Debug; + + use serde::de::DeserializeOwned; + use serde::Serialize; + use super::{column_expr, column_pred, Expression as Expr, Predicate as Pred}; + /// Helper function to verify roundtrip serialization/deserialization + fn assert_roundtrip(value: &T) { + let json = serde_json::to_string(value).expect("serialization should succeed"); + let deserialized: T = serde_json::from_str(&json).expect("deserialization should succeed"); + assert_eq!(value, &deserialized, "roundtrip should preserve value"); + } + #[test] fn test_expression_format() { let cases = [ @@ -1054,4 +1258,500 @@ mod tests { assert_eq!(result, expected); } } + + // ==================== Serde Roundtrip Tests ==================== + + mod serde_tests { + use std::sync::Arc; + + use crate::expressions::scalars::{ArrayData, DecimalData, MapData, StructData}; + use crate::expressions::{ + column_expr, column_name, BinaryExpressionOp, BinaryPredicateOp, ColumnName, + Expression, Predicate, Scalar, Transform, UnaryExpressionOp, + }; + use crate::schema::{ArrayType, DataType, DecimalType, MapType, StructField}; + use crate::utils::test_utils::assert_result_error_with_message; + + use super::assert_roundtrip; + + // ==================== Expression::Literal Tests ==================== + + #[test] + fn test_literal_scalars_roundtrip() { + // Test all primitive scalar types that have proper PartialEq + let cases: Vec = vec![ + // Numeric types + Expression::literal(42i32), // Integer + Expression::literal(9999999999i64), // Long + Expression::literal(123i16), // Short + Expression::literal(42i8), // Byte + Expression::literal(1.12345677_32), // Float + Expression::literal(1.12345667_64), // Double + // String and Boolean + Expression::literal("hello world"), + Expression::literal(true), + Expression::literal(false), + // Temporal types + Expression::Literal(Scalar::Timestamp(1234567890000000)), + Expression::Literal(Scalar::TimestampNtz(1234567890000000)), + Expression::Literal(Scalar::Date(19000)), + // Binary + Expression::Literal(Scalar::Binary(vec![1, 2, 3, 4, 5])), + // Decimal + Expression::Literal(Scalar::Decimal( + DecimalData::try_new(12345i128, DecimalType::try_new(10, 2).unwrap()).unwrap(), + )), + ]; + + for expr in &cases { + assert_roundtrip(expr); + } + } + + #[test] + fn 
test_literal_complex_scalars_roundtrip() { + // Test complex scalar types that need JSON comparison (partial_cmp returns None) + let cases: Vec = vec![ + // Null with different types + Expression::null_literal(DataType::INTEGER), + Expression::null_literal(DataType::STRING), + Expression::null_literal(DataType::BOOLEAN), + // Array + Expression::Literal(Scalar::Array( + ArrayData::try_new( + ArrayType::new(DataType::INTEGER, false), + vec![Scalar::Integer(1), Scalar::Integer(2), Scalar::Integer(3)], + ) + .unwrap(), + )), + // Map + Expression::Literal(Scalar::Map( + MapData::try_new( + MapType::new(DataType::STRING, DataType::INTEGER, false), + vec![ + (Scalar::String("a".to_string()), Scalar::Integer(1)), + (Scalar::String("b".to_string()), Scalar::Integer(2)), + ], + ) + .unwrap(), + )), + // Struct + Expression::Literal(Scalar::Struct( + StructData::try_new( + vec![ + StructField::nullable("x", DataType::INTEGER), + StructField::nullable("y", DataType::STRING), + ], + vec![Scalar::Integer(42), Scalar::String("hello".to_string())], + ) + .unwrap(), + )), + ]; + + for expr in &cases { + assert_roundtrip(expr); + } + } + + // ==================== Expression::Column Tests ==================== + + #[test] + fn test_column_expressions_roundtrip() { + let cases: Vec = vec![ + column_expr!("my_column"), + Expression::column(["parent", "child"]), + Expression::column(["a", "b", "c", "d"]), + ]; + + for expr in &cases { + assert_roundtrip(expr); + } + } + + #[test] + fn test_column_names_roundtrip() { + let cases: Vec = vec![ + column_name!("simple"), + ColumnName::new(["a", "b", "c"]), + ColumnName::new::<&str>([]), + ]; + + for col in &cases { + assert_roundtrip(col); + } + } + + // ==================== Expression Operations Tests ==================== + + #[test] + fn test_unary_expression_roundtrip() { + let expr = Expression::unary(UnaryExpressionOp::ToJson, column_expr!("data")); + assert_roundtrip(&expr); + } + + #[test] + fn test_binary_expressions_roundtrip() { + let ops = [ + BinaryExpressionOp::Plus, + BinaryExpressionOp::Minus, + BinaryExpressionOp::Multiply, + BinaryExpressionOp::Divide, + ]; + + for op in ops { + let expr = Expression::binary(op, column_expr!("a"), Expression::literal(10)); + assert_roundtrip(&expr); + } + } + + #[test] + fn test_variadic_expression_roundtrip() { + let expr = Expression::coalesce([ + column_expr!("a"), + column_expr!("b"), + Expression::literal("default"), + ]); + assert_roundtrip(&expr); + } + + #[test] + fn test_nested_arithmetic_expression_roundtrip() { + // (a + b) * (c - d) / 2 + let left = Expression::binary( + BinaryExpressionOp::Plus, + column_expr!("a"), + column_expr!("b"), + ); + let right = Expression::binary( + BinaryExpressionOp::Minus, + column_expr!("c"), + column_expr!("d"), + ); + let mul = Expression::binary(BinaryExpressionOp::Multiply, left, right); + let expr = Expression::binary(BinaryExpressionOp::Divide, mul, Expression::literal(2)); + assert_roundtrip(&expr); + } + + // ==================== Expression::Struct/Transform/Other Tests ==================== + + #[test] + fn test_struct_expression_roundtrip() { + let expr = Expression::struct_from([ + Arc::new(column_expr!("x")), + Arc::new(Expression::literal(42)), + Arc::new(Expression::literal("hello")), + ]); + assert_roundtrip(&expr); + } + + #[test] + fn test_transform_expressions_roundtrip() { + let cases: Vec = vec![ + // Identity transform + Expression::transform(Transform::new_top_level()), + // Drop field + 
Expression::transform(Transform::new_top_level().with_dropped_field("old_column")), + // Replace field + Expression::transform( + Transform::new_top_level() + .with_replaced_field("original", Arc::new(Expression::literal(0))), + ), + // Insert fields + Expression::transform( + Transform::new_top_level() + .with_inserted_field(Some("after_col"), Arc::new(column_expr!("new_col"))) + .with_inserted_field( + None::, + Arc::new(Expression::literal("prepended")), + ), + ), + // Nested transform + Expression::transform( + Transform::new_nested(["parent", "child"]).with_dropped_field("to_drop"), + ), + ]; + + for expr in &cases { + assert_roundtrip(expr); + } + } + + #[test] + fn test_expression_wrapping_predicate_roundtrip() { + let pred = Predicate::eq(column_expr!("x"), Expression::literal(10)); + let expr = Expression::from_pred(pred); + assert_roundtrip(&expr); + } + + #[test] + fn test_expression_unknown_roundtrip() { + let expr = Expression::unknown("some_unknown_function()"); + assert_roundtrip(&expr); + } + + #[test] + fn test_map_to_struct_expression_roundtrip() { + let cases: Vec = vec![ + Expression::map_to_struct(column_expr!("pv")), + Expression::map_to_struct(Expression::literal("ignored")), + ]; + + for expr in &cases { + assert_roundtrip(expr); + } + } + + // ==================== Predicate Tests ==================== + + #[test] + fn test_predicate_basics_roundtrip() { + let cases: Vec = vec![ + // Boolean expression + Predicate::from_expr(column_expr!("is_active")), + // Literals + Predicate::literal(true), + Predicate::literal(false), + // NOT + Predicate::not(Predicate::from_expr(column_expr!("x"))), + // Nested NOT + Predicate::not(Predicate::not(Predicate::gt( + column_expr!("x"), + Expression::literal(5), + ))), + // Unknown + Predicate::unknown("some_unknown_predicate()"), + // Unary predicates + Predicate::is_null(column_expr!("nullable_col")), + Predicate::is_not_null(column_expr!("nullable_col")), + ]; + + for pred in &cases { + assert_roundtrip(pred); + } + } + + #[test] + fn test_predicate_null_literal_roundtrip() { + let pred = Predicate::null_literal(); + assert_roundtrip(&pred); + } + + #[test] + fn test_predicate_comparisons_roundtrip() { + let cases: Vec = vec![ + Predicate::eq(column_expr!("x"), Expression::literal(42)), + Predicate::ne(column_expr!("status"), Expression::literal("active")), + Predicate::lt(column_expr!("age"), Expression::literal(18)), + Predicate::le(column_expr!("price"), Expression::literal(100)), + Predicate::gt(column_expr!("score"), Expression::literal(90)), + Predicate::ge(column_expr!("quantity"), Expression::literal(1)), + Predicate::distinct(column_expr!("a"), column_expr!("b")), + ]; + + for pred in &cases { + assert_roundtrip(pred); + } + } + + #[test] + fn test_predicate_in_roundtrip() { + let array_data = ArrayData::try_new( + ArrayType::new(DataType::INTEGER, false), + vec![Scalar::Integer(1), Scalar::Integer(2), Scalar::Integer(3)], + ) + .unwrap(); + let pred = Predicate::binary( + BinaryPredicateOp::In, + column_expr!("x"), + Expression::Literal(Scalar::Array(array_data)), + ); + assert_roundtrip(&pred); + } + + #[test] + fn test_predicate_junctions_roundtrip() { + let cases: Vec = vec![ + // Simple AND + Predicate::and( + Predicate::gt(column_expr!("x"), Expression::literal(0)), + Predicate::lt(column_expr!("x"), Expression::literal(100)), + ), + // Simple OR + Predicate::or( + Predicate::eq(column_expr!("status"), Expression::literal("active")), + Predicate::eq(column_expr!("status"), Expression::literal("pending")), + ), + // 
Multiple AND + Predicate::and_from([ + Predicate::gt(column_expr!("x"), Expression::literal(0)), + Predicate::lt(column_expr!("x"), Expression::literal(100)), + Predicate::is_not_null(column_expr!("x")), + ]), + // Multiple OR + Predicate::or_from([ + Predicate::eq(column_expr!("type"), Expression::literal("A")), + Predicate::eq(column_expr!("type"), Expression::literal("B")), + Predicate::eq(column_expr!("type"), Expression::literal("C")), + ]), + // Nested: (a > 0 AND b < 100) OR (c = 'special') + Predicate::or( + Predicate::and( + Predicate::gt(column_expr!("a"), Expression::literal(0)), + Predicate::lt(column_expr!("b"), Expression::literal(100)), + ), + Predicate::eq(column_expr!("c"), Expression::literal("special")), + ), + ]; + + for pred in &cases { + assert_roundtrip(pred); + } + } + + // ==================== Complex Nested Structures ==================== + + #[test] + fn test_deeply_nested_structures_roundtrip() { + // COALESCE(a + b, c * d, 0) > 100 + let add = Expression::binary( + BinaryExpressionOp::Plus, + column_expr!("a"), + column_expr!("b"), + ); + let mul = Expression::binary( + BinaryExpressionOp::Multiply, + column_expr!("c"), + column_expr!("d"), + ); + let coalesce = Expression::coalesce([add, mul, Expression::literal(0)]); + let pred = Predicate::gt(coalesce, Expression::literal(100)); + assert_roundtrip(&pred); + + // Expression wrapping a predicate that references expressions + let inner_pred = Predicate::and( + Predicate::eq(column_expr!("x"), Expression::literal(1)), + Predicate::gt( + Expression::binary( + BinaryExpressionOp::Plus, + column_expr!("y"), + column_expr!("z"), + ), + Expression::literal(10), + ), + ); + let expr = Expression::from_pred(inner_pred); + assert_roundtrip(&expr); + } + + // ==================== Opaque Variant Failure Tests ==================== + + #[test] + fn test_opaque_expression_serialize_fails() { + use crate::expressions::{OpaqueExpressionOp, ScalarExpressionEvaluator}; + use crate::DeltaResult; + + #[derive(Debug, PartialEq)] + struct TestOpaqueExprOp; + + impl OpaqueExpressionOp for TestOpaqueExprOp { + fn name(&self) -> &str { + "test_opaque" + } + fn eval_expr_scalar( + &self, + _eval_expr: &ScalarExpressionEvaluator<'_>, + _exprs: &[Expression], + ) -> DeltaResult { + Ok(Scalar::Integer(0)) + } + } + + let expr = Expression::opaque(TestOpaqueExprOp, [Expression::literal(1)]); + let result = serde_json::to_string(&expr); + assert_result_error_with_message(result, "Cannot serialize an Opaque Expression"); + } + + #[test] + fn test_opaque_predicate_serialize_fails() { + use crate::expressions::{OpaquePredicateOp, ScalarExpressionEvaluator}; + use crate::kernel_predicates::{ + DirectDataSkippingPredicateEvaluator, DirectPredicateEvaluator, + IndirectDataSkippingPredicateEvaluator, + }; + use crate::DeltaResult; + + #[derive(Debug, PartialEq)] + struct TestOpaquePredOp; + + impl OpaquePredicateOp for TestOpaquePredOp { + fn name(&self) -> &str { + "test_opaque_pred" + } + fn eval_pred_scalar( + &self, + _eval_expr: &ScalarExpressionEvaluator<'_>, + _eval_pred: &DirectPredicateEvaluator<'_>, + _exprs: &[Expression], + _inverted: bool, + ) -> DeltaResult> { + Ok(Some(true)) + } + fn eval_as_data_skipping_predicate( + &self, + _evaluator: &DirectDataSkippingPredicateEvaluator<'_>, + _exprs: &[Expression], + _inverted: bool, + ) -> Option { + Some(true) + } + fn as_data_skipping_predicate( + &self, + _evaluator: &IndirectDataSkippingPredicateEvaluator<'_>, + _exprs: &[Expression], + _inverted: bool, + ) -> Option { + None + } + } + + 
let pred = Predicate::opaque(TestOpaquePredOp, [Expression::literal(1)]); + let result = serde_json::to_string(&pred); + assert_result_error_with_message(result, "Cannot serialize an Opaque Predicate"); + } + } + + #[test] + fn single_element_and_from_returns_unwrapped_predicate() { + let inner = Pred::gt(column_expr!("x"), Expr::literal(0)); + let result = Pred::and_from([inner.clone()]); + assert_eq!(result, inner); + } + + #[test] + fn single_element_or_from_returns_unwrapped_predicate() { + let inner = Pred::gt(column_expr!("x"), Expr::literal(0)); + let result = Pred::or_from([inner.clone()]); + assert_eq!(result, inner); + } + + #[test] + fn multi_element_and_from_returns_junction() { + let p1 = Pred::gt(column_expr!("x"), Expr::literal(0)); + let p2 = Pred::lt(column_expr!("x"), Expr::literal(100)); + let result = Pred::and_from([p1.clone(), p2.clone()]); + assert!(matches!(result, Pred::Junction(ref j) if j.preds.len() == 2)); + assert_eq!(result, Pred::and(p1, p2)); + } + + #[test] + fn empty_and_from_returns_identity_literal() { + let result = Pred::and_from(std::iter::empty()); + assert_eq!(result, Pred::literal(true)); + } + + #[test] + fn empty_or_from_returns_identity_literal() { + let result = Pred::or_from(std::iter::empty()); + assert_eq!(result, Pred::literal(false)); + } } diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 19e1c7f462..993566dce6 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -4,13 +4,14 @@ use std::fmt::{Display, Formatter}; use chrono::{DateTime, NaiveDate, NaiveDateTime, TimeZone, Utc}; use itertools::Itertools; +use serde::{Deserialize, Serialize}; use crate::schema::derive_macro_utils::ToDataType; use crate::schema::{ArrayType, DataType, DecimalType, MapType, PrimitiveType, StructField}; use crate::utils::require; use crate::{DeltaResult, Error}; -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct DecimalData { bits: i128, ty: DecimalType, @@ -54,7 +55,7 @@ fn get_decimal_precision(value: i128) -> u8 { value.unsigned_abs().checked_ilog10().map_or(0, |p| p + 1) as _ } -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)] pub struct ArrayData { tpe: ArrayType, /// This exists currently for literal list comparisons, but should not be depended on see below @@ -94,15 +95,12 @@ impl ArrayData { &self.tpe } - #[deprecated( - note = "These fields will be removed eventually and are unstable. See https://github.com/delta-io/delta-kernel-rs/issues/291" - )] pub fn array_elements(&self) -> &[Scalar] { &self.elements } } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct MapData { data_type: MapType, pairs: Vec<(Scalar, Scalar)>, @@ -160,7 +158,7 @@ impl MapData { } } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct StructData { fields: Vec, values: Vec, @@ -218,7 +216,10 @@ impl StructData { /// A single value, which can be null. Used for representing literal values /// in [Expressions][crate::expressions::Expression]. -#[derive(Debug, Clone)] +/// +/// NOTE: `PartialEq` uses physical (structural) comparison semantics. +/// For SQL NULL semantics, use [`Scalar::logical_eq`] or [`Scalar::logical_partial_cmp`]. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum Scalar { /// 32bit integer Integer(i32), @@ -426,14 +427,40 @@ impl Display for Scalar { } } -impl PartialEq for Scalar { - fn eq(&self, other: &Scalar) -> bool { - self.partial_cmp(other) == Some(Ordering::Equal) +impl Scalar { + /// Logical (SQL semantics) equality comparison of two scalars. + /// + /// Returns `None` if the scalars cannot be compared (different types, NULL values, or + /// unsupported types like Struct/Array/Map). + /// + /// Logical (SQL semantics) equality comparison of two scalars. + /// + /// Returns `true` if the scalars are logically equal, `false` otherwise. + /// + /// NOTE: This implements SQL NULL semantics where NULL is incomparable to everything, + /// including itself, so `NULL != NULL` (returns `false`). + pub fn logical_eq(&self, other: &Self) -> bool { + self.logical_partial_cmp(other) == Some(Ordering::Equal) + } + + /// Physical (structural) equality comparison of two scalars. + /// + /// Returns `true` if the scalars are structurally identical, `false` otherwise. + /// + /// Unlike logical comparison, this treats `Null(dt1) == Null(dt2)` as `true` when `dt1 == dt2`. + /// This is used for query plan comparison, not SQL evaluation. + pub fn physical_eq(&self, other: &Self) -> bool { + self == other } -} -impl PartialOrd for Scalar { - fn partial_cmp(&self, other: &Self) -> Option { + /// Logical (SQL semantics) comparison of two scalars. + /// + /// Returns `None` if the scalars are incomparable (different types, NULL values, or + /// unsupported types like Struct/Array/Map). + /// + /// NOTE: This implements SQL NULL semantics where NULL is incomparable to everything, + /// including itself. + pub fn logical_partial_cmp(&self, other: &Self) -> Option { use Scalar::*; match (self, other) { // NOTE: We intentionally do two match arms for each variant to avoid a catch-all, so @@ -466,7 +493,7 @@ impl PartialOrd for Scalar { .then(|| d1.bits().partial_cmp(&d2.bits())) .flatten(), (Decimal(_), _) => None, - (Null(_), _) => None, // NOTE: NULL values are incomparable by definition + (Null(_), _) => None, // NOTE: NULL values are incomparable by definition (SQL NULL semantics) (Struct(_), _) => None, // TODO: Support Struct? (Array(_), _) => None, // TODO: Support Array? (Map(_), _) => None, // TODO: Support Map? 
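// Illustrative sketch (not part of the patch above): how the junction normalization documented
// on `Predicate::junction` behaves for callers of `and_from` / `or_from`. It uses only
// constructors that already appear in this diff; imports mirror the expression test module.
use crate::expressions::{column_expr, Expression as Expr, Predicate as Pred};

fn junction_normalization_sketch() {
    // Empty junctions collapse to their identity element.
    assert_eq!(Pred::and_from(std::iter::empty()), Pred::literal(true));
    assert_eq!(Pred::or_from(std::iter::empty()), Pred::literal(false));

    // A single-element junction unwraps to the element itself.
    let p = Pred::gt(column_expr!("x"), Expr::literal(0));
    assert_eq!(Pred::and_from([p.clone()]), p);

    // Two or more elements still build a real Junction node.
    let q = Pred::lt(column_expr!("x"), Expr::literal(100));
    assert!(matches!(Pred::and_from([p, q]), Pred::Junction(_)));
}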
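// Illustrative sketch (not part of the patch above): the logical-vs-physical split introduced in
// the scalars.rs hunk. The `logical_*` methods follow SQL NULL semantics (NULL is incomparable,
// even to itself), while the derived `PartialEq` / `physical_eq` compares structure, which is
// what query-plan comparison needs.
use crate::expressions::Scalar;
use crate::schema::DataType;

fn scalar_comparison_semantics_sketch() {
    let one = Scalar::Integer(1);
    let null = Scalar::Null(DataType::INTEGER);

    // SQL semantics: NULL is never logically equal to anything, including itself.
    assert!(one.logical_eq(&one));
    assert!(!null.logical_eq(&null));
    assert_eq!(one.logical_partial_cmp(&null), None);

    // Physical semantics: structurally identical values (including same-typed nulls) are equal.
    assert!(null.physical_eq(&null));
    assert_eq!(null, Scalar::Null(DataType::INTEGER));
}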
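// Illustrative sketch (not part of the patch above): constructing the two new structured-output
// expressions. The column names and the one-field stats schema are made up for the example; the
// constructors (`Expression::column`, `StructType::try_new`, `StructField::nullable`) are the
// ones already used elsewhere in this diff.
fn structured_output_expression_sketch() -> crate::DeltaResult<(Expression, Expression)> {
    use std::sync::Arc;

    use crate::expressions::Expression;
    use crate::schema::{DataType, StructField, StructType};

    // PARSE_JSON: parse a JSON-encoded string column into a typed struct.
    let stats_schema = Arc::new(StructType::try_new([StructField::nullable(
        "numRecords",
        DataType::LONG,
    )])?);
    let parse_stats = Expression::parse_json(Expression::column(["add", "stats"]), stats_schema);

    // MAP_TO_STRUCT: turn a string -> string partition-values map into a typed struct; the
    // target struct schema is supplied later via the evaluator's `result_type`.
    let partition_struct =
        Expression::map_to_struct(Expression::column(["add", "partitionValues"]));

    Ok((parse_stats, partition_struct))
}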
@@ -578,6 +605,20 @@ where } } +impl TryFrom>> for Scalar +where + T: Into + ToDataType, +{ + type Error = Error; + + fn try_from(opt: Option>) -> Result { + match opt { + Some(vec) => vec.try_into(), + None => Ok(Self::Null(ArrayType::new(T::to_data_type(), false).into())), + } + } +} + impl TryFrom> for Scalar where K: Into + ToDataType, @@ -606,6 +647,23 @@ where } } +impl TryFrom>> for Scalar +where + K: Into + ToDataType, + V: Into + ToDataType, +{ + type Error = Error; + + fn try_from(opt: Option>) -> Result { + match opt { + Some(map) => map.try_into(), + None => Ok(Self::Null( + MapType::new(K::to_data_type(), V::to_data_type(), false).into(), + )), + } + } +} + // NOTE: We "cheat" and use the macro support trait `ToDataType` impl + ToDataType> From> for Scalar { fn from(t: Option) -> Self { @@ -923,7 +981,6 @@ mod tests { #[test] fn test_arrays() { - #[allow(deprecated)] let array = Scalar::Array(ArrayData { tpe: ArrayType::new(DataType::INTEGER, false), elements: vec![Scalar::Integer(1), Scalar::Integer(2), Scalar::Integer(3)], @@ -1046,16 +1103,17 @@ mod tests { let a = Scalar::Integer(1); let b = Scalar::Integer(2); let c = Scalar::Null(DataType::INTEGER); - assert_eq!(a.partial_cmp(&b), Some(Ordering::Less)); - assert_eq!(b.partial_cmp(&a), Some(Ordering::Greater)); - assert_eq!(a.partial_cmp(&a), Some(Ordering::Equal)); - assert_eq!(b.partial_cmp(&b), Some(Ordering::Equal)); - assert_eq!(a.partial_cmp(&c), None); - assert_eq!(c.partial_cmp(&a), None); + + assert_eq!(a.logical_partial_cmp(&b), Some(Ordering::Less)); + assert_eq!(b.logical_partial_cmp(&a), Some(Ordering::Greater)); + assert_eq!(a.logical_partial_cmp(&a), Some(Ordering::Equal)); + assert_eq!(b.logical_partial_cmp(&b), Some(Ordering::Equal)); + assert_eq!(a.logical_partial_cmp(&c), None); + assert_eq!(c.logical_partial_cmp(&a), None); // assert that NULL values are incomparable let null = Scalar::Null(DataType::INTEGER); - assert_eq!(null.partial_cmp(&null), None); + assert_eq!(null.logical_partial_cmp(&null), None); } #[test] @@ -1063,14 +1121,14 @@ mod tests { let a = Scalar::Integer(1); let b = Scalar::Integer(2); let c = Scalar::Null(DataType::INTEGER); - assert!(!a.eq(&b)); - assert!(a.eq(&a)); - assert!(!a.eq(&c)); - assert!(!c.eq(&a)); + assert!(!a.logical_eq(&b)); + assert!(a.logical_eq(&a)); + assert!(!a.logical_eq(&c)); + assert!(!c.logical_eq(&a)); // assert that NULL values are incomparable let null = Scalar::Null(DataType::INTEGER); - assert!(!null.eq(&null)); + assert!(!null.logical_eq(&null)); } #[test] @@ -1099,14 +1157,10 @@ mod tests { assert!(!map_data.map_type().value_contains_null()); // Check that both expected pairs are present - let has_key1 = pairs.iter().any(|(k, v)| { - matches!(k, Scalar::String(s) if s == "key1") && matches!(v, Scalar::Integer(42)) - }); - let has_key2 = pairs.iter().any(|(k, v)| { - matches!(k, Scalar::String(s) if s == "key2") && matches!(v, Scalar::Integer(100)) - }); - assert!(has_key1, "Missing key1 -> 42 pair"); - assert!(has_key2, "Missing key2 -> 100 pair"); + let entry1 = (Scalar::String("key1".to_string()), Scalar::Integer(42)); + let entry2 = (Scalar::String("key2".to_string()), Scalar::Integer(100)); + assert!(pairs.contains(&entry1), "Missing key1 -> 42 pair"); + assert!(pairs.contains(&entry2), "Missing key2 -> 100 pair"); Ok(()) } @@ -1138,18 +1192,15 @@ mod tests { assert!(map_data.map_type().value_contains_null()); // Check that all expected pairs are present - let has_key1 = pairs.iter().any(|(k, v)| { - matches!(k, Scalar::String(s) if s == "key1") 
&& matches!(v, Scalar::Integer(42)) - }); - let has_key2 = pairs.iter().any(|(k, v)| { - matches!(k, Scalar::String(s) if s == "key2") && matches!(v, Scalar::Null(_)) - }); - let has_key3 = pairs.iter().any(|(k, v)| { - matches!(k, Scalar::String(s) if s == "key3") && matches!(v, Scalar::Integer(100)) - }); - assert!(has_key1, "Missing key1 -> 42 pair"); - assert!(has_key2, "Missing key2 -> null pair"); - assert!(has_key3, "Missing key3 -> 100 pair"); + let entry1 = (Scalar::String("key1".to_string()), Scalar::Integer(42)); + let entry2 = ( + Scalar::String("key2".to_string()), + Scalar::Null(DataType::INTEGER), + ); + let entry3 = (Scalar::String("key3".to_string()), Scalar::Integer(100)); + assert!(pairs.contains(&entry1), "Missing key1 -> 42 pair"); + assert!(pairs.contains(&entry2), "Missing key2 -> null pair"); + assert!(pairs.contains(&entry3), "Missing key3 -> 100 pair"); Ok(()) } @@ -1173,15 +1224,14 @@ mod tests { let Scalar::Array(array_data) = scalar else { panic!("Expected Array scalar"); }; - #[allow(deprecated)] let elements = array_data.array_elements(); assert_eq!(elements.len(), 3); assert!(!array_data.array_type().contains_null()); // Check that all expected values are present - assert!(matches!(elements[0], Scalar::Integer(42))); - assert!(matches!(elements[1], Scalar::Integer(100))); - assert!(matches!(elements[2], Scalar::Integer(200))); + assert_eq!(elements[0], Scalar::Integer(42)); + assert_eq!(elements[1], Scalar::Integer(100)); + assert_eq!(elements[2], Scalar::Integer(200)); Ok(()) } @@ -1206,15 +1256,14 @@ mod tests { panic!("Expected Array scalar"); }; - #[allow(deprecated)] let elements = array_data.array_elements(); assert_eq!(elements.len(), 3); assert!(array_data.array_type().contains_null()); // Check that all expected values are present - assert!(matches!(elements[0], Scalar::Integer(42))); - assert!(matches!(elements[1], Scalar::Null(_))); - assert!(matches!(elements[2], Scalar::Integer(100))); + assert_eq!(elements[0], Scalar::Integer(42)); + assert!(elements[1].is_null()); + assert_eq!(elements[2], Scalar::Integer(100)); Ok(()) } diff --git a/kernel/src/history_manager/search.rs b/kernel/src/history_manager/search.rs index 720d73b94b..399555b364 100644 --- a/kernel/src/history_manager/search.rs +++ b/kernel/src/history_manager/search.rs @@ -170,49 +170,32 @@ mod tests { Ok(*x) } - #[test] - fn test_exact_match() { - let values = vec![1, 3, 5, 7, 9]; - - // LeastUpper bound with exact match - let result = - binary_search_by_key_with_bounds(&values, 5, get_val, Bound::LeastUpper).unwrap(); - assert_eq!(result, 2); - - // GreatestLower bound with exact match - let result = - binary_search_by_key_with_bounds(&values, 5, get_val, Bound::GreatestLower).unwrap(); - assert_eq!(result, 2); - } - - #[test] - fn test_no_exact_match() { + #[rstest::rstest] + #[case::exact_least_upper(5, Bound::LeastUpper, 2)] + #[case::exact_greatest_lower(5, Bound::GreatestLower, 2)] + #[case::no_match_least_upper(4, Bound::LeastUpper, 2)] + #[case::no_match_greatest_lower(6, Bound::GreatestLower, 2)] + fn test_binary_search( + #[case] search_key: i32, + #[case] bound: Bound, + #[case] expected_index: usize, + ) { let values = vec![1, 3, 5, 7, 9]; - - // LeastUpper bound (find element >= key) - let result = - binary_search_by_key_with_bounds(&values, 4, get_val, Bound::LeastUpper).unwrap(); - assert_eq!(result, 2); // Index of 5 - - // GreatestLower bound (find element <= key) - let result = - binary_search_by_key_with_bounds(&values, 6, get_val, Bound::GreatestLower).unwrap(); - 
assert_eq!(result, 2); // Index of 5 + let result = binary_search_by_key_with_bounds(&values, search_key, get_val, bound).unwrap(); + assert_eq!(result, expected_index); } - #[test] - fn test_duplicate_values() { + #[rstest::rstest] + #[case::least_upper_first_occurrence(5, Bound::LeastUpper, 2)] + #[case::greatest_lower_last_occurrence(5, Bound::GreatestLower, 4)] + fn test_duplicate_values( + #[case] search_key: i32, + #[case] bound: Bound, + #[case] expected_index: usize, + ) { let values = vec![1, 3, 5, 5, 5, 7, 9]; - - // LeastUpper should find first occurrence - let result = - binary_search_by_key_with_bounds(&values, 5, get_val, Bound::LeastUpper).unwrap(); - assert_eq!(result, 2); // First index of 5 - - // GreatestLower should find last occurrence - let result = - binary_search_by_key_with_bounds(&values, 5, get_val, Bound::GreatestLower).unwrap(); - assert_eq!(result, 4); // Last index of 5 + let result = binary_search_by_key_with_bounds(&values, search_key, get_val, bound).unwrap(); + assert_eq!(result, expected_index); } #[test] diff --git a/kernel/src/kernel_predicates/mod.rs b/kernel/src/kernel_predicates/mod.rs index db4c8850ee..4a97bcfc8f 100644 --- a/kernel/src/kernel_predicates/mod.rs +++ b/kernel/src/kernel_predicates/mod.rs @@ -177,11 +177,13 @@ pub trait KernelPredicateEvaluator { Expr::Opaque(OpaqueExpression { op, exprs }) => { self.eval_pred_expr_opaque(op, exprs, inverted) } - Expr::Struct(_) + Expr::Struct(..) | Expr::Transform(_) | Expr::Unary(_) | Expr::Binary(_) | Expr::Variadic(_) + | Expr::ParseJson(_) + | Expr::MapToStruct(_) | Expr::Unknown(_) => None, } } @@ -203,12 +205,14 @@ pub trait KernelPredicateEvaluator { Expr::Literal(val) => self.eval_pred_scalar_is_null(val, inverted), Expr::Column(col) => self.eval_pred_is_null(col, inverted), Expr::Predicate(_) - | Expr::Struct(_) + | Expr::Struct(..) | Expr::Transform(_) | Expr::Unary(_) | Expr::Binary(_) | Expr::Variadic(_) | Expr::Opaque(_) + | Expr::ParseJson { .. } + | Expr::MapToStruct(_) | Expr::Unknown(_) => { debug!("Unsupported operand: IS [NOT] NULL: {expr:?}"); None @@ -322,6 +326,10 @@ pub trait KernelPredicateEvaluator { /// Evaluates a (possibly inverted) predicate with SQL WHERE semantics. /// + /// NOTE: A NULL literal in a boolean position is treated as unknown (not false), because + /// callers like `build_actions_meta_predicate` use NULL as a sentinel for unsupported arms. + /// Treating it as false would let `AND(supported, NULL)` incorrectly prune files. + /// /// By default, [`Self::eval_pred`] behaves badly for comparisons involving NULL columns /// (e.g. `a < 10` when `a` is NULL), because the comparison correctly evaluates to NULL, but /// NULL values are interpreted as "stats missing" (= cannot skip). This ambiguity can "poison" @@ -452,10 +460,6 @@ pub trait KernelPredicateEvaluator { .into_iter(); self.finish_eval_pred_junction(JunctionPredicateOp::And, &mut preds, false) } - BooleanExpression(Expr::Literal(val)) if val.is_null() => { - // AND(NULL IS NOT NULL, NULL) = AND(FALSE, NULL) = FALSE - self.eval_pred_scalar(&Scalar::from(false), false) - } BooleanExpression(Expr::Predicate(pred)) => self.eval_pred_sql_where(pred, inverted), // Process all remaining predicates normally, because they are not proven safe. 
Indeed, // predicates like DISTINCT and IS [NOT] NULL are known-unsafe under SQL semantics: @@ -510,15 +514,21 @@ impl KernelPredicateEvaluatorDefaults { Some(val.is_null() != inverted) } - /// A (possibly inverted) partial comparison of two scalars, leveraging the [`PartialOrd`] - /// trait. + /// A (possibly inverted) partial comparison of two scalars using SQL/logical semantics. + /// + /// Returns `None` if the scalars are incomparable (different types, NULL values, or + /// unsupported types like Struct/Array/Map). + /// + /// NOTE: This implements SQL NULL semantics where NULL is incomparable to everything, + /// including itself. For physical/structural comparison of query plans, use `==` on + /// `Scalar` directly (which provides physical equality). pub fn partial_cmp_scalars( ord: Ordering, a: &Scalar, b: &Scalar, inverted: bool, ) -> Option { - let cmp = a.partial_cmp(b)?; + let cmp = a.logical_partial_cmp(b)?; let matched = cmp == ord; Some(matched != inverted) } @@ -616,7 +626,7 @@ impl DefaultKernelPredicateEvaluator { Expr::Literal(value) => Some(value.clone()), Expr::Column(name) => self.resolve_column(name), Expr::Predicate(pred) => self.eval_pred(pred, false).map(Scalar::from), - Expr::Struct(_) | Expr::Transform(_) | Expr::Unary(_) => None, // TODO? + Expr::Struct(..) | Expr::Transform(_) | Expr::Unary(_) => None, // TODO? Expr::Binary(BinaryExpression { op, left, right }) => { let op_fn = match op { BinaryExpressionOp::Plus => Scalar::try_add, @@ -633,6 +643,8 @@ impl DefaultKernelPredicateEvaluator { warn!("Failed to evaluate {:?}: {err:?}", op.as_ref()); }) .ok(), + // ParseJson and MapToStruct produce structured output, not scalar values + Expr::ParseJson(_) | Expr::MapToStruct(_) => None, Expr::Unknown(_) => None, } } diff --git a/kernel/src/kernel_predicates/parquet_stats_skipping.rs b/kernel/src/kernel_predicates/parquet_stats_skipping.rs index 73888c5b76..b6d21f2029 100644 --- a/kernel/src/kernel_predicates/parquet_stats_skipping.rs +++ b/kernel/src/kernel_predicates/parquet_stats_skipping.rs @@ -10,9 +10,8 @@ use std::cmp::Ordering; #[cfg(test)] mod tests; -/// A helper trait (mostly exposed for testing). It provides the four stats getters needed by -/// [`DataSkippingStatsProvider`]. From there, we can automatically derive a -/// [`DataSkippingPredicateEvaluator`]. +/// A helper trait (mostly exposed for testing). It provides the four stats getters needed to +/// derive a [`DataSkippingPredicateEvaluator`] via the blanket impl below. pub(crate) trait ParquetStatsProvider { /// The min-value stat for this column, if the column exists in this file, has the expected /// type, and the parquet footer provides stats for it. @@ -26,8 +25,10 @@ pub(crate) trait ParquetStatsProvider { /// type, and the parquet footer provides stats for it. fn get_parquet_nullcount_stat(&self, col: &ColumnName) -> Option; - /// The rowcount stat for this row group. It is always available in the parquet footer. - fn get_parquet_rowcount_stat(&self) -> i64; + /// The rowcount stat for this row group. Returns `None` if the rowcount is not meaningful + /// (e.g. in checkpoint files where the footer rowcount is the number of add file rows, not + /// the sum of data file row counts). + fn get_parquet_rowcount_stat(&self) -> Option; } // Blanket implementation for all types that impl ParquetStatsProvider. 
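// Illustrative sketch (not part of the patch above): the observable effect of treating a NULL
// literal as "unknown" in the eval_sql_where change to kernel_predicates/mod.rs earlier in this
// diff. It mirrors the updated assertion in kernel_predicates/tests.rs and assumes that test
// module's imports (DefaultKernelPredicateEvaluator, UnimplementedColumnResolver, Pred, Expr,
// column_expr!).
fn null_literal_is_unknown_sketch() {
    // A filter with no stats available at all.
    let empty_filter = DefaultKernelPredicateEvaluator::from(UnimplementedColumnResolver);
    // AND(NULL, x < 10): the NULL arm may be a rewriter's sentinel for an unsupported predicate.
    let pred = Pred::and(
        Pred::null_literal(),
        Pred::lt(column_expr!("x"), Expr::literal(10)),
    );
    // Previously this forced FALSE (static skip); now the result is "unknown", so the file
    // cannot be pruned on the strength of the NULL sentinel alone.
    assert_eq!(empty_filter.eval_sql_where(&pred), None);
}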
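// Illustrative sketch (not part of the patch above): the decision the Option-returning rowcount
// getter enables for a ParquetStatsProvider. `footer_rowcount` and `is_checkpoint_file` are
// hypothetical inputs standing in for whatever context a real provider has.
fn rowcount_stat_sketch(footer_rowcount: i64, is_checkpoint_file: bool) -> Option<i64> {
    if is_checkpoint_file {
        // In a checkpoint file the parquet footer counts add-file actions, not data rows, so
        // decline to report a rowcount rather than return a misleading number.
        None
    } else {
        Some(footer_rowcount)
    }
}
// The blanket DataSkippingPredicateEvaluator impl (next hunk) then maps the result straight
// through with `self.get_parquet_rowcount_stat().map(Scalar::from)`, so a missing rowcount
// surfaces as "stat unavailable" instead of a bogus value.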
@@ -48,7 +49,7 @@ impl DataSkippingPredicateEvaluator for T { } fn get_rowcount_stat(&self) -> Option { - Some(Scalar::from(self.get_parquet_rowcount_stat())) + self.get_parquet_rowcount_stat().map(Scalar::from) } fn eval_partial_cmp( diff --git a/kernel/src/kernel_predicates/parquet_stats_skipping/tests.rs b/kernel/src/kernel_predicates/parquet_stats_skipping/tests.rs index 44d79e23d0..a7cf8bb1d2 100644 --- a/kernel/src/kernel_predicates/parquet_stats_skipping/tests.rs +++ b/kernel/src/kernel_predicates/parquet_stats_skipping/tests.rs @@ -35,7 +35,7 @@ impl ParquetStatsProvider for UnimplementedTestFilter { unimplemented!() } - fn get_parquet_rowcount_stat(&self) -> i64 { + fn get_parquet_rowcount_stat(&self) -> Option { unimplemented!() } } @@ -148,7 +148,7 @@ impl ParquetStatsProvider for MinMaxTestFilter { unimplemented!() } - fn get_parquet_rowcount_stat(&self) -> i64 { + fn get_parquet_rowcount_stat(&self) -> Option { unimplemented!() } } @@ -221,8 +221,8 @@ impl ParquetStatsProvider for NullCountTestFilter { self.nullcount } - fn get_parquet_rowcount_stat(&self) -> i64 { - self.rowcount + fn get_parquet_rowcount_stat(&self) -> Option { + Some(self.rowcount) } } diff --git a/kernel/src/kernel_predicates/tests.rs b/kernel/src/kernel_predicates/tests.rs index bbe7db0ed5..1f76e95c06 100644 --- a/kernel/src/kernel_predicates/tests.rs +++ b/kernel/src/kernel_predicates/tests.rs @@ -11,12 +11,33 @@ use crate::DeltaResult; use std::collections::HashMap; +/// Helper trait to allow expect_eq! to work with both Option and Option +trait LogicalEq { + fn logical_eq(&self, other: &Self) -> bool; +} + +impl LogicalEq for Option { + fn logical_eq(&self, other: &Self) -> bool { + match (self, other) { + (Some(a), Some(b)) => a.logical_eq(b), + (None, None) => true, + _ => false, + } + } +} + +impl LogicalEq for Option { + fn logical_eq(&self, other: &Self) -> bool { + self == other + } +} + macro_rules! 
expect_eq { ( $expr: expr, $expect: expr, $fmt: literal ) => { let expect = ($expect); let result = ($expr); assert!( - result == expect, + result.logical_eq(&expect), "Expected {} = {:?}, got {:?}", format!($fmt), expect, @@ -31,27 +52,27 @@ impl ResolveColumnAsScalar for Scalar { } } -#[test] -fn test_default_eval_scalar() { - let test_cases = [ - (Scalar::Boolean(true), false, Some(true)), - (Scalar::Boolean(true), true, Some(false)), - (Scalar::Boolean(false), false, Some(false)), - (Scalar::Boolean(false), true, Some(true)), - (Scalar::Long(1), false, None), - (Scalar::Long(1), true, None), - (Scalar::Null(DataType::BOOLEAN), false, None), - (Scalar::Null(DataType::BOOLEAN), true, None), - (Scalar::Null(DataType::LONG), false, None), - (Scalar::Null(DataType::LONG), true, None), - ]; - for (value, inverted, expect) in test_cases.into_iter() { - assert_eq!( - KernelPredicateEvaluatorDefaults::eval_pred_scalar(&value, inverted), - expect, - "value: {value:?} inverted: {inverted}" - ); - } +#[rstest::rstest] +#[case::bool_true_not_inverted(Scalar::Boolean(true), false, Some(true))] +#[case::bool_true_inverted(Scalar::Boolean(true), true, Some(false))] +#[case::bool_false_not_inverted(Scalar::Boolean(false), false, Some(false))] +#[case::bool_false_inverted(Scalar::Boolean(false), true, Some(true))] +#[case::long_not_inverted(Scalar::Long(1), false, None)] +#[case::long_inverted(Scalar::Long(1), true, None)] +#[case::null_boolean_not_inverted(Scalar::Null(DataType::BOOLEAN), false, None)] +#[case::null_boolean_inverted(Scalar::Null(DataType::BOOLEAN), true, None)] +#[case::null_long_not_inverted(Scalar::Null(DataType::LONG), false, None)] +#[case::null_long_inverted(Scalar::Null(DataType::LONG), true, None)] +fn test_default_eval_scalar( + #[case] value: Scalar, + #[case] inverted: bool, + #[case] expect: Option, +) { + assert_eq!( + KernelPredicateEvaluatorDefaults::eval_pred_scalar(&value, inverted), + expect, + "value: {value:?} inverted: {inverted}" + ); } // verifies that partial orderings behave as expected for all Scalar types @@ -427,46 +448,42 @@ fn test_eval_junction() { } } -#[test] -fn test_eval_column() { - let test_cases = [ - (Scalar::from(true), Some(true)), - (Scalar::from(false), Some(false)), - (Scalar::Null(DataType::BOOLEAN), None), - (Scalar::from(1), None), - ]; +#[rstest::rstest] +#[case::bool_true(Scalar::from(true), Some(true))] +#[case::bool_false(Scalar::from(false), Some(false))] +#[case::null_boolean(Scalar::Null(DataType::BOOLEAN), None)] +#[case::long(Scalar::from(1), None)] +fn test_eval_column( + #[case] input: Scalar, + #[case] expect: Option, + #[values(true, false)] inverted: bool, +) { let col = &column_name!("x"); - for (input, expect) in &test_cases { - let filter = DefaultKernelPredicateEvaluator::from(input.clone()); - for inverted in [true, false] { - expect_eq!( - filter.eval_pred_column(col, inverted), - expect.map(|v| v != inverted), - "{input:?} (inverted: {inverted})" - ); - } - } + let filter = DefaultKernelPredicateEvaluator::from(input.clone()); + expect_eq!( + filter.eval_pred_column(col, inverted), + expect.map(|v| v != inverted), + "{input:?} (inverted: {inverted})" + ); } -#[test] -fn test_eval_not() { - let test_cases = [ - (Scalar::Boolean(true), Some(false)), - (Scalar::Boolean(false), Some(true)), - (Scalar::Null(DataType::BOOLEAN), None), - (Scalar::Long(1), None), - ]; +#[rstest::rstest] +#[case::bool_true(Scalar::Boolean(true), Some(false))] +#[case::bool_false(Scalar::Boolean(false), Some(true))] 
+#[case::null_boolean(Scalar::Null(DataType::BOOLEAN), None)] +#[case::long(Scalar::Long(1), None)] +fn test_eval_not( + #[case] input: Scalar, + #[case] expect: Option, + #[values(true, false)] inverted: bool, +) { let filter = DefaultKernelPredicateEvaluator::from(UnimplementedColumnResolver); - for (input, expect) in test_cases { - let input = Pred::from_expr(input); - for inverted in [true, false] { - expect_eq!( - filter.eval_pred_not(&input, inverted), - expect.map(|v| v != inverted), - "NOT({input:?}) (inverted: {inverted})" - ); - } - } + let input = Pred::from_expr(input); + expect_eq!( + filter.eval_pred_not(&input, inverted), + expect.map(|v| v != inverted), + "NOT({input:?}) (inverted: {inverted})" + ); } #[test] @@ -717,8 +734,8 @@ impl ParquetStatsProvider for MinStatsValue { Some(0) } - fn get_parquet_rowcount_stat(&self) -> i64 { - 1 + fn get_parquet_rowcount_stat(&self) -> Option { + Some(1) } } @@ -822,8 +839,8 @@ impl ParquetStatsProvider for OneStatsValue { Some(nullcount) } - fn get_parquet_rowcount_stat(&self) -> i64 { - 1 + fn get_parquet_rowcount_stat(&self) -> Option { + Some(1) } } @@ -990,11 +1007,13 @@ fn test_sql_where() { expect_eq!(null_filter.eval_sql_where(pred), Some(false), "{pred}"); expect_eq!(empty_filter.eval_sql_where(pred), None, "{pred}"); - // NULL allows static skipping under SQL semantics + // NULL literal is treated as unknown (not false) under eval_sql_where, so it does not + // force static skipping. This prevents incorrect pruning when indirect data skipping + // rewriters use NULL as a sentinel for unsupported predicate arms. let pred = &Pred::and(NULL, Pred::lt(col.clone(), VAL)); expect_eq!(null_filter.eval(pred), None, "{pred}"); expect_eq!(null_filter.eval_sql_where(pred), Some(false), "{pred}"); - expect_eq!(empty_filter.eval_sql_where(pred), Some(false), "{pred}"); + expect_eq!(empty_filter.eval_sql_where(pred), None, "{pred}"); // Contrast normal vs. SQL WHERE semantics - comparison inside AND inside AND let pred = &Pred::and(TRUE, Pred::and(TRUE, Pred::lt(col.clone(), VAL))); diff --git a/kernel/src/last_checkpoint_hint.rs b/kernel/src/last_checkpoint_hint.rs index 363180856e..f0a8c49303 100644 --- a/kernel/src/last_checkpoint_hint.rs +++ b/kernel/src/last_checkpoint_hint.rs @@ -1,12 +1,14 @@ //! Utities for reading the `_last_checkpoint` file. Maybe this file should instead go under //! log_segment module since it should only really be used there? as hint for listing? -use crate::schema::Schema; +use std::collections::HashMap; + +use crate::schema::SchemaRef; use crate::{DeltaResult, Error, StorageHandler, Version}; use delta_kernel_derive::internal_api; use serde::{Deserialize, Serialize}; -use tracing::warn; +use tracing::{info, instrument, warn}; use url::Url; /// Name of the _last_checkpoint file that provides metadata about the last checkpoint @@ -31,9 +33,11 @@ pub(crate) struct LastCheckpointHint { /// The number of AddFile actions in the checkpoint. pub(crate) num_of_add_files: Option, /// The schema of the checkpoint file. - pub(crate) checkpoint_schema: Option, + pub(crate) checkpoint_schema: Option, /// The checksum of the last checkpoint JSON. pub(crate) checksum: Option, + /// Additional metadata about the last checkpoint. + pub(crate) tags: Option>, } impl LastCheckpointHint { @@ -50,16 +54,24 @@ impl LastCheckpointHint { /// are assumed to cause failure. 
// TODO(#1047): weird that we propagate FileNotFound as part of the iterator instead of top- // level result coming from storage.read_files + #[instrument(name = "last_checkpoint.read", skip_all, err)] pub(crate) fn try_read( storage: &dyn StorageHandler, log_root: &Url, ) -> DeltaResult> { let file_path = Self::path(log_root)?; match storage.read_files(vec![(file_path, None)])?.next() { - Some(Ok(data)) => Ok(serde_json::from_slice(&data) - .inspect_err(|e| warn!("invalid _last_checkpoint JSON: {e}")) - .ok()), - Some(Err(Error::FileNotFound(_))) => Ok(None), + Some(Ok(data)) => { + let result: Option = serde_json::from_slice(&data) + .inspect_err(|e| warn!("invalid _last_checkpoint JSON: {e}")) + .ok(); + info!(hint = result.as_ref().map(|h| h.summary())); + Ok(result) + } + Some(Err(Error::FileNotFound(_))) => { + info!("_last_checkpoint file not found"); + Ok(None) + } Some(Err(err)) => Err(err), None => { warn!("empty _last_checkpoint file"); @@ -67,4 +79,18 @@ impl LastCheckpointHint { } } } + + /// Succinct summary string for logging purposes. + fn summary(&self) -> String { + format!( + "{{v={}, size={}, parts={:?}}}", + self.version, self.size, self.parts + ) + } + + /// Convert the LastCheckpointHint to JSON bytes + #[cfg(test)] + pub(crate) fn to_json_bytes(&self) -> Vec { + serde_json::to_vec(self).expect("Failed to convert LastCheckpointHint to JSON bytes") + } } diff --git a/kernel/src/lib.rs b/kernel/src/lib.rs index 8a37809113..533c932d57 100644 --- a/kernel/src/lib.rs +++ b/kernel/src/lib.rs @@ -50,7 +50,7 @@ //! connectors are asked to provide the context information they require to execute the actual //! operation. This is done by invoking methods on the [`StorageHandler`] trait. -#![cfg_attr(all(doc, NIGHTLY_CHANNEL), feature(doc_auto_cfg))] +#![cfg_attr(all(doc, NIGHTLY_CHANNEL), feature(doc_cfg))] #![warn( unreachable_pub, trivial_numeric_casts, @@ -88,11 +88,18 @@ mod action_reconciliation; pub mod actions; pub mod checkpoint; pub mod committer; +// Public under test-utils so integration tests can inspect CRC state via Snapshot::get_current_crc_if_loaded_for_testing. 
+#[cfg(feature = "test-utils")] +pub mod crc; +#[cfg(not(feature = "test-utils"))] +pub(crate) mod crc; pub mod engine_data; pub mod error; pub mod expressions; mod log_compaction; mod log_path; +mod log_reader; +pub mod metrics; pub mod scan; pub mod schema; pub mod snapshot; @@ -101,16 +108,19 @@ pub mod table_configuration; pub mod table_features; pub mod table_properties; pub mod transaction; -pub(crate) mod transforms; +pub mod transforms; pub use log_path::LogPath; mod row_tracking; +pub(crate) mod clustering; + mod arrow_compat; -#[cfg(any(feature = "arrow-55", feature = "arrow-56"))] +#[cfg(any(feature = "arrow-56", feature = "arrow-57"))] pub use arrow_compat::*; +pub(crate) mod column_trie; pub mod kernel_predicates; pub(crate) mod utils; @@ -141,25 +151,34 @@ pub mod last_checkpoint_hint; #[cfg(not(feature = "internal-api"))] pub(crate) mod last_checkpoint_hint; -pub(crate) mod listed_log_files; +pub(crate) mod log_segment_files; #[cfg(feature = "internal-api")] pub mod history_manager; #[cfg(not(feature = "internal-api"))] pub(crate) mod history_manager; -pub use crate::engine_data::FilteredEngineData; +#[cfg(feature = "internal-api")] +pub mod parallel; +#[cfg(not(feature = "internal-api"))] +pub(crate) mod parallel; + +pub use action_reconciliation::{ActionReconciliationIterator, ActionReconciliationIteratorState}; pub use delta_kernel_derive; -pub use engine_data::{EngineData, RowVisitor}; +use delta_kernel_derive::internal_api; +pub use engine_data::{ + EngineData, FilteredEngineData, FilteredRowVisitor, GetData, RowIndexIterator, RowVisitor, +}; pub use error::{DeltaResult, Error}; pub use expressions::{Expression, ExpressionRef, Predicate, PredicateRef}; -pub use log_compaction::{should_compact, LogCompactionDataIterator, LogCompactionWriter}; +pub use log_compaction::{should_compact, LogCompactionWriter}; +pub use metrics::MetricsReporter; pub use snapshot::Snapshot; pub use snapshot::SnapshotRef; -use expressions::literal_expression_transform::LiteralExpressionTransform; +use expressions::literal_expression_transform; use expressions::Scalar; -use schema::{SchemaTransform, StructField, StructType}; +use schema::{StructField, StructType}; #[cfg(any( feature = "default-engine-native-tls", @@ -170,6 +189,11 @@ pub mod engine; /// Delta table version is 8 byte unsigned int pub type Version = u64; + +/// Sentinel version indicating a pre-commit state (table does not exist yet). +/// Used for create-table transactions before the first commit. +pub const PRE_COMMIT_VERSION: Version = u64::MAX; + pub type FileSize = u64; pub type FileIndex = u64; @@ -426,7 +450,7 @@ pub trait EvaluationHandler: AsAny { input_schema: SchemaRef, expression: ExpressionRef, output_type: DataType, - ) -> Arc; + ) -> DeltaResult>; /// Create a [`PredicateEvaluator`] that can evaluate the given [`Predicate`] on columnar /// batches with the given [`Schema`] to produce a column of boolean results. @@ -443,19 +467,51 @@ pub trait EvaluationHandler: AsAny { &self, input_schema: SchemaRef, predicate: PredicateRef, - ) -> Arc; + ) -> DeltaResult>; /// Create a single-row all-null-value [`EngineData`] with the schema specified by /// `output_schema`. // NOTE: we should probably allow DataType instead of SchemaRef, but can expand that in the // future. fn null_row(&self, output_schema: SchemaRef) -> DeltaResult>; + + /// Create a multi-row [`EngineData`] by applying the given schema to multiple rows of values. 
+ /// + /// Each element in `rows` represents one row of data, where each row is a slice of structured + /// scalar values (one scalar per top-level field in the schema). + /// + /// # Parameters + /// + /// - `schema`: Schema describing the structure of each row. + /// - `rows`: Slice of rows, where each row contains one structured scalar per top-level schema + /// field. + /// + /// # Returns + /// + /// A multi-row `EngineData` containing all rows. + /// + /// # Errors + /// + /// Returns an error if any row has a number of scalars that does not match the number of + /// top-level fields in `schema`, or if any scalar value cannot be appended to its corresponding + /// field's builder (e.g. due to a type mismatch). + /// + /// # Example + /// + /// For a schema with fields `[add: Struct, remove: Struct]`, each row should contain exactly 2 + /// scalars: one for the `add` field and one for the `remove` field. + fn create_many( + &self, + schema: SchemaRef, + rows: &[&[Scalar]], + ) -> DeltaResult>; } /// Internal trait to allow us to have a private `create_one` API that's implemented for all /// EvaluationHandlers. // For some reason rustc doesn't detect it's usage so we allow(dead_code) here... #[allow(dead_code)] +#[internal_api] trait EvaluationHandlerExtension: EvaluationHandler { /// Create a single-row [`EngineData`] by applying the given schema to the leaf-values given in /// `values`. @@ -470,11 +526,10 @@ trait EvaluationHandlerExtension: EvaluationHandler { let null_row = self.null_row(null_row_schema.clone())?; // Convert schema and leaf values to an expression - let mut schema_transform = LiteralExpressionTransform::new(values); - schema_transform.transform_struct(schema.as_ref()); - let row_expr = schema_transform.try_into_expr()?; + let row_expr = literal_expression_transform(schema.as_ref(), values)?; - let eval = self.new_expression_evaluator(null_row_schema, row_expr.into(), schema.into()); + let eval = + self.new_expression_evaluator(null_row_schema, row_expr.into(), schema.into())?; eval.evaluate(null_row.as_ref()) } } @@ -506,6 +561,7 @@ impl EvaluationHandlerExtension for T {} /// let engine = todo!(); // create an engine /// let engine_data = my_struct.into_engine_data(schema, engine); /// ``` +#[internal_api] pub(crate) trait IntoEngineData { /// Consume this type to produce a single-row EngineData using the provided schema. fn into_engine_data( @@ -538,6 +594,17 @@ pub trait StorageHandler: AsAny { /// Copy a file atomically from source to destination. If the destination file already exists, /// it must return Err(Error::FileAlreadyExists). fn copy_atomic(&self, src: &Url, dest: &Url) -> DeltaResult<()>; + + /// Write data to the specified path. + /// + /// If `overwrite` is false and the file already exists, this must return + /// `Err(Error::FileAlreadyExists)`. + fn put(&self, path: &Url, data: Bytes, overwrite: bool) -> DeltaResult<()>; + + /// Perform a HEAD request for the given file at a Url, returning the file metadata. + /// + /// If the file does not exist, this must return an `Err` with [`Error::FileNotFound`]. + fn head(&self, path: &Url) -> DeltaResult; } /// Provides JSON handling functionality to Delta Kernel. @@ -569,6 +636,8 @@ pub trait JsonHandler: AsAny { /// iter: [EngineData(1), EngineData(3, 2)] /// iter: [EngineData(2, 1, 3)] /// + /// Additionally, engines may not merge engine data across file boundaries. + /// /// # Parameters /// /// - `files` - File metadata for files to be read. 
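// Illustrative sketch (not part of the patch above): calling the new `create_many` API for a
// schema with two primitive top-level fields. The `evaluation_handler()` accessor name and the
// field names are assumptions made for the example; each row supplies exactly one scalar per
// top-level field, as the documentation above requires.
use std::sync::Arc;

use delta_kernel::expressions::Scalar;
use delta_kernel::schema::{DataType, StructField, StructType};
use delta_kernel::{DeltaResult, Engine, EngineData};

fn create_many_sketch(engine: &dyn Engine) -> DeltaResult<Box<dyn EngineData>> {
    let schema = Arc::new(StructType::try_new([
        StructField::nullable("id", DataType::INTEGER),
        StructField::nullable("name", DataType::STRING),
    ])?);

    // One scalar per top-level field; a null scalar carries the field's data type.
    let row0: &[Scalar] = &[Scalar::Integer(1), Scalar::String("first".to_string())];
    let row1: &[Scalar] = &[Scalar::Integer(2), Scalar::Null(DataType::STRING)];

    engine.evaluation_handler().create_many(schema, &[row0, row1])
}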
@@ -611,6 +680,26 @@ pub trait JsonHandler: AsAny { ) -> DeltaResult<()>; } +/// Reserved field IDs for metadata columns in Delta tables. +/// +/// These field IDs are reserved and should not be used for regular table columns. +/// They are used to provide file-level metadata as virtual columns during reads. +pub mod reserved_field_ids { + /// Reserved field ID for the file name metadata column (`_file`). + /// This column provides the name of the Parquet file that contains each row. + pub const FILE_NAME: i64 = 2147483646; +} + +/// Metadata from a Parquet file footer. +/// +/// This struct contains metadata extracted from a Parquet file's footer, including the schema. +/// It is designed to be extensible for future additions such as row group statistics. +#[derive(Debug, Clone)] +pub struct ParquetFooter { + /// The schema of the Parquet file, converted to Delta Kernel's schema format. + pub schema: SchemaRef, +} + /// Provides Parquet file related functionalities to Delta Kernel. /// /// Connectors can leverage this trait to provide their own custom @@ -634,17 +723,73 @@ pub trait ParquetHandler: AsAny { /// 2. **Field Name**: If no field ID is present in the `physical_schema`'s [`StructField`] or no matching parquet field ID is found, /// fall back to matching by column name /// + /// # Metadata Columns + /// + /// The ParquetHandler must support virtual metadata columns that provide additional information + /// about each row. These columns are not stored in the Parquet file but are generated at read time. + /// + /// ## Row Index Column + /// + /// When a column in `physical_schema` is marked as a row index metadata column (via + /// [`StructField::create_metadata_column`] with [`schema::MetadataColumnSpec::RowIndex`]), the + /// ParquetHandler must populate it with the 0-based row position within the Parquet file: + /// + /// - **Column name**: User-specified (commonly `"row_index"` or `"_metadata.row_index"`) + /// - **Type**: `LONG` (non-nullable) + /// - **Values**: Sequential integers starting at 0 for each file + /// - **Use case**: Track row positions for downstream processing, or internally used to compute Row IDs + /// + /// Example: A file with 5 rows would have row_index values `[0, 1, 2, 3, 4]`. + /// + /// ## File Name Column (Reserved Field ID) + /// + /// When a column in `physical_schema` has the reserved field ID + /// [`reserved_field_ids::FILE_NAME`] (2147483646), the ParquetHandler must populate it + /// with the file path/name: + /// + /// - **Column name**: `"_file"` + /// - **Type**: `STRING` (non-nullable) + /// - **Field ID**: 2147483646 (reserved) + /// - **Values**: The file path/URL (e.g., `"s3://bucket/path/file.parquet"`) + /// - **Use case**: Track which file each row came from in multi-file reads + /// + /// Example: All rows from the same file would have the same `_file` value. 
+ /// + /// ## Metadata Column Examples + /// + /// ```rust,ignore + /// use delta_kernel::schema::{StructType, StructField, DataType, MetadataColumnSpec}; + /// + /// // Example 1: Schema with row_index metadata column + /// let schema_with_row_index = StructType::try_new([ + /// StructField::nullable("id", DataType::INTEGER), + /// StructField::create_metadata_column("row_index", MetadataColumnSpec::RowIndex), + /// StructField::nullable("value", DataType::STRING), + /// ])?; + /// + /// // Example 2: Schema with _file metadata column (using reserved field ID) + /// let schema_with_file_path = StructType::try_new([ + /// StructField::nullable("id", DataType::INTEGER), + /// StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + /// StructField::nullable("value", DataType::STRING), + /// ])?; + /// ``` + /// + /// --- + /// /// If no matching Parquet column is found, `NULL` values are returned /// for nullable columns in `physical_schema`. For non-nullable columns, an error is returned. /// /// - /// ## Examples + /// ## Column Matching Examples /// /// Consider a `physical_schema` with the following fields: - /// - Column 0: `"i_logical"` (integer, non-null) with metadata `"parquet.field.id": 1` + /// - Column 0: `"i_logical"` (integer, non-null) with field ID 1 (via [`ColumnMetadataKey::ParquetFieldId`]) /// - Column 1: `"s"` (string, nullable) with no field ID metadata /// - Column 2: `"i2"` (integer, nullable) with no field ID metadata /// + /// [`ColumnMetadataKey::ParquetFieldId`]: crate::schema::ColumnMetadataKey::ParquetFieldId + /// /// And a Parquet file containing these columns: /// - Column 0: `"i2"` (integer, nullable) with field ID 3 /// - Column 1: `"i"` (integer, non-null) with field ID 1 @@ -683,6 +828,8 @@ pub trait ParquetHandler: AsAny { /// iter: [EngineData(1), EngineData(3, 2)] /// iter: [EngineData(2, 1, 3)] /// + /// Additionally, engines must not merge engine data across file boundaries. + /// /// [`ColumnMetadataKey::ParquetFieldId`]: crate::schema::ColumnMetadataKey fn read_parquet_files( &self, @@ -690,6 +837,63 @@ pub trait ParquetHandler: AsAny { physical_schema: SchemaRef, predicate: Option, ) -> DeltaResult; + + /// Write data to a Parquet file at the specified URL. + /// + /// This method writes the provided `data` to a Parquet file at the given `url`. + /// + /// This will overwrite the file if it already exists. + /// + /// # Parameters + /// + /// - `url` - The full URL path where the Parquet file should be written + /// (e.g., `s3://bucket/path/file.parquet`). + /// - `data` - An iterator of engine data to be written to the Parquet file. + /// + /// # Returns + /// + /// A [`DeltaResult`] indicating success or failure. + fn write_parquet_file( + &self, + location: url::Url, + data: Box>> + Send>, + ) -> DeltaResult<()>; + + /// Read the footer metadata from a Parquet file without reading the data. + /// + /// This method reads only the Parquet file footer (metadata section), which is useful for + /// schema inspection, compatibility checking, and determining whether parsed statistics + /// columns are present and compatible with the current table schema. + /// + /// # Parameters + /// + /// - `file` - File metadata for the Parquet file whose footer should be read. The `size` field + /// should contain the actual file size to enable efficient footer reads without additional + /// I/O operations. 
+ /// + /// # Returns + /// + /// A [`DeltaResult`] containing a [`ParquetFooter`] with the Parquet file's metadata, including + /// the schema converted to Delta Kernel's format. + /// + /// # Field IDs + /// + /// If the Parquet file contains field IDs (written when column mapping is enabled), they are + /// preserved in each [`StructField`]'s metadata. Callers can access field IDs via + /// [`StructField::get_config_value`] with [`ColumnMetadataKey::ParquetFieldId`]. + /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be accessed or does not exist + /// - The file is not a valid Parquet file + /// - The footer cannot be read or parsed + /// - The schema cannot be converted to Delta Kernel's format + /// + /// [`StructField`]: crate::schema::StructField + /// [`StructField::get_config_value`]: crate::schema::StructField::get_config_value + /// [`ColumnMetadataKey::ParquetFieldId`]: crate::schema::ColumnMetadataKey::ParquetFieldId + fn read_parquet_footer(&self, file: &FileMeta) -> DeltaResult; } /// The `Engine` trait encapsulates all the functionality an engine or connector needs to provide @@ -709,6 +913,14 @@ pub trait Engine: AsAny { /// Get the connector provided [`ParquetHandler`]. fn parquet_handler(&self) -> Arc; + + /// Get the connector provided [`MetricsReporter`] for metrics collection. + /// + /// Returns an optional reporter that will receive metric events from Delta operations. + /// The default implementation returns None (no metrics reporting). + fn get_metrics_reporter(&self) -> Option> { + None + } } // we have an 'internal' feature flag: default-engine-base, which is actually just the shared @@ -732,68 +944,4 @@ compile_error!( // done in unit tests). This module is not exclusively for macro tests only so other doctests can also be added. 
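For orientation, here is one way a connector might use the new `read_parquet_footer` API, e.g. to check whether a data file already carries a given column before planning further work. The helper is hypothetical, and it assumes the returned schema can be walked with `fields()`/`name()`; treat it as a sketch rather than the kernel's own usage.

```rust
use delta_kernel::{DeltaResult, Engine, FileMeta};

/// Returns true if the Parquet file's footer schema contains `column`.
fn file_has_column(engine: &dyn Engine, file: &FileMeta, column: &str) -> DeltaResult<bool> {
    // Only the footer is fetched; no row data is read.
    let footer = engine.parquet_handler().read_parquet_footer(file)?;
    Ok(footer.schema.fields().any(|field| field.name() == column))
}
```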
// https://doc.rust-lang.org/rustdoc/write-documentation/documentation-tests.html#include-items-only-when-collecting-doctests #[cfg(doctest)] -mod doc_tests { - - /// ``` - /// # use delta_kernel_derive::ToSchema; - /// #[derive(ToSchema)] - /// pub struct WithFields { - /// some_name: String, - /// } - /// ``` - #[cfg(doctest)] - pub struct MacroTestStructWithField; - - /// ```compile_fail - /// # use delta_kernel_derive::ToSchema; - /// #[derive(ToSchema)] - /// pub struct NoFields; - /// ``` - #[cfg(doctest)] - pub struct MacroTestStructWithoutField; - - /// ``` - /// # use delta_kernel_derive::ToSchema; - /// # use std::collections::HashMap; - /// #[derive(ToSchema)] - /// pub struct WithAngleBracketPath { - /// map_field: HashMap, - /// } - /// ``` - #[cfg(doctest)] - pub struct MacroTestStructWithAngleBracketedPathField; - - /// ``` - /// # use delta_kernel_derive::ToSchema; - /// # use std::collections::HashMap; - /// #[derive(ToSchema)] - /// pub struct WithAttributedField { - /// #[allow_null_container_values] - /// map_field: HashMap, - /// } - /// ``` - #[cfg(doctest)] - pub struct MacroTestStructWithAttributedField; - - /// ```compile_fail - /// # use delta_kernel_derive::ToSchema; - /// #[derive(ToSchema)] - /// pub struct WithInvalidAttributeTarget { - /// #[allow_null_container_values] - /// some_name: String, - /// } - /// ``` - #[cfg(doctest)] - pub struct MacroTestStructWithInvalidAttributeTarget; - - /// ```compile_fail - /// # use delta_kernel_derive::ToSchema; - /// # use syn::Token; - /// #[derive(ToSchema)] - /// pub struct WithInvalidFieldType { - /// token: Token![struct], - /// } - /// ``` - #[cfg(doctest)] - pub struct MacroTestStructWithInvalidFieldType; -} +mod doctests; diff --git a/kernel/src/listed_log_files.rs b/kernel/src/listed_log_files.rs deleted file mode 100644 index 5283ec0890..0000000000 --- a/kernel/src/listed_log_files.rs +++ /dev/null @@ -1,718 +0,0 @@ -//! [`ListedLogFiles`] is a struct holding the result of listing the delta log. Currently, it -//! exposes three APIs for listing: -//! 1. [`list_commits`]: Lists all commit files between the provided start and end versions. -//! 2. [`list`]: Lists all commit and checkpoint files between the provided start and end versions. -//! 3. [`list_with_checkpoint_hint`]: Lists all commit and checkpoint files after the provided -//! checkpoint hint. -//! -//! After listing, one can leverage the [`ListedLogFiles`] to construct a [`LogSegment`]. -//! -//! [`list_commits`]: Self::list_commits -//! [`list`]: Self::list -//! [`list_with_checkpoint_hint`]: Self::list_with_checkpoint_hint -//! [`LogSegment`]: crate::log_segment::LogSegment - -use std::collections::HashMap; - -use crate::last_checkpoint_hint::LastCheckpointHint; -use crate::path::{LogPathFileType, ParsedLogPath}; -use crate::{DeltaResult, Error, StorageHandler, Version}; - -use delta_kernel_derive::internal_api; - -use itertools::Itertools; -use tracing::log::*; -use url::Url; - -/// Represents the set of log files found during a listing operation in the Delta log directory. -/// -/// - `ascending_commit_files`: All commit and staged commit files found, sorted by version. May contain gaps. -/// - `ascending_compaction_files`: All compaction commit files found, sorted by version. -/// - `checkpoint_parts`: All parts of the most recent complete checkpoint (all same version). Empty if no checkpoint found. -/// - `latest_crc_file`: The CRC file with the highest version, if any. 
-/// - `latest_commit_file`: The commit file with the highest version, or `None` if no commits were found. -#[derive(Debug)] -#[internal_api] -pub(crate) struct ListedLogFiles { - pub(crate) ascending_commit_files: Vec, - pub(crate) ascending_compaction_files: Vec, - pub(crate) checkpoint_parts: Vec, - pub(crate) latest_crc_file: Option, - pub(crate) latest_commit_file: Option, -} - -/// Returns a fallible iterator of [`ParsedLogPath`] over versions `start_version..=end_version` -/// taking into account the `log_tail` which was (ostentibly) returned from the catalog. If there -/// are fewer files than requested (e.g. `end_version` is past the end of the log), the iterator -/// will simply end before reaching `end_version`. -/// -/// Note that the `log_tail` must strictly adhere to being a 'tail' - that is, it is a contiguous -/// cover of versions `X..=Y` where `Y` is the latest version of the table. If it overlaps with -/// commits listed from the filesystem, the `log_tail` will take precedence. -/// -/// If `start_version` is not specified, the listing will begin from version number 0. If -/// `end_version` is not specified, files up to the most recent version will be included. -/// -/// Note: this may call [`StorageHandler::list_from`] to get the list of log files unless the -/// provided log_tail covers the entire requested range. -/// -/// Note: at a high level we are doing two things: -/// 1. list from the storage handler and filter based on [`ParsedLogPath::should_list`] (to prevent -/// listing staged commits) -/// 2. add the log_tail from the catalog -fn list_log_files( - storage: &dyn StorageHandler, - log_root: &Url, - log_tail: Vec, - start_version: impl Into>, - end_version: impl Into>, -) -> DeltaResult>> { - // check log_tail is only commits - // note that LogSegment checks no gaps/duplicates so we don't duplicate that here - debug_assert!( - log_tail.iter().all(|entry| entry.is_commit()), - "log_tail should only contain commits" - ); - - // calculate listing bounds - let start_version = start_version.into().unwrap_or(0); - let end_version = end_version.into().unwrap_or(Version::MAX); - // start_from is log path to start listing from: the log root with zero-padded start version - let start_from = log_root.join(&format!("{start_version:020}"))?; - // stop before the log_tail or at the requested end, whichever comes first - let log_tail_start = log_tail.first(); - let list_end_version = - log_tail_start.map_or(end_version, |first| first.version.saturating_sub(1)); - - // if the log_tail covers the entire requested range (i.e. starts at or before start_version), - // we skip listing entirely. note that if we don't include this check, we will end up listing - // and then just filtering out all the files we listed. - let listed_files = log_tail_start - // log_tail covers the entire requested range, so no listing is required - .is_none_or(|tail_start| start_version < tail_start.version) - .then(|| -> DeltaResult<_> { - // NOTE: since engine APIs don't limit listing, we list from start_version and filter - Ok(storage - .list_from(&start_from)? - .map(|meta| ParsedLogPath::try_from(meta?)) - // NOTE: this filters out .crc files etc which start with "." - some engines - // produce `.something.parquet.crc` corresponding to `something.parquet`. Kernel - // doesn't care about these files. Critically, note these are _different_ than - // normal `version.crc` files which are listed + captured normally. 
Additionally - // we likely aren't even 'seeing' these files since lexicographically the string - // "." comes before the string "0". - .filter_map_ok(|path_opt| path_opt.filter(|p| p.should_list())) - .take_while(move |path_res| match path_res { - // discard any path with too-large version; keep errors - Ok(path) => path.version <= list_end_version, - Err(_) => true, - })) - }) - .transpose()? - .into_iter() - .flatten(); - - // return chained [listed_files..log_tail], filtering log_tail by the requested range - let filtered_log_tail = log_tail - .into_iter() - .filter(move |entry| entry.version >= start_version && entry.version <= end_version) - .map(Ok); - - Ok(listed_files.chain(filtered_log_tail)) -} - -/// Groups all checkpoint parts according to the checkpoint they belong to. -/// -/// NOTE: There could be a single-part and/or any number of uuid-based checkpoints. They -/// are all equivalent, and this routine keeps only one of them (arbitrarily chosen). -fn group_checkpoint_parts(parts: Vec) -> HashMap> { - let mut checkpoints: HashMap> = HashMap::new(); - for part_file in parts { - use LogPathFileType::*; - match &part_file.file_type { - SinglePartCheckpoint - | UuidCheckpoint - | MultiPartCheckpoint { - part_num: 1, - num_parts: 1, - } => { - // All single-file checkpoints are equivalent, just keep one - checkpoints.insert(1, vec![part_file]); - } - MultiPartCheckpoint { - part_num: 1, - num_parts, - } => { - // Start a new multi-part checkpoint with at least 2 parts - checkpoints.insert(*num_parts, vec![part_file]); - } - MultiPartCheckpoint { - part_num, - num_parts, - } => { - // Continue a new multi-part checkpoint with at least 2 parts. - // Checkpoint parts are required to be in-order from log listing to build - // a multi-part checkpoint - if let Some(part_files) = checkpoints.get_mut(num_parts) { - // `part_num` is guaranteed to be non-negative and within `usize` range - if *part_num as usize == 1 + part_files.len() { - // Safe to append because all previous parts exist - part_files.push(part_file); - } - } - } - Commit | StagedCommit | CompactedCommit { .. } | Crc | Unknown => {} - } - } - checkpoints -} - -impl ListedLogFiles { - // Note: for now we expose the constructor as pub(crate) to allow for use in testing. Ideally, - // we should explore entirely encapsulating ListedLogFiles within LogSegment - currently - // LogSegment constructor requires a ListedLogFiles. - #[internal_api] - pub(crate) fn try_new( - ascending_commit_files: Vec, - ascending_compaction_files: Vec, - checkpoint_parts: Vec, - latest_crc_file: Option, - latest_commit_file: Option, - ) -> DeltaResult { - // We are adding debug_assertions here since we want to validate invariants that are - // (relatively) expensive to compute - #[cfg(debug_assertions)] - { - assert!(ascending_compaction_files - .windows(2) - .all(|pair| match pair { - [ParsedLogPath { - version: version0, - file_type: LogPathFileType::CompactedCommit { hi: hi0 }, - .. - }, ParsedLogPath { - version: version1, - file_type: LogPathFileType::CompactedCommit { hi: hi1 }, - .. 
- }] => version0 < version1 || (version0 == version1 && hi0 <= hi1), - _ => false, - })); - - assert!(checkpoint_parts.iter().all(|part| part.is_checkpoint())); - - // for a multi-part checkpoint, check that they are all same version and all the parts are there - if checkpoint_parts.len() > 1 { - assert!(checkpoint_parts - .windows(2) - .all(|pair| pair[0].version == pair[1].version)); - - assert!(checkpoint_parts.iter().all(|part| matches!( - part.file_type, - LogPathFileType::MultiPartCheckpoint { num_parts, .. } - if checkpoint_parts.len() == num_parts as usize - ))); - } - } - - Ok(ListedLogFiles { - ascending_commit_files, - ascending_compaction_files, - checkpoint_parts, - latest_crc_file, - latest_commit_file, - }) - } - - /// List all commits between the provided `start_version` (inclusive) and `end_version` - /// (inclusive). All other types are ignored. - pub(crate) fn list_commits( - storage: &dyn StorageHandler, - log_root: &Url, - start_version: Option, - end_version: Option, - ) -> DeltaResult { - // TODO: plumb through a log_tail provided by our caller - let log_tail = vec![]; - let listed_commits: Vec = - list_log_files(storage, log_root, log_tail, start_version, end_version)? - .filter_ok(|log_file| log_file.is_commit()) - .try_collect()?; - // .last() on a slice is an O(1) operation - let latest_commit_file = listed_commits.last().cloned(); - ListedLogFiles::try_new(listed_commits, vec![], vec![], None, latest_commit_file) - } - - /// List all commit and checkpoint files with versions above the provided `start_version` (inclusive). - /// If successful, this returns a `ListedLogFiles`. - // TODO: encode some of these guarantees in the output types. e.g. we could have: - // - SortedCommitFiles: Vec, is_ascending: bool, end_version: Version - // - CheckpointParts: Vec, checkpoint_version: Version (guarantee all same version) - pub(crate) fn list( - storage: &dyn StorageHandler, - log_root: &Url, - log_tail: Vec, - start_version: Option, - end_version: Option, - ) -> DeltaResult { - let log_files = list_log_files(storage, log_root, log_tail, start_version, end_version)?; - - log_files.process_results(|iter| { - let mut ascending_commit_files = Vec::new(); - let mut ascending_compaction_files = Vec::new(); - let mut checkpoint_parts = vec![]; - let mut latest_crc_file: Option = None; - let mut latest_commit_file: Option = None; - - // Group log files by version - let log_files_per_version = iter.chunk_by(|x| x.version); - - for (version, files) in &log_files_per_version { - let mut new_checkpoint_parts = vec![]; - for file in files { - use LogPathFileType::*; - match file.file_type { - Commit | StagedCommit => ascending_commit_files.push(file), - CompactedCommit { hi } if end_version.is_none_or(|end| hi <= end) => { - ascending_compaction_files.push(file); - } - CompactedCommit { .. } => (), // Failed the bounds check above - SinglePartCheckpoint | UuidCheckpoint | MultiPartCheckpoint { .. 
} => { - new_checkpoint_parts.push(file) - } - Crc => { - let latest_crc_ref = latest_crc_file.as_ref(); - if latest_crc_ref.is_none_or(|latest| latest.version < file.version) { - latest_crc_file = Some(file); - } - } - Unknown => { - // It is possible that there are other files being stashed away into - // _delta_log/ This is not necessarily forbidden, but something we - // want to know about in a debugging scenario - debug!( - "Found file {} with unknown file type {:?} at version {}", - file.filename, file.file_type, version - ); - } - } - } - // Group and find the first complete checkpoint for this version. - // All checkpoints for the same version are equivalent, so we only take one. - if let Some((_, complete_checkpoint)) = group_checkpoint_parts(new_checkpoint_parts) - .into_iter() - // `num_parts` is guaranteed to be non-negative and within `usize` range - .find(|(num_parts, part_files)| part_files.len() == *num_parts as usize) - { - checkpoint_parts = complete_checkpoint; - // Check if there's a commit file at the same version as this checkpoint. We pop - // the last element from ascending_commit_files (which is sorted by version) and - // set latest_commit_file to it only if it matches the checkpoint version. If it - // doesn't match, we set latest_commit_file to None to discard any older commits - // from before the checkpoint - latest_commit_file = ascending_commit_files - .pop() - .filter(|commit| commit.version == version); - // Log replay only uses commits/compactions after a complete checkpoint - ascending_commit_files.clear(); - ascending_compaction_files.clear(); - } - } - - // Since ascending_commit_files is cleared at each checkpoint, if it's non-empty here - // it contains only commits after the most recent checkpoint. The last element is the - // highest version commit overall, so we update latest_commit_file to it. If it's empty, - // we keep the value set at the checkpoint (if a commit existed at the checkpoint version), - // or remains None. - if let Some(commit_file) = ascending_commit_files.last() { - latest_commit_file = Some(commit_file.clone()); - } - - ListedLogFiles::try_new( - ascending_commit_files, - ascending_compaction_files, - checkpoint_parts, - latest_crc_file, - latest_commit_file, - ) - })? - } - - /// List all commit and checkpoint files after the provided checkpoint. It is guaranteed that all - /// the returned [`ParsedLogPath`]s will have a version less than or equal to the `end_version`. - /// See [`list_log_files_with_version`] for details on the return type. - pub(crate) fn list_with_checkpoint_hint( - checkpoint_metadata: &LastCheckpointHint, - storage: &dyn StorageHandler, - log_root: &Url, - log_tail: Vec, - end_version: Option, - ) -> DeltaResult { - let listed_files = Self::list( - storage, - log_root, - log_tail, - Some(checkpoint_metadata.version), - end_version, - )?; - - let Some(latest_checkpoint) = listed_files.checkpoint_parts.last() else { - // TODO: We could potentially recover here - return Err(Error::invalid_checkpoint( - "Had a _last_checkpoint hint but didn't find any checkpoints", - )); - }; - if latest_checkpoint.version != checkpoint_metadata.version { - info!( - "_last_checkpoint hint is out of date. _last_checkpoint version: {}. 
Using actual most recent: {}", - checkpoint_metadata.version, - latest_checkpoint.version - ); - } else if listed_files.checkpoint_parts.len() != checkpoint_metadata.parts.unwrap_or(1) { - return Err(Error::InvalidCheckpoint(format!( - "_last_checkpoint indicated that checkpoint should have {} parts, but it has {}", - checkpoint_metadata.parts.unwrap_or(1), - listed_files.checkpoint_parts.len() - ))); - } - Ok(listed_files) - } -} - -#[cfg(test)] -mod list_log_files_with_log_tail_tests { - use std::sync::Arc; - - use futures::executor::block_on; - use object_store::{memory::InMemory, path::Path as ObjectPath, ObjectStore}; - use url::Url; - - use crate::engine::default::executor::tokio::TokioBackgroundExecutor; - use crate::engine::default::filesystem::ObjectStoreStorageHandler; - use crate::FileMeta; - - use super::*; - - // size markers used to identify commit sources in tests - const FILESYSTEM_SIZE_MARKER: u64 = 10; - const CATALOG_SIZE_MARKER: u64 = 7; - - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - enum CommitSource { - Filesystem, - Catalog, - } - - // create test storage given list of log files with custom data content - fn create_storage( - log_files: Vec<(Version, LogPathFileType, CommitSource)>, - ) -> (Box, Url) { - let store = Arc::new(InMemory::new()); - let log_root = Url::parse("memory:///_delta_log/").unwrap(); - - block_on(async { - for (version, file_type, source) in log_files { - let path = match file_type { - LogPathFileType::Commit => { - format!("_delta_log/{version:020}.json") - } - LogPathFileType::StagedCommit => { - let uuid = uuid::Uuid::new_v4(); - format!("_delta_log/_staged_commits/{version:020}.{uuid}.json") - } - LogPathFileType::SinglePartCheckpoint => { - format!("_delta_log/{version:020}.checkpoint.parquet") - } - LogPathFileType::MultiPartCheckpoint { - part_num, - num_parts, - } => { - format!( - "_delta_log/{version:020}.checkpoint.{part_num:010}.{num_parts:010}.parquet" - ) - } - _ => panic!("Unsupported file type in test"), - }; - let data = match source { - CommitSource::Filesystem => bytes::Bytes::from("filesystem"), - CommitSource::Catalog => bytes::Bytes::from("catalog"), - }; - store - .put(&ObjectPath::from(path.as_str()), data.into()) - .await - .expect("Failed to put test file"); - } - }); - - let executor = Arc::new(TokioBackgroundExecutor::new()); - let storage = Box::new(ObjectStoreStorageHandler::new(store, executor)); - (storage, log_root) - } - - // helper to create a ParsedLogPath with specific source marker - fn make_parsed_log_path_with_source( - version: Version, - file_type: LogPathFileType, - source: CommitSource, - ) -> ParsedLogPath { - let url = Url::parse(&format!("memory:///_delta_log/{version:020}.json")).unwrap(); - let mut filename_path_segments = url.path_segments().unwrap(); - let filename = filename_path_segments.next_back().unwrap().to_string(); - let extension = filename.split('.').next_back().unwrap().to_string(); - - let size = match source { - CommitSource::Filesystem => FILESYSTEM_SIZE_MARKER, - CommitSource::Catalog => CATALOG_SIZE_MARKER, - }; - - let location = FileMeta { - location: url, - last_modified: 0, - size, - }; - - ParsedLogPath { - location, - filename, - extension, - version, - file_type, - } - } - - fn assert_source(commit: &ParsedLogPath, expected_source: CommitSource) { - let expected_size = match expected_source { - CommitSource::Filesystem => FILESYSTEM_SIZE_MARKER, - CommitSource::Catalog => CATALOG_SIZE_MARKER, - }; - assert_eq!( - commit.location.size, expected_size, - "Commit version {} 
should be from {:?}, but size was {}", - commit.version, expected_source, commit.location.size - ); - } - - #[test] - fn test_empty_log_tail() { - let log_files = vec![ - (0, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::Commit, CommitSource::Filesystem), - (2, LogPathFileType::Commit, CommitSource::Filesystem), - ]; - let (storage, log_root) = create_storage(log_files); - - let result: Vec<_> = list_log_files(storage.as_ref(), &log_root, vec![], Some(1), Some(2)) - .unwrap() - .try_collect() - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0].version, 1); - assert_eq!(result[1].version, 2); - // all should be from filesystem since log_tail is empty - assert_source(&result[0], CommitSource::Filesystem); - assert_source(&result[1], CommitSource::Filesystem); - } - - #[test] - fn test_log_tail_has_latest_commit_files() { - // Filesystem has commits 0-2, log_tail has commits 3-5 (the latest) - let log_files = vec![ - (0, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::Commit, CommitSource::Filesystem), - (2, LogPathFileType::Commit, CommitSource::Filesystem), - ]; - let (storage, log_root) = create_storage(log_files); - - // log_tail is contiguous, only commits, and represents the latest versions - let log_tail = vec![ - make_parsed_log_path_with_source(3, LogPathFileType::Commit, CommitSource::Catalog), - make_parsed_log_path_with_source(4, LogPathFileType::Commit, CommitSource::Catalog), - make_parsed_log_path_with_source(5, LogPathFileType::Commit, CommitSource::Catalog), - ]; - - let result: Vec<_> = - list_log_files(storage.as_ref(), &log_root, log_tail, Some(0), Some(5)) - .unwrap() - .try_collect() - .unwrap(); - - assert_eq!(result.len(), 6); - // filesystem - assert_eq!(result[0].version, 0); - assert_eq!(result[1].version, 1); - assert_eq!(result[2].version, 2); - assert_source(&result[0], CommitSource::Filesystem); - assert_source(&result[1], CommitSource::Filesystem); - assert_source(&result[2], CommitSource::Filesystem); - // log_tail - assert_eq!(result[3].version, 3); - assert_eq!(result[4].version, 4); - assert_eq!(result[5].version, 5); - assert_source(&result[3], CommitSource::Catalog); - assert_source(&result[4], CommitSource::Catalog); - assert_source(&result[5], CommitSource::Catalog); - } - - #[test] - fn test_request_subset_with_log_tail() { - // Test requesting a subset when log_tail is the latest commits - let log_files = vec![ - (0, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::Commit, CommitSource::Filesystem), - ]; - let (storage, log_root) = create_storage(log_files); - - // log_tail represents versions 2-4 (latest commits) - let log_tail = vec![ - make_parsed_log_path_with_source(2, LogPathFileType::Commit, CommitSource::Catalog), - make_parsed_log_path_with_source(3, LogPathFileType::Commit, CommitSource::Catalog), - make_parsed_log_path_with_source(4, LogPathFileType::Commit, CommitSource::Catalog), - ]; - - // list for only versions 1-3 - let result: Vec<_> = - list_log_files(storage.as_ref(), &log_root, log_tail, Some(1), Some(3)) - .unwrap() - .try_collect() - .unwrap(); - - // The result includes version 1 from filesystem, and log_tail until requested version (2-3) - assert_eq!(result.len(), 3); - assert_eq!(result[0].version, 1); - assert_eq!(result[1].version, 2); - assert_eq!(result[2].version, 3); - assert_source(&result[0], CommitSource::Filesystem); - assert_source(&result[1], CommitSource::Catalog); - assert_source(&result[2], CommitSource::Catalog); 
- } - - #[test] - fn test_log_tail_defines_latest_version() { - // log_tail defines the latest version of the table: if there is file system files after log - // tail, they are ignored - let log_files = vec![ - (0, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::Commit, CommitSource::Filesystem), - (2, LogPathFileType::Commit, CommitSource::Filesystem), // ignored! - ]; - let (storage, log_root) = create_storage(log_files); - - // log_tail is just [1], indicating version 1 is the latest - let log_tail = vec![make_parsed_log_path_with_source( - 1, - LogPathFileType::Commit, - CommitSource::Catalog, - )]; - - let result: Vec<_> = list_log_files(storage.as_ref(), &log_root, log_tail, Some(0), None) - .unwrap() - .try_collect() - .unwrap(); - - // expect only 0 from file system and 1 from log tail - assert_eq!(result.len(), 2); - assert_eq!(result[0].version, 0); - assert_eq!(result[1].version, 1); - assert_source(&result[0], CommitSource::Filesystem); - assert_source(&result[1], CommitSource::Catalog); - } - - #[test] - fn test_log_tail_covers_entire_range_no_listing() { - // test-only storage handler that panics if you use it - struct StorageThatPanics {} - impl StorageHandler for StorageThatPanics { - fn list_from( - &self, - _path: &Url, - ) -> DeltaResult>>> { - panic!("list_from used"); - } - fn read_files( - &self, - _files: Vec, - ) -> DeltaResult>>> { - panic!("read_files used"); - } - fn copy_atomic(&self, src: &Url, dest: &Url) -> DeltaResult<()> { - panic!("copy used from {src} to {dest}"); - } - } - - // when log_tail covers the entire requested range, no filesystem listing should occur - // log_tail covers versions 0-2, which includes the entire range we'll request - let log_tail = vec![ - make_parsed_log_path_with_source(0, LogPathFileType::Commit, CommitSource::Catalog), - make_parsed_log_path_with_source(1, LogPathFileType::Commit, CommitSource::Catalog), - make_parsed_log_path_with_source( - 2, - LogPathFileType::StagedCommit, - CommitSource::Catalog, - ), - ]; - - let storage = StorageThatPanics {}; - let url = Url::parse("memory:///anything").unwrap(); - let result: Vec<_> = list_log_files(&storage, &url, log_tail, Some(0), Some(2)) - .unwrap() - .try_collect() - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0].version, 0); - assert_eq!(result[1].version, 1); - assert_eq!(result[2].version, 2); - assert_source(&result[0], CommitSource::Catalog); - assert_source(&result[1], CommitSource::Catalog); - assert_source(&result[2], CommitSource::Catalog); - } - - #[test] - fn test_listing_omits_staged_commits() { - // note that in the presence of staged commits, we CANNOT trust listing to determine which - // to include in our listing/log segment. This is up to the catalog. (e.g. version - // 5.uuid1.json and 5.uuid2.json can both exist and only catalog can say which is the 'real' - // version 5). 
- - let log_files = vec![ - (0, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::StagedCommit, CommitSource::Filesystem), - (2, LogPathFileType::StagedCommit, CommitSource::Filesystem), - ]; - - let (storage, log_root) = create_storage(log_files); - let result: Vec<_> = list_log_files(storage.as_ref(), &log_root, vec![], None, None) - .unwrap() - .try_collect() - .unwrap(); - - // we must only see two regular commits - assert_eq!(result.len(), 2); - assert_eq!(result[0].version, 0); - assert_eq!(result[1].version, 1); - assert_source(&result[0], CommitSource::Filesystem); - assert_source(&result[1], CommitSource::Filesystem); - } - - #[test] - fn test_listing_with_large_end_version() { - let log_files = vec![ - (0, LogPathFileType::Commit, CommitSource::Filesystem), - (1, LogPathFileType::Commit, CommitSource::Filesystem), - (2, LogPathFileType::StagedCommit, CommitSource::Filesystem), - ]; - - let (storage, log_root) = create_storage(log_files); - // note we let you request end version past the end of log. up to consumer to interpret - let result: Vec<_> = list_log_files(storage.as_ref(), &log_root, vec![], None, Some(3)) - .unwrap() - .try_collect() - .unwrap(); - - // we must only see two regular commits - assert_eq!(result.len(), 2); - assert_eq!(result[0].version, 0); - assert_eq!(result[1].version, 1); - } -} diff --git a/kernel/src/log_compaction/mod.rs b/kernel/src/log_compaction/mod.rs index 4600d8910a..c6f6064e05 100644 --- a/kernel/src/log_compaction/mod.rs +++ b/kernel/src/log_compaction/mod.rs @@ -27,12 +27,12 @@ //! //! ```no_run //! # use std::sync::Arc; -//! # use delta_kernel::{LogCompactionDataIterator, LogCompactionWriter}; +//! # use delta_kernel::{ActionReconciliationIterator, LogCompactionWriter}; //! # use delta_kernel::{Engine, Snapshot, DeltaResult, Error, FileMeta}; //! # use url::Url; //! //! // Engine-specific function to write compaction data -//! fn write_compaction_file(path: &Url, data: LogCompactionDataIterator) -> DeltaResult { +//! fn write_compaction_file(path: &Url, data: ActionReconciliationIterator) -> DeltaResult { //! // In a real implementation, this would write the data to cloud storage //! todo!("Write data batches to storage at path: {}", path) //! 
} @@ -84,7 +84,7 @@ use crate::schema::{SchemaRef, StructField, StructType, ToSchema as _}; mod writer; -pub use writer::{should_compact, LogCompactionDataIterator, LogCompactionWriter}; +pub use writer::{should_compact, LogCompactionWriter}; #[cfg(test)] mod tests; @@ -97,8 +97,8 @@ static COMPACTION_ACTIONS_SCHEMA: LazyLock = LazyLock::new(|| { StructField::nullable(REMOVE_NAME, Remove::to_schema()), StructField::nullable(METADATA_NAME, Metadata::to_schema()), StructField::nullable(PROTOCOL_NAME, Protocol::to_schema()), - StructField::nullable(DOMAIN_METADATA_NAME, DomainMetadata::to_schema()), StructField::nullable(SET_TRANSACTION_NAME, SetTransaction::to_schema()), + StructField::nullable(DOMAIN_METADATA_NAME, DomainMetadata::to_schema()), StructField::nullable(SIDECAR_NAME, Sidecar::to_schema()), ])) }); diff --git a/kernel/src/log_compaction/tests.rs b/kernel/src/log_compaction/tests.rs index 3c1940c54a..20ec0ceb97 100644 --- a/kernel/src/log_compaction/tests.rs +++ b/kernel/src/log_compaction/tests.rs @@ -91,7 +91,7 @@ fn test_writer_debug_impl() { let snapshot = create_mock_snapshot(); let writer = LogCompactionWriter::try_new(snapshot, 1, 5).unwrap(); - let debug_str = format!("{:?}", writer); + let debug_str = format!("{writer:?}"); assert!(debug_str.contains("LogCompactionWriter")); } @@ -106,13 +106,14 @@ fn test_compaction_data() { let iterator = result.unwrap(); - // Test iterator methods - assert_eq!(iterator.total_actions(), 0); - assert_eq!(iterator.total_add_actions(), 0); + // Test iterator stat initilize + let state = iterator.state(); + assert_eq!(state.actions_count(), 0); + assert_eq!(state.add_actions_count(), 0); // Test debug implementation - let debug_str = format!("{:?}", iterator); - assert!(debug_str.contains("LogCompactionDataIterator")); + let debug_str = format!("{iterator:?}"); + assert!(debug_str.contains("ActionReconciliationIterator")); assert!(debug_str.contains("actions_count")); assert!(debug_str.contains("add_actions_count")); } @@ -149,23 +150,24 @@ fn test_compaction_data_with_actual_iterator() { let mut writer = LogCompactionWriter::try_new(snapshot, 0, 1).unwrap(); let engine = SyncEngine::new(); - let mut iterator = writer.compaction_data(&engine).unwrap(); + let iterator = writer.compaction_data(&engine).unwrap(); + let state = iterator.state(); let mut batch_count = 0; - let initial_actions = iterator.total_actions(); - let initial_add_actions = iterator.total_add_actions(); + let initial_actions = state.actions_count(); + let initial_add_actions = state.add_actions_count(); // Both should start at 0 assert_eq!(initial_actions, 0); assert_eq!(initial_add_actions, 0); - while let Some(batch_result) = iterator.next() { + for batch_result in iterator { batch_count += 1; assert!(batch_result.is_ok()); // After processing some batches, the counts should be >= the initial counts - assert!(iterator.total_actions() >= initial_actions); - assert!(iterator.total_add_actions() >= initial_add_actions); + assert!(state.actions_count() >= initial_actions); + assert!(state.add_actions_count() >= initial_add_actions); } assert!(batch_count > 0, "Expected to process at least one batch"); @@ -199,9 +201,7 @@ fn test_compaction_paths() { let path = writer.compaction_path(); assert!( path.to_string().ends_with(expected_suffix), - "Path {} doesn't end with {}", - path, - expected_suffix + "Path {path} doesn't end with {expected_suffix}" ); } } @@ -223,21 +223,23 @@ fn test_version_filtering() { ); let iterator = result.unwrap(); - assert!(iterator.total_actions() 
>= 0); - assert!(iterator.total_add_actions() >= 0); + let state = iterator.state(); + assert!(state.actions_count() >= 0); + assert!(state.add_actions_count() >= 0); } } -#[test] -fn test_no_compaction_staged_commits() { +#[tokio::test] +async fn test_no_compaction_staged_commits() { use crate::actions::Add; - use crate::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; - use object_store::{memory::InMemory, path::Path, ObjectStore}; + use crate::engine::default::DefaultEngineBuilder; + use crate::object_store::{memory::InMemory, path::Path, ObjectStore}; + use crate::table_features::TableFeature; use std::sync::Arc; // Set up in-memory store let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create basic commits with proper metadata and protocol use crate::actions::{Metadata, Protocol}; @@ -248,7 +250,10 @@ fn test_no_compaction_staged_commits() { Metadata::try_new( Some("test-table".into()), None, - StructType::new_unchecked([StructField::nullable("value", KernelDataType::INTEGER)]), + Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + KernelDataType::INTEGER, + )])), vec![], 0, std::collections::HashMap::new(), @@ -256,7 +261,7 @@ fn test_no_compaction_staged_commits() { .unwrap(), ); let protocol = Action::Protocol( - Protocol::try_new(3, 7, Some(Vec::::new()), Some(Vec::::new())).unwrap(), + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(), ); let metadata_action = serde_json::to_string(&metadata).unwrap(); @@ -264,36 +269,30 @@ fn test_no_compaction_staged_commits() { // Write version 0 let commit_0_path = Path::from("_delta_log/00000000000000000000.json"); - futures::executor::block_on(async { - store - .put( - &commit_0_path, - format!("{}\n{}", metadata_action, protocol_action).into(), - ) - .await - .unwrap(); - }); + store + .put( + &commit_0_path, + format!("{metadata_action}\n{protocol_action}").into(), + ) + .await + .unwrap(); // Write version 1 let add_action = serde_json::to_string(&Action::Add(Add::default())).unwrap(); let commit_1_path = Path::from("_delta_log/00000000000000000001.json"); - futures::executor::block_on(async { - store - .put(&commit_1_path, add_action.clone().into()) - .await - .unwrap(); - }); + store + .put(&commit_1_path, add_action.clone().into()) + .await + .unwrap(); // Write a staged commit (this would normally be filtered out during listing) let staged_commit_path = Path::from( "_delta_log/_staged_commits/00000000000000000002.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json", ); - futures::executor::block_on(async { - store - .put(&staged_commit_path, add_action.into()) - .await - .unwrap(); - }); + store + .put(&staged_commit_path, add_action.into()) + .await + .unwrap(); let table_root = url::Url::parse("memory:///").unwrap(); let snapshot = Snapshot::builder_for(table_root).build(&engine).unwrap(); diff --git a/kernel/src/log_compaction/writer.rs b/kernel/src/log_compaction/writer.rs index 165e5c2020..dcf8acbf97 100644 --- a/kernel/src/log_compaction/writer.rs +++ b/kernel/src/log_compaction/writer.rs @@ -1,11 +1,8 @@ use url::Url; use super::COMPACTION_ACTIONS_SCHEMA; -use crate::action_reconciliation::log_replay::{ - ActionReconciliationBatch, ActionReconciliationProcessor, -}; -use crate::action_reconciliation::RetentionCalculator; -use crate::engine_data::FilteredEngineData; +use 
crate::action_reconciliation::log_replay::ActionReconciliationProcessor; +use crate::action_reconciliation::{ActionReconciliationIterator, RetentionCalculator}; use crate::log_replay::LogReplayProcessor; use crate::log_segment::LogSegment; use crate::path::ParsedLogPath; @@ -53,9 +50,9 @@ impl LogCompactionWriter { ))); } - // We disallow compaction if the LogSegment contains any unpublished commits. (could create - // gaps in the version history, thereby breaking old readers) - snapshot.log_segment().validate_no_staged_commits()?; + // We disallow log compaction if the Snapshot is not published. If we didn't, this could + // create gaps in the version history, thereby breaking old readers. + snapshot.log_segment().validate_published()?; // Compute the compaction path once during construction let compaction_path = @@ -80,7 +77,7 @@ impl LogCompactionWriter { pub fn compaction_data( &mut self, engine: &dyn Engine, - ) -> DeltaResult { + ) -> DeltaResult { // Validate that the requested version range is within the snapshot's range let snapshot_end_version = self.snapshot.version(); if self.end_version > snapshot_end_version { @@ -100,11 +97,8 @@ impl LogCompactionWriter { )?; // Read actions from the version-filtered log segment - let actions_iter = compaction_log_segment.read_actions( - engine, - COMPACTION_ACTIONS_SCHEMA.clone(), - None, // No predicate - we want all actions in the version range - )?; + let actions_iter = + compaction_log_segment.read_actions(engine, COMPACTION_ACTIONS_SCHEMA.clone())?; let min_file_retention_timestamp_millis = self.deleted_file_retention_timestamp()?; @@ -119,70 +113,7 @@ impl LogCompactionWriter { // The processor handles reverse chronological processing internally let result_iter = processor.process_actions_iter(actions_iter); - // Wrap the iterator in a LogCompactionDataIterator to track action counts lazily - Ok(LogCompactionDataIterator::new(Box::new(result_iter))) - } -} - -/// Iterator over log compaction data. Provides the reconciled actions that should be written -/// to the compaction file. -pub struct LogCompactionDataIterator { - /// The nested iterator that yields compaction batches with action counts - pub(crate) compaction_batch_iterator: - Box> + Send>, - /// Running total of actions included in the compaction - pub(crate) actions_count: i64, - /// Running total of add actions included in the compaction - pub(crate) add_actions_count: i64, -} - -impl LogCompactionDataIterator { - /// Create a new LogCompactionDataIterator with counters initialized to 0 - pub(crate) fn new( - compaction_batch_iterator: Box< - dyn Iterator> + Send, - >, - ) -> Self { - Self { - compaction_batch_iterator, - actions_count: 0, - add_actions_count: 0, - } - } - - /// Get the total number of actions in the compaction - /// We don't use it currently, leaving it on as a useful observabilty feature. - #[allow(dead_code)] - pub(crate) fn total_actions(&self) -> i64 { - self.actions_count - } - - /// Get the total number of add actions in the compaction - /// We don't use it currently, leaving it on as a useful observabilty feature. 
- #[allow(dead_code)] - pub(crate) fn total_add_actions(&self) -> i64 { - self.add_actions_count - } -} - -impl std::fmt::Debug for LogCompactionDataIterator { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LogCompactionDataIterator") - .field("actions_count", &self.actions_count) - .field("add_actions_count", &self.add_actions_count) - .finish() - } -} - -impl Iterator for LogCompactionDataIterator { - type Item = DeltaResult; - - /// Advances the iterator and returns the next value. - fn next(&mut self) -> Option { - Some(self.compaction_batch_iterator.next()?.map(|batch| { - self.actions_count += batch.actions_count; - self.add_actions_count += batch.add_actions_count; - batch.filtered_data - })) + // Wrap the iterator to track action counts lazily + Ok(ActionReconciliationIterator::new(Box::new(result_iter))) } } diff --git a/kernel/src/log_path.rs b/kernel/src/log_path.rs index bbfa37f8b7..fe468873c7 100644 --- a/kernel/src/log_path.rs +++ b/kernel/src/log_path.rs @@ -44,23 +44,27 @@ impl LogPath { last_modified: i64, size: FileSize, ) -> DeltaResult { + let commit_path = Self::staged_commit_url(table_root, filename)?; + let file_meta = FileMeta { + location: commit_path, + last_modified, + size, + }; + LogPath::try_new(file_meta) + } + + /// Create the URL for a staged commit file given the table root and filename. The table_root + /// must point to the root of the table and end with a '/'. + pub fn staged_commit_url(table_root: Url, filename: &str) -> DeltaResult { // TODO: we should introduce TablePath/LogPath types which enforce checks like ending '/' if !table_root.path().ends_with('/') { return Err(Error::invalid_table_location(table_root)); } - - let commit_path = table_root + table_root .join("_delta_log/") .and_then(|url| url.join("_staged_commits/")) .and_then(|url| url.join(filename)) - .map_err(|_| Error::invalid_table_location(table_root))?; - - let file_meta = FileMeta { - location: commit_path, - last_modified, - size, - }; - LogPath::try_new(file_meta) + .map_err(|_| Error::invalid_table_location(table_root)) } } diff --git a/kernel/src/log_reader/checkpoint_manifest.rs b/kernel/src/log_reader/checkpoint_manifest.rs new file mode 100644 index 0000000000..3d2d59de02 --- /dev/null +++ b/kernel/src/log_reader/checkpoint_manifest.rs @@ -0,0 +1,260 @@ +//! Manifest phase for log replay - processes single-part checkpoints and manifest checkpoints. + +use std::sync::{Arc, LazyLock}; + +use itertools::Itertools; +use url::Url; + +use crate::actions::visitors::SidecarVisitor; +use crate::actions::{Add, Remove, Sidecar, ADD_NAME}; +use crate::actions::{REMOVE_NAME, SIDECAR_NAME}; +use crate::log_replay::ActionsBatch; +use crate::path::ParsedLogPath; +use crate::schema::{SchemaRef, StructField, StructType, ToSchema}; +use crate::utils::require; +use crate::{DeltaResult, Engine, Error, FileMeta, RowVisitor}; + +/// Phase that processes single-part checkpoint. This also treats the checkpoint as a manifest file +/// and extracts the sidecar actions during iteration. +#[allow(unused)] +pub(crate) struct CheckpointManifestReader { + actions: Box> + Send>, + sidecar_visitor: SidecarVisitor, + log_root: Url, + is_complete: bool, + manifest_file: FileMeta, +} + +impl CheckpointManifestReader { + /// Create a new manifest phase for a single-part checkpoint. + /// + /// The schema is automatically augmented with the sidecar column since the manifest + /// phase needs to extract sidecar references for phase transitions. 
+ /// + /// # Parameters + /// - `manifest_file`: The checkpoint manifest file to process + /// - `log_root`: Root URL for resolving sidecar paths + /// - `engine`: Engine for reading files + #[allow(unused)] + pub(crate) fn try_new( + engine: Arc, + manifest: &ParsedLogPath, + log_root: Url, + ) -> DeltaResult { + static MANIFEST_READ_SCHMEA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new_unchecked([ + StructField::nullable(ADD_NAME, Add::to_schema()), + StructField::nullable(REMOVE_NAME, Remove::to_schema()), + StructField::nullable(SIDECAR_NAME, Sidecar::to_schema()), + ])) + }); + + let actions = match manifest.extension.as_str() { + "json" => engine.json_handler().read_json_files( + std::slice::from_ref(&manifest.location), + MANIFEST_READ_SCHMEA.clone(), + None, + )?, + "parquet" => engine.parquet_handler().read_parquet_files( + std::slice::from_ref(&manifest.location), + MANIFEST_READ_SCHMEA.clone(), + None, + )?, + extension => { + return Err(Error::generic(format!( + "Unsupported checkpoint extension: {extension}", + ))); + } + }; + + let actions = Box::new(actions.map_ok(|batch_res| ActionsBatch::new(batch_res, false))); + Ok(Self { + actions, + sidecar_visitor: SidecarVisitor::default(), + log_root, + is_complete: false, + manifest_file: manifest.location.clone(), + }) + } + + /// Extract the sidecars from the manifest file if there were any. + /// NOTE: The iterator must be completely exhausted before calling this + #[allow(unused)] + pub(crate) fn extract_sidecars(self) -> DeltaResult> { + require!( + self.is_complete, + Error::generic(format!( + "Cannot extract sidecars from in-progress ManifestReader for file: {}", + self.manifest_file.location + )) + ); + + let sidecars: Vec<_> = self + .sidecar_visitor + .sidecars + .into_iter() + .map(|s| s.to_filemeta(&self.log_root)) + .try_collect()?; + + Ok(sidecars) + } +} + +impl Iterator for CheckpointManifestReader { + type Item = DeltaResult; + + fn next(&mut self) -> Option { + let Some(result) = self.actions.next() else { + self.is_complete = true; + return None; + }; + + Some(result.and_then(|batch| { + self.sidecar_visitor.visit_rows_of(batch.actions())?; + Ok(batch) + })) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::array::{Array, StringArray, StructArray}; + use crate::engine::arrow_data::EngineDataArrowExt as _; + use crate::utils::test_utils::{assert_result_error_with_message, load_test_table}; + use crate::SnapshotRef; + + use itertools::Itertools; + use std::sync::Arc; + + /// Helper function to test manifest phase with expected add paths and sidecars + fn verify_manifest_phase( + engine: Arc, + snapshot: SnapshotRef, + expected_add_paths: &[&str], + expected_sidecars: &[&str], + ) -> DeltaResult<()> { + let log_segment = snapshot.log_segment(); + let log_root = log_segment.log_root.clone(); + assert_eq!(log_segment.listed.checkpoint_parts.len(), 1); + let checkpoint_file = &log_segment.listed.checkpoint_parts[0]; + let mut manifest_phase = + CheckpointManifestReader::try_new(engine.clone(), checkpoint_file, log_root)?; + + // Extract add file paths and verify expectations + let mut file_paths = vec![]; + for result in manifest_phase.by_ref() { + let batch = result?; + let ActionsBatch { + actions, + is_log_batch, + } = batch; + assert!(!is_log_batch, "Manifest should not be a log batch"); + + let record_batch = actions.try_into_record_batch()?; + let add = record_batch.column_by_name("add").unwrap(); + let add_struct = add.as_any().downcast_ref::().unwrap(); + let path = add_struct 
+ .column_by_name("path") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let batch_paths = path.iter().flatten().map(ToString::to_string).collect_vec(); + file_paths.extend(batch_paths); + } + + // Verify collected add paths + file_paths.sort(); + assert_eq!( + file_paths, expected_add_paths, + "CheckpointManifestReader should extract expected Add file paths from checkpoint" + ); + + // Check sidecars + let actual_sidecars = manifest_phase.extract_sidecars()?; + + assert_eq!( + actual_sidecars.len(), + expected_sidecars.len(), + "Should collect exactly {} actual_sidecars", + expected_sidecars.len() + ); + + // Extract and verify the sidecar paths + let mut collected_paths: Vec = actual_sidecars + .iter() + .map(|fm| { + fm.location + .path_segments() + .and_then(|mut segments| segments.next_back()) + .unwrap_or("") + .to_string() + }) + .collect(); + + collected_paths.sort(); + // Verify they're the expected sidecar files + assert_eq!(collected_paths, expected_sidecars.to_vec()); + + Ok(()) + } + + #[test] + fn test_manifest_phase_extracts_file_paths() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("with_checkpoint_no_last_checkpoint")?; + verify_manifest_phase( + engine, + snapshot, + &["part-00000-a190be9e-e3df-439e-b366-06a863f51e99-c000.snappy.parquet"], + &[], + ) + } + + #[test] + fn test_manifest_phase_early_finalize_error() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("with_checkpoint_no_last_checkpoint")?; + + let manifest_phase = CheckpointManifestReader::try_new( + engine.clone(), + &snapshot.log_segment().listed.checkpoint_parts[0], + snapshot.log_segment().log_root.clone(), + )?; + + let result = manifest_phase.extract_sidecars(); + assert_result_error_with_message( + result, + "Cannot extract sidecars from in-progress ManifestReader for file", + ); + Ok(()) + } + + #[test] + fn test_manifest_phase_collects_sidecars() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("v2-checkpoints-json-with-sidecars")?; + verify_manifest_phase( + engine, + snapshot, + &[], + &[ + "00000000000000000006.checkpoint.0000000001.0000000002.19af1366-a425-47f4-8fa6-8d6865625573.parquet", + "00000000000000000006.checkpoint.0000000002.0000000002.5008b69f-aa8a-4a66-9299-0733a56a7e63.parquet", + ], + ) + } + + #[test] + fn test_manifest_phase_collects_sidecars_parquet() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("v2-checkpoints-parquet-with-sidecars")?; + verify_manifest_phase( + engine, + snapshot, + &[], + &[ + "00000000000000000006.checkpoint.0000000001.0000000002.76931b15-ead3-480d-b86c-afe55a577fc3.parquet", + "00000000000000000006.checkpoint.0000000002.0000000002.4367b29c-0e87-447f-8e81-9814cc01ad1f.parquet", + ], + ) + } +} diff --git a/kernel/src/log_reader/commit.rs b/kernel/src/log_reader/commit.rs new file mode 100644 index 0000000000..34f128db1c --- /dev/null +++ b/kernel/src/log_reader/commit.rs @@ -0,0 +1,103 @@ +//! Commit phase for log replay - processes JSON commit files. + +use itertools::Itertools; + +use crate::log_replay::ActionsBatch; +use crate::log_segment::LogSegment; +use crate::schema::SchemaRef; +use crate::{DeltaResult, Engine}; + +/// Phase that processes JSON commit files into [`ActionsBatch`]s +pub(crate) struct CommitReader { + actions: Box> + Send>, +} + +impl CommitReader { + /// Create a new commit phase from a log segment. 
+ /// + /// # Parameters + /// - `engine`: Engine for reading files + /// - `log_segment`: The log segment to process + /// - `schema`: The schema to read the json files + pub(crate) fn try_new( + engine: &dyn Engine, + log_segment: &LogSegment, + schema: SchemaRef, + ) -> DeltaResult { + let commit_files = log_segment.find_commit_cover(); + let actions = engine + .json_handler() + .read_json_files(&commit_files, schema, None)? + .map_ok(|batch| ActionsBatch::new(batch, true)); + + Ok(Self { + actions: Box::new(actions), + }) + } +} + +impl Iterator for CommitReader { + type Item = DeltaResult; + + fn next(&mut self) -> Option { + self.actions.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::arrow::array::{StringArray, StructArray}; + use crate::engine::arrow_data::EngineDataArrowExt as _; + use crate::scan::COMMIT_READ_SCHEMA; + use crate::utils::test_utils::load_test_table; + use itertools::Itertools; + use std::sync::Arc; + + #[test] + fn test_commit_phase_processes_commits() -> Result<(), Box> { + let (engine, snapshot, _tempdir) = load_test_table("app-txn-no-checkpoint")?; + let log_segment = Arc::new(snapshot.log_segment().clone()); + + let schema = COMMIT_READ_SCHEMA.clone(); + let commit_phase = CommitReader::try_new(engine.as_ref(), &log_segment, schema)?; + + let mut file_paths = vec![]; + for result in commit_phase { + let batch = result?; + let ActionsBatch { + actions, + is_log_batch, + } = batch; + assert!(is_log_batch); + + let record_batch = actions.try_into_record_batch()?; + let add = record_batch.column_by_name("add").unwrap(); + let add_struct = add.as_any().downcast_ref::().unwrap(); + + let path = add_struct + .column_by_name("path") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let batch_paths = path.iter().flatten().map(ToString::to_string).collect_vec(); + file_paths.extend(batch_paths); + } + + file_paths.sort(); + let expected_files = vec![ + "modified=2021-02-01/part-00001-80996595-a345-43b7-b213-e247d6f091f7-c000.snappy.parquet", + "modified=2021-02-01/part-00001-8ebcaf8b-0f48-4213-98c9-5c2156d20a7e-c000.snappy.parquet", + "modified=2021-02-02/part-00001-9a16b9f6-c12a-4609-a9c4-828eacb9526a-c000.snappy.parquet", + "modified=2021-02-02/part-00001-bfac5c74-426e-410f-ab74-21a64e518e9c-c000.snappy.parquet", + ]; + assert_eq!( + file_paths, expected_files, + "CommitReader should find exactly the expected files" + ); + + Ok(()) + } +} diff --git a/kernel/src/log_reader/mod.rs b/kernel/src/log_reader/mod.rs new file mode 100644 index 0000000000..b3c344e09b --- /dev/null +++ b/kernel/src/log_reader/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod checkpoint_manifest; +pub(crate) mod commit; diff --git a/kernel/src/log_replay.rs b/kernel/src/log_replay.rs index 9f4016da10..2073f3a89d 100644 --- a/kernel/src/log_replay.rs +++ b/kernel/src/log_replay.rs @@ -13,21 +13,24 @@ //! This module provides structures for efficient batch processing, focusing on file action //! deduplication with `FileActionDeduplicator` which tracks unique files across log batches //! to minimize memory usage for tables with extensive history. 
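To make the deduplication rule concrete, here is a stripped-down, self-contained sketch of the check-and-record step (plain Rust with a stand-in key type, not the kernel's own types): during newest-to-oldest replay only the first sighting of each (path, deletion-vector id) key is kept, and only commit batches register new keys. It mirrors the contract documented on `check_and_record_seen` in the hunk that follows.

```rust
use std::collections::HashSet;

/// Stand-in for `FileActionKey`: file path plus optional deletion vector unique id.
type Key = (String, Option<String>);

/// Returns `true` if the action was already seen (and should be skipped),
/// `false` if it is the first sighting and should be processed.
fn check_and_record_seen(seen: &mut HashSet<Key>, key: Key, is_log_batch: bool) -> bool {
    if seen.contains(&key) {
        true // a newer commit already added or removed this file
    } else {
        // Checkpoint batches are replayed last, so their keys never need to be
        // remembered for later batches; commit batches do.
        if is_log_batch {
            seen.insert(key);
        }
        false
    }
}
```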
-use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::engine_data::{GetData, TypedGetData}; +use crate::engine_data::GetData; +use crate::log_replay::deduplicator::Deduplicator; use crate::scan::data_skipping::DataSkippingFilter; use crate::{DeltaResult, EngineData}; use delta_kernel_derive::internal_api; use std::collections::HashSet; +use std::sync::Arc; use tracing::debug; +pub(crate) mod deduplicator; + /// The subset of file action fields that uniquely identifies it in the log, used for deduplication /// of adds and removes during log replay. -#[derive(Debug, Hash, Eq, PartialEq)] -pub(crate) struct FileActionKey { +#[derive(Debug, Hash, Eq, PartialEq, serde::Serialize, serde::Deserialize, Clone)] +pub struct FileActionKey { pub(crate) path: String, pub(crate) dv_unique_id: Option, } @@ -56,7 +59,8 @@ pub(crate) struct FileActionDeduplicator<'seen> { seen_file_keys: &'seen mut HashSet, // TODO: Consider renaming to `is_commit_batch`, `deduplicate_batch`, or `save_batch` // to better reflect its role in deduplication logic. - /// Whether we're processing a log batch (as opposed to a checkpoint) + /// Whether we're processing a commit log JSON file (`true`) or a checkpoint file (`false`). + /// When `true`, file actions are added to `seen_file_keys` as they're processed. is_log_batch: bool, /// Index of the getter containing the add.path column add_path_index: usize, @@ -86,12 +90,14 @@ impl<'seen> FileActionDeduplicator<'seen> { remove_dv_start_index, } } +} +impl Deduplicator for FileActionDeduplicator<'_> { /// Checks if log replay already processed this logical file (in which case the current action /// should be ignored). If not already seen, register it so we can recognize future duplicates. /// Returns `true` if we have seen the file and should ignore it, `false` if we have not seen it /// and should process it. - pub(crate) fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { // Note: each (add.path + add.dv_unique_id()) pair has a // unique Add + Remove pair in the log. For example: // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json @@ -117,35 +123,6 @@ impl<'seen> FileActionDeduplicator<'seen> { } } - /// Extracts the deletion vector unique ID if it exists. - /// - /// This function retrieves the necessary fields for constructing a deletion vector unique ID - /// by accessing `getters` at `dv_start_index` and the following two indices. Specifically: - /// - `dv_start_index` retrieves the storage type (`deletionVector.storageType`). - /// - `dv_start_index + 1` retrieves the path or inline deletion vector (`deletionVector.pathOrInlineDv`). - /// - `dv_start_index + 2` retrieves the optional offset (`deletionVector.offset`). - fn extract_dv_unique_id<'a>( - &self, - i: usize, - getters: &[&'a dyn GetData<'a>], - dv_start_index: usize, - ) -> DeltaResult> { - match getters[dv_start_index].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => { - let path_or_inline = - getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?; - let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?; - - Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - path_or_inline, - offset, - ))) - } - None => Ok(None), - } - } - /// Extracts a file action key and determines if it's an add operation. 
/// This method examines the data at the given index using the provided getters /// to identify whether a file action exists and what type it is. @@ -159,7 +136,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// - `Ok(Some((key, is_add)))`: When a file action is found, returns the key and whether it's an add operation /// - `Ok(None)`: When no file action is found /// - `Err(...)`: On any error during extraction - pub(crate) fn extract_file_action<'a>( + fn extract_file_action<'a>( &self, i: usize, getters: &[&'a dyn GetData<'a>], @@ -190,7 +167,7 @@ impl<'seen> FileActionDeduplicator<'seen> { /// /// `true` indicates we are processing a batch from a commit file. /// `false` indicates we are processing a batch from a checkpoint. - pub(crate) fn is_log_batch(&self) -> bool { + fn is_log_batch(&self) -> bool { self.is_log_batch } } @@ -227,6 +204,23 @@ impl ActionsBatch { } } +#[internal_api] +pub(crate) trait ParallelLogReplayProcessor { + type Output; + fn process_actions_batch(&self, actions_batch: ActionsBatch) -> DeltaResult; +} + +impl ParallelLogReplayProcessor for Arc +where + T: ParallelLogReplayProcessor, +{ + type Output = T::Output; + + fn process_actions_batch(&self, actions_batch: ActionsBatch) -> DeltaResult { + T::process_actions_batch(self, actions_batch) + } +} + /// A trait for processing batches of actions from Delta transaction logs during log replay. /// /// Log replay processors scan transaction logs in **reverse chronological order** (newest to oldest), @@ -280,6 +274,8 @@ impl ActionsBatch { /// filtered by the **selection vector** to determine which rows are included in the final checkpoint. /// /// TODO: Refactor the Change Data Feed (CDF) processor to use this trait. +#[allow(rustdoc::broken_intra_doc_links, rustdoc::private_intra_doc_links)] +#[internal_api] pub(crate) trait LogReplayProcessor: Sized { /// The type of results produced by this processor must implement the /// [`HasSelectionVector`] trait to allow filtering out batches with no selected rows. @@ -318,9 +314,9 @@ pub(crate) trait LogReplayProcessor: Sized { action_iter .map(move |actions_batch| self.process_actions_batch(actions_batch?)) .filter(|res| { - // TODO: Leverage .is_none_or() when msrv = 1.82 res.as_ref() - .map_or(true, |result| result.has_selected_rows()) + .ok() + .is_none_or(|result| result.has_selected_rows()) }) } @@ -353,7 +349,345 @@ pub(crate) trait LogReplayProcessor: Sized { /// This trait is used to determine if a processor's output contains any selected rows. /// This is used to filter out batches with no selected rows from the log replay results. 
+#[internal_api] pub(crate) trait HasSelectionVector { /// Check if the selection vector contains at least one selected row fn has_selected_rows(&self) -> bool; } + +#[cfg(test)] +mod tests { + use super::deduplicator::CheckpointDeduplicator; + use super::*; + use crate::engine_data::GetData; + use crate::DeltaResult; + use std::collections::{HashMap, HashSet}; + + /// Mock GetData implementation for testing + struct MockGetData { + string_values: HashMap<(usize, String), String>, + int_values: HashMap<(usize, String), i32>, + errors: HashMap<(usize, String), String>, + } + + impl MockGetData { + fn new() -> Self { + Self { + string_values: HashMap::new(), + int_values: HashMap::new(), + errors: HashMap::new(), + } + } + + fn add_string(&mut self, row: usize, field: &str, value: &str) { + self.string_values + .insert((row, field.to_string()), value.to_string()); + } + + fn add_int(&mut self, row: usize, field: &str, value: i32) { + self.int_values.insert((row, field.to_string()), value); + } + } + + impl<'a> GetData<'a> for MockGetData { + fn get_str(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + if let Some(error_msg) = self.errors.get(&(row_index, field_name.to_string())) { + return Err(crate::Error::Generic(error_msg.clone())); + } + Ok(self + .string_values + .get(&(row_index, field_name.to_string())) + .map(|s| s.as_str())) + } + + fn get_int(&'a self, row_index: usize, field_name: &str) -> DeltaResult> { + if let Some(error_msg) = self.errors.get(&(row_index, field_name.to_string())) { + return Err(crate::Error::Generic(error_msg.clone())); + } + Ok(self + .int_values + .get(&(row_index, field_name.to_string())) + .cloned()) + } + } + + /// Helper to create a FileActionDeduplicator with standard indices + fn create_deduplicator( + seen: &mut HashSet, + is_log_batch: bool, + ) -> FileActionDeduplicator<'_> { + FileActionDeduplicator::new( + seen, + is_log_batch, + 0, // add_path_index + 5, // remove_path_index + 2, // add_dv_start_index + 6, // remove_dv_start_index + ) + } + + /// Helper to create a getters array with mocks at specific positions + fn create_getters_with_mocks<'a>( + add_mock: Option<&'a MockGetData>, + remove_mock: Option<&'a MockGetData>, + ) -> Vec<&'a dyn GetData<'a>> { + use std::sync::LazyLock; + static EMPTY: LazyLock = LazyLock::new(MockGetData::new); + + let empty_ref = &*EMPTY; + vec![ + add_mock.unwrap_or(empty_ref), // 0: add.path + empty_ref, // 1: (unused) + add_mock.unwrap_or(empty_ref), // 2: add.dv.storageType + add_mock.unwrap_or(empty_ref), // 3: add.dv.pathOrInlineDv + add_mock.unwrap_or(empty_ref), // 4: add.dv.offset + remove_mock.unwrap_or(empty_ref), // 5: remove.path + remove_mock.unwrap_or(empty_ref), // 6: remove.dv.storageType + remove_mock.unwrap_or(empty_ref), // 7: remove.dv.pathOrInlineDv + remove_mock.unwrap_or(empty_ref), // 8: remove.dv.offset + ] + } + + #[test] + fn test_extract_file_action_add() -> DeltaResult<()> { + let mut seen = HashSet::new(); + let deduplicator = create_deduplicator(&mut seen, true); + + let mut mock_add = MockGetData::new(); + mock_add.add_string(0, "add.path", "file1.parquet"); + let getters = create_getters_with_mocks(Some(&mock_add), None); + let result = deduplicator.extract_file_action(0, &getters, false)?; + + assert!(result.is_some()); + let (key, is_add) = result.unwrap(); + assert_eq!(key.path, "file1.parquet"); + assert!(key.dv_unique_id.is_none()); + assert!(is_add); + + Ok(()) + } + + #[test] + fn test_extract_file_action_remove() -> DeltaResult<()> { + let mut seen = 
HashSet::new(); + let deduplicator = create_deduplicator(&mut seen, true); + + let mut mock_remove = MockGetData::new(); + mock_remove.add_string(0, "remove.path", "file2.parquet"); + let getters = create_getters_with_mocks(None, Some(&mock_remove)); + let result = deduplicator.extract_file_action(0, &getters, false)?; + + assert!(result.is_some()); + let (key, is_add) = result.unwrap(); + assert_eq!(key.path, "file2.parquet"); + assert!(!is_add); + + Ok(()) + } + + #[test] + fn test_extract_file_action_with_deletion_vector() -> DeltaResult<()> { + let mut seen = HashSet::new(); + let deduplicator = create_deduplicator(&mut seen, true); + + let mut mock_dv = MockGetData::new(); + mock_dv.add_string(0, "add.path", "file_with_dv.parquet"); + mock_dv.add_string(0, "deletionVector.storageType", "s3"); + mock_dv.add_string(0, "deletionVector.pathOrInlineDv", "path/to/dv"); + mock_dv.add_int(0, "deletionVector.offset", 100); + let getters = create_getters_with_mocks(Some(&mock_dv), None); + let result = deduplicator.extract_file_action(0, &getters, false)?; + + assert!(result.is_some()); + let (key, is_add) = result.unwrap(); + assert!(matches!( + key.dv_unique_id.as_deref(), + Some("s3path/to/dv@100") + )); + assert!(is_add); + + Ok(()) + } + + #[test] + fn test_extract_file_action_skip_removes() -> DeltaResult<()> { + let mut seen = HashSet::new(); + let deduplicator = create_deduplicator(&mut seen, true); + + let mut mock_remove = MockGetData::new(); + mock_remove.add_string(0, "remove.path", "file2.parquet"); + let getters = create_getters_with_mocks(None, Some(&mock_remove)); + + // With skip_removes=true, should return None + assert!(deduplicator + .extract_file_action(0, &getters, true)? + .is_none()); + + // With skip_removes=false, should return Some + assert!(deduplicator + .extract_file_action(0, &getters, false)? + .is_some()); + + Ok(()) + } + + #[test] + fn test_extract_file_action_no_action_found() -> DeltaResult<()> { + let mut seen = HashSet::new(); + let deduplicator = create_deduplicator(&mut seen, true); + + let getters = create_getters_with_mocks(None, None); + assert!(deduplicator + .extract_file_action(0, &getters, false)? 
+ .is_none()); + + Ok(()) + } + + #[test] + fn test_check_and_record_seen() { + let mut seen = HashSet::new(); + + // Pre-populate with an existing key + let pre_existing_key = FileActionKey::new("existing.parquet", None); + seen.insert(pre_existing_key.clone()); + + let key1 = FileActionKey::new("file1.parquet", None); + let key2 = FileActionKey::new("file2.parquet", None); + let key_with_dv = FileActionKey::new("file1.parquet", Some("dv1".to_string())); + + // Test with log batch (should record keys) + { + let mut deduplicator = create_deduplicator(&mut seen, true); + + // Pre-existing key should be detected as duplicate + assert!(deduplicator.check_and_record_seen(pre_existing_key.clone())); + + // First time seeing keys, should return false and record them + assert!(!deduplicator.check_and_record_seen(key1.clone())); + assert!(!deduplicator.check_and_record_seen(key2.clone())); + assert!(!deduplicator.check_and_record_seen(key_with_dv.clone())); + + // Second time seeing keys, should return true (duplicates) + assert!(deduplicator.check_and_record_seen(key1.clone())); + assert!(deduplicator.check_and_record_seen(key_with_dv.clone())); + } + + // Keys should be recorded in seen set + assert!(seen.contains(&key1)); + assert!(seen.contains(&key2)); + assert!(seen.contains(&key_with_dv)); + + // Test with checkpoint batch (should NOT record keys) + { + let mut deduplicator = create_deduplicator(&mut seen, false); + + let new_key = FileActionKey::new("new.parquet", None); + + // First time seeing new_key in checkpoint, should return false but NOT record it + assert!(!deduplicator.check_and_record_seen(new_key.clone())); + // Still returns false on second call (not recorded) + assert!(!deduplicator.check_and_record_seen(new_key.clone())); + + // Existing keys from seen set should still be detected + assert!(deduplicator.check_and_record_seen(key1.clone())); + } + } + + #[test] + fn test_is_log_batch() { + let mut seen = HashSet::new(); + + // Test with is_log_batch = true + let deduplicator_log = create_deduplicator(&mut seen, true); + assert!(deduplicator_log.is_log_batch()); + + // Test with is_log_batch = false + let deduplicator_checkpoint = create_deduplicator(&mut seen, false); + assert!(!deduplicator_checkpoint.is_log_batch()); + } + + // ==================== CheckpointDeduplicator Tests ==================== + + #[test] + fn test_checkpoint_extract_file_action_add() -> DeltaResult<()> { + let seen = HashSet::new(); + let deduplicator = CheckpointDeduplicator::try_new(&seen, 0, 2)?; + + let mut mock_add = MockGetData::new(); + mock_add.add_string(0, "add.path", "checkpoint_file.parquet"); + let getters = create_getters_with_mocks(Some(&mock_add), None); + let result = deduplicator.extract_file_action(0, &getters, false)?; + + assert!(result.is_some()); + let (key, is_add) = result.unwrap(); + assert_eq!(key.path, "checkpoint_file.parquet"); + assert!(key.dv_unique_id.is_none()); + assert!(is_add); + + Ok(()) + } + + #[test] + fn test_checkpoint_extract_file_action_with_deletion_vector() -> DeltaResult<()> { + let seen = HashSet::new(); + let deduplicator = CheckpointDeduplicator::try_new(&seen, 0, 2)?; + + let mut mock_dv = MockGetData::new(); + mock_dv.add_string(0, "add.path", "file_with_dv.parquet"); + mock_dv.add_string(0, "deletionVector.storageType", "s3"); + mock_dv.add_string(0, "deletionVector.pathOrInlineDv", "path/to/dv"); + mock_dv.add_int(0, "deletionVector.offset", 100); + let getters = create_getters_with_mocks(Some(&mock_dv), None); + let result = 
deduplicator.extract_file_action(0, &getters, false)?; + + assert!(result.is_some()); + let (key, is_add) = result.unwrap(); + assert_eq!(key.path, "file_with_dv.parquet"); + assert!(matches!( + key.dv_unique_id.as_deref(), + Some("s3path/to/dv@100") + )); + assert!(is_add); + + Ok(()) + } + + #[test] + fn test_checkpoint_deduplicator_filters_commit_duplicates() -> DeltaResult<()> { + let mut seen = HashSet::new(); + + // Files "seen" during commit processing + seen.insert(FileActionKey::new("modified_in_commit.parquet", None)); + seen.insert(FileActionKey::new( + "modified_with_dv.parquet", + Some("dv123".to_string()), + )); + + let mut deduplicator = CheckpointDeduplicator::try_new(&seen, 0, 2)?; + + // File modified in commit - should be filtered from checkpoint + let commit_modified = FileActionKey::new("modified_in_commit.parquet", None); + assert!( + deduplicator.check_and_record_seen(commit_modified), + "Files seen in commits should be filtered from checkpoint" + ); + + // File with DV modified in commit - should be filtered + let commit_modified_dv = + FileActionKey::new("modified_with_dv.parquet", Some("dv123".to_string())); + assert!( + deduplicator.check_and_record_seen(commit_modified_dv), + "Files with DVs seen in commits should be filtered from checkpoint" + ); + + // File only in checkpoint - should NOT be filtered + let checkpoint_only = FileActionKey::new("checkpoint_only.parquet", None); + assert!( + !deduplicator.check_and_record_seen(checkpoint_only), + "Files only in checkpoint should not be filtered" + ); + + Ok(()) + } +} diff --git a/kernel/src/log_replay/deduplicator.rs b/kernel/src/log_replay/deduplicator.rs new file mode 100644 index 0000000000..e1e609daf4 --- /dev/null +++ b/kernel/src/log_replay/deduplicator.rs @@ -0,0 +1,122 @@ +//! Deduplication abstraction for log replay processors. +//! +//! The [`Deduplicator`] trait supports two deduplication strategies: +//! +//! - **JSON commit files** (`is_log_batch = true`): Tracks (path, dv_unique_id) and updates +//! the hashmap as files are seen. Implementation: [`FileActionDeduplicator`] +//! +//! - **Checkpoint files** (`is_log_batch = false`): Uses (path, dv_unique_id) to filter actions +//! using a read-only hashmap pre-populated from the commit log phase. Future implementation. +//! +//! [`FileActionDeduplicator`]: crate::log_replay::FileActionDeduplicator + +use std::collections::HashSet; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::engine_data::{GetData, TypedGetData}; +use crate::log_replay::FileActionKey; +use crate::DeltaResult; + +pub(crate) trait Deduplicator { + /// Extracts a file action key from the data. Returns `(key, is_add)` if found. + /// + /// TODO: Remove the skip_removes field in the future. The caller is responsible for using the + /// correct Deduplicator instance depending on whether the batch belongs to a commit or to a + /// checkpoint. + fn extract_file_action<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + skip_removes: bool, + ) -> DeltaResult>; + + /// Checks if this file has been seen. When `is_log_batch() = true`, updates the hashmap + /// to track new files. Returns `true` if the file should be filtered out. + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool; + + /// Returns `true` for commit log batches (updates hashmap), `false` for checkpoints (read-only). + fn is_log_batch(&self) -> bool; + + /// Extracts the deletion vector unique ID if it exists. 
+ /// + /// This function retrieves the necessary fields for constructing a deletion vector unique ID + /// by accessing `getters` at `dv_start_index` and the following two indices. Specifically: + /// - `dv_start_index` retrieves the storage type (`deletionVector.storageType`). + /// - `dv_start_index + 1` retrieves the path or inline deletion vector (`deletionVector.pathOrInlineDv`). + /// - `dv_start_index + 2` retrieves the optional offset (`deletionVector.offset`). + fn extract_dv_unique_id<'a>( + &self, + i: usize, + getters: &[&'a dyn GetData<'a>], + dv_start_index: usize, + ) -> DeltaResult> { + let Some(storage_type) = + getters[dv_start_index].get_opt(i, "deletionVector.storageType")? + else { + return Ok(None); + }; + let path_or_inline = getters[dv_start_index + 1].get(i, "deletionVector.pathOrInlineDv")?; + let offset = getters[dv_start_index + 2].get_opt(i, "deletionVector.offset")?; + + Ok(Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + path_or_inline, + offset, + ))) + } +} + +/// Read-only deduplicator for checkpoint processing. +/// +/// Unlike [`FileActionDeduplicator`] which mutably tracks files, this uses an immutable +/// reference to filter checkpoint actions against files already seen from commits. +/// Only handles add actions (no removes), and never modifies the seen set. +/// +/// [`FileActionDeduplicator`]: crate::log_replay::FileActionDeduplicator +#[allow(unused)] +pub(crate) struct CheckpointDeduplicator<'a> { + seen_file_keys: &'a HashSet, + add_path_index: usize, + add_dv_start_index: usize, +} + +impl<'a> CheckpointDeduplicator<'a> { + #[allow(unused)] + pub(crate) fn try_new( + seen_file_keys: &'a HashSet, + add_path_index: usize, + add_dv_start_index: usize, + ) -> DeltaResult { + Ok(CheckpointDeduplicator { + seen_file_keys, + add_path_index, + add_dv_start_index, + }) + } +} + +impl Deduplicator for CheckpointDeduplicator<'_> { + /// Extracts add action key only (checkpoints skip removes). `skip_removes` is ignored. + fn extract_file_action<'b>( + &self, + i: usize, + getters: &[&'b dyn GetData<'b>], + _skip_removes: bool, + ) -> DeltaResult> { + let Some(path) = getters[self.add_path_index].get_str(i, "add.path")? else { + return Ok(None); + }; + let dv_unique_id = self.extract_dv_unique_id(i, getters, self.add_dv_start_index)?; + Ok(Some((FileActionKey::new(path, dv_unique_id), true))) + } + + /// Read-only check against seen set. Returns `true` if file should be filtered out. + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + self.seen_file_keys.contains(&key) + } + + /// Always `false` - checkpoint batches never update the seen set. 
+ fn is_log_batch(&self) -> bool { + false + } +} diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index 54438e7b5e..fd28be8214 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -3,34 +3,94 @@ use std::num::NonZero; use std::sync::{Arc, LazyLock}; +use std::time::Instant; + use crate::actions::visitors::SidecarVisitor; use crate::actions::{ - get_commit_schema, schema_contains_file_actions, Metadata, Protocol, Sidecar, METADATA_NAME, - PROTOCOL_NAME, SIDECAR_NAME, + get_log_add_schema, schema_contains_file_actions, Sidecar, DOMAIN_METADATA_NAME, METADATA_NAME, + PROTOCOL_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }; +use crate::committer::CatalogCommit; +use crate::expressions::ColumnName; use crate::last_checkpoint_hint::LastCheckpointHint; +use crate::log_reader::commit::CommitReader; use crate::log_replay::ActionsBatch; +use crate::metrics::{MetricEvent, MetricId, MetricsReporter}; +use crate::path::LogPathFileType::*; use crate::path::{LogPathFileType, ParsedLogPath}; -use crate::schema::{SchemaRef, StructField, ToSchema as _}; +use crate::schema::{DataType, SchemaRef, StructField, StructType, ToSchema as _}; use crate::utils::require; use crate::{ - DeltaResult, Engine, EngineData, Error, Expression, FileMeta, ParquetHandler, Predicate, - PredicateRef, RowVisitor, StorageHandler, Version, + DeltaResult, Engine, Error, Expression, FileMeta, Predicate, PredicateRef, RowVisitor, + StorageHandler, Version, PRE_COMMIT_VERSION, }; use delta_kernel_derive::internal_api; -#[cfg(feature = "internal-api")] -pub use crate::listed_log_files::ListedLogFiles; -#[cfg(not(feature = "internal-api"))] -use crate::listed_log_files::ListedLogFiles; +#[internal_api] +use crate::log_segment_files::LogSegmentFiles; +use crate::schema::compare::SchemaComparison; use itertools::Itertools; -use tracing::{debug, warn}; +use tracing::{debug, info, instrument, warn}; use url::Url; +mod domain_metadata_replay; +mod protocol_metadata_replay; + +pub(crate) use domain_metadata_replay::DomainMetadataMap; + +#[cfg(test)] +mod crc_tests; #[cfg(test)] mod tests; +/// Information about checkpoint reading for data skipping optimization. +/// +/// Returned alongside the actions iterator from checkpoint reading functions. +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[internal_api] +pub(crate) struct CheckpointReadInfo { + /// Whether the checkpoint has compatible pre-parsed stats for data skipping. + /// When `true`, checkpoint batches can use stats_parsed directly instead of parsing JSON. + #[allow(unused)] + pub has_stats_parsed: bool, + /// Whether the checkpoint has compatible pre-parsed partition values. + /// When `true`, checkpoint batches can read typed partition values directly from + /// `partitionValues_parsed` instead of parsing strings from `partitionValues`. + #[serde(default)] + #[allow(unused)] + pub has_partition_values_parsed: bool, + /// The schema used to read checkpoint files, potentially including stats_parsed. + #[allow(unused)] + pub checkpoint_read_schema: SchemaRef, +} + +impl CheckpointReadInfo { + /// Create a CheckpointReadInfo configured to read checkpoints without using stats_parsed. + /// This is the standard configuration when stats_parsed optimization is not available. 
+ #[allow(unused)] + pub(crate) fn without_stats_parsed() -> Self { + Self { + has_stats_parsed: false, + has_partition_values_parsed: false, + checkpoint_read_schema: get_log_add_schema().clone(), + } + } +} + +/// Result of reading actions from a log segment, containing both the actions iterator +/// and checkpoint metadata. +/// +/// This struct provides named access to the return values instead of tuple indexing. +#[internal_api] +pub(crate) struct ActionsWithCheckpointInfo>> { + /// Iterator over action batches read from the log segment. + pub actions: A, + /// Metadata about checkpoint reading, including the schema used. + #[allow(unused)] + pub checkpoint_info: CheckpointReadInfo, +} + /// A [`LogSegment`] represents a contiguous section of the log and is made of checkpoint files /// and commit files and guarantees the following: /// 1. Commit file versions will not have any gaps between them. @@ -50,89 +110,125 @@ pub(crate) struct LogSegment { pub end_version: Version, pub checkpoint_version: Option, pub log_root: Url, - /// Sorted commit files in the log segment (ascending) - pub ascending_commit_files: Vec, - /// Sorted (by start version) compaction files in the log segment (ascending) - pub ascending_compaction_files: Vec, - /// Checkpoint files in the log segment. - pub checkpoint_parts: Vec, - /// Latest CRC (checksum) file - pub latest_crc_file: Option, - /// The latest commit file found during listing, which may not be part of the - /// contiguous segment but is needed for ICT timestamp reading - pub latest_commit_file: Option, + /// Schema of the checkpoint file(s), if known from `_last_checkpoint` hint. + /// Used to determine if `stats_parsed` is available for data skipping. + pub checkpoint_schema: Option, + /// The set of log files found during listing. + pub listed: LogSegmentFiles, +} + +/// Returns the identifying leaf column path for a known action type, used to build IS NOT NULL +/// predicates that enable row group skipping in checkpoint parquet files. +/// +/// For `txn`, this is effective because all app ids end up in a single checkpoint part when +/// partitioned by `add.path` as the Delta spec requires. Filtering by a specific app id is not +/// worthwhile since all app ids share one part with a large min/max range (typically UUIDs). +fn action_identifying_column(action_name: &str) -> Option { + match action_name { + METADATA_NAME => Some(ColumnName::new([METADATA_NAME, "id"])), + PROTOCOL_NAME => Some(ColumnName::new([PROTOCOL_NAME, "minReaderVersion"])), + SET_TRANSACTION_NAME => Some(ColumnName::new([SET_TRANSACTION_NAME, "appId"])), + DOMAIN_METADATA_NAME => Some(ColumnName::new([DOMAIN_METADATA_NAME, "domain"])), + _ => None, + } +} + +/// Builds an IS NOT NULL predicate for row group skipping based on the action types in `schema`. +/// Returns `None` if any top-level field in the schema is not a recognized action type, since +/// an unknown type could have non-null rows in the same row group, making skipping unsafe. +fn schema_to_is_not_null_predicate(schema: &StructType) -> Option { + // Collect identifying columns for every field; short-circuit to None on any unknown field. 
+ let columns: Vec = schema + .fields() + .map(|f| action_identifying_column(f.name())) + .collect::>()?; + let mut predicates = columns + .into_iter() + .map(|col| Expression::column(col).is_not_null()); + let first = predicates.next()?; + Some(Arc::new(predicates.fold(first, Predicate::or))) } impl LogSegment { + /// Creates a synthetic LogSegment for pre-commit transactions (e.g., create-table). + /// The sentinel version PRE_COMMIT_VERSION indicates no version exists yet on disk. + /// This is used to construct a pre-commit snapshot that provides table configuration + /// (protocol, metadata, schema) for operations like CTAS. + #[allow(dead_code)] // Used by create_table module + pub(crate) fn for_pre_commit(log_root: Url) -> Self { + use crate::PRE_COMMIT_VERSION; + Self { + end_version: PRE_COMMIT_VERSION, + checkpoint_version: None, + log_root, + checkpoint_schema: None, + listed: LogSegmentFiles::default(), + } + } + #[internal_api] pub(crate) fn try_new( - listed_files: ListedLogFiles, + mut listed_files: LogSegmentFiles, log_root: Url, end_version: Option, + checkpoint_schema: Option, ) -> DeltaResult { - let ListedLogFiles { - mut ascending_commit_files, - ascending_compaction_files, - checkpoint_parts, - latest_crc_file, - latest_commit_file, - } = listed_files; - - // Ensure commit file versions are contiguous - require!( - ascending_commit_files - .windows(2) - .all(|cfs| cfs[0].version + 1 == cfs[1].version), - Error::generic(format!( - "Expected ordered contiguous commit files {ascending_commit_files:?}" - )) - ); - - // Commit file versions must be greater than the most recent checkpoint version if it exists - let checkpoint_version = checkpoint_parts.first().map(|checkpoint_file| { - ascending_commit_files.retain(|log_path| checkpoint_file.version < log_path.version); - checkpoint_file.version - }); + validate_compaction_files(&listed_files.ascending_compaction_files)?; + validate_checkpoint_parts(&listed_files.checkpoint_parts)?; + validate_commit_file_types(&listed_files.ascending_commit_files)?; + validate_commit_files_contiguous(&listed_files.ascending_commit_files)?; - // There must be no gap between a checkpoint and the first commit version. Note that - // that all checkpoint parts share the same version. - if let (Some(checkpoint_version), Some(commit_file)) = - (checkpoint_version, ascending_commit_files.first()) - { - require!( - checkpoint_version + 1 == commit_file.version, - Error::InvalidCheckpoint(format!( - "Gap between checkpoint version {} and next commit {}", - checkpoint_version, commit_file.version, - )) - ) - } + // Filter commits before/at checkpoint version + let checkpoint_version = + if let Some(checkpoint_file) = listed_files.checkpoint_parts.first() { + let version = checkpoint_file.version; + listed_files + .ascending_commit_files + .retain(|log_path| version < log_path.version); + Some(version) + } else { + None + }; - // Get the effective version from chosen files - let effective_version = ascending_commit_files - .last() - .or(checkpoint_parts.first()) - .ok_or(Error::generic("No files in log segment"))? 
- .version; - if let Some(end_version) = end_version { - require!( - effective_version == end_version, - Error::generic(format!( - "LogSegment end version {effective_version} not the same as the specified end version {end_version}" - )) - ); - } + validate_checkpoint_commit_gap(checkpoint_version, &listed_files.ascending_commit_files)?; + let effective_version = validate_end_version( + &listed_files.ascending_commit_files, + &listed_files.checkpoint_parts, + end_version, + )?; - Ok(LogSegment { + let log_segment = LogSegment { end_version: effective_version, checkpoint_version, log_root, - ascending_commit_files, - ascending_compaction_files, - checkpoint_parts, - latest_crc_file, - latest_commit_file, - }) + checkpoint_schema, + listed: listed_files, + }; + + info!(segment = %log_segment.summary()); + + Ok(log_segment) + } + + /// Succinct summary string for logging purposes. + fn summary(&self) -> String { + format!( + "{{v={}, commits={}, checkpoint_v={}, checkpoint_parts={}, compactions={}, crc_v={}, max_pub_v={}}}", + self.end_version, + self.listed.ascending_commit_files.len(), + self.checkpoint_version + .map(|v| v.to_string()) + .unwrap_or_else(|| "none".into()), + self.listed.checkpoint_parts.len(), + self.listed.ascending_compaction_files.len(), + self.listed.latest_crc_file + .as_ref() + .map(|f| f.version.to_string()) + .unwrap_or_else(|| "none".into()), + self.listed.max_published_version + .map(|v| v.to_string()) + .unwrap_or_else(|| "none".into()), + ) } /// Constructs a [`LogSegment`] to be used for [`Snapshot`]. For a `Snapshot` at version `n`: @@ -145,22 +241,48 @@ impl LogSegment { /// - `time_travel_version`: The version of the log that the Snapshot will be at. /// /// [`Snapshot`]: crate::snapshot::Snapshot + /// + /// Reports metrics: `LogSegmentLoaded`. + #[instrument(name = "log_seg.for_snap", skip_all, err)] #[internal_api] pub(crate) fn for_snapshot( storage: &dyn StorageHandler, log_root: Url, log_tail: Vec, time_travel_version: impl Into>, + reporter: Option<&Arc>, + operation_id: Option, ) -> DeltaResult { + let operation_id = operation_id.unwrap_or_default(); + let start = Instant::now(); + let time_travel_version = time_travel_version.into(); let checkpoint_hint = LastCheckpointHint::try_read(storage, &log_root)?; - Self::for_snapshot_impl( + let result = Self::for_snapshot_impl( storage, log_root, log_tail, checkpoint_hint, time_travel_version, - ) + ); + let log_segment_loading_duration = start.elapsed(); + + match result { + Ok(log_segment) => { + reporter.inspect(|r| { + r.report(MetricEvent::LogSegmentLoaded { + operation_id, + duration: log_segment_loading_duration, + num_commit_files: log_segment.listed.ascending_commit_files.len() as u64, + num_checkpoint_files: log_segment.listed.checkpoint_parts.len() as u64, + num_compaction_files: log_segment.listed.ascending_compaction_files.len() + as u64, + }); + }); + Ok(log_segment) + } + Err(e) => Err(e), + } } // factored out for testing @@ -171,23 +293,51 @@ impl LogSegment { checkpoint_hint: Option, time_travel_version: Option, ) -> DeltaResult { - let listed_files = match (checkpoint_hint, time_travel_version) { - (Some(cp), None) => { - ListedLogFiles::list_with_checkpoint_hint(&cp, storage, &log_root, log_tail, None)? - } - (Some(cp), Some(end_version)) if cp.version <= end_version => { - ListedLogFiles::list_with_checkpoint_hint( - &cp, - storage, - &log_root, - log_tail, - Some(end_version), - )? 
- } - _ => ListedLogFiles::list(storage, &log_root, log_tail, None, time_travel_version)?, + // Extract checkpoint schema from hint (already an Arc, no clone needed) + let checkpoint_schema = checkpoint_hint + .as_ref() + .and_then(|hint| hint.checkpoint_schema.clone()); + + // The end_version is the time_travel_version, if present + // TODO: When max catalog version is implemented, we would use that as end_version if + // time_travel_version is not present + let end_version = time_travel_version; + + // Keep the hint only if it points at or before end_version, or if there is no end_version bound + let usable_hint = checkpoint_hint.filter(|cp| end_version.is_none_or(|v| cp.version <= v)); + + // Cases: + // + // 1. usable_hint present, end_version is Some --> list_with_checkpoint_hint from hint.version TO end_version + // 2. usable_hint present, end_version is None --> list_with_checkpoint_hint from hint.version unbounded + // 3. no usable_hint, end_version is Some --> backward-scan for checkpoint before end_version, + // list from that checkpoint TO end_version + // (falls back to v0 if no checkpoint found) + // 4. no usable_hint, end_version is None --> list from v0 unbounded + + let listed_files = match (usable_hint, end_version) { + // Cases 1 and 2 + (Some(cp), end_version) => LogSegmentFiles::list_with_checkpoint_hint( + &cp, + storage, + &log_root, + log_tail, + end_version, + )?, + // Case 3 + (None, Some(end)) => LogSegmentFiles::list_with_backward_checkpoint_scan( + storage, &log_root, log_tail, end, + )?, + // Case 4 + (None, None) => LogSegmentFiles::list(storage, &log_root, log_tail, None, None)?, }; - LogSegment::try_new(listed_files, log_root, time_travel_version) + LogSegment::try_new( + listed_files, + log_root, + time_travel_version, + checkpoint_schema, + ) } /// Constructs a [`LogSegment`] to be used for `TableChanges`. For a TableChanges between versions @@ -212,21 +362,25 @@ impl LogSegment { // TODO: compactions? let listed_files = - ListedLogFiles::list_commits(storage, &log_root, Some(start_version), end_version)?; + LogSegmentFiles::list_commits(storage, &log_root, Some(start_version), end_version)?; // - Here check that the start version is correct. // - [`LogSegment::try_new`] will verify that the `end_version` is correct if present. - // - [`ListedLogFiles::list_commits`] also checks that there are no gaps between commits. + // - [`LogSegmentFiles::list_commits`] also checks that there are no gaps between commits. // If all three are satisfied, this implies that all the desired commits are present. 
require!( listed_files - .ascending_commit_files + .ascending_commit_files() .first() .is_some_and(|first_commit| first_commit.version == start_version), Error::generic(format!( - "Expected the first commit to have version {start_version}" + "Expected the first commit to have version {start_version}, got {:?}", + listed_files + .ascending_commit_files() + .first() + .map(|c| c.version) )) ); - LogSegment::try_new(listed_files, log_root, end_version) + LogSegment::try_new(listed_files, log_root, end_version, None) } #[allow(unused)] @@ -256,10 +410,10 @@ impl LogSegment { // this is a list of commits with possible gaps, we want to take the latest contiguous // chunk of commits let mut listed_commits = - ListedLogFiles::list_commits(storage, &log_root, start_from, Some(end_version))?; + LogSegmentFiles::list_commits(storage, &log_root, start_from, Some(end_version))?; // remove gaps - return latest contiguous chunk of commits - let commits = &mut listed_commits.ascending_commit_files; + let commits = listed_commits.ascending_commit_files_mut(); if !commits.is_empty() { let mut start_idx = commits.len() - 1; while start_idx > 0 && commits[start_idx].version == 1 + commits[start_idx - 1].version @@ -269,7 +423,147 @@ impl LogSegment { commits.drain(..start_idx); } - LogSegment::try_new(listed_commits, log_root, Some(end_version)) + LogSegment::try_new(listed_commits, log_root, Some(end_version), None) + } + + /// Creates a new LogSegment with the given commit file added to the end. + /// TODO: Take in multiple commits when Kernel-RS supports txn retries and conflict rebasing. + #[allow(unused)] + pub(crate) fn new_with_commit_appended( + &self, + tail_commit_file: ParsedLogPath, + ) -> DeltaResult { + require!( + tail_commit_file.is_commit(), + Error::internal_error(format!( + "Cannot extend and create new LogSegment. Tail log file is not a commit file. \ + Path: {}, Type: {:?}.", + tail_commit_file.location.location, tail_commit_file.file_type + )) + ); + require!( + tail_commit_file.version == self.end_version.wrapping_add(1), + Error::internal_error(format!( + "Cannot extend and create new LogSegment. Tail commit file version ({}) does not \ + equal LogSegment end_version ({}) + 1.", + tail_commit_file.version, self.end_version + )) + ); + + let mut new_log_segment = self.clone(); + + new_log_segment.end_version = tail_commit_file.version; + new_log_segment + .listed + .ascending_commit_files + .push(tail_commit_file.clone()); + new_log_segment.listed.latest_commit_file = Some(tail_commit_file.clone()); + new_log_segment.listed.max_published_version = match tail_commit_file.file_type { + LogPathFileType::Commit => Some(tail_commit_file.version), + _ => self.listed.max_published_version, + }; + + Ok(new_log_segment) + } + + /// Creates a new LogSegment reflecting a checkpoint written at this segment's version. + /// The checkpoint must be at `end_version`. Kernel does not write multi-part checkpoints, + /// so the checkpoint must be a single file (classic parquet or V2 UUID). + pub(crate) fn try_new_with_checkpoint(&self, checkpoint: ParsedLogPath) -> DeltaResult { + require!( + matches!( + checkpoint.file_type, + LogPathFileType::SinglePartCheckpoint | LogPathFileType::UuidCheckpoint + ), + Error::internal_error(format!( + "Cannot update LogSegment with checkpoint. Path is not a single-file \ + checkpoint. 
Path: {}, Type: {:?}.", + checkpoint.location.location, checkpoint.file_type + )) + ); + require!( + checkpoint.version == self.end_version, + Error::internal_error(format!( + "Cannot update LogSegment with checkpoint. Checkpoint version ({}) does not \ + equal LogSegment end_version ({}).", + checkpoint.version, self.end_version + )) + ); + + let mut new_log_segment = self.clone(); + new_log_segment.checkpoint_version = Some(checkpoint.version); + new_log_segment.listed.checkpoint_parts = vec![checkpoint]; + // A snapshot at version N only contains commits and compactions at versions <= N, + // so a checkpoint at N covers everything and we can clear them entirely. + new_log_segment.listed.ascending_commit_files.clear(); + new_log_segment.listed.ascending_compaction_files.clear(); + // TODO(#839): Once CheckpointWriter exposes the output schema, thread it through + // here instead of None. Today the schema is computed inside checkpoint_data() but + // not returned. With None, the next scan will read the checkpoint parquet footer + // to determine the schema (e.g. whether stats_parsed or sidecar columns exist). + new_log_segment.checkpoint_schema = None; + Ok(new_log_segment) + } + + /// Creates a new LogSegment with the given CRC file recorded as the latest. + /// The CRC file must be at `end_version`. + pub(crate) fn try_new_with_crc_file(&self, crc_file: ParsedLogPath) -> DeltaResult { + require!( + crc_file.file_type == LogPathFileType::Crc, + Error::internal_error(format!( + "Cannot update LogSegment with CRC. Path is not a CRC file. \ + Path: {}, Type: {:?}.", + crc_file.location, crc_file.file_type + )) + ); + require!( + crc_file.version == self.end_version, + Error::internal_error(format!( + "Cannot update LogSegment with CRC. CRC version ({}) does not \ + equal LogSegment end_version ({}).", + crc_file.version, self.end_version + )) + ); + // Convert to FileMeta with placeholder metadata (size=0, last_modified=0). + // Only the URL matters for CRC files: downstream code uses it for version + // tracking and reading CRC content via `try_read_crc_file`. Neither `size` + // nor `last_modified` is ever accessed. + let crc_file = ParsedLogPath { + location: FileMeta { + location: crc_file.location, + last_modified: 0, + size: 0, + }, + filename: crc_file.filename, + extension: crc_file.extension, + version: crc_file.version, + file_type: crc_file.file_type, + }; + let mut new_log_segment = self.clone(); + new_log_segment.listed.latest_crc_file = Some(crc_file); + Ok(new_log_segment) + } + + pub(crate) fn new_as_published(&self) -> DeltaResult { + // In the future, we can additionally convert the staged commit files to published commit + // files. That would reqire faking their FileMeta locations. + let mut new_log_segment = self.clone(); + new_log_segment.listed.max_published_version = Some(self.end_version); + Ok(new_log_segment) + } + + pub(crate) fn get_unpublished_catalog_commits(&self) -> DeltaResult> { + self.listed + .ascending_commit_files + .iter() + .filter(|file| file.file_type == LogPathFileType::StagedCommit) + .filter(|file| { + self.listed + .max_published_version + .is_none_or(|v| file.version > v) + }) + .map(|file| CatalogCommit::try_new(&self.log_root, file)) + .collect() } /// Read a stream of actions from this log segment. This returns an iterator of @@ -288,8 +582,16 @@ impl LogSegment { /// actions) that are not part of the schema but this is an implementation /// detail that should not be relied on and will likely change. 
/// - /// `meta_predicate` is an optional expression to filter the log files with. It is _NOT_ the - /// query's predicate, but rather a predicate for filtering log files themselves. + /// Read a stream of actions from this log segment. This returns an iterator of + /// [`ActionsBatch`]s which includes EngineData of actions + a boolean flag indicating whether + /// the data was read from a commit file (true) or a checkpoint file (false). + /// + /// Also returns `CheckpointReadInfo` with stats_parsed compatibility and the checkpoint schema. + /// + /// `meta_predicate` is an optional expression for row group skipping in checkpoint parquet + /// files. It is _NOT_ the query's data predicate, but a hint for skipping irrelevant data. + /// IS NOT NULL predicates are automatically derived from `checkpoint_read_schema` and combined + /// (AND) with `meta_predicate`, so callers only need to supply query-based skipping predicates. #[internal_api] pub(crate) fn read_actions_with_projected_checkpoint_actions( &self, @@ -297,38 +599,55 @@ impl LogSegment { commit_read_schema: SchemaRef, checkpoint_read_schema: SchemaRef, meta_predicate: Option, - ) -> DeltaResult> + Send> { + stats_schema: Option<&StructType>, + partition_schema: Option<&StructType>, + ) -> DeltaResult< + ActionsWithCheckpointInfo> + Send>, + > { + // Combine schema-derived IS NOT NULL predicate with any caller-supplied predicate so + // checkpoint parquet row groups without any relevant action type can be skipped. + // TODO: The semantics of `meta_predicate` will change in a follow-up PR. + let is_not_null_pred = schema_to_is_not_null_predicate(&checkpoint_read_schema); + let effective_predicate = match (is_not_null_pred, meta_predicate) { + (None, x) | (x, None) => x, + (Some(a), Some(b)) => Some(Arc::new(Predicate::and((*a).clone(), (*b).clone()))), + }; + // `replay` expects commit files to be sorted in descending order, so the return value here is correct - let commits_and_compactions = self.find_commit_cover(); - let commit_stream = engine - .json_handler() - .read_json_files( - &commits_and_compactions, - commit_read_schema, - meta_predicate.clone(), - )? - .map_ok(|batch| ActionsBatch::new(batch, true)); + let commit_stream = CommitReader::try_new(engine, self, commit_read_schema)?; - let checkpoint_stream = - self.create_checkpoint_stream(engine, checkpoint_read_schema, meta_predicate)?; + let checkpoint_result = self.create_checkpoint_stream( + engine, + checkpoint_read_schema, + effective_predicate, + stats_schema, + partition_schema, + )?; - Ok(commit_stream.chain(checkpoint_stream)) + Ok(ActionsWithCheckpointInfo { + actions: commit_stream.chain(checkpoint_result.actions), + checkpoint_info: checkpoint_result.checkpoint_info, + }) } - // Same as above, but uses the same schema for reading checkpoints and commits. + /// Same as [`Self::read_actions_with_projected_checkpoint_actions`], but uses the same schema + /// for reading checkpoints and commits. IS NOT NULL predicates are automatically derived from + /// the schema, so callers do not need to supply them. #[internal_api] pub(crate) fn read_actions( &self, engine: &dyn Engine, action_schema: SchemaRef, - meta_predicate: Option, ) -> DeltaResult> + Send> { - self.read_actions_with_projected_checkpoint_actions( + let result = self.read_actions_with_projected_checkpoint_actions( engine, action_schema.clone(), action_schema, - meta_predicate, - ) + None, + None, + None, + )?; + Ok(result.actions) } /// find a minimal set to cover the range of commits we want. 
This is greedy so not always @@ -336,13 +655,13 @@ impl LogSegment { /// returns files is DESCENDING ORDER, as that's what `replay` expects. This function assumes /// that all files in `self.ascending_commit_files` and `self.ascending_compaction_files` are in /// range for this log segment. This invariant is maintained by our listing code. - fn find_commit_cover(&self) -> Vec { + pub(crate) fn find_commit_cover(&self) -> Vec { // Create an iterator sorted in ascending order by (initial version, end version), e.g. // [00.json, 00.09.compacted.json, 00.99.compacted.json, 01.json, 02.json, ..., 10.json, // 10.19.compacted.json, 11.json, ...] let all_files = itertools::Itertools::merge_by( - self.ascending_commit_files.iter(), - self.ascending_compaction_files.iter(), + self.listed.ascending_commit_files.iter(), + self.listed.ascending_compaction_files.iter(), |path_a, path_b| path_a.version <= path_b.version, ); @@ -374,40 +693,192 @@ impl LogSegment { selected_files } + /// Determines the file actions schema and extracts sidecar file references for checkpoints. + /// + /// This function analyzes the checkpoint to determine: + /// 1. The file actions schema (for stats_parsed / partitionValues_parsed detection) + /// 2. Sidecar file references if this is a V2 checkpoint + /// + /// The logic is: + /// - No checkpoint parts: return (None, []) + /// - Multi-part (always V1, no sidecars): return checkpoint schema directly + /// - UUID-named JSON (always V2): extract sidecars, read first sidecar's schema + /// - Classic-named or UUID-named parquet (V1 or V2): read checkpoint schema from + /// hint or footer, then check for sidecar column to distinguish + /// - Has sidecar column (V2): extract sidecars, read first sidecar's schema + /// - No sidecar column (V1): use checkpoint schema directly + fn get_file_actions_schema_and_sidecars( + &self, + engine: &dyn Engine, + ) -> DeltaResult<(Option, Vec)> { + // Hint schema from `_last_checkpoint` avoids footer reads when available. + let hint_schema = self.checkpoint_schema.as_ref(); + + // All parts of a multi-part checkpoint belong to the same table version and follow + // the same V1 spec, so reading any one part's schema is sufficient. + let Some(checkpoint) = self.listed.checkpoint_parts.first() else { + return Ok((None, vec![])); + }; + + match &checkpoint.file_type { + MultiPartCheckpoint { .. } => { + // Multi-part checkpoints are always V1 and never have sidecars. + let schema = Self::read_checkpoint_schema(engine, checkpoint, hint_schema)?; + Ok((Some(schema), vec![])) + } + UuidCheckpoint if checkpoint.extension.as_str() == "json" => { + // JSON checkpoint is always V2. No checkpoint schema is available since JSON + // checkpoints don't have a parquet footer to read. + self.read_sidecar_schema_and_files(engine, checkpoint, None) + } + SinglePartCheckpoint | UuidCheckpoint if checkpoint.extension.as_str() == "parquet" => { + // Parquet checkpoint (classic-named or UUID-named): either can be V1 or V2. + // Check for sidecar column to distinguish. + let checkpoint_schema = + Self::read_checkpoint_schema(engine, checkpoint, hint_schema)?; + if checkpoint_schema.field(SIDECAR_NAME).is_some() { + self.read_sidecar_schema_and_files(engine, checkpoint, Some(&checkpoint_schema)) + } else { + Ok((Some(checkpoint_schema), vec![])) + } + } + _ => Ok((None, vec![])), + } + } + + /// Returns the checkpoint's parquet schema, using the hint from `_last_checkpoint` if + /// available or reading the parquet footer otherwise. 
+ fn read_checkpoint_schema( + engine: &dyn Engine, + checkpoint: &ParsedLogPath, + hint_schema: Option<&SchemaRef>, + ) -> DeltaResult { + match hint_schema { + Some(schema) => Ok(schema.clone()), + None => Ok(engine + .parquet_handler() + .read_parquet_footer(&checkpoint.location)? + .schema), + } + } + + /// Extracts sidecar file references and reads the file actions schema from the first + /// sidecar's parquet footer. If no sidecars exist, falls back to `checkpoint_schema` + /// since V2 checkpoints may store add actions directly in the main file. + fn read_sidecar_schema_and_files( + &self, + engine: &dyn Engine, + checkpoint: &ParsedLogPath, + checkpoint_schema: Option<&SchemaRef>, + ) -> DeltaResult<(Option, Vec)> { + let sidecar_files = self.extract_sidecar_refs(engine, checkpoint)?; + let file_actions_schema = match sidecar_files.first() { + Some(first) => Some(engine.parquet_handler().read_parquet_footer(first)?.schema), + None => checkpoint_schema.cloned(), + }; + Ok((file_actions_schema, sidecar_files)) + } + /// Returns an iterator over checkpoint data, processing sidecar files when necessary. /// - /// By default, `create_checkpoint_stream` checks for the presence of sidecar files, and - /// reads their contents if present. Checking for sidecar files is skipped if: - /// - The checkpoint is a multi-part checkpoint - /// - The checkpoint read schema does not contain a file action + /// For checkpoints that need file actions, this function: + /// 1. Determines the file actions schema (for stats_parsed / partitionValues_parsed detection) + /// 2. Extracts sidecar file references if present (V2 checkpoints) + /// 3. Reads checkpoint and sidecar data using cached sidecar refs /// - /// For single-part checkpoints, any referenced sidecar files are processed. These - /// sidecar files contain the actual file actions that would otherwise be - /// stored directly in the checkpoint. The sidecar file batches are chained to the - /// checkpoint batch in the top level iterator to be returned. + /// Returns a tuple of the actions iterator and [`CheckpointReadInfo`]. fn create_checkpoint_stream( &self, engine: &dyn Engine, action_schema: SchemaRef, meta_predicate: Option, - ) -> DeltaResult> + Send> { + stats_schema: Option<&StructType>, + partition_schema: Option<&StructType>, + ) -> DeltaResult< + ActionsWithCheckpointInfo> + Send>, + > { let need_file_actions = schema_contains_file_actions(&action_schema); - // Sidecars only contain file actions so don't add it to the schema if not needed - let checkpoint_read_schema = if !need_file_actions || - // Don't duplicate the column if it exists - action_schema.contains(SIDECAR_NAME) || - // With multiple parts the checkpoint can't be v2, so sidecars aren't needed - self.checkpoint_parts.len() > 1 - { - action_schema.clone() + let (file_actions_schema, sidecar_files) = if need_file_actions { + self.get_file_actions_schema_and_sidecars(engine)? 
+ } else { + (None, vec![]) + }; + + // Check if checkpoint has compatible stats_parsed and add it to the schema if so + let has_stats_parsed = + stats_schema + .zip(file_actions_schema.as_ref()) + .is_some_and(|(stats, file_schema)| { + Self::schema_has_compatible_stats_parsed(file_schema, stats) + }); + + let has_partition_values_parsed = partition_schema + .zip(file_actions_schema.as_ref()) + .is_some_and(|(ps, fs)| Self::schema_has_compatible_partition_values_parsed(fs, ps)); + + // Build final schema with any additional fields needed + // (stats_parsed, partitionValues_parsed, sidecar) + let needs_sidecar = need_file_actions && !sidecar_files.is_empty(); + let needs_add_augmentation = has_stats_parsed || has_partition_values_parsed; + let augmented_checkpoint_read_schema = if needs_add_augmentation || needs_sidecar { + let mut new_fields: Vec = if let (true, Some(add_field)) = + (needs_add_augmentation, action_schema.field("add")) + { + let DataType::Struct(add_struct) = add_field.data_type() else { + return Err(Error::internal_error( + "add field in action schema must be a struct", + )); + }; + let mut add_fields: Vec = add_struct.fields().cloned().collect(); + + if let (true, Some(ss)) = (has_stats_parsed, stats_schema) { + add_fields.push(StructField::nullable( + "stats_parsed", + DataType::Struct(Box::new(ss.clone())), + )); + } + + if let (true, Some(ps)) = (has_partition_values_parsed, partition_schema) { + add_fields.push(StructField::nullable( + "partitionValues_parsed", + DataType::Struct(Box::new(ps.clone())), + )); + } + + // Rebuild schema with modified add field + action_schema + .fields() + .map(|f| { + if f.name() == "add" { + StructField::new( + add_field.name(), + StructType::new_unchecked(add_fields.clone()), + add_field.is_nullable(), + ) + .with_metadata(add_field.metadata.clone()) + } else { + f.clone() + } + }) + .collect() + } else { + action_schema.fields().cloned().collect() + }; + + // Add sidecar column at top-level for V2 checkpoints + if needs_sidecar { + new_fields.push(StructField::nullable(SIDECAR_NAME, Sidecar::to_schema())); + } + + Arc::new(StructType::new_unchecked(new_fields)) } else { - Arc::new( - action_schema.add([StructField::nullable(SIDECAR_NAME, Sidecar::to_schema())])?, - ) + // No modifications needed, use schema as-is + action_schema.clone() }; let checkpoint_file_meta: Vec<_> = self + .listed .checkpoint_parts .iter() .map(|f| f.location.clone()) @@ -419,18 +890,18 @@ impl LogSegment { // but it was removed to avoid unnecessary coupling. This is a concrete case // where it *could* have been useful, but for now, we're keeping them separate. // If similar patterns start appearing elsewhere, we should reconsider that decision. - let actions = match self.checkpoint_parts.first() { + let actions = match self.listed.checkpoint_parts.first() { Some(parsed_log_path) if parsed_log_path.extension == "json" => { engine.json_handler().read_json_files( &checkpoint_file_meta, - checkpoint_read_schema.clone(), + augmented_checkpoint_read_schema.clone(), meta_predicate.clone(), )? } Some(parsed_log_path) if parsed_log_path.extension == "parquet" => parquet_handler .read_parquet_files( &checkpoint_file_meta, - checkpoint_read_schema.clone(), + augmented_checkpoint_read_schema.clone(), meta_predicate.clone(), )?, Some(parsed_log_path) => { @@ -444,132 +915,160 @@ impl LogSegment { None => Box::new(std::iter::empty()), }; - let log_root = self.log_root.clone(); + // Read sidecars with the same schema as checkpoint (including stats_parsed if available). 
+ // The sidecar column will be null in sidecar batches, which is harmless. + // Both checkpoint and sidecar parquet files share the same `add.stats_parsed.*` column + // layout, so we reuse the same predicate for row group skipping. + let sidecar_batches = if !sidecar_files.is_empty() { + parquet_handler.read_parquet_files( + &sidecar_files, + augmented_checkpoint_read_schema.clone(), + meta_predicate, + )? + } else { + Box::new(std::iter::empty()) + }; + // Chain checkpoint batches with sidecar batches. + // The boolean flag indicates whether the batch originated from a commit file + // (true) or a checkpoint file (false). let actions_iter = actions - .map(move |checkpoint_batch_result| -> DeltaResult<_> { - let checkpoint_batch = checkpoint_batch_result?; - // This closure maps the checkpoint batch to an iterator of batches - // by chaining the checkpoint batch with sidecar batches if they exist. - - // 1. In the case where the schema does not contain file actions, we return the - // checkpoint batch directly as sidecar files only have to be read when the - // schema contains add/remove action. - // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is - // returned as-is. - let sidecar_content = if need_file_actions && checkpoint_file_meta.len() == 1 { - Self::process_sidecars( - parquet_handler.clone(), // cheap Arc clone - log_root.clone(), - checkpoint_batch.as_ref(), - action_schema.clone(), - meta_predicate.clone(), - )? - } else { - None - }; - - let combined_batches = std::iter::once(Ok(checkpoint_batch)) - .chain(sidecar_content.into_iter().flatten()) - // The boolean flag indicates whether the batch originated from a commit file - // (true) or a checkpoint file (false). - .map_ok(|sidecar_batch| ActionsBatch::new(sidecar_batch, false)); - - Ok(combined_batches) - }) - .flatten_ok() - .map(|result| result?); // result-result to result + .map_ok(|batch| ActionsBatch::new(batch, false)) + .chain(sidecar_batches.map_ok(|batch| ActionsBatch::new(batch, false))); - Ok(actions_iter) + let checkpoint_info = CheckpointReadInfo { + has_stats_parsed, + has_partition_values_parsed, + checkpoint_read_schema: augmented_checkpoint_read_schema, + }; + Ok(ActionsWithCheckpointInfo { + actions: actions_iter, + checkpoint_info, + }) } - /// Processes sidecar files for the given checkpoint batch. - /// - /// This function extracts any sidecar file references from the provided batch. - /// Each sidecar file is read and an iterator of file action batches is returned - fn process_sidecars( - parquet_handler: Arc, - log_root: Url, - batch: &dyn EngineData, - checkpoint_read_schema: SchemaRef, - meta_predicate: Option, - ) -> DeltaResult>> + Send>> { - // Visit the rows of the checkpoint batch to extract sidecar file references - let mut visitor = SidecarVisitor::default(); - visitor.visit_rows_of(batch)?; + /// Extracts sidecar file references from a checkpoint file. 
+ fn extract_sidecar_refs( + &self, + engine: &dyn Engine, + checkpoint: &ParsedLogPath, + ) -> DeltaResult> { + // Read checkpoint with just the sidecar column + let batches = match checkpoint.extension.as_str() { + "json" => engine.json_handler().read_json_files( + std::slice::from_ref(&checkpoint.location), + Self::sidecar_read_schema(), + None, + )?, + "parquet" => engine.parquet_handler().read_parquet_files( + std::slice::from_ref(&checkpoint.location), + Self::sidecar_read_schema(), + None, + )?, + _ => return Ok(vec![]), + }; - // If there are no sidecar files, return early - if visitor.sidecars.is_empty() { - return Ok(None); + // Extract sidecar file references + let mut visitor = SidecarVisitor::default(); + for batch_result in batches { + let batch = batch_result?; + visitor.visit_rows_of(batch.as_ref())?; } - let sidecar_files: Vec<_> = visitor + // Convert to FileMeta + visitor .sidecars .iter() - .map(|sidecar| sidecar.to_filemeta(&log_root)) - .try_collect()?; - - // Read the sidecar files and return an iterator of sidecar file batches - Ok(Some(parquet_handler.read_parquet_files( - &sidecar_files, - checkpoint_read_schema, - meta_predicate, - )?)) + .map(|sidecar| sidecar.to_filemeta(&self.log_root)) + .try_collect() } - // Do a lightweight protocol+metadata log replay to find the latest Protocol and Metadata in - // the LogSegment - pub(crate) fn protocol_and_metadata( - &self, - engine: &dyn Engine, - ) -> DeltaResult<(Option, Option)> { - let actions_batches = self.replay_for_metadata(engine)?; - let (mut metadata_opt, mut protocol_opt) = (None, None); - for actions_batch in actions_batches { - let actions = actions_batch?.actions; - if metadata_opt.is_none() { - metadata_opt = Metadata::try_new_from_data(actions.as_ref())?; - } - if protocol_opt.is_none() { - protocol_opt = Protocol::try_new_from_data(actions.as_ref())?; - } - if metadata_opt.is_some() && protocol_opt.is_some() { - // we've found both, we can stop - break; - } + /// Creates a pruned LogSegment for replay *after* a CRC at `start_v_exclusive`. + /// + /// The CRC covers protocol, metadata, and checkpoint state, so this segment drops + /// checkpoint files, CRC files, and checkpoint schema. Only commits and compactions + /// in `(start_v_exclusive, end_version]` are retained. + pub(crate) fn segment_after_crc(&self, start_v_exclusive: Version) -> Self { + let (commits, compactions) = + self.filtered_commits_and_compactions(Some(start_v_exclusive), self.end_version); + LogSegment { + end_version: self.end_version, + checkpoint_version: None, + log_root: self.log_root.clone(), + checkpoint_schema: None, + listed: LogSegmentFiles { + ascending_commit_files: commits, + ascending_compaction_files: compactions, + checkpoint_parts: vec![], + latest_crc_file: None, + latest_commit_file: None, + max_published_version: None, + }, } - Ok((metadata_opt, protocol_opt)) } - // Get the most up-to-date Protocol and Metadata actions - pub(crate) fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<(Metadata, Protocol)> { - match self.protocol_and_metadata(engine)? { - (Some(m), Some(p)) => Ok((m, p)), - (None, Some(_)) => Err(Error::MissingMetadata), - (Some(_), None) => Err(Error::MissingProtocol), - (None, None) => Err(Error::MissingMetadataAndProtocol), + /// Creates a pruned LogSegment for replay *before* a CRC at `end_v_inclusive`. + /// + /// Used as fallback when the CRC at `end_v_inclusive` fails to load. Falls back to + /// checkpoint-based replay, so checkpoint files and schema are preserved. 
Only commits + /// and compactions in `(checkpoint_version, end_v_inclusive]` are retained. Fields not + /// needed for this replay path (CRC file, latest commit file) are dropped. + pub(crate) fn segment_through_crc(&self, end_v_inclusive: Version) -> Self { + let (commits, compactions) = + self.filtered_commits_and_compactions(self.checkpoint_version, end_v_inclusive); + LogSegment { + end_version: self.end_version, + checkpoint_version: self.checkpoint_version, + log_root: self.log_root.clone(), + checkpoint_schema: self.checkpoint_schema.clone(), + listed: LogSegmentFiles { + ascending_commit_files: commits, + ascending_compaction_files: compactions, + checkpoint_parts: self.listed.checkpoint_parts.clone(), + latest_crc_file: None, + latest_commit_file: None, + max_published_version: None, + }, } } - // Replay the commit log, projecting rows to only contain Protocol and Metadata action columns. - fn replay_for_metadata( + /// Filters commits and compactions to those within `(lo_exclusive, hi_inclusive]`. + /// If `lo_exclusive` is `None`, there is no lower bound. + fn filtered_commits_and_compactions( &self, - engine: &dyn Engine, - ) -> DeltaResult> + Send> { - let schema = get_commit_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?; - // filter out log files that do not contain metadata or protocol information - static META_PREDICATE: LazyLock> = LazyLock::new(|| { - Some(Arc::new(Predicate::or( - Expression::column([METADATA_NAME, "id"]).is_not_null(), - Expression::column([PROTOCOL_NAME, "minReaderVersion"]).is_not_null(), - ))) - }); - // read the same protocol and metadata schema for both commits and checkpoints - self.read_actions(engine, schema, META_PREDICATE.clone()) + lo_exclusive: Option, + hi_inclusive: Version, + ) -> (Vec, Vec) { + let above_lo = |v: Version| lo_exclusive.is_none_or(|lo| lo < v); + let commits = self + .listed + .ascending_commit_files + .iter() + .filter(|c| above_lo(c.version) && c.version <= hi_inclusive) + .cloned() + .collect(); + let compactions = self + .listed + .ascending_compaction_files + .iter() + .filter(|c| { + matches!( + c.file_type, + LogPathFileType::CompactedCommit { hi } + if above_lo(c.version) && hi <= hi_inclusive + ) + }) + .cloned() + .collect(); + (commits, compactions) } - /// How many commits since a checkpoint, according to this log segment + /// How many commits since a checkpoint, according to this log segment. + /// Returns 0 for pre-commit snapshots (where end_version is PRE_COMMIT_VERSION). pub(crate) fn commits_since_checkpoint(&self) -> u64 { + if self.end_version == PRE_COMMIT_VERSION { + return 0; + } // we can use 0 as the checkpoint version if there is no checkpoint since `end_version - 0` // is the correct number of commits since a checkpoint if there are no checkpoints let checkpoint_version = self.checkpoint_version.unwrap_or(0); @@ -577,40 +1076,373 @@ impl LogSegment { self.end_version - checkpoint_version } - /// How many commits since a log-compaction or checkpoint, according to this log segment + /// How many commits since a log-compaction or checkpoint, according to this log segment. + /// Returns 0 for pre-commit snapshots (where end_version is PRE_COMMIT_VERSION). pub(crate) fn commits_since_log_compaction_or_checkpoint(&self) -> u64 { + if self.end_version == PRE_COMMIT_VERSION { + return 0; + } // Annoyingly we have to search all the compaction files to determine this, because we only // sort by start version, so technically the max end version could be anywhere in the vec. 
// We can return 0 in the case there is no compaction since end_version - 0 is the correct // number of commits since compaction if there are no compactions - let max_compaction_end = self.ascending_compaction_files.iter().fold(0, |cur, f| { - if let &ParsedLogPath { - file_type: LogPathFileType::CompactedCommit { hi }, - .. - } = f - { - Version::max(cur, hi) - } else { - warn!("Found invalid ParsedLogPath in ascending_compaction_files: {f:?}"); - cur - } - }); + let max_compaction_end = self + .listed + .ascending_compaction_files + .iter() + .fold(0, |cur, f| { + if let &ParsedLogPath { + file_type: LogPathFileType::CompactedCommit { hi }, + .. + } = f + { + Version::max(cur, hi) + } else { + warn!("Found invalid ParsedLogPath in ascending_compaction_files: {f:?}"); + cur + } + }); // we want to subtract off the max of the max compaction end or the checkpoint version let to_sub = Version::max(self.checkpoint_version.unwrap_or(0), max_compaction_end); debug_assert!(to_sub <= self.end_version); self.end_version - to_sub } - /// Validates that all commit files in this log segment are not staged commits. We use this in - /// places like checkpoint writers, where we require all commits to be published. - pub(crate) fn validate_no_staged_commits(&self) -> DeltaResult<()> { + pub(crate) fn validate_published(&self) -> DeltaResult<()> { require!( - !self - .ascending_commit_files - .iter() - .any(|commit| matches!(commit.file_type, LogPathFileType::StagedCommit)), - Error::generic("Found staged commit file in log segment") + self.listed + .max_published_version + .is_some_and(|v| v == self.end_version), + Error::generic("Log segment is not published") ); Ok(()) } + + /// Schema to read just the sidecar column from a checkpoint file. + fn sidecar_read_schema() -> SchemaRef { + static SIDECAR_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new_unchecked([StructField::nullable( + SIDECAR_NAME, + Sidecar::to_schema(), + )])) + }); + SIDECAR_SCHEMA.clone() + } + + /// Checks if a checkpoint schema contains a usable `add.stats_parsed` field. + /// + /// This validates that: + /// 1. The `add.stats_parsed` field exists in the checkpoint schema + /// 2. The types in `stats_parsed` are compatible with the stats schema for data skipping + /// + /// The `stats_schema` parameter contains only the columns referenced in the data skipping + /// predicate. This is built from the predicate and passed in by the caller. + /// + /// Both the checkpoint's `stats_parsed` schema and the `stats_schema` for data skipping + /// use physical column names (not logical names), so direct name comparison is correct. + /// + /// Returns `false` if stats_parsed doesn't exist or has incompatible types. + pub(crate) fn schema_has_compatible_stats_parsed( + checkpoint_schema: &StructType, + stats_schema: &StructType, + ) -> bool { + // Get add.stats_parsed from the checkpoint schema + let Some(stats_parsed) = checkpoint_schema + .field("add") + .and_then(|f| match f.data_type() { + DataType::Struct(s) => s.field("stats_parsed"), + _ => None, + }) + else { + debug!("stats_parsed not compatible: checkpoint schema does not contain add.stats_parsed field"); + return false; + }; + + let DataType::Struct(stats_struct) = stats_parsed.data_type() else { + debug!( + "stats_parsed not compatible: add.stats_parsed field is not a Struct, got {:?}", + stats_parsed.data_type() + ); + return false; + }; + + // Check type compatibility for both minValues and maxValues structs. 
+ // While these typically have the same schema, the protocol doesn't guarantee it, + // so we check both to be safe. + for field_name in ["minValues", "maxValues"] { + let Some(checkpoint_values_field) = stats_struct.field(field_name) else { + // stats_parsed exists but no minValues/maxValues - unusual but valid + continue; + }; + + // minValues/maxValues must be a Struct containing per-column statistics. + // If it exists but isn't a Struct, the schema is malformed and unusable. + let DataType::Struct(checkpoint_values) = checkpoint_values_field.data_type() else { + debug!( + "stats_parsed not compatible: stats_parsed.{} is not a Struct, got {:?}", + field_name, + checkpoint_values_field.data_type() + ); + return false; + }; + + // Get the corresponding field from stats_schema (e.g., stats_schema.minValues) + let Some(stats_values_field) = stats_schema.field(field_name) else { + // stats_schema doesn't have minValues/maxValues, skip this check + continue; + }; + let DataType::Struct(stats_values) = stats_values_field.data_type() else { + // stats_schema.minValues/maxValues isn't a struct - shouldn't happen but skip + continue; + }; + + // Check type compatibility recursively for nested structs. + // Only fields that exist in both schemas need compatible types. + // Extra fields in checkpoint are ignored; missing fields return null. + if !Self::structs_have_compatible_types(checkpoint_values, stats_values, field_name) { + return false; + } + } + + debug!("Checkpoint schema has compatible stats_parsed for data skipping"); + true + } + + /// Recursively checks if two struct types have compatible field types. + /// + /// Used by both `stats_parsed` and `partitionValues_parsed` compatibility checks. + /// For each field in `needed`, if it exists in `available` (checkpoint): + /// - Primitive types: must be compatible via [`PrimitiveType::is_stats_type_compatible_with`] + /// (allows type widening and Parquet physical type reinterpretation) + /// - Nested structs: recursively check inner fields + /// - Missing fields in checkpoint: OK (will return null when accessed) + /// - Extra fields in checkpoint: OK (ignored) + fn structs_have_compatible_types( + available: &StructType, + needed: &StructType, + context: &str, + ) -> bool { + for needed_field in needed.fields() { + let Some(available_field) = available.field(needed_field.name()) else { + // Field missing in checkpoint - that's OK, it will be null + continue; + }; + + match (available_field.data_type(), needed_field.data_type()) { + // Both are structs: recurse + (DataType::Struct(avail_struct), DataType::Struct(need_struct)) => { + let nested_context = format!("{}.{}", context, needed_field.name()); + if !Self::structs_have_compatible_types( + avail_struct, + need_struct, + &nested_context, + ) { + return false; + } + } + // Non-struct types: use stats-specific rules for primitives and standard + // schema rules otherwise. + (avail_type, need_type) => { + let compatible = match (avail_type, need_type) { + (DataType::Primitive(a), DataType::Primitive(b)) => { + a.is_stats_type_compatible_with(b) + } + (a, b) => a.can_read_as(b).is_ok(), + }; + if !compatible { + debug!( + "stats_parsed not compatible: incompatible type for '{}' in {}: \ + checkpoint has {:?}, stats schema needs {:?}", + needed_field.name(), + context, + avail_type, + need_type + ); + return false; + } + } + } + } + true + } + + /// Checks if a checkpoint schema contains a usable `add.partitionValues_parsed` field. + /// + /// Validates that: + /// 1. 
The `add.partitionValues_parsed` field exists in the checkpoint schema + /// 2. The types for partition columns present in both schemas are compatible + /// + /// Missing partition columns in the checkpoint are OK (they simply won't contribute + /// to row group skipping). Returns `false` if `partitionValues_parsed` doesn't exist + /// or has incompatible types for any shared column. + pub(crate) fn schema_has_compatible_partition_values_parsed( + checkpoint_schema: &StructType, + partition_schema: &StructType, + ) -> bool { + let Some(partition_parsed) = + checkpoint_schema + .field("add") + .and_then(|f| match f.data_type() { + DataType::Struct(s) => s.field("partitionValues_parsed"), + _ => None, + }) + else { + debug!("partitionValues_parsed not compatible: checkpoint schema does not contain add.partitionValues_parsed field"); + return false; + }; + + let DataType::Struct(partition_struct) = partition_parsed.data_type() else { + warn!( + "partitionValues_parsed not compatible: add.partitionValues_parsed is not a Struct, got {:?}", + partition_parsed.data_type() + ); + return false; + }; + + // Flat struct: reuse the recursive type checker (trivial case with no nesting) + if !Self::structs_have_compatible_types( + partition_struct, + partition_schema, + "partitionValues_parsed", + ) { + return false; + } + + debug!("Checkpoint schema has compatible partitionValues_parsed for partition pruning"); + true + } +} + +fn validate_compaction_files(compactions: &[ParsedLogPath]) -> DeltaResult<()> { + for (i, f) in compactions.iter().enumerate() { + let LogPathFileType::CompactedCommit { hi } = f.file_type else { + return Err(Error::generic( + "ascending_compaction_files contains non-compaction file", + )); + }; + if f.version > hi { + return Err(Error::generic(format!( + "compaction file has start version {} > end version {}", + f.version, hi + ))); + } + if let Some(next) = compactions.get(i + 1) { + // next's type is validated on its own iteration; skip sort check if it isn't a + // CompactedCommit since the type error will be caught then. + if let LogPathFileType::CompactedCommit { hi: next_hi } = next.file_type { + if !(f.version < next.version || (f.version == next.version && hi <= next_hi)) { + return Err(Error::generic(format!( + "ascending_compaction_files is not sorted: {f:?} -> {next:?}" + ))); + } + } + } + } + Ok(()) +} + +fn validate_checkpoint_parts(parts: &[ParsedLogPath]) -> DeltaResult<()> { + if parts.is_empty() { + return Ok(()); + } + let n = parts.len(); + let first_version = parts[0].version; + for p in parts { + if !p.is_checkpoint() { + return Err(Error::generic( + "checkpoint_parts contains non-checkpoint file", + )); + } + if p.version != first_version { + return Err(Error::generic( + "multi-part checkpoint parts have different versions", + )); + } + match p.file_type { + LogPathFileType::MultiPartCheckpoint { num_parts, .. } if num_parts as usize == n => {} + LogPathFileType::MultiPartCheckpoint { num_parts, .. 
} => { + return Err(Error::generic(format!( + "multi-part checkpoint part count mismatch: slice has {n} parts but num_parts field says {num_parts}" + ))); + } + _ if n > 1 => { + return Err(Error::generic(format!( + "multi-part checkpoint part count mismatch: expected {n} multi-part checkpoint files but got a non-multi-part checkpoint" + ))); + } + _ => {} + } + } + Ok(()) +} + +fn validate_commit_file_types(commits: &[ParsedLogPath]) -> DeltaResult<()> { + for f in commits { + if !f.is_commit() { + return Err(Error::generic( + "ascending_commit_files contains non-commit file", + )); + } + } + Ok(()) +} + +fn validate_commit_files_contiguous(commits: &[ParsedLogPath]) -> DeltaResult<()> { + for pair in commits.windows(2) { + if pair[0].version + 1 != pair[1].version { + return Err(Error::generic(format!( + "Expected contiguous commit files, but found gap: {:?} -> {:?}", + pair[0], pair[1] + ))); + } + } + Ok(()) +} + +/// Validates that there is no gap between the checkpoint and the first commit file. +/// +/// When a checkpoint exists and commits are also present (after filtering out commits at or before +/// the checkpoint), the first commit must immediately follow the checkpoint (i.e., be at +/// `checkpoint_version + 1`). A gap indicates missing log files. +fn validate_checkpoint_commit_gap( + checkpoint_version: Option, + commits: &[ParsedLogPath], +) -> DeltaResult<()> { + if let (Some(checkpoint_version), Some(first_commit)) = (checkpoint_version, commits.first()) { + require!( + checkpoint_version + 1 == first_commit.version, + Error::InvalidCheckpoint(format!( + "Gap between checkpoint version {checkpoint_version} and next commit {}", + first_commit.version + )) + ); + } + Ok(()) +} + +/// Validates that the log segment covers exactly `end_version` (when specified) and returns the +/// effective version -- the version of the last commit, or the checkpoint version if no commits +/// are present. +/// +/// Returns an error if the segment is empty (no commits and no checkpoint parts), or if the +/// effective version does not match the requested `end_version`. +fn validate_end_version( + commits: &[ParsedLogPath], + checkpoint_parts: &[ParsedLogPath], + end_version: Option, +) -> DeltaResult { + let effective_version = commits + .last() + .or(checkpoint_parts.first()) + .ok_or(Error::generic("No files in log segment"))? + .version; + if let Some(end_version) = end_version { + require!( + effective_version == end_version, + Error::generic(format!( + "LogSegment end version {effective_version} not the same as the specified end version {end_version}" + )) + ); + } + Ok(effective_version) } diff --git a/kernel/src/log_segment/crc_tests.rs b/kernel/src/log_segment/crc_tests.rs new file mode 100644 index 0000000000..83fe958a6c --- /dev/null +++ b/kernel/src/log_segment/crc_tests.rs @@ -0,0 +1,599 @@ +//! Tests for P&M replay and ICT reads with CRC files. +//! +//! Each test sets up an in-memory Delta log with V2 checkpoint JSONs, commit files, and CRC files, +//! then verifies that Protocol & Metadata loading and ICT reads resolve correctly. 
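+//!
+//! The shape of a typical test, as a minimal sketch (every helper referenced here --
+//! `CrcReadTest`, the `protocol_*`/`metadata_*` constructors, and the assertion methods --
+//! is defined later in this module):
+//!
+//! ```ignore
+//! CrcReadTest::new()
+//!     .v2_checkpoint(0, protocol_v2(), metadata_a()) // checkpoint at version 0
+//!     .delta(1)                                      // plain commit
+//!     .crc(1, protocol_v2_dv(), metadata_b(), None)  // CRC carrying P & M
+//!     .delta(2)
+//!     .build()
+//!     .await
+//!     .assert_p_m(None, &protocol_v2_dv(), &metadata_b());
+//! ```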
+ +use std::collections::HashMap; +use std::sync::Arc; + +use serde_json::json; +use url::Url; + +use crate::actions::{CommitInfo, Format, Metadata, Protocol}; +use crate::engine::default::executor::tokio::TokioBackgroundExecutor; +use crate::engine::default::{DefaultEngine, DefaultEngineBuilder}; +use crate::object_store::memory::InMemory; +use crate::object_store::ObjectStore; +use crate::Snapshot; + +use test_utils::{assert_result_error_with_message, delta_path_for_version}; + +// ============================================================================ +// Expected values +// ============================================================================ + +const SCHEMA_STRING: &str = r#"{"type":"struct","fields":[{"name":"id","type":"integer","nullable":true,"metadata":{}},{"name":"val","type":"string","nullable":true,"metadata":{}}]}"#; + +fn protocol_v2() -> Protocol { + Protocol::try_new_modern(["v2Checkpoint"], ["v2Checkpoint"]).unwrap() +} + +fn protocol_v2_dv() -> Protocol { + Protocol::try_new_modern( + ["v2Checkpoint", "deletionVectors"], + ["v2Checkpoint", "deletionVectors"], + ) + .unwrap() +} + +fn protocol_v2_dv_ntz() -> Protocol { + Protocol::try_new_modern( + ["v2Checkpoint", "deletionVectors", "timestampNtz"], + ["v2Checkpoint", "deletionVectors", "timestampNtz"], + ) + .unwrap() +} + +fn protocol_v2_ict() -> Protocol { + Protocol::try_new( + 3, + 7, + Some(["v2Checkpoint"]), + Some(["v2Checkpoint", "inCommitTimestamp"]), + ) + .unwrap() +} + +fn metadata_a() -> Metadata { + Metadata::new_unchecked( + "aaa", + None, + None, + Format::default(), + SCHEMA_STRING, + vec![], + Some(1587968585495), + HashMap::new(), + ) +} + +fn metadata_b() -> Metadata { + Metadata::new_unchecked( + "bbb", + None, + None, + Format::default(), + SCHEMA_STRING, + vec![], + Some(1587968585495), + HashMap::new(), + ) +} + +fn metadata_ict() -> Metadata { + Metadata::new_unchecked( + "5fba94ed-9794-4965-ba6e-6ee3c0d22af9", + None, + None, + Format::default(), + SCHEMA_STRING, + vec![], + Some(1587968585495), + HashMap::from([( + "delta.enableInCommitTimestamps".to_string(), + "true".to_string(), + )]), + ) +} + +fn commit_info() -> CommitInfo { + CommitInfo { + timestamp: Some(1587968586154), + operation: Some("WRITE".to_string()), + ..Default::default() + } +} + +// ============================================================================ +// JSON builders +// ============================================================================ + +const V2_CKPT_SUFFIX: &str = "checkpoint.00000000-0000-0000-0000-000000000000.json"; + +fn protocol_action_json(protocol: &Protocol) -> serde_json::Value { + json!({"protocol": serde_json::to_value(protocol).unwrap()}) +} + +fn metadata_action_json(metadata: &Metadata) -> serde_json::Value { + json!({"metaData": serde_json::to_value(metadata).unwrap()}) +} + +fn commit_info_json() -> serde_json::Value { + json!({"commitInfo": serde_json::to_value(commit_info()).unwrap()}) +} + +fn commit_info_json_with_ict(ict: i64) -> serde_json::Value { + json!({"commitInfo": { + "timestamp": 1587968586154i64, + "operation": "WRITE", + "inCommitTimestamp": ict, + }}) +} + +fn crc_json(protocol: &Protocol, metadata: &Metadata, ict: Option) -> serde_json::Value { + let mut v = json!({ + "tableSizeBytes": 0, + "numFiles": 0, + "numMetadata": 1, + "numProtocol": 1, + "metadata": serde_json::to_value(metadata).unwrap(), + "protocol": serde_json::to_value(protocol).unwrap(), + }); + if let Some(ict) = ict { + v["inCommitTimestampOpt"] = json!(ict); + } + v +} + +// 
============================================================================ +// CrcReadTest builder +// ============================================================================ + +/// Operations that can be applied to an in-memory Delta log. +enum Op { + V2Checkpoint { + version: u64, + protocol: Protocol, + metadata: Metadata, + }, + Delta(u64), + DeltaWithPM { + version: u64, + protocol: Option, + metadata: Option, + }, + DeltaWithIct { + version: u64, + ict: i64, + }, + Crc { + version: u64, + protocol: Protocol, + metadata: Metadata, + ict: Option, + }, + CorruptCrc(u64), +} + +/// Declarative test builder: accumulate log operations, then build and assert. +struct CrcReadTest { + ops: Vec, +} + +impl CrcReadTest { + fn new() -> Self { + Self { ops: vec![] } + } + + /// Write a V2 checkpoint at the given version. + fn v2_checkpoint(mut self, version: u64, protocol: Protocol, metadata: Metadata) -> Self { + self.ops.push(Op::V2Checkpoint { + version, + protocol, + metadata, + }); + self + } + + /// Write a plain delta (commitInfo only, no protocol or metadata). + fn delta(mut self, version: u64) -> Self { + self.ops.push(Op::Delta(version)); + self + } + + /// Write a delta with optional protocol and/or metadata overrides. + fn delta_with_p_m( + mut self, + version: u64, + protocol: impl Into>, + metadata: impl Into>, + ) -> Self { + self.ops.push(Op::DeltaWithPM { + version, + protocol: protocol.into(), + metadata: metadata.into(), + }); + self + } + + /// Write a delta with an in-commit timestamp in commitInfo. + fn delta_with_ict(mut self, version: u64, ict: i64) -> Self { + self.ops.push(Op::DeltaWithIct { version, ict }); + self + } + + /// Write a CRC file with the given protocol, metadata, and optional ICT. + fn crc( + mut self, + version: u64, + protocol: Protocol, + metadata: Metadata, + ict: impl Into>, + ) -> Self { + self.ops.push(Op::Crc { + version, + protocol, + metadata, + ict: ict.into(), + }); + self + } + + /// Write a corrupt CRC file. + fn corrupt_crc(mut self, version: u64) -> Self { + self.ops.push(Op::CorruptCrc(version)); + self + } + + /// Execute all operations, returning a built test that can be asserted against. 
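+    /// Each queued [`Op`] is written to an in-memory object store under the standard
+    /// `_delta_log/` path for its version (checkpoints, commits, and CRC files as JSON;
+    /// corrupt CRCs as junk bytes). The returned [`BuiltCrcTest`] holds the engine and
+    /// table URL that the assertion helpers use.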
+ async fn build(self) -> BuiltCrcTest { + let store = Arc::new(InMemory::new()); + let url = Url::parse("memory:///").unwrap(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + for op in self.ops { + match op { + Op::V2Checkpoint { + version: v, + ref protocol, + ref metadata, + } => { + let content = format!( + "{}\n{}\n{}", + protocol_action_json(protocol), + metadata_action_json(metadata), + json!({"checkpointMetadata": {"version": 2}}) + ); + put(&store, v, V2_CKPT_SUFFIX, &content).await; + } + Op::Delta(v) => { + put(&store, v, "json", &commit_info_json().to_string()).await; + } + Op::DeltaWithPM { + version: v, + protocol, + metadata, + } => { + let mut lines = vec![commit_info_json().to_string()]; + if let Some(ref p) = protocol { + lines.push(protocol_action_json(p).to_string()); + } + if let Some(ref m) = metadata { + lines.push(metadata_action_json(m).to_string()); + } + put(&store, v, "json", &lines.join("\n")).await; + } + Op::DeltaWithIct { version: v, ict } => { + put( + &store, + v, + "json", + &commit_info_json_with_ict(ict).to_string(), + ) + .await; + } + Op::Crc { + version: v, + ref protocol, + ref metadata, + ict, + } => { + put( + &store, + v, + "crc", + &crc_json(protocol, metadata, ict).to_string(), + ) + .await; + } + Op::CorruptCrc(v) => { + put(&store, v, "crc", "CORRUPT_CRC_DATA").await; + } + } + } + + BuiltCrcTest { engine, url } + } +} + +struct BuiltCrcTest { + engine: DefaultEngine, + url: Url, +} + +impl BuiltCrcTest { + fn assert_p_m( + &self, + version: impl Into>, + expected_protocol: &Protocol, + expected_metadata: &Metadata, + ) { + let version = version.into(); + let mut builder = Snapshot::builder_for(self.url.clone()); + if let Some(v) = version { + builder = builder.at_version(v); + } + let snapshot = builder.build(&self.engine).unwrap(); + let table_config = snapshot.table_configuration(); + + let version_label = version.map_or("latest".to_string(), |v| format!("v{v}")); + assert_eq!( + table_config.protocol(), + expected_protocol, + "Protocol mismatch at {version_label}" + ); + + assert_eq!( + table_config.metadata(), + expected_metadata, + "Metadata mismatch at {version_label}" + ); + } + + fn assert_ict(&self, version: impl Into>, expected_ict: Option) { + let version = version.into(); + let mut builder = Snapshot::builder_for(self.url.clone()); + if let Some(v) = version { + builder = builder.at_version(v); + } + let snapshot = builder.build(&self.engine).unwrap(); + let ict = snapshot.get_in_commit_timestamp(&self.engine).unwrap(); + + let version_label = version.map_or("latest".to_string(), |v| format!("v{v}")); + assert_eq!(ict, expected_ict, "ICT mismatch at {version_label}"); + } +} + +async fn put(store: &InMemory, version: u64, suffix: &str, content: &str) { + store + .put( + &delta_path_for_version(version, suffix), + content.as_bytes().to_vec().into(), + ) + .await + .unwrap(); +} + +// TODO: Time travel tests +// TODO: Log compaction tests +// TODO: build_from tests +// TODO: _last_checkpoint tests + +// ============================================================================ +// Tests: Baseline (no CRC) +// ============================================================================ + +#[tokio::test] +async fn test_get_p_m_from_delta_no_checkpoint() { + CrcReadTest::new() + .delta_with_p_m(0, protocol_v2(), metadata_a()) // <-- P & M from here + .delta(1) + .delta(2) + .build() + .await + .assert_p_m(None, &protocol_v2(), &metadata_a()); +} + +#[tokio::test] +async fn 
test_get_p_and_m_from_different_deltas() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta_with_p_m(1, protocol_v2_dv(), None) // <-- P from here + .delta_with_p_m(2, None, metadata_b()) // <-- M from here + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_b()); +} + +#[tokio::test] +async fn test_get_p_m_from_checkpoint() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) // <-- P & M from here + .delta(1) + .delta(2) + .build() + .await + .assert_p_m(None, &protocol_v2(), &metadata_a()); +} + +#[tokio::test] +async fn test_get_p_m_from_delta_after_checkpoint() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta_with_p_m(1, protocol_v2_dv(), metadata_b()) // <-- P & M from here + .delta(2) + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_b()); +} + +// ============================================================================ +// Tests: CRC at target version +// ============================================================================ + +#[tokio::test] +async fn test_get_p_m_from_crc_at_target() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .delta(2) + .crc(2, protocol_v2_dv(), metadata_b(), None) // <-- P & M from here + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_b()); +} + +#[tokio::test] +async fn test_crc_preferred_over_delta_at_target() { + // The P & M for the 002.crc and 002.json should NOT be different in practice. + // We only do this for this test so we can differentiate which P & M is used. + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .delta_with_p_m(2, protocol_v2_dv(), metadata_a()) + .crc(2, protocol_v2_dv_ntz(), metadata_b(), None) // <-- P & M from here + .build() + .await + .assert_p_m(None, &protocol_v2_dv_ntz(), &metadata_b()); +} + +#[tokio::test] +async fn test_corrupt_crc_at_target_falls_back() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) // <-- P & M from here + .delta(1) + .delta(2) + .corrupt_crc(2) // <-- Corrupt! Fall back to replay. + .build() + .await + .assert_p_m(None, &protocol_v2(), &metadata_a()); +} + +#[tokio::test] +async fn test_crc_wins_over_checkpoint() { + // The P & M for the 002.crc and the v2 checkpoint should NOT be different in practice. + // We only do this for this test so we can differentiate which P & M is used. + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .delta(2) + .v2_checkpoint(2, protocol_v2(), metadata_a()) + .crc(2, protocol_v2_dv(), metadata_b(), None) // <-- P & M from here + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_b()); +} + +#[tokio::test] +async fn test_checkpoint_on_corrupt_crc() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .delta(2) + .v2_checkpoint(2, protocol_v2(), metadata_a()) // <-- P & M from here + .corrupt_crc(2) // <-- Corrupt! Fall back to replay. 
+ .build() + .await + .assert_p_m(None, &protocol_v2(), &metadata_a()); +} + +// ============================================================================ +// Tests: CRC at version < target +// ============================================================================ + +#[tokio::test] +async fn test_crc_at_earlier_version() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .crc(1, protocol_v2_dv(), metadata_b(), None) // <-- P & M from here + .delta(2) + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_b()); +} + +#[tokio::test] +async fn test_get_p_from_newer_delta_over_older_crc() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .crc(1, protocol_v2_dv(), metadata_b(), None) // <-- M from here + .delta_with_p_m(2, protocol_v2_dv_ntz(), None) // <-- P from here + .build() + .await + .assert_p_m(None, &protocol_v2_dv_ntz(), &metadata_b()); +} + +#[tokio::test] +async fn test_get_m_from_newer_delta_over_older_crc() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) + .delta(1) + .crc(1, protocol_v2_dv(), metadata_b(), None) // <-- P from here + .delta_with_p_m(2, None, metadata_a()) // <-- M from here + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_a()); +} + +#[tokio::test] +async fn test_corrupt_crc_at_non_target_version_falls_back() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2(), metadata_a()) // <-- P & M from here + .delta(1) + .corrupt_crc(1) // <-- Corrupt! Fall back to replay. + .delta(2) + .build() + .await + .assert_p_m(None, &protocol_v2(), &metadata_a()); +} + +#[tokio::test] +async fn test_crc_before_checkpoint_is_ignored() { + CrcReadTest::new() + .delta_with_p_m(0, protocol_v2(), metadata_a()) + .delta(1) + .crc(1, protocol_v2_dv_ntz(), metadata_b(), None) + .v2_checkpoint(2, protocol_v2_dv(), metadata_a()) // <-- P & M from here + .delta(3) + .build() + .await + .assert_p_m(None, &protocol_v2_dv(), &metadata_a()); +} + +// ============================================================================ +// Tests: ICT from CRC +// ============================================================================ + +#[tokio::test] +async fn test_ict_from_crc_at_snapshot_version() { + CrcReadTest::new() + .v2_checkpoint(0, protocol_v2_ict(), metadata_ict()) + .delta_with_ict(1, 2000) + .crc(1, protocol_v2_ict(), metadata_ict(), 1000) // <-- ICT from here + .build() + .await + .assert_ict(None, Some(1000)); +} + +#[tokio::test] +async fn test_ict_errors_when_crc_has_no_ict() { + let setup = CrcReadTest::new() + .v2_checkpoint(0, protocol_v2_ict(), metadata_ict()) + .delta_with_ict(1, 2000) + .crc(1, protocol_v2_ict(), metadata_ict(), None) + .build() + .await; + + let snapshot = Snapshot::builder_for(setup.url.clone()) + .build(&setup.engine) + .unwrap(); + + let result = snapshot.get_in_commit_timestamp(&setup.engine); + + assert_result_error_with_message( + result, + "In-Commit Timestamp not found in CRC file at version 1", + ); +} diff --git a/kernel/src/log_segment/domain_metadata_replay.rs b/kernel/src/log_segment/domain_metadata_replay.rs new file mode 100644 index 0000000000..b79a6319b4 --- /dev/null +++ b/kernel/src/log_segment/domain_metadata_replay.rs @@ -0,0 +1,247 @@ +//! Domain metadata replay logic for [`LogSegment`]. +//! +//! This module contains the method that performs a log replay to extract the latest domain +//! metadata actions from a [`LogSegment`]. 
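+//!
+//! A minimal usage sketch (assuming a `snapshot` and `engine` are already in hand, as in the
+//! tests below):
+//!
+//! ```ignore
+//! use std::collections::HashSet;
+//!
+//! // Requesting specific domains lets the replay stop as soon as they are all found.
+//! let domains = HashSet::from(["domainA"]);
+//! let found = snapshot
+//!     .log_segment()
+//!     .scan_domain_metadatas(Some(&domains), &engine)?;
+//! if let Some(dm) = found.get("domainA") {
+//!     println!("domainA configuration: {}", dm.configuration());
+//! }
+//! ```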
+ +use std::collections::{HashMap, HashSet}; + +use tracing::instrument; + +use crate::actions::get_log_domain_metadata_schema; +use crate::actions::visitors::DomainMetadataVisitor; +use crate::actions::DomainMetadata; +use crate::log_replay::ActionsBatch; +use crate::{DeltaResult, Engine, RowVisitor as _}; + +use super::LogSegment; + +pub(crate) type DomainMetadataMap = HashMap; + +impl LogSegment { + /// Scan this log segment for domain metadata actions. If a specific set of domains is + /// provided, terminate log replay early once all requested domains have been found. If no + /// filter is given, replay the entire log to collect all domains. + /// + /// Returns the latest domain metadata for each domain, accounting for tombstones + /// (`removed=true`) — removed domain metadatas will _never_ be present in the returned map. + #[instrument(name = "domain_metadata.scan", skip_all, fields(domains = ?domains.map(|d| d.iter().collect::>())), err)] + pub(crate) fn scan_domain_metadatas( + &self, + domains: Option<&HashSet<&str>>, + engine: &dyn Engine, + ) -> DeltaResult { + let domain_filter = domains.map(|set| { + set.iter() + .map(|s| s.to_string()) + .collect::>() + }); + let mut visitor = DomainMetadataVisitor::new(domain_filter); + // If a specific set of domains is requested then we can terminate log replay early as + // soon as all requested domains have been found. If all domains are requested then we + // are forced to replay the entire log. + for actions in self.read_domain_metadata_batches(engine)? { + let domain_metadatas = actions?.actions; + visitor.visit_rows_of(domain_metadatas.as_ref())?; + // if all requested domains have been found, terminate early + if visitor.filter_found() { + break; + } + } + + Ok(visitor.into_domain_metadatas()) + } + + /// Read action batches from the log, projecting rows to only contain domain metadata columns. + fn read_domain_metadata_batches( + &self, + engine: &dyn Engine, + ) -> DeltaResult> + Send> { + let schema = get_log_domain_metadata_schema(); + self.read_actions(engine, schema.clone()) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use std::sync::Arc; + + use url::Url; + + use crate::actions::visitors::DomainMetadataVisitor; + use crate::committer::FileSystemCommitter; + use crate::engine::default::DefaultEngineBuilder; + use crate::object_store::memory::InMemory; + use crate::schema::{DataType, StructField, StructType}; + use crate::transaction::create_table::create_table as create_table_txn; + use crate::{RowVisitor as _, Snapshot}; + + /// Builds a two-commit in-memory Delta table: + /// commit 0: protocol + metadata (with domainMetadata feature) + "domainC" + /// commit 1: "domainA" + "domainB" + /// + /// Log replay visits commits newest-first, so commit 1 is the first batch and commit 0 + /// is the second batch. + fn build_two_commit_log() -> (impl crate::Engine, std::sync::Arc) { + let store = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(store).build(); + let url = Url::parse("memory:///").unwrap(); + + // Commit 0: CREATE TABLE (protocol + metadata) with "domainC" in the same commit. + // The domainMetadata writer feature is enabled so domain metadata actions are valid. 
+ let _ = create_table_txn( + url.as_str(), + Arc::new(StructType::new_unchecked(vec![StructField::new( + "id", + DataType::INTEGER, + true, + )])), + "test", + ) + .with_table_properties([("delta.feature.domainMetadata", "supported")]) + .build(&engine, Box::new(FileSystemCommitter::new())) + .unwrap() + .with_domain_metadata("domainC".to_string(), "cfgC".to_string()) + .commit(&engine) + .unwrap(); + + // Commit 1: add domainA and domainB via an existing-table transaction. + let snapshot = Snapshot::builder_for(url.clone()).build(&engine).unwrap(); + let _ = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine) + .unwrap() + .with_domain_metadata("domainA".to_string(), "cfgA".to_string()) + .with_domain_metadata("domainB".to_string(), "cfgB".to_string()) + .commit(&engine) + .unwrap(); + + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + (engine, snapshot) + } + + /// Proves early termination actually fires: when both requested domains are found in the + /// first (newest) batch, the iterator is broken before the second (older) batch is consumed. + /// + /// Strategy: count total batches via `read_domain_metadata_batches`, then manually drive + /// the same loop that `scan_domain_metadatas` uses and count how many batches are consumed + /// before `filter_found()` triggers the break. Asserting consumed < total is the only way + /// to confirm the iterator is abandoned early — the domain values alone cannot distinguish + /// this because `or_insert` in the visitor makes results identical whether or not the second + /// batch was read. + #[tokio::test] + async fn test_scan_domain_metadatas_early_termination() { + let (engine, snapshot) = build_two_commit_log(); + let log_segment = snapshot.log_segment(); + + // Sanity-check: the log has exactly 2 batches (one per commit). + let total_batches = log_segment + .read_domain_metadata_batches(&engine) + .unwrap() + .filter(|r| r.is_ok()) + .count(); + assert_eq!( + total_batches, 2, + "expected 2 total batches (one per commit)" + ); + + // Drive the loop manually — identical to the body of scan_domain_metadatas — and + // count how many batches are consumed before filter_found() breaks the loop. + let filter = HashSet::from(["domainA".to_string(), "domainB".to_string()]); + let mut visitor = DomainMetadataVisitor::new(Some(filter)); + let mut batches_consumed = 0; + for actions in log_segment.read_domain_metadata_batches(&engine).unwrap() { + batches_consumed += 1; + visitor + .visit_rows_of(actions.unwrap().actions.as_ref()) + .unwrap(); + if visitor.filter_found() { + break; + } + } + + // The key assertion: only 1 of the 2 batches was consumed — early termination worked. + assert_eq!( + batches_consumed, 1, + "should break after the first (newest) batch once both domains are found" + ); + assert!( + batches_consumed < total_batches, + "early termination must consume fewer batches than the total" + ); + + // Also verify correct results: domainA and domainB present, domainC absent. 
+ let result = visitor.into_domain_metadatas(); + assert_eq!(result.len(), 2); + assert_eq!(result["domainA"].configuration(), "cfgA"); + assert_eq!(result["domainB"].configuration(), "cfgB"); + assert!( + !result.contains_key("domainC"), + "domainC must not appear — second batch was not read" + ); + } + + #[tokio::test] + async fn test_scan_domain_metadatas_with_single_domain_filter_returns_only_that_domain() { + let (engine, snapshot) = build_two_commit_log(); + let result = snapshot + .log_segment() + .scan_domain_metadatas(Some(&HashSet::from(["domainA"])), &engine) + .unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result["domainA"].configuration(), "cfgA"); + } + + #[tokio::test] + async fn test_scan_domain_metadatas_with_subset_filter_returns_matching_domains() { + let (engine, snapshot) = build_two_commit_log(); + let result = snapshot + .log_segment() + .scan_domain_metadatas(Some(&HashSet::from(["domainA", "domainC"])), &engine) + .unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result["domainA"].configuration(), "cfgA"); + assert_eq!(result["domainC"].configuration(), "cfgC"); + } + + #[tokio::test] + async fn test_scan_domain_metadatas_with_no_filter_returns_all_domains() { + let (engine, snapshot) = build_two_commit_log(); + let result = snapshot + .log_segment() + .scan_domain_metadatas(None, &engine) + .unwrap(); + assert_eq!(result.len(), 3); + assert_eq!(result["domainA"].configuration(), "cfgA"); + assert_eq!(result["domainB"].configuration(), "cfgB"); + assert_eq!(result["domainC"].configuration(), "cfgC"); + } + + #[tokio::test] + async fn test_scan_domain_metadatas_with_split_domains_does_not_terminate_early() { + let (engine, snapshot) = build_two_commit_log(); + let log_segment = snapshot.log_segment(); + + // domainA is in commit 1 (batch 0), domainC is in commit 0 (batch 1). + // filter_found() must not trigger after batch 0 alone. + let filter = HashSet::from(["domainA".to_string(), "domainC".to_string()]); + let mut visitor = DomainMetadataVisitor::new(Some(filter)); + let mut batches_consumed = 0; + for actions in log_segment.read_domain_metadata_batches(&engine).unwrap() { + batches_consumed += 1; + visitor + .visit_rows_of(actions.unwrap().actions.as_ref()) + .unwrap(); + if visitor.filter_found() { + break; + } + } + + assert_eq!( + batches_consumed, 2, + "must read both batches when requested domains span two commits" + ); + let result = visitor.into_domain_metadatas(); + assert_eq!(result["domainA"].configuration(), "cfgA"); + assert_eq!(result["domainC"].configuration(), "cfgC"); + } +} diff --git a/kernel/src/log_segment/protocol_metadata_replay.rs b/kernel/src/log_segment/protocol_metadata_replay.rs new file mode 100644 index 0000000000..6b0da90729 --- /dev/null +++ b/kernel/src/log_segment/protocol_metadata_replay.rs @@ -0,0 +1,197 @@ +//! Protocol and Metadata replay logic for [`LogSegment`]. +//! +//! This module contains the methods that perform a lightweight log replay to extract the latest +//! Protocol and Metadata actions from a [`LogSegment`]. + +use crate::actions::{get_commit_schema, Metadata, Protocol, METADATA_NAME, PROTOCOL_NAME}; +use crate::crc::{CrcLoadResult, LazyCrc}; +use crate::log_replay::ActionsBatch; +use crate::{DeltaResult, Engine, Error}; + +use tracing::{info, instrument, warn}; + +use super::LogSegment; + +impl LogSegment { + /// Read the latest Protocol and Metadata from this log segment, using CRC when available. + /// Returns an error if either is missing. 
+ /// + /// This is the checked variant of [`Self::read_protocol_metadata_unchecked`], used for + /// fresh snapshot creation where both Protocol and Metadata must exist. + pub(crate) fn read_protocol_metadata( + &self, + engine: &dyn Engine, + lazy_crc: &LazyCrc, + ) -> DeltaResult<(Metadata, Protocol)> { + match self.read_protocol_metadata_opt(engine, lazy_crc)? { + (Some(m), Some(p)) => Ok((m, p)), + (None, Some(_)) => Err(Error::MissingMetadata), + (Some(_), None) => Err(Error::MissingProtocol), + (None, None) => Err(Error::MissingMetadataAndProtocol), + } + } + + /// Read the latest Protocol and Metadata from this log segment, using CRC when available. + /// Returns `None` for either if not found. + /// + /// This is the unchecked variant of [`Self::read_protocol_metadata_checked`], used for + /// incremental snapshot updates where the caller can fall back to an existing snapshot's + /// Protocol and Metadata. + /// + /// The `lazy_crc` parameter allows the CRC to be loaded at most once and shared for + /// future use (domain metadata, in-commit timestamp, etc.). + #[instrument(name = "log_seg.load_p_m", skip_all, err)] + pub(crate) fn read_protocol_metadata_opt( + &self, + engine: &dyn Engine, + lazy_crc: &LazyCrc, + ) -> DeltaResult<(Option, Option)> { + let crc_version = lazy_crc.crc_version(); + + // Case 1: If CRC at target version, use it directly and exit early. + if crc_version == Some(self.end_version) { + if let CrcLoadResult::Loaded(crc) = lazy_crc.get_or_load(engine) { + info!("P&M from CRC at target version {}", self.end_version); + return Ok((Some(crc.metadata.clone()), Some(crc.protocol.clone()))); + } + warn!( + "CRC at target version {} failed to load, falling back to log replay", + self.end_version + ); + } + + // We didn't return above, so we need to do log replay to find P&M. + // + // Case 2: CRC exists at an earlier version => Prune the log segment to only replay + // commits *after* the CRC version. + // (a) If we find new P&M in the pruned replay, return it. + // (b) If we don't find new P&M, fall back to the CRC. + // (c) If the CRC also fails, fall back to replaying the remaining segment + // (checkpoint + commits up through the CRC version). + // + // Case 3: CRC at target version failed to load => Full P&M log replay. + // + // Case 4: No CRC exists at all => Full P&M log replay. + + if let Some(crc_v) = crc_version.filter(|&v| v < self.end_version) { + // Case 2(a): Replay only commits after CRC version + info!("Pruning log segment to commits after CRC version {}", crc_v); + let pruned = self.segment_after_crc(crc_v); + let (metadata_opt, protocol_opt) = pruned.replay_for_pm(engine, None, None)?; + + if metadata_opt.is_some() && protocol_opt.is_some() { + info!("Found P&M from pruned log replay"); + return Ok((metadata_opt, protocol_opt)); + } + + // Case 2(b): P&M incomplete from pruned replay, try CRC. + // Use `or_else` so any newer P or M found in the pruned replay takes priority + // over the (older) CRC values. + if let CrcLoadResult::Loaded(crc) = lazy_crc.get_or_load(engine) { + info!("P&M fallback to CRC (no P&M changes after CRC version)"); + return Ok(( + metadata_opt.or_else(|| Some(crc.metadata.clone())), + protocol_opt.or_else(|| Some(crc.protocol.clone())), + )); + } + + // Case 2(c): CRC failed to load. Replay the remaining segment (checkpoint + + // commits up through CRC version), carrying forward any partial results from the + // pruned replay above. 
+ warn!( + "CRC at version {} failed to load, replaying remaining segment", + crc_v + ); + let remaining = self.segment_through_crc(crc_v); + return remaining.replay_for_pm(engine, metadata_opt, protocol_opt); + } + + // Case 3 / Case 4: Full P&M log replay. + self.replay_for_pm(engine, None, None) + } + + /// Replays the log segment for Protocol and Metadata, merging with any already-found values. + /// Stops early once both are found. + fn replay_for_pm( + &self, + engine: &dyn Engine, + mut metadata_opt: Option, + mut protocol_opt: Option, + ) -> DeltaResult<(Option, Option)> { + for actions_batch in self.read_pm_batches(engine)? { + let actions = actions_batch?.actions; + if metadata_opt.is_none() { + metadata_opt = Metadata::try_new_from_data(actions.as_ref())?; + } + if protocol_opt.is_none() { + protocol_opt = Protocol::try_new_from_data(actions.as_ref())?; + } + if metadata_opt.is_some() && protocol_opt.is_some() { + break; + } + } + Ok((metadata_opt, protocol_opt)) + } + + // Replay the commit log, projecting rows to only contain Protocol and Metadata action columns. + fn read_pm_batches( + &self, + engine: &dyn Engine, + ) -> DeltaResult> + Send> { + let schema = get_commit_schema().project(&[PROTOCOL_NAME, METADATA_NAME])?; + self.read_actions(engine, schema) + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use itertools::Itertools; + use test_log::test; + + use crate::engine::sync::SyncEngine; + use crate::Snapshot; + + // NOTE: In addition to testing the meta-predicate for metadata replay, this test also verifies + // that the parquet reader properly infers nullcount = rowcount for missing columns. The two + // checkpoint part files that contain transaction app ids have truncated schemas that would + // otherwise fail skipping due to their missing nullcount stat: + // + // Row group 0: count: 1 total(compressed): 111 B total(uncompressed):107 B + // -------------------------------------------------------------------------------- + // type nulls min / max + // txn.appId BINARY 0 "3ae45b72-24e1-865a-a211-3..." / "3ae45b72-24e1-865a-a211-3..." + // txn.version INT64 0 "4390" / "4390" + #[test] + fn test_replay_for_metadata() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let engine = SyncEngine::new(); + + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + let data: Vec<_> = snapshot + .log_segment() + .read_pm_batches(&engine) + .unwrap() + .try_collect() + .unwrap(); + + // The checkpoint has five parts, each containing one action: + // 1. txn (physically missing P&M columns) + // 2. metaData + // 3. protocol + // 4. add + // 5. txn (physically missing P&M columns) + // + // The parquet reader should skip parts 1, 3, and 5. Note that the actual `read_metadata` + // always skips parts 4 and 5 because it terminates the iteration after finding both P&M. + // + // NOTE: Each checkpoint part is a single-row file -- guaranteed to produce one row group. + // + // WARNING: https://github.com/delta-io/delta-kernel-rs/issues/434 -- We currently + // read parts 1 and 5 (4 in all instead of 2) because row group skipping is disabled for + // missing columns, but can still skip part 3 because has valid nullcount stats for P&M. 
+ assert_eq!(data.len(), 4); + } +} diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 6986fc4262..e24d6333c3 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -1,79 +1,82 @@ +use std::sync::Arc; use std::sync::LazyLock; -use std::{path::PathBuf, sync::Arc}; -use futures::executor::block_on; use itertools::Itertools; -use object_store::{memory::InMemory, path::Path, ObjectStore}; -use test_log::test; +use rstest::rstest; use url::Url; use crate::actions::visitors::AddVisitor; use crate::actions::{ - get_all_actions_schema, get_commit_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, REMOVE_NAME, - SIDECAR_NAME, + get_all_actions_schema, get_commit_schema, Add, Sidecar, ADD_NAME, DOMAIN_METADATA_NAME, + METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, SET_TRANSACTION_NAME, SIDECAR_NAME, }; +use crate::arrow::array::StringArray; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; use crate::engine::default::filesystem::ObjectStoreStorageHandler; -use crate::engine::default::DefaultEngine; +use crate::engine::default::DefaultEngineBuilder; +use crate::engine::sync::json::SyncJsonHandler; use crate::engine::sync::SyncEngine; +use crate::expressions::ColumnName; use crate::last_checkpoint_hint::LastCheckpointHint; use crate::log_replay::ActionsBatch; -use crate::log_segment::{ListedLogFiles, LogSegment}; +use crate::log_segment::LogSegment; +use crate::log_segment_files::LogSegmentFiles; +use crate::object_store::{memory::InMemory, path::Path, ObjectStore}; use crate::parquet::arrow::ArrowWriter; use crate::path::{LogPathFileType, ParsedLogPath}; use crate::scan::test_utils::{ add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths, + sidecar_batch_with_given_paths_and_sizes, }; +use crate::schema::{DataType, StructField, StructType}; +use crate::utils::test_utils::string_array_to_engine_data; use crate::utils::test_utils::{assert_batch_matches, assert_result_error_with_message, Action}; use crate::{ - DeltaResult, Engine as _, EngineData, Expression, FileMeta, PredicateRef, RowVisitor, Snapshot, - StorageHandler, + DeltaResult, Engine as _, EngineData, Expression, FileMeta, JsonHandler, Predicate, + PredicateRef, RowVisitor, StorageHandler, +}; +use test_utils::{ + compacted_log_path_for_versions, delta_path_for_version, staged_commit_path_for_version, }; -use test_utils::{compacted_log_path_for_versions, delta_path_for_version}; use super::*; -// NOTE: In addition to testing the meta-predicate for metadata replay, this test also verifies -// that the parquet reader properly infers nullcount = rowcount for missing columns. The two -// checkpoint part files that contain transaction app ids have truncated schemas that would -// otherwise fail skipping due to their missing nullcount stat: -// -// Row group 0: count: 1 total(compressed): 111 B total(uncompressed):107 B -// -------------------------------------------------------------------------------- -// type nulls min / max -// txn.appId BINARY 0 "3ae45b72-24e1-865a-a211-3..." / "3ae45b72-24e1-865a-a211-3..." 
-// txn.version INT64 0 "4390" / "4390" -#[test] -fn test_replay_for_metadata() { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); - let url = url::Url::from_directory_path(path.unwrap()).unwrap(); - let engine = SyncEngine::new(); - - let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); - let data: Vec<_> = snapshot - .log_segment() - .replay_for_metadata(&engine) - .unwrap() - .try_collect() - .unwrap(); +use crate::actions::visitors::SidecarVisitor; +use crate::ParquetHandler; + +/// Processes sidecar files for the given checkpoint batch. +/// +/// This function extracts any sidecar file references from the provided batch. +/// Each sidecar file is read and an iterator of file action batches is returned. +fn process_sidecars( + parquet_handler: Arc, + log_root: Url, + batch: &dyn EngineData, + checkpoint_read_schema: SchemaRef, + meta_predicate: Option, +) -> DeltaResult>> + Send>> { + // Visit the rows of the checkpoint batch to extract sidecar file references + let mut visitor = SidecarVisitor::default(); + visitor.visit_rows_of(batch)?; + + // If there are no sidecar files, return early + if visitor.sidecars.is_empty() { + return Ok(None); + } - // The checkpoint has five parts, each containing one action: - // 1. txn (physically missing P&M columns) - // 2. metaData - // 3. protocol - // 4. add - // 5. txn (physically missing P&M columns) - // - // The parquet reader should skip parts 1, 3, and 5. Note that the actual `read_metadata` - // always skips parts 4 and 5 because it terminates the iteration after finding both P&M. - // - // NOTE: Each checkpoint part is a single-row file -- guaranteed to produce one row group. - // - // WARNING: https://github.com/delta-io/delta-kernel-rs/issues/434 -- We currently - // read parts 1 and 5 (4 in all instead of 2) because row group skipping is disabled for - // missing columns, but can still skip part 3 because has valid nullcount stats for P&M. - assert_eq!(data.len(), 4); + let sidecar_files: Vec<_> = visitor + .sidecars + .iter() + .map(|sidecar| sidecar.to_filemeta(&log_root)) + .try_collect()?; + + // Read the sidecar files and return an iterator of sidecar file batches + Ok(Some(parquet_handler.read_parquet_files( + &sidecar_files, + checkpoint_read_schema, + meta_predicate, + )?)) } // get an ObjectStore path for a checkpoint file, based on version, part number, and total number of parts @@ -85,7 +88,7 @@ fn delta_path_for_multipart_checkpoint(version: u64, part_num: u32, num_parts: u // Utility method to build a log using a list of log paths and an optional checkpoint hint. The // LastCheckpointHint is written to `_delta_log/_last_checkpoint`. 
-fn build_log_with_paths_and_checkpoint( +async fn build_log_with_paths_and_checkpoint( paths: &[Path], checkpoint_metadata: Option<&LastCheckpointHint>, ) -> (Box, Url) { @@ -94,27 +97,26 @@ fn build_log_with_paths_and_checkpoint( let data = bytes::Bytes::from("kernel-data"); // add log files to store - block_on(async { - for path in paths { - store - .put(path, data.clone().into()) - .await - .expect("put log file in store"); - } - if let Some(checkpoint_metadata) = checkpoint_metadata { - let checkpoint_str = - serde_json::to_string(checkpoint_metadata).expect("Serialize checkpoint"); - store - .put( - &Path::from("_delta_log/_last_checkpoint"), - checkpoint_str.into(), - ) - .await - .expect("Write _last_checkpoint"); - } - }); + for path in paths { + store + .put(path, data.clone().into()) + .await + .expect("put log file in store"); + } + if let Some(checkpoint_metadata) = checkpoint_metadata { + let checkpoint_str = + serde_json::to_string(checkpoint_metadata).expect("Serialize checkpoint"); + store + .put( + &Path::from("_delta_log/_last_checkpoint"), + checkpoint_str.into(), + ) + .await + .expect("Write _last_checkpoint"); + } - let storage = ObjectStoreStorageHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + let storage = + ObjectStoreStorageHandler::new(store, Arc::new(TokioBackgroundExecutor::new()), None); let table_root = Url::parse("memory:///").expect("valid url"); let log_root = table_root.join("_delta_log/").unwrap(); @@ -133,7 +135,7 @@ fn new_in_memory_store() -> (Arc, Url) { } // Writes a record batch obtained from engine data to the in-memory store at a given path. -fn write_parquet_to_store( +async fn write_parquet_to_store( store: &Arc, path: String, data: Box, @@ -146,36 +148,43 @@ fn write_parquet_to_store( writer.write(record_batch)?; writer.close()?; - block_on(async { store.put(&Path::from(path), buffer.into()).await })?; + store.put(&Path::from(path), buffer.into()).await?; Ok(()) } /// Writes all actions to a _delta_log parquet checkpoint file in the store. /// This function formats the provided filename into the _delta_log directory. -pub(crate) fn add_checkpoint_to_store( +pub(crate) async fn add_checkpoint_to_store( store: &Arc, data: Box, filename: &str, ) -> DeltaResult<()> { let path = format!("_delta_log/{filename}"); - write_parquet_to_store(store, path, data) + write_parquet_to_store(store, path, data).await } -/// Writes all actions to a _delta_log/_sidecars file in the store. +/// Writes all actions to a _delta_log/_sidecars file in the store and return the [`FileMeta`]. /// This function formats the provided filename into the _sidecars subdirectory. -fn add_sidecar_to_store( +async fn add_sidecar_to_store( store: &Arc, data: Box, filename: &str, -) -> DeltaResult<()> { +) -> DeltaResult { let path = format!("_delta_log/_sidecars/{filename}"); - write_parquet_to_store(store, path, data) + write_parquet_to_store(store, path.clone(), data).await?; + let size = get_file_size(store, &path).await; + let location = Url::parse(&format!("memory:///{path}")).expect("valid url"); + Ok(FileMeta { + location, + last_modified: 0, + size, + }) } /// Writes all actions to a _delta_log json checkpoint file in the store. /// This function formats the provided filename into the _delta_log directory. 
-fn write_json_to_store( +async fn write_json_to_store( store: &Arc, actions: Vec, filename: &str, @@ -187,29 +196,35 @@ fn write_json_to_store( let content = json_lines.join("\n"); let checkpoint_path = format!("_delta_log/{filename}"); - tokio::runtime::Runtime::new() - .expect("create tokio runtime") - .block_on(async { - store - .put(&Path::from(checkpoint_path), content.into()) - .await - })?; + store + .put(&Path::from(checkpoint_path), content.into()) + .await?; Ok(()) } fn create_log_path(path: &str) -> ParsedLogPath { + create_log_path_with_size(path, 0) +} + +fn create_log_path_with_size(path: &str, size: u64) -> ParsedLogPath { ParsedLogPath::try_from(FileMeta { location: Url::parse(path).expect("Invalid file URL"), last_modified: 0, - size: 0, + size, }) .unwrap() .unwrap() } -#[test] -fn build_snapshot_with_uuid_checkpoint_parquet() { +/// Gets the file size from the store for use in FileMeta +async fn get_file_size(store: &Arc, path: &str) -> u64 { + let object_meta = store.head(&Path::from(path)).await.unwrap(); + object_meta.size +} + +#[tokio::test] +async fn build_snapshot_with_uuid_checkpoint_parquet() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -223,7 +238,8 @@ fn build_snapshot_with_uuid_checkpoint_parquet() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -233,8 +249,8 @@ fn build_snapshot_with_uuid_checkpoint_parquet() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 5); @@ -244,8 +260,8 @@ fn build_snapshot_with_uuid_checkpoint_parquet() { assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_with_uuid_checkpoint_json() { +#[tokio::test] +async fn build_snapshot_with_uuid_checkpoint_json() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -259,7 +275,8 @@ fn build_snapshot_with_uuid_checkpoint_json() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -269,8 +286,8 @@ fn build_snapshot_with_uuid_checkpoint_json() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 5); @@ -280,8 +297,8 @@ fn build_snapshot_with_uuid_checkpoint_json() { assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_with_correct_last_uuid_checkpoint() { +#[tokio::test] +async fn build_snapshot_with_correct_last_uuid_checkpoint() { let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, @@ -290,6 +307,7 @@ fn build_snapshot_with_correct_last_uuid_checkpoint() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -307,7 +325,8 @@ fn build_snapshot_with_correct_last_uuid_checkpoint() { delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = 
LogSegment::for_snapshot_impl( storage.as_ref(), @@ -317,8 +336,8 @@ fn build_snapshot_with_correct_last_uuid_checkpoint() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(commit_files.len(), 2); @@ -326,8 +345,9 @@ fn build_snapshot_with_correct_last_uuid_checkpoint() { assert_eq!(commit_files[0].version, 6); assert_eq!(commit_files[1].version, 7); } -#[test] -fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { + +#[tokio::test] +async fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -350,7 +370,8 @@ fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -360,8 +381,8 @@ fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 4); assert_eq!(checkpoint_parts[0].version, 3); @@ -371,8 +392,8 @@ fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_with_out_of_date_last_checkpoint() { +#[tokio::test] +async fn build_snapshot_with_out_of_date_last_checkpoint() { let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, @@ -381,6 +402,7 @@ fn build_snapshot_with_out_of_date_last_checkpoint() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -395,7 +417,8 @@ fn build_snapshot_with_out_of_date_last_checkpoint() { delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -405,8 +428,8 @@ fn build_snapshot_with_out_of_date_last_checkpoint() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(commit_files.len(), 2); @@ -414,8 +437,9 @@ fn build_snapshot_with_out_of_date_last_checkpoint() { assert_eq!(commit_files[0].version, 6); assert_eq!(commit_files[1].version, 7); } -#[test] -fn build_snapshot_with_correct_last_multipart_checkpoint() { + +#[tokio::test] +async fn build_snapshot_with_correct_last_multipart_checkpoint() { let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, @@ -424,6 +448,7 @@ fn build_snapshot_with_correct_last_multipart_checkpoint() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -443,7 +468,8 @@ fn build_snapshot_with_correct_last_multipart_checkpoint() { delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = 
LogSegment::for_snapshot_impl( storage.as_ref(), @@ -453,8 +479,8 @@ fn build_snapshot_with_correct_last_multipart_checkpoint() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 3); assert_eq!(commit_files.len(), 2); @@ -463,8 +489,8 @@ fn build_snapshot_with_correct_last_multipart_checkpoint() { assert_eq!(commit_files[1].version, 7); } -#[test] -fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { +#[tokio::test] +async fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, @@ -473,6 +499,7 @@ fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -492,7 +519,8 @@ fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -506,8 +534,9 @@ fn build_snapshot_with_missing_checkpoint_part_from_hint_fails() { "Invalid Checkpoint: Had a _last_checkpoint hint but didn't find any checkpoints", ) } -#[test] -fn build_snapshot_with_bad_checkpoint_hint_fails() { + +#[tokio::test] +async fn build_snapshot_with_bad_checkpoint_hint_fails() { let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, @@ -516,6 +545,7 @@ fn build_snapshot_with_bad_checkpoint_hint_fails() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -534,7 +564,8 @@ fn build_snapshot_with_bad_checkpoint_hint_fails() { delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -550,8 +581,8 @@ fn build_snapshot_with_bad_checkpoint_hint_fails() { ) } -#[test] -fn build_snapshot_with_missing_checkpoint_part_no_hint() { +#[tokio::test] +async fn build_snapshot_with_missing_checkpoint_part_no_hint() { // Part 2 of 3 is missing from checkpoint 5. The Snapshot should be made of checkpoint // number 3 and commit files 4 to 7. 
let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -571,7 +602,8 @@ fn build_snapshot_with_missing_checkpoint_part_no_hint() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -582,8 +614,8 @@ fn build_snapshot_with_missing_checkpoint_part_no_hint() { ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 3); @@ -593,8 +625,8 @@ fn build_snapshot_with_missing_checkpoint_part_no_hint() { assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpoint() { +#[tokio::test] +async fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpoint() { // When the _last_checkpoint is out of date and the most recent checkpoint is incomplete, the // Snapshot should be made of the most recent complete checkpoint and the commit files that // follow it. @@ -606,6 +638,7 @@ fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpo num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -623,7 +656,8 @@ fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpo delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -633,8 +667,8 @@ fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpo None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 3); @@ -644,8 +678,8 @@ fn build_snapshot_with_out_of_date_last_checkpoint_and_incomplete_recent_checkpo assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_without_checkpoints() { +#[tokio::test] +async fn build_snapshot_without_checkpoints() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -661,7 +695,8 @@ fn build_snapshot_without_checkpoints() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; ///////// Specify no checkpoint or end version ///////// let log_segment = LogSegment::for_snapshot_impl( @@ -672,8 +707,8 @@ fn build_snapshot_without_checkpoints() { None, ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 5); @@ -692,8 +727,8 @@ fn build_snapshot_without_checkpoints() { Some(2), ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 
1); @@ -704,8 +739,8 @@ fn build_snapshot_without_checkpoints() { assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { +#[tokio::test] +async fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { let checkpoint_metadata = LastCheckpointHint { version: 5, size: 10, @@ -714,6 +749,7 @@ fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ @@ -730,7 +766,8 @@ fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -740,8 +777,8 @@ fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { Some(4), ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); assert_eq!(checkpoint_parts[0].version, 3); @@ -750,8 +787,8 @@ fn build_snapshot_with_checkpoint_greater_than_time_travel_version() { assert_eq!(commit_files[0].version, 4); } -#[test] -fn build_snapshot_with_start_checkpoint_and_time_travel_version() { +#[tokio::test] +async fn build_snapshot_with_start_checkpoint_and_time_travel_version() { let checkpoint_metadata = LastCheckpointHint { version: 3, size: 10, @@ -760,6 +797,7 @@ fn build_snapshot_with_start_checkpoint_and_time_travel_version() { num_of_add_files: None, checkpoint_schema: None, checksum: None, + tags: None, }; let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -774,7 +812,8 @@ fn build_snapshot_with_start_checkpoint_and_time_travel_version() { delta_path_for_version(7, "json"), ], Some(&checkpoint_metadata), - ); + ) + .await; let log_segment = LogSegment::for_snapshot_impl( storage.as_ref(), @@ -785,12 +824,69 @@ fn build_snapshot_with_start_checkpoint_and_time_travel_version() { ) .unwrap(); - assert_eq!(log_segment.checkpoint_parts[0].version, 3); - assert_eq!(log_segment.ascending_commit_files.len(), 1); - assert_eq!(log_segment.ascending_commit_files[0].version, 4); + assert_eq!(log_segment.listed.checkpoint_parts[0].version, 3); + assert_eq!(log_segment.listed.ascending_commit_files.len(), 1); + assert_eq!(log_segment.listed.ascending_commit_files[0].version, 4); } -#[test] -fn build_table_changes_with_commit_versions() { + +#[rstest::rstest] +#[case::no_hint(None)] +#[case::stale_hint(Some(LastCheckpointHint { + version: 10, // stale: 10 > end_version 5, so it is discarded + size: 10, + parts: None, + size_in_bytes: None, + num_of_add_files: None, + checkpoint_schema: None, + checksum: None, + tags: None, +}))] +#[tokio::test] +async fn build_snapshot_time_travel_no_checkpoint_falls_back_to_v0( + #[case] hint: Option, +) { + let paths: Vec = (0..=5).map(|v| delta_path_for_version(v, "json")).collect(); + let (storage, log_root) = build_log_with_paths_and_checkpoint(&paths, None).await; + + let log_segment = + LogSegment::for_snapshot_impl(storage.as_ref(), log_root, vec![], hint, Some(5)).unwrap(); + + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; + + assert_eq!(checkpoint_parts.len(), 0); + let versions = commit_files.into_iter().map(|x| 
x.version).collect_vec(); + assert_eq!(versions, vec![0, 1, 2, 3, 4, 5]); +} + +#[tokio::test] +async fn build_snapshot_time_travel_no_hint_checkpoint_at_end_version_included() { + let (storage, log_root) = build_log_with_paths_and_checkpoint( + &[ + delta_path_for_version(0, "json"), + delta_path_for_version(1, "json"), + delta_path_for_version(2, "json"), + delta_path_for_version(3, "json"), + delta_path_for_version(4, "json"), + delta_path_for_version(5, "json"), + delta_path_for_version(5, "checkpoint.parquet"), + ], + None, + ) + .await; + + let log_segment = + LogSegment::for_snapshot_impl(storage.as_ref(), log_root, vec![], None, Some(5)).unwrap(); + + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; + assert_eq!(checkpoint_parts.len(), 1); + assert_eq!(checkpoint_parts[0].version, 5); + assert_eq!(commit_files.len(), 0); +} + +#[tokio::test] +async fn build_table_changes_with_commit_versions() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -806,14 +902,15 @@ fn build_table_changes_with_commit_versions() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; ///////// Specify start version and end version ///////// let log_segment = LogSegment::for_table_changes(storage.as_ref(), log_root.clone(), 2, 5).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; // Checkpoints should be omitted assert_eq!(checkpoint_parts.len(), 0); @@ -827,8 +924,8 @@ fn build_table_changes_with_commit_versions() { let log_segment = LogSegment::for_table_changes(storage.as_ref(), log_root.clone(), 0, Some(0)).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; // Checkpoints should be omitted assert_eq!(checkpoint_parts.len(), 0); @@ -838,8 +935,8 @@ fn build_table_changes_with_commit_versions() { ///////// Specify no start or end version ///////// let log_segment = LogSegment::for_table_changes(storage.as_ref(), log_root, 0, None).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; // Checkpoints should be omitted assert_eq!(checkpoint_parts.len(), 0); @@ -850,8 +947,8 @@ fn build_table_changes_with_commit_versions() { assert_eq!(versions, expected_versions); } -#[test] -fn test_non_contiguous_log() { +#[tokio::test] +async fn test_non_contiguous_log() { // Commit with version 1 is missing let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ @@ -859,13 +956,14 @@ fn test_non_contiguous_log() { delta_path_for_version(2, "json"), ], None, - ); + ) + .await; let log_segment_res = LogSegment::for_table_changes(storage.as_ref(), log_root.clone(), 0, None); // check the error message up to the timestamp - let expected_error_pattern = "Generic delta kernel error: Expected ordered contiguous \ - commit files [ParsedLogPath { location: FileMeta { location: Url { scheme: \"memory\", \ + let expected_error_pattern = "Generic delta kernel error: Expected contiguous commit files, \ + but 
found gap: ParsedLogPath { location: FileMeta { location: Url { scheme: \"memory\", \ cannot_be_a_base: false, username: \"\", password: None, host: None, port: None, path: \ \"/_delta_log/00000000000000000000.json\", query: None, fragment: None }, last_modified:"; assert_result_error_with_message(log_segment_res, expected_error_pattern); @@ -885,8 +983,8 @@ fn test_non_contiguous_log() { ); } -#[test] -fn table_changes_fails_with_larger_start_version_than_end() { +#[tokio::test] +async fn table_changes_fails_with_larger_start_version_than_end() { // Commit with version 1 is missing let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ @@ -894,43 +992,40 @@ fn table_changes_fails_with_larger_start_version_than_end() { delta_path_for_version(1, "json"), ], None, - ); + ) + .await; let log_segment_res = LogSegment::for_table_changes(storage.as_ref(), log_root, 1, Some(0)); assert_result_error_with_message(log_segment_res, "Generic delta kernel error: Failed to build LogSegment: start_version cannot be greater than end_version"); } -#[test] -fn test_sidecar_to_filemeta_valid_paths() -> DeltaResult<()> { + +#[test_log::test(rstest::rstest)] +#[case::simple_path("example.parquet", "file:///var/_delta_log/_sidecars/example.parquet")] +#[case::full_path( + "file:///var/_delta_log/_sidecars/example.parquet", + "file:///var/_delta_log/_sidecars/example.parquet" +)] +#[case::nested_path( + "test/test/example.parquet", + "file:///var/_delta_log/_sidecars/test/test/example.parquet" +)] +fn test_sidecar_to_filemeta_valid_paths( + #[case] input_path: &str, + #[case] expected_url: &str, +) -> DeltaResult<()> { let log_root = Url::parse("file:///var/_delta_log/")?; - let test_cases = [ - ( - "example.parquet", - "file:///var/_delta_log/_sidecars/example.parquet", - ), - ( - "file:///var/_delta_log/_sidecars/example.parquet", - "file:///var/_delta_log/_sidecars/example.parquet", - ), - ( - "test/test/example.parquet", - "file:///var/_delta_log/_sidecars/test/test/example.parquet", - ), - ]; - - for (input_path, expected_url) in test_cases.into_iter() { - let sidecar = Sidecar { - path: expected_url.to_string(), - modification_time: 0, - size_in_bytes: 1000, - tags: None, - }; + let sidecar = Sidecar { + path: expected_url.to_string(), + modification_time: 0, + size_in_bytes: 1000, + tags: None, + }; - let filemeta = sidecar.to_filemeta(&log_root)?; - assert_eq!( - filemeta.location.as_str(), - expected_url, - "Mismatch for input path: {input_path}" - ); - } + let filemeta = sidecar.to_filemeta(&log_root)?; + assert_eq!( + filemeta.location.as_str(), + expected_url, + "Mismatch for input path: {input_path}" + ); Ok(()) } @@ -940,7 +1035,7 @@ fn test_checkpoint_batch_with_no_sidecars_returns_none() -> DeltaResult<()> { let engine = Arc::new(SyncEngine::new()); let checkpoint_batch = add_batch_simple(get_all_actions_schema().clone()); - let mut iter = LogSegment::process_sidecars( + let mut iter = process_sidecars( engine.parquet_handler(), log_root, checkpoint_batch.as_ref(), @@ -956,29 +1051,37 @@ fn test_checkpoint_batch_with_no_sidecars_returns_none() -> DeltaResult<()> { Ok(()) } -#[test] -fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult<()> { +#[tokio::test] +async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); let 
read_schema = get_all_actions_schema().project(&[ADD_NAME, REMOVE_NAME, SIDECAR_NAME])?; - add_sidecar_to_store( + let sidecar1_size = add_sidecar_to_store( &store, add_batch_simple(read_schema.clone()), "sidecarfile1.parquet", - )?; - add_sidecar_to_store( + ) + .await? + .size; + + let sidecar2_size = add_sidecar_to_store( &store, add_batch_with_remove(read_schema.clone()), "sidecarfile2.parquet", - )?; + ) + .await? + .size; - let checkpoint_batch = sidecar_batch_with_given_paths( - vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], + let checkpoint_batch = sidecar_batch_with_given_paths_and_sizes( + vec![ + ("sidecarfile1.parquet", sidecar1_size), + ("sidecarfile2.parquet", sidecar2_size), + ], read_schema.clone(), ); - let mut iter = LogSegment::process_sidecars( + let mut iter = process_sidecars( engine.parquet_handler(), log_root, checkpoint_batch.as_ref(), @@ -999,14 +1102,14 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult< #[test] fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); let checkpoint_batch = sidecar_batch_with_given_paths( vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], get_all_actions_schema().clone(), ); - let mut iter = LogSegment::process_sidecars( + let mut iter = process_sidecars( engine.parquet_handler(), log_root, checkpoint_batch.as_ref(), @@ -1023,21 +1126,25 @@ fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<( Ok(()) } -#[test] -fn test_reading_sidecar_files_with_predicate() -> DeltaResult<()> { +#[tokio::test] +async fn test_reading_sidecar_files_with_predicate() -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); let read_schema = get_all_actions_schema().project(&[ADD_NAME, REMOVE_NAME, SIDECAR_NAME])?; - let checkpoint_batch = - sidecar_batch_with_given_paths(vec!["sidecarfile1.parquet"], read_schema.clone()); - // Add a sidecar file with only add actions - add_sidecar_to_store( + let sidecar_size = add_sidecar_to_store( &store, add_batch_simple(read_schema.clone()), "sidecarfile1.parquet", - )?; + ) + .await? 
+ .size; + + let checkpoint_batch = sidecar_batch_with_given_paths_and_sizes( + vec![("sidecarfile1.parquet", sidecar_size)], + read_schema.clone(), + ); // Filter out sidecar files that do not contain remove actions let remove_predicate: LazyLock> = LazyLock::new(|| { @@ -1046,7 +1153,7 @@ fn test_reading_sidecar_files_with_predicate() -> DeltaResult<()> { )) }); - let mut iter = LogSegment::process_sidecars( + let mut iter = process_sidecars( engine.parquet_handler(), log_root, checkpoint_batch.as_ref(), @@ -1062,17 +1169,18 @@ fn test_reading_sidecar_files_with_predicate() -> DeltaResult<()> { Ok(()) } -#[test] -fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( +#[tokio::test] +async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( ) -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); add_checkpoint_to_store( &store, // Create a checkpoint batch with sidecar actions to verify that the sidecar actions are not read. sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], get_commit_schema().clone()), "00000000000000000001.checkpoint.parquet", - )?; + ) + .await?; let checkpoint_one_file = log_root .join("00000000000000000001.checkpoint.parquet")? @@ -1081,18 +1189,23 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_ let v2_checkpoint_read_schema = get_commit_schema().project(&[METADATA_NAME])?; let log_segment = LogSegment::try_new( - ListedLogFiles::try_new( - vec![], - vec![], - vec![create_log_path(&checkpoint_one_file)], - None, - Some(create_log_path("file:///00000000000000000001.json")), - )?, + LogSegmentFiles { + checkpoint_parts: vec![create_log_path(&checkpoint_one_file)], + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, log_root, None, + None, + )?; + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, // meta_predicate + None, // stats_schema + None, // partition_schema )?; - let mut iter = - log_segment.create_checkpoint_stream(&engine, v2_checkpoint_read_schema.clone(), None)?; + let mut iter = checkpoint_result.actions; // Assert that the first batch returned is from reading checkpoint file 1 let ActionsBatch { @@ -1109,11 +1222,11 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_ Ok(()) } -#[test] -fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part( +#[tokio::test] +async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part( ) -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Multi-part checkpoints should never contain sidecar actions. 
// This test intentionally includes batches with sidecar actions in multi-part checkpoints @@ -1128,12 +1241,17 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul &store, sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], get_all_actions_schema().clone()), checkpoint_part_1, - )?; + ) + .await?; add_checkpoint_to_store( &store, sidecar_batch_with_given_paths(vec!["sidecar2.parquet"], get_all_actions_schema().clone()), checkpoint_part_2, - )?; + ) + .await?; + + let cp1_size = get_file_size(&store, &format!("_delta_log/{checkpoint_part_1}")).await; + let cp2_size = get_file_size(&store, &format!("_delta_log/{checkpoint_part_2}")).await; let checkpoint_one_file = log_root.join(checkpoint_part_1)?.to_string(); let checkpoint_two_file = log_root.join(checkpoint_part_2)?.to_string(); @@ -1141,21 +1259,26 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul let v2_checkpoint_read_schema = get_commit_schema().project(&[ADD_NAME])?; let log_segment = LogSegment::try_new( - ListedLogFiles::try_new( - vec![], - vec![], - vec![ - create_log_path(&checkpoint_one_file), - create_log_path(&checkpoint_two_file), + LogSegmentFiles { + checkpoint_parts: vec![ + create_log_path_with_size(&checkpoint_one_file, cp1_size), + create_log_path_with_size(&checkpoint_two_file, cp2_size), ], - None, - Some(create_log_path("file:///00000000000000000001.json")), - )?, + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, log_root, None, + None, + )?; + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, // meta_predicate + None, // stats_schema + None, // partition_schema )?; - let mut iter = - log_segment.create_checkpoint_stream(&engine, v2_checkpoint_read_schema.clone(), None)?; + let mut iter = checkpoint_result.actions; // Assert the correctness of batches returned for expected_sidecar in ["sidecar1.parquet", "sidecar2.parquet"].iter() { @@ -1177,37 +1300,50 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul Ok(()) } -#[test] -fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars() -> DeltaResult<()> -{ +#[tokio::test] +async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars( +) -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); add_checkpoint_to_store( &store, add_batch_simple(get_commit_schema().clone()), "00000000000000000001.checkpoint.parquet", - )?; + ) + .await?; let checkpoint_one_file = log_root .join("00000000000000000001.checkpoint.parquet")? 
.to_string(); + // Get the actual file size for proper footer reading + let checkpoint_size = + get_file_size(&store, "_delta_log/00000000000000000001.checkpoint.parquet").await; + let v2_checkpoint_read_schema = get_all_actions_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let log_segment = LogSegment::try_new( - ListedLogFiles::try_new( - vec![], - vec![], - vec![create_log_path(&checkpoint_one_file)], - None, - Some(create_log_path("file:///00000000000000000001.json")), - )?, + LogSegmentFiles { + checkpoint_parts: vec![create_log_path_with_size( + &checkpoint_one_file, + checkpoint_size, + )], + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, log_root, None, + None, )?; - let mut iter = - log_segment.create_checkpoint_stream(&engine, v2_checkpoint_read_schema.clone(), None)?; + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, // meta_predicate + None, // stats_schema + None, // partition_schema + )?; + let mut iter = checkpoint_result.actions; // Assert that the first batch returned is from reading checkpoint file 1 let ActionsBatch { @@ -1221,10 +1357,11 @@ fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars Ok(()) } -#[test] -fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() -> DeltaResult<()> { +#[tokio::test] +async fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars( +) -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); let filename = "00000000000000000010.checkpoint.80a083e8-7026-4e79-81be-64bd76c43a11.json"; @@ -1236,25 +1373,31 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() ..Default::default() })], filename, - )?; + ) + .await?; let checkpoint_one_file = log_root.join(filename)?.to_string(); let v2_checkpoint_read_schema = get_all_actions_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let log_segment = LogSegment::try_new( - ListedLogFiles::try_new( - vec![], - vec![], - vec![create_log_path(&checkpoint_one_file)], - None, - Some(create_log_path("file:///00000000000000000001.json")), - )?, + LogSegmentFiles { + checkpoint_parts: vec![create_log_path(&checkpoint_one_file)], + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, log_root, None, + None, + )?; + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema, + None, // meta_predicate + None, // stats_schema + None, // partition_schema )?; - let mut iter = - log_segment.create_checkpoint_stream(&engine, v2_checkpoint_read_schema, None)?; + let mut iter = checkpoint_result.actions; // Assert that the first batch returned is from reading checkpoint file 1 let ActionsBatch { @@ -1278,51 +1421,75 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() // - As sidecar references are present, the corresponding sidecar files are processed correctly. // - Batches from both the checkpoint file and sidecar files are returned. 
// - Each returned batch is correctly flagged with is_log_batch set to false -#[test] -fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( +#[tokio::test] +async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( ) -> DeltaResult<()> { let (store, log_root) = new_in_memory_store(); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); - - add_checkpoint_to_store( - &store, - sidecar_batch_with_given_paths( - vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], - get_all_actions_schema().clone(), - ), - "00000000000000000001.checkpoint.parquet", - )?; + let engine = DefaultEngineBuilder::new(store.clone()).build(); - add_sidecar_to_store( + // Write sidecars first so we can get their actual sizes + let sidecar1_size = add_sidecar_to_store( &store, add_batch_simple(get_commit_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile1.parquet", - )?; - add_sidecar_to_store( + ) + .await? + .size; + + let sidecar2_size = add_sidecar_to_store( &store, add_batch_with_remove(get_commit_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile2.parquet", - )?; + ) + .await? + .size; + + // Now create checkpoint with correct sidecar sizes + add_checkpoint_to_store( + &store, + sidecar_batch_with_given_paths_and_sizes( + vec![ + ("sidecarfile1.parquet", sidecar1_size), + ("sidecarfile2.parquet", sidecar2_size), + ], + get_all_actions_schema().clone(), + ), + "00000000000000000001.checkpoint.parquet", + ) + .await?; let checkpoint_file_path = log_root .join("00000000000000000001.checkpoint.parquet")? .to_string(); - let v2_checkpoint_read_schema = get_all_actions_schema().project(&[ADD_NAME])?; + // Get the actual file size for proper footer reading + let checkpoint_size = + get_file_size(&store, "_delta_log/00000000000000000001.checkpoint.parquet").await; + + // Sidecar batches now use the same schema as checkpoint (including sidecar column) + let v2_checkpoint_read_schema = get_all_actions_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let log_segment = LogSegment::try_new( - ListedLogFiles::try_new( - vec![], - vec![], - vec![create_log_path(&checkpoint_file_path)], - None, - Some(create_log_path("file:///00000000000000000001.json")), - )?, + LogSegmentFiles { + checkpoint_parts: vec![create_log_path_with_size( + &checkpoint_file_path, + checkpoint_size, + )], + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, log_root, None, + None, )?; - let mut iter = - log_segment.create_checkpoint_stream(&engine, v2_checkpoint_read_schema.clone(), None)?; + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, // meta_predicate + None, // stats_schema + None, // partition_schema + )?; + let mut iter = checkpoint_result.actions; // Assert that the first batch returned is from reading checkpoint file 1 let ActionsBatch { @@ -1334,8 +1501,11 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch // verify no behavior change. 
assert_batch_matches( first_batch, - sidecar_batch_with_given_paths( - vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], + sidecar_batch_with_given_paths_and_sizes( + vec![ + ("sidecarfile1.parquet", sidecar1_size), + ("sidecarfile2.parquet", sidecar2_size), + ], get_all_actions_schema().project(&[ADD_NAME, SIDECAR_NAME])?, ), ); @@ -1366,40 +1536,61 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch Ok(()) } -fn create_segment_for( - commit_versions: &[u64], - compaction_versions: &[(u64, u64)], +#[derive(Default)] +struct LogSegmentConfig<'a> { + published_commit_versions: &'a [u64], + staged_commit_versions: &'a [u64], + compaction_versions: &'a [(u64, u64)], checkpoint_version: Option, version_to_load: Option, -) -> LogSegment { - let mut paths: Vec = commit_versions +} + +async fn create_segment_for(segment: LogSegmentConfig<'_>) -> LogSegment { + let mut paths: Vec = segment + .published_commit_versions .iter() .map(|version| delta_path_for_version(*version, "json")) .chain( - compaction_versions + segment + .compaction_versions .iter() .map(|(start, end)| compacted_log_path_for_versions(*start, *end, "json")), ) .collect(); - if let Some(version) = checkpoint_version { + if let Some(version) = segment.checkpoint_version { paths.push(delta_path_for_version( version, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json", )); } - let (storage, log_root) = build_log_with_paths_and_checkpoint(&paths, None); + let (storage, log_root) = build_log_with_paths_and_checkpoint(&paths, None).await; + let table_root = Url::parse("memory:///").expect("valid url"); + let staged_commits_log_tail: Vec = segment + .staged_commit_versions + .iter() + .map(|version| staged_commit_path_for_version(*version)) + .map(|path| { + ParsedLogPath::try_from(FileMeta { + location: table_root.join(path.as_ref()).unwrap(), + last_modified: 0, + size: 0, + }) + .unwrap() + .unwrap() + }) + .collect(); LogSegment::for_snapshot_impl( storage.as_ref(), log_root.clone(), - vec![], // log_tail + staged_commits_log_tail, None, - version_to_load, + segment.version_to_load, ) .unwrap() } -#[test] -fn test_list_log_files_with_version() -> DeltaResult<()> { +#[tokio::test] +async fn test_list_log_files_with_version() -> DeltaResult<()> { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -1409,8 +1600,9 @@ fn test_list_log_files_with_version() -> DeltaResult<()> { delta_path_for_version(2, "json"), ], None, - ); - let result = ListedLogFiles::list( + ) + .await; + let result = LogSegmentFiles::list( storage.as_ref(), &log_root, vec![], // log_tail @@ -1429,18 +1621,20 @@ fn test_list_log_files_with_version() -> DeltaResult<()> { Ok(()) } -fn test_compaction_listing( +async fn test_compaction_listing( commit_versions: &[u64], compaction_versions: &[(u64, u64)], checkpoint_version: Option, version_to_load: Option, ) { - let log_segment = create_segment_for( - commit_versions, + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: commit_versions, compaction_versions, checkpoint_version, version_to_load, - ); + ..Default::default() + }) + .await; let version_to_load = version_to_load.unwrap_or(u64::MAX); let checkpoint_cuttoff = checkpoint_version.map(|v| v as i64).unwrap_or(-1); let expected_commit_versions: Vec<&u64> = commit_versions @@ -1453,15 +1647,16 @@ fn test_compaction_listing( .collect(); assert_eq!( - log_segment.ascending_commit_files.len(), + log_segment.listed.ascending_commit_files.len(), 
expected_commit_versions.len() ); assert_eq!( - log_segment.ascending_compaction_files.len(), + log_segment.listed.ascending_compaction_files.len(), expected_compaction_versions.len() ); for (commit_file, expected_version) in log_segment + .listed .ascending_commit_files .iter() .zip(expected_commit_versions.iter()) @@ -1471,6 +1666,7 @@ fn test_compaction_listing( } for (compaction_file, (expected_start, expected_end)) in log_segment + .listed .ascending_compaction_files .iter() .zip(expected_compaction_versions.iter()) @@ -1488,84 +1684,92 @@ fn test_compaction_listing( } } -#[test] -fn test_compaction_simple() { +#[tokio::test] +async fn test_compaction_simple() { test_compaction_listing( &[0, 1, 2], &[(1, 2)], None, // checkpoint version None, // version to load - ); + ) + .await; } -#[test] -fn test_compaction_in_version_range() { +#[tokio::test] +async fn test_compaction_in_version_range() { test_compaction_listing( &[0, 1, 2, 3], &[(1, 2)], None, // checkpoint version Some(2), // version to load - ); + ) + .await; } -#[test] -fn test_compaction_out_of_version_range() { +#[tokio::test] +async fn test_compaction_out_of_version_range() { test_compaction_listing( &[0, 1, 2, 3, 4], &[(1, 3)], None, // checkpoint version Some(2), // version to load - ); + ) + .await; } -#[test] -fn test_multi_compaction() { +#[tokio::test] +async fn test_multi_compaction() { test_compaction_listing( &[0, 1, 2, 3, 4, 5], &[(1, 2), (3, 5)], None, // checkpoint version None, //version to load - ); + ) + .await; } -#[test] -fn test_multi_compaction_one_out_of_range() { +#[tokio::test] +async fn test_multi_compaction_one_out_of_range() { test_compaction_listing( &[0, 1, 2, 3, 4, 5], &[(1, 2), (3, 5)], None, // checkpoint version Some(4), // version to load - ); + ) + .await; } -#[test] -fn test_compaction_with_checkpoint() { +#[tokio::test] +async fn test_compaction_with_checkpoint() { test_compaction_listing( &[0, 1, 2, 4, 5], &[(1, 2), (4, 5)], Some(3), // checkpoint version None, // version to load - ); + ) + .await; } -#[test] -fn test_compaction_to_early_with_checkpoint() { +#[tokio::test] +async fn test_compaction_to_early_with_checkpoint() { test_compaction_listing( &[0, 1, 2, 4, 5], &[(1, 2)], Some(3), // checkpoint version None, // version to load - ); + ) + .await; } -#[test] -fn test_compaction_starts_at_checkpoint() { +#[tokio::test] +async fn test_compaction_starts_at_checkpoint() { test_compaction_listing( &[0, 1, 2, 4, 5], &[(3, 5)], Some(3), // checkpoint version None, // version to load - ); + ) + .await; } enum ExpectedFile { @@ -1573,19 +1777,21 @@ enum ExpectedFile { Compaction(Version, Version), } -fn test_commit_cover( +async fn test_commit_cover( commit_versions: &[u64], compaction_versions: &[(u64, u64)], checkpoint_version: Option, version_to_load: Option, expected_files: &[ExpectedFile], ) { - let log_segment = create_segment_for( - commit_versions, + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: commit_versions, compaction_versions, checkpoint_version, version_to_load, - ); + ..Default::default() + }) + .await; let cover = log_segment.find_commit_cover(); // our test-utils include "_delta_log" in the path, which is already in log_segment.log_root, so // we don't use them. 
TODO: Unify this @@ -1605,30 +1811,32 @@ fn test_commit_cover( } } -#[test] -fn test_commit_cover_one_compaction() { +#[tokio::test] +async fn test_commit_cover_one_compaction() { test_commit_cover( &[0, 1, 2], &[(1, 2)], None, // checkpoint version None, // version to load &[ExpectedFile::Compaction(1, 2), ExpectedFile::Commit(0)], - ); + ) + .await; } -#[test] -fn test_commit_cover_in_version_range() { +#[tokio::test] +async fn test_commit_cover_in_version_range() { test_commit_cover( &[0, 1, 2, 3], &[(1, 2)], None, // checkpoint version Some(2), // version to load &[ExpectedFile::Compaction(1, 2), ExpectedFile::Commit(0)], - ); + ) + .await; } -#[test] -fn test_commit_cover_out_of_version_range() { +#[tokio::test] +async fn test_commit_cover_out_of_version_range() { test_commit_cover( &[0, 1, 2, 3, 4], &[(1, 3)], @@ -1639,11 +1847,12 @@ fn test_commit_cover_out_of_version_range() { ExpectedFile::Commit(1), ExpectedFile::Commit(0), ], - ); + ) + .await; } -#[test] -fn test_commit_cover_multi_compaction() { +#[tokio::test] +async fn test_commit_cover_multi_compaction() { test_commit_cover( &[0, 1, 2, 3, 4, 5], &[(1, 2), (3, 5)], @@ -1654,11 +1863,12 @@ fn test_commit_cover_multi_compaction() { ExpectedFile::Compaction(1, 2), ExpectedFile::Commit(0), ], - ); + ) + .await; } -#[test] -fn test_commit_cover_multi_compaction_one_out_of_range() { +#[tokio::test] +async fn test_commit_cover_multi_compaction_one_out_of_range() { test_commit_cover( &[0, 1, 2, 3, 4, 5], &[(1, 2), (3, 5)], @@ -1670,44 +1880,48 @@ fn test_commit_cover_multi_compaction_one_out_of_range() { ExpectedFile::Compaction(1, 2), ExpectedFile::Commit(0), ], - ); + ) + .await; } -#[test] -fn test_commit_cover_compaction_with_checkpoint() { +#[tokio::test] +async fn test_commit_cover_compaction_with_checkpoint() { test_commit_cover( &[0, 1, 2, 4, 5], &[(1, 2), (4, 5)], Some(3), // checkpoint version None, // version to load &[ExpectedFile::Compaction(4, 5)], - ); + ) + .await; } -#[test] -fn test_commit_cover_too_early_with_checkpoint() { +#[tokio::test] +async fn test_commit_cover_too_early_with_checkpoint() { test_commit_cover( &[0, 1, 2, 4, 5], &[(1, 2)], Some(3), // checkpoint version None, // version to load &[ExpectedFile::Commit(5), ExpectedFile::Commit(4)], - ); + ) + .await; } -#[test] -fn test_commit_cover_starts_at_checkpoint() { +#[tokio::test] +async fn test_commit_cover_starts_at_checkpoint() { test_commit_cover( &[0, 1, 2, 4, 5], &[(3, 5)], Some(3), // checkpoint version None, // version to load &[ExpectedFile::Commit(5), ExpectedFile::Commit(4)], - ); + ) + .await; } -#[test] -fn test_commit_cover_wider_range() { +#[tokio::test] +async fn test_commit_cover_wider_range() { test_commit_cover( &Vec::from_iter(0..20), &[(0, 5), (0, 10), (5, 10), (13, 19)], @@ -1719,11 +1933,12 @@ fn test_commit_cover_wider_range() { ExpectedFile::Commit(11), ExpectedFile::Compaction(0, 10), ], - ); + ) + .await; } -#[test] -fn test_commit_cover_no_compactions() { +#[tokio::test] +async fn test_commit_cover_no_compactions() { test_commit_cover( &Vec::from_iter(0..4), &[], @@ -1735,11 +1950,12 @@ fn test_commit_cover_no_compactions() { ExpectedFile::Commit(1), ExpectedFile::Commit(0), ], - ); + ) + .await; } -#[test] -fn test_commit_cover_minimal_overlap() { +#[tokio::test] +async fn test_commit_cover_minimal_overlap() { test_commit_cover( &Vec::from_iter(0..6), &[(0, 2), (2, 5)], @@ -1751,147 +1967,311 @@ fn test_commit_cover_minimal_overlap() { ExpectedFile::Commit(3), ExpectedFile::Compaction(0, 2), ], - ); + ) + .await; } #[test] 
-#[cfg(debug_assertions)] -fn test_debug_assert_listed_log_file_in_order_compaction_files() { - let _ = ListedLogFiles::try_new( - vec![], - vec![ - create_log_path("file:///00000000000000000000.00000000000000000004.compacted.json"), - create_log_path("file:///00000000000000000001.00000000000000000002.compacted.json"), - ], - vec![], +fn test_validate_listed_log_file_in_order_compaction_files() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + ascending_commit_files: vec![create_log_path( + "file:///_delta_log/00000000000000000001.json", + )], + ascending_compaction_files: vec![ + create_log_path( + "file:///_delta_log/00000000000000000000.00000000000000000004.compacted.json", + ), + create_log_path( + "file:///_delta_log/00000000000000000001.00000000000000000002.compacted.json", + ), + ], + ..Default::default() + }, + log_root, None, - Some(create_log_path("file:///00000000000000000001.json")), - ); + None, + ) + .is_ok()); } #[test] -#[should_panic] -#[cfg(debug_assertions)] -fn test_debug_assert_listed_log_file_out_of_order_compaction_files() { - let _ = ListedLogFiles::try_new( - vec![], - vec![ - create_log_path("file:///00000000000000000000.00000000000000000004.compacted.json"), - create_log_path("file:///00000000000000000000.00000000000000000003.compacted.json"), - ], - vec![], +fn test_validate_listed_log_file_out_of_order_compaction_files() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + ascending_commit_files: vec![create_log_path( + "file:///_delta_log/00000000000000000001.json", + )], + ascending_compaction_files: vec![ + create_log_path( + "file:///_delta_log/00000000000000000000.00000000000000000004.compacted.json", + ), + create_log_path( + "file:///_delta_log/00000000000000000000.00000000000000000003.compacted.json", + ), + ], + ..Default::default() + }, + log_root, None, - Some(create_log_path("file:///00000000000000000001.json")), - ); + None, + ) + .is_err()); } #[test] -#[should_panic] -#[cfg(debug_assertions)] -fn test_debug_assert_listed_log_file_different_multipart_checkpoint_versions() { - let _ = ListedLogFiles::try_new( - vec![], - vec![], - vec![ - create_log_path("00000000000000000010.checkpoint.0000000001.0000000002.parquet"), - create_log_path("00000000000000000011.checkpoint.0000000002.0000000002.parquet"), - ], +fn test_validate_listed_log_file_different_multipart_checkpoint_versions() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![ + create_log_path( + "file:///_delta_log/00000000000000000010.checkpoint.0000000001.0000000002.parquet", + ), + create_log_path( + "file:///_delta_log/00000000000000000011.checkpoint.0000000002.0000000002.parquet", + ), + ], + ..Default::default() + }, + log_root, None, - Some(create_log_path("file:///00000000000000000001.json")), - ); + None, + ) + .is_err()); } #[test] -#[should_panic] -#[cfg(debug_assertions)] -fn test_debug_assert_listed_log_file_invalid_multipart_checkpoint() { - let _ = ListedLogFiles::try_new( - vec![], - vec![], - vec![ - create_log_path("00000000000000000010.checkpoint.0000000001.0000000003.parquet"), - create_log_path("00000000000000000011.checkpoint.0000000002.0000000003.parquet"), - ], +fn test_validate_listed_log_file_out_of_order_commit_files() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + 
ascending_commit_files: vec![ + create_log_path("file:///_delta_log/00000000000000000003.json"), + create_log_path("file:///_delta_log/00000000000000000001.json"), + ], + ..Default::default() + }, + log_root, None, - Some(create_log_path("file:///00000000000000000001.json")), - ); + None, + ) + .is_err()); +} + +#[test] +fn test_validate_listed_log_file_checkpoint_parts_contains_non_checkpoint() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![create_log_path( + "file:///_delta_log/00000000000000000010.json", + )], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); +} + +#[test] +fn test_validate_listed_log_file_multipart_checkpoint_part_count_mismatch() { + // Two parts that agree on version but claim num_parts=3 (count mismatch: 2 != 3) + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![ + create_log_path( + "file:///_delta_log/00000000000000000010.checkpoint.0000000001.0000000003.parquet", + ), + create_log_path( + "file:///_delta_log/00000000000000000010.checkpoint.0000000002.0000000003.parquet", + ), + ], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); +} + +#[test] +fn test_validate_listed_log_file_single_multipart_checkpoint_num_parts_mismatch() { + // A single checkpoint file that claims num_parts=2: the count (1) disagrees with num_parts + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![create_log_path( + "file:///_delta_log/00000000000000000010.checkpoint.0000000001.0000000002.parquet", + )], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); +} + +#[test] +fn test_validate_listed_log_file_multiple_single_part_checkpoints() { + // Two SinglePartCheckpoints at the same version: n=2 but neither is a MultiPartCheckpoint + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![ + create_log_path("file:///_delta_log/00000000000000000010.checkpoint.parquet"), + create_log_path("file:///_delta_log/00000000000000000010.checkpoint.parquet"), + ], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); +} + +#[test] +fn test_validate_listed_log_file_commit_files_contains_non_commit() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + ascending_commit_files: vec![create_log_path( + "file:///_delta_log/00000000000000000010.checkpoint.parquet", + )], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); } #[test] -fn commits_since() { +fn test_validate_listed_log_file_compaction_files_contains_non_compaction() { + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + ascending_commit_files: vec![create_log_path( + "file:///_delta_log/00000000000000000002.json", + )], + ascending_compaction_files: vec![create_log_path( + "file:///_delta_log/00000000000000000001.json", + )], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); +} + +#[test] +fn test_validate_listed_log_file_compaction_start_exceeds_end() { + // A compaction file where the start version is greater than the end version + let log_root = Url::parse("file:///_delta_log/").unwrap(); + assert!(LogSegment::try_new( + LogSegmentFiles { + 
ascending_commit_files: vec![create_log_path( + "file:///_delta_log/00000000000000000005.json", + )], + ascending_compaction_files: vec![create_log_path( + "file:///_delta_log/00000000000000000005.00000000000000000002.compacted.json", + )], + ..Default::default() + }, + log_root, + None, + None, + ) + .is_err()); +} + +#[tokio::test] +async fn commits_since() { // simple - let log_segment = create_segment_for( - &Vec::from_iter(0..=4), - &[], - None, // No checkpoint - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &Vec::from_iter(0..=4), + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 4); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 4); // with compaction, no checkpoint - let log_segment = create_segment_for( - &Vec::from_iter(0..=4), - &[(0, 2)], - None, // No checkpoint - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &Vec::from_iter(0..=4), + compaction_versions: &[(0, 2)], + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 4); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 2); // checkpoint, no compaction - let log_segment = create_segment_for( - &Vec::from_iter(0..=6), - &[], - Some(3), // Checkpoint @ 3 - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &Vec::from_iter(0..=6), + checkpoint_version: Some(3), + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 3); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 3); // checkpoint and compaction less than checkpoint - let log_segment = create_segment_for( - &Vec::from_iter(0..=6), - &[(0, 2)], - Some(3), // Checkpoint @ 3 - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &Vec::from_iter(0..=6), + compaction_versions: &[(0, 2)], + checkpoint_version: Some(3), + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 3); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 3); // checkpoint and compaction greater than checkpoint - let log_segment = create_segment_for( - &Vec::from_iter(0..=6), - &[(3, 4)], - Some(2), // Checkpoint @ 2 - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &Vec::from_iter(0..=6), + compaction_versions: &[(3, 4)], + checkpoint_version: Some(2), + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 4); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 2); // multiple compactions - let log_segment = create_segment_for( - &Vec::from_iter(0..=6), - &[(1, 2), (3, 4)], - None, // No Checkpoint - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &Vec::from_iter(0..=6), + compaction_versions: &[(1, 2), (3, 4)], + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 6); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 2); // multiple compactions, out of order - let log_segment = create_segment_for( - &Vec::from_iter(0..=10), - &[(1, 2), (3, 9), (4, 6)], - None, // No Checkpoint - None, // Version to load - ); + let log_segment = create_segment_for(LogSegmentConfig { + 
published_commit_versions: &Vec::from_iter(0..=10), + compaction_versions: &[(1, 2), (3, 9), (4, 6)], + ..Default::default() + }) + .await; assert_eq!(log_segment.commits_since_checkpoint(), 10); assert_eq!(log_segment.commits_since_log_compaction_or_checkpoint(), 1); } -#[test] -fn for_timestamp_conversion_gets_commit_range() { +#[tokio::test] +async fn for_timestamp_conversion_gets_commit_range() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -1907,12 +2287,13 @@ fn for_timestamp_conversion_gets_commit_range() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_timestamp_conversion(storage.as_ref(), log_root.clone(), 7, None).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert!(checkpoint_parts.is_empty()); @@ -1920,8 +2301,8 @@ fn for_timestamp_conversion_gets_commit_range() { assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], versions); } -#[test] -fn for_timestamp_conversion_with_old_end_version() { +#[tokio::test] +async fn for_timestamp_conversion_with_old_end_version() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -1937,12 +2318,13 @@ fn for_timestamp_conversion_with_old_end_version() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_timestamp_conversion(storage.as_ref(), log_root.clone(), 5, None).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert!(checkpoint_parts.is_empty()); @@ -1950,8 +2332,8 @@ fn for_timestamp_conversion_with_old_end_version() { assert_eq!(vec![0, 1, 2, 3, 4, 5], versions); } -#[test] -fn for_timestamp_conversion_only_contiguous_ranges() { +#[tokio::test] +async fn for_timestamp_conversion_only_contiguous_ranges() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -1967,12 +2349,13 @@ fn for_timestamp_conversion_only_contiguous_ranges() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_timestamp_conversion(storage.as_ref(), log_root.clone(), 7, None).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert!(checkpoint_parts.is_empty()); @@ -1980,8 +2363,8 @@ fn for_timestamp_conversion_only_contiguous_ranges() { assert_eq!(vec![5, 6, 7], versions); } -#[test] -fn for_timestamp_conversion_with_limit() { +#[tokio::test] +async fn for_timestamp_conversion_with_limit() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -1997,7 +2380,8 @@ fn for_timestamp_conversion_with_limit() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_timestamp_conversion( storage.as_ref(), @@ -2006,16 +2390,17 @@ fn for_timestamp_conversion_with_limit() { Some(NonZero::new(3).unwrap()), ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = 
log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert!(checkpoint_parts.is_empty()); let versions = commit_files.iter().map(|x| x.version).collect_vec(); assert_eq!(vec![5, 6, 7], versions); } -#[test] -fn for_timestamp_conversion_with_large_limit() { + +#[tokio::test] +async fn for_timestamp_conversion_with_large_limit() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -2031,7 +2416,8 @@ fn for_timestamp_conversion_with_large_limit() { delta_path_for_version(7, "json"), ], None, - ); + ) + .await; let log_segment = LogSegment::for_timestamp_conversion( storage.as_ref(), @@ -2040,8 +2426,8 @@ fn for_timestamp_conversion_with_large_limit() { Some(NonZero::new(20).unwrap()), ) .unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; + let commit_files = log_segment.listed.ascending_commit_files; + let checkpoint_parts = log_segment.listed.checkpoint_parts; assert!(checkpoint_parts.is_empty()); @@ -2049,19 +2435,20 @@ fn for_timestamp_conversion_with_large_limit() { assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], versions); } -#[test] -fn for_timestamp_conversion_no_commit_files() { +#[tokio::test] +async fn for_timestamp_conversion_no_commit_files() { let (storage, log_root) = build_log_with_paths_and_checkpoint( &[delta_path_for_version(5, "checkpoint.parquet")], None, - ); + ) + .await; let res = LogSegment::for_timestamp_conversion(storage.as_ref(), log_root.clone(), 0, None); assert_result_error_with_message(res, "Generic delta kernel error: No files in log segment"); } -#[test] -fn test_latest_commit_file_field_is_captured() { +#[tokio::test] +async fn test_latest_commit_file_field_is_captured() { // Test that the latest commit is preserved even after checkpoint filtering let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ @@ -2074,22 +2461,24 @@ fn test_latest_commit_file_field_is_captured() { delta_path_for_version(5, "json"), ], None, - ); + ) + .await; let log_segment = - LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None).unwrap(); + LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None, None, None) + .unwrap(); // The latest commit should be version 5 - assert_eq!(log_segment.latest_commit_file.unwrap().version, 5); + assert_eq!(log_segment.listed.latest_commit_file.unwrap().version, 5); // The log segment should only contain commits 3, 4, 5 (after checkpoint 2) - assert_eq!(log_segment.ascending_commit_files.len(), 3); - assert_eq!(log_segment.ascending_commit_files[0].version, 3); - assert_eq!(log_segment.ascending_commit_files[2].version, 5); + assert_eq!(log_segment.listed.ascending_commit_files.len(), 3); + assert_eq!(log_segment.listed.ascending_commit_files[0].version, 3); + assert_eq!(log_segment.listed.ascending_commit_files[2].version, 5); } -#[test] -fn test_latest_commit_file_with_checkpoint_filtering() { +#[tokio::test] +async fn test_latest_commit_file_with_checkpoint_filtering() { // Test when commits get filtered by checkpoint let (storage, log_root) = build_log_with_paths_and_checkpoint( &[ @@ -2100,40 +2489,44 @@ fn test_latest_commit_file_with_checkpoint_filtering() { delta_path_for_version(4, "json"), ], None, - ); + ) + .await; let log_segment = - LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None).unwrap(); + LogSegment::for_snapshot(storage.as_ref(), 
log_root.clone(), vec![], None, None, None) + .unwrap(); // The latest commit should be version 4 - assert_eq!(log_segment.latest_commit_file.unwrap().version, 4); + assert_eq!(log_segment.listed.latest_commit_file.unwrap().version, 4); // The log segment should have only commit 4 (after checkpoint 3) - assert_eq!(log_segment.ascending_commit_files.len(), 1); - assert_eq!(log_segment.ascending_commit_files[0].version, 4); + assert_eq!(log_segment.listed.ascending_commit_files.len(), 1); + assert_eq!(log_segment.listed.ascending_commit_files[0].version, 4); } -#[test] -fn test_latest_commit_file_with_no_commits() { +#[tokio::test] +async fn test_latest_commit_file_with_no_commits() { // Test when there are only checkpoints and no commits at all // This should now succeed with latest_commit_file as None let (storage, log_root) = build_log_with_paths_and_checkpoint( &[delta_path_for_version(2, "checkpoint.parquet")], None, - ); + ) + .await; let log_segment = - LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None).unwrap(); + LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None, None, None) + .unwrap(); // latest_commit_file should be None when there are no commits - assert!(log_segment.latest_commit_file.is_none()); + assert!(log_segment.listed.latest_commit_file.is_none()); // The checkpoint should be at version 2 assert_eq!(log_segment.checkpoint_version, Some(2)); } -#[test] -fn test_latest_commit_file_with_checkpoint_at_same_version() { +#[tokio::test] +async fn test_latest_commit_file_with_checkpoint_at_same_version() { // Test when checkpoint is at the same version as the latest commit // This tests: 0.json, 1.json, 1.checkpoint.parquet let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -2143,23 +2536,25 @@ fn test_latest_commit_file_with_checkpoint_at_same_version() { delta_path_for_version(1, "checkpoint.parquet"), ], None, - ); + ) + .await; let log_segment = - LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None).unwrap(); + LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None, None, None) + .unwrap(); // The latest commit should be version 1 (saved before filtering) - assert_eq!(log_segment.latest_commit_file.unwrap().version, 1); + assert_eq!(log_segment.listed.latest_commit_file.unwrap().version, 1); // The log segment should have no commit files (all filtered by checkpoint at version 1) - assert_eq!(log_segment.ascending_commit_files.len(), 0); + assert_eq!(log_segment.listed.ascending_commit_files.len(), 0); // The checkpoint should be at version 1 assert_eq!(log_segment.checkpoint_version, Some(1)); } -#[test] -fn test_latest_commit_file_edge_case_commit_before_checkpoint() { +#[tokio::test] +async fn test_latest_commit_file_edge_case_commit_before_checkpoint() { // Test edge case: 0.json, 1.checkpoint.parquet // The latest_commit_file should NOT be set to version 0 since there's no commit at version 1 let (storage, log_root) = build_log_with_paths_and_checkpoint( @@ -2168,113 +2563,1825 @@ fn test_latest_commit_file_edge_case_commit_before_checkpoint() { delta_path_for_version(1, "checkpoint.parquet"), ], None, - ); + ) + .await; let log_segment = - LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None).unwrap(); + LogSegment::for_snapshot(storage.as_ref(), log_root.clone(), vec![], None, None, None) + .unwrap(); // latest_commit_file should be None since there's no commit at the checkpoint version - assert!(log_segment.latest_commit_file.is_none()); + 
assert!(log_segment.listed.latest_commit_file.is_none()); // The checkpoint should be at version 1 assert_eq!(log_segment.checkpoint_version, Some(1)); // There should be no commits in the log segment (all filtered by checkpoint) - assert_eq!(log_segment.ascending_commit_files.len(), 0); + assert_eq!(log_segment.listed.ascending_commit_files.len(), 0); } #[test] fn test_log_segment_contiguous_commit_files() { - let res = ListedLogFiles::try_new( - vec![ - create_log_path("file:///00000000000000000001.json"), - create_log_path("file:///00000000000000000002.json"), - create_log_path("file:///00000000000000000003.json"), - ], - vec![], - vec![], + let log_root = Url::parse("file:///_delta_log/").unwrap(); + + // contiguous commits are accepted + assert!(LogSegment::try_new( + LogSegmentFiles { + ascending_commit_files: vec![ + create_log_path("file:///_delta_log/00000000000000000001.json"), + create_log_path("file:///_delta_log/00000000000000000002.json"), + create_log_path("file:///_delta_log/00000000000000000003.json"), + ], + ..Default::default() + }, + log_root.clone(), None, - Some(create_log_path("file:///00000000000000000001.json")), - ); - assert!(res.is_ok()); + None, + ) + .is_ok()); - // allow gaps in ListedLogFiles - let listed = ListedLogFiles::try_new( - vec![ - create_log_path("file:///00000000000000000001.json"), - create_log_path("file:///00000000000000000003.json"), - ], - vec![], - vec![], + // gaps are disallowed by LogSegment::try_new + let log_segment = LogSegment::try_new( + LogSegmentFiles { + ascending_commit_files: vec![ + create_log_path("file:///_delta_log/00000000000000000001.json"), + create_log_path("file:///_delta_log/00000000000000000003.json"), + ], + ..Default::default() + }, + log_root, + None, None, - Some(create_log_path("file:///00000000000000000001.json")), ); - - // disallow gaps in LogSegment - let log_segment = LogSegment::try_new(listed.unwrap(), Url::parse("file:///").unwrap(), None); assert_result_error_with_message( log_segment, - "Generic delta kernel error: Expected ordered \ - contiguous commit files [ParsedLogPath { location: FileMeta { location: Url { scheme: \ + "Generic delta kernel error: Expected contiguous commit files, but found gap: \ + ParsedLogPath { location: FileMeta { location: Url { scheme: \ \"file\", cannot_be_a_base: false, username: \"\", password: None, host: None, port: \ - None, path: \"/00000000000000000001.json\", query: None, fragment: None }, last_modified: \ + None, path: \"/_delta_log/00000000000000000001.json\", query: None, fragment: None }, last_modified: \ 0, size: 0 }, filename: \"00000000000000000001.json\", extension: \"json\", version: 1, \ - file_type: Commit }, ParsedLogPath { location: FileMeta { location: Url { scheme: \ + file_type: Commit } -> ParsedLogPath { location: FileMeta { location: Url { scheme: \ \"file\", cannot_be_a_base: false, username: \"\", password: None, host: None, port: \ - None, path: \"/00000000000000000003.json\", query: None, fragment: None }, last_modified: \ + None, path: \"/_delta_log/00000000000000000003.json\", query: None, fragment: None }, last_modified: \ 0, size: 0 }, filename: \"00000000000000000003.json\", extension: \"json\", version: 3, \ - file_type: Commit }]", + file_type: Commit }", ); } -#[test] -fn test_publish_validation() { - use crate::Error; - - // Test with only regular committed files - should pass validation - let regular_commits = vec![ - create_log_path("file:///path/_delta_log/00000000000000000000.json"), - 
create_log_path("file:///path/_delta_log/00000000000000000001.json"), - create_log_path("file:///path/_delta_log/00000000000000000002.json"), - ]; - - let log_segment = LogSegment { - ascending_commit_files: regular_commits, - ascending_compaction_files: vec![], - checkpoint_parts: vec![], - checkpoint_version: None, - log_root: Url::parse("file:///path/").unwrap(), - end_version: 2, - latest_crc_file: None, - latest_commit_file: None, +/// Test that checkpoint_schema from _last_checkpoint hint is properly propagated to LogSegment +#[tokio::test] +async fn test_checkpoint_schema_propagation_from_hint() { + use crate::schema::{StructField, StructType}; + + // Create a sample schema that would be in _last_checkpoint + let sample_schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::nullable("add", StructType::new_unchecked([])), + StructField::nullable("remove", StructType::new_unchecked([])), + ])); + + let checkpoint_metadata = LastCheckpointHint { + version: 5, + size: 10, + parts: Some(1), + size_in_bytes: None, + num_of_add_files: None, + checkpoint_schema: Some(sample_schema.clone()), + checksum: None, + tags: None, }; - assert!(log_segment.validate_no_staged_commits().is_ok()); - - // Test with a staged commit - should fail validation - let with_staged = vec![ - create_log_path("file:///path/_delta_log/00000000000000000000.json"), - create_log_path("file:///path/_delta_log/00000000000000000001.json"), - create_log_path("file:///path/_delta_log/_staged_commits/00000000000000000002.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json"), - ]; - - let log_segment_with_staged = LogSegment { - ascending_commit_files: with_staged, - ascending_compaction_files: vec![], - checkpoint_parts: vec![], - checkpoint_version: None, - log_root: Url::parse("file:///path/").unwrap(), - end_version: 2, - latest_crc_file: None, - latest_commit_file: None, + let (storage, log_root) = build_log_with_paths_and_checkpoint( + &[ + delta_path_for_version(0, "json"), + delta_path_for_version(5, "checkpoint.parquet"), + delta_path_for_version(5, "json"), + delta_path_for_version(6, "json"), + ], + Some(&checkpoint_metadata), + ) + .await; + + let log_segment = LogSegment::for_snapshot_impl( + storage.as_ref(), + log_root, + vec![], // log_tail + Some(checkpoint_metadata), + None, + ) + .unwrap(); + + // Verify checkpoint_schema is propagated + assert!(log_segment.checkpoint_schema.is_some()); + assert_eq!(log_segment.checkpoint_schema.unwrap(), sample_schema); +} + +/// Test get_file_actions_schema_and_sidecars with V1 parquet checkpoint using hint schema +/// This verifies the optimization path where hint schema is used directly (avoiding footer read) +#[tokio::test] +async fn test_get_file_actions_schema_v1_parquet_with_hint() -> DeltaResult<()> { + use crate::schema::{StructField, StructType}; + + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + // Create a V1 checkpoint (without sidecar column) + let v1_schema = get_commit_schema().project(&[ADD_NAME, REMOVE_NAME])?; + add_checkpoint_to_store( + &store, + add_batch_simple(v1_schema.clone()), + "00000000000000000001.checkpoint.parquet", + ) + .await?; + + let checkpoint_file = log_root + .join("00000000000000000001.checkpoint.parquet")? 
+ .to_string(); + + // Create a hint schema without sidecar field (indicates V1) + let hint_schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::nullable("add", StructType::new_unchecked([])), + StructField::nullable("remove", StructType::new_unchecked([])), + ])); + + let log_segment = LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![create_log_path(&checkpoint_file)], + latest_commit_file: Some(create_log_path("file:///00000000000000000002.json")), + ..Default::default() + }, + log_root, + None, + Some(hint_schema.clone()), // V1 hint schema (no sidecar field) + )?; + + // With V1 hint, should use hint schema and avoid footer read + let (schema, sidecars) = log_segment.get_file_actions_schema_and_sidecars(&engine)?; + assert!(schema.is_some(), "Should return hint schema for V1"); + assert_eq!( + schema.unwrap(), + hint_schema, + "Should use hint schema directly" + ); + assert!(sidecars.is_empty(), "V1 checkpoint should have no sidecars"); + + Ok(()) +} + +// Multi-part V1 checkpoint returns file_actions_schema with stats_parsed from hint or footer. +#[rstest] +#[case::with_hint(true)] +#[case::without_hint(false)] +#[tokio::test] +async fn test_get_file_actions_schema_multi_part_v1(#[case] use_hint: bool) -> DeltaResult<()> { + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + let checkpoint_part_1 = "00000000000000000001.checkpoint.0000000001.0000000002.parquet"; + let checkpoint_part_2 = "00000000000000000001.checkpoint.0000000002.0000000002.parquet"; + + // Build a V1 checkpoint schema with stats_parsed containing an integer column. + let stats_parsed = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable( + "minValues", + StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]), + ), + StructField::nullable( + "maxValues", + StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]), + ), + ]); + let add_schema = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("stats_parsed", stats_parsed), + ]); + let remove_schema = + StructType::new_unchecked([StructField::nullable("path", DataType::STRING)]); + let v1_schema = Arc::new(StructType::new_unchecked([ + StructField::nullable(ADD_NAME, add_schema), + StructField::nullable(REMOVE_NAME, remove_schema), + ])); + + add_checkpoint_to_store( + &store, + add_batch_simple(v1_schema.clone()), + checkpoint_part_1, + ) + .await?; + add_checkpoint_to_store( + &store, + add_batch_simple(v1_schema.clone()), + checkpoint_part_2, + ) + .await?; + + let cp1_size = get_file_size(&store, &format!("_delta_log/{checkpoint_part_1}")).await; + let cp2_size = get_file_size(&store, &format!("_delta_log/{checkpoint_part_2}")).await; + + let cp1_file = log_root.join(checkpoint_part_1)?.to_string(); + let cp2_file = log_root.join(checkpoint_part_2)?.to_string(); + + let log_segment = LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![ + create_log_path_with_size(&cp1_file, cp1_size), + create_log_path_with_size(&cp2_file, cp2_size), + ], + latest_commit_file: Some(create_log_path("file:///00000000000000000002.json")), + ..Default::default() + }, + log_root, + None, + use_hint.then(|| v1_schema.clone() as SchemaRef), + )?; + + let (schema, sidecars) = log_segment.get_file_actions_schema_and_sidecars(&engine)?; + let schema = schema.expect("Multi-part V1 should return file actions schema"); + + // Verify 
stats_parsed is detectable in the returned schema. + let add_field = schema.field(ADD_NAME).expect("should have add field"); + let DataType::Struct(add_struct) = add_field.data_type() else { + panic!("add field should be a struct type"); }; + assert!( + add_struct.field("stats_parsed").is_some(), + "Returned schema should include stats_parsed for data skipping" + ); + assert!(sidecars.is_empty(), "Multi-part V1 should have no sidecars"); + + Ok(()) +} + +// ============================================================================ +// max_published_version tests +// ============================================================================ - // Should fail with staged commits - let result = log_segment_with_staged.validate_no_staged_commits(); - assert!(result.is_err()); - if let Err(Error::Generic(msg)) = result { - assert_eq!(msg, "Found staged commit file in log segment"); - } else { - panic!("Expected Error::Generic"); +#[tokio::test] +async fn test_max_published_version_only_published_commits() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2, 3, 4], + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version.unwrap(), 4); +} + +#[tokio::test] +async fn test_max_published_version_checkpoint_followed_by_published_commits() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[5, 6, 7, 8], + checkpoint_version: Some(5), + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version.unwrap(), 8); +} + +#[tokio::test] +async fn test_max_published_version_only_staged_commits() { + let log_segment = create_segment_for(LogSegmentConfig { + staged_commit_versions: &[0, 1, 2, 3, 4], + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version, None); +} + +#[tokio::test] +async fn test_max_published_version_checkpoint_followed_by_staged_commits() { + let log_segment = create_segment_for(LogSegmentConfig { + staged_commit_versions: &[5, 6, 7, 8], + checkpoint_version: Some(5), + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version, None); +} + +#[tokio::test] +async fn test_max_published_version_published_and_staged_commits_no_overlap() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2], + staged_commit_versions: &[3, 4], + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version.unwrap(), 2); +} + +#[tokio::test] +async fn test_max_published_version_checkpoint_followed_by_published_and_staged_commits_no_overlap() +{ + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[5, 6, 7], + staged_commit_versions: &[8, 9, 10], + checkpoint_version: Some(5), + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version.unwrap(), 7); +} + +#[tokio::test] +async fn test_max_published_version_published_and_staged_commits_with_overlap() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2], + staged_commit_versions: &[2, 3, 4], + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version.unwrap(), 2); +} + +#[tokio::test] +async fn test_max_published_version_checkpoint_followed_by_published_and_staged_commits_with_overlap( +) { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[5, 6, 7, 8, 9], + staged_commit_versions: &[7, 8, 9, 10], 
+ checkpoint_version: Some(5), + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version.unwrap(), 9); +} + +#[tokio::test] +async fn test_max_published_version_checkpoint_only() { + let log_segment = create_segment_for(LogSegmentConfig { + checkpoint_version: Some(5), + ..Default::default() + }) + .await; + assert_eq!(log_segment.listed.max_published_version, None); +} + +// ============================================================================ +// schema_has_compatible_stats_parsed tests +// ============================================================================ + +// Helper to create a checkpoint schema with stats_parsed for testing +fn create_checkpoint_schema_with_stats_parsed(min_values_fields: Vec<StructField>) -> StructType { + let stats_parsed = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable( + "minValues", + StructType::new_unchecked(min_values_fields.clone()), + ), + StructField::nullable("maxValues", StructType::new_unchecked(min_values_fields)), + ]); + + let add_schema = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("stats_parsed", stats_parsed), + ]); + + StructType::new_unchecked([StructField::nullable("add", add_schema)]) +} + +// Helper to create a stats_schema with proper structure (numRecords, minValues, maxValues) +fn create_stats_schema(column_fields: Vec<StructField>) -> StructType { + StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable( + "minValues", + StructType::new_unchecked(column_fields.clone()), + ), + StructField::nullable("maxValues", StructType::new_unchecked(column_fields)), + ]) +} + +// Helper to create a checkpoint schema without stats_parsed +fn create_checkpoint_schema_without_stats_parsed() -> StructType { + use crate::schema::StructType; + + let add_schema = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("stats", DataType::STRING), + ]); + + StructType::new_unchecked([StructField::nullable("add", add_schema)]) +} + +#[test] +fn test_schema_has_compatible_stats_parsed_basic() { + // Create a checkpoint schema with stats_parsed containing an integer column + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "id", + DataType::INTEGER, + )]); + + // Exact type match should work + let stats_schema = create_stats_schema(vec![StructField::nullable("id", DataType::INTEGER)]); + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); + + // Type widening (int -> long) should work + let stats_schema_widened = + create_stats_schema(vec![StructField::nullable("id", DataType::LONG)]); + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema_widened + )); + + // Incompatible type (string -> int) should fail + let checkpoint_schema_string = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "id", + DataType::STRING, + )]); + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema_string, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_missing_column_ok() { + // Checkpoint has "id" column, stats schema needs "other" column + // Missing column is acceptable - it will return null when accessed + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "id", +
DataType::INTEGER, + )]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("other", DataType::INTEGER)]); + + // Missing column in checkpoint is OK - it will return null when accessed, + // which is acceptable for data skipping (just means we can't skip based on that column) + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_extra_column_ok() { + // Checkpoint has extra columns not needed by stats schema (should be OK) + let checkpoint_schema = create_checkpoint_schema_with_stats_parsed(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("extra", DataType::STRING), + ]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("id", DataType::INTEGER)]); + + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_no_stats_parsed() { + // Checkpoint schema without stats_parsed field + let checkpoint_schema = create_checkpoint_schema_without_stats_parsed(); + + let stats_schema = create_stats_schema(vec![StructField::nullable("id", DataType::INTEGER)]); + + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_empty_stats_schema() { + // Empty stats schema (no columns needed for data skipping) + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "id", + DataType::INTEGER, + )]); + + let stats_schema = create_stats_schema(vec![]); + + // If no columns are needed for data skipping, any stats_parsed is compatible + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_multiple_columns() { + // Multiple columns - check that we iterate over all columns and find incompatibility + let checkpoint_schema = create_checkpoint_schema_with_stats_parsed(vec![ + StructField::nullable("good_col", DataType::LONG), + StructField::nullable("bad_col", DataType::STRING), + ]); + + // First column matches, second is incompatible + let stats_schema = create_stats_schema(vec![ + StructField::nullable("good_col", DataType::LONG), + StructField::nullable("bad_col", DataType::INTEGER), + ]); + + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_missing_min_max_values() { + // stats_parsed exists but has no minValues/maxValues fields - unusual but valid (continue case) + let stats_parsed = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + // No minValues or maxValues fields + ]); + + let add_schema = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("stats_parsed", stats_parsed), + ]); + + let checkpoint_schema = StructType::new_unchecked([StructField::nullable("add", add_schema)]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("id", DataType::INTEGER)]); + + // Should return true - missing minValues/maxValues is handled gracefully with continue + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_min_values_not_struct() { + // minValues/maxValues exist but are not Struct types 
- malformed schema (return false case) + let stats_parsed = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + // minValues is a primitive type instead of a Struct + StructField::nullable("minValues", DataType::STRING), + StructField::nullable("maxValues", DataType::STRING), + ]); + + let add_schema = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("stats_parsed", stats_parsed), + ]); + + let checkpoint_schema = StructType::new_unchecked([StructField::nullable("add", add_schema)]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("id", DataType::INTEGER)]); + + // Should return false - minValues/maxValues must be Struct types + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_nested_struct() { + // Create a nested struct: user: { name: string, age: integer } + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "user", + user_struct.clone(), + )]); + + // Exact match should work + let stats_schema = create_stats_schema(vec![StructField::nullable("user", user_struct)]); + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_nested_struct_with_extra_fields() { + // Checkpoint has extra nested fields not needed by stats schema + let checkpoint_user = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + StructField::nullable("extra", DataType::STRING), // extra field + ]); + + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "user", + checkpoint_user, + )]); + + // Stats schema only needs a subset of fields + let stats_user = StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("user", stats_user)]); + + // Extra fields in checkpoint nested struct should be OK + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_nested_struct_missing_field_ok() { + // Checkpoint is missing a nested field that stats schema needs + let checkpoint_user = + StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "user", + checkpoint_user, + )]); + + // Stats schema needs more fields than checkpoint has + let stats_user = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), // missing in checkpoint + ]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("user", stats_user)]); + + // Missing nested field is OK - it will return null when accessed + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_nested_struct_type_mismatch() { + // Checkpoint has incompatible type in nested field + let checkpoint_user = StructType::new_unchecked([ + 
StructField::nullable("name", DataType::INTEGER), // wrong type! + ]); + + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "user", + checkpoint_user, + )]); + + let stats_user = StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("user", stats_user)]); + + // Type mismatch in nested field should fail + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_deeply_nested() { + // Deeply nested: company: { department: { team: { name: string } } } + let team = StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let department = StructType::new_unchecked([StructField::nullable("team", team.clone())]); + let company = StructType::new_unchecked([StructField::nullable("department", department)]); + + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "company", + company.clone(), + )]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("company", company)]); + + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_deeply_nested_type_mismatch() { + // Type mismatch deep in nested structure + let checkpoint_team = + StructType::new_unchecked([StructField::nullable("name", DataType::INTEGER)]); // wrong! + let checkpoint_dept = + StructType::new_unchecked([StructField::nullable("team", checkpoint_team)]); + let checkpoint_company = + StructType::new_unchecked([StructField::nullable("department", checkpoint_dept)]); + + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "company", + checkpoint_company, + )]); + + let stats_team = StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let stats_dept = StructType::new_unchecked([StructField::nullable("team", stats_team)]); + let stats_company = + StructType::new_unchecked([StructField::nullable("department", stats_dept)]); + + let stats_schema = create_stats_schema(vec![StructField::nullable("company", stats_company)]); + + // Type mismatch deep in hierarchy should be detected + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_long_to_timestamp() { + // Checkpoint stores timestamp stats as Int64 (no logical type annotation) + let checkpoint_schema = create_checkpoint_schema_with_stats_parsed(vec![ + StructField::nullable("ts_col", DataType::LONG), + StructField::nullable("ts_ntz_col", DataType::LONG), + ]); + + // Stats schema expects Timestamp and TimestampNtz types + let stats_schema = create_stats_schema(vec![ + StructField::nullable("ts_col", DataType::TIMESTAMP), + StructField::nullable("ts_ntz_col", DataType::TIMESTAMP_NTZ), + ]); + + // Long -> Timestamp/TimestampNtz reinterpretation should be accepted + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_timestamp_to_long_rejected() { + // Checkpoint has Timestamp-typed stats + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "ts_col", + DataType::TIMESTAMP, + )]); + + // Stats schema expects Long -- narrowing should 
be rejected + let stats_schema = create_stats_schema(vec![StructField::nullable("ts_col", DataType::LONG)]); + + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_integer_to_date() { + // Checkpoint stores date stats as Int32 (no DATE logical annotation) + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "date_col", + DataType::INTEGER, + )]); + + // Stats schema expects Date type + let stats_schema = create_stats_schema(vec![StructField::nullable("date_col", DataType::DATE)]); + + // Integer -> Date reinterpretation should be accepted + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_schema_has_compatible_stats_parsed_date_to_integer_rejected() { + // Checkpoint has Date-typed stats + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "date_col", + DataType::DATE, + )]); + + // Stats schema expects Integer -- narrowing should be rejected + let stats_schema = + create_stats_schema(vec![StructField::nullable("date_col", DataType::INTEGER)]); + + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +// Type widening + checkpoint reinterpretation interaction scenarios. +// Verifies that schema evolution doesn't create false-positive type matches. +#[rstest] +// Standard widening: Integer -> Long in old checkpoint after column was widened +#[case::widening_integer_to_long(DataType::INTEGER, DataType::LONG, true)] +// Checkpoint reinterpretation: Int32 without DATE annotation -> Date +#[case::reinterpret_integer_to_date(DataType::INTEGER, DataType::DATE, true)] +// Checkpoint reinterpretation: Int64 without TIMESTAMP annotation -> Timestamp +#[case::reinterpret_long_to_timestamp(DataType::LONG, DataType::TIMESTAMP, true)] +// Compound: checkpoint dropped Date annotation (Int32) + column widened to Timestamp. +// Integer -> Timestamp is neither a widening nor reinterpretation rule. +#[case::reinterpret_plus_widen_integer_to_timestamp(DataType::INTEGER, DataType::TIMESTAMP, false)] +#[case::reinterpret_plus_widen_integer_to_timestamp_ntz( + DataType::INTEGER, + DataType::TIMESTAMP_NTZ, + false +)] +// Date -> Timestamp is a valid Delta type widening rule, but kernel's can_widen_to does not +// currently support it. This test documents the current behavior. +#[case::date_widened_to_timestamp(DataType::DATE, DataType::TIMESTAMP, false)] +fn test_stats_parsed_widening_and_reinterpretation_interaction( + #[case] checkpoint_type: DataType, + #[case] stats_type: DataType, + #[case] expected: bool, +) { + let checkpoint_schema = + create_checkpoint_schema_with_stats_parsed(vec![StructField::nullable( + "col", + checkpoint_type, + )]); + let stats_schema = create_stats_schema(vec![StructField::nullable("col", stats_type)]); + + assert_eq!( + LogSegment::schema_has_compatible_stats_parsed(&checkpoint_schema, &stats_schema), + expected + ); +} + +#[test] +fn test_stats_parsed_mixed_widening_and_reinterpretation() { + // Multiple columns with different compatibility paths should all pass. 
+ let checkpoint_schema = create_checkpoint_schema_with_stats_parsed(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("ts_col", DataType::LONG), + StructField::nullable("date_col", DataType::INTEGER), + ]); + let stats_schema = create_stats_schema(vec![ + StructField::nullable("id", DataType::LONG), + StructField::nullable("ts_col", DataType::TIMESTAMP), + StructField::nullable("date_col", DataType::DATE), + ]); + + assert!(LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +#[test] +fn test_stats_parsed_mixed_with_one_incompatible_rejects_all() { + // One incompatible column (Integer -> Timestamp) rejects the whole schema. + let checkpoint_schema = create_checkpoint_schema_with_stats_parsed(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("ts_col", DataType::LONG), + StructField::nullable("bad_col", DataType::INTEGER), + ]); + let stats_schema = create_stats_schema(vec![ + StructField::nullable("id", DataType::LONG), + StructField::nullable("ts_col", DataType::TIMESTAMP), + StructField::nullable("bad_col", DataType::TIMESTAMP), + ]); + + assert!(!LogSegment::schema_has_compatible_stats_parsed( + &checkpoint_schema, + &stats_schema + )); +} + +// ============================================================================ +// create_checkpoint_stream: partitionValues_parsed schema augmentation tests +// ============================================================================ + +/// Creates a checkpoint batch with `add.partitionValues_parsed` in the parquet schema. +fn add_batch_with_partition_values_parsed(output_schema: SchemaRef) -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000.parquet","partitionValues":{"id":"1"},"partitionValues_parsed":{"id":1},"size":635,"modificationTime":1677811178336,"dataChange":true}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["id"],"configuration":{},"createdTime":1677811175819}}"#, + ] + .into(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +#[tokio::test] +async fn test_checkpoint_stream_sets_has_partition_values_parsed() -> DeltaResult<()> { + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + // Build a schema that includes add.partitionValues_parsed.id: integer + let partition_parsed_struct = + StructType::new_unchecked([StructField::nullable("id", DataType::INTEGER)]); + let add_struct = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable( + "partitionValues", + crate::schema::MapType::new(DataType::STRING, DataType::STRING, true), + ), + StructField::nullable("partitionValues_parsed", partition_parsed_struct), + StructField::nullable("size", DataType::LONG), + StructField::nullable("modificationTime", DataType::LONG), + StructField::nullable("dataChange", DataType::BOOLEAN), + ]); + let metadata_struct = StructType::new_unchecked([ + StructField::nullable("id", DataType::STRING), + StructField::nullable( + "format", + StructType::new_unchecked([StructField::nullable("provider", DataType::STRING)]), + ), + StructField::nullable("schemaString", 
DataType::STRING), + StructField::nullable( + "partitionColumns", + crate::schema::ArrayType::new(DataType::STRING, false), + ), + StructField::nullable( + "configuration", + crate::schema::MapType::new(DataType::STRING, DataType::STRING, true), + ), + StructField::nullable("createdTime", DataType::LONG), + ]); + let checkpoint_schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::nullable("add", add_struct), + StructField::nullable("metaData", metadata_struct), + ])); + + add_checkpoint_to_store( + &store, + add_batch_with_partition_values_parsed(checkpoint_schema), + "00000000000000000001.checkpoint.parquet", + ) + .await?; + + let checkpoint_file = log_root + .join("00000000000000000001.checkpoint.parquet")? + .to_string(); + let checkpoint_size = + get_file_size(&store, "_delta_log/00000000000000000001.checkpoint.parquet").await; + + // Use a read schema that includes the add field + let read_schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::nullable( + "add", + StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable( + "partitionValues", + crate::schema::MapType::new(DataType::STRING, DataType::STRING, true), + ), + StructField::nullable("size", DataType::LONG), + StructField::nullable("modificationTime", DataType::LONG), + StructField::nullable("dataChange", DataType::BOOLEAN), + ]), + )])); + + let log_segment = LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![create_log_path_with_size(&checkpoint_file, checkpoint_size)], + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, + log_root, + None, + None, + )?; + + // Pass a partition schema to trigger partitionValues_parsed detection + let partition_schema = + StructType::new_unchecked([StructField::nullable("id", DataType::INTEGER)]); + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + read_schema, + None, // meta_predicate + None, // stats_schema + Some(&partition_schema), + )?; + + // Verify that checkpoint_info reports partitionValues_parsed as available + assert!( + checkpoint_result + .checkpoint_info + .has_partition_values_parsed, + "Expected has_partition_values_parsed to be true" + ); + + // Verify that partitionValues_parsed was added to the checkpoint read schema + let schema = &checkpoint_result.checkpoint_info.checkpoint_read_schema; + let add_field = schema.field("add").expect("schema should have 'add' field"); + let DataType::Struct(add_struct) = add_field.data_type() else { + panic!("add field should be a struct"); + }; + assert!( + add_struct.field("partitionValues_parsed").is_some(), + "checkpoint read schema should include add.partitionValues_parsed" + ); + + Ok(()) +} + +#[tokio::test] +async fn test_checkpoint_stream_no_partition_values_parsed_when_incompatible() -> DeltaResult<()> { + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + // Write a checkpoint WITHOUT partitionValues_parsed + add_checkpoint_to_store( + &store, + add_batch_simple(get_all_actions_schema().project(&[ADD_NAME])?), + "00000000000000000001.checkpoint.parquet", + ) + .await?; + + let checkpoint_file = log_root + .join("00000000000000000001.checkpoint.parquet")? 
+ .to_string(); + let checkpoint_size = + get_file_size(&store, "_delta_log/00000000000000000001.checkpoint.parquet").await; + + let read_schema = get_all_actions_schema().project(&[ADD_NAME])?; + + let log_segment = LogSegment::try_new( + LogSegmentFiles { + checkpoint_parts: vec![create_log_path_with_size(&checkpoint_file, checkpoint_size)], + latest_commit_file: Some(create_log_path("file:///00000000000000000001.json")), + ..Default::default() + }, + log_root, + None, + None, + )?; + + // Pass a partition schema — but the checkpoint doesn't have partitionValues_parsed + let partition_schema = + StructType::new_unchecked([StructField::nullable("id", DataType::INTEGER)]); + let checkpoint_result = log_segment.create_checkpoint_stream( + &engine, + read_schema.clone(), + None, + None, + Some(&partition_schema), + )?; + + // Verify it's false + assert!( + !checkpoint_result + .checkpoint_info + .has_partition_values_parsed, + "Expected has_partition_values_parsed to be false" + ); + + // Verify partitionValues_parsed was NOT added to the schema + let schema = &checkpoint_result.checkpoint_info.checkpoint_read_schema; + if let Some(add_field) = schema.field("add") { + let DataType::Struct(add_struct) = add_field.data_type() else { + panic!("add field should be a struct"); + }; + assert!( + add_struct.field("partitionValues_parsed").is_none(), + "checkpoint read schema should NOT include add.partitionValues_parsed" + ); + } + + Ok(()) +} + +// ============================================================================ +// schema_has_compatible_partition_values_parsed tests +// ============================================================================ + +/// Helper to create a checkpoint schema with `add.partitionValues_parsed` for testing. +fn create_checkpoint_schema_with_partition_parsed( + partition_fields: Vec<StructField>, +) -> StructType { + let partition_parsed = StructType::new_unchecked(partition_fields); + let add_struct = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("partitionValues_parsed", partition_parsed), + ]); + StructType::new_unchecked([StructField::nullable("add", add_struct)]) +} + +/// Helper to create a checkpoint schema without `partitionValues_parsed`. +fn create_checkpoint_schema_without_partition_parsed() -> StructType { + let add_struct = StructType::new_unchecked([StructField::nullable("path", DataType::STRING)]); + StructType::new_unchecked([StructField::nullable("add", add_struct)]) +} + +#[test] +fn test_partition_values_parsed_compatible_basic() { + let checkpoint_schema = create_checkpoint_schema_with_partition_parsed(vec![ + StructField::nullable("date", DataType::DATE), + StructField::nullable("region", DataType::STRING), + ]); + let partition_schema = StructType::new_unchecked([ + StructField::nullable("date", DataType::DATE), + StructField::nullable("region", DataType::STRING), + ]); + assert!(LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +#[test] +fn test_partition_values_parsed_missing_field() { + let checkpoint_schema = + create_checkpoint_schema_with_partition_parsed(vec![StructField::nullable( + "date", + DataType::DATE, + )]); + // Partition schema expects both date and region, but checkpoint only has date. + // Missing fields are OK — they just won't contribute to row group skipping.
+ let partition_schema = StructType::new_unchecked([ + StructField::nullable("date", DataType::DATE), + StructField::nullable("region", DataType::STRING), + ]); + assert!(LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +#[test] +fn test_partition_values_parsed_extra_field() { + // Checkpoint has extra fields beyond what partition schema needs — fine + let checkpoint_schema = create_checkpoint_schema_with_partition_parsed(vec![ + StructField::nullable("date", DataType::DATE), + StructField::nullable("region", DataType::STRING), + StructField::nullable("extra", DataType::INTEGER), + ]); + let partition_schema = + StructType::new_unchecked([StructField::nullable("date", DataType::DATE)]); + assert!(LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +#[test] +fn test_partition_values_parsed_type_mismatch() { + let checkpoint_schema = + create_checkpoint_schema_with_partition_parsed(vec![StructField::nullable( + "date", + DataType::STRING, + )]); + let partition_schema = + StructType::new_unchecked([StructField::nullable("date", DataType::DATE)]); + assert!(!LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +#[test] +fn test_partition_values_parsed_not_present() { + let checkpoint_schema = create_checkpoint_schema_without_partition_parsed(); + let partition_schema = + StructType::new_unchecked([StructField::nullable("date", DataType::DATE)]); + assert!(!LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +#[test] +fn test_partition_values_parsed_not_a_struct() { + // partitionValues_parsed is a string instead of a struct + let add_struct = StructType::new_unchecked([ + StructField::nullable("path", DataType::STRING), + StructField::nullable("partitionValues_parsed", DataType::STRING), + ]); + let checkpoint_schema = StructType::new_unchecked([StructField::nullable("add", add_struct)]); + let partition_schema = + StructType::new_unchecked([StructField::nullable("date", DataType::DATE)]); + assert!(!LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +#[test] +fn test_partition_values_parsed_empty_partition_schema() { + let checkpoint_schema = + create_checkpoint_schema_with_partition_parsed(vec![StructField::nullable( + "date", + DataType::DATE, + )]); + // Empty partition schema — any partitionValues_parsed is compatible + let partition_schema = StructType::new_unchecked(Vec::<StructField>::new()); + assert!(LogSegment::schema_has_compatible_partition_values_parsed( + &checkpoint_schema, + &partition_schema, + )); +} + +// ============================================================================ +// new_with_commit tests +// ============================================================================ + +/// Asserts that `new` is `orig` extended with exactly one commit via `LogSegment::new_with_commit_appended`.
+fn assert_log_segment_extended(orig: LogSegment, new: LogSegment) { + // Check: What should have changed + assert_eq!(orig.end_version + 1, new.end_version); + assert_eq!( + orig.listed.ascending_commit_files.len() + 1, + new.listed.ascending_commit_files.len() + ); + assert_eq!( + orig.listed.latest_commit_file.as_ref().unwrap().version + 1, + new.listed.latest_commit_file.as_ref().unwrap().version + ); + + // Check: What should be the same + fn normalize(log_segment: LogSegment) -> LogSegment { + use crate::log_segment_files::LogSegmentFiles; + LogSegment { + end_version: 0, + listed: LogSegmentFiles { + max_published_version: None, + ascending_commit_files: vec![], + latest_commit_file: None, + ..log_segment.listed + }, + ..log_segment + } + } + + assert_eq!(normalize(orig), normalize(new)); +} + +#[tokio::test] +async fn test_new_with_commit_published_commit() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2, 3, 4], + ..Default::default() + }) + .await; + let table_root = Url::parse("memory:///").unwrap(); + let new_commit = ParsedLogPath::create_parsed_published_commit(&table_root, 5); + + let new_log_segment = log_segment + .clone() + .new_with_commit_appended(new_commit) + .unwrap(); + + assert_eq!(new_log_segment.listed.max_published_version, Some(5)); + assert_log_segment_extended(log_segment, new_log_segment); +} + +#[tokio::test] +async fn test_new_with_commit_staged_commit() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2, 3, 4], + ..Default::default() + }) + .await; + let table_root = Url::parse("memory:///").unwrap(); + let new_commit = ParsedLogPath::create_parsed_staged_commit(&table_root, 5); + + let new_log_segment = log_segment + .clone() + .new_with_commit_appended(new_commit) + .unwrap(); + + assert_eq!(new_log_segment.listed.max_published_version, Some(4)); + assert_log_segment_extended(log_segment, new_log_segment); +} + +#[tokio::test] +async fn test_new_with_commit_not_commit_type() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2, 3, 4], + ..Default::default() + }) + .await; + let checkpoint = create_log_path("file:///_delta_log/00000000000000000005.checkpoint.parquet"); + + let result = log_segment.new_with_commit_appended(checkpoint); + + assert_result_error_with_message( + result, + "Cannot extend and create new LogSegment. Tail log file is not a commit file.", + ); +} + +#[tokio::test] +async fn test_new_with_commit_not_end_version_plus_one() { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2, 3, 4], + ..Default::default() + }) + .await; + let table_root = Url::parse("memory:///").unwrap(); + + let wrong_version_commit = ParsedLogPath::create_parsed_published_commit(&table_root, 10); + let result = log_segment.new_with_commit_appended(wrong_version_commit); + + assert_result_error_with_message( + result, + "Cannot extend and create new LogSegment. Tail commit file version (10) does not equal LogSegment end_version (4) + 1." 
+ ); +} + +// ============================================================================ +// try_new_with_checkpoint tests +// ============================================================================ + +#[rstest] +#[case::non_checkpoint_file( + "file:///_delta_log/00000000000000000002.json", + "Path is not a single-file checkpoint" +)] +#[case::multi_part_checkpoint( + "file:///_delta_log/00000000000000000002.checkpoint.0000000001.0000000002.parquet", + "Path is not a single-file checkpoint" +)] +#[case::wrong_version( + "file:///_delta_log/00000000000000000005.checkpoint.parquet", + "Checkpoint version (5) does not equal LogSegment end_version (2)" +)] +#[tokio::test] +async fn test_try_new_with_checkpoint_rejects_invalid_path( + #[case] path: &str, + #[case] expected_error: &str, +) { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2], + ..Default::default() + }) + .await; + let result = log_segment.try_new_with_checkpoint(create_log_path(path)); + assert_result_error_with_message(result, expected_error); +} + +#[rstest] +#[case::classic_parquet("file:///_delta_log/00000000000000000002.checkpoint.parquet")] +#[case::v2_uuid( + "file:///_delta_log/00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet" +)] +#[tokio::test] +async fn test_try_new_with_checkpoint_sets_checkpoint_and_clears_commits(#[case] path: &str) { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2], + compaction_versions: &[(0, 2)], + ..Default::default() + }) + .await; + assert!(!log_segment.listed.ascending_commit_files.is_empty()); + assert!(!log_segment.listed.ascending_compaction_files.is_empty()); + + let ckpt_path = create_log_path(path); + let result = log_segment.try_new_with_checkpoint(ckpt_path).unwrap(); + + assert_eq!(result.checkpoint_version, Some(2)); + assert_eq!(result.listed.checkpoint_parts.len(), 1); + assert_eq!(result.listed.checkpoint_parts[0].version, 2); + assert!(result.listed.ascending_commit_files.is_empty()); + assert!(result.listed.ascending_compaction_files.is_empty()); + assert!(result.checkpoint_schema.is_none()); + + // latest_commit_file is preserved for ICT access even though commits are cleared + assert_eq!( + result.listed.latest_commit_file.as_ref().map(|f| f.version), + log_segment + .listed + .latest_commit_file + .as_ref() + .map(|f| f.version) + ); + + // Structural fields are preserved + assert_eq!(result.end_version, log_segment.end_version); + assert_eq!(result.log_root, log_segment.log_root); +} + +// ============================================================================ +// try_new_with_crc_file tests +// ============================================================================ + +#[rstest] +#[case::non_crc_file( + "file:///_delta_log/00000000000000000002.json", + "Path is not a CRC file" +)] +#[case::wrong_version( + "file:///_delta_log/00000000000000000005.crc", + "CRC version (5) does not equal LogSegment end_version (2)" +)] +#[tokio::test] +async fn test_try_new_with_crc_file_rejects_invalid_path( + #[case] path: &str, + #[case] expected_error: &str, +) { + let log_segment = create_segment_for(LogSegmentConfig { + published_commit_versions: &[0, 1, 2], + ..Default::default() + }) + .await; + let url = Url::parse(path).unwrap(); + let crc_path = ParsedLogPath::try_from(url).unwrap().unwrap(); + let result = log_segment.try_new_with_crc_file(crc_path); + assert_result_error_with_message(result, expected_error); +} + +#[tokio::test] +async fn 
test_try_new_with_crc_file_sets_crc_and_preserves_other_fields() {
+    let log_segment = create_segment_for(LogSegmentConfig {
+        published_commit_versions: &[0, 1, 2],
+        checkpoint_version: Some(1),
+        ..Default::default()
+    })
+    .await;
+    let url = Url::parse("file:///_delta_log/00000000000000000002.crc").unwrap();
+    let crc_path = ParsedLogPath::try_from(url).unwrap().unwrap();
+    let result = log_segment.try_new_with_crc_file(crc_path).unwrap();
+
+    let crc_file = result.listed.latest_crc_file.as_ref().unwrap();
+    assert_eq!(crc_file.version, 2);
+
+    // Everything else is preserved
+    assert_eq!(result.end_version, log_segment.end_version);
+    assert_eq!(result.checkpoint_version, log_segment.checkpoint_version);
+    assert_eq!(
+        result.listed.ascending_commit_files.len(),
+        log_segment.listed.ascending_commit_files.len()
+    );
+    assert_eq!(
+        result.listed.checkpoint_parts.len(),
+        log_segment.listed.checkpoint_parts.len()
+    );
+    assert_eq!(result.log_root, log_segment.log_root);
+}
+
+// ============================================================================
+// get_unpublished_catalog_commits tests
+// ============================================================================
+
+#[tokio::test]
+async fn test_get_unpublished_catalog_commits() {
+    let log_segment = create_segment_for(LogSegmentConfig {
+        published_commit_versions: &[0, 1, 2],
+        staged_commit_versions: &[2, 3, 4],
+        ..Default::default()
+    })
+    .await;
+
+    assert_eq!(log_segment.listed.max_published_version, Some(2));
+    let unpublished = log_segment.get_unpublished_catalog_commits().unwrap();
+    let versions: Vec<_> = unpublished.iter().map(|c| c.version()).collect();
+    assert_eq!(versions, vec![3, 4]);
+}
+
+// ============================================================================
+// Tests: segment_after_crc / segment_through_crc
+// ============================================================================
+
+fn extract_commit_versions(seg: &LogSegment) -> Vec<u64> {
+    seg.listed
+        .ascending_commit_files
+        .iter()
+        .map(|c| c.version)
+        .collect()
+}
+
+fn extract_compaction_ranges(seg: &LogSegment) -> Vec<(u64, u64)> {
+    seg.listed
+        .ascending_compaction_files
+        .iter()
+        .map(|c| match c.file_type {
+            LogPathFileType::CompactedCommit { hi } => (c.version, hi),
+            _ => panic!("expected compaction"),
+        })
+        .collect()
+}
+
+struct CrcPruningCase {
+    commits: &'static [u64],
+    compactions: &'static [(u64, u64)],
+    checkpoint: Option<u64>,
+    crc_version: u64,
+    after_commits: &'static [u64],
+    after_compactions: &'static [(u64, u64)],
+    through_commits: &'static [u64],
+    through_compactions: &'static [(u64, u64)],
+}
+
+#[rstest::rstest]
+// 0 1 2 3 4 5 6 7 8 9
+// commits: x x x x x x x x x x
+// crc: |
+// after commits: x x x x x
+// through commits: x x x x x
+#[case::only_deltas_no_checkpoint(CrcPruningCase {
+    commits: &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+    compactions: &[],
+    checkpoint: None,
+    crc_version: 4,
+    after_commits: &[5, 6, 7, 8, 9],
+    after_compactions: &[],
+    through_commits: &[0, 1, 2, 3, 4],
+    through_compactions: &[],
+})]
+// 0 1 2 3 4 5 6 7 8 9
+// checkpoint: |
+// commits: x x x x x x x
+// crc: |
+// after commits: x x x x x
+// through commits: x x
+#[case::only_deltas_with_checkpoint(CrcPruningCase {
+    commits: &[3, 4, 5, 6, 7, 8, 9],
+    compactions: &[],
+    checkpoint: Some(2),
+    crc_version: 4,
+    after_commits: &[5, 6, 7, 8, 9],
+    after_compactions: &[],
+    through_commits: &[3, 4],
+    through_compactions: &[],
+})]
+// 0 1 2 3 4 5 6 7 8 9
+// commits: x x x x x x x x x x
+// compactions: [-----]
+// crc: | +// after commits: x x x x x +// after compactions: [-----] +// through commits: x x x x x +#[case::compaction_after_crc(CrcPruningCase { + commits: &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + compactions: &[(5, 7)], + checkpoint: None, + crc_version: 4, + after_commits: &[5, 6, 7, 8, 9], + after_compactions: &[(5, 7)], + through_commits: &[0, 1, 2, 3, 4], + through_compactions: &[], +})] +// 0 1 2 3 4 5 6 7 8 9 +// commits: x x x x x x x x x x +// compactions: [-----------] +// crc: | +// after commits: x x x x x +// through commits: x x x x x +// through compactions: +#[case::compaction_overlaps_crc(CrcPruningCase { + commits: &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + compactions: &[(2, 6)], + checkpoint: None, + crc_version: 4, + after_commits: &[5, 6, 7, 8, 9], + after_compactions: &[], + through_commits: &[0, 1, 2, 3, 4], + through_compactions: &[], +})] +// 0 1 2 3 4 5 6 7 8 9 +// commits: x x x x x x x x x x +// compactions: [-----] +// crc: | +// after commits: x x x x x +// through commits: x x x x x +// through compactions: [-----] +#[case::compaction_before_crc(CrcPruningCase { + commits: &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + compactions: &[(0, 2)], + checkpoint: None, + crc_version: 4, + after_commits: &[5, 6, 7, 8, 9], + after_compactions: &[], + through_commits: &[0, 1, 2, 3, 4], + through_compactions: &[(0, 2)], +})] +#[tokio::test] +async fn test_segment_crc_filtering(#[case] case: CrcPruningCase) { + let seg = create_segment_for(LogSegmentConfig { + published_commit_versions: case.commits, + compaction_versions: case.compactions, + checkpoint_version: case.checkpoint, + ..Default::default() + }) + .await; + + let after = seg.segment_after_crc(case.crc_version); + assert_eq!(extract_commit_versions(&after), case.after_commits); + assert_eq!(extract_compaction_ranges(&after), case.after_compactions); + assert!(after.checkpoint_version.is_none()); + assert!(after.listed.checkpoint_parts.is_empty()); + + let through = seg.segment_through_crc(case.crc_version); + assert_eq!(extract_commit_versions(&through), case.through_commits); + assert_eq!( + extract_compaction_ranges(&through), + case.through_compactions + ); + assert_eq!(through.checkpoint_version, case.checkpoint); +} + +#[rstest::rstest] +#[case::empty_schema(StructType::new_unchecked([]), None)] +#[case::metadata_field( + StructType::new_unchecked([StructField::nullable( + METADATA_NAME, + StructType::new_unchecked([]), + )]), + Some(Arc::new( + Expression::column(ColumnName::new([METADATA_NAME, "id"])).is_not_null(), + )), +)] +#[case::protocol_field( + StructType::new_unchecked([StructField::nullable( + PROTOCOL_NAME, + StructType::new_unchecked([]), + )]), + Some(Arc::new( + Expression::column(ColumnName::new([PROTOCOL_NAME, "minReaderVersion"])).is_not_null(), + )), +)] +#[case::txn_field( + StructType::new_unchecked([StructField::nullable( + SET_TRANSACTION_NAME, + StructType::new_unchecked([]), + )]), + Some(Arc::new( + Expression::column(ColumnName::new([SET_TRANSACTION_NAME, "appId"])).is_not_null(), + )), +)] +#[case::domain_metadata_field( + StructType::new_unchecked([StructField::nullable( + DOMAIN_METADATA_NAME, + StructType::new_unchecked([]), + )]), + Some(Arc::new( + Expression::column(ColumnName::new([DOMAIN_METADATA_NAME, "domain"])).is_not_null(), + )), +)] +#[case::unknown_field_returns_none( + StructType::new_unchecked([StructField::nullable(ADD_NAME, StructType::new_unchecked([]))]), + None, +)] +#[case::multiple_known_fields( + StructType::new_unchecked([ + StructField::nullable(METADATA_NAME, 
StructType::new_unchecked([])),
+        StructField::nullable(PROTOCOL_NAME, StructType::new_unchecked([])),
+    ]),
+    Some(Arc::new(Predicate::or(
+        Expression::column(ColumnName::new([METADATA_NAME, "id"])).is_not_null(),
+        Expression::column(ColumnName::new([PROTOCOL_NAME, "minReaderVersion"])).is_not_null(),
+    ))),
+)]
+#[case::known_and_unknown_field_returns_none(
+    StructType::new_unchecked([
+        StructField::nullable(METADATA_NAME, StructType::new_unchecked([])),
+        StructField::nullable(ADD_NAME, StructType::new_unchecked([])),
+    ]),
+    None,
+)]
+fn test_schema_to_is_not_null_predicate(
+    #[case] schema: StructType,
+    #[case] expected: Option<Arc<Predicate>>,
+) {
+    assert_eq!(schema_to_is_not_null_predicate(&schema), expected);
+}
+
+/// Verify that `read_actions` correctly handles null values in map fields across all
+/// action types. The Delta protocol allows null values in `partitionValues` maps (a null
+/// partition value means the partition column is null for that file) and in `tags` maps.
+///
+/// Spark defaults all `Map[String, String]` types to `valueContainsNull = true`, and
+/// checkpoint writing calls `schema.asNullable` which forces all maps nullable. The
+/// schema must match this behavior.
+///
+/// This test reads JSON actions through `DefaultEngine` + `InMemory` store +
+/// `log_segment.read_actions()`, then re-validates the resulting Arrow `StructArray` with
+/// `StructArray::try_new`. Without the fix, non-nullable map value fields cause:
+/// "Found unmasked nulls for non-nullable StructArray field 'value'"
+#[rstest]
+// remove.partitionValues.month: null
+#[case::remove_partition_values(
+    "remove",
+    "partitionValues",
+    r#"{"remove":{"path":"file.parquet","deletionTimestamp":1000,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"year":"2024","month":null},"size":100}}"#
+)]
+// remove.tags.key2: null
+#[case::remove_tags(
+    "remove",
+    "tags",
+    r#"{"remove":{"path":"file.parquet","deletionTimestamp":1000,"dataChange":true,"tags":{"key1":"val1","key2":null}}}"#
+)]
+// add.partitionValues.month: null
+#[case::add_partition_values(
+    "add",
+    "partitionValues",
+    r#"{"add":{"path":"file.parquet","partitionValues":{"year":"2024","month":null},"size":100,"modificationTime":1000,"dataChange":true}}"#
+)]
+// add.tags.key2: null
+#[case::add_tags(
+    "add",
+    "tags",
+    r#"{"add":{"path":"file.parquet","partitionValues":{},"size":100,"modificationTime":1000,"dataChange":true,"tags":{"key1":"val1","key2":null}}}"#
+)]
+// cdc.partitionValues.month: null
+#[case::cdc_partition_values(
+    "cdc",
+    "partitionValues",
+    r#"{"cdc":{"path":"file.parquet","partitionValues":{"year":"2024","month":null},"size":100,"dataChange":false}}"#
+)]
+// cdc.tags.key2: null
+#[case::cdc_tags(
+    "cdc",
+    "tags",
+    r#"{"cdc":{"path":"file.parquet","partitionValues":{},"size":100,"dataChange":false,"tags":{"key1":"val1","key2":null}}}"#
+)]
+// sidecar.tags.key2: null
+#[case::sidecar_tags(
+    "sidecar",
+    "tags",
+    r#"{"sidecar":{"path":"sidecar.parquet","sizeInBytes":100,"modificationTime":1000,"tags":{"key1":"val1","key2":null}}}"#
+)]
+// checkpointMetadata.tags.key2: null
+#[case::checkpoint_metadata_tags(
+    "checkpointMetadata",
+    "tags",
+    r#"{"checkpointMetadata":{"version":0,"tags":{"key1":"val1","key2":null}}}"#
+)]
+// Known issues: these map fields don't yet have #[allow_null_container_values].
+// commitInfo.operationParameters.description: null
+#[should_panic(expected = "StructArray re-validation failed")]
+#[case::commit_info_operation_parameters_known_issue(
+    "commitInfo",
+    "operationParameters",
+    r#"{"commitInfo":{"timestamp":1000,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","description":null}}}"#
+)]
+// metaData.configuration.key2: null
+#[should_panic(expected = "StructArray re-validation failed")]
+#[case::metadata_configuration_known_issue(
+    "metaData",
+    "configuration",
+    r#"{"metaData":{"id":"test","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{"key1":"val1","key2":null},"createdTime":1000}}"#
+)]
+#[tokio::test]
+async fn read_actions_with_null_map_values(
+    #[case] action_name: &str,
+    #[case] map_field: &str,
+    #[case] json_action: &str,
+) {
+    use crate::arrow::array::{Array, AsArray, MapArray, StructArray};
+
+    let store = Arc::new(InMemory::new());
+    let log_root = Url::parse("memory:///_delta_log/").unwrap();
+
+    // Write a single commit file with the action containing null map values.
+    store
+        .put(
+            &delta_path_for_version(0, "json"),
+            json_action.to_string().into(),
+        )
+        .await
+        .unwrap();
+
+    // Build engine and read actions -- same as DeltaActionExtractor::get_actions.
+    let engine = DefaultEngineBuilder::new(store).build();
+    let log_segment =
+        LogSegment::for_table_changes(engine.storage_handler().as_ref(), log_root, 0, Some(0))
+            .unwrap();
+
+    // Use all_actions_schema to cover sidecar and checkpointMetadata (checkpoint-only actions).
+    let action_schema = get_all_actions_schema().clone();
+    let action_batches = log_segment
+        .read_actions(&engine, action_schema)
+        .expect("read_actions should succeed");
+
+    // Iterate batches and verify the map value field is nullable.
+    let mut found = false;
+    for batch_result in action_batches {
+        let actions_batch = batch_result.expect("Iterating action batches should succeed");
+
+        let data_any = actions_batch.actions.into_any();
+        let arrow_data = data_any
+            .downcast_ref::<ArrowEngineData>()
+            .expect("ArrowEngineData");
+        let rb = arrow_data.record_batch();
+
+        let Some(action_col) = rb.column_by_name(action_name) else {
+            continue;
+        };
+        let action_struct = action_col
+            .as_struct_opt()
+            .unwrap_or_else(|| panic!("{action_name} column should be a struct"));
+        let map_col = action_struct
+            .column_by_name(map_field)
+            .unwrap_or_else(|| panic!("{action_name}.{map_field} not found"));
+        let map_array = map_col
+            .as_any()
+            .downcast_ref::<MapArray>()
+            .unwrap_or_else(|| panic!("{action_name}.{map_field} should be a MapArray"));
+        // Re-validate the entries StructArray with its own schema, same as what Arrow's
+        // IPC deserializer does. Without the fix, this fails with:
+        // "Found unmasked nulls for non-nullable StructArray field 'value'"
+        let entries = map_array.entries();
+        StructArray::try_new(
+            entries.fields().clone(),
+            entries.columns().to_vec(),
+            entries.nulls().cloned(),
+        )
+        .unwrap_or_else(|e| {
+            panic!(
+                "{action_name}.{map_field} entries StructArray re-validation failed: {e}. \
+                This means the schema has non-nullable value field but the data has nulls."
+            )
+        });
+        found = true;
+    }
+    assert!(found, "Should have found a {action_name} action batch");
+}
diff --git a/kernel/src/log_segment_files.rs b/kernel/src/log_segment_files.rs
new file mode 100644
index 0000000000..e6f9cd4312
--- /dev/null
+++ b/kernel/src/log_segment_files.rs
@@ -0,0 +1,518 @@
+//! [`LogSegmentFiles`] is a struct holding the result of listing the delta log. Currently, it
+//! exposes four APIs for listing:
+//! 1. [`list_commits`]: Lists all commit files between the provided start and end versions.
+//! 2. [`list`]: Lists all commit and checkpoint files between the provided start and end versions.
+//! 3. [`list_with_checkpoint_hint`]: Lists all commit and checkpoint files after the provided
+//!    checkpoint hint.
+//! 4. [`list_with_backward_checkpoint_scan`]: Scans backward from an end version in
+//!    1000-version windows until a complete checkpoint is found or the log is exhausted.
+//!
+//! After listing, one can leverage the [`LogSegmentFiles`] to construct a [`LogSegment`].
+//!
+//! [`list_with_backward_checkpoint_scan`]: Self::list_with_backward_checkpoint_scan
+//!
+//! [`list_commits`]: Self::list_commits
+//! [`list`]: Self::list
+//! [`list_with_checkpoint_hint`]: Self::list_with_checkpoint_hint
+//! [`LogSegment`]: crate::log_segment::LogSegment
+
+use std::collections::HashMap;
+
+use crate::last_checkpoint_hint::LastCheckpointHint;
+use crate::path::{LogPathFileType, ParsedLogPath};
+use crate::{DeltaResult, Error, StorageHandler, Version};
+
+use delta_kernel_derive::internal_api;
+
+use itertools::Itertools;
+use tracing::{debug, info, instrument};
+use url::Url;
+
+#[cfg(test)]
+mod tests;
+
+/// Represents the set of log files found during a listing operation in the Delta log directory.
+///
+/// - `ascending_commit_files`: All commit and staged commit files found, sorted by version. May contain gaps.
+/// - `ascending_compaction_files`: All compaction commit files found, sorted by version.
+/// - `checkpoint_parts`: All parts of the most recent complete checkpoint (all same version). Empty if no checkpoint found.
+/// - `latest_crc_file`: The CRC file with the highest version, only if version >= checkpoint version.
+/// - `latest_commit_file`: The commit file with the highest version, or `None` if no commits were
+///   found. This field may be present even when `ascending_commit_files` is empty, such as when a
+///   checkpoint subsumes all commits. In that case, it is retained because downstream code (e.g.
+///   In-Commit Timestamp reading) needs access to the commit file at the snapshot version.
+/// - `max_published_version`: The highest published commit file version, or `None` if no published commits were found.
+#[derive(Debug, Default, Clone, PartialEq, Eq)]
+#[internal_api]
+pub(crate) struct LogSegmentFiles {
+    pub ascending_commit_files: Vec<ParsedLogPath>,
+    pub ascending_compaction_files: Vec<ParsedLogPath>,
+    pub checkpoint_parts: Vec<ParsedLogPath>,
+    pub latest_crc_file: Option<ParsedLogPath>,
+    pub latest_commit_file: Option<ParsedLogPath>,
+    pub max_published_version: Option<Version>,
+}
+
+/// Returns a lazy iterator of [`ParsedLogPath`]s from the filesystem over versions
+/// `[start_version, end_version]`. The iterator handles parsing, filtering out non-listable
+/// files (e.g. staged commits, dot-prefixed files), and stopping at `end_version`.
+///
+/// This is a thin wrapper around [`StorageHandler::list_from`] that provides the standard
+/// Delta log file discovery pipeline. Callers are responsible for handling the `log_tail`
+/// (catalog-provided commits) and tracking `max_published_version`.
+fn list_from_storage(
+    storage: &dyn StorageHandler,
+    log_root: &Url,
+    start_version: Version,
+    end_version: Version,
+) -> DeltaResult<impl Iterator<Item = DeltaResult<ParsedLogPath>>> {
+    let start_from = log_root.join(&format!("{start_version:020}"))?;
+    let files = storage
+        .list_from(&start_from)?
+        .map(|meta| ParsedLogPath::try_from(meta?))
+        // NOTE: this filters out .crc files etc which start with "." - some engines
+        // produce `.something.parquet.crc` corresponding to `something.parquet`. Kernel
+        // doesn't care about these files. Critically, note these are _different_ than
+        // normal `version.crc` files which are listed + captured normally. Additionally
+        // we likely aren't even 'seeing' these files since lexicographically the string
+        // "." comes before the string "0".
+        .filter_map_ok(|path_opt| path_opt.filter(|p| p.should_list()))
+        .take_while(move |path_res| match path_res {
+            // discard any path with too-large version; keep errors
+            Ok(path) => path.version <= end_version,
+            Err(_) => true,
+        });
+    Ok(files)
+}
+
+/// Groups all checkpoint parts according to the checkpoint they belong to.
+///
+/// NOTE: There could be a single-part and/or any number of uuid-based checkpoints. They
+/// are all equivalent, and this routine keeps only one of them (arbitrarily chosen).
+fn group_checkpoint_parts(parts: Vec<ParsedLogPath>) -> HashMap<u32, Vec<ParsedLogPath>> {
+    let mut checkpoints: HashMap<u32, Vec<ParsedLogPath>> = HashMap::new();
+    for part_file in parts {
+        use LogPathFileType::*;
+        match &part_file.file_type {
+            SinglePartCheckpoint
+            | UuidCheckpoint
+            | MultiPartCheckpoint {
+                part_num: 1,
+                num_parts: 1,
+            } => {
+                // All single-file checkpoints are equivalent, just keep one
+                checkpoints.insert(1, vec![part_file]);
+            }
+            MultiPartCheckpoint {
+                part_num: 1,
+                num_parts,
+            } => {
+                // Start a new multi-part checkpoint with at least 2 parts
+                checkpoints.insert(*num_parts, vec![part_file]);
+            }
+            MultiPartCheckpoint {
+                part_num,
+                num_parts,
+            } => {
+                // Continue an existing multi-part checkpoint with at least 2 parts.
+                // Checkpoint parts are required to be in-order from log listing to build
+                // a multi-part checkpoint
+                if let Some(part_files) = checkpoints.get_mut(num_parts) {
+                    // `part_num` is guaranteed to be non-negative and within `usize` range
+                    if *part_num as usize == 1 + part_files.len() {
+                        // Safe to append because all previous parts exist
+                        part_files.push(part_file);
+                    }
+                }
+            }
+            Commit | StagedCommit | CompactedCommit { .. } | Crc | Unknown => {}
+        }
+    }
+    checkpoints
+}
+
+/// Returns the version of the latest complete checkpoint in `ascending_files`, or `None` if no
+/// complete checkpoint exists.
+fn find_complete_checkpoint_version(ascending_files: &[ParsedLogPath]) -> Option<Version> {
+    ascending_files
+        .iter()
+        .filter(|f| f.is_checkpoint())
+        .chunk_by(|f| f.version)
+        .into_iter()
+        .filter_map(|(version, parts)| {
+            let owned: Vec<_> = parts.cloned().collect();
+            group_checkpoint_parts(owned)
+                .iter()
+                .any(|(num_parts, part_files)| part_files.len() == *num_parts as usize)
+                .then_some(version)
+        })
+        .last()
+}
+
+/// Accumulates and groups log files during listing. Each "group" consists of all files that
+/// share the same version number (e.g., commit, checkpoint parts, CRC files).
+///
+/// We need to group by version because:
+/// 1. A version may have multiple checkpoint parts that must be collected before we can
+///    determine if the checkpoint is complete
+/// 2. If a complete checkpoint exists, we can discard all commits before it
+///
+/// Groups are flushed (processed) when we encounter a file with a different version or
+/// reach EOF, at which point we check for complete checkpoints and update our state.
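+///
+/// For example (illustrative): given a commit at version 4, a two-part checkpoint plus a commit
+/// at version 5, and a commit at version 6, streamed in that order, the version-5 group is
+/// flushed when version 6 is first seen. Because its checkpoint is complete, the commits at
+/// versions 4 and 5 are dropped from `ascending_commit_files` (the version-5 commit is kept only
+/// as `latest_commit_file`), and the version-6 commit is then accumulated as usual.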
+#[derive(Default)]
+struct ListingAccumulator {
+    /// The result being built up
+    output: LogSegmentFiles,
+    /// Staging area for checkpoint parts at the current version group; always empty when iteration ends
+    pending_checkpoint_parts: Vec<ParsedLogPath>,
+    /// End-version bound used in process_file() to filter CompactedCommit files
+    end_version: Option<Version>,
+    /// The version of the current group being accumulated
+    group_version: Option<Version>,
+}
+
+impl ListingAccumulator {
+    fn process_file(&mut self, file: ParsedLogPath) {
+        use LogPathFileType::*;
+        match file.file_type {
+            Commit | StagedCommit => self.output.ascending_commit_files.push(file),
+            CompactedCommit { hi } if self.end_version.is_none_or(|end| hi <= end) => {
+                self.output.ascending_compaction_files.push(file);
+            }
+            CompactedCommit { .. } => (), // Failed the bounds check above
+            SinglePartCheckpoint | UuidCheckpoint | MultiPartCheckpoint { .. } => {
+                self.pending_checkpoint_parts.push(file)
+            }
+            Crc => {
+                self.output.latest_crc_file.replace(file);
+            }
+            Unknown => {
+                // It is possible that there are other files being stashed away into
+                // _delta_log/. This is not necessarily forbidden, but something we
+                // want to know about in a debugging scenario
+                debug!(
+                    "Found file {} with unknown file type {:?} at version {}",
+                    file.filename, file.file_type, file.version
+                );
+            }
+        }
+    }
+
+    /// Called before processing each new file. If `file_version` differs from the current
+    /// `group_version`, finalizes the current group by calling `flush_checkpoint_group`,
+    /// then advances `group_version` to the new version. On the first call (when
+    /// `group_version` is `None`), simply initializes it.
+    fn maybe_flush_and_advance(&mut self, file_version: Version) {
+        match self.group_version {
+            Some(gv) if file_version != gv => {
+                self.flush_checkpoint_group(gv);
+                self.group_version = Some(file_version);
+            }
+            None => {
+                self.group_version = Some(file_version);
+            }
+            _ => {} // same version, no flush needed
+        }
+    }
+
+    /// Groups and finds the first complete checkpoint for this version.
+    /// All checkpoints for the same version are equivalent, so we only take one.
+    ///
+    /// If this version has a complete checkpoint, we can drop the existing commit and
+    /// compaction files we collected so far -- except we must keep the latest commit.
+    fn flush_checkpoint_group(&mut self, version: Version) {
+        let pending_checkpoint_parts = std::mem::take(&mut self.pending_checkpoint_parts);
+        if let Some((_, complete_checkpoint)) = group_checkpoint_parts(pending_checkpoint_parts)
+            .into_iter()
+            // `num_parts` is guaranteed to be non-negative and within `usize` range
+            .find(|(num_parts, part_files)| part_files.len() == *num_parts as usize)
+        {
+            self.output.checkpoint_parts = complete_checkpoint;
+            // Keep the commit at the checkpoint version (if any) before clearing all older commits.
+            self.output.latest_commit_file = self
+                .output
+                .ascending_commit_files
+                .last()
+                .filter(|c| c.version == version)
+                .cloned();
+            // Log replay only uses commits/compactions after a complete checkpoint
+            self.output.ascending_commit_files.clear();
+            self.output.ascending_compaction_files.clear();
+            // Drop CRC file if older than checkpoint (CRC must be >= checkpoint version)
+            if self
+                .output
+                .latest_crc_file
+                .as_ref()
+                .is_some_and(|crc| crc.version < version)
+            {
+                self.output.latest_crc_file = None;
+            }
+        }
+    }
+}
+
+/// Number of versions covered by each backward-scan window in
+/// `LogSegmentFiles::list_with_backward_checkpoint_scan`
+const BACKWARD_SCAN_WINDOW_SIZE: u64 = 1000;
+
+impl LogSegmentFiles {
+    /// Assembles a `LogSegmentFiles` from `fs_files` (an iterator of files
+    /// listed from storage) and `log_tail` (catalog-provided commits).
+    ///
+    /// - `fs_files`: files listed from storage in ascending version order
+    /// - `log_tail`: list of commits that takes precedence over the filesystem ones
+    /// - `start_version`: start version of the entire listing range provided; in practice,
+    ///   this is the lower bound (inclusive) for log_tail entries included in the result
+    /// - `end_version`: upper bound (inclusive) on versions to include, `None` means no bound
+    pub(crate) fn build_log_segment_files(
+        fs_files: impl Iterator<Item = DeltaResult<ParsedLogPath>>,
+        log_tail: Vec<ParsedLogPath>,
+        start_version: Version,
+        end_version: Option<Version>,
+    ) -> DeltaResult<Self> {
+        // check log_tail is only commits
+        // note that LogSegment checks no gaps/duplicates so we don't duplicate that here
+        debug_assert!(
+            log_tail.iter().all(|entry| entry.is_commit()),
+            "log_tail should only contain commits"
+        );
+
+        let log_tail_start_version = log_tail.first().map(|f| f.version);
+        let end = end_version.unwrap_or(Version::MAX);
+
+        let mut acc = ListingAccumulator {
+            end_version,
+            ..Default::default()
+        };
+
+        // Phase 1: Stream filesystem files lazily (no collect).
+        // We always list from the filesystem even when the log_tail covers the entire commit
+        // range, because non-commit files (CRC, checkpoints, compactions) only exist on the
+        // filesystem — the log_tail only provides commit files.
+        for file_result in fs_files {
+            let file = file_result?;
+
+            // Track max published commit version from ALL filesystem Commit files,
+            // including those that will be skipped because log_tail takes precedence.
+            if matches!(file.file_type, LogPathFileType::Commit) {
+                acc.output.max_published_version =
+                    acc.output.max_published_version.max(Some(file.version));
+            }
+
+            // Skip filesystem commits at versions covered by the log_tail (the log_tail
+            // is authoritative for commits). Non-commit files are always kept.
+            if file.is_commit()
+                && log_tail_start_version.is_some_and(|tail_start| file.version >= tail_start)
+            {
+                continue;
+            }
+
+            acc.maybe_flush_and_advance(file.version);
+            acc.process_file(file);
+        }
+
+        // Phase 2: Process log_tail entries. We do this after Phase 1 because log_tail commits
+        // start at log_tail_start_version and are in ascending version order — they always extend
+        // (or overlap with, but supersede) the filesystem-listed commits. Processing them after
+        // Phase 1 maintains ascending version order throughout, which is required by the checkpoint
+        // grouping logic. Note that Phase 1 already skipped filesystem commits at log_tail
+        // versions, so there's no duplication here.
+        //
+        // log_tail entries at versions before a checkpoint may still be included
+        // here - LogSegment::try_new is the safeguard that filters those out unconditionally
+        let filtered_log_tail = log_tail
+            .into_iter()
+            .filter(|entry| entry.version >= start_version && entry.version <= end);
+        for file in filtered_log_tail {
+            // Track max published version for published commits from the log_tail
+            if matches!(file.file_type, LogPathFileType::Commit) {
+                acc.output.max_published_version =
+                    acc.output.max_published_version.max(Some(file.version));
+            }
+
+            acc.maybe_flush_and_advance(file.version);
+            acc.process_file(file);
+        }
+
+        // Flush the final group
+        if let Some(gv) = acc.group_version {
+            acc.flush_checkpoint_group(gv);
+        }
+
+        // Since ascending_commit_files is cleared at each checkpoint, if it's non-empty here
+        // it contains only commits after the most recent checkpoint. The last element is the
+        // highest version commit overall, so we update latest_commit_file to it. If it's empty,
+        // we keep the value set at the checkpoint (if a commit existed at the checkpoint version),
+        // or it remains None.
+        if let Some(commit_file) = acc.output.ascending_commit_files.last() {
+            acc.output.latest_commit_file = Some(commit_file.clone());
+        }
+
+        Ok(acc.output)
+    }
+
+    pub(crate) fn ascending_commit_files(&self) -> &Vec<ParsedLogPath> {
+        &self.ascending_commit_files
+    }
+
+    pub(crate) fn ascending_commit_files_mut(&mut self) -> &mut Vec<ParsedLogPath> {
+        &mut self.ascending_commit_files
+    }
+
+    pub(crate) fn checkpoint_parts(&self) -> &Vec<ParsedLogPath> {
+        &self.checkpoint_parts
+    }
+
+    pub(crate) fn latest_commit_file(&self) -> &Option<ParsedLogPath> {
+        &self.latest_commit_file
+    }
+
+    /// List all commits between the provided `start_version` (inclusive) and `end_version`
+    /// (inclusive). All other file types are ignored.
+    pub(crate) fn list_commits(
+        storage: &dyn StorageHandler,
+        log_root: &Url,
+        start_version: Option<Version>,
+        end_version: Option<Version>,
+    ) -> DeltaResult<Self> {
+        // TODO: plumb through a log_tail provided by our caller
+        let start = start_version.unwrap_or(0);
+        let end = end_version.unwrap_or(Version::MAX);
+        let fs_iter = list_from_storage(storage, log_root, start, end)?;
+
+        let mut listed_commits = Vec::new();
+        let mut max_published_version: Option<Version> = None;
+
+        for file_result in fs_iter {
+            let file = file_result?;
+            if matches!(file.file_type, LogPathFileType::Commit) {
+                max_published_version = max_published_version.max(Some(file.version));
+                listed_commits.push(file);
+            }
+        }
+
+        let latest_commit_file = listed_commits.last().cloned();
+        Ok(LogSegmentFiles {
+            ascending_commit_files: listed_commits,
+            latest_commit_file,
+            max_published_version,
+            ..Default::default()
+        })
+    }
+
+    /// List all commit and checkpoint files with versions above the provided `start_version` (inclusive).
+    /// If successful, this returns a `LogSegmentFiles`.
+    ///
+    /// The `log_tail` is an optional sequence of commits provided by the caller, e.g. via
+    /// [`SnapshotBuilder::with_log_tail`]. It may contain either published or staged commits. The
+    /// `log_tail` must strictly adhere to being a 'tail' — a contiguous cover of versions `X..=Y`
+    /// where `Y` is the latest version of the table. If it overlaps with commits listed from the
+    /// filesystem, the `log_tail` will take precedence for commits; non-commit files (CRC,
+    /// checkpoints, compactions) are always taken from the filesystem.
+    // TODO: encode some of these guarantees in the output types. e.g. we could have:
+    // - SortedCommitFiles: Vec<ParsedLogPath>, is_ascending: bool, end_version: Version
+    // - CheckpointParts: Vec<ParsedLogPath>, checkpoint_version: Version (guarantee all same version)
+    #[instrument(name = "log.list", skip_all, fields(start = ?start_version, end = ?end_version), err)]
+    pub(crate) fn list(
+        storage: &dyn StorageHandler,
+        log_root: &Url,
+        log_tail: Vec<ParsedLogPath>,
+        start_version: Option<Version>,
+        end_version: Option<Version>,
+    ) -> DeltaResult<Self> {
+        let start = start_version.unwrap_or(0);
+        let end = end_version.unwrap_or(Version::MAX);
+        let fs_iter = list_from_storage(storage, log_root, start, end)?;
+        Self::build_log_segment_files(fs_iter, log_tail, start, end_version)
+    }
+
+    /// List all commit and checkpoint files after the provided checkpoint. It is guaranteed that all
+    /// the returned [`ParsedLogPath`]s will have a version less than or equal to the `end_version`.
+    pub(crate) fn list_with_checkpoint_hint(
+        checkpoint_metadata: &LastCheckpointHint,
+        storage: &dyn StorageHandler,
+        log_root: &Url,
+        log_tail: Vec<ParsedLogPath>,
+        end_version: Option<Version>,
+    ) -> DeltaResult<Self> {
+        let listed_files = Self::list(
+            storage,
+            log_root,
+            log_tail,
+            Some(checkpoint_metadata.version),
+            end_version,
+        )?;
+
+        let Some(latest_checkpoint) = listed_files.checkpoint_parts.last() else {
+            // Kernel should not compensate for corrupt tables, so we fail if we can't find a checkpoint
+            return Err(Error::invalid_checkpoint(
+                "Had a _last_checkpoint hint but didn't find any checkpoints",
+            ));
+        };
+        if latest_checkpoint.version != checkpoint_metadata.version {
+            info!(
+                "_last_checkpoint hint is out of date. _last_checkpoint version: {}. Using actual most recent: {}",
+                checkpoint_metadata.version,
+                latest_checkpoint.version
+            );
+        } else if listed_files.checkpoint_parts.len() != checkpoint_metadata.parts.unwrap_or(1) {
+            return Err(Error::InvalidCheckpoint(format!(
+                "_last_checkpoint indicated that checkpoint should have {} parts, but it has {}",
+                checkpoint_metadata.parts.unwrap_or(1),
+                listed_files.checkpoint_parts.len()
+            )));
+        }
+        Ok(listed_files)
+    }
+
+    /// Returns a [`LogSegmentFiles`] ending at `end_version`, rooted at the most recent complete
+    /// checkpoint at or before `end_version`, or rooted at version 0 if no checkpoint is found.
+    ///
+    /// To find the checkpoint without a full forward listing from version 0, this scans backward
+    /// from `end_version` in windows of size [`BACKWARD_SCAN_WINDOW_SIZE`], stopping as soon as
+    /// a complete checkpoint is found (or version 0 is reached).
+    /// Then, all files from the windows that were scanned are combined with `log_tail` to produce
+    /// a log segment rooted at the checkpoint version (or version 0 if no checkpoint) with all
+    /// commits after the checkpoint version. A log_tail commit at exactly the checkpoint version
+    /// may be included at this stage but will be filtered out by `LogSegment::try_new`.
+    ///
+    /// For example, given the desired end_version = 12500 and a checkpoint at v8900:
+    /// - Window 1 [11501, 12501): no checkpoint -> continue
+    /// - Window 2 [10501, 11501): no checkpoint -> continue
+    /// - Window 3 [9501, 10501): no checkpoint -> continue
+    /// - Window 4 [8501, 9501): checkpoint at v8900 found -> stop
+    ///
+    /// All files from windows 1-4 are combined with `log_tail` to produce a log segment
+    /// rooted at the checkpoint at v8900 with all commits from v8901 to v12500.
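+    /// Each window corresponds to one [`StorageHandler::list_from`] request, so the example above
+    /// issues four listing calls rather than a single listing over the full history.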
+    #[instrument(name = "log.list_with_backward_checkpoint_scan", skip_all, fields(end = end_version), err)]
+    pub(crate) fn list_with_backward_checkpoint_scan(
+        storage: &dyn StorageHandler,
+        log_root: &Url,
+        log_tail: Vec<ParsedLogPath>,
+        end_version: Version,
+    ) -> DeltaResult<Self> {
+        // Scan backward in 1000-version windows, collecting ALL file types, until a complete
+        // checkpoint is found or the log is exhausted.
+        let mut windows: Vec<Vec<ParsedLogPath>> = Vec::new();
+        let mut found_checkpoint_version: Option<Version> = None;
+        // upper is the exclusive upper bound of the next window; adding 1 includes end_version
+        // in the first window. The inclusive range passed to list_from_storage is [lower, upper - 1].
+        let mut upper = end_version + 1;
+        while upper > 0 {
+            let lower = upper.saturating_sub(BACKWARD_SCAN_WINDOW_SIZE);
+            let window_files: Vec<_> =
+                list_from_storage(storage, log_root, lower, upper - 1)?.try_collect()?;
+
+            found_checkpoint_version = find_complete_checkpoint_version(&window_files);
+            windows.push(window_files);
+
+            if found_checkpoint_version.is_some() {
+                break;
+            }
+            upper = lower;
+        }
+
+        let fs_iter = windows.into_iter().rev().flatten().map(Ok);
+        let start = found_checkpoint_version.unwrap_or(0);
+        Self::build_log_segment_files(fs_iter, log_tail, start, Some(end_version))
+    }
+}
diff --git a/kernel/src/log_segment_files/tests.rs b/kernel/src/log_segment_files/tests.rs
new file mode 100644
index 0000000000..176b9ad2ff
--- /dev/null
+++ b/kernel/src/log_segment_files/tests.rs
@@ -0,0 +1,919 @@
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::Arc;
+
+use url::Url;
+
+use rstest::rstest;
+
+use crate::engine::default::executor::tokio::TokioBackgroundExecutor;
+use crate::engine::default::filesystem::ObjectStoreStorageHandler;
+use crate::object_store::{memory::InMemory, path::Path as ObjectPath, ObjectStore};
+use crate::FileMeta;
+
+use super::*;
+
+// size markers used to identify commit sources in tests
+const FILESYSTEM_SIZE_MARKER: u64 = 10;
+const CATALOG_SIZE_MARKER: u64 = 7;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum CommitSource {
+    Filesystem,
+    Catalog,
+}
+
+// create test storage given list of log files with custom data content
+async fn create_storage(
+    log_files: Vec<(Version, LogPathFileType, CommitSource)>,
+) -> (Box<dyn StorageHandler>, Url) {
+    let store = Arc::new(InMemory::new());
+    let log_root = Url::parse("memory:///_delta_log/").unwrap();
+
+    for (version, file_type, source) in log_files {
+        let path = match file_type {
+            LogPathFileType::Commit => {
+                format!("_delta_log/{version:020}.json")
+            }
+            LogPathFileType::StagedCommit => {
+                let uuid = uuid::Uuid::new_v4();
+                format!("_delta_log/_staged_commits/{version:020}.{uuid}.json")
+            }
+            LogPathFileType::SinglePartCheckpoint => {
+                format!("_delta_log/{version:020}.checkpoint.parquet")
+            }
+            LogPathFileType::MultiPartCheckpoint {
+                part_num,
+                num_parts,
+            } => {
+                format!(
+                    "_delta_log/{version:020}.checkpoint.{part_num:010}.{num_parts:010}.parquet"
+                )
+            }
+            LogPathFileType::Crc => {
+                format!("_delta_log/{version:020}.crc")
+            }
+            LogPathFileType::CompactedCommit { hi } => {
+                format!("_delta_log/{version:020}.{hi:020}.compacted.json")
+            }
+            LogPathFileType::UuidCheckpoint | LogPathFileType::Unknown => {
+                panic!("Unsupported file type in test: {file_type:?}")
+            }
+        };
+        let data = match source {
+            CommitSource::Filesystem => bytes::Bytes::from("filesystem"),
+            CommitSource::Catalog => bytes::Bytes::from("catalog"),
+        };
+        store
+            .put(&ObjectPath::from(path.as_str()), data.into())
+            .await
+            .expect("Failed to put test file");
+    }
+
+    let executor = Arc::new(TokioBackgroundExecutor::new());
+    let storage = Box::new(ObjectStoreStorageHandler::new(store, executor, None));
+    (storage, log_root)
+}
+
+// helper to create a ParsedLogPath with specific source marker
+fn make_parsed_log_path_with_source(
+    version: Version,
+    file_type: LogPathFileType,
+    source: CommitSource,
+) -> ParsedLogPath {
+    let url = Url::parse(&format!("memory:///_delta_log/{version:020}.json")).unwrap();
+    let mut filename_path_segments = url.path_segments().unwrap();
+    let filename = filename_path_segments.next_back().unwrap().to_string();
+    let extension = filename.split('.').next_back().unwrap().to_string();
+
+    let size = match source {
+        CommitSource::Filesystem => FILESYSTEM_SIZE_MARKER,
+        CommitSource::Catalog => CATALOG_SIZE_MARKER,
+    };
+
+    let location = FileMeta {
+        location: url,
+        last_modified: 0,
+        size,
+    };
+
+    ParsedLogPath {
+        location,
+        filename,
+        extension,
+        version,
+        file_type,
+    }
+}
+
+fn assert_source(commit: &ParsedLogPath, expected_source: CommitSource) {
+    let expected_size = match expected_source {
+        CommitSource::Filesystem => FILESYSTEM_SIZE_MARKER,
+        CommitSource::Catalog => CATALOG_SIZE_MARKER,
+    };
+    assert_eq!(
+        commit.location.size, expected_size,
+        "Commit version {} should be from {:?}, but size was {}",
+        commit.version, expected_source, commit.location.size
+    );
+}
+
+/// A [`StorageHandler`] wrapper that counts the number of `list_from` calls.
+/// Used to verify that `list_with_backward_checkpoint_scan` issues the expected
+/// number of storage listing requests.
+struct CountingStorageHandler {
+    inner: Box<dyn StorageHandler>,
+    list_from_count: AtomicU32,
+}
+
+impl CountingStorageHandler {
+    fn new(inner: Box<dyn StorageHandler>) -> Self {
+        Self {
+            inner,
+            list_from_count: AtomicU32::new(0),
+        }
+    }
+
+    fn call_count(&self) -> u32 {
+        self.list_from_count.load(Ordering::Relaxed)
+    }
+}
+
+impl StorageHandler for CountingStorageHandler {
+    fn list_from(
+        &self,
+        path: &Url,
+    ) -> DeltaResult<Box<dyn Iterator<Item = DeltaResult<FileMeta>>>> {
+        self.list_from_count.fetch_add(1, Ordering::Relaxed);
+        self.inner.list_from(path)
+    }
+
+    fn read_files(
+        &self,
+        _files: Vec<crate::FileSlice>,
+    ) -> DeltaResult<Box<dyn Iterator<Item = DeltaResult<bytes::Bytes>>>> {
+        panic!("read_files should not be called during listing");
+    }
+
+    fn put(&self, _path: &Url, _data: bytes::Bytes, _overwrite: bool) -> DeltaResult<()> {
+        panic!("put should not be called during listing");
+    }
+
+    fn copy_atomic(&self, _src: &Url, _dest: &Url) -> DeltaResult<()> {
+        panic!("copy_atomic should not be called during listing");
+    }
+
+    fn head(&self, _path: &Url) -> DeltaResult<FileMeta> {
+        panic!("head should not be called during listing");
+    }
+}
+
+/// Helper to call `LogSegmentFiles::list()` and destructure the result for assertions.
+/// Returns (ascending_commit_files, ascending_compaction_files, checkpoint_parts,
+/// latest_crc_file, latest_commit_file, max_published_version).
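+/// The tuple components are returned in the same order as the corresponding [`LogSegmentFiles`]
+/// fields.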
+#[allow(clippy::type_complexity)]
+fn list_and_destructure(
+    storage: &dyn StorageHandler,
+    log_root: &Url,
+    log_tail: Vec<ParsedLogPath>,
+    start_version: Option<Version>,
+    end_version: Option<Version>,
+) -> (
+    Vec<ParsedLogPath>,
+    Vec<ParsedLogPath>,
+    Vec<ParsedLogPath>,
+    Option<ParsedLogPath>,
+    Option<ParsedLogPath>,
+    Option<Version>,
+) {
+    let r = LogSegmentFiles::list(storage, log_root, log_tail, start_version, end_version).unwrap();
+    (
+        r.ascending_commit_files,
+        r.ascending_compaction_files,
+        r.checkpoint_parts,
+        r.latest_crc_file,
+        r.latest_commit_file,
+        r.max_published_version,
+    )
+}
+
+// ===== list() tests =====
+
+#[tokio::test]
+async fn test_empty_log_tail() {
+    let log_files = vec![
+        (0, LogPathFileType::Commit, CommitSource::Filesystem),
+        (1, LogPathFileType::Commit, CommitSource::Filesystem),
+        (2, LogPathFileType::Commit, CommitSource::Filesystem),
+    ];
+    let (storage, log_root) = create_storage(log_files).await;
+
+    let (commits, _, _, _, latest_commit, max_pub) =
+        list_and_destructure(storage.as_ref(), &log_root, vec![], Some(1), Some(2));
+
+    assert_eq!(commits.len(), 2);
+    assert_eq!(commits[0].version, 1);
+    assert_eq!(commits[1].version, 2);
+    assert_source(&commits[0], CommitSource::Filesystem);
+    assert_source(&commits[1], CommitSource::Filesystem);
+    assert_eq!(latest_commit.unwrap().version, 2);
+    assert_eq!(max_pub, Some(2));
+}
+
+#[tokio::test]
+async fn test_log_tail_has_latest_commit_files() {
+    // Filesystem has commits 0-2, log_tail has commits 3-5 (the latest)
+    let log_files = vec![
+        (0, LogPathFileType::Commit, CommitSource::Filesystem),
+        (1, LogPathFileType::Commit, CommitSource::Filesystem),
+        (2, LogPathFileType::Commit, CommitSource::Filesystem),
+    ];
+    let (storage, log_root) = create_storage(log_files).await;
+
+    let log_tail = vec![
+        make_parsed_log_path_with_source(3, LogPathFileType::Commit, CommitSource::Catalog),
+        make_parsed_log_path_with_source(4, LogPathFileType::Commit, CommitSource::Catalog),
+        make_parsed_log_path_with_source(5, LogPathFileType::Commit, CommitSource::Catalog),
+    ];
+
+    let (commits, _, _, _, latest_commit, max_pub) =
+        list_and_destructure(storage.as_ref(), &log_root, log_tail, Some(0), Some(5));
+
+    assert_eq!(commits.len(), 6);
+    // filesystem commits 0-2
+    for (i, commit) in commits.iter().enumerate().take(3) {
+        assert_eq!(commit.version, i as u64);
+        assert_source(commit, CommitSource::Filesystem);
+    }
+    // catalog commits 3-5
+    for (i, commit) in commits.iter().enumerate().skip(3) {
+        assert_eq!(commit.version, i as u64);
+        assert_source(commit, CommitSource::Catalog);
+    }
+    assert_eq!(latest_commit.unwrap().version, 5);
+    assert_eq!(max_pub, Some(5));
+}
+
+#[tokio::test]
+async fn test_request_subset_with_log_tail() {
+    // Test requesting a subset when log_tail is the latest commits
+    let log_files = vec![
+        (0, LogPathFileType::Commit, CommitSource::Filesystem),
+        (1, LogPathFileType::Commit, CommitSource::Filesystem),
+    ];
+    let (storage, log_root) = create_storage(log_files).await;
+
+    // log_tail represents versions 2-4 (latest commits)
+    let log_tail = vec![
+        make_parsed_log_path_with_source(2, LogPathFileType::Commit, CommitSource::Catalog),
+        make_parsed_log_path_with_source(3, LogPathFileType::Commit, CommitSource::Catalog),
+        make_parsed_log_path_with_source(4, LogPathFileType::Commit, CommitSource::Catalog),
+    ];
+
+    // list for only versions 1-3
+    let (commits, _, _, _, latest_commit, max_pub) =
+        list_and_destructure(storage.as_ref(), &log_root, log_tail, Some(1), Some(3));
+
+    assert_eq!(commits.len(), 3);
+    assert_eq!(commits[0].version, 1);
+    assert_eq!(commits[1].version, 2);
+    assert_eq!(commits[2].version, 3);
+    assert_source(&commits[0], CommitSource::Filesystem);
+    assert_source(&commits[1], CommitSource::Catalog);
+    assert_source(&commits[2], CommitSource::Catalog);
+    assert_eq!(latest_commit.unwrap().version, 3);
+    assert_eq!(max_pub, Some(3));
+}
+
+#[tokio::test]
+async fn test_log_tail_defines_latest_version() {
+    // log_tail defines the latest version of the table: if there are filesystem files after the
+    // log tail, they are ignored. But we still list all filesystem files to track
+    // max_published_version.
+    let log_files = vec![
+        (0, LogPathFileType::Commit, CommitSource::Filesystem),
+        (1, LogPathFileType::Commit, CommitSource::Filesystem),
+        (2, LogPathFileType::Commit, CommitSource::Filesystem), // <-- max_published_version
+    ];
+    let (storage, log_root) = create_storage(log_files).await;
+
+    // log_tail is just [1], indicating version 1 is the latest
+    let log_tail = vec![make_parsed_log_path_with_source(
+        1,
+        LogPathFileType::Commit,
+        CommitSource::Catalog,
+    )];
+
+    let (commits, _, _, _, latest_commit, max_pub) =
+        list_and_destructure(storage.as_ref(), &log_root, log_tail, Some(0), None);
+
+    // expect only 0 from the filesystem and 1 from the log tail
+    assert_eq!(commits.len(), 2);
+    assert_eq!(commits[0].version, 0);
+    assert_eq!(commits[1].version, 1);
+    assert_source(&commits[0], CommitSource::Filesystem);
+    assert_source(&commits[1], CommitSource::Catalog);
+    assert_eq!(latest_commit.unwrap().version, 1);
+    // max_published_version should reflect the highest published commit on filesystem
+    assert_eq!(max_pub, Some(2));
+}
+
+#[test]
+fn test_log_tail_covers_entire_range_empty_filesystem() {
+    // Test-only storage handler that returns an empty listing.
+    // When the log_tail covers the entire commit range, we still call list_from
+    // (to pick up non-commit files like CRC/checkpoints), but the filesystem may
+    // have nothing — e.g. a purely catalog-managed table.
+    struct EmptyStorageHandler;
+    impl StorageHandler for EmptyStorageHandler {
+        fn list_from(
+            &self,
+            _path: &Url,
+        ) -> DeltaResult<Box<dyn Iterator<Item = DeltaResult<FileMeta>>>> {
+            Ok(Box::new(std::iter::empty()))
+        }
+        fn read_files(
+            &self,
+            _files: Vec<crate::FileSlice>,
+        ) -> DeltaResult<Box<dyn Iterator<Item = DeltaResult<bytes::Bytes>>>> {
+            panic!("read_files should not be called during listing");
+        }
+        fn put(&self, _path: &Url, _data: bytes::Bytes, _overwrite: bool) -> DeltaResult<()> {
+            panic!("put should not be called during listing");
+        }
+        fn copy_atomic(&self, _src: &Url, _dest: &Url) -> DeltaResult<()> {
+            panic!("copy_atomic should not be called during listing");
+        }
+        fn head(&self, _path: &Url) -> DeltaResult<FileMeta> {
+            panic!("head should not be called during listing");
+        }
+    }
+
+    // log_tail covers versions 0-2, the entire range
+    let log_tail = vec![
+        make_parsed_log_path_with_source(0, LogPathFileType::Commit, CommitSource::Catalog),
+        make_parsed_log_path_with_source(1, LogPathFileType::Commit, CommitSource::Catalog),
+        make_parsed_log_path_with_source(2, LogPathFileType::StagedCommit, CommitSource::Catalog),
+    ];
+
+    let storage = EmptyStorageHandler;
+    let url = Url::parse("memory:///anything/_delta_log/").unwrap();
+    let (commits, _, _, _, latest_commit, max_pub) =
+        list_and_destructure(&storage, &url, log_tail, Some(0), Some(2));
+
+    // Only log_tail commits should appear (filesystem is empty)
+    assert_eq!(commits.len(), 3);
+    assert_eq!(commits[0].version, 0);
+    assert_eq!(commits[1].version, 1);
+    assert_eq!(commits[2].version, 2);
+    assert_source(&commits[0], CommitSource::Catalog);
+    assert_source(&commits[1], CommitSource::Catalog);
+    assert_source(&commits[2], CommitSource::Catalog);
+    assert_eq!(latest_commit.unwrap().version, 2);
+    // Only published (non-staged) commits from log_tail count for max_published_version
+    assert_eq!(max_pub, Some(1));
+}
+
+#[tokio::test]
+async fn test_log_tail_covers_entire_range_with_crc() {
+    // When log_tail covers the entire requested range (starts at version 0), commit files
+    // from the filesystem should be excluded (log_tail is authoritative for commits), but
+    // non-commit files (CRC, checkpoints) should still be picked up from the filesystem.
+ let log_files = vec![ + (0, LogPathFileType::Commit, CommitSource::Filesystem), + (1, LogPathFileType::Commit, CommitSource::Filesystem), + (2, LogPathFileType::Crc, CommitSource::Filesystem), + ]; + let (storage, log_root) = create_storage(log_files).await; + + // log_tail covers versions 0-2, which includes the entire range we'll request + let log_tail = vec![ + make_parsed_log_path_with_source(0, LogPathFileType::Commit, CommitSource::Catalog), + make_parsed_log_path_with_source(1, LogPathFileType::Commit, CommitSource::Catalog), + make_parsed_log_path_with_source(2, LogPathFileType::StagedCommit, CommitSource::Catalog), + ]; + + let (commits, _, _, latest_crc, latest_commit, max_pub) = + list_and_destructure(storage.as_ref(), &log_root, log_tail, Some(0), Some(2)); + + // 3 commits from log_tail: 0, 1, 2 + assert_eq!(commits.len(), 3); + assert_source(&commits[0], CommitSource::Catalog); + assert_source(&commits[1], CommitSource::Catalog); + assert_source(&commits[2], CommitSource::Catalog); + + // CRC at version 2 from filesystem is preserved + let crc = latest_crc.unwrap(); + assert_eq!(crc.version, 2); + assert!(matches!(crc.file_type, LogPathFileType::Crc)); + + assert_eq!(latest_commit.unwrap().version, 2); + // Only published commits count: filesystem 0,1 (skipped but tracked) + log_tail 0,1 + assert_eq!(max_pub, Some(1)); +} + +#[tokio::test] +async fn test_listing_omits_staged_commits() { + // note that in the presence of staged commits, we CANNOT trust listing to determine which + // to include in our listing/log segment. This is up to the catalog. (e.g. version + // 5.uuid1.json and 5.uuid2.json can both exist and only catalog can say which is the 'real' + // version 5). + + let log_files = vec![ + (0, LogPathFileType::Commit, CommitSource::Filesystem), + (1, LogPathFileType::Commit, CommitSource::Filesystem), // <-- max_published_version + (1, LogPathFileType::StagedCommit, CommitSource::Filesystem), + (2, LogPathFileType::StagedCommit, CommitSource::Filesystem), + ]; + + let (storage, log_root) = create_storage(log_files).await; + let (commits, _, _, _, latest_commit, max_pub) = + list_and_destructure(storage.as_ref(), &log_root, vec![], None, None); + + // we must only see two regular commits + assert_eq!(commits.len(), 2); + assert_eq!(commits[0].version, 0); + assert_eq!(commits[1].version, 1); + assert_source(&commits[0], CommitSource::Filesystem); + assert_source(&commits[1], CommitSource::Filesystem); + assert_eq!(latest_commit.unwrap().version, 1); + assert_eq!(max_pub, Some(1)); +} + +#[tokio::test] +async fn test_listing_with_large_end_version() { + let log_files = vec![ + (0, LogPathFileType::Commit, CommitSource::Filesystem), + (1, LogPathFileType::Commit, CommitSource::Filesystem), // <-- max_published_version + (2, LogPathFileType::StagedCommit, CommitSource::Filesystem), + ]; + + let (storage, log_root) = create_storage(log_files).await; + // note we let you request end version past the end of log. up to consumer to interpret + let (commits, _, _, _, latest_commit, max_pub) = + list_and_destructure(storage.as_ref(), &log_root, vec![], None, Some(3)); + + // we must only see two regular commits + assert_eq!(commits.len(), 2); + assert_eq!(commits[0].version, 0); + assert_eq!(commits[1].version, 1); + assert_eq!(latest_commit.unwrap().version, 1); + assert_eq!(max_pub, Some(1)); +} + +#[tokio::test] +async fn test_non_commit_files_at_log_tail_versions_are_preserved() { + // Filesystem has commits 0-5, a checkpoint at version 7, and a CRC at version 8. 
+ // Log tail provides commits 6-10. The checkpoint and CRC are on the filesystem + // at versions covered by the log_tail and must NOT be filtered out. + // + // After processing through ListingAccumulator, the checkpoint at version 7 + // causes commits before it to be cleared, keeping only commits after the checkpoint. + let log_files = vec![ + (0, LogPathFileType::Commit, CommitSource::Filesystem), + (1, LogPathFileType::Commit, CommitSource::Filesystem), + (2, LogPathFileType::Commit, CommitSource::Filesystem), + (3, LogPathFileType::Commit, CommitSource::Filesystem), + (4, LogPathFileType::Commit, CommitSource::Filesystem), + (5, LogPathFileType::Commit, CommitSource::Filesystem), + ( + 7, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + ), + (8, LogPathFileType::Crc, CommitSource::Filesystem), + ]; + let (storage, log_root) = create_storage(log_files).await; + + let log_tail = vec![ + make_parsed_log_path_with_source(6, LogPathFileType::Commit, CommitSource::Catalog), + make_parsed_log_path_with_source(7, LogPathFileType::Commit, CommitSource::Catalog), + make_parsed_log_path_with_source(8, LogPathFileType::Commit, CommitSource::Catalog), + make_parsed_log_path_with_source(9, LogPathFileType::Commit, CommitSource::Catalog), + make_parsed_log_path_with_source(10, LogPathFileType::Commit, CommitSource::Catalog), + ]; + + let (commits, _, checkpoint_parts, latest_crc, latest_commit, max_pub) = + list_and_destructure(storage.as_ref(), &log_root, log_tail, Some(0), Some(10)); + + // Checkpoint at version 7 is preserved from filesystem + assert_eq!(checkpoint_parts.len(), 1); + assert_eq!(checkpoint_parts[0].version, 7); + assert!(checkpoint_parts[0].is_checkpoint()); + + // CRC at version 8 is preserved from filesystem + let crc = latest_crc.unwrap(); + assert_eq!(crc.version, 8); + assert!(matches!(crc.file_type, LogPathFileType::Crc)); + + // After checkpoint processing: commits before checkpoint are cleared, + // only log_tail commits 6-10 remain (added after checkpoint flush) + assert_eq!(commits.len(), 5); + for (i, commit) in commits.iter().enumerate() { + assert_eq!(commit.version, (i + 6) as u64); + assert_source(commit, CommitSource::Catalog); + } + assert_eq!(latest_commit.unwrap().version, 10); + + // max_published_version reflects all published commits seen (filesystem 0-5 + log_tail 6-10) + assert_eq!(max_pub, Some(10)); +} + +// ===== list_with_backward_checkpoint_scan() tests ===== + +// Log from v0 to v1005. Each case places an optional single-part checkpoint and +// verifies the expected commits, checkpoint version, and number of storage listings. +// +// Window boundaries (window size=1000, end_version=1005, exclusive upper): +// Window 1: [6, 1006) covers v6..=v1005 +// Window 2: [0, 6) covers v0..=v5 +// +// A checkpoint at v6+ is found in window 1 (1 listing); at v5 or lower in window 2 +// (2 listings). A checkpoint beyond end_version is never seen. +#[rstest] +// No checkpoint: scan exhausts both windows, all 1006 commits returned +#[case::no_checkpoint(None, 0..=1005, None, 2)] +// Checkpoint beyond end_version is never seen; same behavior as no checkpoint +#[case::checkpoint_beyond_end(Some(1006), 0..=1005, None, 2)] +// Checkpoint at end_version: found in window 1, no commits after it +#[case::checkpoint_at_end(Some(1005), 0..0, Some(1005), 1)] +// Checkpoint at v5: falls in window 2 -> 2 listings; commits 6..=1005 returned. 
+// Tests the inclusive window boundary: window 1 covers [6, 1006) or [6, 1005] (lower = 1006 - 1000 = 6), +// so v5 falls just outside it and requires a second listing, while v6 (next case) does not. +#[case::checkpoint_in_second_window(Some(5), 6..=1005, Some(5), 2)] +// Checkpoint at v6: falls in window 1 -> 1 listing; commits 7..=1005 returned +#[case::checkpoint_in_first_window(Some(6), 7..=1005, Some(6), 1)] +#[tokio::test] +async fn backward_scan_single_checkpoint_cases( + #[case] checkpoint_version: Option, + #[case] expected_commits: impl Iterator, + #[case] expected_checkpoint: Option, + #[case] expected_listings: u32, +) { + let mut log_files: Vec<(Version, LogPathFileType, CommitSource)> = (0u64..=1005) + .map(|v| (v, LogPathFileType::Commit, CommitSource::Filesystem)) + .collect(); + + if let Some(cp) = checkpoint_version { + log_files.push(( + cp, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + } + + let (storage, log_root) = create_storage(log_files).await; + let counter = CountingStorageHandler::new(storage); + + let result = + LogSegmentFiles::list_with_backward_checkpoint_scan(&counter, &log_root, vec![], 1005) + .unwrap(); + + assert_eq!(counter.call_count(), expected_listings); + + assert_eq!( + result.checkpoint_parts.len(), + if expected_checkpoint.is_some() { 1 } else { 0 } + ); + if let Some(cp_version) = expected_checkpoint { + assert_eq!(result.checkpoint_parts[0].version, cp_version); + } + + assert!(result + .ascending_commit_files + .iter() + .map(|f| f.version) + .eq(expected_commits)); +} + +/// end_version=3000. Window 2 contains an incomplete 2-of-2 multipart checkpoint (only +/// part 1 present). find_complete_checkpoint_version must return None for window 2, causing +/// the scan to continue to window 3, where a complete single-part checkpoint at v500 is +/// found. Verifies that incomplete parts from window 2 are discarded and do not pollute +/// the result's checkpoint_parts. +/// +/// Window 1 [2001, 3001): commits v2001..=v3000, no checkpoint -> continue +/// Window 2 [1001, 2001): commits v1001..=v2000, v1500 (1-of-2 parts) incomplete -> continue +/// Window 3 [1, 1001): commits v1..=v1000, v500 (complete) -> checkpoint found -> break +fn files_incomplete_in_second_window_complete_in_third_window( +) -> Vec<(Version, LogPathFileType, CommitSource)> { + let mut log_files: Vec<(Version, LogPathFileType, CommitSource)> = (0u64..=3000) + .map(|v| (v, LogPathFileType::Commit, CommitSource::Filesystem)) + .collect(); + log_files.push(( + 500, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + log_files.push(( + 1500, + LogPathFileType::MultiPartCheckpoint { + part_num: 1, + num_parts: 2, + }, + CommitSource::Filesystem, + )); + log_files +} +fn multipart_checkpoint_files() -> Vec<(Version, LogPathFileType, CommitSource)> { + // Log v0..=v52 with a complete 3-part checkpoint at v50. + // Single window [0, 53): checkpoint found -> stop. 
+ let mut log_files: Vec<(Version, LogPathFileType, CommitSource)> = (0u64..=52) + .map(|v| (v, LogPathFileType::Commit, CommitSource::Filesystem)) + .collect(); + log_files.extend([ + ( + 50, + LogPathFileType::MultiPartCheckpoint { + part_num: 1, + num_parts: 3, + }, + CommitSource::Filesystem, + ), + ( + 50, + LogPathFileType::MultiPartCheckpoint { + part_num: 2, + num_parts: 3, + }, + CommitSource::Filesystem, + ), + ( + 50, + LogPathFileType::MultiPartCheckpoint { + part_num: 3, + num_parts: 3, + }, + CommitSource::Filesystem, + ), + ]); + log_files +} + +struct BackwardScanExpected { + listings: u32, + checkpoint_parts: usize, + checkpoint_version: Version, + commit_count: usize, + first_commit: Version, + last_commit: Version, +} + +// Case 1: complete 3-part checkpoint at v50, single window needed +// Case 2: incomplete 1-of-2 part at v1500 in window 2, complete checkpoint at v500 in window 3 +#[rstest] +#[case::multipart_checkpoint( + multipart_checkpoint_files(), + 52, + BackwardScanExpected { listings: 1, checkpoint_parts: 3, checkpoint_version: 50, commit_count: 2, first_commit: 51, last_commit: 52 } + )] +#[case::incomplete_in_second_window_complete_in_third( + files_incomplete_in_second_window_complete_in_third_window(), + 3000, + BackwardScanExpected { listings: 3, checkpoint_parts: 1, checkpoint_version: 500, commit_count: 2500, first_commit: 501, last_commit: 3000 } + )] +#[tokio::test] +async fn backward_scan_multipart_checkpoint_cases( + #[case] log_files: Vec<(Version, LogPathFileType, CommitSource)>, + #[case] end_version: Version, + #[case] expected: BackwardScanExpected, +) { + let BackwardScanExpected { + listings: expected_listings, + checkpoint_parts: expected_checkpoint_parts, + checkpoint_version: expected_checkpoint_version, + commit_count: expected_commit_count, + first_commit: expected_first_commit, + last_commit: expected_last_commit, + } = expected; + let (storage, log_root) = create_storage(log_files).await; + let counter = CountingStorageHandler::new(storage); + + let result = LogSegmentFiles::list_with_backward_checkpoint_scan( + &counter, + &log_root, + vec![], + end_version, + ) + .unwrap(); + + assert_eq!(counter.call_count(), expected_listings); + assert_eq!(result.checkpoint_parts.len(), expected_checkpoint_parts); + assert!(result + .checkpoint_parts + .iter() + .all(|p| p.version == expected_checkpoint_version)); + assert_eq!(result.ascending_commit_files.len(), expected_commit_count); + assert_eq!( + result.ascending_commit_files.first().unwrap().version, + expected_first_commit + ); + assert_eq!( + result.ascending_commit_files.last().unwrap().version, + expected_last_commit + ); + assert_eq!( + result.latest_commit_file.unwrap().version, + expected_last_commit + ); +} + +#[tokio::test] +async fn backward_scan_with_log_tail_derives_lower_bound_from_checkpoint() { + // FS: commits v0..=v7 + checkpoint at v5. log_tail: catalog commits v8..=v10. + // The checkpoint at v5 sets the lower bound to v6, so FS commits v6 and v7 plus all + // catalog entries v8..=v10 are included. 
+ let mut log_files: Vec<(Version, LogPathFileType, CommitSource)> = (0u64..=7) + .map(|v| (v, LogPathFileType::Commit, CommitSource::Filesystem)) + .collect(); + log_files.push(( + 5, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + let (storage, log_root) = create_storage(log_files).await; + + let log_tail: Vec<_> = (8u64..=10) + .map(|v| { + make_parsed_log_path_with_source(v, LogPathFileType::Commit, CommitSource::Catalog) + }) + .collect(); + + let result = LogSegmentFiles::list_with_backward_checkpoint_scan( + storage.as_ref(), + &log_root, + log_tail, + 10, + ) + .unwrap(); + + assert_eq!(result.checkpoint_parts.len(), 1); + assert_eq!(result.checkpoint_parts[0].version, 5); + + // FS commits v6, v7 after the checkpoint; catalog commits v8..=v10 + let expected = [ + (6, CommitSource::Filesystem), + (7, CommitSource::Filesystem), + (8, CommitSource::Catalog), + (9, CommitSource::Catalog), + (10, CommitSource::Catalog), + ]; + assert_eq!(result.ascending_commit_files.len(), expected.len()); + for (file, (version, source)) in result.ascending_commit_files.iter().zip(expected) { + assert_eq!(file.version, version); + assert_source(file, source); + } + assert_eq!(result.latest_commit_file.unwrap().version, 10); +} + +#[tokio::test] +async fn backward_scan_with_log_tail_starting_before_checkpoint() { + // FS: commits v0..=v5 + checkpoint at v5 + CRC at v6. log_tail: catalog commits v3..=v8, + // starting before the checkpoint. The checkpoint at v5 sets the lower bound to v5, so + // log_tail v3..=v4 are excluded. The log_tail commit at v5 passes through (it is at the + // checkpoint version). The CRC at v6 is preserved even though v6 is within the log_tail range. + let mut log_files: Vec<(Version, LogPathFileType, CommitSource)> = (0u64..=5) + .map(|v| (v, LogPathFileType::Commit, CommitSource::Filesystem)) + .collect(); + log_files.push(( + 5, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + log_files.push((6, LogPathFileType::Crc, CommitSource::Filesystem)); + let (storage, log_root) = create_storage(log_files).await; + + let log_tail: Vec<_> = (3u64..=8) + .map(|v| { + make_parsed_log_path_with_source(v, LogPathFileType::Commit, CommitSource::Catalog) + }) + .collect(); + + let result = LogSegmentFiles::list_with_backward_checkpoint_scan( + storage.as_ref(), + &log_root, + log_tail, + 8, + ) + .unwrap(); + + assert_eq!(result.checkpoint_parts.len(), 1); + assert_eq!(result.checkpoint_parts[0].version, 5); + + // CRC at v6 is preserved even though v6 is within the log_tail range + let crc = result.latest_crc_file.unwrap(); + assert_eq!(crc.version, 6); + assert!(matches!(crc.file_type, LogPathFileType::Crc)); + + // v5 passes the start version filter (>= 5) and is included here + assert_eq!(result.ascending_commit_files.len(), 4); + for (i, commit) in result.ascending_commit_files.iter().enumerate() { + assert_eq!(commit.version, (i + 5) as u64); + assert_source(commit, CommitSource::Catalog); + } + assert_eq!(result.latest_commit_file.unwrap().version, 8); +} + +#[tokio::test] +async fn backward_scan_log_tail_defines_latest_version() { + // FS: commits v0..=v5. log_tail: catalog commit v4. end_version=5. + // FS v4 and v5 are filtered since log_tail_start=4. max_published_version is Some(5), + // the highest FS commit seen within end_version, even though v5 is not in + // ascending_commit_files. 
+ let log_files: Vec<(Version, LogPathFileType, CommitSource)> = (0u64..=5) + .map(|v| (v, LogPathFileType::Commit, CommitSource::Filesystem)) + .collect(); + let (storage, log_root) = create_storage(log_files).await; + + let log_tail = vec![make_parsed_log_path_with_source( + 4, + LogPathFileType::Commit, + CommitSource::Catalog, + )]; + + let result = LogSegmentFiles::list_with_backward_checkpoint_scan( + storage.as_ref(), + &log_root, + log_tail, + 5, + ) + .unwrap(); + + let expected = [ + (0, CommitSource::Filesystem), + (1, CommitSource::Filesystem), + (2, CommitSource::Filesystem), + (3, CommitSource::Filesystem), + (4, CommitSource::Catalog), + ]; + assert_eq!(result.ascending_commit_files.len(), expected.len()); + for (file, (version, source)) in result.ascending_commit_files.iter().zip(expected) { + assert_eq!(file.version, version); + assert_source(file, source); + } + assert_eq!(result.latest_commit_file.unwrap().version, 4); + assert_eq!(result.max_published_version, Some(5)); +} + +// ===== find_complete_checkpoint_version direct unit tests (other cases already covered by tests above) ===== + +fn incomplete_then_complete_files() -> Vec { + // Commits v0..=10, an incomplete checkpoint at v5 (1 of 3 parts), and a complete + // checkpoint at v10. find_complete_checkpoint_version must continue past the failed group + // and find the complete one. + let mut files: Vec = (0..=10) + .map(|v| { + make_parsed_log_path_with_source(v, LogPathFileType::Commit, CommitSource::Filesystem) + }) + .collect(); + files.push(make_parsed_log_path_with_source( + 5, + LogPathFileType::MultiPartCheckpoint { + part_num: 1, + num_parts: 3, + }, + CommitSource::Filesystem, + )); + files.push(make_parsed_log_path_with_source( + 10, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + files +} + +fn two_complete_checkpoints_files() -> Vec { + // Commits v0..=10, complete checkpoint at v5 and complete checkpoint at v10. + // The function must return the latest (v10), not the first (v5). + let mut files: Vec = (0..=10) + .map(|v| { + make_parsed_log_path_with_source(v, LogPathFileType::Commit, CommitSource::Filesystem) + }) + .collect(); + files.push(make_parsed_log_path_with_source( + 5, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + files.push(make_parsed_log_path_with_source( + 10, + LogPathFileType::SinglePartCheckpoint, + CommitSource::Filesystem, + )); + files +} + +#[rstest] +// Commits v0..=5, no checkpoint files +#[case::no_checkpoint( + (0u64..=5).map(|v| make_parsed_log_path_with_source(v, LogPathFileType::Commit, CommitSource::Filesystem)).collect(), + None + )] +// Commits v0..=10, incomplete checkpoint at v5, complete checkpoint at v10 +#[case::incomplete_then_complete(incomplete_then_complete_files(), Some(10))] +// Commits v0..=10, complete checkpoint at v5 and v10: must return v10 (latest) +#[case::two_complete(two_complete_checkpoints_files(), Some(10))] +fn find_complete_checkpoint_version_cases( + #[case] files: Vec, + #[case] expected: Option, +) { + assert_eq!(find_complete_checkpoint_version(&files), expected); +} diff --git a/kernel/src/metrics/events.rs b/kernel/src/metrics/events.rs new file mode 100644 index 0000000000..f970e9e78d --- /dev/null +++ b/kernel/src/metrics/events.rs @@ -0,0 +1,246 @@ +//! Metric event types and utilities. + +use std::fmt; +use std::time::Duration; +use uuid::Uuid; + +/// Unique identifier for a metrics operation. 
+/// +/// Each operation (Snapshot, Transaction, Scan) gets a unique MetricId that +/// is used to correlate all events from that operation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub struct MetricId(Uuid); + +/// Identifies which scan execution path produced a scan metadata metrics event. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ScanType { + /// Sequential phase of [`crate::scan::Scan::parallel_scan_metadata`]. + SequentialPhase, + /// Parallel phase of [`crate::scan::Scan::parallel_scan_metadata`]. + ParallelPhase, + /// Scan metadata from [`crate::scan::Scan::scan_metadata`]. + Full, +} + +impl std::fmt::Display for ScanType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let scan_type = match self { + ScanType::SequentialPhase => "sequential", + ScanType::ParallelPhase => "parallel", + ScanType::Full => "full", + }; + write!(f, "{scan_type}") + } +} + +impl MetricId { + /// Generate a new unique MetricId. + pub fn new() -> Self { + Self(Uuid::new_v4()) + } +} + +impl Default for MetricId { + fn default() -> Self { + Self::new() + } +} + +impl fmt::Display for MetricId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +/// Metric events emitted during Delta Kernel operations. +/// +/// Some events include an `operation_id` (MetricId) that uniquely identifies the operation +/// instance. This allows correlating multiple events from the same operation. +#[derive(Debug, Clone)] +pub enum MetricEvent { + /// Log segment loading completed (listing and organizing log files). + LogSegmentLoaded { + operation_id: MetricId, + duration: Duration, + num_commit_files: u64, + num_checkpoint_files: u64, + num_compaction_files: u64, + }, + + /// Protocol and metadata loading completed. + ProtocolMetadataLoaded { + operation_id: MetricId, + duration: Duration, + }, + + /// Snapshot creation completed successfully. + SnapshotCompleted { + operation_id: MetricId, + version: u64, + total_duration: Duration, + }, + + /// Snapshot creation failed. + SnapshotFailed { + operation_id: MetricId, + duration: Duration, + }, + + /// Storage list operation completed. + /// These events track storage-level latencies and are emitted automatically + /// by the default storage handler implementation. + StorageListCompleted { duration: Duration, num_files: u64 }, + + /// Storage read operation completed. + StorageReadCompleted { + duration: Duration, + num_files: u64, + bytes_read: u64, + }, + + /// Storage copy operation completed. + StorageCopyCompleted { duration: Duration }, + + /// JSON file read operation completed (one event per [`JsonHandler::read_json_files`] call). + /// + /// `bytes_read` is the sum of `FileMeta::size` for the requested files (on-disk size), + /// which is the best available approximation without re-reading the bytes. + /// + /// [`JsonHandler::read_json_files`]: crate::JsonHandler::read_json_files + JsonReadCompleted { num_files: u64, bytes_read: u64 }, + + /// Parquet file read operation completed (one event per + /// [`ParquetHandler::read_parquet_files`] call). + /// + /// `bytes_read` is the sum of `FileMeta::size` for the requested files (on-disk size), + /// which is the best available approximation without re-reading the bytes. + /// + /// [`ParquetHandler::read_parquet_files`]: crate::ParquetHandler::read_parquet_files + ParquetReadCompleted { num_files: u64, bytes_read: u64 }, + + /// Scan metadata iteration completed. + /// + /// Emitted when the scan metadata iterator is exhausted. 
This event captures metrics about the + /// log replay process, including file counts and timing information. + ScanMetadataCompleted { + /// Unique ID to correlate this scan with other events. + operation_id: MetricId, + /// Indicates which scan execution path produced this event. + /// + /// This is `SequentialPhase` or `ParallelPhase` for parallel log replay, and `Full` for + /// [`crate::scan::Scan::scan_metadata`]. + scan_type: ScanType, + /// Total duration from scan start to iterator exhaustion. + total_duration: Duration, + /// Add files that entered the deduplication visitor. This excludes files filtered by + /// data skipping before deduplication. For the total number of add actions in the log, + /// this value plus `num_predicate_filtered` gives a closer approximation. + num_add_files_seen: u64, + /// Add files that survived log replay (files to read). + num_active_add_files: u64, + /// Remove files seen (from delta/commit files only). + num_remove_files_seen: u64, + /// Non-file actions seen (protocol, metadata, etc.). + num_non_file_actions: u64, + /// Files filtered by predicates (data skipping + partition pruning). + num_predicate_filtered: u64, + /// Peak size of the deduplication hash set. + peak_hash_set_size: usize, + /// Time spent in the deduplication visitor (milliseconds). + dedup_visitor_time_ms: u64, + /// Time spent evaluating predicates (milliseconds). + predicate_eval_time_ms: u64, + }, +} + +impl fmt::Display for MetricEvent { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + MetricEvent::LogSegmentLoaded { + operation_id, + duration, + num_commit_files, + num_checkpoint_files, + num_compaction_files, + } => write!( + f, + "LogSegmentLoaded(id={operation_id}, duration={duration:?}, commits={num_commit_files}, checkpoints={num_checkpoint_files}, compactions={num_compaction_files})" + ), + MetricEvent::ProtocolMetadataLoaded { + operation_id, + duration, + } => write!( + f, + "ProtocolMetadataLoaded(id={operation_id}, duration={duration:?})" + ), + MetricEvent::SnapshotCompleted { + operation_id, + version, + total_duration, + } => write!( + f, + "SnapshotCompleted(id={operation_id}, version={version}, duration={total_duration:?})" + ), + MetricEvent::SnapshotFailed { + operation_id, + duration, + } => write!( + f, + "SnapshotFailed(id={operation_id}, duration={duration:?})" + ), + MetricEvent::StorageListCompleted { + duration, + num_files, + } => write!( + f, + "StorageListCompleted(duration={duration:?}, files={num_files})" + ), + MetricEvent::StorageReadCompleted { + duration, + num_files, + bytes_read, + } => write!( + f, + "StorageReadCompleted(duration={duration:?}, files={num_files}, bytes={bytes_read})" + ), + MetricEvent::StorageCopyCompleted { duration } => write!( + f, + "StorageCopyCompleted(duration={duration:?})" + ), + MetricEvent::JsonReadCompleted { + num_files, + bytes_read, + } => write!( + f, + "JsonReadCompleted(files={num_files}, bytes={bytes_read})" + ), + MetricEvent::ParquetReadCompleted { + num_files, + bytes_read, + } => write!( + f, + "ParquetReadCompleted(files={num_files}, bytes={bytes_read})" + ), + MetricEvent::ScanMetadataCompleted { + operation_id, + scan_type, + total_duration, + num_add_files_seen, + num_active_add_files, + num_remove_files_seen, + num_non_file_actions, + num_predicate_filtered, + peak_hash_set_size, + dedup_visitor_time_ms, + predicate_eval_time_ms, + } => write!( + f, + "ScanMetadataCompleted(id={operation_id}, scan_type={scan_type}, duration={total_duration:?}, \ + 
add_files_seen={num_add_files_seen}, active_add_files={num_active_add_files}, \ + remove_files_seen={num_remove_files_seen}, non_file_actions={num_non_file_actions}, \ + predicate_filtered={num_predicate_filtered}, peak_hash_set_size={peak_hash_set_size}, \ + dedup_visitor_time_ms={dedup_visitor_time_ms}, predicate_eval_time_ms={predicate_eval_time_ms})" + ), + } + } +} diff --git a/kernel/src/metrics/mod.rs b/kernel/src/metrics/mod.rs new file mode 100644 index 0000000000..f2570f2c5e --- /dev/null +++ b/kernel/src/metrics/mod.rs @@ -0,0 +1,74 @@ +//! Metrics collection for Delta Kernel operations. +//! +//! This module provides metrics tracking for various Delta operations including +//! snapshot creation, scans, and transactions. Metrics are collected during operations +//! and reported as events via the `MetricsReporter` trait. +//! +//! Each operation (Snapshot, Transaction, Scan) is assigned a unique operation ID ([`MetricId`]) +//! when it starts, and all subsequent events for that operation reference this ID. +//! This allows reporters to correlate events and track operation lifecycles. +//! +//! # Example: Implementing a Custom MetricsReporter +//! +//! ``` +//! use std::sync::Arc; +//! use delta_kernel::metrics::{MetricsReporter, MetricEvent}; +//! +//! #[derive(Debug)] +//! struct LoggingReporter; +//! +//! impl MetricsReporter for LoggingReporter { +//! fn report(&self, event: MetricEvent) { +//! match event { +//! MetricEvent::LogSegmentLoaded { operation_id, duration, num_commit_files, .. } => { +//! println!("Log segment loaded in {:?}: {} commits", duration, num_commit_files); +//! } +//! MetricEvent::SnapshotCompleted { operation_id, version, total_duration } => { +//! println!("Snapshot completed: v{} in {:?}", version, total_duration); +//! } +//! MetricEvent::SnapshotFailed { operation_id, duration } => { +//! println!("Snapshot failed: {} after {:?}", operation_id, duration); +//! } +//! _ => {} +//! } +//! } +//! } +//! ``` +//! +//! # Example: Implementing a Composite Reporter +//! +//! If you need to send metrics to multiple destinations, you can create a composite reporter: +//! +//! ``` +//! use std::sync::Arc; +//! use delta_kernel::metrics::{MetricsReporter, MetricEvent}; +//! +//! #[derive(Debug)] +//! struct CompositeReporter { +//! reporters: Vec>, +//! } +//! +//! impl MetricsReporter for CompositeReporter { +//! fn report(&self, event: MetricEvent) { +//! for reporter in &self.reporters { +//! reporter.report(event.clone()); +//! } +//! } +//! } +//! ``` +//! +//! # Storage Metrics +//! +//! Storage operations (list, read, copy) are automatically instrumented when using +//! `DefaultEngine` with a metrics reporter. The default storage handler implementation +//! emits `StorageListCompleted`, `StorageReadCompleted`, and `StorageCopyCompleted` +//! events that track latencies at the storage layer. +//! +//! These metrics are standalone and track aggregate storage performance without +//! correlating to specific Snapshot/Transaction operations. + +mod events; +mod reporter; + +pub use events::{MetricEvent, MetricId, ScanType}; +pub use reporter::MetricsReporter; diff --git a/kernel/src/metrics/reporter.rs b/kernel/src/metrics/reporter.rs new file mode 100644 index 0000000000..5357aef0fe --- /dev/null +++ b/kernel/src/metrics/reporter.rs @@ -0,0 +1,14 @@ +//! Metrics reporter trait and implementations. + +use super::MetricEvent; + +/// Trait for reporting metrics events from Delta operations. 
+/// +/// Implementations of this trait receive metric events as they occur during operations +/// and can forward them to monitoring systems like Prometheus, DataDog, etc. +/// +/// Events are emitted throughout an operation's lifecycle, allowing real-time monitoring. +pub trait MetricsReporter: Send + Sync + std::fmt::Debug { + /// Report a metric event. + fn report(&self, event: MetricEvent); +} diff --git a/kernel/src/parallel/mod.rs b/kernel/src/parallel/mod.rs new file mode 100644 index 0000000000..814f738b33 --- /dev/null +++ b/kernel/src/parallel/mod.rs @@ -0,0 +1,13 @@ +//! Two-phase log replay for parallel execution of checkpoint processing. + +#[cfg(feature = "internal-api")] +pub mod parallel_phase; +#[cfg(not(feature = "internal-api"))] +pub(crate) mod parallel_phase; + +#[cfg(feature = "internal-api")] +pub mod sequential_phase; +#[cfg(not(feature = "internal-api"))] +pub(crate) mod sequential_phase; + +pub(crate) mod parallel_scan_metadata; diff --git a/kernel/src/parallel/parallel_phase.rs b/kernel/src/parallel/parallel_phase.rs new file mode 100644 index 0000000000..a32f8e8d67 --- /dev/null +++ b/kernel/src/parallel/parallel_phase.rs @@ -0,0 +1,946 @@ +//! Parallel phase of log replay - processes checkpoint leaf files (sidecars or multi-part parts). +//! +//! This phase runs after [`SequentialPhase`] completes and is designed for parallel execution. +//! Partition the leaf files across executors and create one `ParallelPhase` per partition. +//! +//! [`SequentialPhase`]: super::sequential_phase::SequentialPhase +#![allow(unused)] + +use std::sync::Arc; + +use delta_kernel_derive::internal_api; + +use crate::log_replay::ActionsBatch; +use crate::log_replay::ParallelLogReplayProcessor; +use crate::scan::CHECKPOINT_READ_SCHEMA; +use crate::schema::SchemaRef; +use crate::EngineData; +use crate::{DeltaResult, Engine, FileMeta}; + +use itertools::Itertools; + +/// Processes checkpoint leaf files in parallel using a shared processor. +/// +/// This struct is designed for distributed execution where checkpoint leaf files (sidecars or +/// multi-part checkpoint parts) are partitioned across multiple executors. Each executor creates +/// its own `ParallelPhase` instance with a subset of files, but all instances share the same +/// processor (typically wrapped in `Arc`) to coordinate deduplication. +/// +/// Implements `Iterator` to yield processed batches. The processor is responsible for filtering +/// out actions for files already seen in the sequential_phase. +/// +/// # Example workflow +/// - Partition leaf files across N executors +/// - Create one `ParallelPhase>` per executor with its file subset +/// - Each instance processes its files independently while sharing deduplication state +/// cbindgen:ignore +#[internal_api] +pub(crate) struct ParallelPhase { + processor: P, + leaf_checkpoint_reader: Box>>, +} + +impl ParallelPhase

{ + /// Creates a new parallel phase for processing checkpoint leaf files. + /// + /// # Parameters + /// - `engine`: Engine for reading parquet files + /// - `processor`: Shared processor (wrap in `Arc` for distribution across executors) + /// - `leaf_files`: Checkpoint leaf files (sidecars or multi-part checkpoint parts) + /// - `read_schema`: Schema to use for reading checkpoint files + #[internal_api] + #[allow(unused)] + pub(crate) fn try_new( + engine: Arc, + processor: P, + leaf_files: Vec, + read_schema: SchemaRef, + ) -> DeltaResult { + let leaf_checkpoint_reader = engine + .parquet_handler() + .read_parquet_files(&leaf_files, read_schema, None)? + .map_ok(|batch| ActionsBatch::new(batch, false)); + Ok(Self { + processor, + leaf_checkpoint_reader: Box::new(leaf_checkpoint_reader), + }) + } + + /// Creates a new parallel phase from an existing iterator of EngineData. + /// + /// Use this constructor when you want to parallelize processing at the row group level. + /// Instead of reading entire checkpoint files, you can provide an iterator that yields + /// individual row groups, allowing finer-grained parallelization. + /// + /// # Parameters + /// - `processor`: Shared processor (wrap in `Arc` for distribution across executors) + /// - `iter`: Iterator yielding checkpoint action batches, typically from individual row groups + #[internal_api] + #[allow(unused)] + pub(crate) fn new_from_iter( + processor: P, + iter: impl IntoIterator>> + 'static, + ) -> Self { + let leaf_checkpoint_reader = iter + .into_iter() + .map_ok(|batch| ActionsBatch::new(batch, false)); + Self { + processor, + leaf_checkpoint_reader: Box::new(leaf_checkpoint_reader), + } + } + + /// Returns the schema used for reading checkpoint files. + /// + /// This schema defines the structure expected when reading checkpoint parquet files, + /// including the action types (add, remove, etc.) and their fields. + #[internal_api] + #[allow(unused)] + pub(crate) fn file_read_schema() -> SchemaRef { + CHECKPOINT_READ_SCHEMA.clone() + } +} + +/// Yields processed batches from checkpoint leaf files. +/// +/// Each call to `next()` reads one batch from the checkpoint reader and processes it through +/// the processor. The processor applies filtering logic (e.g., removing files already seen in +/// the sequential phase) and returns the processed output. +/// +/// # Errors +/// Returns `DeltaResult` errors for: +/// - File reading failures +/// - Parquet parsing errors +/// - Processing errors from the processor +impl Iterator for ParallelPhase

{ + type Item = DeltaResult; + + fn next(&mut self) -> Option { + self.leaf_checkpoint_reader + .next() + .map(|batch| self.processor.process_actions_batch(batch?)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::actions::get_log_add_schema; + use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::default::DefaultEngine; + use crate::log_replay::FileActionKey; + use crate::log_segment::CheckpointReadInfo; + use crate::object_store::memory::InMemory; + use crate::object_store::path::Path; + use crate::object_store::ObjectStore; + use crate::parallel::parallel_scan_metadata::AfterSequentialScanMetadata; + use crate::parallel::parallel_scan_metadata::{ParallelScanMetadata, ParallelState}; + use crate::parquet::arrow::arrow_writer::ArrowWriter; + use crate::scan::log_replay::ScanLogReplayProcessor; + use crate::scan::state::ScanFile; + use crate::scan::state_info::tests::get_simple_state_info; + use crate::schema::{DataType, StructField, StructType}; + use crate::utils::test_utils::{load_test_table, parse_json_batch}; + use crate::{PredicateRef, SnapshotRef}; + use std::collections::HashSet; + use std::sync::Arc; + use std::thread; + use url::Url; + + // ============================================================ + // Test helpers for focused ParallelPhase tests + // ============================================================ + + /// Writes a record batch to the in-memory store at a given path. + async fn write_parquet_to_store( + store: &Arc, + path: &str, + data: Box, + ) -> DeltaResult<()> { + let batch = ArrowEngineData::try_from_engine_data(data)?; + let record_batch = batch.record_batch(); + + let mut buffer = vec![]; + let mut writer = ArrowWriter::try_new(&mut buffer, record_batch.schema(), None)?; + writer.write(record_batch)?; + writer.close()?; + + store.put(&Path::from(path), buffer.into()).await?; + + Ok(()) + } + + /// Gets the file size from the store for use in FileMeta + async fn get_file_size(store: &Arc, path: &str) -> u64 { + let object_meta = store.head(&Path::from(path)).await.unwrap(); + object_meta.size + } + + /// Creates a simple table schema for tests + fn test_schema() -> Arc { + Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])) + } + + /// Creates a ScanLogReplayProcessor with a pre-populated HashMap. + fn create_processor_with_seen_files( + engine: &dyn crate::Engine, + seen_paths: &[&str], + ) -> DeltaResult { + let state_info = Arc::new(get_simple_state_info(test_schema(), vec![])?); + + let seen_file_keys: HashSet = seen_paths + .iter() + .map(|path| FileActionKey::new(*path, None)) + .collect(); + + let checkpoint_info = CheckpointReadInfo::without_stats_parsed(); + + ScanLogReplayProcessor::new_with_seen_files( + engine, + state_info, + checkpoint_info, + seen_file_keys, + false, + ) + } + + // ============================================================ + // Focused ParallelPhase tests with in-memory sidecars + // ============================================================ + + /// Helper to run a ParallelPhase test with in-memory sidecars. 
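+    /// It writes a single sidecar parquet to an in-memory object store, seeds the processor's
+    /// seen-file set with `seen_paths`, runs a `ParallelPhase` over that sidecar, and asserts
+    /// that the surviving add-file paths match `expected_paths`. For example,
+    /// `run_parallel_phase_test(&["a", "b"], &["b"], &["a"])` expects the previously seen
+    /// `b` to be filtered out while `a` survives.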
+ /// + /// # Parameters + /// - `add_paths`: Paths of add actions to include in the sidecar + /// - `seen_paths`: Paths to pre-populate in the processor's seen HashMap + /// - `expected_paths`: Expected output paths after filtering + async fn run_parallel_phase_test( + add_paths: &[&str], + seen_paths: &[&str], + expected_paths: &[&str], + ) -> DeltaResult<()> { + let store = Arc::new(InMemory::new()); + let url = Url::parse("memory:///")?; + let engine = DefaultEngine::builder(store.clone()).build(); + + // Create sidecar with add actions + let json_adds = add_paths + .iter() + .enumerate() + .map(|(i, path)| { + format!( + r#"{{"add":{{"path":"{}","partitionValues":{{}},"size":{},"modificationTime":{},"dataChange":true}}}}"#, + path, + (i + 1) * 100, + (i + 1) * 1000 + ) + }).collect_vec(); + let sidecar_data = parse_json_batch(json_adds.into()); + + // Write sidecar to store + let sidecar_path = "_delta_log/_sidecars/test.parquet"; + write_parquet_to_store(&store, sidecar_path, sidecar_data).await?; + + // Create processor with seen files + let processor = Arc::new(create_processor_with_seen_files(&engine, seen_paths)?); + + // Create FileMeta for the sidecar + let file_meta = FileMeta { + location: url.join(sidecar_path)?, + last_modified: 0, + size: get_file_size(&store, sidecar_path).await, + }; + + let mut parallel = ParallelPhase::try_new( + Arc::new(engine), + processor.clone(), + vec![file_meta], + CHECKPOINT_READ_SCHEMA.clone(), + )?; + + let mut all_paths = parallel.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file| { + ps.push(scan_file.path); + }) + })?; + + // Verify results + all_paths.sort(); + let mut expected: Vec<&str> = expected_paths.to_vec(); + expected.sort(); + assert_eq!(all_paths, expected); + + Ok(()) + } + + #[tokio::test] + async fn test_parallel_phase_empty_hashmap_all_adds_pass() -> DeltaResult<()> { + run_parallel_phase_test( + &["file1.parquet", "file2.parquet", "file3.parquet"], + &[], + &["file1.parquet", "file2.parquet", "file3.parquet"], + ) + .await + } + + #[tokio::test] + async fn test_parallel_phase_with_removes_filters_matching_adds() -> DeltaResult<()> { + run_parallel_phase_test( + &["file1.parquet", "file2.parquet", "file3.parquet"], + &["file2.parquet"], + &["file1.parquet", "file3.parquet"], + ) + .await + } + + #[tokio::test] + async fn test_parallel_phase_all_files_removed() -> DeltaResult<()> { + run_parallel_phase_test( + &["removed1.parquet", "removed2.parquet"], + &["removed1.parquet", "removed2.parquet"], + &[], + ) + .await + } + + #[tokio::test] + async fn test_parallel_phase_multiple_sidecars() -> DeltaResult<()> { + // This test uses multiple sidecar files, so we need custom logic + let store = Arc::new(InMemory::new()); + let url = Url::parse("memory:///")?; + let engine = DefaultEngine::builder(store.clone()).build(); + + // Create two sidecars + let sidecar1_data = parse_json_batch(vec![ + r#"{"add":{"path":"sidecar1_file1.parquet","partitionValues":{},"size":100,"modificationTime":1000,"dataChange":true}}"#, + r#"{"add":{"path":"sidecar1_file2.parquet","partitionValues":{},"size":200,"modificationTime":2000,"dataChange":true}}"#, + ].into()); + let sidecar2_data = parse_json_batch(vec![ + r#"{"add":{"path":"sidecar2_file1.parquet","partitionValues":{},"size":300,"modificationTime":3000,"dataChange":true}}"#, + ].into()); + + let sidecar1_path = "_delta_log/_sidecars/sidecar1.parquet"; + let sidecar2_path = "_delta_log/_sidecars/sidecar2.parquet"; + 
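+        // Write both sidecars to the in-memory store; their object sizes are read back below
+        // to populate the FileMeta entries handed to ParallelPhase.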
write_parquet_to_store(&store, sidecar1_path, sidecar1_data).await?; + write_parquet_to_store(&store, sidecar2_path, sidecar2_data).await?; + + let processor = Arc::new(create_processor_with_seen_files( + &engine, + &["sidecar1_file2.parquet"], + )?); + + let file_metas = vec![ + FileMeta { + location: url.join(sidecar1_path)?, + last_modified: 0, + size: get_file_size(&store, sidecar1_path).await, + }, + FileMeta { + location: url.join(sidecar2_path)?, + last_modified: 0, + size: get_file_size(&store, sidecar2_path).await, + }, + ]; + + let mut parallel = ParallelPhase::try_new( + Arc::new(engine), + processor.clone(), + file_metas, + CHECKPOINT_READ_SCHEMA.clone(), + )?; + + let mut all_paths = parallel.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file| { + ps.push(scan_file.path); + }) + })?; + + all_paths.sort(); + assert_eq!( + all_paths, + vec!["sidecar1_file1.parquet", "sidecar2_file1.parquet"] + ); + + Ok(()) + } + + // ============================================================ + // Integration tests using real test tables + // ============================================================ + + /// Get expected file paths using the scan_metadata API (single-node approach). + fn get_expected_paths( + engine: &dyn crate::Engine, + snapshot: &SnapshotRef, + predicate: Option, + ) -> DeltaResult> { + let mut builder = snapshot.clone().scan_builder(); + if let Some(pred) = predicate { + builder = builder.with_predicate(pred); + } + let scan = builder.build()?; + let mut scan_metadata_iter = scan.scan_metadata(engine)?; + + let mut paths = scan_metadata_iter.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file: ScanFile| { + ps.push(scan_file.path); + }) + })?; + paths.sort(); + Ok(paths) + } + + fn verify_parallel_workflow( + table_name: &str, + predicate: Option, + with_serde: bool, + one_file_per_worker: bool, + dispatcher: Option, + ) -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table(table_name)?; + + let expected_paths = get_expected_paths(engine.as_ref(), &snapshot, predicate.clone())?; + + let mut builder = snapshot.scan_builder(); + if let Some(pred) = predicate { + builder = builder.with_predicate(pred); + } + let scan = builder.build()?; + let mut sequential = scan.parallel_scan_metadata(engine.clone())?; + + let mut all_paths = sequential.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file| { + ps.push(scan_file.path); + }) + })?; + + match sequential.finish()? { + AfterSequentialScanMetadata::Done => {} + AfterSequentialScanMetadata::Parallel { state, files } => { + let final_state = if with_serde { + // Serialize and then deserialize to test the serde path + let serialized_bytes = state.into_bytes()?; + Arc::new(ParallelState::from_bytes( + engine.as_ref(), + &serialized_bytes, + )?) 
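+                    // from_bytes takes the engine so deserialization can recreate the
+                    // evaluators and filters needed by the rebuilt state.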
+ } else { + // Non-serde: just use the state directly + Arc::new(*state) + }; + + let partitions: Vec> = if one_file_per_worker { + files.into_iter().map(|f| vec![f]).collect() + } else { + vec![files] + }; + + let handles = partitions + .into_iter() + .map(|partition_files| { + let engine = engine.clone(); + let state = final_state.clone(); + let dispatcher = dispatcher.clone(); + + thread::spawn(move || -> DeltaResult> { + // Set the dispatcher in this thread to capture logs + let _guard = dispatcher.map(|d| tracing::dispatcher::set_default(&d)); + + assert!(!partition_files.is_empty()); + + let mut parallel = ParallelScanMetadata::try_new( + engine.clone(), + state, + partition_files, + )?; + + parallel.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files( + acc, + |ps: &mut Vec, scan_file| { + ps.push(scan_file.path); + }, + ) + }) + }) + }) + .collect_vec(); + + for handle in handles { + let paths = handle.join().expect("Thread panicked")?; + all_paths.extend(paths); + } + + // Log metrics after all parallel workers complete + final_state.log_metrics(); + } + } + + all_paths.sort(); + assert_eq!( + all_paths, expected_paths, + "Parallel workflow paths don't match scan_metadata paths for table '{table_name}'" + ); + + Ok(()) + } + + /// Extract a metric value from logs by searching for "metric_name=value" + fn extract_metric(logs: &str, metric_name: &str) -> u64 { + let Some(pos) = logs.find(&format!("{}=", metric_name)) else { + panic!("Failed to find {} in logs", metric_name); + }; + let after = &logs[pos + metric_name.len() + 1..]; + // Find the end of the value (whitespace, comma, or closing paren) + let end_pos = after + .find(|c: char| c.is_whitespace() || c == ',' || c == ')') + .unwrap_or_else(|| panic!("Failed to find end of {} value", metric_name)); + let value_str = &after[..end_pos]; + value_str + .parse() + .unwrap_or_else(|_| panic!("Failed to parse {} value: {}", metric_name, value_str)) + } + + /// Expected metric values for a phase (sequential or parallel) + #[derive(Debug, Clone)] + struct ExpectedMetrics { + add_files_seen: u64, + active_add_files: u64, + remove_files_seen: u64, + non_file_actions: u64, + predicate_filtered: u64, + } + + /// Test case for parallel log replay workflow + struct ParallelLogReplayCase { + path: &'static str, + predicate: Option, + expected_sequential_metrics: ExpectedMetrics, + expected_parallel_metrics: Option, + } + + fn verify_metrics_in_logs( + logs: &str, + table_name: &str, + sequential_expected: &ExpectedMetrics, + parallel_expected: Option<&ExpectedMetrics>, + ) { + // Find the Sequential scan log line and extract metrics from it + let sequential_pos = logs + .find("Sequential scan metadata completed") + .unwrap_or_else(|| { + panic!( + "Expected Sequential completion log for table '{}'", + table_name + ) + }); + let sequential_logs = &logs[sequential_pos..]; + + // Extract and verify counter values from Phase 1 (sequential log line) + let add_files_seen = extract_metric(sequential_logs, "add_files_seen"); + let active_add_files = extract_metric(sequential_logs, "active_add_files"); + let remove_files_seen = extract_metric(sequential_logs, "remove_files_seen"); + let non_file_actions = extract_metric(sequential_logs, "non_file_actions"); + let predicate_filtered = extract_metric(sequential_logs, "predicate_filtered"); + + assert_eq!( + add_files_seen, sequential_expected.add_files_seen, + "Sequential add_files_seen mismatch" + ); + assert_eq!( + active_add_files, sequential_expected.active_add_files, + 
"Sequential active_add_files mismatch" + ); + assert_eq!( + remove_files_seen, sequential_expected.remove_files_seen, + "Sequential remove_files_seen mismatch" + ); + assert_eq!( + non_file_actions, sequential_expected.non_file_actions, + "Sequential non_file_actions mismatch", + ); + assert_eq!( + predicate_filtered, sequential_expected.predicate_filtered, + "Sequential predicate_filtered mismatch", + ); + + // Verify timing metrics are present and parseable (values may be 0 for fast operations) + let _dedup_time = extract_metric(sequential_logs, "dedup_visitor_time_ms"); + let _predicate_eval_time = extract_metric(sequential_logs, "predicate_eval_time_ms"); + + // Verify Parallel metrics if expected + if let Some(expected) = parallel_expected { + // Accumulate totals across all parallel logs + let mut total_add_files_seen = 0u64; + let mut total_active_add_files = 0u64; + let mut total_remove_files_seen = 0u64; + let mut total_non_file_actions = 0u64; + let mut total_predicate_filtered = 0u64; + let mut search_start = 0; + + while let Some(pos) = logs[search_start..].find("Parallel scan metadata completed") { + let absolute_pos = search_start + pos; + let remaining = &logs[absolute_pos..]; + + // Extract and accumulate metrics + total_add_files_seen += extract_metric(remaining, "add_files_seen"); + total_active_add_files += extract_metric(remaining, "active_add_files"); + total_remove_files_seen += extract_metric(remaining, "remove_files_seen"); + total_non_file_actions += extract_metric(remaining, "non_file_actions"); + total_predicate_filtered += extract_metric(remaining, "predicate_filtered"); + + // Verify timing metrics are present and parseable in parallel phase + let _dedup_time = extract_metric(remaining, "dedup_visitor_time_ms"); + let _predicate_eval_time = extract_metric(remaining, "predicate_eval_time_ms"); + + search_start = absolute_pos + 1; + } + + // Verify accumulated totals match expected values + assert_eq!( + total_add_files_seen, expected.add_files_seen, + "Parallel add_files_seen mismatch" + ); + assert_eq!( + total_active_add_files, expected.active_add_files, + "Parallel active_add_files mismatch" + ); + assert_eq!( + total_remove_files_seen, expected.remove_files_seen, + "Parallel remove_files_seen mismatch" + ); + assert_eq!( + total_non_file_actions, expected.non_file_actions, + "Parallel non_file_actions mismatch" + ); + assert_eq!( + total_predicate_filtered, expected.predicate_filtered, + "Parallel predicate_filtered mismatch" + ); + } + } + + /// Tests parallel workflow with sidecars and verifies metrics logging. + /// + /// This parameterized test covers both JSON and Parquet checkpoint sidecars, + /// with all combinations of serialization and worker configurations. + /// + /// Note: This test captures logs from spawned threads by sharing the tracing dispatcher. + /// If running with other tests in parallel causes flakiness, use `--test-threads=1`. 
+ #[rstest::rstest] + #[case::json_sidecars(ParallelLogReplayCase { + path: "v2-checkpoints-json-with-sidecars", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 5, + predicate_filtered: 0, + }, + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 101, + active_add_files: 101, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 0, + }), + })] + #[case::parquet_sidecars(ParallelLogReplayCase { + path: "v2-checkpoints-parquet-with-sidecars", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 5, + predicate_filtered: 0, + }, + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 101, + active_add_files: 101, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 0, + }), + })] + #[case::data_skipping(ParallelLogReplayCase { + // Tests data skipping filtering based on column stats (min/max values) + path: "v2-checkpoints-json-with-sidecars", + predicate: Some({ + use crate::expressions::{column_expr, Expression as Expr}; + Arc::new(Expr::gt(column_expr!("id"), Expr::literal(20i64))) + }), + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 5, + predicate_filtered: 0, + }, + // Data skipping predicate filters 4 files (101 -> 97). + // add_files_seen counts files AFTER data skipping. + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 97, + active_add_files: 97, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 4, + }), + })] + #[case::partition_pruning(ParallelLogReplayCase { + // Tests partition pruning filtering based on partition column values. + // Table is partitioned by 'letter' with partitions: a, b, c, e, null. + // Predicate letter='a' prunes 4 files (b, c, e, null), leaving 2 letter=a files. + // All 4 non-matching files are pruned by the columnar DataSkippingFilter. The is_add + // guard (OR(NOT is_add, pred)) only protects Remove/non-file rows, not Adds with null + // partition values -- those are correctly filtered since is_add=true for them. + path: "basic_partitioned", + predicate: Some({ + use crate::expressions::{column_expr, Expression as Expr}; + Arc::new(Expr::eq(column_expr!("letter"), Expr::literal("a"))) + }), + expected_sequential_metrics: ExpectedMetrics { + // Columnar filter prunes all 4 non-matching files (b, c, e, null) before the + // visitor. The is_add guard protects Removes but not null-partition Adds. 
+ add_files_seen: 2, + active_add_files: 2, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 4, + }, + // No parallel phase (no V2 checkpoint with sidecars) + expected_parallel_metrics: None, + })] + #[case::json_without_sidecars(ParallelLogReplayCase { + path: "v2-checkpoints-json-without-sidecars", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 3, + active_add_files: 3, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: None, + })] + #[case::json_with_last_checkpoint(ParallelLogReplayCase { + path: "v2-checkpoints-json-with-last-checkpoint", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 2, + active_add_files: 2, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 0, + }), + })] + #[case::parquet_without_sidecars(ParallelLogReplayCase { + path: "v2-checkpoints-parquet-without-sidecars", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 3, + active_add_files: 3, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: None, + })] + #[case::parquet_with_last_checkpoint(ParallelLogReplayCase { + path: "v2-checkpoints-parquet-with-last-checkpoint", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 2, + active_add_files: 2, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 0, + }), + })] + #[case::v2_classic_json(ParallelLogReplayCase { + path: "v2-classic-checkpoint-json", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 4, + active_add_files: 4, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 0, + }), + })] + #[case::v2_classic_parquet(ParallelLogReplayCase { + path: "v2-classic-checkpoint-parquet", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + add_files_seen: 0, + active_add_files: 0, + remove_files_seen: 0, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: Some(ExpectedMetrics { + add_files_seen: 4, + active_add_files: 4, + remove_files_seen: 0, + non_file_actions: 0, + predicate_filtered: 0, + }), + })] + #[case::no_parallel_needed(ParallelLogReplayCase { + path: "table-without-dv-small", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + // This table has single-part checkpoint, completes in sequential phase + add_files_seen: 1, + active_add_files: 1, + remove_files_seen: 0, + non_file_actions: 3, + predicate_filtered: 0, + }, + // No parallel phase needed + expected_parallel_metrics: None, + })] + #[case::with_removes_deduplication(ParallelLogReplayCase { + // This table has removes that filter checkpoint adds, showing add_files_seen > active_add_files + path: "with_checkpoint_no_last_checkpoint", + predicate: None, + expected_sequential_metrics: ExpectedMetrics { + // Checkpoint 2 contains: add B (surviving state at v2), metadata/protocol + 
// Commit 3 (after checkpoint) has: add C, remove B + // Log replay: process commit 3 first (add C active, remove B recorded), + // then checkpoint (add B filtered by remove) + // Result: 2 adds seen, 1 active (only C), 1 remove seen, B filtered by dedup + add_files_seen: 2, + active_add_files: 1, + remove_files_seen: 1, + non_file_actions: 4, + predicate_filtered: 0, + }, + expected_parallel_metrics: None, + })] + fn test_parallel_workflow_with_metrics( + #[case] test_case: ParallelLogReplayCase, + #[values(false, true)] with_serde: bool, + #[values(false, true)] one_file_per_worker: bool, + ) -> DeltaResult<()> { + use test_utils::LoggingTest; + + // Set up log capture + let logging_test = LoggingTest::new(); + + // Capture the dispatcher to share with spawned threads + let dispatcher = tracing::dispatcher::get_default(|d| d.clone()); + + verify_parallel_workflow( + test_case.path, + test_case.predicate, + with_serde, + one_file_per_worker, + Some(dispatcher), + )?; + + // Verify metrics were logged + let logs = logging_test.logs(); + verify_metrics_in_logs( + &logs, + test_case.path, + &test_case.expected_sequential_metrics, + test_case.expected_parallel_metrics.as_ref(), + ); + + Ok(()) + } + + #[test] + fn test_parallel_with_skip_stats() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("v2-checkpoints-json-with-sidecars")?; + + // Get expected paths using single-node scan_metadata with skip_stats=true + let scan = snapshot + .clone() + .scan_builder() + .with_skip_stats(true) + .build()?; + let mut single_node_iter = scan.scan_metadata(engine.as_ref())?; + let mut expected_paths = single_node_iter.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file| { + assert!( + scan_file.stats.is_none(), + "Single-node: scan_file.stats should be None when skip_stats=true" + ); + ps.push(scan_file.path); + }) + })?; + expected_paths.sort(); + + // Run parallel workflow with skip_stats=true + let scan = snapshot.scan_builder().with_skip_stats(true).build()?; + let mut sequential = scan.parallel_scan_metadata(engine.clone())?; + + // Verify stats is None in sequential results and collect paths + let mut all_paths = sequential.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file| { + assert!( + scan_file.stats.is_none(), + "sequential: scan_file.stats should be None when skip_stats=true" + ); + ps.push(scan_file.path); + }) + })?; + + match sequential.finish()? 
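+        // finish() yields Done when log replay completed entirely in the sequential phase, or
+        // Parallel with the shared state plus the checkpoint leaf files still to be processed.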
{ + AfterSequentialScanMetadata::Done => {} + AfterSequentialScanMetadata::Parallel { state, files } => { + // Verify stats is None in parallel results and collect paths + let mut parallel = + ParallelScanMetadata::try_new(engine.clone(), Arc::from(state), files)?; + + let parallel_paths = parallel.try_fold(Vec::new(), |acc, metadata_res| { + metadata_res?.visit_scan_files(acc, |ps: &mut Vec, scan_file| { + assert!( + scan_file.stats.is_none(), + "parallel: scan_file.stats should be None when skip_stats=true" + ); + ps.push(scan_file.path); + }) + })?; + + all_paths.extend(parallel_paths); + } + } + + // Verify parallel workflow returns same files as single-node + all_paths.sort(); + assert_eq!( + all_paths, expected_paths, + "Parallel workflow with skip_stats=true should return same files as single-node scan_metadata" + ); + + Ok(()) + } +} diff --git a/kernel/src/parallel/parallel_scan_metadata.rs b/kernel/src/parallel/parallel_scan_metadata.rs new file mode 100644 index 0000000000..e2d38fcb49 --- /dev/null +++ b/kernel/src/parallel/parallel_scan_metadata.rs @@ -0,0 +1,228 @@ +use std::sync::Arc; + +use delta_kernel_derive::internal_api; +use tracing::{info_span, Span}; + +use crate::log_replay::{ActionsBatch, ParallelLogReplayProcessor}; +use crate::parallel::parallel_phase::ParallelPhase; +use crate::parallel::sequential_phase::{AfterSequential, SequentialPhase}; +use crate::scan::log_replay::{ScanLogReplayProcessor, SerializableScanState}; +use crate::scan::ScanMetadata; +use crate::schema::SchemaRef; +use crate::{DeltaResult, Engine, EngineData, Error, FileMeta}; + +/// Result of sequential scan metadata processing. +/// +/// This enum indicates whether distributed processing is needed: +/// - `Done`: All processing completed sequentially - no distributed phase needed. +/// - `Parallel`: Contains state and files for parallel processing. +pub enum AfterSequentialScanMetadata { + Done, + Parallel { + state: Box, + files: Vec, + }, +} + +/// Sequential scan metadata processing. +/// +/// This phase processes commits and single-part checkpoint manifests sequentially. +/// After exhaustion, call `finish()` to get the result which indicates whether +/// a distributed phase is needed. +pub struct SequentialScanMetadata { + pub(crate) sequential: SequentialPhase, + span: Span, +} + +impl SequentialScanMetadata { + pub(crate) fn new(sequential: SequentialPhase) -> Self { + Self { + sequential, + // TODO: Associate a unique scan ID with this span to correlate sequential and parallel phases + span: info_span!("sequential_scan_metadata"), + } + } + + pub fn finish(self) -> DeltaResult { + let _guard = self.span.enter(); + match self.sequential.finish()? { + AfterSequential::Done(processor) => { + processor + .get_metrics() + .log("Sequential scan metadata completed"); + Ok(AfterSequentialScanMetadata::Done) + } + AfterSequential::Parallel { processor, files } => { + // Log sequential metrics and reset counters for parallel phase + processor + .get_metrics() + .log("Sequential scan metadata completed"); + processor.get_metrics().reset_counters(); + + Ok(AfterSequentialScanMetadata::Parallel { + state: Box::new(ParallelState { inner: processor }), + files, + }) + } + } + } +} + +impl Iterator for SequentialScanMetadata { + type Item = DeltaResult; + + fn next(&mut self) -> Option { + let _guard = self.span.enter(); + self.sequential.next() + } +} + +/// State for parallel scan metadata processing. 
+///
+/// This state can be serialized and distributed to remote workers, or wrapped
+/// in Arc and shared across threads for local parallel processing.
+pub struct ParallelState {
+    inner: ScanLogReplayProcessor,
+}
+
+impl ParallelLogReplayProcessor for Arc<ParallelState> {
+    type Output = ScanMetadata;
+
+    fn process_actions_batch(&self, actions_batch: ActionsBatch) -> DeltaResult<Self::Output> {
+        self.inner.process_actions_batch(actions_batch)
+    }
+}
+
+impl ParallelState {
+    /// Log the accumulated metrics from parallel processing.
+    ///
+    /// Call this after all parallel workers complete. The metrics will be logged
+    /// in the current tracing span context.
+    ///
+    /// # Example
+    ///
+    /// ```no_run
+    /// # use std::sync::Arc;
+    /// # use delta_kernel::scan::ParallelState;
+    /// # use tracing::instrument;
+    /// #[instrument(skip_all, name = "parallel_scan")]
+    /// async fn process(state: Arc<ParallelState>) {
+    ///     // ... spawn workers that share Arc<ParallelState> ...
+    ///     // ... wait for workers to complete ...
+    ///
+    ///     // Log accumulated metrics
+    ///     state.log_metrics();
+    /// }
+    /// ```
+    pub fn log_metrics(&self) {
+        self.inner
+            .get_metrics()
+            .log("Parallel scan metadata completed");
+    }
+
+    /// Get the schema to use for reading checkpoint files.
+    ///
+    /// Returns the checkpoint read schema which may have stats excluded
+    /// if skip_stats was enabled when the scan was created.
+    pub fn file_read_schema(&self) -> SchemaRef {
+        self.inner.checkpoint_info().checkpoint_read_schema.clone()
+    }
+
+    /// Serialize the processor state for distributed processing.
+    ///
+    /// Returns a `SerializableScanState` containing all information needed to
+    /// reconstruct this state on remote compute nodes.
+    ///
+    /// # Errors
+    /// Returns an error if the state cannot be serialized (e.g., contains opaque predicates).
+    #[internal_api]
+    #[allow(unused)]
+    pub(crate) fn into_serializable_state(self) -> DeltaResult<SerializableScanState> {
+        self.inner.into_serializable_state()
+    }
+
+    /// Reconstruct a ParallelState from serialized state.
+    ///
+    /// # Parameters
+    /// - `engine`: Engine for creating evaluators and filters
+    /// - `state`: The serialized state from a previous `into_serializable_state()` call
+    #[internal_api]
+    #[allow(unused)]
+    pub(crate) fn from_serializable_state(
+        engine: &dyn Engine,
+        state: SerializableScanState,
+    ) -> DeltaResult<Self> {
+        let inner = ScanLogReplayProcessor::from_serializable_state(engine, state)?;
+        Ok(Self { inner })
+    }
+
+    /// Serialize the processor state directly to bytes.
+    ///
+    /// This is a convenience method that combines `into_serializable_state()` with
+    /// JSON serialization. For more control over serialization format, use
+    /// `into_serializable_state()` directly.
+    ///
+    /// # Errors
+    /// Returns an error if the state cannot be serialized.
+    #[allow(unused)]
+    pub fn into_bytes(self) -> DeltaResult<Vec<u8>> {
+        let state = self.into_serializable_state()?;
+        serde_json::to_vec(&state)
+            .map_err(|e| Error::generic(format!("Failed to serialize ParallelState to bytes: {e}")))
+    }
+
+    /// Reconstruct a ParallelState from bytes.
+    ///
+    /// This is a convenience method that combines JSON deserialization with
+    /// `from_serializable_state()`. The bytes must have been produced by `into_bytes()`.
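+    ///
+    /// For example, a driver/worker round trip might look like the following sketch
+    /// (the transport, the worker-side engine, and the file list are hypothetical):
+    ///
+    /// ```ignore
+    /// // Driver side: serialize the state and ship it with a partition of files.
+    /// let bytes = parallel_state.into_bytes()?;
+    /// send_to_worker(&bytes, my_files); // hypothetical transport
+    ///
+    /// // Worker side: rebuild the state with a local engine and process its files.
+    /// let state = ParallelState::from_bytes(worker_engine.as_ref(), &bytes)?;
+    /// let iter = ParallelScanMetadata::try_new(worker_engine, Arc::new(state), my_files)?;
+    /// for scan_metadata in iter {
+    ///     let scan_metadata = scan_metadata?;
+    ///     // ... visit scan files ...
+    /// }
+    /// ```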
+    ///
+    /// # Parameters
+    /// - `engine`: Engine for creating evaluators and filters
+    /// - `bytes`: The serialized bytes from a previous `into_bytes()` call
+    #[allow(unused)]
+    pub fn from_bytes(engine: &dyn Engine, bytes: &[u8]) -> DeltaResult<Self> {
+        let state: SerializableScanState =
+            serde_json::from_slice(bytes).map_err(Error::MalformedJson)?;
+        Self::from_serializable_state(engine, state)
+    }
+}
+
+pub struct ParallelScanMetadata {
+    pub(crate) processor: ParallelPhase<Arc<ParallelState>>,
+    span: Span,
+}
+
+impl ParallelScanMetadata {
+    pub fn try_new(
+        engine: Arc<dyn Engine>,
+        state: Arc<ParallelState>,
+        leaf_files: Vec<FileMeta>,
+    ) -> DeltaResult<Self> {
+        let read_schema = state.file_read_schema();
+        Ok(Self {
+            processor: ParallelPhase::try_new(engine, state, leaf_files, read_schema)?,
+            // TODO: Associate the same scan ID from sequential phase to correlate phases
+            span: info_span!("parallel_scan_metadata"),
+        })
+    }
+
+    pub fn new_from_iter(
+        state: Arc<ParallelState>,
+        iter: impl IntoIterator<Item = DeltaResult<Box<dyn EngineData>>> + 'static,
+    ) -> Self {
+        Self {
+            processor: ParallelPhase::new_from_iter(state.clone(), iter),
+            // TODO: Associate the same scan ID from sequential phase to correlate phases
+            span: info_span!("parallel_scan_metadata"),
+        }
+    }
+}
+
+impl Iterator for ParallelScanMetadata {
+    type Item = DeltaResult<ScanMetadata>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let _guard = self.span.enter();
+        self.processor.next()
+    }
+}
diff --git a/kernel/src/parallel/sequential_phase.rs b/kernel/src/parallel/sequential_phase.rs
new file mode 100644
index 0000000000..de395fc88e
--- /dev/null
+++ b/kernel/src/parallel/sequential_phase.rs
@@ -0,0 +1,347 @@
+//! Sequential log replay processor that happens before the parallel phase.
+//!
+//! This module provides sequential phase log replay that processes commits and
+//! single-part checkpoint manifests, then returns the processor and any files (sidecars or
+//! multi-part checkpoint parts) for parallel processing by the parallel phase. This phase
+//! must be completed before the parallel phase can start.
+//!
+//! For multi-part checkpoints, the sequential phase skips manifest processing and returns
+//! the checkpoint parts for parallel processing.
+#![allow(unused)]
+
+use std::sync::Arc;
+
+use itertools::Itertools;
+
+use crate::log_reader::checkpoint_manifest::CheckpointManifestReader;
+use crate::log_reader::commit::CommitReader;
+use crate::log_replay::LogReplayProcessor;
+use crate::log_segment::LogSegment;
+use crate::scan::COMMIT_READ_SCHEMA;
+use crate::utils::require;
+use crate::{DeltaResult, Engine, Error, FileMeta};
+use delta_kernel_derive::internal_api;
+
+/// Sequential log replay processor for parallel execution.
+///
+/// This iterator processes log replay sequentially:
+/// 1. Commit files (JSON)
+/// 2. Manifest (single-part checkpoint, if present)
+///
+/// After exhaustion, call `finish()` to extract:
+/// - The processor (for serialization and distribution)
+/// - Files (sidecars or multi-part checkpoint parts) for parallel processing
+///
+/// # Type Parameters
+/// - `P`: A [`LogReplayProcessor`] implementation that processes action batches
+///
+/// # Example
+///
+/// ```ignore
+/// let mut sequential = SequentialPhase::try_new(processor, log_segment, engine)?;
+///
+/// // Iterate over sequential batches
+/// for batch in sequential.by_ref() {
+///     let metadata = batch?;
+///     // Process metadata
+/// }
+///
+/// // Extract processor and files for distribution (if needed)
+/// match sequential.finish()?
{ +/// AfterSequential::Parallel { processor, files } => { +/// // Parallel phase needed - distribute files for parallel processing. +/// // If crossing the network boundary, the processor must be serialized. +/// let serialized = processor.serialize()?; +/// let partitions = partition_files(files, num_workers); +/// for (worker, partition) in partitions { +/// worker.send(serialized.clone(), partition)?; +/// } +/// } +/// AfterSequential::Done(processor) => { +/// // No parallel phase needed - all processing complete sequentially +/// println!("Log replay complete"); +/// } +/// } +/// ``` +/// cbindgen:ignore +#[internal_api] +pub(crate) struct SequentialPhase { + // The processor that will be used to process the action batches + processor: P, + // The commit reader that will be used to read the commit files + commit_phase: Option, + // The checkpoint manifest reader that will be used to read the checkpoint manifest files. + // If the checkpoint is single-part, this will be Some(CheckpointManifestReader). + checkpoint_manifest_phase: Option, + // Whether the iterator has been fully exhausted + is_finished: bool, + // Checkpoint parts for potential parallel phase processing + checkpoint_parts: Vec, +} + +/// Result of sequential log replay processing. +/// cbindgen:ignore +#[internal_api] +pub(crate) enum AfterSequential { + /// All processing complete sequentially - no parallel phase needed. + Done(P), + /// Parallel phase needed - distribute files for parallel processing. + Parallel { processor: P, files: Vec }, +} + +impl SequentialPhase

{ + /// Create a new sequential phase log replay. + /// + /// # Parameters + /// - `processor`: The log replay processor + /// - `log_segment`: The log segment to process + /// - `engine`: Engine for reading files + #[internal_api] + pub(crate) fn try_new( + processor: P, + log_segment: &LogSegment, + engine: Arc, + ) -> DeltaResult { + let commit_phase = Some(CommitReader::try_new( + engine.as_ref(), + log_segment, + COMMIT_READ_SCHEMA.clone(), + )?); + + // Concurrently start reading the checkpoint manifest. Only create a checkpoint manifest + // reader if the checkpoint is single-part. + let checkpoint_manifest_phase = match log_segment.listed.checkpoint_parts.as_slice() { + [single_part] => Some(CheckpointManifestReader::try_new( + engine, + single_part, + log_segment.log_root.clone(), + )?), + _ => None, + }; + + let checkpoint_parts = log_segment + .listed + .checkpoint_parts + .iter() + .map(|path| path.location.clone()) + .collect_vec(); + + Ok(Self { + processor, + commit_phase, + checkpoint_manifest_phase, + is_finished: false, + checkpoint_parts, + }) + } + + /// Complete sequential phase and extract processor + files for distribution. + /// + /// Must be called after the iterator is exhausted. + /// + /// # Returns + /// - `Done`: All processing done sequentially - no parallel phase needed + /// - `Parallel`: Parallel phase needed. The resulting files may be processed + /// in parallel. + /// + /// # Errors + /// Returns an error if called before iterator exhaustion. + #[internal_api] + pub(crate) fn finish(self) -> DeltaResult> { + if !self.is_finished { + return Err(Error::generic( + "Must exhaust iterator before calling finish()", + )); + } + + let parallel_files = match self.checkpoint_manifest_phase { + Some(manifest_reader) => manifest_reader.extract_sidecars()?, + None => { + let parts = self.checkpoint_parts; + require!( + parts.len() != 1, + Error::generic( + "Invariant violation: If there is exactly one checkpoint part, + there must be a manifest reader" + ) + ); + // If this is a multi-part checkpoint, use the checkpoint parts for parallel phase + parts + } + }; + + if parallel_files.is_empty() { + Ok(AfterSequential::Done(self.processor)) + } else { + Ok(AfterSequential::Parallel { + processor: self.processor, + files: parallel_files, + }) + } + } +} + +impl Iterator for SequentialPhase

{ + type Item = DeltaResult; + + fn next(&mut self) -> Option { + let next = self + .commit_phase + .as_mut() + .and_then(|commit_phase| commit_phase.next()) + .or_else(|| { + self.commit_phase = None; + self.checkpoint_manifest_phase.as_mut()?.next() + }); + + let Some(result) = next else { + self.is_finished = true; + return None; + }; + + Some(result.and_then(|batch| self.processor.process_actions_batch(batch))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::scan::AfterSequentialScanMetadata; + use crate::utils::test_utils::{assert_result_error_with_message, load_test_table}; + + /// Core helper function to verify sequential processing with expected adds and sidecars. + fn verify_sequential_processing( + table_name: &str, + expected_adds: &[&str], + expected_sidecars: &[&str], + ) -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table(table_name)?; + + let scan = snapshot.scan_builder().build()?; + let mut sequential = scan.parallel_scan_metadata(engine)?; + + // Process all batches and collect Add file paths + let mut file_paths = Vec::new(); + for result in sequential.by_ref() { + let metadata = result?; + file_paths = + metadata.visit_scan_files(file_paths, |ps: &mut Vec, file_stat| { + ps.push(file_stat.path); + })?; + } + + // Assert collected adds match expected + file_paths.sort(); + assert_eq!( + file_paths, expected_adds, + "Sequential phase should collect expected Add file paths" + ); + + // Call finish() and verify result based on expected sidecars + let result = sequential.finish()?; + match (expected_sidecars, result) { + (sidecars, AfterSequentialScanMetadata::Done) => { + assert!( + sidecars.is_empty(), + "Expected Done but got sidecars {sidecars:?}" + ); + } + (expected_sidecars, AfterSequentialScanMetadata::Parallel { files, .. 
}) => { + assert_eq!( + files.len(), + expected_sidecars.len(), + "Should collect exactly {} sidecar files", + expected_sidecars.len() + ); + + // Extract and verify sidecar file paths + let mut collected_paths = files + .iter() + .map(|fm| { + fm.location + .path_segments() + .and_then(|mut segments| segments.next_back()) + .unwrap_or("") + .to_string() + }) + .collect_vec(); + + collected_paths.sort(); + assert_eq!(collected_paths, expected_sidecars); + } + } + + Ok(()) + } + + #[test] + fn test_sequential_v2_with_commits_only() -> DeltaResult<()> { + verify_sequential_processing( + "table-without-dv-small", + &["part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet"], + &[], // No sidecars + ) + } + + #[test] + fn test_sequential_v2_with_sidecars() -> DeltaResult<()> { + verify_sequential_processing( + "v2-checkpoints-json-with-sidecars", + &[], // No adds in sequential phase (all in checkpoint sidecars) + &[ + "00000000000000000006.checkpoint.0000000001.0000000002.19af1366-a425-47f4-8fa6-8d6865625573.parquet", + "00000000000000000006.checkpoint.0000000002.0000000002.5008b69f-aa8a-4a66-9299-0733a56a7e63.parquet", + ], + ) + } + + #[test] + fn test_sequential_finish_before_exhaustion_error() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("table-without-dv-small")?; + + let scan = snapshot.scan_builder().build()?; + let sequential = scan.parallel_scan_metadata(engine)?; + + // Try to call finish() before exhausting the iterator + let result = sequential.finish(); + assert_result_error_with_message(result, "Must exhaust iterator before calling finish()"); + + Ok(()) + } + + #[test] + fn test_sequential_checkpoint_without_sidecars() -> DeltaResult<()> { + verify_sequential_processing( + "v2-checkpoints-json-without-sidecars", + &[ + // Adds from checkpoint manifest processed in sequential phase + "test%25file%25prefix-part-00000-0e32f92c-e232-4daa-b734-369d1a800502-c000.snappy.parquet", + "test%25file%25prefix-part-00000-91daf7c5-9ba0-4f76-aefd-0c3b21d33c6c-c000.snappy.parquet", + "test%25file%25prefix-part-00001-a5c41be1-ded0-4b18-a638-a927d233876e-c000.snappy.parquet", + ], + &[], // No sidecars + ) + } + + #[test] + fn test_sequential_parquet_checkpoint_with_sidecars() -> DeltaResult<()> { + verify_sequential_processing( + "v2-checkpoints-parquet-with-sidecars", + &[], // No adds in sequential phase + &[ + // Expected sidecars + "00000000000000000006.checkpoint.0000000001.0000000002.76931b15-ead3-480d-b86c-afe55a577fc3.parquet", + "00000000000000000006.checkpoint.0000000002.0000000002.4367b29c-0e87-447f-8e81-9814cc01ad1f.parquet", + ], + ) + } + + #[test] + fn test_sequential_checkpoint_no_commits() -> DeltaResult<()> { + verify_sequential_processing( + "with_checkpoint_no_last_checkpoint", + &["part-00000-70b1dcdf-0236-4f63-a072-124cdbafd8a0-c000.snappy.parquet"], // Add from commit 3 + &[], // No sidecars + ) + } +} diff --git a/kernel/src/path.rs b/kernel/src/path.rs index e3c094ef1c..10381aa69e 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -5,6 +5,7 @@ use std::str::FromStr; use crate::actions::visitors::InCommitTimestampVisitor; use crate::engine_data::RowVisitor; +use crate::utils::require; use crate::{DeltaResult, Engine, Error, FileMeta, Version}; use delta_kernel_derive::internal_api; @@ -21,7 +22,10 @@ const MULTIPART_PART_LEN: usize = 10; const UUID_PART_LEN: usize = 36; /// The subdirectory name within the table root where the delta log resides -const DELTA_LOG_DIR: &str = "_delta_log/"; +const DELTA_LOG_DIR: &str = 
"_delta_log"; +const DELTA_LOG_DIR_WITH_SLASH: &str = "_delta_log/"; +/// The subdirectory name within the delta log where staged commits reside +const STAGED_COMMITS_DIR: &str = "_staged_commits/"; #[derive(Debug, Clone, PartialEq, Eq)] #[internal_api] @@ -70,11 +74,12 @@ pub(crate) struct ParsedLogPath { } // Internal helper used by TryFrom below. It parses a fixed-length string into the numeric -// type expected by the caller. A wrong length produces an error, even if the parse succeeded. -fn parse_path_part(value: &str, expect_len: usize, location: &Url) -> DeltaResult { +// type expected by the caller. A parsing failure returns None. A wrong length produces None, even +// if the parse succeeded. +fn parse_path_part(value: &str, expect_len: usize) -> Option { match value.parse() { - Ok(result) if value.len() == expect_len => Ok(result), - _ => Err(Error::invalid_log_path(location)), + Ok(result) if value.len() == expect_len => Some(result), + _ => None, } } @@ -97,14 +102,18 @@ impl AsUrl for Url { } } +fn path_contains_delta_log_dir(mut path_segments: std::str::Split<'_, char>) -> bool { + path_segments.any(|p| p == DELTA_LOG_DIR) +} + impl ParsedLogPath { // NOTE: We can't actually impl TryFrom because Option is a foreign struct even if T is local. #[internal_api] pub(crate) fn try_from(location: Location) -> DeltaResult>> { let url = location.as_url(); - let mut path_segments = url - .path_segments() - .ok_or_else(|| Error::invalid_log_path(url))?; + let Some(mut path_segments) = url.path_segments() else { + return Ok(None); + }; #[allow(clippy::unwrap_used)] let filename = path_segments .next_back() @@ -112,7 +121,7 @@ impl ParsedLogPath { .to_string(); let subdir = path_segments.next_back(); if filename.is_empty() { - return Err(Error::invalid_log_path(url)); + return Ok(None); // Not a valid log path } let mut split = filename.split('.'); @@ -126,7 +135,7 @@ impl ParsedLogPath { // parsing succeeds for a wrong-length numeric string. let version = match version.parse().ok() { Some(v) if version.len() == VERSION_LEN => v, - Some(_) => return Err(Error::invalid_log_path(url)), + Some(_) => return Ok(None), // has a version but it's not 20 chars None => return Ok(None), }; @@ -137,33 +146,62 @@ impl ParsedLogPath { None => return Ok(None), }; + // this check determines if we're in the delta log dir, or in the staged commits dir. The check is: + // 1. If the dir is named _staged_commits, check if the parent dir is _delta_log, and ensure + // no higher level directories are _also_ named _delta_log. If those checks pass we're in + // the staged_commits dir + // 2. if the dir is named _delta_log, ensure no higher level directories are _also_ named + // _delta_log. 
If those checks pass, we're in the delta log dir + let (in_delta_log_dir, in_staged_commits_dir) = if subdir == Some("_staged_commits") { + if path_segments.next_back() == Some(DELTA_LOG_DIR) + && !path_contains_delta_log_dir(path_segments) + { + (false, true) + } else { + (false, false) + } + } else { + ( + subdir == Some(DELTA_LOG_DIR) && !path_contains_delta_log_dir(path_segments), + false, + ) + }; + // Parse the file type, based on the number of remaining parts let file_type = match split.as_slice() { - ["json"] => LogPathFileType::Commit, - [uuid, "json"] if subdir == Some("_staged_commits") => { + ["json"] if in_delta_log_dir => LogPathFileType::Commit, + [uuid, "json"] if in_staged_commits_dir => { // staged commits like _delta_log/_staged_commits/00000000000000000000.{uuid}.json - match parse_path_part::(uuid, UUID_PART_LEN, url) { - Ok(_uuid) => LogPathFileType::StagedCommit, - Err(_) => LogPathFileType::Unknown, + match parse_path_part::(uuid, UUID_PART_LEN) { + Some(_uuid) => LogPathFileType::StagedCommit, + None => LogPathFileType::Unknown, } } - ["crc"] => LogPathFileType::Crc, - ["checkpoint", "parquet"] => LogPathFileType::SinglePartCheckpoint, - ["checkpoint", uuid, "json" | "parquet"] => { - let _ = parse_path_part::(uuid, UUID_PART_LEN, url)?; + ["crc"] if in_delta_log_dir => LogPathFileType::Crc, + ["checkpoint", "parquet"] if in_delta_log_dir => LogPathFileType::SinglePartCheckpoint, + ["checkpoint", uuid, "json" | "parquet"] if in_delta_log_dir => { + let Some(_) = parse_path_part::(uuid, UUID_PART_LEN) else { + return Ok(None); + }; LogPathFileType::UuidCheckpoint } - [hi, "compacted", "json"] => { - let hi = parse_path_part(hi, VERSION_LEN, url)?; + [hi, "compacted", "json"] if in_delta_log_dir => { + let Some(hi) = parse_path_part(hi, VERSION_LEN) else { + return Ok(None); + }; LogPathFileType::CompactedCommit { hi } } - ["checkpoint", part_num, num_parts, "parquet"] => { - let part_num = parse_path_part(part_num, MULTIPART_PART_LEN, url)?; - let num_parts = parse_path_part(num_parts, MULTIPART_PART_LEN, url)?; + ["checkpoint", part_num, num_parts, "parquet"] if in_delta_log_dir => { + let Some(part_num) = parse_path_part(part_num, MULTIPART_PART_LEN) else { + return Ok(None); + }; + let Some(num_parts) = parse_path_part(num_parts, MULTIPART_PART_LEN) else { + return Ok(None); + }; // A valid part_num must be in the range [1, num_parts] if !(0 < part_num && part_num <= num_parts) { - return Err(Error::invalid_log_path(url)); + return Ok(None); } LogPathFileType::MultiPartCheckpoint { part_num, @@ -183,6 +221,21 @@ impl ParsedLogPath { })) } + /// Parse a location into a commit path (published or staged), returning an error if invalid or + /// not a commit. 
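+    ///
+    /// A rough doc sketch of the accepted and rejected shapes (the table URL below is
+    /// hypothetical; `FileMeta::new(url, 0, 0)` mirrors how the tests in this module build
+    /// locations):
+    ///
+    /// ```ignore
+    /// let log_root = url::Url::parse("s3://bucket/table/_delta_log/").unwrap();
+    /// // Published commit: accepted.
+    /// let published = FileMeta::new(log_root.join("00000000000000000005.json").unwrap(), 0, 0);
+    /// assert!(ParsedLogPath::parse_commit(published).is_ok());
+    /// // Checkpoint: rejected with an "Expected a commit path" error.
+    /// let checkpoint =
+    ///     FileMeta::new(log_root.join("00000000000000000005.checkpoint.parquet").unwrap(), 0, 0);
+    /// assert!(ParsedLogPath::parse_commit(checkpoint).is_err());
+    /// ```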
+ pub(crate) fn parse_commit(location: Location) -> DeltaResult { + let url = location.as_url().to_string(); + let parsed = Self::try_from(location)?.ok_or_else(|| Error::invalid_log_path(&url))?; + require!( + parsed.is_commit(), + Error::generic(format!( + "Expected a commit path, got {} of type {:?}", + url, parsed.file_type + )) + ); + Ok(parsed) + } + pub(crate) fn should_list(&self) -> bool { match self.file_type { LogPathFileType::Commit @@ -265,7 +318,7 @@ impl ParsedLogPath { impl ParsedLogPath { /// Helper method to create a path with the given filename generator fn create_path(table_root: &Url, filename: String) -> DeltaResult { - let location = table_root.join(DELTA_LOG_DIR)?.join(&filename)?; + let location = table_root.join(DELTA_LOG_DIR_WITH_SLASH)?.join(&filename)?; Self::try_from(location)?.ok_or_else(|| { Error::internal_error(format!("Attempted to create an invalid path: {filename}")) }) @@ -287,7 +340,6 @@ impl ParsedLogPath { } /// Create a new ParsedCheckpointPath for a classic parquet checkpoint file - #[allow(dead_code)] // TODO: Remove this once we have a use case for it pub(crate) fn new_classic_parquet_checkpoint( table_root: &Url, version: Version, @@ -318,15 +370,13 @@ impl ParsedLogPath { Ok(path) } - // TODO: remove after support for writing CRC files - #[allow(unused)] - /// Create a new ParsedCommitPath for a new CRC file + /// Create a new `ParsedLogPath` for a version checksum (CRC) file. pub(crate) fn new_crc(table_root: &Url, version: Version) -> DeltaResult { let filename = format!("{version:020}.crc"); let path = Self::create_path(table_root, filename)?; - if path.file_type != LogPathFileType::Crc { + if !matches!(path.file_type, LogPathFileType::Crc) { return Err(Error::internal_error( - "ParsedLogPath::new_crc created a non-crc path", + "ParsedLogPath::new_crc created a non-CRC path", )); } Ok(path) @@ -352,36 +402,53 @@ impl ParsedLogPath { /// A wrapper around parsed log path to provide more structure/safety when handling /// table/log/commit paths. #[derive(Debug, Clone)] -pub(crate) struct LogRoot(Url); +pub(crate) struct LogRoot { + table_root: Url, + log_root: Url, +} impl LogRoot { /// Create a new LogRoot from the table root URL (e.g. s3://bucket/table -> /// s3://bucket/table/_delta_log/) /// /// TODO: could take a `table_root: TableRoot` - pub(crate) fn new(table_root: Url) -> DeltaResult { - // FIXME: need to check for trailing slash - Ok(Self(table_root.join(DELTA_LOG_DIR)?)) + pub(crate) fn new(mut table_root: Url) -> DeltaResult { + if !table_root.path().ends_with('/') { + let new_path = format!("{}/", table_root.path()); + table_root.set_path(&new_path); + } + let log_root = table_root.join(DELTA_LOG_DIR_WITH_SLASH)?; + Ok(Self { + table_root, + log_root, + }) + } + + pub(crate) fn table_root(&self) -> &Url { + &self.table_root + } + + pub(crate) fn log_root(&self) -> &Url { + &self.log_root } /// Create a new commit path (absolute path) for the given version. pub(crate) fn new_commit_path(&self, version: Version) -> DeltaResult> { let filename = format!("{version:020}.json"); - let path = self.0.join(&filename)?; + let path = self.log_root().join(&filename)?; ParsedLogPath::try_from(path)?.ok_or_else(|| { Error::internal_error(format!("Attempted to create an invalid path: {filename}")) }) } /// Create a new staged commit path (absolute path) for the given version. 
- #[allow(unused)] // TODO: Remove this once we remove catalog-managed feature pub(crate) fn new_staged_commit_path( &self, version: Version, ) -> DeltaResult> { let uuid = uuid::Uuid::new_v4(); let filename = format!("{version:020}.{uuid}.json"); - let path = self.0.join(&filename)?; + let path = self.log_root().join(STAGED_COMMITS_DIR)?.join(&filename)?; ParsedLogPath::try_from(path)?.ok_or_else(|| { Error::internal_error(format!("Attempted to create an invalid path: {filename}")) }) @@ -389,18 +456,73 @@ impl LogRoot { } #[cfg(test)] -mod tests { +pub(crate) mod tests { use std::path::PathBuf; use std::sync::Arc; use super::*; - use crate::engine::default::executor::tokio::TokioBackgroundExecutor; - use crate::engine::default::DefaultEngine; + use crate::engine::default::DefaultEngineBuilder; use crate::engine::sync::SyncEngine; + use crate::object_store::memory::InMemory; use crate::utils::test_utils::assert_result_error_with_message; - use object_store::memory::InMemory; use test_utils::add_commit; + impl ParsedLogPath { + pub(crate) fn create_parsed_published_commit(table_root: &Url, version: Version) -> Self { + let filename = format!("{version:020}.json"); + let location = table_root + .join(DELTA_LOG_DIR_WITH_SLASH) + .unwrap() + .join(&filename) + .unwrap(); + let parsed = ParsedLogPath::try_from(FileMeta::new(location, 0, 0)) + .unwrap() + .unwrap(); + assert!(parsed.file_type == LogPathFileType::Commit); + parsed + } + + pub(crate) fn create_parsed_staged_commit(table_root: &Url, version: Version) -> Self { + let uuid = Uuid::new_v4(); + let filename = format!("{version:020}.{uuid}.json"); + let location = table_root + .join(DELTA_LOG_DIR_WITH_SLASH) + .unwrap() + .join(STAGED_COMMITS_DIR) + .unwrap() + .join(&filename) + .unwrap(); + let parsed = ParsedLogPath::try_from(FileMeta::new(location, 0, 0)) + .unwrap() + .unwrap(); + assert!(parsed.file_type == LogPathFileType::StagedCommit); + parsed + } + + pub(crate) fn create_parsed_crc(table_root: &Url, version: Version) -> Self { + let filename = format!("{version:020}.crc"); + let location = table_root + .join(DELTA_LOG_DIR_WITH_SLASH) + .unwrap() + .join(&filename) + .unwrap(); + let parsed = ParsedLogPath::try_from(FileMeta::new(location, 0, 0)) + .unwrap() + .unwrap(); + assert!(parsed.file_type == LogPathFileType::Crc); + parsed + } + } + + fn table_root_dir_url() -> Url { + let path = PathBuf::from("./tests/data/table-with-dv-small/"); + let path = std::fs::canonicalize(path).unwrap(); + assert!(path.is_dir()); + let url = url::Url::from_directory_path(path).unwrap(); + assert!(url.path().ends_with('/')); + url + } + fn table_log_dir_url() -> Url { let path = PathBuf::from("./tests/data/table-with-dv-small/_delta_log/"); let path = std::fs::canonicalize(path).unwrap(); @@ -419,7 +541,8 @@ mod tests { assert!(log_path .path() .ends_with("/tests/data/table-with-dv-small/_delta_log/subdir/")); - ParsedLogPath::try_from(log_path).expect_err("directory path"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); // ignored - not versioned let log_path = table_log_dir.join("_last_checkpoint").unwrap(); @@ -455,11 +578,13 @@ mod tests { // invalid - version has too many digits let log_path = table_log_dir.join("000000000000000000010.json").unwrap(); - ParsedLogPath::try_from(log_path).expect_err("too many digits"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); // invalid - version has too few digits let log_path = 
table_log_dir.join("0000000000000000010.json").unwrap(); - ParsedLogPath::try_from(log_path).expect_err("too few digits"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); // unknown - two parts let log_path = table_log_dir.join("00000000000000000010.foo").unwrap(); @@ -605,7 +730,8 @@ mod tests { let log_path = table_log_dir .join("00000000000000000002.checkpoint.foo.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("not a uuid"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); // invalid file extension let log_path = table_log_dir @@ -623,11 +749,8 @@ mod tests { let log_path = table_log_dir .join("00000000000000000010.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e.parquet") .unwrap(); - let result = ParsedLogPath::try_from(log_path); - assert!( - matches!(result, Err(Error::InvalidLogPath(_))), - "Expected an error for UUID with exactly 35 characters" - ); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); } #[test] @@ -651,7 +774,8 @@ mod tests { let log_path = table_log_dir .join("00000000000000000008.checkpoint.0000000000.0000000002.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("invalid part 0"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.checkpoint.0000000001.0000000002.parquet") @@ -696,27 +820,32 @@ mod tests { let log_path = table_log_dir .join("00000000000000000008.checkpoint.0000000003.0000000002.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("invalid part 3"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.checkpoint.000000001.0000000002.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("invalid part_num"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.checkpoint.0000000001.000000002.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("invalid num_parts"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.checkpoint.00000000x1.0000000002.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("invalid part_num"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.checkpoint.0000000001.00000000x2.parquet") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("invalid num_parts"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); } #[test] @@ -758,23 +887,26 @@ mod tests { let log_path = table_log_dir .join("00000000000000000008.0000000000000000015.compacted.json") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("too few digits in hi"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.000000000000000000015.compacted.json") .unwrap(); - ParsedLogPath::try_from(log_path).expect_err("too many digits in hi"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); let log_path = table_log_dir .join("00000000000000000008.00000000000000000a15.compacted.json") 
.unwrap(); - ParsedLogPath::try_from(log_path).expect_err("non-numeric hi"); + let log_path = ParsedLogPath::try_from(log_path).unwrap(); + assert!(log_path.is_none()); } #[test] fn test_new_commit() { - let table_log_dir = table_log_dir_url(); - let log_path = ParsedLogPath::new_commit(&table_log_dir, 10).unwrap(); + let table_root_dir = table_root_dir_url(); + let log_path = ParsedLogPath::new_commit(&table_root_dir, 10).unwrap(); assert_eq!(log_path.version, 10); assert!(log_path.is_commit()); assert_eq!(log_path.extension, "json"); @@ -784,8 +916,8 @@ mod tests { #[test] fn test_new_uuid_parquet_checkpoint() { - let table_log_dir = table_log_dir_url(); - let log_path = ParsedLogPath::new_uuid_parquet_checkpoint(&table_log_dir, 10).unwrap(); + let table_root_dir = table_root_dir_url(); + let log_path = ParsedLogPath::new_uuid_parquet_checkpoint(&table_root_dir, 10).unwrap(); assert_eq!(log_path.version, 10); assert!(log_path.is_checkpoint()); @@ -806,8 +938,8 @@ mod tests { #[test] fn test_new_classic_parquet_checkpoint() { - let table_log_dir = table_log_dir_url(); - let log_path = ParsedLogPath::new_classic_parquet_checkpoint(&table_log_dir, 10).unwrap(); + let table_root_dir = table_root_dir_url(); + let log_path = ParsedLogPath::new_classic_parquet_checkpoint(&table_root_dir, 10).unwrap(); assert_eq!(log_path.version, 10); assert!(log_path.is_checkpoint()); @@ -904,12 +1036,13 @@ mod tests { #[tokio::test] async fn test_read_in_commit_timestamp_success() { let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); - let table_url = url::Url::parse("memory://test/").unwrap(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + let table_root = "memory://test/"; + let table_url = url::Url::parse(table_root).unwrap(); // Create a commit file with ICT using add_commit let commit_content = r#"{"commitInfo":{"timestamp":1000,"inCommitTimestamp":2000},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"writerFeatures":["inCommitTimestamp"]},"metaData":{"id":"test","schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true}]}"}}"#; - add_commit(store.as_ref(), 0, commit_content.to_string()) + add_commit(table_root, store.as_ref(), 0, commit_content.to_string()) .await .unwrap(); @@ -933,12 +1066,13 @@ mod tests { #[tokio::test] async fn test_read_in_commit_timestamp_missing_ict() { let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); - let table_url = url::Url::parse("memory://test/").unwrap(); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + let table_root = "memory://test/"; + let table_url = url::Url::parse(table_root).unwrap(); // Create a commit file without ICT let commit_content = r#"{"commitInfo":{"timestamp":1000},"protocol":{"minReaderVersion":3,"minWriterVersion":7},"metaData":{"id":"test","schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true}]}"}}"#; - add_commit(store.as_ref(), 0, commit_content.to_string()) + add_commit(table_root, store.as_ref(), 0, commit_content.to_string()) .await .unwrap(); diff --git a/kernel/src/row_tracking.rs b/kernel/src/row_tracking.rs index d4f56b3dd3..5bf2a08061 100644 --- a/kernel/src/row_tracking.rs +++ b/kernel/src/row_tracking.rs @@ -2,7 +2,6 @@ use std::sync::LazyLock; use serde::{Deserialize, Serialize}; -use 
crate::actions::domain_metadata::domain_metadata_configuration; use crate::actions::DomainMetadata; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; use crate::schema::{ColumnName, ColumnNamesAndTypes, DataType}; @@ -16,9 +15,10 @@ pub(crate) struct RowTrackingDomainMetadata { row_id_high_water_mark: i64, } -impl RowTrackingDomainMetadata { - const ROW_TRACKING_DOMAIN_NAME: &str = "delta.rowTracking"; +/// The domain name for row tracking metadata. +pub(crate) const ROW_TRACKING_DOMAIN_NAME: &str = "delta.rowTracking"; +impl RowTrackingDomainMetadata { pub(crate) fn new(row_id_high_water_mark: i64) -> Self { RowTrackingDomainMetadata { row_id_high_water_mark, @@ -45,14 +45,11 @@ impl RowTrackingDomainMetadata { snapshot: &Snapshot, engine: &dyn Engine, ) -> DeltaResult> { - Ok(domain_metadata_configuration( - snapshot.log_segment(), - Self::ROW_TRACKING_DOMAIN_NAME, - engine, - )? - .map(|domain_metadata| serde_json::from_str::(&domain_metadata)) - .transpose()? - .map(|metadata| metadata.row_id_high_water_mark)) + Ok(snapshot + .get_domain_metadata_internal(ROW_TRACKING_DOMAIN_NAME, engine)? + .map(|config| serde_json::from_str::(&config)) + .transpose()? + .map(|metadata| metadata.row_id_high_water_mark)) } } @@ -61,7 +58,7 @@ impl TryFrom for DomainMetadata { fn try_from(metadata: RowTrackingDomainMetadata) -> DeltaResult { Ok(DomainMetadata::new( - RowTrackingDomainMetadata::ROW_TRACKING_DOMAIN_NAME.to_string(), + ROW_TRACKING_DOMAIN_NAME.to_string(), serde_json::to_string(&metadata)?, )) } @@ -166,7 +163,7 @@ mod tests { } } - fn create_getters<'a>(num_records_mock: &'a MockGetData) -> Vec<&'a dyn GetData<'a>> { + fn create_getters(num_records_mock: &MockGetData) -> Vec<&dyn GetData<'_>> { vec![num_records_mock] } diff --git a/kernel/src/scan/data_skipping.rs b/kernel/src/scan/data_skipping.rs index 27a035ddaa..a2a018ffe8 100644 --- a/kernel/src/scan/data_skipping.rs +++ b/kernel/src/scan/data_skipping.rs @@ -1,28 +1,31 @@ -use std::borrow::Cow; use std::cmp::Ordering; +use std::collections::HashSet; use std::sync::{Arc, LazyLock}; +use std::time::Instant; -use tracing::debug; +use tracing::{debug, error}; -use crate::actions::get_log_add_schema; use crate::actions::visitors::SelectionVectorVisitor; use crate::error::DeltaResult; use crate::expressions::{ - column_expr, joined_column_expr, BinaryPredicateOp, ColumnName, Expression as Expr, - ExpressionRef, JunctionPredicateOp, OpaquePredicateOpRef, Predicate as Pred, PredicateRef, - Scalar, + column_expr, column_name, joined_column_expr, BinaryPredicateOp, ColumnName, + Expression as Expr, ExpressionRef, JunctionPredicateOp, OpaquePredicateOpRef, + Predicate as Pred, PredicateRef, Scalar, }; use crate::kernel_predicates::{ DataSkippingPredicateEvaluator, KernelPredicateEvaluator, KernelPredicateEvaluatorDefaults, }; -use crate::schema::{DataType, PrimitiveType, SchemaRef, SchemaTransform, StructField, StructType}; -use crate::{ - Engine, EngineData, ExpressionEvaluator, JsonHandler, PredicateEvaluator, RowVisitor as _, -}; +use crate::scan::metrics::ScanMetrics; +use crate::schema::{DataType, SchemaRef, StructField, StructType}; +use crate::utils::require; +use crate::{Engine, EngineData, Error, ExpressionEvaluator, PredicateEvaluator, RowVisitor as _}; +pub(crate) mod stats_schema; #[cfg(test)] mod tests; +use delta_kernel_derive::internal_api; + /// Rewrites a predicate to a predicate that can be used to skip files based on their stats. /// Returns `None` if the predicate is not eligible for data skipping. 
/// @@ -41,188 +44,441 @@ mod tests; /// predicate is dropped. #[cfg(test)] pub(crate) fn as_data_skipping_predicate(pred: &Pred) -> Option { - DataSkippingPredicateCreator.eval(pred) + DataSkippingPredicateCreator::new(&Default::default()).eval(pred) +} + +#[cfg(test)] +pub(crate) fn as_data_skipping_predicate_with_partitions( + pred: &Pred, + partition_columns: &HashSet, +) -> Option { + DataSkippingPredicateCreator::new(partition_columns).eval(pred) } /// Like `as_data_skipping_predicate`, but invokes [`KernelPredicateEvaluator::eval_sql_where`] /// instead of [`KernelPredicateEvaluator::eval`]. -fn as_sql_data_skipping_predicate(pred: &Pred) -> Option { - DataSkippingPredicateCreator.eval_sql_where(pred) +fn as_sql_data_skipping_predicate( + pred: &Pred, + partition_columns: &HashSet, +) -> Option { + DataSkippingPredicateCreator::new(partition_columns).eval_sql_where(pred) } +#[internal_api] pub(crate) struct DataSkippingFilter { - stats_schema: SchemaRef, - select_stats_evaluator: Arc, + /// Evaluator that extracts file-level statistics from the input batch. The caller provides + /// the expression at construction time, which determines where stats come from: + /// - Scan path: `column_expr!("stats_parsed")` reads the already-parsed struct from + /// a transformed batch (where `add.*` fields are flattened to top-level columns). + /// - Table changes path: `Expression::parse_json(column_expr!("add.stats"), schema)` parses + /// JSON from a raw action batch (where stats are nested under `add.stats`). + stats_evaluator: Arc, skipping_evaluator: Arc, filter_evaluator: Arc, - json_handler: Arc, + /// Metrics to record data skipping statistics. + metrics: Option>, } impl DataSkippingFilter { - /// Creates a new data skipping filter. Returns None if there is no predicate, or the predicate - /// is ineligible for data skipping. + /// Creates a new data skipping filter. Returns `None` if there is no predicate, or the + /// predicate is ineligible for data skipping. + /// + /// NOTE: `None` is equivalent to a trivial filter that always returns TRUE (= keeps all files), + /// but using an `Option` lets the engine easily avoid the overhead of applying trivial filters. /// - /// NOTE: None is equivalent to a trivial filter that always returns TRUE (= keeps all files), - /// but using an Option lets the engine easily avoid the overhead of applying trivial filters. + /// # Parameters + /// - `engine`: Engine for creating evaluators + /// - `predicate`: Optional predicate for data skipping + /// - `stats_schema`: The data stats schema (numRecords, nullCount, minValues, maxValues). + /// Pass `None` if no data stats are available. + /// - `stats_expr`: Expression to extract data stats from the batch, producing output matching + /// `stats_schema`. For example, `column_expr!("stats_parsed")` for pre-parsed stats, or + /// `Expression::parse_json(column_expr!("add.stats"), stats_schema)` for JSON parsing. + /// - `partition_schema`: Schema of typed partition columns referenced by the predicate + /// (physical names). Pass `None` if no partition columns are referenced. + /// - `partition_expr`: Expression to extract partition values from the batch, producing output + /// matching `partition_schema`. Typically a `MapToStruct` expression that converts the + /// `partitionValues` string map into a typed struct. Only used when `partition_schema` is + /// `Some`. 
+ /// - `input_schema`: Schema of the batch that will be passed to [`apply()`](Self::apply) + /// - `metrics`: Optional metrics to record data skipping statistics. + #[allow(clippy::too_many_arguments)] pub(crate) fn new( engine: &dyn Engine, - physical_predicate: Option<(PredicateRef, SchemaRef)>, + predicate: Option, + stats_schema: Option<&SchemaRef>, + stats_expr: ExpressionRef, + partition_schema: Option<&SchemaRef>, + partition_expr: ExpressionRef, + input_schema: SchemaRef, + metrics: Option>, ) -> Option { - static STATS_EXPR: LazyLock = - LazyLock::new(|| Arc::new(column_expr!("add.stats"))); static FILTER_PRED: LazyLock = LazyLock::new(|| Arc::new(column_expr!("output").distinct(Expr::literal(false)))); - - let (predicate, referenced_schema) = physical_predicate?; + static FILTER_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new_unchecked([StructField::nullable( + "output", + DataType::BOOLEAN, + )])) + }); + + let predicate = predicate?; debug!("Creating a data skipping filter for {:#?}", predicate); - // Convert all fields into nullable, as stats may not be available for all columns - // (and usually aren't for partition columns). - struct NullableStatsTransform; - impl<'a> SchemaTransform<'a> for NullableStatsTransform { - fn transform_struct_field( - &mut self, - field: &'a StructField, - ) -> Option> { - use Cow::*; - let field = match self.transform(&field.data_type)? { - Borrowed(_) if field.is_nullable() => Borrowed(field), - data_type => Owned(StructField { - name: field.name.clone(), - data_type: data_type.into_owned(), - nullable: true, - metadata: field.metadata.clone(), - }), - }; - Some(field) - } - } - - // Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG) - struct NullCountStatsTransform; - impl<'a> SchemaTransform<'a> for NullCountStatsTransform { - fn transform_primitive( - &mut self, - _ptype: &'a PrimitiveType, - ) -> Option> { - Some(Cow::Owned(PrimitiveType::Long)) - } - } - - let stats_schema = NullableStatsTransform - .transform_struct(&referenced_schema)? - .into_owned(); - - let nullcount_schema = NullCountStatsTransform - .transform_struct(&stats_schema)? - .into_owned(); - let stats_schema = Arc::new(StructType::new_unchecked([ - StructField::nullable("numRecords", DataType::LONG), - StructField::nullable("nullCount", nullcount_schema), - StructField::nullable("minValues", stats_schema.clone()), - StructField::nullable("maxValues", stats_schema), - ])); + // Build the unified evaluation schema and extraction expression. Data stats and partition + // values are conceptually separate, but the evaluator needs a single schema/expression. + let (unified_schema, unified_expr, partition_columns) = + Self::build_unified_schema_and_expr( + stats_schema, + stats_expr, + partition_schema, + partition_expr, + )?; + + let stats_evaluator = engine + .evaluation_handler() + .new_expression_evaluator( + input_schema, + unified_expr, + unified_schema.as_ref().clone().into(), + ) + .inspect_err(|e| error!("Failed to create stats evaluator: {e}")) + .ok()?; // Skipping happens in several steps: // - // 1. The stats selector fetches add.stats from the metadata + // 1. The stats evaluator extracts file-level statistics from the batch (the expression + // provided by the caller determines how: reading a pre-parsed column, parsing JSON, etc.) // // 2. The predicate (skipping evaluator) produces false for any file whose stats prove we // can safely skip it. 
A value of true means the stats say we must keep the file, and // null means we could not determine whether the file is safe to skip, because its stats // were missing/null. // - // 3. The selection evaluator does DISTINCT(col(predicate), 'false') to produce true (= keep) when - // the predicate is true/null and false (= skip) when the predicate is false. - let select_stats_evaluator = engine.evaluation_handler().new_expression_evaluator( - // safety: kernel is very broken if we don't have the schema for Add actions - get_log_add_schema().clone(), - STATS_EXPR.clone(), - DataType::STRING, - ); - - let skipping_evaluator = engine.evaluation_handler().new_predicate_evaluator( - stats_schema.clone(), - Arc::new(as_sql_data_skipping_predicate(&predicate)?), - ); - + // 3. The selection evaluator does DISTINCT(col(predicate), 'false') to produce true + // (= keep) when the predicate is true/null and false (= skip) when the predicate + // is false. + let skipping_evaluator = engine + .evaluation_handler() + .new_predicate_evaluator( + unified_schema.clone(), + Arc::new(as_sql_data_skipping_predicate( + &predicate, + &partition_columns, + )?), + ) + .inspect_err(|e| error!("Failed to create skipping evaluator: {e}")) + .ok()?; + + // The filter evaluator operates on the skipping evaluator's output, which is a single + // boolean column named "output" (not the unified stats schema). let filter_evaluator = engine .evaluation_handler() - .new_predicate_evaluator(stats_schema.clone(), FILTER_PRED.clone()); + .new_predicate_evaluator(FILTER_SCHEMA.clone(), FILTER_PRED.clone()) + .inspect_err(|e| error!("Failed to create filter evaluator: {e}")) + .ok()?; Some(Self { - stats_schema, - select_stats_evaluator, + stats_evaluator, skipping_evaluator, filter_evaluator, - json_handler: engine.json_handler(), + metrics, }) } - /// Apply the DataSkippingFilter to an EngineData batch of actions. Returns a selection vector - /// which can be applied to the actions to find those that passed data skipping. - pub(crate) fn apply(&self, actions: &dyn EngineData) -> DeltaResult> { - // retrieve and parse stats from actions data - let stats = self.select_stats_evaluator.evaluate(actions)?; - assert_eq!(stats.len(), actions.len()); - let parsed_stats = self - .json_handler - .parse_json(stats, self.stats_schema.clone())?; - assert_eq!(parsed_stats.len(), actions.len()); - - // evaluate the predicate on the parsed stats, then convert to selection vector - let skipping_predicate = self.skipping_evaluator.evaluate(&*parsed_stats)?; - assert_eq!(skipping_predicate.len(), actions.len()); + /// Builds the unified schema and extraction expression from separate data stats and partition + /// value inputs. Returns `None` if neither stats nor partition values are available. + /// + /// The caller provides a flat stats schema (e.g. `{ numRecords, minValues: { x } }`) and an + /// expression to extract it from the input batch. This function wraps both under a + /// `stats_parsed` struct field, producing a nested schema like + /// `{ stats_parsed: { numRecords, minValues: { x } } }` and a corresponding + /// `struct_from([stats_expr])` extraction expression. This ensures the unified schema aligns + /// with the `stats_parsed.*` prefixed column references emitted by + /// `DataSkippingPredicateCreator`. Partition values are similarly wrapped under + /// `partitionValues_parsed` when present. 
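+    ///
+    /// As a rough sketch, for a predicate over data column `x` and partition column `p`
+    /// (column names hypothetical), the unified schema has the shape:
+    ///
+    /// ```text
+    /// {
+    ///   stats_parsed: { numRecords, nullCount: { x }, minValues: { x }, maxValues: { x } },
+    ///   partitionValues_parsed: { p },
+    ///   is_add: boolean,
+    /// }
+    /// ```
+    ///
+    /// and the matching extraction expression is `struct_from([stats_expr, partition_expr,
+    /// path IS NOT NULL])`.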
+ fn build_unified_schema_and_expr( + physical_stats_schema: Option<&SchemaRef>, + stats_expr: ExpressionRef, + physical_partition_schema: Option<&SchemaRef>, + partition_expr: ExpressionRef, + ) -> Option<(SchemaRef, ExpressionRef, HashSet)> { + let partition_columns: HashSet = physical_partition_schema + .map(|s| s.fields().map(|f| f.name().to_string()).collect()) + .unwrap_or_default(); + + let stats_field = |stats: &SchemaRef| { + StructField::nullable( + "stats_parsed", + DataType::Struct(Box::new(stats.as_ref().clone())), + ) + }; + let partition_field = |ps: &SchemaRef| { + StructField::nullable( + "partitionValues_parsed", + DataType::Struct(Box::new(ps.as_ref().clone())), + ) + }; + let is_add_field = StructField::not_null("is_add", DataType::BOOLEAN); + + // When partition columns are present, include an `is_add` boolean so that partition + // predicates can guard against filtering Remove rows. Derived from `path IS NOT NULL` + // in the input batch (Add rows have non-null path, Remove/non-file rows have null). + let unified_schema = match (physical_stats_schema, physical_partition_schema) { + (Some(stats), Some(ps)) => Arc::new(StructType::new_unchecked([ + stats_field(stats), + partition_field(ps), + is_add_field, + ])), + (Some(stats), None) => Arc::new(StructType::new_unchecked([stats_field(stats)])), + (None, Some(ps)) => Arc::new(StructType::new_unchecked([ + partition_field(ps), + is_add_field, + ])), + (None, None) => return None, + }; + + let is_add_expr: ExpressionRef = Arc::new(Pred::is_not_null(column_expr!("path")).into()); + + let unified_expr = match ( + physical_stats_schema.is_some(), + physical_partition_schema.is_some(), + ) { + (true, true) => Arc::new(Expr::struct_from([stats_expr, partition_expr, is_add_expr])), + (true, false) => Arc::new(Expr::struct_from([stats_expr])), + (false, true) => Arc::new(Expr::struct_from([partition_expr, is_add_expr])), + (false, false) => return None, + }; + + Some((unified_schema, unified_expr, partition_columns)) + } + + /// Apply the DataSkippingFilter to an EngineData batch. Returns a selection vector + /// which can be applied to the batch to find rows that passed data skipping. 
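+    ///
+    /// A minimal usage sketch (assumes a `filter` built via `DataSkippingFilter::new` and a
+    /// `batch` of file actions already in hand):
+    ///
+    /// ```ignore
+    /// let selection_vector: Vec<bool> = filter.apply(batch.as_ref())?;
+    /// assert_eq!(selection_vector.len(), batch.len());
+    /// // `false` entries mark files whose stats prove they can be skipped;
+    /// // `true` entries must be kept (stats matching or inconclusive).
+    /// ```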
+ pub(crate) fn apply(&self, batch: &dyn EngineData) -> DeltaResult> { + let start_time = Instant::now(); + let batch_len = batch.len(); + + let file_stats = self.stats_evaluator.evaluate(batch)?; + require!( + file_stats.len() == batch_len, + Error::internal_error(format!( + "stats evaluator output length {} != batch length {}", + file_stats.len(), + batch_len + )) + ); + + let skipping_predicate = self.skipping_evaluator.evaluate(&*file_stats)?; + require!( + skipping_predicate.len() == batch_len, + Error::internal_error(format!( + "skipping evaluator output length {} != batch length {}", + skipping_predicate.len(), + batch_len + )) + ); + let selection_vector = self .filter_evaluator .evaluate(skipping_predicate.as_ref())?; - assert_eq!(selection_vector.len(), actions.len()); + debug_assert_eq!(selection_vector.len(), batch_len); + require!( + selection_vector.len() == batch_len, + Error::internal_error(format!( + "filter evaluator output length {} != batch length {}", + selection_vector.len(), + batch_len + )) + ); - // visit the engine's selection vector to produce a Vec let mut visitor = SelectionVectorVisitor::default(); visitor.visit_rows_of(selection_vector.as_ref())?; + + if visitor.num_filtered > 0 { + debug!( + "data skipping filtered {}/{batch_len} rows from batch", + visitor.num_filtered + ); + } + + if let Some(metrics) = self.metrics.as_ref() { + metrics.add_predicate_filtered(visitor.num_filtered); + metrics.add_predicate_eval_time_ns(start_time.elapsed().as_nanos() as u64) + } + Ok(visitor.selection_vector) + } +} - // TODO(zach): add some debug info about data skipping that occurred - // let before_count = actions.length(); - // debug!( - // "number of actions before/after data skipping: {before_count} / {}", - // filtered_actions.num_rows() - // ); +/// Rewrites a predicate for parquet row group skipping in checkpoint/sidecar files. +/// Returns `None` if the predicate is not eligible for data skipping. +/// +/// Adds IS NULL guards on each stat column reference so the parquet RowGroupFilter +/// conservatively keeps row groups containing files with missing stats (null stat values +/// are invisible to footer min/max). For example, `col_a > 100` becomes: +/// ```text +/// OR(maxValues.col_a IS NULL, maxValues.col_a > 100) +/// ``` +/// +/// Partition columns are excluded since their values live in `add.partitionValues_parsed`, +/// not `add.stats_parsed`. +pub(crate) fn as_checkpoint_skipping_predicate( + pred: &Pred, + partition_columns: &[String], +) -> Option { + let partition_columns: HashSet<&str> = partition_columns.iter().map(String::as_str).collect(); + NullGuardedDataSkippingPredicateCreator { partition_columns }.eval(pred) +} + +/// Maps an ordering and inversion flag to the corresponding comparison predicate. +fn comparison_predicate(ord: Ordering, col: Expr, val: &Scalar, inverted: bool) -> Pred { + let pred_fn = match (ord, inverted) { + (Ordering::Less, false) => Pred::lt, + (Ordering::Less, true) => Pred::ge, + (Ordering::Equal, false) => Pred::eq, + (Ordering::Equal, true) => Pred::ne, + (Ordering::Greater, false) => Pred::gt, + (Ordering::Greater, true) => Pred::le, + }; + pred_fn(col, val.clone()) +} + +/// Collects sub-predicates into a junction (AND/OR), replacing unsupported sub-predicates (None) +/// with a single NULL literal to preserve correct three-valued logic. One NULL is enough to +/// produce the correct behavior during predicate evaluation; additional NULLs are redundant. 
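+///
+/// For example (illustrative predicate names), if `b` has no stats-based rewrite then
+/// `AND(a, b, c)` becomes:
+///
+/// ```text
+/// AND(a', NULL, c')
+/// ```
+///
+/// which can still evaluate to FALSE (file skippable) when `a'` or `c'` is FALSE, while the
+/// NULL ensures the unsupported `b` is never by itself the reason a file gets skipped.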
+fn collect_junction_preds( + mut op: JunctionPredicateOp, + preds: &mut dyn Iterator>, + inverted: bool, +) -> Pred { + if inverted { + op = op.invert(); + } + let mut keep_null = true; + let preds: Vec<_> = preds + .flat_map(|p| match p { + Some(pred) => Some(pred), + None => keep_null.then(|| { + keep_null = false; + Pred::null_literal() + }), + }) + .collect(); + Pred::junction(op, preds) +} + +/// Adjusts a comparison value before comparing against a max stat, to account for the Delta +/// protocol allowing timestamp stats to be truncated to millisecond precision (see Per-file +/// Statistics: "Timestamp columns are truncated down to milliseconds"). Truncation floors to +/// the nearest millisecond, so: `stored_max <= actual_max <= stored_max + 999us`. We subtract +/// 999us from the comparison value to avoid incorrectly pruning files whose actual max may be +/// higher than the stored (truncated) max. +/// +/// For example, if a file's actual max is `4_000_500us` (4.000500s), Spark truncates the +/// stored max stat to `4_000_000us` (4.000s). A predicate `ts > 4_000_400` would incorrectly +/// prune this file by comparing against the truncated max. By adjusting the comparison value +/// to `4_000_400 - 999 = 3_999_401`, we ensure the file is kept. +/// +/// Non-timestamp values pass through unchanged. +fn adjust_scalar_for_max_stat_truncation(val: &Scalar) -> Scalar { + match val { + Scalar::Timestamp(micros) => Scalar::Timestamp(micros.saturating_sub(999)), + Scalar::TimestampNtz(micros) => Scalar::TimestampNtz(micros.saturating_sub(999)), + other => other.clone(), } } -struct DataSkippingPredicateCreator; +/// Rewrites user predicates into stats-based predicates for data skipping. +/// +/// For data columns, rewrites to `stats_parsed.minValues.*`/`stats_parsed.maxValues.*`/ +/// `stats_parsed.nullCount.*`. +/// For partition columns, rewrites to `partitionValues_parsed.*` since the partition value is +/// the exact value for every row in the file (serving as both min and max). +struct DataSkippingPredicateCreator<'a> { + /// Physical names of partition columns. For these columns, stats come from + /// `partitionValues.` (exact values) instead of min/max ranges. + partition_columns: &'a HashSet, +} + +impl<'a> DataSkippingPredicateCreator<'a> { + fn new(partition_columns: &'a HashSet) -> Self { + Self { partition_columns } + } + + fn is_partition_column(&self, col: &ColumnName) -> bool { + let path = col.path(); + path.len() == 1 && self.partition_columns.contains(path[0].as_str()) + } + + /// Wraps a partition predicate with `OR(NOT is_add, pred)` to protect Remove rows from + /// being filtered. Remove rows have null add-side partition values, which would cause + /// partition predicates to incorrectly evaluate to false. The `is_add` column (derived + /// from `path IS NOT NULL`) ensures Removes always pass the partition filter. + fn guard_for_removes(&self, pred: Pred) -> Pred { + Pred::or(Pred::not(Pred::from(column_name!("is_add"))), pred) + } +} -impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator { +impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator<'_> { type Output = Pred; type ColumnStat = Expr; - /// Retrieves the minimum value of a column, if it exists and has the requested type. + /// Retrieves the minimum value of a column. For partition columns, returns the exact + /// partition value (which serves as both min and max). 
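// Editorial sketch (not part of this patch): a simplified, string-based model of the stat
// column selection rule described above. It only shows which column a min-stat lookup resolves
// to; the kernel's Expr/Pred types and the OR(NOT is_add, ...) guard applied to partition
// predicates are intentionally omitted from this model.
use std::collections::HashSet;

fn min_stat_column(col: &str, partition_columns: &HashSet<&str>) -> String {
    if partition_columns.contains(col) {
        // Partition values are exact for every row in the file, so the same column
        // serves as both the min and the max stat.
        format!("partitionValues_parsed.{col}")
    } else {
        format!("stats_parsed.minValues.{col}")
    }
}

fn main() {
    let partitions: HashSet<&str> = ["date"].into();
    // Data column: rewritten against the parsed min/max stats.
    assert_eq!(min_stat_column("x", &partitions), "stats_parsed.minValues.x");
    // Partition column: rewritten against the exact parsed partition value.
    assert_eq!(min_stat_column("date", &partitions), "partitionValues_parsed.date");
}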
fn get_min_stat(&self, col: &ColumnName, _data_type: &DataType) -> Option { - Some(joined_column_expr!("minValues", col)) + if self.is_partition_column(col) { + Some(joined_column_expr!("partitionValues_parsed", col)) + } else { + Some(Expr::from(column_name!("stats_parsed.minValues").join(col))) + } + } + + /// Retrieves the maximum value of a column. For partition columns, returns the exact + /// partition value. + fn get_max_stat(&self, col: &ColumnName, _data_type: &DataType) -> Option { + if self.is_partition_column(col) { + Some(joined_column_expr!("partitionValues_parsed", col)) + } else { + Some(Expr::from(column_name!("stats_parsed.maxValues").join(col))) + } } - /// Retrieves the maximum value of a column, if it exists and has the requested type. - // TODO(#1002): we currently don't support file skipping on timestamp columns' max stat since - // they are truncated to milliseconds in add.stats. - fn get_max_stat(&self, col: &ColumnName, data_type: &DataType) -> Option { - match data_type { - &DataType::TIMESTAMP | &DataType::TIMESTAMP_NTZ => None, - _ => Some(joined_column_expr!("maxValues", col)), + /// Compares a column's max stat against a literal value, adjusting for timestamp + /// truncation on non-partition columns. Partition values are exact and not subject to + /// JSON stats truncation, so no adjustment is needed. For data columns, the comparison + /// value is adjusted by [`adjust_scalar_for_max_stat_truncation`]. + fn partial_cmp_max_stat( + &self, + col: &ColumnName, + val: &Scalar, + ord: Ordering, + inverted: bool, + ) -> Option { + let max = self.get_max_stat(col, &val.data_type())?; + if self.is_partition_column(col) { + return self.eval_partial_cmp(ord, max, val, inverted); } + let adjusted = adjust_scalar_for_max_stat_truncation(val); + self.eval_partial_cmp(ord, max, &adjusted, inverted) } - /// Retrieves the null count of a column, if it exists. + /// Retrieves the null count of a column. Partition columns don't have nullCount stats. fn get_nullcount_stat(&self, col: &ColumnName) -> Option { - Some(joined_column_expr!("nullCount", col)) + if self.is_partition_column(col) { + None + } else { + Some(Expr::from(column_name!("stats_parsed.nullCount").join(col))) + } } - /// Retrieves the row count of a column (parquet footers always include this stat). + /// Retrieves the row count statistic. fn get_rowcount_stat(&self) -> Option { - Some(column_expr!("numRecords")) + Some(column_expr!("stats_parsed.numRecords")) } + /// For partition columns, wraps the comparison with `OR(NOT is_add, comparison)` so that + /// Remove rows (which have null add-side partition values) are never filtered. fn eval_partial_cmp( &self, ord: Ordering, @@ -230,15 +486,15 @@ impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator { val: &Scalar, inverted: bool, ) -> Option { - let pred_fn = match (ord, inverted) { - (Ordering::Less, false) => Pred::lt, - (Ordering::Less, true) => Pred::ge, - (Ordering::Equal, false) => Pred::eq, - (Ordering::Equal, true) => Pred::ne, - (Ordering::Greater, false) => Pred::gt, - (Ordering::Greater, true) => Pred::le, - }; - Some(pred_fn(col, val.clone())) + // Detect partition columns by the prefix set in get_min_stat/get_max_stat. 
+ let is_partition = matches!(&col, Expr::Column(name) + if name.path().first().is_some_and(|f| f == "partitionValues_parsed")); + let cmp = comparison_predicate(ord, col, val, inverted); + Some(if is_partition { + self.guard_for_removes(cmp) + } else { + cmp + }) } fn eval_pred_scalar(&self, val: &Scalar, inverted: bool) -> Option { @@ -249,14 +505,25 @@ impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator { KernelPredicateEvaluatorDefaults::eval_pred_scalar_is_null(val, inverted).map(Pred::literal) } - // NOTE: This is nearly identical to the impl for ParquetStatsProvider in - // parquet_stats_skipping.rs, except it uses `Expression` and `Predicate` instead of `Scalar`. + /// For partition columns, checks `partitionValues_parsed. IS [NOT] NULL` directly, + /// wrapped with `OR(NOT is_add, ...)` to protect Remove rows from being filtered. + /// For data columns, uses nullCount stats. fn eval_pred_is_null(&self, col: &ColumnName, inverted: bool) -> Option { - let safe_to_skip = match inverted { - true => self.get_rowcount_stat()?, // all-null - false => Expr::literal(0i64), // no-null - }; - Some(Pred::ne(self.get_nullcount_stat(col)?, safe_to_skip)) + if self.is_partition_column(col) { + let pv_expr = joined_column_expr!("partitionValues_parsed", col); + let pred = if inverted { + Pred::is_not_null(pv_expr) + } else { + Pred::is_null(pv_expr) + }; + Some(self.guard_for_removes(pred)) + } else { + let safe_to_skip = match inverted { + true => self.get_rowcount_stat()?, // all-null + false => Expr::literal(0i64), // no-null + }; + Some(Pred::ne(self.get_nullcount_stat(col)?, safe_to_skip)) + } } fn eval_pred_binary_scalars( @@ -281,29 +548,168 @@ impl DataSkippingPredicateEvaluator for DataSkippingPredicateCreator { fn finish_eval_pred_junction( &self, - mut op: JunctionPredicateOp, + op: JunctionPredicateOp, preds: &mut dyn Iterator>, inverted: bool, ) -> Option { + Some(collect_junction_preds(op, preds, inverted)) + } +} + +/// Like [`DataSkippingPredicateCreator`] but adds IS NULL guards on stat column references +/// for safe parquet row group filtering. Partition columns are excluded since their values +/// live in `add.partitionValues_parsed`, not `add.stats_parsed`. +struct NullGuardedDataSkippingPredicateCreator<'a> { + partition_columns: HashSet<&'a str>, +} + +impl NullGuardedDataSkippingPredicateCreator<'_> { + /// Returns true if the column is a partition column (no stats in `stats_parsed`). + fn is_partition_column(&self, col: &ColumnName) -> bool { + let path = col.path(); + path.len() == 1 && self.partition_columns.contains(path[0].as_str()) + } +} + +impl DataSkippingPredicateEvaluator for NullGuardedDataSkippingPredicateCreator<'_> { + type Output = Pred; + type ColumnStat = Expr; + + // These stat methods produce unprefixed column references (e.g. `minValues.col`) because + // the checkpoint skipping path applies its own `add.stats_parsed` prefix afterward. + // Partition columns return None since their values live in `add.partitionValues_parsed`. 
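// Editorial sketch (not part of this patch): a standalone arithmetic check of the max-stat
// timestamp adjustment used by both creators' partial_cmp_max_stat (mirroring, not reusing,
// adjust_scalar_for_max_stat_truncation). The numbers are the worked example from the doc
// comment above: actual max 4_000_500us, stored (truncated) max 4_000_000us, predicate
// `ts > 4_000_400`.
fn adjust_micros_for_truncated_max(comparison_micros: i64) -> i64 {
    // Stored max stats may be truncated down to the millisecond, so the actual max can exceed
    // the stored max by up to 999us; subtracting 999us from the comparison value compensates.
    comparison_micros.saturating_sub(999)
}

fn main() {
    let stored_max = 4_000_000i64; // truncated stat written for an actual max of 4_000_500us
    let raw = 4_000_400i64; // comparison value from the predicate `ts > 4_000_400`
    // Against the raw value, `maxValues.ts > 4_000_400` is false and the file would be
    // incorrectly pruned even though its actual max (4_000_500) satisfies the predicate.
    assert!(!(stored_max > raw));
    // With the adjusted comparison value the skipping check passes and the file is kept.
    let adjusted = adjust_micros_for_truncated_max(raw);
    assert_eq!(adjusted, 3_999_401);
    assert!(stored_max > adjusted);
}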
+ + fn get_min_stat(&self, col: &ColumnName, _data_type: &DataType) -> Option { + if self.is_partition_column(col) { + return None; + } + Some(joined_column_expr!("minValues", col)) + } + + fn get_max_stat(&self, col: &ColumnName, _data_type: &DataType) -> Option { + if self.is_partition_column(col) { + return None; + } + Some(joined_column_expr!("maxValues", col)) + } + + fn get_nullcount_stat(&self, col: &ColumnName) -> Option { + if self.is_partition_column(col) { + return None; + } + Some(joined_column_expr!("nullCount", col)) + } + + fn get_rowcount_stat(&self) -> Option { + Some(column_expr!("numRecords")) + } + + /// Compares a column's max stat against a literal value, adjusting for timestamp + /// truncation. See [`adjust_scalar_for_max_stat_truncation`]. + /// + /// No partition column guard needed: `get_max_stat` returns `None` for partition columns, + /// so their exact values never reach the adjustment. + fn partial_cmp_max_stat( + &self, + col: &ColumnName, + val: &Scalar, + ord: Ordering, + inverted: bool, + ) -> Option { + let max = self.get_max_stat(col, &val.data_type())?; + let adjusted = adjust_scalar_for_max_stat_truncation(val); + self.eval_partial_cmp(ord, max, &adjusted, inverted) + } + + /// Wraps a stat column comparison with an IS NULL guard. + /// + /// `col > 100` → `OR(maxValues.col IS NULL, maxValues.col > 100)` + /// + /// `col = 100` (calls this twice, once per stat): + /// ```text + /// AND( + /// OR(minValues.col IS NULL, minValues.col <= 100), + /// OR(maxValues.col IS NULL, maxValues.col >= 100) + /// ) + /// ``` + fn eval_partial_cmp( + &self, + ord: Ordering, + col: Expr, + val: &Scalar, + inverted: bool, + ) -> Option { + let comparison = comparison_predicate(ord, col.clone(), val, inverted); + Some(Pred::or(Pred::is_null(col), comparison)) + } + + /// No guard needed — no stat column reference. `TRUE` → `Some(true)`. + fn eval_pred_scalar(&self, val: &Scalar, inverted: bool) -> Option { + KernelPredicateEvaluatorDefaults::eval_pred_scalar(val, inverted).map(Pred::literal) + } + + /// No guard needed — no stat column reference. `NULL IS NULL` → `Some(true)`. + fn eval_pred_scalar_is_null(&self, val: &Scalar, inverted: bool) -> Option { + KernelPredicateEvaluatorDefaults::eval_pred_scalar_is_null(val, inverted).map(Pred::literal) + } + + /// IS NULL guard on nullCount stat. + /// + /// `IS NULL` → `OR(nullCount.col IS NULL, nullCount.col != 0)`: + /// column vs literal — RowGroupFilter can evaluate via footer stats. + /// + /// `IS NOT NULL` → returns `None`. The unguarded version produces + /// `nullCount.col != numRecords`, which is column vs column. The RowGroupFilter can + /// only resolve one column at a time, so it can never prune with this predicate. + // TODO(#1873): IS NOT NULL pruning requires cross-column range comparison in RowGroupFilter. + // Skippable when the nullCount and numRecords ranges don't overlap (e.g. nullCount in + // [0, 0] vs numRecords in [500, 2000] proves all files have non-null values). + fn eval_pred_is_null(&self, col: &ColumnName, inverted: bool) -> Option { if inverted { - op = op.invert(); + return None; // IS NOT NULL: column vs column, can't prune (#1873) } - // NOTE: We can potentially see a LOT of NULL inputs in a big WHERE clause with lots of - // unsupported data skipping operations. We can't "just" flatten them all away for AND, - // because that could produce TRUE where NULL would otherwise be expected. 
Similarly, we - // don't want to "just" try_collect inputs for OR, because that can cause OR to produce NULL - // where FALSE would otherwise be expected. So, we filter out all nulls except the first, - // observing that one NULL is enough to produce the correct behavior during predicate eval. - let mut keep_null = true; - let preds: Vec<_> = preds - .flat_map(|p| match p { - Some(pred) => Some(pred), - None => keep_null.then(|| { - keep_null = false; - Pred::null_literal() - }), - }) - .collect(); - Some(Pred::junction(op, preds)) + let nullcount = self.get_nullcount_stat(col)?; + let comparison = Pred::ne(nullcount.clone(), Expr::literal(0i64)); + Some(Pred::or(Pred::is_null(nullcount), comparison)) + } + + /// No guard needed — no stat column reference. `5 < 10` → `Some(true)`. + fn eval_pred_binary_scalars( + &self, + op: BinaryPredicateOp, + left: &Scalar, + right: &Scalar, + inverted: bool, + ) -> Option { + KernelPredicateEvaluatorDefaults::eval_pred_binary_scalars(op, left, right, inverted) + .map(Pred::literal) + } + + /// Unsupported. Opaque predicates can construct stat column references directly, + /// bypassing IS NULL guards and risking false pruning. Returns `None` to conservatively + /// drop these from the skipping predicate. + fn eval_pred_opaque( + &self, + _op: &OpaquePredicateOpRef, + _exprs: &[Expr], + _inverted: bool, + ) -> Option { + None + } + + /// Combines sub-predicates with AND/OR. `col_a > 100 AND col_b < 50` → + /// ```text + /// AND( + /// OR(maxValues.col_a IS NULL, maxValues.col_a > 100), + /// OR(minValues.col_b IS NULL, minValues.col_b < 50) + /// ) + /// ``` + fn finish_eval_pred_junction( + &self, + op: JunctionPredicateOp, + preds: &mut dyn Iterator>, + inverted: bool, + ) -> Option { + Some(collect_junction_preds(op, preds, inverted)) } } diff --git a/kernel/src/scan/data_skipping/stats_schema/column_filter.rs b/kernel/src/scan/data_skipping/stats_schema/column_filter.rs new file mode 100644 index 0000000000..da8d96d3b7 --- /dev/null +++ b/kernel/src/scan/data_skipping/stats_schema/column_filter.rs @@ -0,0 +1,413 @@ +//! Column filtering logic for statistics based on table properties. +//! +//! This module contains [`StatsColumnFilter`], which determines which columns +//! should have statistics collected based on table configuration. + +use crate::{ + column_trie::ColumnTrie, + schema::{ColumnName, DataType, Schema, StructField}, + table_properties::DataSkippingNumIndexedCols, +}; + +/// Configuration for statistics columns +pub(crate) struct StatsConfig<'a> { + /// Explicit list of columns to collect statistics for. When `Some`, takes precedence over + /// `data_skipping_num_indexed_cols`. + /// See delta.dataSkippingStatsColumns in the Delta protocol for more details. + pub(crate) data_skipping_stats_columns: Option<&'a [ColumnName]>, + /// Maximum number of leaf columns to include. Ignored when `data_skipping_stats_columns` is set. + /// See delta.dataSkippingNumIndexedCols in the Delta protocol for more details. + pub(crate) data_skipping_num_indexed_cols: Option, +} + +/// Handles column filtering logic for statistics based on table properties. +/// +/// Filters columns according to: +/// * `dataSkippingStatsColumns` - explicit list of columns to include (takes precedence) +/// * `dataSkippingNumIndexedCols` - number of leaf columns to include (default 32) +/// * Required columns (e.g. 
clustering columns) - always included per Delta protocol requirements +/// * Requested columns - optional output filter that does not affect column counting +/// +/// Per the Delta protocol, writers MUST write per-file statistics for certain required columns +/// (such as clustering columns), regardless of table property settings. +pub(crate) struct StatsColumnFilter<'col> { + /// Maximum number of leaf columns to include. Set from `delta.dataSkippingNumIndexedCols` + /// table property. `Some` when using column-count-based filtering, `None` when + /// `delta.dataSkippingStatsColumns` is specified (which takes precedence). + n_columns: Option, + /// Counter for leaf columns included so far. Used to enforce the `n_columns` limit. + added_columns: u64, + /// Trie built from `StatsConfig::data_skipping_stats_columns` for O(path_length) prefix + /// matching. `Some` when using explicit column list, `None` when using the + /// `StatsConfig::data_skipping_num_indexed_cols` count-based approach. + data_skipping_stats_trie: Option>, + /// Trie built from required columns (e.g. clustering columns) for O(path_length) lookup + /// during traversal. Used by `should_include_for_table()` to allow required columns past + /// the limit. + required_trie: Option>, + /// Required columns (e.g. clustering columns) to add after the main traversal in + /// `collect_columns()`. Only set when using `delta.dataSkippingNumIndexedCols` (when using + /// `delta.dataSkippingStatsColumns`, required columns are merged into + /// `data_skipping_stats_trie`). + required_columns: Option<&'col [ColumnName]>, + /// Trie built from requested columns for O(path_length) lookup. When `Some`, only columns + /// matching this trie are included in the output. This filter does not affect column + /// counting — it is applied after the table-level inclusion decision. + requested_trie: Option>, + /// Current path during schema traversal. Pushed on field entry, popped on exit. + path: Vec, +} + +impl<'col> StatsColumnFilter<'col> { + /// Creates a new StatsColumnFilter with optional required and requested columns. + /// + /// Required columns (e.g. clustering columns) are always included in statistics, even when + /// `dataSkippingStatsColumns` or `dataSkippingNumIndexedCols` would otherwise exclude them. + /// + /// Requested columns optionally filter the output without affecting column counting. When + /// `Some`, only columns matching the requested set are included in the final output. + pub(crate) fn new( + config: &StatsConfig<'col>, + required_columns: Option<&'col [ColumnName]>, + requested_columns: Option<&'col [ColumnName]>, + ) -> Self { + let requested_trie = requested_columns + .filter(|cols| !cols.is_empty()) + .map(ColumnTrie::from_columns); + + // If data_skipping_stats_columns is specified, it takes precedence + // over data_skipping_num_indexed_cols, even if that is also specified. 
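// Editorial sketch (not part of this patch): a standalone illustration of the precedence rule
// stated in the comment above, using plain types rather than the kernel's StatsConfig. When an
// explicit stats-column list is configured it wins outright; the indexed-column count is only
// consulted when no explicit list is set, falling back to the documented default of 32.
fn effective_mode(stats_columns: Option<&[&str]>, num_indexed_cols: Option<u64>) -> String {
    match (stats_columns, num_indexed_cols) {
        (Some(cols), _) => format!("explicit column list: {cols:?}"),
        (None, Some(n)) => format!("first {n} leaf columns"),
        (None, None) => "first 32 leaf columns (default)".to_string(),
    }
}

fn main() {
    // Both properties set: the explicit list wins and the numeric limit is ignored.
    assert_eq!(
        effective_mode(Some(&["id", "user.age"]), Some(1)),
        r#"explicit column list: ["id", "user.age"]"#
    );
    assert_eq!(effective_mode(None, Some(2)), "first 2 leaf columns");
}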
+ if let Some(column_names) = config.data_skipping_stats_columns { + let mut combined_trie = ColumnTrie::from_columns(column_names); + + // Add required columns to the trie so they're included during traversal + if let Some(required_cols) = required_columns { + for col in required_cols { + let col_path: Vec = col.iter().map(|s| s.to_string()).collect(); + if !combined_trie.contains_prefix_of(&col_path) { + tracing::warn!( + "Required column '{}' not in dataSkippingStatsColumns; adding anyway", + col + ); + } + combined_trie.insert(col); + } + } + + Self { + n_columns: None, + added_columns: 0, + data_skipping_stats_trie: Some(combined_trie), + required_trie: None, // Already in data_skipping_stats_trie + required_columns: None, // Already added to trie + requested_trie, + path: Vec::new(), + } + } else { + let n_cols = config.data_skipping_num_indexed_cols.unwrap_or_default(); + let required_trie = required_columns.map(ColumnTrie::from_columns); + Self { + n_columns: Some(n_cols), + added_columns: 0, + data_skipping_stats_trie: None, + required_trie, + required_columns, // Will be handled in Pass 2 of collect_columns() + requested_trie, + path: Vec::new(), + } + } + } + + // ==================== Public API ==================== + + /// Collects logical column names that should have statistics. + /// + /// Traversal is done in two passes: + /// 1. Pass 1: Traverse schema to collect columns up to the limit + /// 2. Pass 2: Directly look up required columns not already included + pub(crate) fn collect_columns(&mut self, schema: &Schema, result: &mut Vec) { + // Pass 1: Collect columns according to table properties + for field in schema.fields() { + self.collect_field(field, result); + } + + // Pass 2: Add required columns not already included + // Uses O(n) contains check, but required columns are typically few (1-4) + if let Some(required_cols) = self.required_columns { + for col in required_cols { + if result.contains(col) { + continue; + } + // Verify the required column exists in schema before adding + if schema.walk_column_fields(col).is_ok() { + tracing::warn!( + "Required column '{}' exceeds dataSkippingNumIndexedCols limit; \ + adding anyway", + col + ); + result.push(col.clone()); + } else { + tracing::warn!( + "Required column '{}' not found in table schema; skipping", + col + ); + } + } + } + } + + // ==================== BaseStatsTransform Integration ==================== + // These methods are used by BaseStatsTransform during schema traversal. + + /// Returns true if the column limit has been reached. + pub(crate) fn at_column_limit(&self) -> bool { + matches!( + self.n_columns, + Some(DataSkippingNumIndexedCols::NumColumns(n)) if self.added_columns >= n + ) + } + + /// Returns true if the current path should be included based on table-level filtering config. + /// Required columns (e.g. clustering columns) are always included, even past the column limit. + pub(crate) fn should_include_for_table(&self) -> bool { + match &self.data_skipping_stats_trie { + // In explicit dataSkippingStatsColumns mode, include exactly columns selected by the trie. + // Required columns are already merged into the trie during initialization. + Some(trie) => trie.contains_prefix_of(&self.path), + // In count-based mode, include until limit; required columns can exceed the limit. + None => !self.at_column_limit() || self.is_required_column(), + } + } + + /// Returns true if the current path should be included based on the requested columns + /// filter. 
When no requested columns are set, all columns pass this check. + pub(crate) fn should_include_for_requested(&self) -> bool { + self.requested_trie + .as_ref() + .map(|trie| trie.contains_prefix_of(&self.path)) + .unwrap_or(true) + } + + /// Returns true if the current path is a required column (e.g. clustering column). + fn is_required_column(&self) -> bool { + self.required_trie + .as_ref() + .is_some_and(|trie| trie.contains_prefix_of(&self.path)) + } + + /// Enters a field path for filtering decisions. + pub(crate) fn enter_field(&mut self, name: &str) { + self.path.push(name.to_string()); + } + + /// Exits the current field path. + pub(crate) fn exit_field(&mut self) { + self.path.pop(); + } + + /// Records that a leaf column was included. + pub(crate) fn record_included(&mut self) { + self.added_columns += 1; + } + + // ==================== Internal Helpers ==================== + + /// Pass 1: Collect columns up to the limit, stopping when limit is reached. + fn collect_field(&mut self, field: &StructField, result: &mut Vec) { + // Stop traversal once we've hit the column limit + // Required columns will be added in Pass 2 + if self.at_column_limit() { + return; + } + + self.path.push(field.name.clone()); + + match field.data_type() { + DataType::Struct(struct_type) => { + for child in struct_type.fields() { + self.collect_field(child, result); + } + } + // Map, Array, and Variant types are not eligible for statistics collection. + DataType::Map(_) | DataType::Array(_) | DataType::Variant(_) => {} + _ => { + if self.should_include_for_table() { + result.push(ColumnName::new(&self.path)); + self.added_columns += 1; + } + } + } + + self.path.pop(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{schema::StructType, table_properties::TableProperties}; + + fn make_props_with_num_cols(n: u64) -> TableProperties { + [( + "delta.dataSkippingNumIndexedCols".to_string(), + n.to_string(), + )] + .into() + } + + fn make_props_with_stats_cols(cols: &str) -> TableProperties { + [( + "delta.dataSkippingStatsColumns".to_string(), + cols.to_string(), + )] + .into() + } + + /// Standard 3-column schema for required column tests: a (LONG), b (STRING), c (INTEGER) + fn abc_schema() -> StructType { + StructType::new_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::STRING), + StructField::nullable("c", DataType::INTEGER), + ]) + } + + /// Helper to run column collection and return results + fn collect_stats_columns( + props: &TableProperties, + required_cols: Option<&[ColumnName]>, + schema: &Schema, + ) -> Vec { + let config = StatsConfig { + data_skipping_stats_columns: props.data_skipping_stats_columns.as_deref(), + data_skipping_num_indexed_cols: props.data_skipping_num_indexed_cols, + }; + let mut filter = StatsColumnFilter::new(&config, required_cols, None); + let mut columns = Vec::new(); + filter.collect_columns(schema, &mut columns); + columns + } + + // ==================== Required column tests ==================== + + #[rstest::rstest] + #[case::required_overrides_limit( + 1, // num_indexed_cols limit + vec!["c"], // required columns (3rd column) + vec!["a", "c"] // expected: "a" (within limit) + "c" (required) + )] + #[case::no_required_uses_limit( + 2, // num_indexed_cols limit + vec![], // no required columns + vec!["a", "b"] // expected: first 2 columns within limit + )] + fn test_required_with_num_indexed_cols( + #[case] num_cols: u64, + #[case] required: Vec<&str>, + #[case] expected: Vec<&str>, + ) { + let props = 
make_props_with_num_cols(num_cols); + let required_cols: Vec = + required.iter().map(|c| ColumnName::new([*c])).collect(); + let required_ref = if required_cols.is_empty() { + None + } else { + Some(required_cols.as_slice()) + }; + let schema = abc_schema(); + + let columns = collect_stats_columns(&props, required_ref, &schema); + + let expected_cols: Vec = + expected.iter().map(|c| ColumnName::new([*c])).collect(); + assert_eq!(columns, expected_cols); + } + + #[rstest::rstest] + #[case::required_added_to_stats( + "a", // stats columns + vec!["c"], // required columns + vec!["a", "c"] // expected: "a" (explicit) + "c" (required) + )] + #[case::required_already_in_stats( + "a,b", // stats columns include required col + vec!["a"], // required columns (already in stats) + vec!["a", "b"] // expected: no duplicates + )] + fn test_required_with_stats_columns( + #[case] stats_cols: &str, + #[case] required: Vec<&str>, + #[case] expected: Vec<&str>, + ) { + let props = make_props_with_stats_cols(stats_cols); + let required_cols: Vec = + required.iter().map(|c| ColumnName::new([*c])).collect(); + let schema = abc_schema(); + + let columns = collect_stats_columns(&props, Some(&required_cols), &schema); + + let expected_cols: Vec = + expected.iter().map(|c| ColumnName::new([*c])).collect(); + assert_eq!(columns, expected_cols); + } + + #[test] + fn test_nested_required_column_with_limit() { + // Test that nested required columns are found even with a column limit. + let props = make_props_with_num_cols(2); + + // Required column is deeply nested: user.address.city + let required_cols = vec![ColumnName::new(["user", "address", "city"])]; + + let address_struct = StructType::new_unchecked([ + StructField::nullable("street", DataType::STRING), + StructField::nullable("city", DataType::STRING), // required column + StructField::nullable("zip", DataType::STRING), + ]); + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("address", DataType::Struct(Box::new(address_struct))), + ]); + let other_struct = StructType::new_unchecked([ + StructField::nullable("foo", DataType::STRING), + StructField::nullable("bar", DataType::STRING), + ]); + + let schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + StructField::nullable("user", DataType::Struct(Box::new(user_struct))), + StructField::nullable("other", DataType::Struct(Box::new(other_struct))), + StructField::nullable("extra1", DataType::STRING), + StructField::nullable("extra2", DataType::STRING), + ]); + + let columns = collect_stats_columns(&props, Some(&required_cols), &schema); + + // Should include: id, name (first 2 within limit) + user.address.city (required) + assert_eq!( + columns, + vec![ + ColumnName::new(["id"]), + ColumnName::new(["name"]), + ColumnName::new(["user", "address", "city"]), + ] + ); + } + + #[test] + fn test_required_column_not_in_schema() { + // Required column that doesn't exist in schema should be silently ignored + let props = make_props_with_num_cols(2); + let required_cols = vec![ColumnName::new(["nonexistent", "column"])]; + let schema = abc_schema(); + + let columns = collect_stats_columns(&props, Some(&required_cols), &schema); + + // Should only include normal columns, required column not found + assert_eq!( + columns, + vec![ColumnName::new(["a"]), ColumnName::new(["b"]),] + ); + } +} diff --git a/kernel/src/scan/data_skipping/stats_schema/mod.rs 
b/kernel/src/scan/data_skipping/stats_schema/mod.rs new file mode 100644 index 0000000000..d52199419d --- /dev/null +++ b/kernel/src/scan/data_skipping/stats_schema/mod.rs @@ -0,0 +1,1267 @@ +//! This module contains logic to compute the expected schema for file statistics + +mod column_filter; + +use std::borrow::Cow; +use std::sync::Arc; + +use crate::schema::{ + ArrayType, ColumnName, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField, + StructType, +}; +use crate::transforms::SchemaTransform; +use crate::{DeltaResult, Error}; + +use column_filter::StatsColumnFilter; +pub(crate) use column_filter::StatsConfig; + +/// Generates the expected schema for file statistics. +/// +/// The base stats schema is dependent on the current table configuration and derived via: +/// - only fields present in data files are included (use physical names, no partition columns) +/// - if the table property `delta.dataSkippingStatsColumns` is set, include only those columns. +/// Column names may refer to struct fields in which case all child fields are included. +/// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included. +/// - all fields are made nullable. +/// +/// The `nullCount` struct field is a nested structure mirroring the table's column hierarchy. +/// It tracks the count of null values for each column. All leaf fields from the base schema +/// are converted to LONG type (since null counts are always integers). +/// +/// Note: Map, Array, and Variant types are excluded from statistics entirely (including +/// `nullCount`) as they are not eligible for data skipping. The `nullCount` schema includes +/// primitive types that aren't eligible for min/max (e.g., Boolean, Binary) since null counts +/// are still meaningful for those types. +/// +/// The `minValues`/`maxValues` struct fields are also nested structures mirroring the table's +/// column hierarchy. They additionally filter out leaf fields with non-eligible data types +/// (e.g., Boolean, Binary) via [`is_skipping_eligible_datatype`]. +/// +/// ## Stats value rules +/// +/// Statistics returned to kernel must follow these rules: +/// +/// - `numRecords`: the total number of rows in the file. +/// - `nullCount`: the number of null values in the column. Always present for included columns. +/// - `minValues`/`maxValues`: the smallest/largest non-null value in the column. When a column +/// contains only null values, there are no non-null values to aggregate, so the column has no +/// entry in `minValues`/`maxValues`. The `nullCount` entry is still present and equals +/// `numRecords`. +/// - String min/max values must be truncated to a prefix no longer than 32 characters. For min +/// values, simple prefix truncation is valid (the truncated value is always <= the original). +/// For max values, a tie-breaker character must be appended after truncation to ensure the +/// result is >= all actual values: ASCII DEL (0x7F) when the truncated character is ASCII, +/// or U+10FFFF otherwise. If a valid truncation point cannot be found within 64 characters, +/// the max value is omitted (returning `None`). +/// - Binary min/max values are not collected (Binary is not eligible for data skipping). +/// - Boolean values are not eligible for min/max statistics but do have `nullCount`. +/// +/// The `tightBounds` field is a boolean indicating whether the min/max statistics are "tight" +/// (accurate) or "wide" (potentially outdated). 
When `tightBounds` is `true`, the statistics +/// accurately reflect the data in the file. When `false`, the file may have deletion vectors +/// and the statistics haven't been recomputed to exclude deleted rows. +/// +/// See the Delta protocol for more details on statistics: +/// +/// +/// The overall schema is then: +/// ```ignored +/// { +/// numRecords: long, +/// nullCount: , +/// minValues: , +/// maxValues: , +/// tightBounds: boolean, +/// } +/// ``` +/// +/// For a table with physical schema: +/// +/// ```ignore +/// { +/// id: long, +/// user: { +/// name: string, +/// age: integer, +/// }, +/// } +/// ``` +/// +/// the expected stats schema would be: +/// ```ignore +/// { +/// numRecords: long, +/// nullCount: { +/// id: long, +/// user: { +/// name: long, +/// age: long, +/// }, +/// }, +/// minValues: { +/// id: long, +/// user: { +/// name: string, +/// age: integer, +/// }, +/// }, +/// maxValues: { +/// id: long, +/// user: { +/// name: string, +/// age: integer, +/// }, +/// }, +/// tightBounds: boolean, +/// } +/// ``` +/// Generates the expected schema for file statistics. +/// +/// All inputs (schema, config, and column names) must use the same column naming +/// mode -- either all physical or all logical. The output uses the same naming mode. +/// +/// # Parameters +/// +/// - `data_schema`: The table's data schema (partition columns excluded). +/// - `config`: Stats configuration controlling which columns are included. +/// - `required_columns`: Columns that must always be included in statistics (write path). +/// Per the Delta protocol, clustering columns must have statistics regardless of table +/// property settings. +/// - `requested_columns`: Filter output to only these columns (read path). If specified, +/// only columns that also pass the `config` filtering will be included. +#[allow(unused)] +pub(crate) fn expected_stats_schema( + data_schema: &Schema, + config: &StatsConfig<'_>, + required_columns: Option<&[ColumnName]>, + requested_columns: Option<&[ColumnName]>, +) -> DeltaResult { + let mut fields = Vec::with_capacity(5); + fields.push(StructField::nullable("numRecords", DataType::LONG)); + + // generate the base stats schema: + // - make all fields nullable + // - include fields according to table properties (num_indexed_cols, stats_columns, ...) + // - always include required columns (e.g. 
clustering columns, per Delta protocol) + // - optionally filter output to only requested columns + let mut base_transform = BaseStatsTransform::new(config, required_columns, requested_columns); + if let Some(base_schema) = base_transform.transform_struct(data_schema) { + let base_schema = base_schema.into_owned(); + + // convert all leaf fields to data type LONG for null count + let mut null_count_transform = NullCountStatsTransform; + if let Some(null_count_schema) = null_count_transform.transform_struct(&base_schema) { + fields.push(StructField::nullable( + "nullCount", + null_count_schema.into_owned(), + )); + }; + + // include only min/max skipping eligible fields (data types) + let mut min_max_transform = MinMaxStatsTransform; + if let Some(min_max_schema) = min_max_transform.transform_struct(&base_schema) { + let min_max_schema = min_max_schema.into_owned(); + fields.push(StructField::nullable("minValues", min_max_schema.clone())); + fields.push(StructField::nullable("maxValues", min_max_schema)); + } + } + + // tightBounds indicates whether min/max statistics are accurate (true) or potentially + // outdated due to deletion vectors (false) + fields.push(StructField::nullable("tightBounds", DataType::BOOLEAN)); + + StructType::try_new(fields) +} + +/// Returns the column names that should have statistics collected. +/// +/// This extracts just the column names without building the full stats schema, +/// making it more efficient when only the column list is needed. +/// +/// Per the Delta protocol, required columns (e.g. clustering columns) are always included in +/// statistics, regardless of the `delta.dataSkippingStatsColumns` or +/// `delta.dataSkippingNumIndexedCols` settings. +#[allow(unused)] +pub(crate) fn stats_column_names( + data_schema: &Schema, + config: &StatsConfig<'_>, + required_columns: Option<&[ColumnName]>, +) -> Vec { + let mut filter = StatsColumnFilter::new(config, required_columns, None); + let mut columns = Vec::new(); + filter.collect_columns(data_schema, &mut columns); + columns +} + +/// Creates a stats schema from a referenced schema (e.g. columns from a predicate). +/// Returns schema: `{ numRecords, nullCount, minValues, maxValues }` +/// +/// This is used to build the schema for parsing JSON stats and for reading stats_parsed +/// from checkpoints when only a subset of columns is needed (e.g. predicate-referenced columns). +pub(crate) fn build_stats_schema(referenced_schema: &StructType) -> Option { + let stats_schema = schema_with_all_fields_nullable(referenced_schema).ok()?; + + let nullcount_schema = NullCountStatsTransform + .transform_struct(&stats_schema)? + .into_owned(); + + let schema = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("nullCount", nullcount_schema), + StructField::nullable("minValues", stats_schema.clone()), + StructField::nullable("maxValues", stats_schema), + ]); + + // Strip field metadata. The stats types are derived from the table schema, but the metadata on + // the fields should not be included in the stats fields + let schema = StripFieldMetadataTransform + .transform_struct(&schema) + .map(|s| s.into_owned()) + .unwrap_or(schema); + + Some(Arc::new(schema)) +} + +/// Strips all field metadata from a schema. +/// +/// Field metadata describes the logical table column, not the stats values themselves. 
This +/// transform strips that metadata, and must be applied to stats schemas to avoid schema possible +/// mismatches when reading `stats_parsed` from older data since that field metadata could have +/// changed. +pub(crate) struct StripFieldMetadataTransform; +impl<'a> SchemaTransform<'a> for StripFieldMetadataTransform { + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + Some(match self.transform(&field.data_type)? { + Cow::Borrowed(_) if field.metadata.is_empty() => Cow::Borrowed(field), + data_type => Cow::Owned(StructField { + name: field.name.clone(), + data_type: data_type.into_owned(), + nullable: field.is_nullable(), + metadata: Default::default(), + }), + }) + } +} + +/// Make all fields of a schema nullable. +/// Used for stats schemas where stats may not be available for all columns. +pub(crate) fn schema_with_all_fields_nullable(schema: &Schema) -> DeltaResult { + match NullableStatsTransform.transform_struct(schema) { + Some(schema) => Ok(schema.into_owned()), + None => Err(Error::internal_error("NullableStatsTransform failed")), + } +} + +/// Transforms a schema to make all fields nullable. +/// Used for stats schemas where stats may not be available for all columns. +pub(crate) struct NullableStatsTransform; +impl<'a> SchemaTransform<'a> for NullableStatsTransform { + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + let data_type = self.transform(&field.data_type)?; + Some(make_nullable_field(field, data_type)) + } +} + +// helper used by both NullableStatsTransform and BaseStatsTransform +fn make_nullable_field<'a>( + field: &'a StructField, + data_type: Cow<'a, DataType>, +) -> Cow<'a, StructField> { + match data_type { + Cow::Borrowed(_) if field.is_nullable() => Cow::Borrowed(field), + data_type => Cow::Owned(StructField { + name: field.name.clone(), + data_type: data_type.into_owned(), + nullable: true, + metadata: field.metadata.clone(), + }), + } +} + +/// Converts a stats schema into a nullCount schema where all leaf fields become LONG. +/// +/// The nullCount struct field tracks the number of null values for each column. +/// All leaf fields (primitives, arrays, maps, variants) are converted to LONG type +/// since null counts are always integers, while struct fields are recursed into +/// to preserve the nested structure. Field metadata (including column mapping info) +/// is preserved for all fields. +pub(crate) struct NullCountStatsTransform; +impl<'a> SchemaTransform<'a> for NullCountStatsTransform { + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + // Only recurse into struct fields; convert all other types (leaf fields) to LONG + match &field.data_type { + DataType::Struct(_) => self.recurse_into_struct_field(field), + _ => Some(Cow::Owned(StructField { + name: field.name.clone(), + data_type: DataType::LONG, + nullable: true, + metadata: field.metadata.clone(), + })), + } + } +} + +/// Transforms a table schema into a base stats schema. +/// +/// Base stats schema in this case refers the subsets of fields in the table schema +/// that may be considered for stats collection. Depending on the type of stats - min/max/nullcount/... - +/// additional transformations may be applied. +/// +/// All fields in the output are nullable. Clustering columns are always included per +/// the Delta protocol. 
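// Editorial sketch (not part of this patch): a standalone model of the leaf-type rule that the
// NullCountStatsTransform above applies to the base stats schema. `Type` here is an invented
// stand-in for the kernel's DataType: struct fields keep their nested shape, while every leaf
// field becomes LONG, because a null count is an integer regardless of the column's own type.
#[derive(Debug, PartialEq)]
enum Type {
    Long,
    String,
    Struct(Vec<(String, Type)>),
}

fn to_null_count_type(t: &Type) -> Type {
    match t {
        // Recurse into structs to preserve the column hierarchy.
        Type::Struct(fields) => Type::Struct(
            fields
                .iter()
                .map(|(name, field_type)| (name.clone(), to_null_count_type(field_type)))
                .collect(),
        ),
        // Every leaf becomes LONG.
        _ => Type::Long,
    }
}

fn main() {
    let user = Type::Struct(vec![("name".into(), Type::String)]);
    let expected = Type::Struct(vec![("name".into(), Type::Long)]);
    assert_eq!(to_null_count_type(&user), expected);
}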
+#[allow(unused)] +struct BaseStatsTransform<'col> { + filter: StatsColumnFilter<'col>, +} + +impl<'col> BaseStatsTransform<'col> { + #[allow(unused)] + fn new( + config: &StatsConfig<'col>, + required_columns: Option<&'col [ColumnName]>, + requested_columns: Option<&'col [ColumnName]>, + ) -> Self { + Self { + filter: StatsColumnFilter::new(config, required_columns, requested_columns), + } + } +} + +impl<'a> SchemaTransform<'a> for BaseStatsTransform<'_> { + // Always traverse struct fields -- only primitive leaf values count against the column limit + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + self.filter.enter_field(field.name()); + let data_type = self.transform(&field.data_type); + self.filter.exit_field(); + Some(make_nullable_field(field, data_type?)) + } + + fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { + if !self.filter.should_include_for_table() { + return None; + } + + // The n_columns limit is based on schema order, so we count all leaf columns that pass the + // table filter, but then we only generate stats for requested columns. + self.filter.record_included(); + self.filter + .should_include_for_requested() + .then_some(Cow::Borrowed(ptype)) + } + + fn transform_array(&mut self, _: &'a ArrayType) -> Option> { + None // not stats-eligible + } + + fn transform_map(&mut self, _: &'a MapType) -> Option> { + None // not stats-eligible + } + + fn transform_variant(&mut self, _: &'a StructType) -> Option> { + None // not stats-eligible + } +} + +// removes all fields with non eligible data types +// +// should only be applied to schema processed via `BaseStatsTransform`. +#[allow(unused)] +struct MinMaxStatsTransform; + +impl<'a> SchemaTransform<'a> for MinMaxStatsTransform { + // Array, Map, and Variant fields are filtered out by BaseStatsTransform, so these methods + // are typically not called. They're kept as a safety net in case the transform is used + // independently or the filtering logic changes. + fn transform_array(&mut self, _: &'a ArrayType) -> Option> { + None + } + fn transform_map(&mut self, _: &'a MapType) -> Option> { + None + } + fn transform_variant(&mut self, _: &'a StructType) -> Option> { + None + } + + fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { + is_skipping_eligible_datatype(ptype).then_some(Cow::Borrowed(ptype)) + } +} + +/// Checks if a data type is eligible for min/max file skipping. +/// +/// This is also used to validate clustering column types, since clustering requires +/// per-file statistics on clustering columns. +/// +/// Note: Boolean and Binary are intentionally excluded as min/max statistics provide minimal +/// skipping benefit for low-cardinality or opaque data types. 
+/// +/// See: +pub(crate) fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool { + matches!( + data_type, + &PrimitiveType::Byte + | &PrimitiveType::Short + | &PrimitiveType::Integer + | &PrimitiveType::Long + | &PrimitiveType::Float + | &PrimitiveType::Double + | &PrimitiveType::Date + | &PrimitiveType::Timestamp + | &PrimitiveType::TimestampNtz + | &PrimitiveType::String + | PrimitiveType::Decimal(_) + ) +} + +#[cfg(test)] +mod tests { + use crate::schema::ArrayType; + use crate::table_properties::TableProperties; + + use super::*; + + fn stats_config_from_table_properties(properties: &TableProperties) -> StatsConfig<'_> { + StatsConfig { + data_skipping_stats_columns: properties.data_skipping_stats_columns.as_deref(), + data_skipping_num_indexed_cols: properties.data_skipping_num_indexed_cols, + } + } + + /// Builds an expected stats schema from the given null count and min/max nested schemas. + fn expected_stats(null_count: StructType, min_max: StructType) -> StructType { + StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("nullCount", null_count), + StructField::nullable("minValues", min_max.clone()), + StructField::nullable("maxValues", min_max), + StructField::nullable("tightBounds", DataType::BOOLEAN), + ]) + } + + #[test] + fn test_stats_schema_simple() { + let properties: TableProperties = [("key", "value")].into(); + let file_schema = StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + let expected = expected_stats(file_schema.clone(), file_schema); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_nested() { + let properties: TableProperties = [("key", "value")].into(); + + let user_struct = StructType::new_unchecked([ + StructField::not_null("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let file_schema = StructType::new_unchecked([ + StructField::not_null("id", DataType::LONG), + StructField::not_null("user", DataType::Struct(Box::new(user_struct.clone()))), + ]); + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // Expected result: The stats schema should maintain the nested structure + // but make all fields nullable + let expected_min_max = NullableStatsTransform + .transform_struct(&file_schema) + .unwrap() + .into_owned(); + let null_count = NullCountStatsTransform + .transform_struct(&expected_min_max) + .unwrap() + .into_owned(); + + let expected = expected_stats(null_count, expected_min_max); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_with_non_eligible_field() { + let properties: TableProperties = [("key", "value")].into(); + + // Create a nested logical schema with: + // - top-level field "id" (LONG) - eligible for data skipping + // - nested struct "metadata" containing: + // - "name" (STRING) - eligible for data skipping + // - "tags" (ARRAY) - NOT eligible for data skipping + // - "score" (DOUBLE) - eligible for data skipping + + // Create array type for a field that's not eligible for data skipping + let array_type = DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))); + let metadata_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("tags", 
array_type), + StructField::nullable("score", DataType::DOUBLE), + ]); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable( + "metadata", + DataType::Struct(Box::new(metadata_struct.clone())), + ), + ]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // nullCount excludes array fields (tags) - only eligible primitive types + let expected_null_nested = StructType::new_unchecked([ + StructField::nullable("name", DataType::LONG), + StructField::nullable("score", DataType::LONG), + ]); + let expected_null = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("metadata", DataType::Struct(Box::new(expected_null_nested))), + ]); + + let expected_nested = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("score", DataType::DOUBLE), + ]); + let expected_fields = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("metadata", DataType::Struct(Box::new(expected_nested))), + ]); + + let expected = expected_stats(expected_null, expected_fields); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_col_names() { + let properties: TableProperties = [( + "delta.dataSkippingStatsColumns".to_string(), + "`user.info`.name".to_string(), + )] + .into(); + + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user.info", DataType::Struct(Box::new(user_struct.clone()))), + ]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + let expected_nested = + StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let expected_fields = StructType::new_unchecked([StructField::nullable( + "user.info", + DataType::Struct(Box::new(expected_nested)), + )]); + let null_count = NullCountStatsTransform + .transform_struct(&expected_fields) + .unwrap() + .into_owned(); + + let expected = expected_stats(null_count, expected_fields); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_n_cols() { + let properties: TableProperties = [( + "delta.dataSkippingNumIndexedCols".to_string(), + "1".to_string(), + )] + .into(); + + let logical_schema = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + + let stats_schema = expected_stats_schema( + &logical_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + let expected_fields = + StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let null_count = NullCountStatsTransform + .transform_struct(&expected_fields) + .unwrap() + .into_owned(); + + let expected = expected_stats(null_count, expected_fields); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_different_fields_in_null_vs_minmax() { + let properties: TableProperties = [("key", "value")].into(); + + // Create a schema with fields that have different eligibility for min/max vs null count + // - "id" (LONG) - eligible for both null count and 
min/max + // - "is_active" (BOOLEAN) - eligible for null count but NOT for min/max + // - "metadata" (BINARY) - eligible for null count but NOT for min/max + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("is_active", DataType::BOOLEAN), + StructField::nullable("metadata", DataType::BINARY), + ]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // Expected nullCount schema: all fields converted to LONG + let expected_null_count = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("is_active", DataType::LONG), + StructField::nullable("metadata", DataType::LONG), + ]); + + // Expected minValues/maxValues schema: only eligible fields (no boolean, no binary) + let expected_min_max = + StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]); + + let expected = expected_stats(expected_null_count, expected_min_max); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_nested_different_fields_in_null_vs_minmax() { + let properties: TableProperties = [("key", "value")].into(); + + // Create a nested schema where some nested fields are eligible for min/max and others aren't + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), // eligible for min/max + StructField::nullable("is_admin", DataType::BOOLEAN), // NOT eligible for min/max + StructField::nullable("age", DataType::INTEGER), // eligible for min/max + StructField::nullable("profile_pic", DataType::BINARY), // NOT eligible for min/max + ]); + + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(user_struct.clone()))), + StructField::nullable("is_deleted", DataType::BOOLEAN), // NOT eligible for min/max + ]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // Expected nullCount schema: all fields converted to LONG, maintaining structure + let expected_null_user = StructType::new_unchecked([ + StructField::nullable("name", DataType::LONG), + StructField::nullable("is_admin", DataType::LONG), + StructField::nullable("age", DataType::LONG), + StructField::nullable("profile_pic", DataType::LONG), + ]); + let expected_null_count = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(expected_null_user))), + StructField::nullable("is_deleted", DataType::LONG), + ]); + + // Expected minValues/maxValues schema: only eligible fields + let expected_minmax_user = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let expected_min_max = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(expected_minmax_user))), + ]); + + let expected = expected_stats(expected_null_count, expected_min_max); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_only_non_eligible_fields() { + let properties: TableProperties = [("key", "value")].into(); + + // Create a schema with only fields that are NOT eligible for min/max skipping + let file_schema = StructType::new_unchecked([ + 
StructField::nullable("is_active", DataType::BOOLEAN), + StructField::nullable("metadata", DataType::BINARY), + StructField::nullable( + "tags", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), + ), + ]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // nullCount includes boolean and binary (primitives) but excludes array + let expected_null_count = StructType::new_unchecked([ + StructField::nullable("is_active", DataType::LONG), + StructField::nullable("metadata", DataType::LONG), + ]); + + // minValues/maxValues: no fields are eligible (boolean/binary excluded) + let expected = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("nullCount", expected_null_count), + StructField::nullable("tightBounds", DataType::BOOLEAN), + ]); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_stats_schema_map_array_dont_count_against_limit() { + // Test that Map and Array fields don't count against the column limit. + // With a limit of 2, if we have: array, map, col1, col2, col3 + // We should get stats for col1 and col2 (the first 2 eligible columns), + // not be limited by the array and map fields. + let properties: TableProperties = [( + "delta.dataSkippingNumIndexedCols".to_string(), + "2".to_string(), + )] + .into(); + + let file_schema = StructType::new_unchecked([ + StructField::nullable( + "tags", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), + ), + StructField::nullable( + "metadata", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::STRING, + true, + ))), + ), + StructField::nullable("col1", DataType::LONG), + StructField::nullable("col2", DataType::STRING), + StructField::nullable("col3", DataType::INTEGER), // Should be excluded by limit + ]); + + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // nullCount has only eligible primitive columns (col1 and col2). + // Map/Array/Variant are excluded from all stats. + let expected_null_count = StructType::new_unchecked([ + StructField::nullable("col1", DataType::LONG), + StructField::nullable("col2", DataType::LONG), + ]); + + // minValues/maxValues only have eligible primitive types (col1 and col2). + // Map/Array are filtered out by MinMaxStatsTransform. 
+ let expected_min_max = StructType::new_unchecked([ + StructField::nullable("col1", DataType::LONG), + StructField::nullable("col2", DataType::STRING), + ]); + + let expected = expected_stats(expected_null_count, expected_min_max); + + assert_eq!(&expected, &stats_schema); + } + + // ==================== stats_column_names tests ==================== + + #[test] + fn test_stats_column_names_default() { + let properties: TableProperties = [("key", "value")].into(); + + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(user_struct))), + ]); + + let config = StatsConfig { + data_skipping_stats_columns: properties.data_skipping_stats_columns.as_deref(), + data_skipping_num_indexed_cols: properties.data_skipping_num_indexed_cols, + }; + let columns = stats_column_names(&file_schema, &config, None); + + // With default settings, all leaf columns should be included + assert_eq!( + columns, + vec![ + ColumnName::new(["id"]), + ColumnName::new(["user", "name"]), + ColumnName::new(["user", "age"]), + ] + ); + } + + #[test] + fn test_stats_column_names_with_num_indexed_cols() { + let properties: TableProperties = [( + "delta.dataSkippingNumIndexedCols".to_string(), + "2".to_string(), + )] + .into(); + + let file_schema = StructType::new_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::STRING), + StructField::nullable("c", DataType::INTEGER), + StructField::nullable("d", DataType::DOUBLE), + ]); + + let config = StatsConfig { + data_skipping_stats_columns: properties.data_skipping_stats_columns.as_deref(), + data_skipping_num_indexed_cols: properties.data_skipping_num_indexed_cols, + }; + let columns = stats_column_names(&file_schema, &config, None); + + // Only first 2 columns should be included + assert_eq!( + columns, + vec![ColumnName::new(["a"]), ColumnName::new(["b"]),] + ); + } + + #[test] + fn test_stats_column_names_with_stats_columns() { + let properties: TableProperties = [( + "delta.dataSkippingStatsColumns".to_string(), + "id,user.age".to_string(), + )] + .into(); + + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(user_struct))), + StructField::nullable("extra", DataType::STRING), + ]); + + let config = StatsConfig { + data_skipping_stats_columns: properties.data_skipping_stats_columns.as_deref(), + data_skipping_num_indexed_cols: properties.data_skipping_num_indexed_cols, + }; + let columns = stats_column_names(&file_schema, &config, None); + + // Only specified columns should be included (user.name and extra excluded) + assert_eq!( + columns, + vec![ColumnName::new(["id"]), ColumnName::new(["user", "age"]),] + ); + } + + #[test] + fn test_stats_column_names_skips_non_eligible_types() { + let properties: TableProperties = [("key", "value")].into(); + + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable( + "tags", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), + ), + StructField::nullable( + "metadata", + DataType::Map(Box::new(MapType::new( + 
DataType::STRING, + DataType::STRING, + true, + ))), + ), + StructField::nullable("name", DataType::STRING), + ]); + + let config = StatsConfig { + data_skipping_stats_columns: properties.data_skipping_stats_columns.as_deref(), + data_skipping_num_indexed_cols: properties.data_skipping_num_indexed_cols, + }; + let columns = stats_column_names(&file_schema, &config, None); + + // Array and Map types should be excluded + assert_eq!( + columns, + vec![ColumnName::new(["id"]), ColumnName::new(["name"]),] + ); + } + + // ==================== clustering column tests ==================== + + #[test] + fn test_stats_schema_with_clustering_past_limit() { + // Test that clustering columns are included in stats schema even when past the limit + let properties: TableProperties = [( + "delta.dataSkippingNumIndexedCols".to_string(), + "1".to_string(), + )] + .into(); + + let file_schema = StructType::new_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::STRING), + StructField::nullable("c", DataType::INTEGER), + ]); + + // "c" is a clustering column, should be included even though limit is 1 + let clustering_columns = vec![ColumnName::new(["c"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + Some(&clustering_columns), + None, + ) + .unwrap(); + + // Only "a" (first column) and "c" (clustering) should be included + let expected_null_count = StructType::new_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("c", DataType::LONG), + ]); + let expected_min_max = StructType::new_unchecked([ + StructField::nullable("a", DataType::LONG), + StructField::nullable("c", DataType::INTEGER), + ]); + + let expected = expected_stats(expected_null_count, expected_min_max); + + assert_eq!(&expected, &stats_schema); + } + + // ==================== requested_columns filtering tests ==================== + + #[test] + fn test_requested_filters_to_single_column() { + let properties: TableProperties = [("key", "value")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + StructField::nullable("value", DataType::INTEGER), + ]); + + let columns = [ColumnName::new(["id"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + + let expected_nested = + StructType::new_unchecked([StructField::nullable("id", DataType::LONG)]); + + let expected = expected_stats(expected_nested.clone(), expected_nested); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_none_requested_returns_full_schema() { + // None for requested_columns means no output filtering — include all columns + let properties: TableProperties = [("key", "value")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + ]); + + let with_none = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + // Should include both columns + let min_values = with_none.field("minValues").expect("should have minValues"); + if let DataType::Struct(inner) = min_values.data_type() { + assert!(inner.field("id").is_some()); + assert!(inner.field("name").is_some()); + } else { + panic!("minValues should be a struct"); + } + } + + #[test] + fn 
test_requested_column_outside_limit_excluded() { + // requested_columns alone does NOT bypass the column limit — only required_columns does + let properties: TableProperties = [("delta.dataSkippingNumIndexedCols", "1")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + ]); + + // "name" is outside the limit (limit is 1), and is only requested, not required + let columns = [ColumnName::new(["name"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + + // No data columns pass both filters, so only numRecords + tightBounds + let expected = StructType::new_unchecked([ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("tightBounds", DataType::BOOLEAN), + ]); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_required_bypasses_limit_with_requested_filter() { + // When a column is both required AND requested, it bypasses the limit and + // appears in the output. This is the pattern used by the read path. + let properties: TableProperties = [("delta.dataSkippingNumIndexedCols", "1")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + ]); + + let columns = [ColumnName::new(["name"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + Some(&columns), + Some(&columns), + ) + .unwrap(); + + let expected_nested = + StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let expected_null = + StructType::new_unchecked([StructField::nullable("name", DataType::LONG)]); + + let expected = expected_stats(expected_null, expected_nested); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_requested_does_not_affect_column_counting() { + // With num_indexed_cols=2, "id" and "name" are within the limit. + // requested_columns=["name"] filters the output to just "name", + // but "id" still counts toward the limit (so "value" stays excluded). 
+ let properties: TableProperties = [("delta.dataSkippingNumIndexedCols", "2")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + StructField::nullable("value", DataType::INTEGER), + ]); + + let columns = [ColumnName::new(["name"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + + // Only "name" appears in the output (filtered), even though "id" counted toward the limit + let expected_nested = + StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let expected_null = + StructType::new_unchecked([StructField::nullable("name", DataType::LONG)]); + + let expected = expected_stats(expected_null, expected_nested); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_multiple_requested_columns() { + let properties: TableProperties = [("key", "value")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + StructField::nullable("value", DataType::INTEGER), + ]); + + let columns = [ColumnName::new(["id"]), ColumnName::new(["name"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + + let expected_nested = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + ]); + let expected_null = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::LONG), + ]); + + let expected = expected_stats(expected_null, expected_nested); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_nested_requested_column() { + let properties: TableProperties = [("key", "value")].into(); + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(user_struct))), + ]); + + let columns = [ColumnName::new(["user", "name"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + + let expected_user_nested = + StructType::new_unchecked([StructField::nullable("name", DataType::STRING)]); + let expected_nested = StructType::new_unchecked([StructField::nullable( + "user", + DataType::Struct(Box::new(expected_user_nested)), + )]); + + let expected_user_null = + StructType::new_unchecked([StructField::nullable("name", DataType::LONG)]); + let expected_null = StructType::new_unchecked([StructField::nullable( + "user", + DataType::Struct(Box::new(expected_user_null)), + )]); + + let expected = expected_stats(expected_null, expected_nested); + + assert_eq!(&expected, &stats_schema); + } + + #[test] + fn test_empty_requested_columns() { + let properties: TableProperties = [("key", "value")].into(); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("name", DataType::STRING), + ]); + + // Empty columns list should return the full schema (same as None) + let columns: [ColumnName; 0] = []; + let stats_schema = 
expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + let full_stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + None, + ) + .unwrap(); + + assert_eq!(&full_stats_schema, &stats_schema); + } + + #[test] + fn test_mixed_nested_and_top_requested() { + let properties: TableProperties = [("key", "value")].into(); + let user_struct = StructType::new_unchecked([ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + ]); + let file_schema = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(user_struct))), + StructField::nullable("value", DataType::DOUBLE), + ]); + + let columns = [ColumnName::new(["id"]), ColumnName::new(["user", "age"])]; + let stats_schema = expected_stats_schema( + &file_schema, + &stats_config_from_table_properties(&properties), + None, + Some(&columns), + ) + .unwrap(); + + let expected_user_nested = + StructType::new_unchecked([StructField::nullable("age", DataType::INTEGER)]); + let expected_nested = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable( + "user", + DataType::Struct(Box::new(expected_user_nested.clone())), + ), + ]); + + let expected_user_null = + StructType::new_unchecked([StructField::nullable("age", DataType::LONG)]); + let expected_null = StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("user", DataType::Struct(Box::new(expected_user_null))), + ]); + + let expected = expected_stats(expected_null, expected_nested); + + assert_eq!(&expected, &stats_schema); + } +} diff --git a/kernel/src/scan/data_skipping/tests.rs b/kernel/src/scan/data_skipping/tests.rs index f492b8fab4..6184227b52 100644 --- a/kernel/src/scan/data_skipping/tests.rs +++ b/kernel/src/scan/data_skipping/tests.rs @@ -2,6 +2,7 @@ use super::*; use crate::expressions::column_name; use crate::kernel_predicates::{DefaultKernelPredicateEvaluator, UnimplementedColumnResolver}; +use rstest::rstest; use std::collections::HashMap; const TRUE: Option = Some(true); @@ -29,8 +30,11 @@ fn test_eval_is_null() { let do_test = |nullcount: i64, expected: &[Option]| { let resolver = HashMap::from_iter([ - (column_name!("numRecords"), Scalar::from(2i64)), - (column_name!("nullCount.x"), Scalar::from(nullcount)), + (column_name!("stats_parsed.numRecords"), Scalar::from(2i64)), + ( + column_name!("stats_parsed.nullCount.x"), + Scalar::from(nullcount), + ), ]); let filter = DefaultKernelPredicateEvaluator::from(resolver); for (pred, expect) in predicates.iter().zip(expected) { @@ -72,8 +76,8 @@ fn test_eval_binary_comparisons() { let do_test = |min: &Scalar, max: &Scalar, expected: &[Option]| { let resolver = HashMap::from_iter([ - (column_name!("minValues.x"), min.clone()), - (column_name!("maxValues.x"), max.clone()), + (column_name!("stats_parsed.minValues.x"), min.clone()), + (column_name!("stats_parsed.maxValues.x"), max.clone()), ]); let filter = DefaultKernelPredicateEvaluator::from(resolver); for (pred, expect) in predicates.iter().zip(expected.iter()) { @@ -150,6 +154,14 @@ fn test_eval_junction() { (&[NULL, FALSE, TRUE], FALSE, TRUE), ]; let filter = DefaultKernelPredicateEvaluator::from(UnimplementedColumnResolver); + + // Helper: evaluate a skipping predicate, treating None (can't create skipping predicate) + // as NULL 
(unknown/can't skip) -- both mean "keep all files". + let eval_skipping = |pred: &Pred| -> Option { + let skipping_pred = as_data_skipping_predicate(pred)?; + filter.eval(&skipping_pred) + }; + for (inputs, expect_and, expect_or) in test_cases { let inputs: Vec<_> = inputs .iter() @@ -160,25 +172,21 @@ fn test_eval_junction() { .collect(); let pred = Pred::and_from(inputs.clone()); - let pred = as_data_skipping_predicate(&pred).unwrap(); - expect_eq!(filter.eval(&pred), *expect_and, "AND({inputs:?})"); + expect_eq!(eval_skipping(&pred), *expect_and, "AND({inputs:?})"); let pred = Pred::or_from(inputs.clone()); - let pred = as_data_skipping_predicate(&pred).unwrap(); - expect_eq!(filter.eval(&pred), *expect_or, "OR({inputs:?})"); + expect_eq!(eval_skipping(&pred), *expect_or, "OR({inputs:?})"); let pred = Pred::not(Pred::and_from(inputs.clone())); - let pred = as_data_skipping_predicate(&pred).unwrap(); expect_eq!( - filter.eval(&pred), + eval_skipping(&pred), expect_and.map(|val| !val), "NOT AND({inputs:?})" ); let pred = Pred::not(Pred::or_from(inputs.clone())); - let pred = as_data_skipping_predicate(&pred).unwrap(); expect_eq!( - filter.eval(&pred), + eval_skipping(&pred), expect_or.map(|val| !val), "NOT OR({inputs:?})" ); @@ -205,10 +213,13 @@ fn test_eval_distinct() { let do_test = |min: &Scalar, max: &Scalar, nullcount: i64, expected: &[Option]| { let resolver = HashMap::from_iter([ - (column_name!("numRecords"), Scalar::from(2i64)), - (column_name!("nullCount.x"), Scalar::from(nullcount)), - (column_name!("minValues.x"), min.clone()), - (column_name!("maxValues.x"), max.clone()), + (column_name!("stats_parsed.numRecords"), Scalar::from(2i64)), + ( + column_name!("stats_parsed.nullCount.x"), + Scalar::from(nullcount), + ), + (column_name!("stats_parsed.minValues.x"), min.clone()), + (column_name!("stats_parsed.maxValues.x"), max.clone()), ]); let filter = DefaultKernelPredicateEvaluator::from(resolver); for (pred, expect) in predicates.iter().zip(expected) { @@ -276,10 +287,16 @@ fn test_sql_where() { HashMap::new() } else { HashMap::from_iter([ - (column_name!("numRecords"), Scalar::from(ROWCOUNT)), - (column_name!("nullCount.x"), Scalar::from(nulls)), - (column_name!("minValues.x"), min.clone()), - (column_name!("maxValues.x"), max.clone()), + ( + column_name!("stats_parsed.numRecords"), + Scalar::from(ROWCOUNT), + ), + ( + column_name!("stats_parsed.nullCount.x"), + Scalar::from(nulls), + ), + (column_name!("stats_parsed.minValues.x"), min.clone()), + (column_name!("stats_parsed.maxValues.x"), max.clone()), ]) }; let filter = DefaultKernelPredicateEvaluator::from(resolver); @@ -289,7 +306,8 @@ fn test_sql_where() { expect, "{pred:#?} became {skipping_pred:#?} ({min}..{max}, {nulls} nulls)" ); - let skipping_sql_pred = as_sql_data_skipping_predicate(pred).unwrap(); + let skipping_sql_pred = + as_sql_data_skipping_predicate(pred, &Default::default()).unwrap(); expect_eq!( filter.eval(&skipping_sql_pred), expect_sql, @@ -319,10 +337,13 @@ fn test_sql_where() { do_test(ALL_NULL, pred, PRESENT, None, Some(false)); do_test(ALL_NULL, pred, MISSING, None, None); - // NULL inside AND allows static skipping under SQL semantics + // NULL literal is treated as unknown (not false) under eval_sql_where, so it does not + // force static skipping on its own. With present-but-all-null stats, the comparison arm + // still evaluates to false (null-safe check fails), so AND(unknown, false) = false. + // With missing stats, both arms are unknown, so AND(unknown, unknown) = unknown. 
let pred = &Pred::and(NULL, Pred::lt(col.clone(), VAL)); do_test(ALL_NULL, pred, PRESENT, None, Some(false)); - do_test(ALL_NULL, pred, MISSING, None, Some(false)); + do_test(ALL_NULL, pred, MISSING, None, None); // Comparison inside AND inside AND works let pred = &Pred::and(TRUE, Pred::and(TRUE, Pred::lt(col.clone(), VAL))); @@ -340,69 +361,1003 @@ fn test_sql_where() { do_test(ALL_NULL, pred, MISSING, None, None); } -// TODO(#1002): we currently don't support file skipping on timestamp columns' max stat since they -// are truncated to milliseconds in add.stats. #[test] -fn test_timestamp_skipping_disabled() { - let creator = DataSkippingPredicateCreator; +fn test_timestamp_stats_enabled() { + let empty = HashSet::new(); + let creator = DataSkippingPredicateCreator { + partition_columns: &empty, + }; let col = &column_name!("timestamp_col"); assert!( creator.get_min_stat(col, &DataType::TIMESTAMP).is_some(), - "get_min_stat should return Some: allow data skipping on timestamp minValues" + "get_min_stat should return Some for timestamp minValues" ); - assert_eq!( - creator.get_max_stat(col, &DataType::TIMESTAMP), - None, - "get_max_stat should return None: no data skipping on timestamp maxValues" + assert!( + creator.get_max_stat(col, &DataType::TIMESTAMP).is_some(), + "get_max_stat should return Some for timestamp maxValues" ); assert!( creator .get_min_stat(col, &DataType::TIMESTAMP_NTZ) .is_some(), - "get_min_stat should return Some: allow data skipping on timestamp_ntz minValues" + "get_min_stat should return Some for timestamp_ntz minValues" ); + assert!( + creator + .get_max_stat(col, &DataType::TIMESTAMP_NTZ) + .is_some(), + "get_max_stat should return Some for timestamp_ntz maxValues" + ); +} + +#[test] +fn test_adjust_scalar_for_max_stat_truncation() { + // Timestamp: subtracts 999us assert_eq!( - creator.get_max_stat(col, &DataType::TIMESTAMP_NTZ), - None, - "get_max_stat should return None: no data skipping on timestamp_ntz maxValues" + adjust_scalar_for_max_stat_truncation(&Scalar::Timestamp(1_000_000)), + Scalar::Timestamp(999_001) ); + // TimestampNtz: subtracts 999us + assert_eq!( + adjust_scalar_for_max_stat_truncation(&Scalar::TimestampNtz(1_000_000)), + Scalar::TimestampNtz(999_001) + ); + // Non-timestamp: unchanged + assert_eq!( + adjust_scalar_for_max_stat_truncation(&Scalar::from(42i64)), + Scalar::from(42i64) + ); + // Saturating at i64::MIN + assert_eq!( + adjust_scalar_for_max_stat_truncation(&Scalar::Timestamp(i64::MIN)), + Scalar::Timestamp(i64::MIN) + ); + // Near-zero: goes negative + assert_eq!( + adjust_scalar_for_max_stat_truncation(&Scalar::Timestamp(500)), + Scalar::Timestamp(-499) + ); +} + +// Verifies the guarded checkpoint skipping predicate: +// - Prunes when stats are present and below threshold +// - Keeps when stats are present and above threshold +// - Conservatively keeps when stats are null (IS NULL guard fires) +#[rstest] +#[case::stats_below_threshold(Scalar::from(50), FALSE, "max=50, col>100 should skip")] +#[case::stats_above_threshold(Scalar::from(150), TRUE, "max=150, col>100 should keep")] +#[case::stats_null( + Scalar::Null(DataType::INTEGER), + TRUE, + "null max should keep (IS NULL guard)" +)] +fn test_checkpoint_skipping_semantic( + #[case] max_val: Scalar, + #[case] expected: Option, + #[case] description: &str, +) { + let pred = Pred::gt(column_expr!("x"), Scalar::from(100)); + let skipping_pred = as_checkpoint_skipping_predicate(&pred, &[]).unwrap(); + let resolver = HashMap::from_iter([(column_name!("maxValues.x"), max_val)]); + 
let filter = DefaultKernelPredicateEvaluator::from(resolver); + expect_eq!(filter.eval(&skipping_pred), expected, "{description}"); } -// TODO(#1002): we currently don't support file skipping on timestamp columns' max stat since they -// are truncated to milliseconds in add.stats. +// Verifies that the IS NULL guard changes behavior compared to a regular data skipping predicate: +// without the guard, null stats produce NULL (unknown); with the guard, they produce TRUE (keep). #[test] -fn test_timestamp_predicates_dont_data_skip() { +fn test_checkpoint_skipping_null_guard_vs_regular() { + let pred = Pred::gt(column_expr!("x"), Scalar::from(100)); + let resolver = + HashMap::from_iter([(column_name!("maxValues.x"), Scalar::Null(DataType::INTEGER))]); + let filter = DefaultKernelPredicateEvaluator::from(resolver); + + let guarded = as_checkpoint_skipping_predicate(&pred, &[]).unwrap(); + expect_eq!( + filter.eval(&guarded), + TRUE, + "guarded pred with null stats -> TRUE (keep)" + ); + + let regular = as_data_skipping_predicate(&pred).unwrap(); + expect_eq!( + filter.eval(&regular), + NULL, + "regular pred with null stats -> NULL (unknown)" + ); +} + +// Verifies that a conjunction can still prune when one column has null stats but the other +// column's stats are sufficient. For `col_a > 100 AND col_b < 50`, the guarded predicate is: +// +// AND( +// OR(maxValues.col_a IS NULL, maxValues.col_a > 100), +// OR(minValues.col_b IS NULL, minValues.col_b < 50) +// ) +// +// Even if col_a's stats are null, col_b's stats alone can prune the row group. +#[test] +fn test_checkpoint_skipping_conjunction_partial_null_stats() { + let pred = Pred::and( + Pred::gt(column_expr!("col_a"), Scalar::from(100)), + Pred::lt(column_expr!("col_b"), Scalar::from(50)), + ); + let skipping_pred = as_checkpoint_skipping_predicate(&pred, &[]).unwrap(); + + // Both stats present and both allow pruning -> skip + let resolver = HashMap::from_iter([ + (column_name!("maxValues.col_a"), Scalar::from(50)), + (column_name!("minValues.col_b"), Scalar::from(60)), + ]); + let filter = DefaultKernelPredicateEvaluator::from(resolver); + expect_eq!( + filter.eval(&skipping_pred), + FALSE, + "both cols prunable -> skip" + ); + + // col_a stats null, but col_b stats alone are enough to prune -> still skip + let resolver = HashMap::from_iter([ + ( + column_name!("maxValues.col_a"), + Scalar::Null(DataType::INTEGER), + ), + (column_name!("minValues.col_b"), Scalar::from(60)), + ]); + let filter = DefaultKernelPredicateEvaluator::from(resolver); + expect_eq!( + filter.eval(&skipping_pred), + FALSE, + "col_a null but col_b prunable -> still skip" + ); + + // col_a stats null and col_b doesn't allow pruning -> keep + let resolver = HashMap::from_iter([ + ( + column_name!("maxValues.col_a"), + Scalar::Null(DataType::INTEGER), + ), + (column_name!("minValues.col_b"), Scalar::from(30)), + ]); + let filter = DefaultKernelPredicateEvaluator::from(resolver); + expect_eq!( + filter.eval(&skipping_pred), + TRUE, + "col_a null and col_b not prunable -> keep" + ); +} + +// Verifies the null-guarded checkpoint skipping path also applies the 999us timestamp +// truncation adjustment to max stat comparisons.
+#[rstest] +fn test_checkpoint_skipping_timestamp_adjustment( + #[values(Scalar::Timestamp(1_000_000), Scalar::TimestampNtz(1_000_000))] timestamp: Scalar, +) { let col = &column_expr!("ts_col"); - for timestamp in [&Scalar::Timestamp(1000000), &Scalar::TimestampNtz(1000000)] { - // LT will do minValues -> OK - let pred = Pred::lt(col.clone(), timestamp.clone()); - let skipping_pred = as_data_skipping_predicate(&pred); - assert_eq!( - skipping_pred.unwrap().to_string(), - "Column(minValues.ts_col) < 1000000" - ); - // GT will do maxValues -> BLOCKED - let pred = Pred::gt(col.clone(), timestamp.clone()); - let skipping_pred = as_data_skipping_predicate(&pred); - assert!( - skipping_pred.is_none(), - "Expected no data skipping for timestamp predicate: {pred:#?}, got {skipping_pred:#?}" - ); + // GT: should produce OR(maxValues.ts_col IS NULL, maxValues.ts_col > 999001) + let pred = Pred::gt(col.clone(), timestamp.clone()); + let skipping_pred = as_checkpoint_skipping_predicate(&pred, &[]).unwrap(); + assert_eq!( + skipping_pred.to_string(), + "OR(Column(maxValues.ts_col) IS NULL, Column(maxValues.ts_col) > 999001)" + ); - let pred = Pred::eq(col.clone(), timestamp.clone()); - let skipping_pred = as_data_skipping_predicate(&pred); - assert_eq!( - skipping_pred.unwrap().to_string(), - "AND(NOT(Column(minValues.ts_col) > 1000000), null)" - ); + // EQ: max stat leg should use adjusted literal + let pred = Pred::eq(col.clone(), timestamp.clone()); + let skipping_pred = as_checkpoint_skipping_predicate(&pred, &[]).unwrap(); + assert_eq!( + skipping_pred.to_string(), + "AND(OR(Column(minValues.ts_col) IS NULL, NOT(Column(minValues.ts_col) > 1000000)), \ + OR(Column(maxValues.ts_col) IS NULL, NOT(Column(maxValues.ts_col) < 999001)))" + ); +} - let pred = Pred::ne(col.clone(), timestamp.clone()); - let skipping_pred = as_data_skipping_predicate(&pred); - assert_eq!( - skipping_pred.unwrap().to_string(), - "OR(NOT(Column(minValues.ts_col) = 1000000), null)" - ); +// Timestamp predicates use max stats with a 999us adjustment to account for millisecond +// truncation in Delta JSON stats. 
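+// Worked example (for intuition only): a file whose true max is 1_000_999us is recorded as +// 1_000_000us once truncated to milliseconds, so for `ts_col > 1_000_000` the recorded max must be +// compared against 1_000_000 - 999 = 999_001; comparing against the raw literal would wrongly +// prune that file.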
+#[rstest] +fn test_timestamp_predicates_use_adjusted_max_stats( + #[values(Scalar::Timestamp(1_000_000), Scalar::TimestampNtz(1_000_000))] timestamp: Scalar, +) { + let col = &column_expr!("ts_col"); + + // LT uses minValues (no adjustment needed for min stats) + let pred = Pred::lt(col.clone(), timestamp.clone()); + assert_eq!( + as_data_skipping_predicate(&pred).unwrap().to_string(), + "Column(stats_parsed.minValues.ts_col) < 1000000" + ); + + // GT uses maxValues with adjusted literal (1000000 - 999 = 999001) + let pred = Pred::gt(col.clone(), timestamp.clone()); + assert_eq!( + as_data_skipping_predicate(&pred).unwrap().to_string(), + "Column(stats_parsed.maxValues.ts_col) > 999001" + ); + + // EQ uses both min (unadjusted) and max (adjusted) + let pred = Pred::eq(col.clone(), timestamp.clone()); + assert_eq!( + as_data_skipping_predicate(&pred).unwrap().to_string(), + "AND(NOT(Column(stats_parsed.minValues.ts_col) > 1000000), \ + NOT(Column(stats_parsed.maxValues.ts_col) < 999001))" + ); + + // NE uses both min (unadjusted) and max (adjusted) + let pred = Pred::ne(col.clone(), timestamp.clone()); + assert_eq!( + as_data_skipping_predicate(&pred).unwrap().to_string(), + "OR(NOT(Column(stats_parsed.minValues.ts_col) = 1000000), \ + NOT(Column(stats_parsed.maxValues.ts_col) = 999001))" + ); + + // GE (col >= val) uses maxValues with adjusted literal + let pred = Pred::ge(col.clone(), timestamp.clone()); + assert_eq!( + as_data_skipping_predicate(&pred).unwrap().to_string(), + "NOT(Column(stats_parsed.maxValues.ts_col) < 999001)" + ); + + // LE (col <= val) uses minValues only (no adjustment needed) + let pred = Pred::le(col.clone(), timestamp.clone()); + assert_eq!( + as_data_skipping_predicate(&pred).unwrap().to_string(), + "NOT(Column(stats_parsed.minValues.ts_col) > 1000000)" + ); +} + +// Partition timestamp columns use exact values (not truncated), so no adjustment is applied. +#[test] +fn test_partition_timestamp_column_no_adjustment() { + let partition_columns: HashSet = ["ts_part".to_string()].into(); + let pred = Pred::gt(column_expr!("ts_part"), Scalar::Timestamp(1_000_000)); + let skipping_pred = + as_data_skipping_predicate_with_partitions(&pred, &partition_columns).unwrap(); + assert_eq!( + skipping_pred.to_string(), + "OR(NOT(Column(is_add)), Column(partitionValues_parsed.ts_part) > 1000000)" + ); +} + +// Tests for partition-aware data skipping + +/// Helper to build a partition columns set with a single "part_col" entry. +fn test_partition_columns() -> HashSet { + ["part_col".to_string()].into() +} + +/// Helper to build a resolver for mixed partition + data stats evaluation. 
+fn mixed_resolver( + part_val: &str, + max_data: i32, +) -> DefaultKernelPredicateEvaluator> { + DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_col"), + Scalar::from(part_val), + ), + ( + column_name!("stats_parsed.maxValues.data_col"), + Scalar::from(max_data), + ), + (column_name!("is_add"), Scalar::from(true)), + ])) +} + +#[test] +fn test_partition_column_rewrite() { + let partition_columns = test_partition_columns(); + + // Partition column equality rewrites to partitionValues (not minValues/maxValues) + let pred = Pred::eq(column_expr!("part_col"), Scalar::from("2025-01-01")); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns); + let pred_str = skipping_pred.as_ref().map(|p| p.to_string()); + assert!( + pred_str + .as_ref() + .is_some_and(|s| s.contains("partitionValues_parsed.part_col")), + "Expected partitionValues_parsed.part_col, got {pred_str:?}" + ); + assert!( + pred_str + .as_ref() + .is_some_and(|s| !s.contains("minValues") && !s.contains("maxValues")), + "Should not contain minValues/maxValues for partition columns" + ); + + // Data column still rewrites to stats_parsed.minValues/maxValues + let pred = Pred::gt(column_expr!("data_col"), Scalar::from(100)); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns); + let pred_str = skipping_pred.as_ref().map(|p| p.to_string()); + assert!( + pred_str + .as_ref() + .is_some_and(|s| s.contains("stats_parsed.maxValues.data_col")), + "Expected stats_parsed.maxValues.data_col for data column, got {pred_str:?}" + ); +} + +#[rstest] +#[case::is_null( + Pred::is_null(column_expr!("part_col")), + "OR(NOT(Column(is_add)), Column(partitionValues_parsed.part_col) IS NULL)" +)] +#[case::is_not_null( + Pred::is_not_null(column_expr!("part_col")), + "OR(NOT(Column(is_add)), NOT(Column(partitionValues_parsed.part_col) IS NULL))" +)] +fn test_partition_column_is_null(#[case] pred: Pred, #[case] expected: &str) { + let partition_columns = test_partition_columns(); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns); + assert_eq!( + skipping_pred.as_ref().map(|p| p.to_string()).as_deref(), + Some(expected), + ); +} + +#[test] +fn test_mixed_partition_and_data_or_predicate() { + let partition_columns = test_partition_columns(); + + // Mixed OR: partition_col = 'X' OR data_col > 100 + // This should produce a valid skipping predicate (not None) because both + // operands are now eligible for data skipping. 
+ let pred = Pred::or( + Pred::eq(column_expr!("part_col"), Scalar::from("X")), + Pred::gt(column_expr!("data_col"), Scalar::from(100)), + ); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns); + assert!( + skipping_pred.is_some(), + "Mixed partition+data OR should produce a valid skipping predicate" + ); + let pred_str = skipping_pred.as_ref().map(|p| p.to_string()); + assert!( + pred_str + .as_ref() + .is_some_and(|s| s.contains("partitionValues_parsed.part_col")), + "Should reference partitionValues for partition column" + ); + assert!( + pred_str + .as_ref() + .is_some_and(|s| s.contains("stats_parsed.maxValues.data_col")), + "Should reference stats_parsed.maxValues for data column" + ); +} + +#[rstest] +#[case::both_miss("Y", 50, FALSE)] +#[case::partition_match("X", 50, TRUE)] +#[case::data_match("Y", 200, TRUE)] +fn test_mixed_partition_and_data_or_evaluation( + #[case] part_val: &str, + #[case] max_data: i32, + #[case] expected: Option, +) { + let partition_columns = test_partition_columns(); + + // WHERE part_col = 'X' OR data_col > 100 + let pred = Pred::or( + Pred::eq(column_expr!("part_col"), Scalar::from("X")), + Pred::gt(column_expr!("data_col"), Scalar::from(100)), + ); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + + let filter = mixed_resolver(part_val, max_data); + assert_eq!( + filter.eval(&skipping_pred), + expected, + "part_col='{part_val}' max(data_col)={max_data}" + ); +} + +#[rstest] +#[case::both_match("X", 200, TRUE)] +#[case::partition_miss("Y", 200, FALSE)] +#[case::data_miss("X", 50, FALSE)] +#[case::both_miss("Y", 50, FALSE)] +fn test_mixed_partition_and_data_and_evaluation( + #[case] part_val: &str, + #[case] max_data: i32, + #[case] expected: Option, +) { + let partition_columns = test_partition_columns(); + + // WHERE part_col = 'X' AND data_col > 100 + let pred = Pred::and( + Pred::eq(column_expr!("part_col"), Scalar::from("X")), + Pred::gt(column_expr!("data_col"), Scalar::from(100)), + ); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + + let filter = mixed_resolver(part_val, max_data); + assert_eq!( + filter.eval(&skipping_pred), + expected, + "part_col='{part_val}' max(data_col)={max_data}" + ); +} + +#[test] +fn test_partition_column_comparison_uses_exact_value() { + let partition_columns = test_partition_columns(); + + // part_col > 'B' rewrites both min and max to partitionValues_parsed.part_col + let pred = Pred::gt(column_expr!("part_col"), Scalar::from("B")); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + + // part_col='A': 'A' > 'B' is false -> skip + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_col"), + Scalar::from("A"), + ), + (column_name!("is_add"), Scalar::from(true)), + ])); + assert_eq!(resolver.eval(&skipping_pred), FALSE); + + // part_col='C': 'C' > 'B' is true -> keep + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_col"), + Scalar::from("C"), + ), + (column_name!("is_add"), Scalar::from(true)), + ])); + assert_eq!(resolver.eval(&skipping_pred), TRUE); +} + +#[test] +fn test_partition_only_predicate() { + let partition_columns = test_partition_columns(); + + // Partition-only: no data columns involved + let pred = 
Pred::eq(column_expr!("part_col"), Scalar::from("X")); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + let pred_str = skipping_pred.to_string(); + assert!( + pred_str.contains("partitionValues_parsed.part_col"), + "Should reference partitionValues_parsed" + ); + assert!( + !pred_str.contains("stats_parsed"), + "Partition-only predicate should not reference stats_parsed" + ); +} + +#[test] +fn test_sql_where_partition_rewrite() { + let partition_columns = test_partition_columns(); + + // Partition column equality: SQL WHERE should rewrite to partitionValues_parsed + let pred = Pred::eq(column_expr!("part_col"), Scalar::from("X")); + let sql_pred = as_sql_data_skipping_predicate(&pred, &partition_columns) + .expect("partition eq should produce SQL skipping pred"); + let pred_str = sql_pred.to_string(); + assert!( + pred_str.contains("partitionValues_parsed.part_col"), + "SQL WHERE should reference partitionValues_parsed, got {pred_str}" + ); +} + +#[rstest] +#[case::partition_match_data_above("X", 200, TRUE)] +#[case::partition_miss_data_above("Y", 200, FALSE)] +#[case::partition_match_data_below("X", 50, FALSE)] +#[case::both_miss("Y", 50, FALSE)] +fn test_sql_where_mixed_partition_and_data_evaluation( + #[case] part_val: &str, + #[case] max_data: i32, + #[case] expected: Option, +) { + let partition_columns = test_partition_columns(); + + // WHERE part_col = 'X' AND data_col > 100 + let pred = Pred::and( + Pred::eq(column_expr!("part_col"), Scalar::from("X")), + Pred::gt(column_expr!("data_col"), Scalar::from(100)), + ); + let sql_pred = as_sql_data_skipping_predicate(&pred, &partition_columns) + .expect("mixed AND should produce SQL skipping pred"); + + let resolver = HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_col"), + Scalar::from(part_val), + ), + (column_name!("stats_parsed.numRecords"), Scalar::from(2i64)), + ( + column_name!("stats_parsed.nullCount.data_col"), + Scalar::from(0i64), + ), + ( + column_name!("stats_parsed.maxValues.data_col"), + Scalar::from(max_data), + ), + (column_name!("is_add"), Scalar::from(true)), + ]); + let filter = DefaultKernelPredicateEvaluator::from(resolver); + assert_eq!( + filter.eval(&sql_pred), + expected, + "part_col='{part_val}' max(data_col)={max_data}" + ); +} + +// The is_add guard (OR(NOT is_add, pred)) ensures Remove rows are never pruned by +// partition predicates, regardless of whether the partition value matches. 
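+// For the equality predicate used below, the guarded form is roughly (exact rendering may differ): +// OR(NOT(Column(is_add)), Column(partitionValues_parsed.part_col) = 'X') +// so a Remove row (is_add = false) short-circuits to TRUE and is never pruned.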
+#[rstest] +#[case::non_matching_partition("Y", false, TRUE, "non-matching partition, Remove kept via guard")] +#[case::matching_partition("X", false, TRUE, "matching partition, Remove kept via guard")] +#[case::add_non_matching("Y", true, FALSE, "non-matching partition, Add correctly pruned")] +#[case::add_matching("X", true, TRUE, "matching partition, Add correctly kept")] +fn is_add_guard_keeps_remove_rows( + #[case] part_val: &str, + #[case] is_add: bool, + #[case] expected: Option, + #[case] _scenario: &str, +) { + let partition_columns = test_partition_columns(); + let pred = Pred::eq(column_expr!("part_col"), Scalar::from("X")); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_col"), + Scalar::from(part_val), + ), + (column_name!("is_add"), Scalar::from(is_add)), + ])); + assert_eq!( + resolver.eval(&skipping_pred), + expected, + "part_col='{part_val}' is_add={is_add}" + ); +} + +// Mixed AND with is_add=false and null stats: Remove rows have null data stats, so the data +// arm evaluates to NULL. AND(true_from_guard, NULL) = NULL, which the DISTINCT filter treats +// as "keep". This verifies Removes are not pruned even when the data arm cannot be satisfied. +#[rstest] +#[case::remove_null_stats("Y", false, "Remove: AND(guard=true, stats=NULL) = NULL -> kept")] +#[case::add_null_stats_partition_match("X", true, "Add: AND(true, NULL) = NULL -> kept")] +#[case::add_null_stats_partition_miss("Y", true, "Add: AND(false, NULL) = false -> pruned")] +fn mixed_and_with_null_stats_and_is_add_guard( + #[case] part_val: &str, + #[case] is_add: bool, + #[case] _scenario: &str, +) { + let partition_columns = test_partition_columns(); + let pred = Pred::and( + Pred::eq(column_expr!("part_col"), Scalar::from("X")), + Pred::gt(column_expr!("data_col"), Scalar::from(100)), + ); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_col"), + Scalar::from(part_val), + ), + ( + column_name!("stats_parsed.maxValues.data_col"), + Scalar::Null(DataType::INTEGER), + ), + (column_name!("is_add"), Scalar::from(is_add)), + ])); + let result = resolver.eval(&skipping_pred); + if !is_add { + assert_ne!(result, FALSE, "Remove rows must never be pruned"); } } + +// Null partition values: IS NULL / IS NOT NULL predicates on partition columns must +// correctly evaluate against null values in partitionValues_parsed. 
+#[rstest] +#[case::is_null_with_null_value( + Pred::is_null(column_expr!("part_col")), + Scalar::Null(DataType::STRING), + TRUE, + "null partition value matches IS NULL" +)] +#[case::is_null_with_non_null_value( + Pred::is_null(column_expr!("part_col")), + Scalar::from("X"), + FALSE, + "non-null partition value rejected by IS NULL" +)] +#[case::is_not_null_with_null_value( + Pred::is_not_null(column_expr!("part_col")), + Scalar::Null(DataType::STRING), + FALSE, + "null partition value rejected by IS NOT NULL" +)] +#[case::is_not_null_with_non_null_value( + Pred::is_not_null(column_expr!("part_col")), + Scalar::from("X"), + TRUE, + "non-null partition value matches IS NOT NULL" +)] +fn null_partition_value_evaluation( + #[case] pred: Pred, + #[case] part_val: Scalar, + #[case] expected: Option, + #[case] _scenario: &str, +) { + let partition_columns = test_partition_columns(); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + (column_name!("partitionValues_parsed.part_col"), part_val), + (column_name!("is_add"), Scalar::from(true)), + ])); + assert_eq!(resolver.eval(&skipping_pred), expected); +} + +// Multiple partition columns: predicates referencing two partition columns should both +// rewrite to partitionValues_parsed and both get is_add guards. +#[test] +fn multiple_partition_columns_rewrite_and_evaluation() { + let partition_columns: HashSet = + ["part_a", "part_b"].iter().map(|s| s.to_string()).collect(); + + let pred = Pred::and( + Pred::eq(column_expr!("part_a"), Scalar::from("X")), + Pred::eq(column_expr!("part_b"), Scalar::from("Y")), + ); + let skipping_pred = as_data_skipping_predicate_with_partitions(&pred, &partition_columns) + .expect("should exist"); + let pred_str = skipping_pred.to_string(); + assert!( + pred_str.contains("partitionValues_parsed.part_a"), + "Should reference partitionValues_parsed.part_a, got {pred_str}" + ); + assert!( + pred_str.contains("partitionValues_parsed.part_b"), + "Should reference partitionValues_parsed.part_b, got {pred_str}" + ); + assert!( + !pred_str.contains("stats_parsed"), + "Should not reference stats_parsed for partition-only pred, got {pred_str}" + ); + + // Both match -> kept + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_a"), + Scalar::from("X"), + ), + ( + column_name!("partitionValues_parsed.part_b"), + Scalar::from("Y"), + ), + (column_name!("is_add"), Scalar::from(true)), + ])); + assert_eq!(resolver.eval(&skipping_pred), TRUE); + + // First misses -> pruned + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_a"), + Scalar::from("Z"), + ), + ( + column_name!("partitionValues_parsed.part_b"), + Scalar::from("Y"), + ), + (column_name!("is_add"), Scalar::from(true)), + ])); + assert_eq!(resolver.eval(&skipping_pred), FALSE); + + // Remove row: both miss but is_add=false -> kept via guard + let resolver = DefaultKernelPredicateEvaluator::from(HashMap::from_iter([ + ( + column_name!("partitionValues_parsed.part_a"), + Scalar::from("Z"), + ), + ( + column_name!("partitionValues_parsed.part_b"), + Scalar::from("W"), + ), + (column_name!("is_add"), Scalar::from(false)), + ])); + assert_ne!( + resolver.eval(&skipping_pred), + FALSE, + "Remove must not be pruned" + ); +} + +// Without normalization, `AND([unknown])` would become `AND([NULL])` 
via +// `collect_junction_preds`, which evaluates to `Some(false)` under `eval_sql_where` and +// incorrectly prunes all row groups. The junction constructor normalizes `AND([unknown])` +// to just `unknown`, which correctly returns `None` (no pushdown). +#[test] +fn single_unsupported_pred_in_junction_disables_checkpoint_pushdown() { + let pred = Pred::and_from([Pred::unknown("unsupported")]); + let skipping_pred = as_checkpoint_skipping_predicate(&pred, &[]); + assert!( + skipping_pred.is_none(), + "Single unsupported predicate in a junction should disable pushdown, got: {skipping_pred:?}" + ); +} + +// -- Integration tests: end-to-end data skipping with real tables ------------------- +// +// Two test tables are used: +// +// `app-txn-checkpoint` (4 files, partitioned by `modified` (string)): +// - 2 files: modified="2021-02-01", value in [4, 11] +// - 2 files: modified="2021-02-02", value in [1, 3] +// - Version 0 (JSON) + version 1 (JSON + checkpoint) exercises both code paths. +// +// `parsed-stats` (6 files, non-partitioned): +// - File 1-6: id ranges [1,100]..[501,600], ts_col min values 1M..11M microseconds +// - Version 3 checkpoint + versions 4-5 JSON commits. + +use std::path::PathBuf; + +use crate::engine::sync::SyncEngine; +use crate::Snapshot; + +/// Counts files selected after data skipping for the given predicate and table. +fn count_selected(table_dir: &str, pred: PredicateRef) -> usize { + let path = std::fs::canonicalize(PathBuf::from(table_dir)).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let scan = Snapshot::builder_for(url) + .build(engine.as_ref()) + .unwrap() + .scan_builder() + .with_predicate(pred) + .build() + .unwrap(); + scan.scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap() + .iter() + .flat_map(|sm| sm.scan_files.selection_vector()) + .filter(|&&s| s) + .count() +} + +const PARTITIONED_TABLE: &str = "./tests/data/app-txn-checkpoint/"; +const STATS_TABLE: &str = "./tests/data/parsed-stats/"; + +// -- Partition-only predicates (app-txn-checkpoint) --------------------------- + +#[rstest] +#[case::eq_match(Pred::eq(column_expr!("modified"), Expr::literal("2021-02-01")), 2)] +#[case::eq_no_match(Pred::eq(column_expr!("modified"), Expr::literal("2099-01-01")), 0)] +#[case::neq(Pred::ne(column_expr!("modified"), Expr::literal("2021-02-01")), 2)] +#[case::gt(Pred::gt(column_expr!("modified"), Expr::literal("2021-02-01")), 2)] +#[case::lt(Pred::lt(column_expr!("modified"), Expr::literal("2021-02-02")), 2)] +#[case::gte_all(Pred::ge(column_expr!("modified"), Expr::literal("2021-02-01")), 4)] +#[case::lte_all(Pred::le(column_expr!("modified"), Expr::literal("2021-02-02")), 4)] +#[case::range_anded( + Pred::and( + Pred::ge(column_expr!("modified"), Expr::literal("2021-02-01")), + Pred::le(column_expr!("modified"), Expr::literal("2021-02-01")), + ), + 2 +)] +fn partition_only_skipping(#[case] pred: Pred, #[case] expected: usize) { + assert_eq!(count_selected(PARTITIONED_TABLE, Arc::new(pred)), expected); +} + +// -- Data-stats-only predicates (app-txn-checkpoint) -------------------------- + +#[rstest] +#[case::gt_prunes_low(Pred::gt(column_expr!("value"), Expr::literal(9i32)), 2)] +#[case::lt_prunes_high(Pred::lt(column_expr!("value"), Expr::literal(4i32)), 2)] +#[case::gt_above_max(Pred::gt(column_expr!("value"), Expr::literal(11i32)), 0)] +#[case::le_at_max(Pred::le(column_expr!("value"), Expr::literal(11i32)), 4)] +#[case::range_anded( + Pred::and( + 
Pred::ge(column_expr!("value"), Expr::literal(1i32)), + Pred::le(column_expr!("value"), Expr::literal(3i32)), + ), + 2 +)] +fn data_stats_only_skipping(#[case] pred: Pred, #[case] expected: usize) { + assert_eq!(count_selected(PARTITIONED_TABLE, Arc::new(pred)), expected); +} + +// -- Mixed AND: both partition and data conditions must hold ------------------- + +#[rstest] +#[case::partition_match_data_match( + "2021-02-01", + 3i32, + 2, + "partition prunes 02-02; data keeps 02-01 (max=11 > 3)" +)] +#[case::partition_match_data_miss( + "2021-02-02", + 3i32, + 0, + "partition keeps 02-02 but max=3 NOT >3; partition prunes 02-01" +)] +#[case::partition_miss("2099-01-01", 0i32, 0, "no files match partition")] +fn mixed_and_skipping( + #[case] partition_val: &str, + #[case] data_threshold: i32, + #[case] expected: usize, + #[case] _scenario: &str, +) { + let pred = Arc::new(Pred::and( + column_expr!("modified").eq(Expr::literal(partition_val)), + column_expr!("value").gt(Expr::literal(data_threshold)), + )); + assert_eq!(count_selected(PARTITIONED_TABLE, pred), expected); +} + +// -- Mixed OR: a file survives if either leg matches -------------------------- + +#[rstest] +#[case::both_match("2021-02-02", 9i32, 4, "02-02 matches partition; 02-01 has max=11 > 9")] +#[case::partition_saves_some( + "2021-02-02", + 11i32, + 2, + "02-02 matches partition; 02-01 max=11 NOT >11 -> pruned" +)] +#[case::data_saves_some( + "2099-01-01", -1i32, 4, + "no partition match; all files have max >= 0 so value > -1 keeps all" +)] +#[case::both_miss( + "2099-01-01", + 11i32, + 0, + "no partition match; max=11 NOT >11 -> all pruned" +)] +fn mixed_or_skipping( + #[case] partition_val: &str, + #[case] data_threshold: impl Into, + #[case] expected: usize, + #[case] _scenario: &str, +) { + let pred = Arc::new(Pred::or( + column_expr!("modified").eq(Expr::literal(partition_val)), + column_expr!("value").gt(Expr::literal(data_threshold.into())), + )); + assert_eq!(count_selected(PARTITIONED_TABLE, pred), expected); +} + +// -- Nested AND(partition, OR(data, data)) ------------------------------------ + +#[rstest] +#[case::loose_bound(10i32, 4, "max=11 > 10 keeps 02-01; min=1 < 2 keeps 02-02")] +#[case::strict_bound(11i32, 2, "max=11 NOT >11 prunes 02-01; min=1 < 2 keeps 02-02")] +fn nested_and_or_skipping( + #[case] upper_bound: i32, + #[case] expected: usize, + #[case] _scenario: &str, +) { + let pred = Arc::new(Pred::and( + Pred::ge(column_expr!("modified"), Expr::literal("2021-02-01")), + Pred::or( + Pred::lt(column_expr!("value"), Expr::literal(2i32)), + Pred::gt(column_expr!("value"), Expr::literal(upper_bound)), + ), + )); + assert_eq!(count_selected(PARTITIONED_TABLE, pred), expected); +} + +// -- Parsed stats skipping (non-partitioned table) ---------------------------- + +#[test] +fn parsed_stats_skipping() { + // id > 400 should skip files 1-4 (max id: 100, 200, 300, 400) and keep files 5-6 + let pred = Arc::new(Pred::gt(column_expr!("id"), Expr::literal(400i64))); + assert_eq!(count_selected(STATS_TABLE, pred), 2); +} + +// -- Timestamp predicate skipping (parsed-stats table) ------------------------ +// Timestamp predicates now use max stats with a 999us adjustment for truncation. 
+// Table has 6 files with ts_col ranges: [1M,2M], [3M,4M], [5M,6M], [7M,8M], [9M,10M], [11M,12M] + +#[rstest] +#[case::bare_ts_gt_keeps_all( + // ts_col > 2M -> adjusted: max > 1,999,001 -> all 6 files have max >= 2M -> 6 + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(2_000_000))), + 6 +)] +#[case::bare_ts_lt_skips( + // ts_col < 3M -> min < 3M -> file 1 (min=1M) -> 1 + Pred::lt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(3_000_000))), + 1 +)] +#[case::and_mixed_id_and_ts( + // id > 400 keeps files 5-6; ts_col > 2M keeps all 6; AND -> 2 + Pred::and( + Pred::gt(column_expr!("id"), Expr::literal(400i64)), + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(2_000_000))), + ), + 2 +)] +#[case::or_mixed_id_and_ts( + // id > 400 keeps 5-6; ts_col > 2M keeps 1-6; OR -> 6 + Pred::or( + Pred::gt(column_expr!("id"), Expr::literal(400i64)), + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(2_000_000))), + ), + 6 +)] +#[case::and_two_ts_predicates( + // ts_col > 2M (adjusted max > 1,999,001 -> all) AND ts_col > 5M (adjusted max > 4,999,001 + // -> files 3-6) -> 4 + Pred::and( + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(2_000_000))), + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(5_000_000))), + ), + 4 +)] +#[case::or_two_ts_predicates( + // ts_col > 2M keeps all; ts_col > 5M keeps files 3-6; OR -> 6 + Pred::or( + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(2_000_000))), + Pred::gt(column_expr!("ts_col"), Expr::literal(Scalar::Timestamp(5_000_000))), + ), + 6 +)] +fn timestamp_predicate_skipping(#[case] pred: Pred, #[case] expected: usize) { + assert_eq!(count_selected(STATS_TABLE, Arc::new(pred)), expected); +} + +// -- Unsupported predicate handling (parsed-stats table) ---------------------- +// Column-column comparisons are unsupported for data skipping (no literal to infer type). +// Verifies that junctions degrade gracefully when one or both legs can't be evaluated. + +#[rstest] +#[case::bare_unsupported_returns_all( + // col > col is unsupported -> None -> keep all files + Pred::gt(column_expr!("id"), column_expr!("salary")), + 6 +)] +#[case::and_supported_with_unsupported( + // id > 400 keeps files 5-6; id > salary is unsupported; AND -> 2 + Pred::and( + Pred::gt(column_expr!("id"), Expr::literal(400i64)), + Pred::gt(column_expr!("id"), column_expr!("salary")), + ), + 2 +)] +#[case::or_supported_with_unsupported( + // id > 400 keeps 5-6; id > salary is unsupported; OR -> all 6 + Pred::or( + Pred::gt(column_expr!("id"), Expr::literal(400i64)), + Pred::gt(column_expr!("id"), column_expr!("salary")), + ), + 6 +)] +#[case::and_all_unsupported( + // Both legs unsupported -> None -> keep all 6 + Pred::and( + Pred::gt(column_expr!("id"), column_expr!("salary")), + Pred::gt(column_expr!("id"), column_expr!("age")), + ), + 6 +)] +#[case::or_all_unsupported( + // Both legs unsupported -> None -> keep all 6 + Pred::or( + Pred::gt(column_expr!("id"), column_expr!("salary")), + Pred::gt(column_expr!("id"), column_expr!("age")), + ), + 6 +)] +fn unsupported_predicate_skipping(#[case] pred: Pred, #[case] expected: usize) { + assert_eq!(count_selected(STATS_TABLE, Arc::new(pred)), expected); +} diff --git a/kernel/src/scan/field_classifiers.rs b/kernel/src/scan/field_classifiers.rs index 81611a46c4..f0092e4a2e 100644 --- a/kernel/src/scan/field_classifiers.rs +++ b/kernel/src/scan/field_classifiers.rs @@ -1,10 +1,10 @@ //! 
Field classifier implementations for different scan types (regular and CDF scans) +use crate::scan::transform_spec::FieldTransformSpec; use crate::schema::StructField; use crate::table_changes::{ CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, }; -use crate::transforms::FieldTransformSpec; /// Trait for classifying fields during StateInfo construction. Allows different scan types /// (regular, CDF) to customize field handling. Note that the default set of field handling occurs diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs index 8008087f8b..dfacf9453e 100644 --- a/kernel/src/scan/log_replay.rs +++ b/kernel/src/scan/log_replay.rs @@ -1,35 +1,97 @@ use std::clone::Clone; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::sync::{Arc, LazyLock}; +use delta_kernel_derive::internal_api; +use serde::{Deserialize, Serialize}; + use super::data_skipping::DataSkippingFilter; +use super::metrics::ScanMetrics; use super::state_info::StateInfo; use super::{PhysicalPredicate, ScanMetadata}; use crate::actions::deletion_vector::DeletionVectorDescriptor; -use crate::actions::get_log_add_schema; use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::expressions::{column_name, ColumnName, Expression, ExpressionRef, PredicateRef}; -use crate::kernel_predicates::{DefaultKernelPredicateEvaluator, KernelPredicateEvaluator as _}; -use crate::log_replay::{ActionsBatch, FileActionDeduplicator, FileActionKey, LogReplayProcessor}; +use crate::expressions::{ + column_expr, column_expr_ref, column_name, ColumnName, Expression, ExpressionRef, PredicateRef, +}; +use crate::log_replay::deduplicator::{CheckpointDeduplicator, Deduplicator}; +use crate::log_replay::{ + ActionsBatch, FileActionDeduplicator, FileActionKey, LogReplayProcessor, + ParallelLogReplayProcessor, +}; +use crate::log_segment::CheckpointReadInfo; +use crate::scan::transform_spec::{get_transform_expr, parse_partition_values, TransformSpec}; use crate::scan::Scalar; use crate::schema::ToSchema as _; use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; -use crate::transforms::{get_transform_expr, parse_partition_values, TransformSpec}; +use crate::table_features::ColumnMappingMode; use crate::utils::require; use crate::{DeltaResult, Engine, Error, ExpressionEvaluator}; +/// Internal serializable state (schemas, transform spec, column mapping, etc.) +/// NOTE: This is opaque to the user - it is passed through as a blob. +#[derive(serde::Serialize, serde::Deserialize, Clone)] +#[serde(deny_unknown_fields)] +struct InternalScanState { + logical_schema: Arc, + physical_schema: Arc, + predicate_schema: Option>, + transform_spec: Option>, + column_mapping_mode: ColumnMappingMode, + /// Physical stats schema for reading/parsing stats from checkpoint files + physical_stats_schema: Option, + #[serde(default)] + skip_stats: bool, + /// Physical partition schema for checkpoint partition pruning via `partitionValues_parsed` + physical_partition_schema: Option, +} + +/// Serializable processor state for distributed processing. This can be serialized using the +/// default serde serialization, or through custom serialization in the engine. +/// +/// This struct contains all the information needed to reconstruct a `ScanLogReplayProcessor` +/// on remote compute nodes, enabling distributed log replay processing. 
+///
+/// # Serialization Limitations
+///
+/// - **Opaque expressions**: Predicates containing [`Predicate::Opaque`] or expressions containing
+///   [`Expression::Opaque`] cannot be serialized using serde. Attempting to serialize state with
+///   opaque expressions will result in an error. Connectors that require opaque expression support
+///   can work around this by serializing the predicate separately using their own serialization
+///   mechanism, then reconstructing the processor state on the remote node.
+///
+/// - **Large state**: The `seen_file_keys` field can be large for tables with many commits.
+///   Connectors are free to serialize this field using their own format (e.g., more compact binary
+///   representations) rather than using the serde-based serialization.
+///
+/// [`Predicate::Opaque`]: crate::expressions::Predicate::Opaque
+/// [`Expression::Opaque`]: crate::expressions::Expression::Opaque
+#[derive(Deserialize, Serialize)]
+#[serde(deny_unknown_fields)]
+pub struct SerializableScanState {
+    /// Optional predicate for data skipping (if provided)
+    pub predicate: Option<PredicateRef>,
+    /// Opaque internal state blob
+    pub internal_state_blob: Vec<u8>,
+    /// Set of file action keys that have already been processed.
+    pub seen_file_keys: HashSet<FileActionKey>,
+    /// Information about checkpoint reading for stats optimization
+    pub(crate) checkpoint_info: CheckpointReadInfo,
+}
+
 /// [`ScanLogReplayProcessor`] performs log replay (processes actions) specifically for doing a table scan.
 ///
 /// During a table scan, the processor reads batches of log actions (in reverse chronological order)
 /// and performs the following steps:
 ///
 /// - Data Skipping: Applies a predicate-based filter (via [`DataSkippingFilter`]) to quickly skip
-/// files that are irrelevant for the query.
-/// - Partition Pruning: Uses an optional partition filter (extracted from a physical predicate)
-/// to exclude actions whose partition values do not meet the required criteria.
+/// files that are irrelevant for the query. This includes both data column stats (min/max/nullCount)
+/// and partition value filtering in a single columnar pass. A secondary row-level partition filter
+/// catches remaining files the columnar pass cannot prune (e.g. null partition values where
+/// null-safety conservatively keeps them).
 /// - Action Deduplication: Leverages the [`FileActionDeduplicator`] to ensure that for each unique file
 /// (identified by its path and deletion vector unique ID), only the latest valid Add action is processed.
-/// - Transformation: Applies a built-in transformation (`add_transform`) to convert selected Add actions
+/// - Transformation: Applies a built-in transformation (`log_transform` or `checkpoint_transform`) to convert selected Add actions
 /// into [`ScanMetadata`], the intermediate format passed to the engine.
 /// - Row Transform Passthrough: Any user-provided row-level transformation expressions (e.g. those derived
 /// from projection or filters) are preserved and passed through to the engine, which applies them as part
@@ -40,49 +102,277 @@ use crate::{DeltaResult, Engine, Error, ExpressionEvaluator};
 /// produces a [`ScanMetadata`] result. This result includes the transformed batch, a selection
 /// vector indicating which rows are valid, and any row-level transformation expressions that need
 /// to be applied to the selected rows.
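// Minimal sketch of the intended round trip, assuming the default serde_json serialization;
// `processor` is an existing ScanLogReplayProcessor and `executor_engine` stands in for
// whatever `Engine` the remote node provides (both placeholder names).
//
// Driver side: freeze the processor into a serializable blob.
let state = processor.into_serializable_state()?;
let bytes = serde_json::to_vec(&state)
    .map_err(|e| Error::generic(format!("failed to serialize scan state: {e}")))?;
// ... ship `bytes` to the remote node ...
// Remote side: rebuild an equivalent processor and resume log replay there.
let state: SerializableScanState = serde_json::from_slice(&bytes).map_err(Error::MalformedJson)?;
let remote_processor = ScanLogReplayProcessor::from_serializable_state(executor_engine, state)?;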
-pub(crate) struct ScanLogReplayProcessor { - partition_filter: Option, +#[allow(rustdoc::broken_intra_doc_links, rustdoc::private_intra_doc_links)] +pub struct ScanLogReplayProcessor { data_skipping_filter: Option, - add_transform: Arc, + /// Transform for log batches (commit files) - uses ParseJson for stats and MapToStruct + /// for partition values + log_transform: Arc, + /// Transform for checkpoint batches - reads pre-parsed stats_parsed and + /// partitionValues_parsed directly when available, otherwise parses from raw columns + checkpoint_transform: Arc, state_info: Arc, /// A set of (data file path, dv_unique_id) pairs that have been seen thus /// far in the log. This is used to filter out files with Remove actions as /// well as duplicate entries in the log. seen_file_keys: HashSet, + /// Skip reading file statistics. + skip_stats: bool, + /// Information about checkpoint reading for stats optimization + checkpoint_info: CheckpointReadInfo, + /// Metrics related to the scan + metrics: Arc, } impl ScanLogReplayProcessor { + // These index positions correspond to the order of columns defined in + // `selected_column_names_and_types()` + const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters + const ADD_PARTITION_VALUES_INDEX: usize = 1; // Position of "add.partitionValues" in getters + const ADD_DV_START_INDEX: usize = 2; // Start position of add deletion vector columns + const BASE_ROW_ID_INDEX: usize = 5; // Position of add.baseRowId in getters + const REMOVE_PATH_INDEX: usize = 6; // Position of "remove.path" in getters + const REMOVE_DV_START_INDEX: usize = 7; // Start position of remove deletion vector columns + /// Create a new [`ScanLogReplayProcessor`] instance - fn new(engine: &dyn Engine, state_info: Arc) -> Self { - // Extract the physical predicate from StateInfo's PhysicalPredicate enum. - // The DataSkippingFilter and partition_filter components expect the predicate - // in the format Option<(PredicateRef, SchemaRef)>, so we need to convert from - // the enum representation to the tuple format. + pub(crate) fn new( + engine: &dyn Engine, + state_info: Arc, + checkpoint_info: CheckpointReadInfo, + skip_stats: bool, + ) -> DeltaResult { + let dedup_capacity = state_info.dedup_capacity_hint(); + Self::new_with_seen_files( + engine, + state_info, + checkpoint_info, + HashSet::with_capacity(dedup_capacity), + skip_stats, + ) + } + + /// Create new [`ScanLogReplayProcessor`] with pre-populated seen_file_keys. + /// + /// This is useful when reconstructing a processor from serialized state, where the + /// seen_file_keys have already been computed during a previous phase of log replay. 
+ /// + /// # Parameters + /// - `engine`: Engine for creating evaluators and filters + /// - `state_info`: StateInfo containing schemas, transforms, and predicates + /// - `checkpoint_info`: Information about checkpoint reading for stats optimization + /// - `seen_file_keys`: Pre-computed set of file action keys that have been seen + /// - `skip_stats`: Skip reading file statistics + pub(crate) fn new_with_seen_files( + engine: &dyn Engine, + state_info: Arc, + checkpoint_info: CheckpointReadInfo, + seen_file_keys: HashSet, + skip_stats: bool, + ) -> DeltaResult { + let CheckpointReadInfo { + has_stats_parsed, + has_partition_values_parsed, + checkpoint_read_schema, + } = checkpoint_info.clone(); + + // Create metrics first so we can pass them to DataSkippingFilter + let metrics = Arc::new(ScanMetrics::default()); + + // Extract the physical predicate for data skipping and partition filtering. + // DataSkippingFilter expects Option<(PredicateRef, SchemaRef)>. let physical_predicate = match &state_info.physical_predicate { - PhysicalPredicate::Some(predicate, schema) => { - // Valid predicate that can be used for data skipping and partition filtering - Some((predicate.clone(), schema.clone())) - } - PhysicalPredicate::StaticSkipAll => { - debug_assert!(false, "StaticSkipAll case should be handled at a higher level and not reach this code"); - None - } - PhysicalPredicate::None => { - // No predicate provided - None + PhysicalPredicate::Some(predicate, schema) => Some((predicate.clone(), schema.clone())), + _ => None, + }; + + // When skip_stats is enabled, disable both data column skipping and partition pruning. + // Both rely on the same DataSkippingFilter columnar pass, so they are controlled together. + let (stats_schema_for_transform, partition_schema_for_transform) = if skip_stats { + (None, None) + } else { + ( + state_info.physical_stats_schema.clone(), + state_info.physical_partition_schema.clone(), + ) + }; + + let output_schema = scan_row_schema_with_parsed_columns( + stats_schema_for_transform.clone(), + partition_schema_for_transform.clone(), + ); + + // Create data skipping filter that reads stats_parsed and partitionValues_parsed + // from the transformed batch. This avoids double JSON parsing -- the transform parses + // JSON once, then data skipping reads the already-parsed columns from the output. + // + // When partition columns are referenced by the predicate, the filter receives a + // partition schema + expression to extract typed partition values. Since the transform + // already produces `partitionValues_parsed`, the filter reads that column directly. 
+ let data_skipping_filter = if skip_stats { + None + } else { + DataSkippingFilter::new( + engine, + physical_predicate.as_ref().map(|(p, _)| p.clone()), + stats_schema_for_transform.as_ref(), + column_expr_ref!("stats_parsed"), + partition_schema_for_transform.as_ref(), + column_expr_ref!("partitionValues_parsed"), + output_schema.clone(), + Some(metrics.clone()), + ) + }; + + Ok(Self { + data_skipping_filter, + // Log transform: parse JSON for stats, MapToStruct for partition values + log_transform: engine.evaluation_handler().new_expression_evaluator( + checkpoint_read_schema.clone(), + get_add_transform_expr( + stats_schema_for_transform.clone(), + false, + skip_stats, + partition_schema_for_transform.clone(), + false, + ), + output_schema.clone().into(), + )?, + // Checkpoint transform: read pre-parsed columns directly when available + checkpoint_transform: engine.evaluation_handler().new_expression_evaluator( + checkpoint_read_schema, + get_add_transform_expr( + stats_schema_for_transform, + has_stats_parsed, + skip_stats, + partition_schema_for_transform, + has_partition_values_parsed, + ), + output_schema.into(), + )?, + seen_file_keys, + state_info, + skip_stats, + checkpoint_info, + metrics, + }) + } + + /// Get a reference to the checkpoint info. + pub(crate) fn checkpoint_info(&self) -> &CheckpointReadInfo { + &self.checkpoint_info + } + + pub(crate) fn get_metrics(&self) -> &ScanMetrics { + self.metrics.as_ref() + } + + /// Serialize the processor state for distributed processing. + /// + /// Consumes the processor and returns a `SerializableScanState` containing: + /// - The predicate (if any) for data skipping + /// - An opaque internal state blob (schemas, transform spec, column mapping mode) + /// - The set of seen file keys including their deletion vector information + /// + /// The returned state can be used with `from_serializable_state` to reconstruct the + /// processor on remote compute nodes. + /// + /// WARNING: The SerializableScanState may only be deserialized using an equal binary version + /// of delta-kernel-rs. Using different versions for serialization and deserialization leads to + /// undefined behaviour! + #[internal_api] + #[allow(unused)] + pub(crate) fn into_serializable_state(self) -> DeltaResult { + let StateInfo { + logical_schema, + physical_schema, + physical_predicate, + transform_spec, + column_mapping_mode, + physical_stats_schema, + physical_partition_schema, + } = self.state_info.as_ref().clone(); + + // Extract predicate from PhysicalPredicate + let (predicate, predicate_schema) = match physical_predicate { + PhysicalPredicate::Some(pred, schema) => (Some(pred), Some(schema)), + _ => (None, None), + }; + + // Serialize internal state to JSON blob (schemas, transform spec, and column mapping mode) + let internal_state = InternalScanState { + logical_schema, + physical_schema, + transform_spec, + predicate_schema, + column_mapping_mode, + physical_stats_schema, + skip_stats: self.skip_stats, + physical_partition_schema, + }; + let internal_state_blob = serde_json::to_vec(&internal_state) + .map_err(|e| Error::generic(format!("Failed to serialize internal state: {e}")))?; + + Ok(SerializableScanState { + predicate, + internal_state_blob, + seen_file_keys: self.seen_file_keys, + checkpoint_info: self.checkpoint_info, + }) + } + + /// Reconstruct a processor from serialized state. + /// + /// Creates a new processor with the provided state. 
All fields (partition_filter, + /// data_skipping_filter, log_transform, checkpoint_transform, and seen_file_keys) are + /// reconstructed from the serialized state and engine. + /// + /// # Parameters + /// - `engine`: Engine for creating evaluators and filters + /// - `state`: The serialized state containing predicate, internal state blob, and seen file keys + /// + /// # Returns + /// A new `ScanLogReplayProcessor` wrapped in an Arc. + /// + #[internal_api] + #[allow(unused)] + pub(crate) fn from_serializable_state( + engine: &dyn Engine, + state: SerializableScanState, + ) -> DeltaResult { + // Deserialize internal state from json + let internal_state: InternalScanState = + serde_json::from_slice(&state.internal_state_blob).map_err(Error::MalformedJson)?; + + // Reconstruct PhysicalPredicate from predicate and predicate schema + let physical_predicate = match state.predicate { + Some(predicate) => { + let Some(predicate_schema) = internal_state.predicate_schema else { + return Err(Error::generic( + "Invalid serialized internal state. Expected predicate schema.", + )); + }; + PhysicalPredicate::Some(predicate, predicate_schema) } + None => PhysicalPredicate::None, }; - Self { - partition_filter: physical_predicate.as_ref().map(|(e, _)| e.clone()), - data_skipping_filter: DataSkippingFilter::new(engine, physical_predicate), - add_transform: engine.evaluation_handler().new_expression_evaluator( - get_log_add_schema().clone(), - get_add_transform_expr(), - SCAN_ROW_DATATYPE.clone(), - ), - seen_file_keys: Default::default(), + + let state_info = Arc::new(StateInfo { + logical_schema: internal_state.logical_schema, + physical_schema: internal_state.physical_schema, + physical_predicate, + transform_spec: internal_state.transform_spec, + column_mapping_mode: internal_state.column_mapping_mode, + physical_stats_schema: internal_state.physical_stats_schema, + physical_partition_schema: internal_state.physical_partition_schema, + }); + + Self::new_with_seen_files( + engine, state_info, - } + state.checkpoint_info, + state.seen_file_keys, + internal_state.skip_stats, + ) } } @@ -90,74 +380,33 @@ impl ScanLogReplayProcessor { /// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) /// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the /// first action for a given file is a remove, then that file does not show up in the result at all. 
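// Concrete illustration of the rule above (comments only, not real test data). Replay order is
// newest first:
//   remove(path = "a", dvId = None)   -> "a" recorded as seen; it will not appear in the scan
//   add(path = "a", dvId = None)      -> same (path, dvId) key already seen, ignored
//   add(path = "b", dvId = None)      -> first sighting of "b", selected for the scan
//   add(path = "b", dvId = Some("d")) -> different (path, dvId) key, also selected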
-struct AddRemoveDedupVisitor<'seen> { - deduplicator: FileActionDeduplicator<'seen>, +struct AddRemoveDedupVisitor<'a, D: Deduplicator> { + deduplicator: D, selection_vector: Vec, - logical_schema: SchemaRef, - physical_schema: SchemaRef, - transform_spec: Option>, - partition_filter: Option, + state_info: Arc, row_transform_exprs: Vec>, + metrics: &'a ScanMetrics, } -impl AddRemoveDedupVisitor<'_> { - // These index positions correspond to the order of columns defined in - // `selected_column_names_and_types()` - const ADD_PATH_INDEX: usize = 0; // Position of "add.path" in getters - const ADD_PARTITION_VALUES_INDEX: usize = 1; // Position of "add.partitionValues" in getters - const ADD_DV_START_INDEX: usize = 2; // Start position of add deletion vector columns - const BASE_ROW_ID_INDEX: usize = 5; // Position of add.baseRowId in getters - const REMOVE_PATH_INDEX: usize = 6; // Position of "remove.path" in getters - const REMOVE_DV_START_INDEX: usize = 7; // Start position of remove deletion vector columns - +impl<'a, D: Deduplicator> AddRemoveDedupVisitor<'a, D> { fn new( - seen: &mut HashSet, + deduplicator: D, selection_vector: Vec, - logical_schema: SchemaRef, - physical_schema: SchemaRef, - transform_spec: Option>, - partition_filter: Option, - is_log_batch: bool, - ) -> AddRemoveDedupVisitor<'_> { + state_info: Arc, + metrics: &'a ScanMetrics, + ) -> AddRemoveDedupVisitor<'a, D> { AddRemoveDedupVisitor { - deduplicator: FileActionDeduplicator::new( - seen, - is_log_batch, - Self::ADD_PATH_INDEX, - Self::REMOVE_PATH_INDEX, - Self::ADD_DV_START_INDEX, - Self::REMOVE_DV_START_INDEX, - ), + deduplicator, selection_vector, - logical_schema, - physical_schema, - transform_spec, - partition_filter, + state_info, row_transform_exprs: Vec::new(), + metrics, } } - fn is_file_partition_pruned( - &self, - partition_values: &HashMap, - ) -> bool { - if partition_values.is_empty() { - return false; - } - let Some(partition_filter) = &self.partition_filter else { - return false; - }; - let partition_values: HashMap<_, _> = partition_values - .values() - .map(|(k, v)| (ColumnName::new([k]), v.clone())) - .collect(); - let evaluator = DefaultKernelPredicateEvaluator::from(partition_values); - evaluator.eval_sql_where(partition_filter) == Some(false) - } - /// True if this row contains an Add action that should survive log replay. Skip it if the row /// is not an Add action, or the file has already been seen previously. - fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { + fn is_valid_add<'b>(&mut self, i: usize, getters: &[&'b dyn GetData<'b>]) -> DeltaResult { // When processing file actions, we extract path and deletion vector information based on action type: // - For Add actions: path is at index 0, followed by DV fields at indexes 2-4 // - For Remove actions (in log batches only): path is at index 5, followed by DV fields at indexes 6-8 @@ -170,25 +419,29 @@ impl AddRemoveDedupVisitor<'_> { !self.deduplicator.is_log_batch(), // skip_removes. true if this is a checkpoint batch )? else { + self.metrics.incr_non_file_actions(); return Ok(false); }; - // Apply partition pruning (to adds only) before deduplication, so that we don't waste memory - // tracking pruned files. Removes don't get pruned and we'll still have to track them. 
- // - // WARNING: It's not safe to partition-prune removes (just like it's not safe to data skip - // removes), because they are needed to suppress earlier incompatible adds we might - // encounter if the table's schema was replaced after the most recent checkpoint. - let partition_values = match &self.transform_spec { + if is_add { + self.metrics.incr_add_files_seen() + } else { + self.metrics.incr_remove_files_seen() + }; + + // Parse partition values for building the per-row transform expression. + // Partition pruning is handled by DataSkippingFilter in the columnar data skipping phase, + // so we only need to parse values here for the transform. + let partition_values = match &self.state_info.transform_spec { Some(transform) if is_add => { - let partition_values = - getters[Self::ADD_PARTITION_VALUES_INDEX].get(i, "add.partitionValues")?; - let partition_values = - parse_partition_values(&self.logical_schema, transform, &partition_values)?; - if self.is_file_partition_pruned(&partition_values) { - return Ok(false); - } - partition_values + let partition_values = getters[ScanLogReplayProcessor::ADD_PARTITION_VALUES_INDEX] + .get(i, "add.partitionValues")?; + parse_partition_values( + &self.state_info.logical_schema, + transform, + &partition_values, + self.state_info.column_mapping_mode, + )? } _ => Default::default(), }; @@ -198,15 +451,16 @@ impl AddRemoveDedupVisitor<'_> { return Ok(false); } let base_row_id: Option = - getters[Self::BASE_ROW_ID_INDEX].get_opt(i, "add.baseRowId")?; + getters[ScanLogReplayProcessor::BASE_ROW_ID_INDEX].get_opt(i, "add.baseRowId")?; let transform = self + .state_info .transform_spec .as_ref() .map(|transform| { get_transform_expr( transform, partition_values, - &self.physical_schema, + &self.state_info.physical_schema, base_row_id, ) }) @@ -216,11 +470,12 @@ impl AddRemoveDedupVisitor<'_> { self.row_transform_exprs.resize_with(i, Default::default); self.row_transform_exprs.push(transform); } + self.metrics.incr_active_add_files(); Ok(true) } } -impl RowVisitor for AddRemoveDedupVisitor<'_> { +impl RowVisitor for AddRemoveDedupVisitor<'_, D> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { // NOTE: The visitor assumes a schema with adds first and removes optionally afterward. static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { @@ -254,6 +509,8 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { } fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + let start = std::time::Instant::now(); + let is_log_batch = self.deduplicator.is_log_batch(); let expected_getters = if is_log_batch { 10 } else { 6 }; require!( @@ -269,10 +526,20 @@ impl RowVisitor for AddRemoveDedupVisitor<'_> { self.selection_vector[i] = self.is_valid_add(i, getters)?; } } + + self.metrics + .add_dedup_visitor_time_ns(start.elapsed().as_nanos() as u64); + Ok(()) } } +pub(crate) static FILE_CONSTANT_VALUES_NAME: &str = "fileConstantValues"; +pub(crate) static BASE_ROW_ID_NAME: &str = "baseRowId"; +pub(crate) static DEFAULT_ROW_COMMIT_VERSION_NAME: &str = "defaultRowCommitVersion"; +pub(crate) static CLUSTERING_PROVIDER_NAME: &str = "clusteringProvider"; +pub(crate) static TAGS_NAME: &str = "tags"; + // NB: If you update this schema, ensure you update the comment describing it in the doc comment // for `scan_row_schema` in scan/mod.rs! You'll also need to update ScanFileVisitor as the // indexes will be off, and [`get_add_transform_expr`] below to match it. 
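// For reference, the scan row produced for each selected file has roughly this shape (the
// authoritative description lives in the `scan_row_schema` doc comment in scan/mod.rs):
//   path: string, size: long, modificationTime: long, stats: string (JSON),
//   deletionVector: struct<...>,
//   fileConstantValues: struct<
//       partitionValues: map<string, string>, baseRowId: long, defaultRowCommitVersion: long,
//       tags: map<string, string>, clusteringProvider: string>
// plus optional `stats_parsed` / `partitionValues_parsed` structs when data skipping needs them.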
@@ -281,7 +548,17 @@ pub(crate) static SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); let file_constant_values = StructType::new_unchecked([ StructField::nullable("partitionValues", partition_values), - StructField::nullable("baseRowId", DataType::LONG), + StructField::nullable(BASE_ROW_ID_NAME, DataType::LONG), + StructField::nullable(DEFAULT_ROW_COMMIT_VERSION_NAME, DataType::LONG), + StructField::nullable( + "tags", + MapType::new( + DataType::STRING, + DataType::STRING, + /*valueContainsNull*/ true, + ), + ), + StructField::nullable(CLUSTERING_PROVIDER_NAME, DataType::STRING), ]); Arc::new(StructType::new_unchecked([ StructField::nullable("path", DataType::STRING), @@ -289,83 +566,285 @@ pub(crate) static SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| StructField::nullable("modificationTime", DataType::LONG), StructField::nullable("stats", DataType::STRING), StructField::nullable("deletionVector", DeletionVectorDescriptor::to_schema()), - StructField::nullable("fileConstantValues", file_constant_values), + StructField::nullable(FILE_CONSTANT_VALUES_NAME, file_constant_values), ])) }); -pub(crate) static SCAN_ROW_DATATYPE: LazyLock = - LazyLock::new(|| SCAN_ROW_SCHEMA.clone().into()); +/// Build the scan row schema with optional `stats_parsed` and `partitionValues_parsed` columns. +/// +/// When `stats_schema` is provided, adds a `stats_parsed` struct column with that schema. +/// When `partition_schema` is provided, adds a `partitionValues_parsed` struct column with that +/// schema. +fn scan_row_schema_with_parsed_columns( + stats_schema: Option, + partition_schema: Option, +) -> SchemaRef { + let needs_extra = stats_schema.is_some() || partition_schema.is_some(); + if !needs_extra { + return SCAN_ROW_SCHEMA.clone(); + } + let mut fields: Vec = SCAN_ROW_SCHEMA.fields().cloned().collect(); + if let Some(schema) = stats_schema { + fields.push(StructField::nullable( + "stats_parsed", + schema.as_ref().clone(), + )); + } + if let Some(schema) = partition_schema { + fields.push(StructField::nullable( + "partitionValues_parsed", + schema.as_ref().clone(), + )); + } + Arc::new(StructType::new_unchecked(fields)) +} -fn get_add_transform_expr() -> ExpressionRef { - use crate::expressions::column_expr_ref; - static EXPR: LazyLock = LazyLock::new(|| { - Arc::new(Expression::Struct(vec![ - column_expr_ref!("add.path"), - column_expr_ref!("add.size"), - column_expr_ref!("add.modificationTime"), - column_expr_ref!("add.stats"), - column_expr_ref!("add.deletionVector"), - Arc::new(Expression::Struct(vec![ - column_expr_ref!("add.partitionValues"), - column_expr_ref!("add.baseRowId"), - ])), - ])) - }); - EXPR.clone() +/// Build the add transform expression with optional stats and partition value parsing. +/// +/// # Parameters +/// - `physical_stats_schema`: Schema for parsing stats from JSON and for output (physical column +/// names), or None if stats should not be included in output. +/// - `has_stats_parsed`: Whether checkpoint has pre-parsed stats_parsed column. +/// - `skip_stats`: When true, replaces the stats column with a null literal, avoiding reads of the +/// raw stats JSON string from checkpoint parquet files. +/// - `partition_schema`: Schema of typed partition columns for data skipping, or None if partition +/// value parsing is not needed. +/// - `has_partition_values_parsed`: Whether checkpoint has pre-parsed partitionValues_parsed column. 
+/// +/// The transform includes `stats_parsed` only when `physical_stats_schema` is Some, +/// and `partitionValues_parsed` only when `partition_schema` is Some. +/// Stats are output using physical column names. +fn get_add_transform_expr( + physical_stats_schema: Option, + has_stats_parsed: bool, + skip_stats: bool, + partition_schema: Option, + has_partition_values_parsed: bool, +) -> ExpressionRef { + let stats_expr = if skip_stats { + Arc::new(Expression::Literal(Scalar::Null(DataType::STRING))) + } else { + column_expr_ref!("add.stats") + }; + let mut fields = vec![ + column_expr_ref!("add.path"), + column_expr_ref!("add.size"), + column_expr_ref!("add.modificationTime"), + stats_expr, + column_expr_ref!("add.deletionVector"), + Arc::new(Expression::struct_from([ + column_expr_ref!("add.partitionValues"), + column_expr_ref!("add.baseRowId"), + column_expr_ref!("add.defaultRowCommitVersion"), + column_expr_ref!("add.tags"), + column_expr_ref!("add.clusteringProvider"), + ])), + ]; + + // Add stats_parsed when stats output is requested (using physical column names) + if let Some(stats_schema) = physical_stats_schema { + let stats_parsed_expr = if has_stats_parsed { + // Checkpoint has stats_parsed column - read directly + column_expr!("add.stats_parsed") + } else { + // No stats_parsed available (JSON log files) - parse JSON + Expression::parse_json(column_expr!("add.stats"), stats_schema) + }; + fields.push(Arc::new(stats_parsed_expr)); + } + + // Add partitionValues_parsed when partition columns are needed for data skipping + if partition_schema.is_some() { + let pv_parsed_expr = if has_partition_values_parsed { + // Checkpoint has partitionValues_parsed column - read directly + column_expr!("add.partitionValues_parsed") + } else { + // No partitionValues_parsed available (JSON log files) - parse from string map + Expression::map_to_struct(column_expr!("add.partitionValues")) + }; + fields.push(Arc::new(pv_parsed_expr)); + } + + Arc::new(Expression::struct_from(fields)) } -// TODO: remove once `scan_metadata_from` is pub. +// TODO: Move this to transaction/mod.rs once `scan_metadata_from` is pub, as this is used for +// deletion vector update transformations. #[allow(unused)] pub(crate) fn get_scan_metadata_transform_expr() -> ExpressionRef { - use crate::expressions::column_expr_ref; static EXPR: LazyLock = LazyLock::new(|| { - Arc::new(Expression::Struct(vec![Arc::new(Expression::Struct( - vec![ + Arc::new(Expression::struct_from([Arc::new( + Expression::struct_from([ column_expr_ref!("path"), column_expr_ref!("fileConstantValues.partitionValues"), column_expr_ref!("size"), column_expr_ref!("modificationTime"), column_expr_ref!("stats"), + column_expr_ref!("fileConstantValues.tags"), column_expr_ref!("deletionVector"), column_expr_ref!("fileConstantValues.baseRowId"), - ], - ))])) + column_expr_ref!("fileConstantValues.defaultRowCommitVersion"), + column_expr_ref!("fileConstantValues.clusteringProvider"), + ]), + )])) }); EXPR.clone() } +impl ParallelLogReplayProcessor for ScanLogReplayProcessor { + type Output = ::Output; + + // WARNING: This function performs all the same operations as [`::process_actions_batch`]! (See trait impl block below) Any changes + // performed to this function probably also need to be applied to the other copy of the + // function. The copy exists because [`LogReplayProcessor`] requires a `&mut self`, while + // [`ParallelLogReplayProcessor`] requires `&self`. Presently, the different in mutabilities + // cannot easily be unified. 
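// For orientation, the two transforms built by `get_add_transform_expr` above and applied in
// `process_actions_batch` (both variants below) have roughly these shapes:
//   commit (JSON) batches:  stats_parsed           = parse_json(add.stats, stats_schema)
//                           partitionValues_parsed = map_to_struct(add.partitionValues)
//   checkpoint batches:     stats_parsed           = add.stats_parsed, read directly when the
//                           checkpoint has it (otherwise the same JSON parse as above), and
//                           likewise for add.partitionValues_parsed.
// When `skip_stats` is set, the `stats` output column becomes a NULL literal and neither parsed
// column is produced.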
+ fn process_actions_batch(&self, actions_batch: ActionsBatch) -> DeltaResult { + let ActionsBatch { + actions, + is_log_batch, + } = actions_batch; + require!( + !is_log_batch, + Error::generic("Parallel checkpoint processor may only be applied to checkpoint files") + ); + + // Step 1: Apply transform FIRST (parses JSON once, outputs stats_parsed). + // This is done before data skipping so we can read the already-parsed stats. + // We use the checkpoint_transform because we checked above that we're reading a checkpoint. + let transformed = self.checkpoint_transform.evaluate(actions.as_ref())?; + debug_assert_eq!(transformed.len(), actions.len()); + require!( + transformed.len() == actions.len(), + Error::internal_error(format!( + "checkpoint transform output length {} != actions length {}", + transformed.len(), + actions.len() + )) + ); + + // Step 2: Build selection vector from TRANSFORMED batch (reads stats_parsed directly). + // This avoids double JSON parsing -- the transform already parsed the stats. + // Data skipping is safe for Remove rows: their add-side columns (stats_parsed, + // partitionValues_parsed) are null. For stats, the skipping predicate wraps comparisons + // with ISNULL guards that keep rows with missing stats. For partition values, the + // predicate is wrapped with OR(NOT is_add, pred) via guard_for_removes, so non-Add + // rows always pass the partition filter regardless of null partition values. + let selection_vector = self.build_selection_vector(transformed.as_ref())?; + debug_assert_eq!(selection_vector.len(), actions.len()); + require!( + selection_vector.len() == actions.len(), + Error::internal_error(format!( + "selection vector length {} != actions length {}", + selection_vector.len(), + actions.len() + )) + ); + + // Step 3: Run deduplication visitor on RAW batch (needs add.path, remove.path, etc.) + let deduplicator = CheckpointDeduplicator::try_new( + &self.seen_file_keys, + Self::ADD_PATH_INDEX, + Self::ADD_DV_START_INDEX, + )?; + let mut visitor = AddRemoveDedupVisitor::new( + deduplicator, + selection_vector, + self.state_info.clone(), + &self.metrics, + ); + visitor.visit_rows_of(actions.as_ref())?; + + // Step 4: Return transformed batch with updated selection vector + let scan_metadata = ScanMetadata::try_new( + transformed, + visitor.selection_vector, + visitor.row_transform_exprs, + )?; + self.metrics + .update_peak_hash_set_size(self.seen_file_keys.len()); + Ok(scan_metadata) + } +} + impl LogReplayProcessor for ScanLogReplayProcessor { type Output = ScanMetadata; + // WARNING: This function performs all the same operations as [`::process_actions_batch`]! Any changes performed to this function + // probably also need to be applied to the other copy. The copy exists because + // [`LogReplayProcessor`] requires a `&mut self`, while [`ParallelLogReplayProcessor`] requires + // `&self`. Presently, the different in mutabilities cannot easily be unified. fn process_actions_batch(&mut self, actions_batch: ActionsBatch) -> DeltaResult { let ActionsBatch { actions, is_log_batch, } = actions_batch; - // Build an initial selection vector for the batch which has had the data skipping filter - // applied. The selection vector is further updated by the deduplication visitor to remove - // rows that are not valid adds. 
- let selection_vector = self.build_selection_vector(actions.as_ref())?; - assert_eq!(selection_vector.len(), actions.len()); - let mut visitor = AddRemoveDedupVisitor::new( + // Step 1: Apply transform FIRST (outputs stats_parsed and partitionValues_parsed). + // Use the correct transform based on batch type: + // - Log batches: parse JSON for stats, MapToStruct for partition values + // - Checkpoint batches: read pre-parsed columns directly when available + let transform = if is_log_batch { + &self.log_transform + } else { + &self.checkpoint_transform + }; + let transformed = transform.evaluate(actions.as_ref())?; + require!( + transformed.len() == actions.len(), + Error::internal_error(format!( + "transform output length {} != actions length {}", + transformed.len(), + actions.len() + )) + ); + + // Step 2: Build selection vector from TRANSFORMED batch (reads stats_parsed directly). + // This avoids double JSON parsing -- the transform already parsed the stats. + // Data skipping is safe for Remove rows: their add-side columns (stats_parsed, + // partitionValues_parsed) are null. For stats, the skipping predicate wraps comparisons + // with ISNULL guards that keep rows with missing stats. For partition values, the + // predicate is wrapped with OR(NOT is_add, pred) via guard_for_removes, so non-Add + // rows always pass the partition filter regardless of null partition values. + let selection_vector = self.build_selection_vector(transformed.as_ref())?; + debug_assert_eq!(selection_vector.len(), actions.len()); + require!( + selection_vector.len() == actions.len(), + Error::internal_error(format!( + "selection vector length {} != actions length {}", + selection_vector.len(), + actions.len() + )) + ); + + // Step 3: Run deduplication visitor on RAW batch (needs add.path, remove.path, etc.) + let deduplicator = FileActionDeduplicator::new( &mut self.seen_file_keys, - selection_vector, - self.state_info.logical_schema.clone(), - self.state_info.physical_schema.clone(), - self.state_info.transform_spec.clone(), - self.partition_filter.clone(), is_log_batch, + Self::ADD_PATH_INDEX, + Self::REMOVE_PATH_INDEX, + Self::ADD_DV_START_INDEX, + Self::REMOVE_DV_START_INDEX, + ); + let mut visitor = AddRemoveDedupVisitor::new( + deduplicator, + selection_vector, + self.state_info.clone(), + &self.metrics, ); visitor.visit_rows_of(actions.as_ref())?; - // TODO: Teach expression eval to respect the selection vector we just computed so carefully! - let result = self.add_transform.evaluate(actions.as_ref())?; - ScanMetadata::try_new( - result, + // Step 4: Return transformed batch with updated selection vector + let scan_metadata = ScanMetadata::try_new( + transformed, visitor.selection_vector, visitor.row_transform_exprs, - ) + )?; + self.metrics + .update_peak_hash_set_size(self.seen_file_keys.len()); + Ok(scan_metadata) } fn data_skipping_filter(&self) -> Option<&DataSkippingFilter> { @@ -374,68 +853,133 @@ impl LogReplayProcessor for ScanLogReplayProcessor { } /// Given an iterator of [`ActionsBatch`]s (batches of actions read from the log) and a predicate, -/// returns an iterator of [`ScanMetadata`]s (which includes the files to be scanned as -/// [`FilteredEngineData`] and transforms that must be applied to correctly read the data). Each row -/// that is selected in the returned `engine_data` _must_ be processed to complete the scan. +/// returns a tuple of: +/// 1. 
An iterator of [`ScanMetadata`]s (which includes the files to be scanned as +/// [`FilteredEngineData`] and transforms that must be applied to correctly read the data). +/// 2. An `Arc` containing metrics collected during log replay. +/// +/// Each row that is selected in the returned `engine_data` _must_ be processed to complete the scan. /// Non-selected rows _must_ be ignored. /// +/// When `skip_stats` is true, file statistics are not read from checkpoint parquet files and +/// columnar data skipping is disabled (no stats-based or partition-value-based pruning), but +/// row-level partition filtering still applies. +/// /// Note: The iterator of [`ActionsBatch`]s ('action_iter' parameter) must be sorted by the order of /// the actions in the log from most recent to least recent. pub(crate) fn scan_action_iter( engine: &dyn Engine, action_iter: impl Iterator>, state_info: Arc, -) -> impl Iterator> { - ScanLogReplayProcessor::new(engine, state_info).process_actions_iter(action_iter) + checkpoint_info: CheckpointReadInfo, + skip_stats: bool, +) -> DeltaResult<( + impl Iterator>, + Arc, +)> { + let processor = ScanLogReplayProcessor::new(engine, state_info, checkpoint_info, skip_stats)?; + let metrics = processor.metrics.clone(); + Ok((processor.process_actions_iter(action_iter), metrics)) } #[cfg(test)] mod tests { - use std::{collections::HashMap, sync::Arc}; + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + + use rstest::rstest; use crate::actions::get_commit_schema; - use crate::expressions::{BinaryExpressionOp, Scalar, VariadicExpressionOp}; + use crate::engine::sync::SyncEngine; + use crate::expressions::{ + BinaryExpressionOp, Expression, OpaquePredicateOp, Predicate, Scalar, + ScalarExpressionEvaluator, + }; + use crate::kernel_predicates::{ + DirectDataSkippingPredicateEvaluator, DirectPredicateEvaluator, + IndirectDataSkippingPredicateEvaluator, + }; use crate::log_replay::ActionsBatch; - use crate::scan::state::{DvInfo, Stats}; + use crate::log_segment::CheckpointReadInfo; + use crate::scan::state::ScanFile; use crate::scan::state_info::tests::{ - assert_transform_spec, get_simple_state_info, get_state_info, + assert_transform_spec, get_simple_state_info, get_state_info, ROW_TRACKING_FEATURES, }; use crate::scan::state_info::StateInfo; use crate::scan::test_utils::{ add_batch_for_row_id, add_batch_simple, add_batch_with_partition_col, - add_batch_with_remove, run_with_validate_callback, + add_batch_with_remove, add_batch_with_remove_and_partition, run_with_validate_callback, }; use crate::scan::PhysicalPredicate; use crate::schema::MetadataColumnSpec; + use crate::schema::{DataType, SchemaRef, StructField, StructType}; + use crate::table_features::ColumnMappingMode; + use crate::utils::test_utils::assert_result_error_with_message; + use crate::DeltaResult; use crate::Expression as Expr; - use crate::{ - engine::sync::SyncEngine, - schema::{DataType, SchemaRef, StructField, StructType}, - ExpressionRef, + use crate::ExpressionRef; + + use super::{ + scan_action_iter, InternalScanState, ScanLogReplayProcessor, SerializableScanState, }; - use super::scan_action_iter; + fn test_checkpoint_info() -> CheckpointReadInfo { + CheckpointReadInfo::without_stats_parsed() + } + + /// A minimal opaque predicate op for testing serialization behavior + #[derive(Debug, PartialEq)] + struct OpaqueTestOp(String); + + impl OpaquePredicateOp for OpaqueTestOp { + fn name(&self) -> &str { + &self.0 + } + + fn eval_pred_scalar( + &self, + _eval_expr: &ScalarExpressionEvaluator<'_>, + 
_evaluator: &DirectPredicateEvaluator<'_>, + _exprs: &[Expr], + _inverted: bool, + ) -> DeltaResult> { + unimplemented!() + } + + fn eval_as_data_skipping_predicate( + &self, + _predicate_evaluator: &DirectDataSkippingPredicateEvaluator<'_>, + _exprs: &[Expr], + _inverted: bool, + ) -> Option { + unimplemented!() + } + + fn as_data_skipping_predicate( + &self, + _predicate_evaluator: &IndirectDataSkippingPredicateEvaluator<'_>, + _exprs: &[Expr], + _inverted: bool, + ) -> Option { + unimplemented!() + } + } // dv-info is more complex to validate, we validate that works in the test for visit_scan_files // in state.rs - fn validate_simple( - _: &mut (), - path: &str, - size: i64, - stats: Option, - _: DvInfo, - _: Option, - part_vals: HashMap, - ) { + fn validate_simple(_: &mut (), scan_file: ScanFile) { assert_eq!( - path, + scan_file.path, "part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet" ); - assert_eq!(size, 635); - assert!(stats.is_some()); - assert_eq!(stats.as_ref().unwrap().num_records, 10); - assert_eq!(part_vals.get("date"), Some(&"2017-12-10".to_string())); - assert_eq!(part_vals.get("non-existent"), None); + assert_eq!(scan_file.size, 635); + assert!(scan_file.stats.is_some()); + assert_eq!(scan_file.stats.as_ref().unwrap().num_records, 10); + assert_eq!( + scan_file.partition_values.get("date"), + Some(&"2017-12-10".to_string()) + ); + assert_eq!(scan_file.partition_values.get("non-existent"), None); } #[test] @@ -471,14 +1015,20 @@ mod tests { physical_schema: logical_schema.clone(), physical_predicate: PhysicalPredicate::None, transform_spec: None, + column_mapping_mode: ColumnMappingMode::None, + physical_stats_schema: None, + physical_partition_schema: None, }); - let iter = scan_action_iter( + let (iter, _metrics) = scan_action_iter( &SyncEngine::new(), batch .into_iter() .map(|batch| Ok(ActionsBatch::new(batch as _, true))), state_info, - ); + test_checkpoint_info(), + false, + ) + .unwrap(); for res in iter { let scan_metadata = res.unwrap(); assert!( @@ -497,13 +1047,16 @@ mod tests { let partition_cols = vec!["date".to_string()]; let state_info = get_simple_state_info(schema, partition_cols).unwrap(); let batch = vec![add_batch_with_partition_col()]; - let iter = scan_action_iter( + let (iter, _metrics) = scan_action_iter( &SyncEngine::new(), batch .into_iter() .map(|batch| Ok(ActionsBatch::new(batch as _, true))), Arc::new(state_info), - ); + test_checkpoint_info(), + false, + ) + .unwrap(); fn validate_transform(transform: Option<&ExpressionRef>, expected_date_offset: i32) { assert!(transform.is_some()); @@ -551,12 +1104,17 @@ mod tests { schema.clone(), vec![], None, + ROW_TRACKING_FEATURES, [ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", "row_id_col", ), + ( + "delta.rowTracking.materializedRowCommitVersionColumnName", + "row_commit_version_col", + ), ] .iter() .map(|(k, v)| (k.to_string(), v.to_string())) @@ -574,13 +1132,16 @@ mod tests { ); let batch = vec![add_batch_for_row_id(get_commit_schema().clone())]; - let iter = scan_action_iter( + let (iter, _metrics) = scan_action_iter( &SyncEngine::new(), batch .into_iter() .map(|batch| Ok(ActionsBatch::new(batch as _, true))), Arc::new(state_info), - ); + test_checkpoint_info(), + false, + ) + .unwrap(); for res in iter { let scan_metadata = res.unwrap(); @@ -595,21 +1156,483 @@ mod tests { assert!(row_id_transform.is_replace); assert_eq!(row_id_transform.exprs.len(), 1); let expr = &row_id_transform.exprs[0]; - let expeceted_expr = 
Arc::new(Expr::variadic( - VariadicExpressionOp::Coalesce, - vec![ - Expr::column(["row_id_col"]), - Expr::binary( - BinaryExpressionOp::Plus, - Expr::literal(42i64), - Expr::column(["row_indexes_for_row_id_0"]), - ), - ], - )); + let expeceted_expr = Arc::new(Expr::coalesce([ + Expr::column(["row_id_col"]), + Expr::binary( + BinaryExpressionOp::Plus, + Expr::literal(42i64), + Expr::column(["row_indexes_for_row_id_0"]), + ), + ])); assert_eq!(expr, &expeceted_expr); } else { panic!("Should have been a transform expression"); } } } + + #[test] + fn test_serialization_basic_state_and_dv_dropping() { + // Test basic StateInfo preservation and FileActionKey preservation + let engine = SyncEngine::new(); + let schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::new("id", DataType::INTEGER, true), + StructField::new("value", DataType::STRING, true), + ])); + let checkpoint_info = test_checkpoint_info(); + let mut processor = ScanLogReplayProcessor::new( + &engine, + Arc::new(get_simple_state_info(schema.clone(), vec![]).unwrap()), + checkpoint_info.clone(), + false, + ) + .unwrap(); + + // Add file keys with and without DV info + let key1 = crate::log_replay::FileActionKey::new("file1.parquet", None); + let key2 = crate::log_replay::FileActionKey::new("file2.parquet", Some("dv-1".to_string())); + let key3 = crate::log_replay::FileActionKey::new("file3.parquet", Some("dv-2".to_string())); + processor.seen_file_keys.insert(key1.clone()); + processor.seen_file_keys.insert(key2.clone()); + processor.seen_file_keys.insert(key3.clone()); + + let state_info = processor.state_info.clone(); + let deserialized = ScanLogReplayProcessor::from_serializable_state( + &engine, + processor.into_serializable_state().unwrap(), + ) + .unwrap(); + + // Verify StateInfo fields preserved + assert_eq!( + deserialized.state_info.logical_schema, + state_info.logical_schema + ); + assert_eq!( + deserialized.state_info.physical_schema, + state_info.physical_schema + ); + assert_eq!( + deserialized.state_info.column_mapping_mode, + state_info.column_mapping_mode + ); + + // Verify all file keys are preserved with their DV info + assert_eq!(deserialized.seen_file_keys.len(), 3); + assert!(deserialized.seen_file_keys.contains(&key1)); + assert!(deserialized.seen_file_keys.contains(&key2)); + assert!(deserialized.seen_file_keys.contains(&key3)); + } + + #[test] + fn test_serialization_with_predicate() { + // Test that PhysicalPredicate and predicate schema are preserved + let engine = SyncEngine::new(); + let schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::new("id", DataType::INTEGER, true), + StructField::new("value", DataType::STRING, true), + ])); + let predicate = Arc::new(crate::expressions::Predicate::eq( + Expr::column(["id"]), + Expr::literal(10i32), + )); + let state_info = Arc::new( + get_state_info( + schema.clone(), + vec![], + Some(predicate.clone()), + &[], // no table features + HashMap::new(), + vec![], + ) + .unwrap(), + ); + let original_pred_schema = match &state_info.physical_predicate { + PhysicalPredicate::Some(_, s) => s.clone(), + _ => panic!("Expected predicate"), + }; + let checkpoint_info = test_checkpoint_info(); + let processor = ScanLogReplayProcessor::new( + &engine, + state_info.clone(), + checkpoint_info.clone(), + false, + ) + .unwrap(); + let deserialized = ScanLogReplayProcessor::from_serializable_state( + &engine, + processor.into_serializable_state().unwrap(), + ) + .unwrap(); + + match &deserialized.state_info.physical_predicate { + 
PhysicalPredicate::Some(pred, pred_schema) => { + assert_eq!(pred.as_ref(), predicate.as_ref()); + assert_eq!(pred_schema.as_ref(), original_pred_schema.as_ref()); + } + _ => panic!("Expected PhysicalPredicate::Some"), + } + } + + #[test] + fn test_serialization_with_transforms() { + // Test transform_spec preservation (partition columns + row tracking) + let engine = SyncEngine::new(); + let schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::new("value", DataType::INTEGER, true), + StructField::new("date", DataType::DATE, true), + ])); + let state_info = Arc::new( + get_state_info( + schema, + vec!["date".to_string()], + None, + ROW_TRACKING_FEATURES, + [ + ("delta.enableRowTracking", "true"), + ( + "delta.rowTracking.materializedRowIdColumnName", + "row_id_col", + ), + ( + "delta.rowTracking.materializedRowCommitVersionColumnName", + "row_commit_version_col", + ), + ] + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + vec![("row_id", MetadataColumnSpec::RowId)], + ) + .unwrap(), + ); + let original_transform = state_info.transform_spec.clone(); + assert!(original_transform.is_some()); + let checkpoint_info = test_checkpoint_info(); + let processor = ScanLogReplayProcessor::new( + &engine, + state_info.clone(), + checkpoint_info.clone(), + false, + ) + .unwrap(); + let deserialized = ScanLogReplayProcessor::from_serializable_state( + &engine, + processor.into_serializable_state().unwrap(), + ) + .unwrap(); + assert_eq!(deserialized.state_info.transform_spec, original_transform); + } + + #[test] + fn test_serialization_column_mapping_modes() { + // Test that different ColumnMappingMode values are preserved + let engine = SyncEngine::new(); + for mode in [ + ColumnMappingMode::None, + ColumnMappingMode::Id, + ColumnMappingMode::Name, + ] { + let schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::new( + "id", + DataType::INTEGER, + true, + )])); + let state_info = Arc::new(StateInfo { + logical_schema: schema.clone(), + physical_schema: schema, + physical_predicate: PhysicalPredicate::None, + transform_spec: None, + column_mapping_mode: mode, + physical_stats_schema: None, + physical_partition_schema: None, + }); + let checkpoint_info = test_checkpoint_info(); + let processor = + ScanLogReplayProcessor::new(&engine, state_info, checkpoint_info.clone(), false) + .unwrap(); + let deserialized = ScanLogReplayProcessor::from_serializable_state( + &engine, + processor.into_serializable_state().unwrap(), + ) + .unwrap(); + assert_eq!(deserialized.state_info.column_mapping_mode, mode); + } + } + + #[test] + fn test_serialization_edge_cases() { + // Test edge cases: empty seen_file_keys, no predicate, no transform_spec + let engine = SyncEngine::new(); + let checkpoint_info = test_checkpoint_info(); + let schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::new( + "id", + DataType::INTEGER, + true, + )])); + let state_info = Arc::new(StateInfo { + logical_schema: schema.clone(), + physical_schema: schema, + physical_predicate: PhysicalPredicate::None, + transform_spec: None, + column_mapping_mode: ColumnMappingMode::None, + physical_stats_schema: None, + physical_partition_schema: None, + }); + let processor = + ScanLogReplayProcessor::new(&engine, state_info, checkpoint_info.clone(), false) + .unwrap(); + let serialized = processor.into_serializable_state().unwrap(); + assert!(serialized.predicate.is_none()); + let deserialized = + ScanLogReplayProcessor::from_serializable_state(&engine, serialized).unwrap(); + 
assert_eq!(deserialized.seen_file_keys.len(), 0);
+        assert!(deserialized.state_info.transform_spec.is_none());
+    }
+
+    #[test]
+    fn test_serialization_invalid_json() {
+        // Test that invalid JSON blobs are properly rejected
+        let engine = SyncEngine::new();
+        let checkpoint_info = test_checkpoint_info();
+        let invalid_state = SerializableScanState {
+            predicate: None,
+            internal_state_blob: vec![0, 1, 2, 3, 255], // Invalid JSON
+            seen_file_keys: HashSet::new(),
+            checkpoint_info,
+        };
+        assert!(ScanLogReplayProcessor::from_serializable_state(&engine, invalid_state).is_err());
+    }
+
+    #[test]
+    fn test_serialization_missing_predicate_schema() {
+        // Test that a missing predicate_schema when a predicate exists is detected
+        let engine = SyncEngine::new();
+        let schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::new(
+            "id",
+            DataType::INTEGER,
+            true,
+        )]));
+        let checkpoint_info = test_checkpoint_info();
+        let invalid_internal_state = InternalScanState {
+            logical_schema: schema.clone(),
+            physical_schema: schema,
+            predicate_schema: None, // Missing!
+            transform_spec: None,
+            column_mapping_mode: ColumnMappingMode::None,
+            physical_stats_schema: None,
+            skip_stats: false,
+            physical_partition_schema: None,
+        };
+        let predicate = Arc::new(crate::expressions::Predicate::column(["id"]));
+        let invalid_blob = serde_json::to_vec(&invalid_internal_state).unwrap();
+        let invalid_state = SerializableScanState {
+            predicate: Some(predicate), // Predicate exists but schema is None
+            internal_state_blob: invalid_blob,
+            seen_file_keys: HashSet::new(),
+            checkpoint_info,
+        };
+        let result = ScanLogReplayProcessor::from_serializable_state(&engine, invalid_state);
+        assert!(result.is_err());
+        if let Err(e) = result {
+            assert!(e.to_string().contains("predicate schema"));
+        }
+    }
+
+    #[test]
+    fn deserialize_internal_state_with_extra_fields_fails() {
+        let schema: SchemaRef = Arc::new(StructType::new_unchecked([StructField::new(
+            "id",
+            DataType::INTEGER,
+            true,
+        )]));
+        let invalid_internal_state = InternalScanState {
+            logical_schema: schema.clone(),
+            physical_schema: schema,
+            predicate_schema: None,
+            transform_spec: None,
+            column_mapping_mode: ColumnMappingMode::None,
+            physical_stats_schema: None,
+            skip_stats: false,
+            physical_partition_schema: None,
+        };
+        let blob = serde_json::to_string(&invalid_internal_state).unwrap();
+        let mut obj: serde_json::Value = serde_json::from_str(&blob).unwrap();
+        obj["new_field"] = serde_json::json!("my_new_value");
+        let invalid_blob = obj.to_string();
+
+        let res: Result<InternalScanState, _> = serde_json::from_str(&invalid_blob);
+        assert_result_error_with_message(res, "unknown field");
+    }
+
+    #[test]
+    fn deserialize_serializable_scan_state_with_extra_fields_fails() {
+        let state = SerializableScanState {
+            predicate: None,
+            internal_state_blob: vec![],
+            seen_file_keys: HashSet::new(),
+            checkpoint_info: test_checkpoint_info(),
+        };
+        let blob = serde_json::to_string(&state).unwrap();
+        let mut obj: serde_json::Value = serde_json::from_str(&blob).unwrap();
+        obj["new_field"] = serde_json::json!("my_new_value");
+        let invalid_blob = obj.to_string();
+
+        let res: Result<SerializableScanState, _> = serde_json::from_str(&invalid_blob);
+        assert_result_error_with_message(res, "unknown field");
+    }
+
+    #[test]
+    fn serializing_scan_state_with_opaque_predicate_fails() {
+        // Opaque predicates cannot be serialized. Connectors requiring opaque expression support
+        // must serialize the predicate separately using their own mechanism.
+ + // Create an opaque predicate + let opaque_predicate = Arc::new(Predicate::opaque(OpaqueTestOp("test_op".to_string()), [])); + + // Directly create a SerializableScanState with the opaque predicate + let state = SerializableScanState { + predicate: Some(opaque_predicate), + internal_state_blob: vec![], + seen_file_keys: HashSet::new(), + checkpoint_info: test_checkpoint_info(), + }; + + // Serialization should fail because opaque expressions cannot be serialized + let result = serde_json::to_string(&state); + assert_result_error_with_message(result, "Cannot serialize an Opaque Predicate"); + } + + #[test] + fn test_scan_action_iter_with_skip_stats() { + let batch = vec![add_batch_simple(get_commit_schema().clone())]; + let schema: SchemaRef = Arc::new(StructType::new_unchecked([ + StructField::new("value", DataType::INTEGER, true), + StructField::new("date", DataType::DATE, true), + ])); + let state_info = get_simple_state_info(schema, vec!["date".to_string()]).unwrap(); + + let (iter, _metrics) = scan_action_iter( + &SyncEngine::new(), + batch + .into_iter() + .map(|batch| Ok(ActionsBatch::new(batch as _, true))), + Arc::new(state_info), + test_checkpoint_info(), + true, + ) + .unwrap(); + + let mut found_add = false; + for res in iter { + let scan_metadata = res.unwrap(); + scan_metadata + .visit_scan_files((), |_: &mut (), scan_file: ScanFile| { + assert!(scan_file.stats.is_none()); + }) + .unwrap(); + found_add = true; + } + assert!(found_add); + } + + /// Verify that Remove actions are not pruned by data skipping. The transform reads from + /// `add.*` columns, so Remove rows produce null `stats_parsed` and `partitionValues_parsed` + /// (Remove actions have their own `remove.partitionValues` and `remove.stats`, but those + /// are not read by the transform). If a Remove were pruned, it would not be recorded in + /// `seen_file_keys`, and a subsequent Add for the same path could incorrectly survive + /// deduplication. + /// + /// Stats-based skipping is safe because null stats evaluate to NULL via ISNULL guards in + /// the predicate construction. Partition-based skipping requires the `is_add` guard + /// (`OR(NOT is_add, pred)`) because `eval_sql_where` adds IS NOT NULL guards that would + /// otherwise turn null partition values into `false`, filtering the Remove. 
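// Sketch of the guard described above (column names are illustrative, not exact kernel
// internals): for a partition predicate `date = '2017-12-10'` the columnar pass effectively
// evaluates
//     OR(NOT(is_add), partitionValues_parsed.date = '2017-12-10')
// An Add row with a non-matching partition value yields OR(false, false) = false and is pruned,
// while a Remove row, whose `partitionValues_parsed` is null, yields OR(true, NULL) = true and
// is kept, so it can still suppress an earlier Add for the same (path, dvId) key.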
+ #[rstest] + #[case::stats_only( + Arc::new(StructType::new_unchecked([ + StructField::new("value", DataType::INTEGER, true), + ])), + vec![], + Arc::new(Expression::column(["value"]).gt(Expression::literal(5i32))), + false, // use batch without partition column + )] + #[case::partition_predicate( + Arc::new(StructType::new_unchecked([ + StructField::new("value", DataType::INTEGER, true), + StructField::new("date", DataType::DATE, true), + ])), + vec!["date".to_string()], + Arc::new(Expression::column(["date"]).eq(Expression::literal(Scalar::Date(17_510)))), + true, // use batch with partition column + )] + #[case::mixed_stats_and_partition( + Arc::new(StructType::new_unchecked([ + StructField::new("value", DataType::INTEGER, true), + StructField::new("date", DataType::DATE, true), + ])), + vec!["date".to_string()], + Arc::new(Predicate::and( + Expression::column(["value"]).gt(Expression::literal(5i32)), + Expression::column(["date"]).eq(Expression::literal(Scalar::Date(17_510))), + )), + true, // use batch with partition column + )] + fn data_skipping_does_not_prune_remove_actions( + #[case] schema: SchemaRef, + #[case] partition_columns: Vec, + #[case] predicate: Arc, + #[case] with_partition: bool, + ) { + let state_info = get_state_info( + schema, + partition_columns, + Some(predicate), + &[], + HashMap::new(), + vec![], + ) + .unwrap(); + + // Batch: [Remove c001, Add c001, Add c000, Metadata] + // The Remove must not be pruned -- it records c001 as seen, suppressing the c001 Add. + let batch = if with_partition { + vec![add_batch_with_remove_and_partition( + get_commit_schema().clone(), + )] + } else { + vec![add_batch_with_remove(get_commit_schema().clone())] + }; + let (iter, _metrics) = scan_action_iter( + &SyncEngine::new(), + batch + .into_iter() + .map(|batch| Ok(ActionsBatch::new(batch as _, true))), + Arc::new(state_info), + test_checkpoint_info(), + false, + ) + .unwrap(); + + let mut add_paths: Vec = Vec::new(); + for res in iter { + let scan_metadata = res.unwrap(); + let paths = scan_metadata + .visit_scan_files( + Vec::new(), + |paths: &mut Vec, scan_file: ScanFile| { + paths.push(scan_file.path.to_string()); + }, + ) + .unwrap(); + add_paths.extend(paths); + } + + // Only c000 should survive: Remove suppressed c001 via deduplication + assert_eq!(add_paths.len(), 1, "Expected exactly one add to survive"); + assert!( + add_paths[0].contains("c000"), + "Expected c000 add to survive, got: {}", + add_paths[0] + ); + } } diff --git a/kernel/src/scan/metrics.rs b/kernel/src/scan/metrics.rs new file mode 100644 index 0000000000..a4990ed337 --- /dev/null +++ b/kernel/src/scan/metrics.rs @@ -0,0 +1,151 @@ +//! Metrics for scan log replay operations. + +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::time::Duration; + +use tracing::info; + +use crate::metrics::{MetricEvent, MetricId, ScanType}; + +/// Metrics collected during scan log replay. Metrics are updated and read using relaxed ordering +/// to keep updates fast across parallel executing threads. +pub(crate) struct ScanMetrics { + /// Add files seen during add remove deduplication. This does not include data skipped add + /// files. + /// Java equivalent: `addFilesCounter` + num_add_files_seen: AtomicU64, + /// Add files that survived log replay (files to read). includes files that survived + /// dataskipping, partition pruning, and add/remove deduplication. + /// Java equivalent: `activeAddFilesCounter` + num_active_add_files: AtomicU64, + /// Remove files seen (from delta/commit files only). 
+ /// Java equivalent: `removeFilesFromDeltaFilesCounter` + num_remove_files_seen: AtomicU64, + /// Non-file actions seen (protocol, metadata, etc.). + num_non_file_actions: AtomicU64, + /// Files filtered by predicates (data skipping + partition pruning). + num_predicate_filtered: AtomicU64, + /// Peak size of the deduplication hash set. + peak_hash_set_size: AtomicUsize, + /// Time spent in the deduplication visitor (ns). + dedup_visitor_time_ns: AtomicU64, + /// Time spent evaluating predicates (ns). This includes data skipping and partition pruning. + predicate_eval_time_ns: AtomicU64, +} + +impl Default for ScanMetrics { + fn default() -> Self { + Self { + num_add_files_seen: AtomicU64::new(0), + num_active_add_files: AtomicU64::new(0), + num_remove_files_seen: AtomicU64::new(0), + num_non_file_actions: AtomicU64::new(0), + num_predicate_filtered: AtomicU64::new(0), + peak_hash_set_size: AtomicUsize::new(0), + dedup_visitor_time_ns: AtomicU64::new(0), + predicate_eval_time_ns: AtomicU64::new(0), + } + } +} + +impl ScanMetrics { + pub(crate) fn incr_add_files_seen(&self) { + self.num_add_files_seen.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn incr_active_add_files(&self) { + self.num_active_add_files.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn incr_remove_files_seen(&self) { + self.num_remove_files_seen.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn incr_non_file_actions(&self) { + self.num_non_file_actions.fetch_add(1, Ordering::Relaxed); + } + + pub(crate) fn add_predicate_filtered(&self, value: u64) { + self.num_predicate_filtered + .fetch_add(value, Ordering::Relaxed); + } + + pub(crate) fn update_peak_hash_set_size(&self, value: usize) { + self.peak_hash_set_size.fetch_max(value, Ordering::Relaxed); + } + + pub(crate) fn add_dedup_visitor_time_ns(&self, duration_ns: u64) { + self.dedup_visitor_time_ns + .fetch_add(duration_ns, Ordering::Relaxed); + } + + pub(crate) fn add_predicate_eval_time_ns(&self, duration_ns: u64) { + self.predicate_eval_time_ns + .fetch_add(duration_ns, Ordering::Relaxed); + } + + /// Reset counters to zero for a new phase. + /// + /// This is used between sequential and parallel phases to get fresh metrics + /// without reconstructing the entire processor. The peak hash set size is + /// preserved since it represents a high-water mark across all phases. + pub(crate) fn reset_counters(&self) { + self.num_add_files_seen.store(0, Ordering::Relaxed); + self.num_active_add_files.store(0, Ordering::Relaxed); + self.num_remove_files_seen.store(0, Ordering::Relaxed); + self.num_non_file_actions.store(0, Ordering::Relaxed); + self.num_predicate_filtered.store(0, Ordering::Relaxed); + self.dedup_visitor_time_ns.store(0, Ordering::Relaxed); + self.predicate_eval_time_ns.store(0, Ordering::Relaxed); + } + + /// Snapshot all counters into a `MetricEvent::ScanMetadataCompleted`. + /// + /// `scan_type` identifies whether this event was emitted by full scan metadata replay or + /// by a phase of parallel scan metadata replay. 
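As a hedged illustration of how these counters are meant to be driven, the sketch below shows a simplified batch-level call pattern. The function name and its parameters are invented for exposition; the real call sites are inside the scan log-replay visitor, which uses exactly the methods defined in this module.

```rust
use std::time::Instant;

// Illustrative only: bookkeeping a replay visitor might perform for one actions batch.
fn record_one_batch(
    metrics: &ScanMetrics,
    adds_in_batch: u64,
    skipped_by_predicate: u64,
    seen_file_keys: usize,
) {
    let start = Instant::now();
    // ... run add/remove deduplication over the batch (elided) ...
    for _ in 0..adds_in_batch {
        metrics.incr_add_files_seen();
    }
    metrics.add_predicate_filtered(skipped_by_predicate);
    metrics.update_peak_hash_set_size(seen_file_keys);
    metrics.add_dedup_visitor_time_ns(start.elapsed().as_nanos() as u64);
}
```

Once all batches are processed, the accumulated counters are snapshotted via `to_event` (defined next) or emitted to the current tracing span via `log`.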
+ pub(crate) fn to_event( + &self, + operation_id: MetricId, + scan_type: ScanType, + total_duration: Duration, + ) -> MetricEvent { + MetricEvent::ScanMetadataCompleted { + operation_id, + scan_type, + total_duration, + num_add_files_seen: self.num_add_files_seen.load(Ordering::Relaxed), + num_active_add_files: self.num_active_add_files.load(Ordering::Relaxed), + num_remove_files_seen: self.num_remove_files_seen.load(Ordering::Relaxed), + num_non_file_actions: self.num_non_file_actions.load(Ordering::Relaxed), + num_predicate_filtered: self.num_predicate_filtered.load(Ordering::Relaxed), + peak_hash_set_size: self.peak_hash_set_size.load(Ordering::Relaxed), + dedup_visitor_time_ms: self.dedup_visitor_time_ns.load(Ordering::Relaxed) / 1_000_000, + predicate_eval_time_ms: self.predicate_eval_time_ns.load(Ordering::Relaxed) / 1_000_000, + } + } + + /// Log all metrics with a message in the current tracing span context. + pub(crate) fn log(&self, message: impl AsRef) { + let add_files_seen = self.num_add_files_seen.load(Ordering::Relaxed); + let active_add_files = self.num_active_add_files.load(Ordering::Relaxed); + let remove_files_seen = self.num_remove_files_seen.load(Ordering::Relaxed); + let non_file_actions = self.num_non_file_actions.load(Ordering::Relaxed); + let predicate_filtered = self.num_predicate_filtered.load(Ordering::Relaxed); + let peak_hash_set_size = self.peak_hash_set_size.load(Ordering::Relaxed); + let dedup_visitor_time_ms = self.dedup_visitor_time_ns.load(Ordering::Relaxed) / 1_000_000; + let predicate_eval_time_ms = + self.predicate_eval_time_ns.load(Ordering::Relaxed) / 1_000_000; + info!( + add_files_seen, + active_add_files, + remove_files_seen, + non_file_actions, + predicate_filtered, + peak_hash_set_size, + dedup_visitor_time_ms, + predicate_eval_time_ms, + "{}", + message.as_ref() + ); + } +} diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 7c325db99b..7e6b52a279 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -3,30 +3,44 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, LazyLock}; +use std::time::Instant; use delta_kernel_derive::internal_api; use itertools::Itertools; -use tracing::debug; +use tracing::{debug, info}; use url::Url; +use crate::metrics::MetricId; +use crate::scan::metrics::ScanMetrics; +use crate::utils::IteratorExt; + +use self::data_skipping::as_checkpoint_skipping_predicate; use self::log_replay::get_scan_metadata_transform_expr; use crate::actions::deletion_vector::{ deletion_treemap_to_bools, split_vector, DeletionVectorDescriptor, }; -use crate::actions::{get_commit_schema, ADD_NAME, REMOVE_NAME}; +use crate::actions::{get_commit_schema, Add, ADD_NAME, REMOVE_NAME}; use crate::engine_data::FilteredEngineData; -use crate::expressions::transforms::ExpressionTransform; use crate::expressions::{ColumnName, ExpressionRef, Predicate, PredicateRef, Scalar}; -use crate::kernel_predicates::{DefaultKernelPredicateEvaluator, EmptyColumnResolver}; -use crate::listed_log_files::ListedLogFiles; +use crate::kernel_predicates::{ + DefaultKernelPredicateEvaluator, EmptyColumnResolver, KernelPredicateEvaluator as _, +}; use crate::log_replay::{ActionsBatch, HasSelectionVector}; -use crate::log_segment::LogSegment; -use crate::scan::state::{DvInfo, Stats}; +use crate::log_segment::{ActionsWithCheckpointInfo, CheckpointReadInfo, LogSegment}; +use crate::log_segment_files::LogSegmentFiles; +use crate::metrics::ScanType; +use crate::parallel::sequential_phase::SequentialPhase; +use 
crate::scan::log_replay::ScanLogReplayProcessor; +use crate::scan::log_replay::{ + BASE_ROW_ID_NAME, CLUSTERING_PROVIDER_NAME, DEFAULT_ROW_COMMIT_VERSION_NAME, +}; use crate::scan::state_info::StateInfo; use crate::schema::{ - ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, + ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, StructField, StructType, ToSchema as _, }; +use crate::table_features::{ColumnMappingMode, Operation}; +use crate::transforms::{ExpressionTransform, SchemaTransform}; use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, SnapshotRef, Version}; use self::log_replay::scan_action_iter; @@ -34,26 +48,77 @@ use self::log_replay::scan_action_iter; pub(crate) mod data_skipping; pub(crate) mod field_classifiers; pub mod log_replay; +pub(crate) mod metrics; pub mod state; pub(crate) mod state_info; +pub(crate) mod transform_spec; + +#[cfg(test)] +pub(crate) mod test_utils; + +#[cfg(test)] +mod tests; // safety: we define get_commit_schema() and _know_ it contains ADD_NAME and REMOVE_NAME #[allow(clippy::unwrap_used)] -static COMMIT_READ_SCHEMA: LazyLock = LazyLock::new(|| { +pub(crate) static COMMIT_READ_SCHEMA: LazyLock = LazyLock::new(|| { get_commit_schema() .project(&[ADD_NAME, REMOVE_NAME]) .unwrap() }); // safety: we define get_commit_schema() and _know_ it contains ADD_NAME and SIDECAR_NAME #[allow(clippy::unwrap_used)] -static CHECKPOINT_READ_SCHEMA: LazyLock = +pub(crate) static CHECKPOINT_READ_SCHEMA: LazyLock = LazyLock::new(|| get_commit_schema().project(&[ADD_NAME]).unwrap()); +/// Checkpoint schema WITHOUT stats for column projection pushdown. +/// When skip_stats is enabled, we use this schema to avoid reading the stats column from parquet. +pub(crate) static CHECKPOINT_READ_SCHEMA_NO_STATS: LazyLock = LazyLock::new(|| { + let add_schema = Add::to_schema(); + let fields_no_stats: Vec<_> = add_schema + .fields() + .filter(|f| f.name() != "stats") + .cloned() + .collect(); + let add_no_stats = StructType::new_unchecked(fields_no_stats); + Arc::new(StructType::new_unchecked([StructField::nullable( + ADD_NAME, + add_no_stats, + )])) +}); + +#[allow(unused)] +pub use crate::parallel::parallel_scan_metadata::{ + AfterSequentialScanMetadata, ParallelScanMetadata, ParallelState, SequentialScanMetadata, +}; + +/// Controls how file statistics are handled during a scan. +/// +/// This enum determines whether and which statistics columns appear in scan metadata output, +/// and whether internal data skipping is enabled. +#[derive(Debug, Clone)] +pub enum StatsOutputMode { + /// Output all table stats columns in `stats_parsed`. + AllColumns, + /// Output stats for specific columns. An empty list means no stats output, but + /// predicate-based data skipping still works internally. + Columns(Vec), + /// Skip reading stats entirely. Disables data skipping. + Skip, +} + +impl Default for StatsOutputMode { + fn default() -> Self { + StatsOutputMode::Columns(Vec::new()) + } +} + /// Builder to scan a snapshot of a table. 
pub struct ScanBuilder { snapshot: SnapshotRef, schema: Option, predicate: Option, + stats_output_mode: StatsOutputMode, } impl std::fmt::Debug for ScanBuilder { @@ -61,6 +126,7 @@ impl std::fmt::Debug for ScanBuilder { f.debug_struct("ScanBuilder") .field("schema", &self.schema) .field("predicate", &self.predicate) + .field("stats_output_mode", &self.stats_output_mode) .finish() } } @@ -72,6 +138,7 @@ impl ScanBuilder { snapshot: snapshot.into(), schema: None, predicate: None, + stats_output_mode: StatsOutputMode::default(), } } @@ -104,11 +171,71 @@ impl ScanBuilder { /// /// NOTE: The filtering is best-effort and can produce false positives (rows that should should /// have been filtered out but were kept). + /// + /// This method can be combined with [`include_all_stats_columns`]. When both are used, the kernel + /// performs data skipping internally using the predicate AND outputs parsed statistics to the + /// engine via the `stats_parsed` column in scan metadata. + /// + /// [`include_all_stats_columns`]: ScanBuilder::include_all_stats_columns pub fn with_predicate(mut self, predicate: impl Into>) -> Self { self.predicate = predicate.into(); self } + /// Include all parsed statistics in scan metadata. + /// + /// When enabled, the scan will include a `stats_parsed` column in the scan metadata + /// containing pre-parsed file statistics (minValues, maxValues, nullCount, numRecords) + /// that integrations can use for their own data skipping logic. + /// + /// The statistics schema is determined by the table's configuration + /// (`delta.dataSkippingStatsColumns` or `delta.dataSkippingNumIndexedCols`). In the future, + /// a requested columns filter may limit which columns appear in the output without + /// affecting the table-level column counting. + /// + /// This method can be combined with [`with_predicate`]. When both are used, the kernel + /// performs data skipping internally using the predicate AND outputs parsed statistics to the + /// engine via the `stats_parsed` column in scan metadata. + /// + /// [`with_predicate`]: ScanBuilder::with_predicate + pub fn include_all_stats_columns(mut self) -> Self { + self.stats_output_mode = StatsOutputMode::AllColumns; + self + } + + /// Include specific columns in the scan metadata. + /// + /// When `columns` is non-empty, only those columns' statistics appear in `stats_parsed`. + /// When `columns` is empty, no stats are output (equivalent to the default behavior). + /// + /// [`with_stats_columns`]: ScanBuilder::with_stats_columns + /// [`build`]: ScanBuilder::build + pub fn with_stats_columns(mut self, columns: Vec) -> Self { + self.stats_output_mode = StatsOutputMode::Columns(columns); + self + } + + /// Skip reading file statistics from checkpoint files. + /// + /// When enabled: + /// - Parquet checkpoint reads use column projection to skip the stats column + /// - The `stats` field in scan results will be `None` + /// - Columnar data skipping is disabled (no stats-based or partition-value-based pruning), + /// but row-level partition filtering still applies + /// + /// If called after [`include_all_stats_columns`] or [`with_stats_columns`], the last call wins. + /// + /// Use this when data skipping is handled externally (e.g., by the query engine). 
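A hedged usage sketch of the three stats-related builder options follows. The table URL is a placeholder, and the element type accepted by `with_stats_columns` is assumed to be plain column-name strings, since the generic parameter is elided in this diff.

```rust
use delta_kernel::{DeltaResult, Engine, Snapshot};
use url::Url;

fn build_scans(engine: &dyn Engine, table_root: Url) -> DeltaResult<()> {
    let snapshot = Snapshot::builder_for(table_root).build(engine)?;

    // 1. All parsed stats exposed to the engine via `stats_parsed`.
    let _all_stats = snapshot
        .clone()
        .scan_builder()
        .include_all_stats_columns()
        .build()?;

    // 2. Parsed stats for selected columns only (element type assumed; see lead-in).
    let _some_stats = snapshot
        .clone()
        .scan_builder()
        .with_stats_columns(vec!["value".to_string()])
        .build()?;

    // 3. Skip reading stats entirely; data skipping is left to the engine.
    let _no_stats = snapshot.scan_builder().with_skip_stats(true).build()?;
    Ok(())
}
```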
+ /// + /// [`include_all_stats_columns`]: ScanBuilder::include_all_stats_columns + /// [`with_stats_columns`]: ScanBuilder::with_stats_columns + pub fn with_skip_stats(mut self, skip_stats: bool) -> Self { + if skip_stats { + self.stats_output_mode = StatsOutputMode::Skip; + } + self + } + /// Build the [`Scan`]. /// /// This does not scan the table at this point, but does do some work to ensure that the @@ -119,16 +246,22 @@ impl ScanBuilder { // if no schema is provided, use snapshot's entire schema (e.g. SELECT *) let logical_schema = self.schema.unwrap_or_else(|| self.snapshot.schema()); + self.snapshot + .table_configuration() + .ensure_operation_supported(Operation::Scan)?; + let state_info = StateInfo::try_new( logical_schema, self.snapshot.table_configuration(), self.predicate, - (), // No classifer, default is for scans + self.stats_output_mode.clone(), + (), // No classifier, default is for scans )?; Ok(Scan { snapshot: self.snapshot, state_info: Arc::new(state_info), + stats_output_mode: self.stats_output_mode, }) } } @@ -151,15 +284,28 @@ impl PhysicalPredicate { pub(crate) fn try_new( predicate: &Predicate, logical_schema: &Schema, + column_mapping_mode: ColumnMappingMode, ) -> DeltaResult { if can_statically_skip_all_files(predicate) { return Ok(PhysicalPredicate::StaticSkipAll); } + let unresolved_references = predicate.references(); + // Group predicate references by case-folded path so that multiple references to the + // same column with different casings (e.g., `col > 5 AND COL < 10`) all resolve + // correctly instead of one being silently dropped. + let mut folded_references: HashMap, Vec<&ColumnName>> = HashMap::new(); + for r in &unresolved_references { + let folded: Vec = r.iter().map(|s| s.to_lowercase()).collect(); + folded_references.entry(folded).or_default().push(r); + } let mut get_referenced_fields = GetReferencedFields { - unresolved_references: predicate.references(), + unresolved_references, + folded_references, column_mappings: HashMap::new(), logical_path: vec![], + folded_logical_path: vec![], physical_path: vec![], + column_mapping_mode, }; let schema_opt = get_referenced_fields.transform_struct(logical_schema); let mut unresolved = get_referenced_fields.unresolved_references.into_iter(); @@ -197,7 +343,6 @@ impl PhysicalPredicate { // the predicate allows to statically skip all files. Since this is direct evaluation (not an // expression rewrite), we use a `DefaultKernelPredicateEvaluator` with an empty column resolver. fn can_statically_skip_all_files(predicate: &Predicate) -> bool { - use crate::kernel_predicates::KernelPredicateEvaluator as _; let evaluator = DefaultKernelPredicateEvaluator::from(EmptyColumnResolver); evaluator.eval_sql_where(predicate) == Some(false) } @@ -207,23 +352,35 @@ fn can_statically_skip_all_files(predicate: &Predicate) -> bool { // mappings so we can access the correct physical stats column for each logical column. struct GetReferencedFields<'a> { unresolved_references: HashSet<&'a ColumnName>, + /// Case-folded (lowercased) column path -> all predicate column names that fold to it, + /// for O(1) case-insensitive matching. Grouped as a `Vec` so that multiple references to + /// the same column with different casings all resolve correctly. + folded_references: HashMap, Vec<&'a ColumnName>>, column_mappings: HashMap, logical_path: Vec, + /// Case-folded version of `logical_path`, maintained incrementally via push/pop to avoid + /// re-folding the entire path at every leaf field. 
+ folded_logical_path: Vec, physical_path: Vec, + column_mapping_mode: ColumnMappingMode, } impl<'a> SchemaTransform<'a> for GetReferencedFields<'a> { // Capture the path mapping for this leaf field fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { - // Record the physical name mappings for all referenced leaf columns - self.unresolved_references - .remove(self.logical_path.as_slice()) - .then(|| { - self.column_mappings.insert( - ColumnName::new(&self.logical_path), - ColumnName::new(&self.physical_path), - ); - Cow::Borrowed(ptype) - }) + // Record the physical name mappings for all referenced leaf columns. Delta column names + // are case-insensitive, so we probe the case-folded lookup map for O(1) matching. + let pred_cols = self + .folded_references + .remove(self.folded_logical_path.as_slice())?; + let physical = ColumnName::new(&self.physical_path); + for pred_col in pred_cols { + self.unresolved_references.remove(pred_col); + // Use the predicate's column name as key so ApplyColumnMappings can look it up + // by the exact name used in the predicate expression. + self.column_mappings + .insert(pred_col.clone(), physical.clone()); + } + Some(Cow::Borrowed(ptype)) } // array and map fields are not eligible for data skipping, so filter them out. @@ -235,16 +392,31 @@ impl<'a> SchemaTransform<'a> for GetReferencedFields<'a> { } fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - let physical_name = field.physical_name(); + let physical_name = field.physical_name(self.column_mapping_mode); self.logical_path.push(field.name.clone()); + self.folded_logical_path.push(field.name.to_lowercase()); self.physical_path.push(physical_name.to_string()); let field = self.recurse_into_struct_field(field); self.logical_path.pop(); + self.folded_logical_path.pop(); self.physical_path.pop(); Some(Cow::Owned(field?.with_name(physical_name))) } } +/// Prefixes all column references in a predicate with a fixed path. +/// Transforms data-skipping predicates (e.g., `minValues.x > 100`) into +/// checkpoint/sidecar-compatible predicates (e.g., `add.stats_parsed.minValues.x > 100`). +struct PrefixColumns { + prefix: ColumnName, +} + +impl<'a> ExpressionTransform<'a> for PrefixColumns { + fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option> { + Some(Cow::Owned(self.prefix.join(name))) + } +} + struct ApplyColumnMappings { column_mappings: HashMap, } @@ -258,49 +430,31 @@ impl<'a> ExpressionTransform<'a> for ApplyColumnMappings { } } -/// A vector of this type is returned from calling [`Scan::execute`]. Each [`ScanResult`] contains -/// the raw [`EngineData`] as read by the engines [`crate::ParquetHandler`], and a boolean -/// mask. Rows can be dropped from a scan due to deletion vectors, so we communicate back both -/// EngineData and information regarding whether a row should be included or not (via an internal -/// mask). See the docs below for [`ScanResult::full_mask`] for details on the mask. -pub struct ScanResult { - /// Raw engine data as read from the disk for a particular file included in the query. Note - /// that this data may include data that should be filtered out based on the mask given by - /// [`full_mask`]. - /// - /// [`full_mask`]: #method.full_mask - pub raw_data: DeltaResult>, - /// Raw row mask. - // TODO(nick) this should be allocated by the engine - pub(crate) raw_mask: Option>, -} - -impl ScanResult { - /// Returns the raw row mask. If an item at `raw_mask()[i]` is true, row `i` is - /// valid. 
Otherwise, row `i` is invalid and should be ignored. - /// - /// The raw mask is dangerous to use because it may be shorter than expected. In particular, if - /// you are using the default engine and plan to call arrow's `filter_record_batch`, you _need_ - /// to extend the mask to the full length of the batch or arrow will drop the extra - /// rows. Calling [`full_mask`] instead avoids this risk entirely, at the cost of a copy. - /// - /// [`full_mask`]: #method.full_mask - pub fn raw_mask(&self) -> Option<&Vec> { - self.raw_mask.as_ref() - } +static RESTORED_ADD_SCHEMA: LazyLock = LazyLock::new(|| { + let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); + StructType::new_unchecked(vec![StructField::nullable( + "add", + StructType::new_unchecked(vec![ + StructField::not_null("path", DataType::STRING), + StructField::not_null("partitionValues", partition_values), + StructField::not_null("size", DataType::LONG), + StructField::nullable("modificationTime", DataType::LONG), + StructField::nullable("stats", DataType::STRING), + StructField::nullable( + "tags", + MapType::new(DataType::STRING, DataType::STRING, true), + ), + StructField::nullable("deletionVector", DeletionVectorDescriptor::to_schema()), + StructField::nullable(BASE_ROW_ID_NAME, DataType::LONG), + StructField::nullable(DEFAULT_ROW_COMMIT_VERSION_NAME, DataType::LONG), + StructField::nullable(CLUSTERING_PROVIDER_NAME, DataType::STRING), + ]), + )]) + .into() +}); - /// Extends the underlying (raw) mask to match the row count of the accompanying data. - /// - /// If the raw mask is *shorter* than the number of rows returned, missing elements are - /// considered `true`, i.e. included in the query. If the mask is `None`, all rows are valid. - /// - /// NB: If you are using the default engine and plan to call arrow's `filter_record_batch`, you - /// _need_ to extend the mask to the full length of the batch or arrow will drop the extra rows. - pub fn full_mask(&self) -> Option> { - let mut mask = self.raw_mask.clone()?; - mask.resize(self.raw_data.as_ref().ok()?.len(), true); - Some(mask) - } +pub(crate) fn restored_add_schema() -> &'static SchemaRef { + &RESTORED_ADD_SCHEMA } /// utility method making it easy to get a transform for a particular row. If the requested row is @@ -359,6 +513,7 @@ impl HasSelectionVector for ScanMetadata { pub struct Scan { snapshot: SnapshotRef, state_info: Arc, + stats_output_mode: StatsOutputMode, } impl std::fmt::Debug for Scan { @@ -366,11 +521,17 @@ impl std::fmt::Debug for Scan { f.debug_struct("Scan") .field("schema", &self.state_info.logical_schema) .field("predicate", &self.state_info.physical_predicate) + .field("stats_output_mode", &self.stats_output_mode) .finish() } } impl Scan { + /// Whether stats reading is entirely skipped, disabling data skipping. + fn skip_stats(&self) -> bool { + matches!(self.stats_output_mode, StatsOutputMode::Skip) + } + /// The table's root URL. Any relative paths returned from `scan_data` (or in a callback from /// [`ScanMetadata::visit_scan_files`]) must be resolved against this root to get the actual path to /// the file. @@ -416,6 +577,12 @@ impl Scan { /// Get an iterator of [`ScanMetadata`]s that should be used to facilitate a scan. This handles /// log-replay, reconciling Add and Remove actions, and applying data skipping (if possible). + /// + /// Reports metrics: [`MetricEvent::ScanMetadataCompleted`] when the returned iterator is + /// fully exhausted. 
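Because the completion event only fires once the returned iterator is fully drained, a caller that wants these metrics must exhaust it. A hedged sketch (table setup elided, callback body elided):

```rust
use delta_kernel::{DeltaResult, Engine, Snapshot};
use url::Url;

fn drain_scan_metadata(engine: &dyn Engine, table_root: Url) -> DeltaResult<()> {
    let snapshot = Snapshot::builder_for(table_root).build(engine)?;
    let scan = snapshot.scan_builder().build()?;
    for res in scan.scan_metadata(engine)? {
        let _scan_metadata = res?;
        // ... visit scan files, plan reads, etc. (elided) ...
    }
    // Only here, after exhaustion, is MetricEvent::ScanMetadataCompleted reported to the
    // engine's metrics reporter (if any) and logged via tracing.
    Ok(())
}
```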
+ /// + /// [`MetricEvent::ScanMetadataCompleted`]: crate::metrics::MetricEvent::ScanMetadataCompleted + /// /// Each item in the returned iterator is a struct of: /// - `Box`: Data in engine format, where each row represents a file to be /// scanned. The schema for each row can be obtained by calling [`scan_row_schema`]. @@ -429,15 +596,16 @@ impl Scan { /// - `Vec>`: Transformation expressions that need to be applied. For each /// row at index `i` in the above data, if an expression exists at index `i` in the `Vec`, /// the associated expression _must_ be applied to the data read from the file specified by - /// the row. The resultant schema for this expression is guaranteed to be `Scan.schema()`. If - /// the item at index `i` in this `Vec` is `None`, or if the `Vec` contains fewer than `i` - /// elements, no expression need be applied and the data read from disk is already in the - /// correct logical state. + /// the row. The resultant schema for this expression is guaranteed to be + /// [`Self::logical_schema()`]. If the item at index `i` in this `Vec` is `None`, or if the + /// `Vec` contains fewer than `i` elements, no expression need be applied and the data read + /// from disk is already in the correct logical state. pub fn scan_metadata( &self, engine: &dyn Engine, ) -> DeltaResult>> { - self.scan_metadata_inner(engine, self.replay_for_scan_metadata(engine)?) + let actions_with_checkpoint_info = self.replay_for_scan_metadata(engine)?; + self.scan_metadata_inner(engine, actions_with_checkpoint_info) } /// Get an updated iterator of [`ScanMetadata`]s based on an existing iterator of [`EngineData`]s. @@ -490,22 +658,6 @@ impl Scan { existing_data: impl IntoIterator> + 'static, _existing_predicate: Option, ) -> DeltaResult>>> { - static RESTORED_ADD_SCHEMA: LazyLock = LazyLock::new(|| { - let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); - DataType::struct_type_unchecked(vec![StructField::nullable( - "add", - DataType::struct_type_unchecked(vec![ - StructField::not_null("path", DataType::STRING), - StructField::not_null("partitionValues", partition_values), - StructField::not_null("size", DataType::LONG), - StructField::nullable("modificationTime", DataType::LONG), - StructField::nullable("stats", DataType::STRING), - StructField::nullable("deletionVector", DeletionVectorDescriptor::to_schema()), - StructField::nullable("baseRowId", DataType::LONG), - ]), - )]) - }); - // TODO(#966): validate that the current predicate is compatible with the hint predicate. if existing_version > self.snapshot.version() { @@ -522,21 +674,32 @@ impl Scan { let transform = engine.evaluation_handler().new_expression_evaluator( scan_row_schema(), get_scan_metadata_transform_expr(), - RESTORED_ADD_SCHEMA.clone(), - ); + restored_add_schema().clone().into(), + )?; let apply_transform = move |data: Box| { Ok(ActionsBatch::new(transform.evaluate(data.as_ref())?, false)) }; + let log_segment = self.snapshot.log_segment(); + // If the snapshot version corresponds to the hint version, we process the existing data // to apply file skipping and provide the required transformations. + // Since we're only processing existing data (no checkpoint), we use the base schema + // and no stats_parsed optimization. 
if existing_version == self.snapshot.version() { - let scan = existing_data.into_iter().map(apply_transform); - return Ok(Box::new(self.scan_metadata_inner(engine, scan)?)); + let actions_with_checkpoint_info = ActionsWithCheckpointInfo { + actions: existing_data.into_iter().map(apply_transform), + checkpoint_info: CheckpointReadInfo { + has_stats_parsed: false, + has_partition_values_parsed: false, + checkpoint_read_schema: restored_add_schema().clone(), + }, + }; + return Ok(Box::new( + self.scan_metadata_inner(engine, actions_with_checkpoint_info)?, + )); } - let log_segment = self.snapshot.log_segment(); - // If the current log segment contains a checkpoint newer than the hint version // we disregard the existing data hint, and perform a full scan. The current log segment // only has deltas after the checkpoint, so we cannot update from prior versions. @@ -547,95 +710,258 @@ impl Scan { } // create a new log segment containing only the commits added after the version hint. - let mut ascending_commit_files = log_segment.ascending_commit_files.clone(); + let mut ascending_commit_files = log_segment.listed.ascending_commit_files.clone(); ascending_commit_files.retain(|f| f.version > existing_version); - let listed_log_files = ListedLogFiles { + let log_segment_files = LogSegmentFiles { ascending_commit_files, - ascending_compaction_files: vec![], - checkpoint_parts: vec![], - latest_crc_file: None, - latest_commit_file: log_segment.latest_commit_file.clone(), + latest_commit_file: log_segment.listed.latest_commit_file.clone(), + ..Default::default() }; let new_log_segment = LogSegment::try_new( - listed_log_files, + log_segment_files, log_segment.log_root.clone(), Some(log_segment.end_version), + None, // No checkpoint in this incremental segment )?; - let it = new_log_segment - .read_actions_with_projected_checkpoint_actions( - engine, - COMMIT_READ_SCHEMA.clone(), + // For incremental reads, new_log_segment has no checkpoint but we use the + // checkpoint schema returned by the function for consistency. + let (checkpoint_schema, meta_predicate) = if self.skip_stats() { + (CHECKPOINT_READ_SCHEMA_NO_STATS.clone(), None) + } else { + ( CHECKPOINT_READ_SCHEMA.clone(), - None, - )? 
- .chain(existing_data.into_iter().map(apply_transform)); + self.build_actions_meta_predicate(), + ) + }; + let result = new_log_segment.read_actions_with_projected_checkpoint_actions( + engine, + COMMIT_READ_SCHEMA.clone(), + checkpoint_schema, + meta_predicate, + self.state_info + .physical_stats_schema + .as_ref() + .map(|s| s.as_ref()), + None, + )?; + let actions_with_checkpoint_info = ActionsWithCheckpointInfo { + actions: result + .actions + .chain(existing_data.into_iter().map(apply_transform)), + checkpoint_info: result.checkpoint_info, + }; - Ok(Box::new(self.scan_metadata_inner(engine, it)?)) + Ok(Box::new(self.scan_metadata_inner( + engine, + actions_with_checkpoint_info, + )?)) } fn scan_metadata_inner( &self, engine: &dyn Engine, - action_batch_iter: impl Iterator>, + actions_with_checkpoint_info: ActionsWithCheckpointInfo< + impl Iterator>, + >, ) -> DeltaResult>> { - if let PhysicalPredicate::StaticSkipAll = self.state_info.physical_predicate { - return Ok(None.into_iter().flatten()); - } - let it = scan_action_iter(engine, action_batch_iter, self.state_info.clone()); - Ok(Some(it).into_iter().flatten()) + let start = Instant::now(); + let reporter = engine.get_metrics_reporter(); + let operation_id = MetricId::new(); + + let (iter, metrics) = match self.state_info.physical_predicate { + PhysicalPredicate::StaticSkipAll => { + info!("Predicate statically evaluated to false; skipping all files"); + (None, Arc::new(ScanMetrics::default())) + } + _ => { + let (it, m) = scan_action_iter( + engine, + actions_with_checkpoint_info.actions, + self.state_info.clone(), + actions_with_checkpoint_info.checkpoint_info, + self.skip_stats(), + )?; + (Some(it), m) + } + }; + + let on_complete = move || { + let event = metrics.to_event(operation_id, ScanType::Full, start.elapsed()); + info!(%event); + if let Some(r) = reporter { + r.report(event); + } + }; + Ok(iter.into_iter().flatten().on_complete(on_complete)) } // Factored out to facilitate testing fn replay_for_scan_metadata( &self, engine: &dyn Engine, - ) -> DeltaResult> + Send> { - // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping - // when ~every checkpoint file will contain the adds and removes we are looking for. + ) -> DeltaResult< + ActionsWithCheckpointInfo> + Send>, + > { + let (checkpoint_schema, meta_predicate) = if self.skip_stats() { + (CHECKPOINT_READ_SCHEMA_NO_STATS.clone(), None) + } else { + ( + CHECKPOINT_READ_SCHEMA.clone(), + self.build_actions_meta_predicate(), + ) + }; self.snapshot .log_segment() .read_actions_with_projected_checkpoint_actions( engine, COMMIT_READ_SCHEMA.clone(), - CHECKPOINT_READ_SCHEMA.clone(), - None, + checkpoint_schema, + meta_predicate, + self.state_info + .physical_stats_schema + .as_ref() + .map(|s| s.as_ref()), + self.state_info + .physical_partition_schema + .as_ref() + .map(|s| s.as_ref()), ) } + /// Builds a predicate for row group skipping in checkpoint and sidecar parquet files. + /// + /// The scan predicate is first transformed into a data-skipping form with IS NULL guards + /// (e.g., `x > 100` becomes `OR(maxValues.x IS NULL, maxValues.x > 100)`), then column + /// references are prefixed with `add.stats_parsed` to match the physical column layout + /// of checkpoint/sidecar files. The parquet reader's row group filter can then use + /// parquet-level statistics on these nested columns to skip entire row groups that cannot + /// contain matching files. 
+ /// + /// The IS NULL guards are necessary because parquet footer min/max statistics ignore null + /// values. Without them, row groups containing files with missing stats (null stat columns) + /// could be incorrectly pruned, since the footer min/max wouldn't reflect those files. + /// + /// Returns `None` if the scan has no predicate, no stats schema, or if the predicate is a + /// bare unsupported expression (e.g. column-column comparison). Junctions with unsupported + /// arms replace them with TRUE to conservatively prevent pruning. + fn build_actions_meta_predicate(&self) -> Option { + let PhysicalPredicate::Some(ref predicate, _) = self.state_info.physical_predicate else { + return None; + }; + self.state_info.physical_stats_schema.as_ref()?; + + let partition_columns = self + .snapshot + .table_configuration() + .metadata() + .partition_columns(); + let skipping_pred = as_checkpoint_skipping_predicate(predicate, partition_columns)?; + + let mut prefixer = PrefixColumns { + prefix: ColumnName::new(["add", "stats_parsed"]), + }; + let prefixed = prefixer.transform_pred(&skipping_pred)?; + Some(Arc::new(prefixed.into_owned())) + } + + /// Start a parallel scan metadata processing for the table. + /// + /// This method returns a [`SequentialScanMetadata`] iterator that processes commits and + /// checkpoint manifests sequentially. After exhausting this iterator, call `finish()` + /// to determine if a distributed phase is needed. + /// + /// # Example + /// + /// ```no_run + /// # use std::sync::Arc; + /// # use delta_kernel::{Engine, DeltaResult}; + /// # use delta_kernel::scan::{AfterSequentialScanMetadata, ParallelScanMetadata}; + /// # use delta_kernel::Snapshot; + /// # use url::Url; + /// # use delta_kernel::engine::default::DefaultEngineBuilder; + /// # use delta_kernel::object_store::local::LocalFileSystem; + /// # fn main() -> DeltaResult<()> { + /// let engine = Arc::new(DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())).build()); + /// let table_root = Url::parse("file:///path/to/table")?; + /// + /// // Build a snapshot + /// let snapshot = Snapshot::builder_for(table_root.clone()) + /// .at_version(5) // Optional: specify a time-travel version (default is latest version) + /// .build(engine.as_ref())?; + /// let scan = snapshot.scan_builder().build()?; + /// let mut sequential = scan.parallel_scan_metadata(engine.clone())?; + /// + /// // Process sequential phase + /// for result in sequential.by_ref() { + /// let scan_metadata = result?; + /// // Process scan metadata... + /// } + /// + /// // Check if distributed phase is needed + /// match sequential.finish()? { + /// AfterSequentialScanMetadata::Done => { + /// // All processing complete + /// } + /// AfterSequentialScanMetadata::Parallel { state, files } => { + /// // Distribute files for parallel processing (e.g., one file per worker) + /// let state = Arc::new(*state); + /// for file in files { + /// let parallel = ParallelScanMetadata::try_new( + /// engine.clone(), + /// state.clone(), + /// vec![file], + /// )?; + /// for result in parallel { + /// let scan_metadata = result?; + /// // Process scan metadata... + /// } + /// } + /// } + /// } + /// # Ok(()) + /// # } + pub fn parallel_scan_metadata( + &self, + engine: Arc, + ) -> DeltaResult { + // For the sequential/parallel phase approach, we use a conservative checkpoint_info + // since SequentialPhase reads checkpoints via CheckpointManifestReader which doesn't + // currently support stats_parsed optimization. 
+ let checkpoint_read_schema = if self.skip_stats() { + CHECKPOINT_READ_SCHEMA_NO_STATS.clone() + } else { + CHECKPOINT_READ_SCHEMA.clone() + }; + let checkpoint_info = CheckpointReadInfo { + has_stats_parsed: false, + has_partition_values_parsed: false, + checkpoint_read_schema, + }; + let processor = ScanLogReplayProcessor::new( + engine.as_ref(), + self.state_info.clone(), + checkpoint_info, + self.skip_stats(), + )?; + let sequential = + SequentialPhase::try_new(processor, self.snapshot.log_segment(), engine.clone())?; + + Ok(SequentialScanMetadata::new(sequential)) + } + /// Perform an "all in one" scan. This will use the provided `engine` to read and process all - /// the data for the query. Each [`ScanResult`] in the resultant iterator encapsulates the raw - /// data and an optional boolean vector built from the deletion vector if it was present. See - /// the documentation for [`ScanResult`] for more details. Generally connectors/engines will - /// want to use [`Scan::scan_metadata`] so they can have more control over the execution of the - /// scan. + /// the data for the query. Each [`EngineData`] in the resultant iterator is a portion of the + /// final table data. Generally connectors/engines will want to use [`Scan::scan_metadata`] so + /// they can have more control over the execution of the scan. // This calls [`Scan::scan_metadata`] to get an iterator of `ScanMetadata` actions for the scan, // and then uses the `engine`'s [`crate::ParquetHandler`] to read the actual table data. pub fn execute( &self, engine: Arc, - ) -> DeltaResult> + use<'_>> { - struct ScanFile { - path: String, - size: i64, - dv_info: DvInfo, - transform: Option, - } - fn scan_metadata_callback( - batches: &mut Vec, - path: &str, - size: i64, - _: Option, - dv_info: DvInfo, - transform: Option, - _: HashMap, - ) { - batches.push(ScanFile { - path: path.to_string(), - size, - dv_info, - transform, - }); + ) -> DeltaResult>>> { + fn scan_metadata_callback(batches: &mut Vec, file: state::ScanFile) { + batches.push(file); } debug!( @@ -655,6 +981,8 @@ impl Scan { // Iterator>> to Iterator> .flatten_ok(); + let physical_schema = self.physical_schema().clone(); + let logical_schema = self.logical_schema().clone(); let result = scan_files_iter .map(move |scan_file| -> DeltaResult<_> { let scan_file = scan_file?; @@ -678,19 +1006,36 @@ impl Scan { // TODO(#860): we disable predicate pushdown until we support row indexes. let read_result_iter = engine.parquet_handler().read_parquet_files( &[meta], - self.physical_schema().clone(), + physical_schema.clone(), None, )?; + let mut read_result_iter = read_result_iter.peekable(); + + // Only flag an empty iterator as a connector bug when stats are present and report + // a positive row count. When stats are absent we cannot distinguish a legitimate + // 0-row file from a buggy connector, so we conservatively allow it. + let expect_data = scan_file.stats.as_ref().is_some_and(|s| s.num_records > 0); + if expect_data && read_result_iter.peek().is_none() { + return Err(Error::internal_error(format!( + "ParquetHandler returned no data for file '{}'. 
This is likely a connector \ + bug -- the handler's read_parquet_files must return at least one batch for \ + each requested file that contains rows.", + scan_file.path + ))); + } + let engine = engine.clone(); // Arc clone + let physical_schema_inner = physical_schema.clone(); + let logical_schema_inner = logical_schema.clone(); Ok(read_result_iter.map(move |read_result| -> DeltaResult<_> { let read_result = read_result?; // transform the physical data into the correct logical form let logical = state::transform_to_logical( engine.as_ref(), read_result, - self.physical_schema(), - self.logical_schema(), + &physical_schema_inner, + &logical_schema_inner, scan_file.transform.clone(), // Arc clone ); let len = logical.as_ref().map_or(0, |res| res.len()); @@ -700,17 +1045,17 @@ impl Scan { // to `rest` in a moment anyway let mut sv = selection_vector.take(); let rest = split_vector(sv.as_mut(), len, None); - let result = ScanResult { - raw_data: logical, - raw_mask: sv, + let result = match sv { + Some(sv) => logical.and_then(|data| data.apply_selection_vector(sv)), + None => logical, }; selection_vector = rest; - Ok(result) + result })) }) - // Iterator>>> to Iterator>> + // Iterator>>>> to Iterator>>> .flatten_ok() - // Iterator>> to Iterator> + // Iterator>>> to Iterator>> .map(|x| x?); Ok(result) } @@ -734,7 +1079,10 @@ impl Scan { /// }, /// fileConstantValues: { /// partitionValues: map, -/// baseRowId: long +/// tags: map, +/// baseRowId: long, +/// defaultRowCommitVersion: long, +/// clusteringProvider: string, /// } /// } /// ``` @@ -751,668 +1099,3 @@ pub fn selection_vector( let dv_treemap = descriptor.read(storage, table_root)?; Ok(deletion_treemap_to_bools(dv_treemap)) } - -// some utils that are used in file_stream.rs and state.rs tests -#[cfg(test)] -pub(crate) mod test_utils { - use crate::arrow::array::StringArray; - use crate::scan::state_info::StateInfo; - use crate::schema::StructType; - use crate::utils::test_utils::string_array_to_engine_data; - use itertools::Itertools; - use std::sync::Arc; - - use crate::log_replay::ActionsBatch; - use crate::{ - actions::get_commit_schema, - engine::{ - arrow_data::ArrowEngineData, - sync::{json::SyncJsonHandler, SyncEngine}, - }, - scan::log_replay::scan_action_iter, - schema::SchemaRef, - JsonHandler, - }; - - use super::state::ScanCallback; - use super::PhysicalPredicate; - use crate::transforms::TransformSpec; - - // Generates a batch of sidecar actions with the given paths. - // The schema is provided as null columns affect equality checks. 
- pub(crate) fn sidecar_batch_with_given_paths( - paths: Vec<&str>, - output_schema: SchemaRef, - ) -> Box { - let handler = SyncJsonHandler {}; - - let mut json_strings: Vec = paths - .iter() - .map(|path| { - format!( - r#"{{"sidecar":{{"path":"{path}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"# - ) - }) - .collect(); - json_strings.push(r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#.to_string()); - - let json_strings_array: StringArray = - json_strings.iter().map(|s| s.as_str()).collect_vec().into(); - - let parsed = handler - .parse_json( - string_array_to_engine_data(json_strings_array), - output_schema, - ) - .unwrap(); - - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - // Generates a batch with an add action. - // The schema is provided as null columns affect equality checks. - pub(crate) fn add_batch_simple(output_schema: SchemaRef) -> Box { - let handler = SyncJsonHandler {}; - let json_strings: StringArray = vec![ - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, - ] - .into(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - // Generates a batch with an add action. - // The schema is provided as null columns affect equality checks. 
- pub(crate) fn add_batch_for_row_id(output_schema: SchemaRef) -> Box { - let handler = SyncJsonHandler {}; - let json_strings: StringArray = vec![ - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","baseRowId": 42, "tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableRowTracking": "true", "delta.rowTracking.materializedRowIdColumnName":"row_id_col"},"createdTime":1677811175819}}"#, - ] - .into(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - // An add batch with a removed file parsed with the schema provided - pub(crate) fn add_batch_with_remove(output_schema: SchemaRef) -> Box { - let handler = SyncJsonHandler {}; - let json_strings: StringArray = vec![ - r#"{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, - ] - .into(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - 
.unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - // add batch with a `date` partition col - pub(crate) fn add_batch_with_partition_col() -> Box { - let handler = SyncJsonHandler {}; - let json_strings: StringArray = vec![ - r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["date"],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","partitionValues": {"date": "2017-12-11"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, - r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, - r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, - ] - .into(); - let output_schema = get_commit_schema().clone(); - let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - ArrowEngineData::try_from_engine_data(parsed).unwrap() - } - - /// Create a scan action iter and validate what's called back. 
If you pass `None` as - /// `logical_schema`, `transform` should also be `None` - #[allow(clippy::vec_box)] - pub(crate) fn run_with_validate_callback( - batch: Vec>, - logical_schema: Option, - transform_spec: Option>, - expected_sel_vec: &[bool], - context: T, - validate_callback: ScanCallback, - ) { - let logical_schema = - logical_schema.unwrap_or_else(|| Arc::new(StructType::new_unchecked(vec![]))); - let state_info = Arc::new(StateInfo { - logical_schema: logical_schema.clone(), - physical_schema: logical_schema, - physical_predicate: PhysicalPredicate::None, - transform_spec, - }); - let iter = scan_action_iter( - &SyncEngine::new(), - batch - .into_iter() - .map(|batch| Ok(ActionsBatch::new(batch as _, true))), - state_info, - ); - let mut batch_count = 0; - for res in iter { - let scan_metadata = res.unwrap(); - assert_eq!( - scan_metadata.scan_files.selection_vector(), - expected_sel_vec - ); - scan_metadata - .visit_scan_files(context.clone(), validate_callback) - .unwrap(); - batch_count += 1; - } - assert_eq!(batch_count, 1); - } -} - -#[cfg(test)] -mod tests { - use std::path::PathBuf; - - use crate::arrow::array::BooleanArray; - use crate::arrow::compute::filter_record_batch; - use crate::arrow::record_batch::RecordBatch; - use crate::engine::arrow_data::ArrowEngineData; - use crate::engine::sync::SyncEngine; - use crate::expressions::{column_expr, column_pred, Expression as Expr, Predicate as Pred}; - use crate::schema::{ColumnMetadataKey, PrimitiveType, StructType}; - use crate::Snapshot; - - use super::*; - - #[test] - fn test_static_skipping() { - const NULL: Pred = Pred::null_literal(); - let test_cases = [ - (false, column_pred!("a")), - (true, Pred::literal(false)), - (false, Pred::literal(true)), - (true, NULL), - (true, Pred::and(column_pred!("a"), Pred::literal(false))), - (false, Pred::or(column_pred!("a"), Pred::literal(true))), - (false, Pred::or(column_pred!("a"), Pred::literal(false))), - (false, Pred::lt(column_expr!("a"), Expr::literal(10))), - (false, Pred::lt(Expr::literal(10), Expr::literal(100))), - (true, Pred::gt(Expr::literal(10), Expr::literal(100))), - (true, Pred::and(NULL, column_pred!("a"))), - ]; - for (should_skip, predicate) in test_cases { - assert_eq!( - can_statically_skip_all_files(&predicate), - should_skip, - "Failed for predicate: {predicate:#?}" - ); - } - } - - #[test] - fn test_physical_predicate() { - let logical_schema = StructType::new_unchecked(vec![ - StructField::nullable("a", DataType::LONG), - StructField::nullable("b", DataType::LONG).with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_b", - )]), - StructField::nullable("phys_b", DataType::LONG).with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_c", - )]), - StructField::nullable( - "nested", - StructType::new_unchecked(vec![ - StructField::nullable("x", DataType::LONG), - StructField::nullable("y", DataType::LONG).with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_y", - )]), - ]), - ), - StructField::nullable( - "mapped", - StructType::new_unchecked(vec![StructField::nullable("n", DataType::LONG) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_n", - )])]), - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_mapped", - )]), - ]); - - // NOTE: We break several column mapping rules here because they don't matter for this - // test. For example, we do not provide field ids, and not all columns have physical names. 
- let test_cases = [ - (Pred::literal(true), Some(PhysicalPredicate::None)), - (Pred::literal(false), Some(PhysicalPredicate::StaticSkipAll)), - (column_pred!("x"), None), // no such column - ( - column_pred!("a"), - Some(PhysicalPredicate::Some( - column_pred!("a").into(), - StructType::new_unchecked(vec![StructField::nullable("a", DataType::LONG)]) - .into(), - )), - ), - ( - column_pred!("b"), - Some(PhysicalPredicate::Some( - column_pred!("phys_b").into(), - StructType::new_unchecked(vec![StructField::nullable( - "phys_b", - DataType::LONG, - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_b", - )])]) - .into(), - )), - ), - ( - column_pred!("nested.x"), - Some(PhysicalPredicate::Some( - column_pred!("nested.x").into(), - StructType::new_unchecked(vec![StructField::nullable( - "nested", - StructType::new_unchecked(vec![StructField::nullable("x", DataType::LONG)]), - )]) - .into(), - )), - ), - ( - column_pred!("nested.y"), - Some(PhysicalPredicate::Some( - column_pred!("nested.phys_y").into(), - StructType::new_unchecked(vec![StructField::nullable( - "nested", - StructType::new_unchecked(vec![StructField::nullable( - "phys_y", - DataType::LONG, - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_y", - )])]), - )]) - .into(), - )), - ), - ( - column_pred!("mapped.n"), - Some(PhysicalPredicate::Some( - column_pred!("phys_mapped.phys_n").into(), - StructType::new_unchecked(vec![StructField::nullable( - "phys_mapped", - StructType::new_unchecked(vec![StructField::nullable( - "phys_n", - DataType::LONG, - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_n", - )])]), - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_mapped", - )])]) - .into(), - )), - ), - ( - Pred::and(column_pred!("mapped.n"), Pred::literal(true)), - Some(PhysicalPredicate::Some( - Pred::and(column_pred!("phys_mapped.phys_n"), Pred::literal(true)).into(), - StructType::new_unchecked(vec![StructField::nullable( - "phys_mapped", - StructType::new_unchecked(vec![StructField::nullable( - "phys_n", - DataType::LONG, - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_n", - )])]), - ) - .with_metadata([( - ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), - "phys_mapped", - )])]) - .into(), - )), - ), - ( - Pred::and(column_pred!("mapped.n"), Pred::literal(false)), - Some(PhysicalPredicate::StaticSkipAll), - ), - ]; - - for (predicate, expected) in test_cases { - let result = PhysicalPredicate::try_new(&predicate, &logical_schema).ok(); - assert_eq!( - result, expected, - "Failed for predicate: {predicate:#?}, expected {expected:#?}, got {result:#?}" - ); - } - } - - fn get_files_for_scan(scan: Scan, engine: &dyn Engine) -> DeltaResult> { - let scan_metadata_iter = scan.scan_metadata(engine)?; - fn scan_metadata_callback( - paths: &mut Vec, - path: &str, - _size: i64, - _: Option, - dv_info: DvInfo, - _transform: Option, - _partition_values: HashMap, - ) { - paths.push(path.to_string()); - assert!(dv_info.deletion_vector.is_none()); - } - let mut files = vec![]; - for res in scan_metadata_iter { - let scan_metadata = res?; - files = scan_metadata.visit_scan_files(files, scan_metadata_callback)?; - } - Ok(files) - } - - #[test] - fn test_scan_metadata_paths() { - let path = - std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); - let url = url::Url::from_directory_path(path).unwrap(); - let engine = 
SyncEngine::new(); - - let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); - let scan = snapshot.scan_builder().build().unwrap(); - let files = get_files_for_scan(scan, &engine).unwrap(); - assert_eq!(files.len(), 1); - assert_eq!( - files[0], - "part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet" - ); - } - - #[test_log::test] - fn test_scan_metadata() { - let path = - std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); - let url = url::Url::from_directory_path(path).unwrap(); - let engine = Arc::new(SyncEngine::new()); - - let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); - let scan = snapshot.scan_builder().build().unwrap(); - let files: Vec = scan.execute(engine).unwrap().try_collect().unwrap(); - - assert_eq!(files.len(), 1); - let num_rows = files[0].raw_data.as_ref().unwrap().len(); - assert_eq!(num_rows, 10) - } - - #[test_log::test] - fn test_scan_metadata_from_same_version() { - let path = - std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); - let url = url::Url::from_directory_path(path).unwrap(); - let engine = Arc::new(SyncEngine::new()); - - let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); - let version = snapshot.version(); - let scan = snapshot.scan_builder().build().unwrap(); - let files: Vec<_> = scan - .scan_metadata(engine.as_ref()) - .unwrap() - .map_ok(|ScanMetadata { scan_files, .. }| { - let (underlying_data, selection_vector) = scan_files.into_parts(); - let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) - .unwrap() - .into(); - let filtered_batch = - filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); - Box::new(ArrowEngineData::from(filtered_batch)) as Box - }) - .try_collect() - .unwrap(); - let new_files: Vec<_> = scan - .scan_metadata_from(engine.as_ref(), version, files, None) - .unwrap() - .try_collect() - .unwrap(); - - assert_eq!(new_files.len(), 1); - } - - // reading v0 with 3 files. - // updating to v1 with 3 more files added. - #[test_log::test] - fn test_scan_metadata_from_with_update() { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/basic_partitioned/")).unwrap(); - let url = url::Url::from_directory_path(path).unwrap(); - let engine = Arc::new(SyncEngine::new()); - - let snapshot = Snapshot::builder_for(url.clone()) - .at_version(0) - .build(engine.as_ref()) - .unwrap(); - let scan = snapshot.scan_builder().build().unwrap(); - let files: Vec<_> = scan - .scan_metadata(engine.as_ref()) - .unwrap() - .map_ok(|ScanMetadata { scan_files, .. }| { - let (underlying_data, selection_vector) = scan_files.into_parts(); - let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) - .unwrap() - .into(); - filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap() - }) - .try_collect() - .unwrap(); - assert_eq!(files.len(), 1); - assert_eq!(files[0].num_rows(), 3); - - let files: Vec<_> = files - .into_iter() - .map(|b| Box::new(ArrowEngineData::from(b)) as Box) - .collect(); - let snapshot = Snapshot::builder_for(url) - .at_version(1) - .build(engine.as_ref()) - .unwrap(); - let scan = snapshot.scan_builder().build().unwrap(); - let new_files: Vec<_> = scan - .scan_metadata_from(engine.as_ref(), 0, files, None) - .unwrap() - .map_ok(|ScanMetadata { scan_files, .. 
}| { - let (underlying_data, selection_vector) = scan_files.into_parts(); - let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) - .unwrap() - .into(); - filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap() - }) - .try_collect() - .unwrap(); - assert_eq!(new_files.len(), 2); - assert_eq!(new_files[0].num_rows(), 3); - assert_eq!(new_files[1].num_rows(), 3); - } - - #[test] - fn test_get_partition_value() { - let cases = [ - ( - "string", - PrimitiveType::String, - Scalar::String("string".to_string()), - ), - ("123", PrimitiveType::Integer, Scalar::Integer(123)), - ("1234", PrimitiveType::Long, Scalar::Long(1234)), - ("12", PrimitiveType::Short, Scalar::Short(12)), - ("1", PrimitiveType::Byte, Scalar::Byte(1)), - ("1.1", PrimitiveType::Float, Scalar::Float(1.1)), - ("10.10", PrimitiveType::Double, Scalar::Double(10.1)), - ("true", PrimitiveType::Boolean, Scalar::Boolean(true)), - ("2024-01-01", PrimitiveType::Date, Scalar::Date(19723)), - ("1970-01-01", PrimitiveType::Date, Scalar::Date(0)), - ( - "1970-01-01 00:00:00", - PrimitiveType::Timestamp, - Scalar::Timestamp(0), - ), - ( - "1970-01-01 00:00:00.123456", - PrimitiveType::Timestamp, - Scalar::Timestamp(123456), - ), - ( - "1970-01-01 00:00:00.123456789", - PrimitiveType::Timestamp, - Scalar::Timestamp(123456), - ), - ]; - - for (raw, data_type, expected) in &cases { - let value = crate::transforms::parse_partition_value_raw( - Some(&raw.to_string()), - &DataType::Primitive(data_type.clone()), - ) - .unwrap(); - assert_eq!(value, *expected); - } - } - - #[test] - fn test_replay_for_scan_metadata() { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); - let url = url::Url::from_directory_path(path.unwrap()).unwrap(); - let engine = SyncEngine::new(); - - let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); - let scan = snapshot.scan_builder().build().unwrap(); - let data: Vec<_> = scan - .replay_for_scan_metadata(&engine) - .unwrap() - .try_collect() - .unwrap(); - // No predicate pushdown attempted, because at most one part of a multi-part checkpoint - // could be skipped when looking for adds/removes. - // - // NOTE: Each checkpoint part is a single-row file -- guaranteed to produce one row group. - assert_eq!(data.len(), 5); - } - - #[test] - fn test_data_row_group_skipping() { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); - let url = url::Url::from_directory_path(path.unwrap()).unwrap(); - let engine = Arc::new(SyncEngine::new()); - - let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); - - // No predicate pushdown attempted, so the one data file should be returned. - // - // NOTE: The data file contains only five rows -- near guaranteed to produce one row group. - let scan = snapshot.clone().scan_builder().build().unwrap(); - let data: Vec<_> = scan.execute(engine.clone()).unwrap().try_collect().unwrap(); - assert_eq!(data.len(), 1); - - // Ineffective predicate pushdown attempted, so the one data file should be returned. 
- let int_col = column_expr!("numeric.ints.int32"); - let value = Expr::literal(1000i32); - let predicate = Arc::new(int_col.clone().gt(value.clone())); - let scan = snapshot - .clone() - .scan_builder() - .with_predicate(predicate) - .build() - .unwrap(); - let data: Vec<_> = scan.execute(engine.clone()).unwrap().try_collect().unwrap(); - assert_eq!(data.len(), 1); - - // TODO(#860): we disable predicate pushdown until we support row indexes. Update this test - // accordingly after support is reintroduced. - // - // Effective predicate pushdown, so no data files should be returned. BUT since we disabled - // predicate pushdown, the one data file is still returned. - let predicate = Arc::new(int_col.lt(value)); - let scan = snapshot - .scan_builder() - .with_predicate(predicate) - .build() - .unwrap(); - let data: Vec<_> = scan.execute(engine).unwrap().try_collect().unwrap(); - assert_eq!(data.len(), 1); - } - - #[test] - fn test_missing_column_row_group_skipping() { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); - let url = url::Url::from_directory_path(path.unwrap()).unwrap(); - let engine = Arc::new(SyncEngine::new()); - - let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); - - // Predicate over a logically valid but physically missing column. No data files should be - // returned because the column is inferred to be all-null. - // - // WARNING: https://github.com/delta-io/delta-kernel-rs/issues/434 - This - // optimization is currently disabled, so the one data file is still returned. - let predicate = Arc::new(column_expr!("missing").lt(Expr::literal(1000i64))); - let scan = snapshot - .clone() - .scan_builder() - .with_predicate(predicate) - .build() - .unwrap(); - let data: Vec<_> = scan.execute(engine.clone()).unwrap().try_collect().unwrap(); - assert_eq!(data.len(), 1); - - // Predicate over a logically missing column fails the scan - let predicate = Arc::new(column_expr!("numeric.ints.invalid").lt(Expr::literal(1000))); - snapshot - .scan_builder() - .with_predicate(predicate) - .build() - .expect_err("unknown column"); - } - - #[test_log::test] - fn test_scan_with_checkpoint() -> DeltaResult<()> { - let path = std::fs::canonicalize(PathBuf::from( - "./tests/data/with_checkpoint_no_last_checkpoint/", - ))?; - - let url = url::Url::from_directory_path(path).unwrap(); - let engine = SyncEngine::new(); - - let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); - let scan = snapshot.scan_builder().build()?; - let files = get_files_for_scan(scan, &engine)?; - // test case: - // - // commit0: P and M, no add/remove - // commit1: add file-ad1 - // commit2: remove file-ad1, add file-a19 - // checkpoint2: remove file-ad1, add file-a19 - // commit3: remove file-a19, add file-70b - // - // thus replay should produce only file-70b - assert_eq!( - files, - vec!["part-00000-70b1dcdf-0236-4f63-a072-124cdbafd8a0-c000.snappy.parquet"] - ); - Ok(()) - } -} diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index 92c10bc4ca..5260f834c0 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -10,7 +10,7 @@ use crate::utils::require; use crate::ExpressionRef; use crate::{ actions::{deletion_vector::DeletionVectorDescriptor, visitors::visit_deletion_vector_at}, - engine_data::{GetData, RowVisitor, TypedGetData as _}, + engine_data::{FilteredRowVisitor, GetData, RowIndexIterator, TypedGetData}, schema::{ColumnName, ColumnNamesAndTypes, DataType, SchemaRef}, DeltaResult, Engine, EngineData, 
Error, }; @@ -107,21 +107,33 @@ pub fn transform_to_logical( physical_schema.clone(), transform, logical_schema.clone().into(), // TODO: expensive deep clone! - ) + )? .evaluate(physical_data.as_ref()), None => Ok(physical_data), } } -pub type ScanCallback = fn( - context: &mut T, - path: &str, - size: i64, - stats: Option, - dv_info: DvInfo, - transform: Option, - partition_values: HashMap, -); +/// A `ScanFile` represents information about one file that needs to be scanned to read a table. +#[derive(Debug, Clone, PartialEq)] +pub struct ScanFile { + /// Path to the file + pub path: String, + /// Size of the file + pub size: i64, + /// The time the file was created, as milliseconds since the epoch + pub modification_time: i64, + /// Statistics about the file + pub stats: Option, + /// A [`DvInfo`] struct, which allows getting the selection vector for this file + pub dv_info: DvInfo, + /// An optional expression that, if present, _must_ be applied to physical data to convert it to + /// the correct logical format + pub transform: Option, + /// a `HashMap` which map partition names to the value they have in this file + pub partition_values: HashMap, +} + +pub type ScanCallback = fn(context: &mut T, scan_file: ScanFile); /// Request that the kernel call a callback on each valid file that needs to be read for the /// scan. @@ -129,12 +141,7 @@ pub type ScanCallback = fn( /// The arguments to the callback are: /// * `context`: an `&mut context` argument. this can be anything that engine needs to pass through /// to each call -/// * `path`: a `&str` which is the path to the file -/// * `size`: an `i64` which is the size of the file -/// * `dv_info`: a [`DvInfo`] struct, which allows getting the selection vector for this file -/// * `transform`: An optional expression that, if present, _must_ be applied to physical data to -/// convert it to the correct logical format -/// * `partition_values`: a `HashMap` which are partition values +/// * `scan_file`: a [`ScanFile`] struct with all the information about the file /// /// ## Context /// A note on the `context`. This can be any value the engine wants. 
This function takes ownership @@ -156,43 +163,42 @@ impl ScanMetadata { pub fn visit_scan_files(&self, context: T, callback: ScanCallback) -> DeltaResult { let mut visitor = ScanFileVisitor { callback, - selection_vector: self.scan_files.selection_vector(), transforms: &self.scan_file_transforms, context, }; - visitor.visit_rows_of(self.scan_files.data())?; + visitor.visit_rows_of(&self.scan_files)?; Ok(visitor.context) } } // add some visitor magic for engines struct ScanFileVisitor<'a, T> { callback: ScanCallback, - selection_vector: &'a [bool], transforms: &'a [Option], context: T, } -impl RowVisitor for ScanFileVisitor<'_, T> { +impl FilteredRowVisitor for ScanFileVisitor<'_, T> { fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| SCAN_ROW_SCHEMA.leaves(None)); NAMES_AND_TYPES.as_ref() } - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + fn visit_filtered<'a>( + &mut self, + getters: &[&'a dyn GetData<'a>], + rows: RowIndexIterator<'_>, + ) -> DeltaResult<()> { require!( - getters.len() == 11, + getters.len() == 14, Error::InternalError(format!( "Wrong number of ScanFileVisitor getters: {}", getters.len() )) ); - for row_index in 0..row_count { - if !self.selection_vector[row_index] { - // skip skipped rows - continue; - } + for row_index in rows { // Since path column is required, use it to detect presence of an Add action if let Some(path) = getters[0].get_opt(row_index, "scanFile.path")? { let size = getters[1].get(row_index, "scanFile.size")?; + let modification_time: i64 = getters[2].get(row_index, "add.modificationTime")?; let stats: Option = getters[3].get_opt(row_index, "scanFile.stats")?; let stats: Option = stats.and_then(|json| match serde_json::from_str(json.as_str()) { @@ -210,15 +216,16 @@ impl RowVisitor for ScanFileVisitor<'_, T> { let dv_info = DvInfo { deletion_vector }; let partition_values = getters[9].get(row_index, "scanFile.fileConstantValues.partitionValues")?; - (self.callback)( - &mut self.context, + let scan_file = ScanFile { path, size, + modification_time, stats, dv_info, - get_transform_for_row(row_index, self.transforms), + transform: get_transform_for_row(row_index, self.transforms), partition_values, - ) + }; + (self.callback)(&mut self.context, scan_file) } } Ok(()) @@ -227,41 +234,33 @@ impl RowVisitor for ScanFileVisitor<'_, T> { #[cfg(test)] mod tests { - use std::collections::HashMap; - use crate::actions::get_commit_schema; + use crate::scan::state::ScanFile; use crate::scan::test_utils::{add_batch_simple, run_with_validate_callback}; - use crate::ExpressionRef; - - use super::{DvInfo, Stats}; #[derive(Clone)] struct TestContext { id: usize, } - fn validate_visit( - context: &mut TestContext, - path: &str, - size: i64, - stats: Option, - dv_info: DvInfo, - transform: Option, - part_vals: HashMap, - ) { + fn validate_visit(context: &mut TestContext, scan_file: ScanFile) { assert_eq!( - path, + scan_file.path, "part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet" ); - assert_eq!(size, 635); - assert!(stats.is_some()); - assert_eq!(stats.as_ref().unwrap().num_records, 10); - assert_eq!(part_vals.get("date"), Some(&"2017-12-10".to_string())); - assert_eq!(part_vals.get("non-existent"), None); - assert!(dv_info.deletion_vector.is_some()); - let dv = dv_info.deletion_vector.unwrap(); + assert_eq!(scan_file.size, 635); + assert_eq!(scan_file.modification_time, 1677811178336); + 
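// A minimal usage sketch of the new single-struct callback shape (a hedged illustration,
// not part of this patch): assuming some `scan_metadata: ScanMetadata` is in scope and the
// engine only wants file paths, a `Vec<String>` context is enough:
//
//     fn collect_paths(paths: &mut Vec<String>, scan_file: ScanFile) {
//         paths.push(scan_file.path);
//     }
//     let paths = scan_metadata.visit_scan_files(Vec::new(), collect_paths)?;
//
// Everything else (size, stats, dv_info, transform, partition_values) travels on the same
// `ScanFile` value, as the assertions around this sketch check.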
assert!(scan_file.stats.is_some()); + assert_eq!(scan_file.stats.as_ref().unwrap().num_records, 10); + assert_eq!( + scan_file.partition_values.get("date"), + Some(&"2017-12-10".to_string()) + ); + assert_eq!(scan_file.partition_values.get("non-existent"), None); + assert!(scan_file.dv_info.deletion_vector.is_some()); + let dv = scan_file.dv_info.deletion_vector.unwrap(); assert_eq!(dv.unique_id(), "uvBn[lx{q8@P<9BNH/isA@1"); - assert!(transform.is_none()); + assert!(scan_file.transform.is_none()); assert_eq!(context.id, 2); } diff --git a/kernel/src/scan/state_info.rs b/kernel/src/scan/state_info.rs index 8e5a4731e3..1a0fecbe96 100644 --- a/kernel/src/scan/state_info.rs +++ b/kernel/src/scan/state_info.rs @@ -4,18 +4,22 @@ use std::collections::HashSet; use std::sync::Arc; -use tracing::debug; +use tracing::{debug, warn}; +use crate::expressions::ColumnName; +use crate::scan::data_skipping::stats_schema::build_stats_schema; use crate::scan::field_classifiers::TransformFieldClassifier; +use crate::scan::transform_spec::{FieldTransformSpec, TransformSpec}; use crate::scan::PhysicalPredicate; +use crate::scan::StatsOutputMode; use crate::schema::{DataType, MetadataColumnSpec, SchemaRef, StructType}; use crate::table_configuration::TableConfiguration; +use crate::table_features::get_any_level_column_physical_name; use crate::table_features::ColumnMappingMode; -use crate::transforms::{FieldTransformSpec, TransformSpec}; use crate::{DeltaResult, Error, PredicateRef, StructField}; /// All the state needed to process a scan. -#[derive(Debug)] +#[derive(Debug, Clone)] pub(crate) struct StateInfo { /// The logical schema for this scan pub(crate) logical_schema: SchemaRef, @@ -25,6 +29,15 @@ pub(crate) struct StateInfo { pub(crate) physical_predicate: PhysicalPredicate, /// Transform specification for converting physical to logical data pub(crate) transform_spec: Option>, + /// The column mapping mode for this scan + pub(crate) column_mapping_mode: ColumnMappingMode, + /// Physical stats schema for reading/parsing stats from checkpoint files. + /// Used to construct checkpoint read schema with stats_parsed. + pub(crate) physical_stats_schema: Option, + /// Physical partition schema with native types for partition pruning via + /// `partitionValues_parsed`. Fields use physical column names (for column mapping). + /// Only present when the table has partition columns and a predicate is provided. + pub(crate) physical_partition_schema: Option, } /// Validating the metadata columns also extracts information needed to properly construct the full @@ -54,7 +67,7 @@ fn validate_metadata_columns<'a>( table_configuration: &'a TableConfiguration, ) -> DeltaResult> { let mut metadata_info = MetadataInfo::default(); - let partition_columns = table_configuration.metadata().partition_columns(); + let partition_columns = table_configuration.partition_columns(); for metadata_column in logical_schema.metadata_columns() { // Ensure we don't have a metadata column with same name as a partition column if partition_columns.contains(metadata_column.name()) { @@ -79,6 +92,9 @@ fn validate_metadata_columns<'a>( metadata_info.materialized_row_id_column_name = Some(row_id_col); } Some(MetadataColumnSpec::RowCommitVersion) => {} + Some(MetadataColumnSpec::FilePath) => { + // FilePath metadata column is handled by the parquet reader + } None => {} } metadata_info @@ -88,6 +104,105 @@ fn validate_metadata_columns<'a>( Ok(metadata_info) } +/// Build data-skipping schemas based on `StatsOutputMode` and `PhysicalPredicate`. 
+/// +/// Returns `(physical_stats_schema, physical_partition_schema)`, where: +/// - `physical_stats_schema` contains data-column stats for `stats_parsed`. +/// - `physical_partition_schema` contains typed partition values for `partitionValues_parsed`. +/// +/// In predicate-only mode, predicate-referenced columns are split into data columns +/// (stats-based pruning) and partition columns (partition-value pruning). +fn build_data_skipping_schemas( + stats_output_mode: &StatsOutputMode, + physical_predicate: &PhysicalPredicate, + predicate_column_names_logical: &[ColumnName], + table_configuration: &TableConfiguration, + table_partition_schema: Option, +) -> DeltaResult<(Option, Option)> { + // Filter partition schema to only predicate-referenced columns. The DataSkippingFilter + // only needs partition columns that appear in the predicate, and the transform output + // should not include unused partition columns. + let predicate_partition_schema = match (&table_partition_schema, physical_predicate) { + (Some(tps), PhysicalPredicate::Some(_, ref_schema)) => { + // Partition values extracted from the string map via MapToStruct are always + // nullable (map lookup can return null), so we force all partition fields nullable. + ref_schema + .with_fields_filtered_nonempty(|f| tps.field(f.name()).is_some())? + .map(|partition_schema| { + let nullable_fields = partition_schema + .fields() + .map(|f| StructField::nullable(f.name(), f.data_type().clone())); + Arc::new(StructType::new_unchecked(nullable_fields)) + }) + } + _ => None, + }; + + match (stats_output_mode, physical_predicate) { + // Output all table stats columns in stats_parsed. The DataSkippingFilter + // reads stats_parsed from the transformed batch, which uses this schema. + (StatsOutputMode::AllColumns, _) => { + let expected_stats_schemas = + table_configuration.build_expected_stats_schemas(None, None)?; + Ok(( + Some(expected_stats_schemas.physical), + predicate_partition_schema, + )) + } + // Non-empty requested columns -- include predicate-referenced columns + // alongside the user-requested stats columns so that the DataSkippingFilter + // has the stats it needs. Both sources are logical names that must be + // converted to physical before passing to build_expected_stats_schemas. + (StatsOutputMode::Columns(requested_columns), _) if !requested_columns.is_empty() => { + let existing: HashSet<&ColumnName> = requested_columns.iter().collect(); + let mut all_needed_logical = requested_columns.clone(); + for col in predicate_column_names_logical { + if !existing.contains(col) { + all_needed_logical.push(col.clone()); + } + } + let logical_schema = table_configuration.logical_schema(); + let column_mapping_mode = table_configuration.column_mapping_mode(); + let all_needed_physical: Vec = all_needed_logical + .iter() + .filter_map(|col| { + // Columns not found in the logical schema (e.g. predicate references a + // column that doesn't exist in the table) are safe to skip. 
+ get_any_level_column_physical_name(&logical_schema, col, column_mapping_mode) + .inspect_err(|e| { + warn!("Failed to resolve physical name for column {col}: {e}") + }) + .ok() + }) + .collect(); + let expected_stats_schemas = table_configuration + .build_expected_stats_schemas(None, Some(&all_needed_physical))?; + Ok(( + Some(expected_stats_schemas.physical), + predicate_partition_schema, + )) + } + // Columns(empty) or Skip with a physical predicate -- build stats directly + // from the physical predicate's referenced schema for internal data skipping + // only (no logical schema needed for output). + // Split referenced columns into data columns and partition columns. + // Data columns get min/max/nullCount stats; partition columns get exact values. + (_, PhysicalPredicate::Some(_, schema)) => { + let data_stats = schema + .with_fields_filtered_nonempty(|f| { + predicate_partition_schema + .as_ref() + .is_none_or(|partition_schema| partition_schema.field(f.name()).is_none()) + })? + .as_ref() + .and_then(build_stats_schema); + Ok((data_stats, predicate_partition_schema)) + } + // No stats output and no predicate + (_, _) => Ok((None, None)), + } +} + impl StateInfo { /// Create StateInfo with a custom field classifier for different scan types. /// Get the state needed to process a scan. @@ -95,17 +210,19 @@ impl StateInfo { /// `logical_schema` - The logical schema of the scan output, which includes partition columns /// `table_configuration` - The TableConfiguration for this table /// `predicate` - Optional predicate to filter data during the scan + /// `stats_output_mode` - Controls how file statistics are handled during the scan /// `classifier` - The classifier to use for different scan types. Use `()` if not needed pub(crate) fn try_new( logical_schema: SchemaRef, table_configuration: &TableConfiguration, predicate: Option, + stats_output_mode: StatsOutputMode, classifier: C, ) -> DeltaResult { - let partition_columns = table_configuration.metadata().partition_columns(); + let partition_columns = table_configuration.partition_columns(); let column_mapping_mode = table_configuration.column_mapping_mode(); let mut read_fields = Vec::with_capacity(logical_schema.num_fields()); - let mut transform_spec = Vec::new(); + let mut transform_spec = Vec::with_capacity(logical_schema.num_fields()); let mut last_physical_field: Option = None; let metadata_info = validate_metadata_columns(&logical_schema, table_configuration)?; @@ -136,7 +253,7 @@ impl StateInfo { // ensure we have a column name that isn't already in our schema let index_column_name = (0..) 
- .map(|i| format!("row_indexes_for_row_id_{}", i)) + .map(|i| format!("row_indexes_for_row_id_{i}")) .find(|name| logical_schema.field(name).is_none()) .ok_or(Error::generic( "Couldn't generate row index column name", @@ -167,10 +284,12 @@ impl StateInfo { Some(MetadataColumnSpec::RowCommitVersion) => { return Err(Error::unsupported("Row commit versions not supported")); } - Some(MetadataColumnSpec::RowIndex) | None => { - // note that RowIndex is handled in the parquet reader so we just add it as - // if it's a normal physical column - let physical_field = logical_field.make_physical(column_mapping_mode); + Some(MetadataColumnSpec::RowIndex) + | Some(MetadataColumnSpec::FilePath) + | None => { + // note that RowIndex and FilePath are handled in the parquet reader so we just add them as + // if they're normal physical columns + let physical_field = logical_field.make_physical(column_mapping_mode)?; debug!("\n\n{logical_field:#?}\nAfter mapping: {physical_field:#?}\n\n"); let physical_name = physical_field.name.clone(); @@ -191,11 +310,54 @@ impl StateInfo { let physical_schema = Arc::new(StructType::try_new(read_fields)?); + // Extract column names referenced by the predicate so we can include them + // in the stats schema when stats_columns is requested. This ensures the + // DataSkippingFilter has the stats it needs for data skipping. + let predicate_column_names: Vec = predicate + .as_ref() + .map(|p| p.references().into_iter().cloned().collect()) + .unwrap_or_default(); + let physical_predicate = match predicate { - Some(pred) => PhysicalPredicate::try_new(&pred, &logical_schema)?, + Some(pred) => PhysicalPredicate::try_new(&pred, &logical_schema, column_mapping_mode)?, None => PhysicalPredicate::None, }; + // Build partition schema with physical names for partition pruning in data skipping. + // Only needed when we have a predicate and partition columns. + // partition_columns stores logical names (per Delta protocol), so we zip the table's + // logical and physical schemas (same field ordering, guaranteed by `make_physical`) + // to match logical names and extract the corresponding physical fields without + // per-field metadata lookups. + let table_partition_schema = if !matches!( + physical_predicate, + PhysicalPredicate::None | PhysicalPredicate::StaticSkipAll + ) && !partition_columns.is_empty() + { + let partition_fields: Vec = table_configuration + .logical_schema() + .fields() + .zip(table_configuration.physical_schema().fields()) + .filter(|(logical_f, _)| partition_columns.contains(logical_f.name())) + .map(|(_, physical_f)| physical_f.clone()) + .collect(); + if partition_fields.is_empty() { + None + } else { + Some(Arc::new(StructType::new_unchecked(partition_fields))) + } + } else { + None + }; + + let (physical_stats_schema, physical_partition_schema) = build_data_skipping_schemas( + &stats_output_mode, + &physical_predicate, + &predicate_column_names, + table_configuration, + table_partition_schema, + )?; + let transform_spec = if !transform_spec.is_empty() || column_mapping_mode != ColumnMappingMode::None { Some(Arc::new(transform_spec)) @@ -208,8 +370,28 @@ impl StateInfo { physical_schema, physical_predicate, transform_spec, + column_mapping_mode, + physical_stats_schema, + physical_partition_schema, }) } + + /// Returns a conservative initial capacity for the dedup `HashSet` in + /// [`ScanLogReplayProcessor`]. 
+ /// + /// The exact file count is not available at this point, so the hint is + /// derived from whether stats are enabled: stats are only computed for + /// non-trivial tables, so their presence is a reasonable proxy for table + /// size. Using 4096 vs 512 as the two tiers eliminates the first 12-14 + /// hashbrown doubling events for medium/large tables while staying cheap + /// for small ones. + pub(crate) fn dedup_capacity_hint(&self) -> usize { + if self.physical_stats_schema.is_some() { + 4096 + } else { + 512 + } + } } #[cfg(test)] @@ -219,8 +401,9 @@ pub(crate) mod tests { use url::Url; use crate::actions::{Metadata, Protocol}; - use crate::expressions::{column_expr, Expression as Expr}; + use crate::expressions::{column_expr, column_name, Expression as Expr}; use crate::schema::{ColumnMetadataKey, MetadataValue}; + use crate::table_features::{FeatureType, TableFeature}; use crate::utils::test_utils::assert_result_error_with_message; use super::*; @@ -230,26 +413,63 @@ pub(crate) mod tests { schema: SchemaRef, partition_columns: Vec, ) -> DeltaResult { - get_state_info(schema, partition_columns, None, HashMap::new(), vec![]) + get_state_info(schema, partition_columns, None, &[], HashMap::new(), vec![]) } + /// When features are non-empty, uses protocol (3,7) with explicit feature lists. + /// When features are empty, uses legacy protocol (2,5). pub(crate) fn get_state_info( schema: SchemaRef, partition_columns: Vec, predicate: Option, + features: &[TableFeature], + metadata_configuration: HashMap, + metadata_cols: Vec<(&str, MetadataColumnSpec)>, + ) -> DeltaResult { + get_state_info_with_stats( + schema, + partition_columns, + predicate, + features, + metadata_configuration, + metadata_cols, + StatsOutputMode::default(), + ) + } + + pub(crate) fn get_state_info_with_stats( + schema: SchemaRef, + partition_columns: Vec, + predicate: Option, + features: &[TableFeature], metadata_configuration: HashMap, metadata_cols: Vec<(&str, MetadataColumnSpec)>, + stats_output_mode: StatsOutputMode, ) -> DeltaResult { let metadata = Metadata::try_new( None, None, - schema.as_ref().clone(), + schema.clone(), partition_columns, 10, metadata_configuration, )?; - let no_features: Option> = None; // needed for type annotation - let protocol = Protocol::try_new(2, 2, no_features.clone(), no_features)?; + let protocol = if features.is_empty() { + Protocol::try_new_legacy(2, 5)? + } else { + // This helper only handles known features. Unknown features would need + // explicit placement on reader vs writer lists. + assert!( + features + .iter() + .all(|f| f.feature_type() != FeatureType::Unknown), + "Test helper does not support unknown features" + ); + let reader_features = features + .iter() + .filter(|f| f.feature_type() == FeatureType::ReaderWriter); + Protocol::try_new_modern(reader_features, features)? 
+ }; let table_configuration = TableConfiguration::try_new( metadata, protocol, @@ -266,7 +486,13 @@ pub(crate) mod tests { ); } - StateInfo::try_new(schema.clone(), &table_configuration, predicate, ()) + StateInfo::try_new( + schema.clone(), + &table_configuration, + predicate, + stats_output_mode, + (), + ) } pub(crate) fn assert_transform_spec( @@ -425,6 +651,7 @@ pub(crate) mod tests { schema.clone(), vec![], // no partition columns Some(predicate), + &[], // no table features HashMap::new(), // no extra metadata vec![], // no metadata ) @@ -467,6 +694,9 @@ pub(crate) mod tests { } } + pub(crate) const ROW_TRACKING_FEATURES: &[TableFeature] = + &[TableFeature::RowTracking, TableFeature::DomainMetadata]; + fn get_string_map(slice: &[(&str, &str)]) -> HashMap { slice .iter() @@ -485,12 +715,17 @@ pub(crate) mod tests { schema.clone(), vec![], None, + ROW_TRACKING_FEATURES, get_string_map(&[ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", "some_row_id_col", ), + ( + "delta.rowTracking.materializedRowCommitVersionColumnName", + "some_row_commit_version_col", + ), ]), vec![("row_id", MetadataColumnSpec::RowId)], ) @@ -517,12 +752,17 @@ pub(crate) mod tests { schema.clone(), vec![], None, + ROW_TRACKING_FEATURES, get_string_map(&[ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", "some_row_id_col", ), + ( + "delta.rowTracking.materializedRowCommitVersionColumnName", + "some_row_commit_version_col", + ), ]), vec![("row_id", MetadataColumnSpec::RowId)], ) @@ -549,12 +789,17 @@ pub(crate) mod tests { schema.clone(), vec![], None, + ROW_TRACKING_FEATURES, get_string_map(&[ ("delta.enableRowTracking", "true"), ( "delta.rowTracking.materializedRowIdColumnName", "some_row_id_col", ), + ( + "delta.rowTracking.materializedRowCommitVersionColumnName", + "some_row_commit_version_col", + ), ]), vec![ ("row_id", MetadataColumnSpec::RowId), @@ -580,17 +825,30 @@ pub(crate) mod tests { DataType::STRING, )])); - for (metadata_config, metadata_cols, expected_error) in [ - (HashMap::new(), vec![("row_id", MetadataColumnSpec::RowId)], "Unsupported: Row ids are not enabled on this table"), - ( - get_string_map(&[("delta.enableRowTracking", "true")]), - vec![("row_id", MetadataColumnSpec::RowId)], - "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", - ), - ] { - let res = get_state_info(schema.clone(), vec![], None, metadata_config, metadata_cols); - assert_result_error_with_message(res, expected_error); - } + // Row IDs requested but row tracking not enabled → error + let res = get_state_info( + schema.clone(), + vec![], + None, + &[], // no table features + HashMap::new(), + vec![("row_id", MetadataColumnSpec::RowId)], + ); + assert_result_error_with_message(res, "Unsupported: Row ids are not enabled on this table"); + + // Row tracking enabled but missing materializedRowIdColumnName → error + let res = get_state_info( + schema, + vec![], + None, + ROW_TRACKING_FEATURES, + get_string_map(&[("delta.enableRowTracking", "true")]), + vec![("row_id", MetadataColumnSpec::RowId)], + ); + assert_result_error_with_message( + res, + "Generic delta kernel error: No delta.rowTracking.materializedRowIdColumnName key found in metadata configuration", + ); } #[test] @@ -603,6 +861,7 @@ pub(crate) mod tests { schema.clone(), vec!["part_col".to_string()], None, + &[], // no table features HashMap::new(), vec![("part_col", MetadataColumnSpec::RowId)], ); @@ -634,6 +893,7 @@ pub(crate) 
mod tests { schema.clone(), vec![], None, + &[], // no table features get_string_map(&[("delta.columnMapping.mode", "name")]), vec![("other", MetadataColumnSpec::RowIndex)], ); @@ -642,4 +902,278 @@ pub(crate) mod tests { "Schema error: Metadata column names must not match physical columns, but logical column 'id' has physical name 'other'" ); } + + #[test] + fn stats_columns_with_predicate() { + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let predicate = Arc::new(column_expr!("value").gt(Expr::literal(10i64))); + + let state_info = get_state_info_with_stats( + schema, + vec![], + Some(predicate), + &[], // no table features + HashMap::new(), + vec![], + StatsOutputMode::AllColumns, + ) + .unwrap(); + + // physical_stats_schema should be set (from expected_stats_schema) + assert!( + state_info.physical_stats_schema.is_some(), + "physical_stats_schema should be Some when AllColumns is set" + ); + // physical_predicate should still be active for data skipping + assert!( + matches!(state_info.physical_predicate, PhysicalPredicate::Some(..)), + "physical_predicate should be PhysicalPredicate::Some for data skipping" + ); + } + + #[test] + fn stats_columns_with_predicate_merges_columns() { + // When specific stats_columns are requested alongside a predicate, the stats + // schema should include both the requested columns and predicate-referenced columns. + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + StructField::nullable("extra", DataType::LONG), + ])); + + let predicate = Arc::new(column_expr!("extra").gt(Expr::literal(5i64))); + + let state_info = get_state_info_with_stats( + schema, + vec![], + Some(predicate), + &[], + HashMap::new(), + vec![], + StatsOutputMode::Columns(vec![column_name!("value")]), + ) + .unwrap(); + + let stats_schema = state_info + .physical_stats_schema + .expect("should have physical stats schema"); + + let min_values = stats_schema + .field("minValues") + .expect("should have minValues"); + if let DataType::Struct(inner) = min_values.data_type() { + assert!( + inner.field("value").is_some(), + "minValues should have 'value' (requested)" + ); + assert!( + inner.field("extra").is_some(), + "minValues should have 'extra' (from predicate)" + ); + assert!( + inner.field("id").is_none(), + "minValues should not have 'id' (neither requested nor in predicate)" + ); + } else { + panic!("minValues should be a struct"); + } + } + + #[test] + fn non_empty_stats_columns_filters_schema() { + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING), + StructField::nullable("value", DataType::LONG), + ])); + + let state_info = get_state_info_with_stats( + schema, + vec![], + None, + &[], // no table features + HashMap::new(), + vec![], + StatsOutputMode::Columns(vec![column_name!("value")]), + ) + .unwrap(); + + let stats_schema = state_info + .physical_stats_schema + .expect("should have physical stats schema"); + + // Check that minValues/maxValues only contain 'value', not 'id' + let min_values = stats_schema + .field("minValues") + .expect("should have minValues"); + if let DataType::Struct(inner) = min_values.data_type() { + assert!( + inner.field("value").is_some(), + "minValues should have 'value'" + ); + assert!( + inner.field("id").is_none(), + "minValues should not have 'id'" + ); + } else { + 
panic!("minValues should be a struct"); + } + } + + #[test] + fn partition_schema_uses_physical_names_with_column_mapping() { + // Verify that physical_partition_schema uses physical column names when column + // mapping is enabled. The logical partition column "date" has physical name + // "col-date-phys", and the schema should reflect the physical name. + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::nullable("id", DataType::STRING).with_metadata(HashMap::from([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref().to_string(), + MetadataValue::Number(1), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName + .as_ref() + .to_string(), + MetadataValue::String("col-id-phys".to_string()), + ), + ])), + StructField::nullable("date", DataType::DATE).with_metadata(HashMap::from([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref().to_string(), + MetadataValue::Number(2), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName + .as_ref() + .to_string(), + MetadataValue::String("col-date-phys".to_string()), + ), + ])), + StructField::nullable("value", DataType::LONG).with_metadata(HashMap::from([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref().to_string(), + MetadataValue::Number(3), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName + .as_ref() + .to_string(), + MetadataValue::String("col-value-phys".to_string()), + ), + ])), + ])); + + let predicate = Arc::new(column_expr!("date").lt(Expr::literal(100i32))); + + let state_info = get_state_info( + schema, + vec!["date".to_string()], + Some(predicate), + &[TableFeature::ColumnMapping], + get_string_map(&[("delta.columnMapping.mode", "name")]), + vec![], + ) + .unwrap(); + + // physical_partition_schema should exist and use the physical column name + let partition_schema = state_info + .physical_partition_schema + .as_ref() + .expect("should have physical_partition_schema with predicate + partition columns"); + assert_eq!(partition_schema.num_fields(), 1); + let field = partition_schema.fields().next().unwrap(); + assert_eq!( + field.name(), + "col-date-phys", + "partition schema should use physical column name, not logical" + ); + assert_eq!(field.data_type(), &DataType::DATE); + } + + #[test] + fn stats_columns_with_column_mapping_uses_physical_names() { + let field_a: StructField = serde_json::from_value(serde_json::json!({ + "name": "col_a", + "type": "long", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 1, + "delta.columnMapping.physicalName": "phys_a" + } + })) + .unwrap(); + + let field_b: StructField = serde_json::from_value(serde_json::json!({ + "name": "col_b", + "type": "long", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 2, + "delta.columnMapping.physicalName": "phys_b" + } + })) + .unwrap(); + + let field_c: StructField = serde_json::from_value(serde_json::json!({ + "name": "col_c", + "type": "long", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 3, + "delta.columnMapping.physicalName": "phys_c" + } + })) + .unwrap(); + + let schema = Arc::new(StructType::new_unchecked(vec![field_a, field_b, field_c])); + let mut props = HashMap::new(); + props.insert("delta.columnMapping.mode".to_string(), "name".to_string()); + + // Request col_a via stats_columns (logical), and reference col_b via predicate (logical). + // Both must be translated to physical names in the output stats schema. 
+ let predicate = Arc::new(column_expr!("col_b").gt(Expr::literal(5i64))); + + let state_info = get_state_info_with_stats( + schema, + vec![], + Some(predicate), + &[], + props, + vec![], + StatsOutputMode::Columns(vec![column_name!("col_a")]), + ) + .unwrap(); + + let stats_schema = state_info + .physical_stats_schema + .expect("should have physical stats schema"); + + let present = ["phys_a", "phys_b"]; + let absent = ["col_a", "col_b", "phys_c"]; + for stats_field in ["minValues", "maxValues"] { + let DataType::Struct(inner) = stats_schema + .field(stats_field) + .unwrap_or_else(|| panic!("should have {stats_field}")) + .data_type() + else { + panic!("{stats_field} should be a struct"); + }; + for name in present { + assert!( + inner.field(name).is_some(), + "{stats_field} expected '{name}'" + ); + } + for name in absent { + assert!( + inner.field(name).is_none(), + "{stats_field} unexpected '{name}'" + ); + } + } + } } diff --git a/kernel/src/scan/test_utils.rs b/kernel/src/scan/test_utils.rs new file mode 100644 index 0000000000..c23af88875 --- /dev/null +++ b/kernel/src/scan/test_utils.rs @@ -0,0 +1,197 @@ +use crate::arrow::array::StringArray; +use crate::scan::state_info::StateInfo; +use crate::schema::StructType; +use crate::utils::test_utils::string_array_to_engine_data; +use itertools::Itertools; +use std::sync::Arc; + +use crate::log_replay::ActionsBatch; +use crate::log_segment::CheckpointReadInfo; +use crate::{ + actions::get_commit_schema, + engine::{ + arrow_data::ArrowEngineData, + sync::{json::SyncJsonHandler, SyncEngine}, + }, + scan::log_replay::scan_action_iter, + schema::SchemaRef, + JsonHandler, +}; + +use super::state::ScanCallback; +use super::PhysicalPredicate; +use crate::scan::transform_spec::TransformSpec; +use crate::table_features::ColumnMappingMode; + +// Generates a batch of sidecar actions with the given paths. +// The schema is provided as null columns affect equality checks. +pub(crate) fn sidecar_batch_with_given_paths( + paths: Vec<&str>, + output_schema: SchemaRef, +) -> Box { + // Use default size of 9268 for backward compatibility + let paths_with_sizes: Vec<_> = paths.into_iter().map(|p| (p, 9268u64)).collect(); + sidecar_batch_with_given_paths_and_sizes(paths_with_sizes, output_schema) +} + +// Generates a batch of sidecar actions with the given paths and sizes. +// The schema is provided as null columns affect equality checks. +pub(crate) fn sidecar_batch_with_given_paths_and_sizes( + paths_and_sizes: Vec<(&str, u64)>, + output_schema: SchemaRef, +) -> Box { + let handler = SyncJsonHandler {}; + + let mut json_strings: Vec = paths_and_sizes + .iter() + .map(|(path, size)| { + format!( + r#"{{"sidecar":{{"path":"{path}","sizeInBytes":{size},"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"# + ) + }) + .collect(); + json_strings.push(r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#.to_string()); + + let json_strings_array: StringArray = + json_strings.iter().map(|s| s.as_str()).collect_vec().into(); + + let parsed = handler + .parse_json( + string_array_to_engine_data(json_strings_array), + output_schema, + ) + .unwrap(); + + ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +// Generates a batch with an add action. 
+// The schema is provided as null columns affect equality checks. +pub(crate) fn add_batch_simple(output_schema: SchemaRef) -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, + ] + .into(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +// Generates a batch with an add action. +// The schema is provided as null columns affect equality checks. +pub(crate) fn add_batch_for_row_id(output_schema: SchemaRef) -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","baseRowId": 42, "tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none", "delta.enableRowTracking": "true", "delta.rowTracking.materializedRowIdColumnName":"row_id_col"},"createdTime":1677811175819}}"#, + ] + .into(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +// An add batch with a removed file parsed with the schema provided +pub(crate) fn add_batch_with_remove(output_schema: SchemaRef) -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + 
r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, + ] + .into(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +// A batch with a Remove action and a partition column (`date`). The Remove has +// `partitionValues: {"date": "2017-12-10"}` but the transform reads from `add.*` columns, +// so the Remove's partition values are not visible to the data skipping filter. 
+pub(crate) fn add_batch_with_remove_and_partition( + output_schema: SchemaRef, +) -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"date":"2017-12-10"},"size":635}}"#, + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","partitionValues":{"date":"2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0}}"}}"#, + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{"date":"2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0}}"}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["date"],"configuration":{"delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, + ] + .into(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +// add batch with a `date` partition col +pub(crate) fn add_batch_with_partition_col() -> Box { + let handler = SyncJsonHandler {}; + let json_strings: StringArray = vec![ + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["date"],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","partitionValues": {"date": "2017-12-11"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#, + ] + .into(); + let output_schema = get_commit_schema().clone(); + let parsed = handler + .parse_json(string_array_to_engine_data(json_strings), output_schema) + .unwrap(); + 
ArrowEngineData::try_from_engine_data(parsed).unwrap() +} + +/// Create a scan action iter and validate what's called back. If you pass `None` as +/// `logical_schema`, `transform` should also be `None` +#[allow(clippy::vec_box)] +pub(crate) fn run_with_validate_callback( + batch: Vec>, + logical_schema: Option, + transform_spec: Option>, + expected_sel_vec: &[bool], + context: T, + validate_callback: ScanCallback, +) { + let logical_schema = + logical_schema.unwrap_or_else(|| Arc::new(StructType::new_unchecked(vec![]))); + let state_info = Arc::new(StateInfo { + logical_schema: logical_schema.clone(), + physical_schema: logical_schema, + physical_predicate: PhysicalPredicate::None, + transform_spec, + column_mapping_mode: ColumnMappingMode::None, + physical_stats_schema: None, + physical_partition_schema: None, + }); + let checkpoint_info = CheckpointReadInfo::without_stats_parsed(); + let (iter, _metrics) = scan_action_iter( + &SyncEngine::new(), + batch + .into_iter() + .map(|batch| Ok(ActionsBatch::new(batch as _, true))), + state_info, + checkpoint_info, + false, + ) + .unwrap(); + let mut batch_count = 0; + for res in iter { + let scan_metadata = res.unwrap(); + assert_eq!( + scan_metadata.scan_files.selection_vector(), + expected_sel_vec + ); + scan_metadata + .visit_scan_files(context.clone(), validate_callback) + .unwrap(); + batch_count += 1; + } + assert_eq!(batch_count, 1); +} diff --git a/kernel/src/scan/tests.rs b/kernel/src/scan/tests.rs new file mode 100644 index 0000000000..a135a45ac2 --- /dev/null +++ b/kernel/src/scan/tests.rs @@ -0,0 +1,1629 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use bytes::Bytes; + +use rstest::rstest; + +use crate::arrow::array::{Array, BooleanArray, Int64Array, StringArray, StructArray}; +use crate::arrow::compute::filter_record_batch; +use crate::arrow::datatypes::{DataType as ArrowDataType, Field, Fields, Schema as ArrowSchema}; +use crate::arrow::record_batch::RecordBatch; +use crate::engine::arrow_data::ArrowEngineData; +use crate::engine::parquet_row_group_skipping::ParquetRowGroupSkipping; +use crate::engine::sync::SyncEngine; +use crate::expressions::{ + column_expr, column_name, column_pred, ColumnName, Expression as Expr, Predicate as Pred, +}; +use crate::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use crate::parquet::arrow::arrow_writer::ArrowWriter; +use crate::scan::data_skipping::as_checkpoint_skipping_predicate; +use crate::scan::state::ScanFile; +use crate::schema::{ColumnMetadataKey, DataType, StructField, StructType}; +use crate::{ + Engine, EvaluationHandler, FileDataReadResultIterator, FileMeta, JsonHandler, ParquetFooter, + ParquetHandler, PredicateRef, StorageHandler, +}; +use crate::{EngineData, Snapshot}; + +use super::*; + +/// Helper macro to extract a typed column from a RecordBatch or StructArray. +macro_rules! 
get_column { + ($source:expr, $name:expr, $ty:ty) => { + $source + .column_by_name($name) + .unwrap_or_else(|| panic!("should have column '{}'", $name)) + .as_any() + .downcast_ref::<$ty>() + .unwrap_or_else(|| panic!("column '{}' should be {}", $name, stringify!($ty))) + }; +} + +fn field_names(s: &StructArray) -> Vec { + s.fields().iter().map(|f| f.name().clone()).collect() +} + +#[test] +fn test_static_skipping() { + const NULL: Pred = Pred::null_literal(); + let test_cases = [ + (false, column_pred!("a")), + (true, Pred::literal(false)), + (false, Pred::literal(true)), + (false, NULL), // NULL is unknown, not false -- conservative (no skip) + (true, Pred::and(column_pred!("a"), Pred::literal(false))), + (false, Pred::or(column_pred!("a"), Pred::literal(true))), + (false, Pred::or(column_pred!("a"), Pred::literal(false))), + (false, Pred::lt(column_expr!("a"), Expr::literal(10))), + (false, Pred::lt(Expr::literal(10), Expr::literal(100))), + (true, Pred::gt(Expr::literal(10), Expr::literal(100))), + (false, Pred::and(NULL, column_pred!("a"))), // NULL is unknown, not false + ]; + for (should_skip, predicate) in test_cases { + assert_eq!( + can_statically_skip_all_files(&predicate), + should_skip, + "Failed for predicate: {predicate:#?}" + ); + } +} + +#[test] +fn test_physical_predicate() { + let logical_schema = StructType::new_unchecked(vec![ + StructField::nullable("a", DataType::LONG), + StructField::nullable("b", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_b", + )]), + StructField::nullable("phys_b", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_c", + )]), + StructField::nullable( + "nested", + StructType::new_unchecked(vec![ + StructField::nullable("x", DataType::LONG), + StructField::nullable("y", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_y", + )]), + ]), + ), + StructField::nullable( + "mapped", + StructType::new_unchecked(vec![StructField::nullable("n", DataType::LONG) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_n", + )])]), + ) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_mapped", + )]), + ]); + + // NOTE: We break several column mapping rules here because they don't matter for this + // test. For example, we do not provide field ids, and not all columns have physical names. 
+ let test_cases = [ + (Pred::literal(true), Some(PhysicalPredicate::None)), + (Pred::literal(false), Some(PhysicalPredicate::StaticSkipAll)), + (column_pred!("x"), None), // no such column + ( + column_pred!("a"), + Some(PhysicalPredicate::Some( + column_pred!("a").into(), + StructType::new_unchecked(vec![StructField::nullable("a", DataType::LONG)]).into(), + )), + ), + ( + column_pred!("b"), + Some(PhysicalPredicate::Some( + column_pred!("phys_b").into(), + StructType::new_unchecked(vec![StructField::nullable("phys_b", DataType::LONG) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_b", + )])]) + .into(), + )), + ), + ( + column_pred!("nested.x"), + Some(PhysicalPredicate::Some( + column_pred!("nested.x").into(), + StructType::new_unchecked(vec![StructField::nullable( + "nested", + StructType::new_unchecked(vec![StructField::nullable("x", DataType::LONG)]), + )]) + .into(), + )), + ), + ( + column_pred!("nested.y"), + Some(PhysicalPredicate::Some( + column_pred!("nested.phys_y").into(), + StructType::new_unchecked(vec![StructField::nullable( + "nested", + StructType::new_unchecked(vec![StructField::nullable( + "phys_y", + DataType::LONG, + ) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_y", + )])]), + )]) + .into(), + )), + ), + ( + column_pred!("mapped.n"), + Some(PhysicalPredicate::Some( + column_pred!("phys_mapped.phys_n").into(), + StructType::new_unchecked(vec![StructField::nullable( + "phys_mapped", + StructType::new_unchecked(vec![StructField::nullable( + "phys_n", + DataType::LONG, + ) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_n", + )])]), + ) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_mapped", + )])]) + .into(), + )), + ), + ( + Pred::and(column_pred!("mapped.n"), Pred::literal(true)), + Some(PhysicalPredicate::Some( + Pred::and(column_pred!("phys_mapped.phys_n"), Pred::literal(true)).into(), + StructType::new_unchecked(vec![StructField::nullable( + "phys_mapped", + StructType::new_unchecked(vec![StructField::nullable( + "phys_n", + DataType::LONG, + ) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_n", + )])]), + ) + .with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_mapped", + )])]) + .into(), + )), + ), + ( + Pred::and(column_pred!("mapped.n"), Pred::literal(false)), + Some(PhysicalPredicate::StaticSkipAll), + ), + ]; + + for (predicate, expected) in test_cases { + let result = + PhysicalPredicate::try_new(&predicate, &logical_schema, ColumnMappingMode::Name).ok(); + assert_eq!( + result, expected, + "Failed for predicate: {predicate:#?}, expected {expected:#?}, got {result:#?}" + ); + } +} + +/// Delta column names are case-insensitive, so predicates with differently-cased column names +/// must still resolve against the schema. The predicate is rewritten to use the schema's casing +/// (or physical names when column mapping is enabled). 
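+///
+/// For example (mirroring the cases below, not a new rule): the predicate `createdat > 500`
+/// resolves against a schema column `createdAt` as `createdAt > 500` when column mapping is
+/// disabled, and as `phys_created > 500` when that field carries `phys_created` as its
+/// column-mapping physical name.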
+#[rstest] +#[case::without_column_mapping( + // predicate: createdat > 500 AND value < 100, schema: createdAt, Value + StructType::new_unchecked(vec![ + StructField::nullable("createdAt", DataType::LONG), + StructField::nullable("Value", DataType::LONG), + ]), + Pred::and( + Pred::gt(column_expr!("createdat"), Expr::literal(500i64)), + Pred::lt(column_expr!("value"), Expr::literal(100i64)), + ), + ColumnMappingMode::None, + PhysicalPredicate::Some( + Arc::new(Pred::and( + Pred::gt(column_expr!("createdAt"), Expr::literal(500i64)), + Pred::lt(column_expr!("Value"), Expr::literal(100i64)), + )), + StructType::new_unchecked(vec![ + StructField::nullable("createdAt", DataType::LONG), + StructField::nullable("Value", DataType::LONG), + ]).into(), + ), +)] +#[case::with_column_mapping( + // predicate: createdat > 500 AND value < 100, schema has physical name metadata + StructType::new_unchecked(vec![ + StructField::nullable("createdAt", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_created", + )]), + StructField::nullable("Value", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_value", + )]), + ]), + Pred::and( + Pred::gt(column_expr!("createdat"), Expr::literal(500i64)), + Pred::lt(column_expr!("value"), Expr::literal(100i64)), + ), + ColumnMappingMode::Name, + PhysicalPredicate::Some( + Arc::new(Pred::and( + Pred::gt(column_expr!("phys_created"), Expr::literal(500i64)), + Pred::lt(column_expr!("phys_value"), Expr::literal(100i64)), + )), + StructType::new_unchecked(vec![ + StructField::nullable("phys_created", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_created", + )]), + StructField::nullable("phys_value", DataType::LONG).with_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + "phys_value", + )]), + ]).into(), + ), +)] +#[case::duplicate_column_different_casing( + // predicate references same column with different casings: value > 5 AND VALUE < 10 + StructType::new_unchecked(vec![ + StructField::nullable("Value", DataType::LONG), + ]), + Pred::and( + Pred::gt(column_expr!("value"), Expr::literal(5i64)), + Pred::lt(column_expr!("VALUE"), Expr::literal(10i64)), + ), + ColumnMappingMode::None, + PhysicalPredicate::Some( + Arc::new(Pred::and( + Pred::gt(column_expr!("Value"), Expr::literal(5i64)), + Pred::lt(column_expr!("Value"), Expr::literal(10i64)), + )), + StructType::new_unchecked(vec![StructField::nullable("Value", DataType::LONG)]) + .into(), + ), +)] +#[case::nested_fields( + // predicate references nested.fieldname but schema has Nested.FieldName + StructType::new_unchecked(vec![StructField::nullable( + "Nested", + StructType::new_unchecked(vec![StructField::nullable("FieldName", DataType::LONG)]), + )]), + column_pred!("nested.fieldname"), + ColumnMappingMode::None, + PhysicalPredicate::Some( + column_pred!("Nested.FieldName").into(), + StructType::new_unchecked(vec![StructField::nullable( + "Nested", + StructType::new_unchecked(vec![ + StructField::nullable("FieldName", DataType::LONG) + ]), + )]).into(), + ), +)] +fn test_physical_predicate_case_insensitive( + #[case] logical_schema: StructType, + #[case] predicate: Predicate, + #[case] column_mapping_mode: ColumnMappingMode, + #[case] expected: PhysicalPredicate, +) { + let result = + PhysicalPredicate::try_new(&predicate, &logical_schema, column_mapping_mode).unwrap(); + assert_eq!(result, expected); +} + +/// Unknown column still fails even with 
case-insensitive matching. +#[test] +fn test_physical_predicate_case_insensitive_unknown_column() { + let logical_schema = + StructType::new_unchecked(vec![StructField::nullable("createdAt", DataType::LONG)]); + let result = PhysicalPredicate::try_new( + &column_pred!("nonexistent"), + &logical_schema, + ColumnMappingMode::None, + ); + assert!(result.is_err()); +} + +fn get_files_for_scan(scan: Scan, engine: &dyn Engine) -> DeltaResult> { + let scan_metadata_iter = scan.scan_metadata(engine)?; + fn scan_metadata_callback(paths: &mut Vec, scan_file: ScanFile) { + paths.push(scan_file.path.to_string()); + assert!(scan_file.dv_info.deletion_vector.is_none()); + } + let mut files = vec![]; + for res in scan_metadata_iter { + let scan_metadata = res?; + files = scan_metadata.visit_scan_files(files, scan_metadata_callback)?; + } + Ok(files) +} + +#[test] +fn test_scan_metadata_paths() { + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = SyncEngine::new(); + + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + let files = get_files_for_scan(scan, &engine).unwrap(); + assert_eq!(files.len(), 1); + assert_eq!( + files[0], + "part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet" + ); +} + +#[test_log::test] +fn test_scan_metadata() { + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + let files: Vec> = scan.execute(engine).unwrap().try_collect().unwrap(); + + assert_eq!(files.len(), 1); + let num_rows = files[0].as_ref().len(); + assert_eq!(num_rows, 10) +} + +#[test_log::test] +fn test_scan_metadata_from_same_version() { + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + let version = snapshot.version(); + let scan = snapshot.scan_builder().build().unwrap(); + let files: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .map_ok(|ScanMetadata { scan_files, .. }| { + let (underlying_data, selection_vector) = scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + let filtered_batch = + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); + Box::new(ArrowEngineData::from(filtered_batch)) as Box + }) + .try_collect() + .unwrap(); + let new_files: Vec<_> = scan + .scan_metadata_from(engine.as_ref(), version, files, None) + .unwrap() + .try_collect() + .unwrap(); + + assert_eq!(new_files.len(), 1); +} + +// reading v0 with 3 files. +// updating to v1 with 3 more files added. 
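+// The second scan below replays from version 0 using the batches returned by the first scan, and
+// the assertions at the end expect two batches of 3 rows each (presumably the v0 files carried
+// forward plus the files added in v1).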
+#[test_log::test] +fn test_scan_metadata_from_with_update() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/basic_partitioned/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + + let snapshot = Snapshot::builder_for(url.clone()) + .at_version(0) + .build(engine.as_ref()) + .unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + let files: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .map_ok(|ScanMetadata { scan_files, .. }| { + let (underlying_data, selection_vector) = scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap() + }) + .try_collect() + .unwrap(); + assert_eq!(files.len(), 1); + assert_eq!(files[0].num_rows(), 3); + + let files: Vec<_> = files + .into_iter() + .map(|b| Box::new(ArrowEngineData::from(b)) as Box) + .collect(); + let snapshot = Snapshot::builder_for(url) + .at_version(1) + .build(engine.as_ref()) + .unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + let new_files: Vec<_> = scan + .scan_metadata_from(engine.as_ref(), 0, files, None) + .unwrap() + .map_ok(|ScanMetadata { scan_files, .. }| { + let (underlying_data, selection_vector) = scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap() + }) + .try_collect() + .unwrap(); + assert_eq!(new_files.len(), 2); + assert_eq!(new_files[0].num_rows(), 3); + assert_eq!(new_files[1].num_rows(), 3); +} + +#[test] +fn test_get_partition_value() { + let cases = [ + ( + "string", + PrimitiveType::String, + Scalar::String("string".to_string()), + ), + ("123", PrimitiveType::Integer, Scalar::Integer(123)), + ("1234", PrimitiveType::Long, Scalar::Long(1234)), + ("12", PrimitiveType::Short, Scalar::Short(12)), + ("1", PrimitiveType::Byte, Scalar::Byte(1)), + ("1.1", PrimitiveType::Float, Scalar::Float(1.1)), + ("10.10", PrimitiveType::Double, Scalar::Double(10.1)), + ("true", PrimitiveType::Boolean, Scalar::Boolean(true)), + ("2024-01-01", PrimitiveType::Date, Scalar::Date(19723)), + ("1970-01-01", PrimitiveType::Date, Scalar::Date(0)), + ( + "1970-01-01 00:00:00", + PrimitiveType::Timestamp, + Scalar::Timestamp(0), + ), + ( + "1970-01-01 00:00:00.123456", + PrimitiveType::Timestamp, + Scalar::Timestamp(123456), + ), + ( + "1970-01-01 00:00:00.123456789", + PrimitiveType::Timestamp, + Scalar::Timestamp(123456), + ), + ]; + + for (raw, data_type, expected) in &cases { + let value = crate::scan::transform_spec::parse_partition_value_raw( + Some(&raw.to_string()), + &DataType::Primitive(data_type.clone()), + ) + .unwrap(); + assert_eq!(value, *expected); + } +} + +#[test] +fn test_replay_for_scan_metadata() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let engine = SyncEngine::new(); + + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + let result = scan.replay_for_scan_metadata(&engine).unwrap(); + let data: Vec<_> = result.actions.try_collect().unwrap(); + // No predicate pushdown attempted, because at most one part of a multi-part checkpoint + // could be skipped when looking for adds/removes. 
+ // + // NOTE: Each checkpoint part is a single-row file -- guaranteed to produce one row group. + assert_eq!(data.len(), 5); +} + +#[test] +fn test_data_row_group_skipping() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let engine = Arc::new(SyncEngine::new()); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + // No predicate pushdown attempted, so the one data file should be returned. + // + // NOTE: The data file contains only five rows -- near guaranteed to produce one row group. + let scan = snapshot.clone().scan_builder().build().unwrap(); + let data: Vec<_> = scan.execute(engine.clone()).unwrap().try_collect().unwrap(); + assert_eq!(data.len(), 1); + + // Ineffective predicate pushdown attempted, so the one data file should be returned. + let int_col = column_expr!("numeric.ints.int32"); + let value = Expr::literal(1000i32); + let predicate = Arc::new(int_col.clone().gt(value.clone())); + let scan = snapshot + .clone() + .scan_builder() + .with_predicate(predicate) + .build() + .unwrap(); + let data: Vec<_> = scan.execute(engine.clone()).unwrap().try_collect().unwrap(); + assert_eq!(data.len(), 1); + + // TODO(#860): we disable predicate pushdown until we support row indexes. Update this test + // accordingly after support is reintroduced. + // + // Effective predicate pushdown, so no data files should be returned. BUT since we disabled + // predicate pushdown, the one data file is still returned. + let predicate = Arc::new(int_col.lt(value)); + let scan = snapshot + .scan_builder() + .with_predicate(predicate) + .build() + .unwrap(); + let data: Vec<_> = scan.execute(engine).unwrap().try_collect().unwrap(); + assert_eq!(data.len(), 1); +} + +#[test] +fn test_missing_column_row_group_skipping() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parquet_row_group_skipping/")); + let url = url::Url::from_directory_path(path.unwrap()).unwrap(); + let engine = Arc::new(SyncEngine::new()); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + // Predicate over a logically valid but physically missing column. No data files should be + // returned because the column is inferred to be all-null. + // + // WARNING: https://github.com/delta-io/delta-kernel-rs/issues/434 - This + // optimization is currently disabled, so the one data file is still returned. 
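+    // If/when that optimization is re-enabled, the expected count below should become zero data
+    // files for this predicate.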
+    let predicate = Arc::new(column_expr!("missing").lt(Expr::literal(1000i64)));
+    let scan = snapshot
+        .clone()
+        .scan_builder()
+        .with_predicate(predicate)
+        .build()
+        .unwrap();
+    let data: Vec<_> = scan.execute(engine.clone()).unwrap().try_collect().unwrap();
+    assert_eq!(data.len(), 1);
+
+    // Predicate over a logically missing column fails the scan
+    let predicate = Arc::new(column_expr!("numeric.ints.invalid").lt(Expr::literal(1000)));
+    snapshot
+        .scan_builder()
+        .with_predicate(predicate)
+        .build()
+        .expect_err("unknown column");
+}
+
+#[test_log::test]
+fn test_scan_with_checkpoint() -> DeltaResult<()> {
+    let path = std::fs::canonicalize(PathBuf::from(
+        "./tests/data/with_checkpoint_no_last_checkpoint/",
+    ))?;
+
+    let url = url::Url::from_directory_path(path).unwrap();
+    let engine = SyncEngine::new();
+
+    let snapshot = Snapshot::builder_for(url).build(&engine).unwrap();
+    let scan = snapshot.scan_builder().build()?;
+    let files = get_files_for_scan(scan, &engine)?;
+    // test case:
+    //
+    // commit0: P and M, no add/remove
+    // commit1: add file-ad1
+    // commit2: remove file-ad1, add file-a19
+    // checkpoint2: remove file-ad1, add file-a19
+    // commit3: remove file-a19, add file-70b
+    //
+    // thus replay should produce only file-70b
+    assert_eq!(
+        files,
+        vec!["part-00000-70b1dcdf-0236-4f63-a072-124cdbafd8a0-c000.snappy.parquet"]
+    );
+    Ok(())
+}
+
+/// Helper to validate that JSON stats object values match the corresponding parsed struct array.
+fn assert_stats_struct_matches_json(
+    struct_array: &StructArray,
+    json_object: &serde_json::Map<String, serde_json::Value>,
+    row_idx: usize,
+    field_name: &str,
+) {
+    for (col_name, json_val) in json_object {
+        let Some(col) = struct_array.column_by_name(col_name) else {
+            continue;
+        };
+        if col.is_null(row_idx) {
+            continue;
+        }
+        // Currently only validates Int64 columns (the table has integer stats)
+        if let Some(int_col) = col.as_any().downcast_ref::<Int64Array>() {
+            assert_eq!(
+                json_val.as_i64().unwrap(),
+                int_col.value(row_idx),
+                "{field_name}.{col_name} mismatch at row {row_idx}"
+            );
+        }
+    }
+}
+
+/// Test that `include_all_stats_columns()` outputs parsed stats in scan_metadata batches.
+/// Uses a table with a checkpoint that contains stats_parsed for e2e verification.
+#[test] +fn test_scan_metadata_with_stats_columns() { + const STATS_PARSED_COL: &str = "stats_parsed"; + + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + let scan = snapshot + .scan_builder() + .include_all_stats_columns() + .build() + .unwrap(); + + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert!( + !scan_metadata_results.is_empty(), + "Should have scan metadata" + ); + + let mut total_num_records: i64 = 0; + let mut file_count = 0; + + for scan_metadata in scan_metadata_results { + let (underlying_data, selection_vector) = scan_metadata.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + let filtered_batch = + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); + + // Verify stats_parsed schema + let schema = filtered_batch.schema(); + let field = schema + .field_with_name(STATS_PARSED_COL) + .expect("Schema should contain stats_parsed column"); + assert!( + matches!(field.data_type(), ArrowDataType::Struct(_)), + "stats_parsed should be a struct type, got: {:?}", + field.data_type() + ); + + // Extract stats_parsed struct array + let stats_parsed = get_column!(filtered_batch, STATS_PARSED_COL, StructArray); + let num_records = get_column!(stats_parsed, "numRecords", Int64Array); + let min_values = get_column!(stats_parsed, "minValues", StructArray); + let max_values = get_column!(stats_parsed, "maxValues", StructArray); + let null_count = get_column!(stats_parsed, "nullCount", StructArray); + + // Extract JSON stats column + let stats_json = get_column!(filtered_batch, "stats", StringArray); + + // Validate each row: JSON stats should match structured stats + for i in 0..stats_json.len() { + if stats_parsed.is_null(i) || stats_json.is_null(i) { + continue; + } + + let json_stats: serde_json::Value = + serde_json::from_str(stats_json.value(i)).expect("stats JSON should be valid"); + + // Validate numRecords + if let Some(json_num) = json_stats.get("numRecords").and_then(|v| v.as_i64()) { + assert_eq!( + json_num, + num_records.value(i), + "numRecords mismatch at row {i}" + ); + } + + // Validate minValues, maxValues, nullCount + if let Some(obj) = json_stats.get("minValues").and_then(|v| v.as_object()) { + assert_stats_struct_matches_json(min_values, obj, i, "minValues"); + } + if let Some(obj) = json_stats.get("maxValues").and_then(|v| v.as_object()) { + assert_stats_struct_matches_json(max_values, obj, i, "maxValues"); + } + if let Some(obj) = json_stats.get("nullCount").and_then(|v| v.as_object()) { + assert_stats_struct_matches_json(null_count, obj, i, "nullCount"); + } + + total_num_records += num_records.value(i); + file_count += 1; + } + } + + assert!(file_count > 0, "Should have processed at least one file"); + assert!(total_num_records > 0, "Should have non-zero numRecords"); + println!( + "Verified {file_count} files with total {total_num_records} records from stats_parsed" + ); +} + +/// Test that `include_all_stats_columns` and `with_predicate` can be used together. +/// The scan should output stats_parsed AND perform data skipping via the predicate. 
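+///
+/// Builder usage exercised below, shown here for orientation:
+/// `snapshot.scan_builder().with_predicate(predicate).include_all_stats_columns().build()`.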
+#[test] +fn test_scan_metadata_stats_columns_with_predicate() { + const STATS_PARSED_COL: &str = "stats_parsed"; + + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + // Build scan with both a predicate and stats_columns + let predicate = Arc::new(column_expr!("id").gt(Expr::literal(0i64))); + let scan = snapshot + .scan_builder() + .with_predicate(predicate) + .include_all_stats_columns() + .build() + .expect("Should succeed when using both predicate and stats_columns"); + + // Verify the scan has a physical predicate (data skipping is active) + assert!( + scan.physical_predicate().is_some(), + "Scan should have a physical predicate for data skipping" + ); + + // Run scan_metadata and verify stats_parsed is present in the output + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert!( + !scan_metadata_results.is_empty(), + "Should have scan metadata results" + ); + + let mut file_count = 0; + for scan_metadata in scan_metadata_results { + let (underlying_data, selection_vector) = scan_metadata.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + let filtered_batch = + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); + + // Verify stats_parsed column exists and is a struct type + let schema = filtered_batch.schema(); + let field = schema + .field_with_name(STATS_PARSED_COL) + .expect("Schema should contain stats_parsed column"); + assert!( + matches!(field.data_type(), ArrowDataType::Struct(_)), + "stats_parsed should be a struct type" + ); + + // Verify stats_parsed has data + let stats_parsed = get_column!(filtered_batch, STATS_PARSED_COL, StructArray); + let num_records = get_column!(stats_parsed, "numRecords", Int64Array); + for i in 0..filtered_batch.num_rows() { + if !stats_parsed.is_null(i) { + assert!(num_records.value(i) > 0, "numRecords should be positive"); + file_count += 1; + } + } + } + + assert!( + file_count > 0, + "Should have processed at least one file with stats" + ); +} + +#[test] +fn test_prefix_columns_simple() { + let mut prefixer = PrefixColumns { + prefix: ColumnName::new(["add", "stats_parsed"]), + }; + // A simple binary predicate: x > 100 + let pred = Pred::gt(column_expr!("x"), Expr::literal(100i64)); + let result = prefixer.transform_pred(&pred).unwrap().into_owned(); + + // The column reference should now be add.stats_parsed.x + let refs: Vec<_> = result.references().into_iter().collect(); + assert_eq!(refs.len(), 1); + assert_eq!(*refs[0], column_name!("add.stats_parsed.x")); +} + +#[test] +fn test_build_actions_meta_predicate_with_predicate() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = SyncEngine::new(); + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + + // Build a scan with a predicate eligible for data skipping + let predicate = Arc::new(Pred::gt(column_expr!("id"), Expr::literal(400i64))); + let scan = snapshot + .scan_builder() + .with_predicate(predicate) + .build() + .unwrap(); + + let meta_pred = scan.build_actions_meta_predicate(); + assert!( + meta_pred.is_some(), + "Should produce an actions meta 
predicate for a data-skipping-eligible predicate" + ); + + // Verify all column references are prefixed with add.stats_parsed + let pred = meta_pred.unwrap(); + for col_ref in pred.references() { + let path: Vec<_> = col_ref.iter().collect(); + assert_eq!( + path[0], "add", + "Column reference should start with 'add': {col_ref}" + ); + assert_eq!( + path[1], "stats_parsed", + "Column reference should have 'stats_parsed' as second element: {col_ref}" + ); + } +} + +#[test] +fn test_build_actions_meta_predicate_no_predicate() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = SyncEngine::new(); + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + + // Build a scan with no predicate + let scan = snapshot.scan_builder().build().unwrap(); + + assert!( + scan.build_actions_meta_predicate().is_none(), + "Should return None when there is no predicate" + ); +} + +#[test] +fn test_build_actions_meta_predicate_static_skip_all() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = SyncEngine::new(); + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + + // A predicate that statically evaluates to false should produce StaticSkipAll, + // which means build_actions_meta_predicate returns None. + let predicate = Arc::new(Pred::literal(false)); + let scan = snapshot + .scan_builder() + .with_predicate(predicate) + .build() + .unwrap(); + + assert!( + scan.build_actions_meta_predicate().is_none(), + "StaticSkipAll predicate should return None" + ); +} + +/// Helper to build a parquet file with the nested `add.stats_parsed.*` structure that +/// checkpoint files have. Returns the parquet bytes and the arrow schema. +/// +/// Each call to `write_row_group` writes one row group. Each row represents one add action's +/// stats with maxValues.id, minValues.id, nullCount.id, and numRecords. +struct CheckpointParquetBuilder { + arrow_schema: Arc, + id_fields: Fields, + stats_fields: Fields, + buffer: Vec, + writer: Option>>, +} + +impl CheckpointParquetBuilder { + fn new() -> Self { + let id_fields = Fields::from(vec![Field::new("id", ArrowDataType::Int64, true)]); + let stats_fields = Fields::from(vec![ + Field::new("maxValues", ArrowDataType::Struct(id_fields.clone()), true), + Field::new("minValues", ArrowDataType::Struct(id_fields.clone()), true), + Field::new("nullCount", ArrowDataType::Struct(id_fields.clone()), true), + Field::new("numRecords", ArrowDataType::Int64, true), + ]); + let add_fields = Fields::from(vec![Field::new( + "stats_parsed", + ArrowDataType::Struct(stats_fields.clone()), + true, + )]); + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "add", + ArrowDataType::Struct(add_fields.clone()), + true, + )])); + let buffer = Vec::new(); + let writer = ArrowWriter::try_new(buffer, arrow_schema.clone(), None).unwrap(); + // ArrowWriter takes ownership of buffer, so we use a placeholder here. + Self { + arrow_schema, + id_fields, + stats_fields, + buffer: Vec::new(), + writer: Some(writer), + } + } + + /// Writes one row group with the given per-file stats. 
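+    ///
+    /// For example, the first row group written in the test below comes from
+    /// `builder.write_row_group(&[Some(100), None], &[Some(1), None], &[Some(5), None], &[100, 50])`,
+    /// i.e. two add actions whose maxValues.id / minValues.id / nullCount.id / numRecords are taken
+    /// from the corresponding slice positions.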
+ fn write_row_group( + &mut self, + max_ids: &[Option], + min_ids: &[Option], + null_counts: &[Option], + num_records: &[i64], + ) { + let make_id_struct = |vals: &[Option]| -> StructArray { + StructArray::from(vec![( + Arc::new(Field::new("id", ArrowDataType::Int64, true)), + Arc::new(Int64Array::from(vals.to_vec())) as Arc, + )]) + }; + let stats_parsed = StructArray::from(vec![ + ( + Arc::new(Field::new( + "maxValues", + ArrowDataType::Struct(self.id_fields.clone()), + true, + )), + Arc::new(make_id_struct(max_ids)) as Arc, + ), + ( + Arc::new(Field::new( + "minValues", + ArrowDataType::Struct(self.id_fields.clone()), + true, + )), + Arc::new(make_id_struct(min_ids)) as Arc, + ), + ( + Arc::new(Field::new( + "nullCount", + ArrowDataType::Struct(self.id_fields.clone()), + true, + )), + Arc::new(make_id_struct(null_counts)) as Arc, + ), + ( + Arc::new(Field::new("numRecords", ArrowDataType::Int64, true)), + Arc::new(Int64Array::from(num_records.to_vec())) as Arc, + ), + ]); + let add = StructArray::from(vec![( + Arc::new(Field::new( + "stats_parsed", + ArrowDataType::Struct(self.stats_fields.clone()), + true, + )), + Arc::new(stats_parsed) as Arc, + )]); + let batch = RecordBatch::try_new(self.arrow_schema.clone(), vec![Arc::new(add)]).unwrap(); + let writer = self.writer.as_mut().unwrap(); + writer.write(&batch).unwrap(); + writer.flush().unwrap(); + } + + /// Finishes writing and returns the parquet bytes. + fn finish(mut self) -> Bytes { + let writer = self.writer.take().unwrap(); + self.buffer = writer.into_inner().unwrap(); + Bytes::from(self.buffer) + } +} + +/// Builds a checkpoint skipping predicate and prefixes column references with `add.stats_parsed`. +fn build_prefixed_checkpoint_predicate(pred: &Pred) -> Option { + let skipping_pred = as_checkpoint_skipping_predicate(pred, &[])?; + let mut prefixer = PrefixColumns { + prefix: ColumnName::new(["add", "stats_parsed"]), + }; + Some( + prefixer + .transform_pred(&skipping_pred) + .unwrap() + .into_owned(), + ) +} + +/// Applies a meta predicate as a row group filter and returns the total rows read. +fn apply_row_group_filter(parquet_bytes: Bytes, meta_predicate: &Pred) -> usize { + ParquetRecordBatchReaderBuilder::try_new(parquet_bytes) + .unwrap() + .with_row_group_filter(meta_predicate, None) + .build() + .unwrap() + .map(|b| b.unwrap().num_rows()) + .sum() +} + +/// Tests checkpoint row group skipping end-to-end with the parquet row group filter. 
+/// +/// Shared parquet layout (4 row groups): +/// - RG 0 (2 rows): maxValues.id = [100, NULL], nullCount.id = [5, NULL] +/// - RG 1 (1 row): maxValues.id = 300, nullCount.id = 0 +/// - RG 2 (1 row): maxValues.id = 50, nullCount.id = 10 +/// - RG 3 (2 rows): maxValues.id = [150, 40], nullCount.id = [0, NULL] +/// +/// | Predicate | RG 0 (2 rows) | RG 1 (1 row) | RG 2 (1 row) | RG 3 (2 rows) | Total | +/// |----------------|-----------------------|--------------------|--------------------|-----------------------|-------| +/// | id > 200 | keep (null max stats) | keep (max=300>200) | skip (max=50<200) | skip (max=150<200) | 3 | +/// | id IS NULL | keep (nullCount>0) | skip (nullCount=0) | keep (nullCount=10)| keep (null nullCount) | 5 | +/// | id IS NOT NULL | no predicate (col vs col, #1873) | 6 | +#[rstest] +#[case::comparison( + Pred::gt(column_expr!("id"), Expr::literal(200i64)), + // Should skip RG2 and RG3, but https://github.com/apache/arrow-rs/issues/9451 + Some(6), // Some(3), + "keep RG 0 (null stats) + RG 1 (max>200), skip RG 2 + RG 3 (max<200)" +)] +#[case::is_null( + Pred::is_null(column_expr!("id")), + // Should skip RG 1 (nullCount=0), but https://github.com/apache/arrow-rs/issues/9451 + Some(6), // Some(5), + "keep RG 0 (nullCount>0) + RG 2 (nullCount>0) + RG 3 (null nullCount), skip RG 1 (nullCount=0)" +)] +#[case::is_not_null( + Pred::not(Pred::is_null(column_expr!("id"))), + None, + "IS NOT NULL produces no skipping predicate — column vs column (#1873)" +)] +fn test_checkpoint_row_group_skipping( + #[case] pred: Pred, + #[case] expected_rows: Option, + #[case] description: &str, +) { + let mut builder = CheckpointParquetBuilder::new(); + // RG 0: mixed null/non-null stats. maxValues.id = [100, NULL], nullCount.id = [5, NULL]. + builder.write_row_group( + &[Some(100), None], + &[Some(1), None], + &[Some(5), None], + &[100, 50], + ); + // RG 1: maxValues.id = 300, nullCount.id = 0. + builder.write_row_group(&[Some(300)], &[Some(201)], &[Some(0)], &[100]); + // RG 2: maxValues.id = 50, nullCount.id = 10. + builder.write_row_group(&[Some(50)], &[Some(1)], &[Some(10)], &[100]); + // RG 3: maxValues.id = [150, 40], nullCount.id = [0, NULL]. + // Tests that null nullCount stats are conservatively kept for IS NULL. + builder.write_row_group( + &[Some(150), Some(40)], + &[Some(1), Some(1)], + &[Some(0), None], + &[100, 50], + ); + let parquet_bytes = builder.finish(); + + let meta_predicate = build_prefixed_checkpoint_predicate(&pred); + + match expected_rows { + Some(expected) => { + let meta_predicate = + meta_predicate.expect("predicate should produce a checkpoint skipping predicate"); + let total_rows = apply_row_group_filter(parquet_bytes, &meta_predicate); + assert_eq!(total_rows, expected, "{description}"); + } + None => { + assert!(meta_predicate.is_none(), "{description}"); + // Without a predicate, all row groups are read. 
+ let total_rows: usize = ParquetRecordBatchReaderBuilder::try_new(parquet_bytes) + .unwrap() + .build() + .unwrap() + .map(|b| b.unwrap().num_rows()) + .sum(); + assert_eq!(total_rows, 6, "all rows should be read without a predicate"); + } + } +} + +#[test] +fn test_skip_stats_disables_data_skipping() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + let predicate = Arc::new(Pred::gt(column_expr!("id"), Expr::literal(400i64))); + let scan = snapshot + .scan_builder() + .with_predicate(predicate) + .with_skip_stats(true) + .build() + .unwrap(); + + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + let mut selected_file_count = 0; + for scan_metadata in &scan_metadata_results { + let selection_vector = scan_metadata.scan_files.selection_vector(); + selected_file_count += selection_vector + .iter() + .filter(|&&selected| selected) + .count(); + } + + assert_eq!(selected_file_count, 6); +} + +#[test] +fn test_skip_stats_after_include_all_stats_columns_wins() { + // With StatsOutputMode enum, last call wins. Calling with_skip_stats(true) after + // include_all_stats_columns() should result in stats being skipped. + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + let predicate = Arc::new(Pred::gt(column_expr!("id"), Expr::literal(400i64))); + let scan = snapshot + .scan_builder() + .include_all_stats_columns() + .with_skip_stats(true) + .with_predicate(predicate) + .build() + .unwrap(); + + // Stats are skipped, so all files should be returned (no data skipping) + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + let mut selected_file_count = 0; + for scan_metadata in &scan_metadata_results { + let selection_vector = scan_metadata.scan_files.selection_vector(); + selected_file_count += selection_vector + .iter() + .filter(|&&selected| selected) + .count(); + } + + assert_eq!(selected_file_count, 6); +} + +#[test] +fn test_with_stats_columns_empty_no_stats_output() { + // with_stats_columns(vec![]) should produce no stats output + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + let scan = snapshot + .scan_builder() + .with_stats_columns(vec![]) + .build() + .unwrap(); + + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert!( + !scan_metadata_results.is_empty(), + "Should have scan metadata" + ); + + for scan_metadata in scan_metadata_results { + let (underlying_data, _selection_vector) = scan_metadata.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + + // stats_parsed should not be present since empty Columns means no stats output + assert!( + batch.column_by_name("stats_parsed").is_none(), + "stats_parsed should not be present with 
empty stats columns" + ); + } +} + +/// Test that `with_stats_columns` with specific columns only returns stats for those columns. +/// Verifies that requesting `vec![col!("id")]` only includes `id` in minValues/maxValues/nullCount. +#[test] +fn test_scan_metadata_with_specific_stats_columns() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + // Request only "id" column stats + let scan = snapshot + .scan_builder() + .with_stats_columns(vec![column_name!("id")]) + .build() + .unwrap(); + + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert!( + !scan_metadata_results.is_empty(), + "Should have scan metadata" + ); + + for scan_metadata in scan_metadata_results { + let (underlying_data, selection_vector) = scan_metadata.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + let filtered_batch = + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); + + let stats_parsed = get_column!(filtered_batch, "stats_parsed", StructArray); + let min_values = get_column!(stats_parsed, "minValues", StructArray); + let max_values = get_column!(stats_parsed, "maxValues", StructArray); + let null_count = get_column!(stats_parsed, "nullCount", StructArray); + + // Check minValues/maxValues/nullCount only have "id" + assert_eq!( + field_names(min_values), + vec!["id"], + "minValues should only contain 'id'" + ); + assert_eq!( + field_names(max_values), + vec!["id"], + "maxValues should only contain 'id'" + ); + assert_eq!( + field_names(null_count), + vec!["id"], + "nullCount should only contain 'id'" + ); + } +} + +/// Test that `with_stats_columns` with multiple specific columns returns stats for all of them. 
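+///
+/// The builder call under test is `with_stats_columns(vec![column_name!("id"), column_name!("name")])`;
+/// stats for columns not in that list (here `age` and `salary`) should be absent from the output.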
+#[test] +fn test_scan_metadata_with_multiple_stats_columns() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + // Request "id" and "name" column stats (not "age" or "salary") + let scan = snapshot + .scan_builder() + .with_stats_columns(vec![column_name!("id"), column_name!("name")]) + .build() + .unwrap(); + + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert!( + !scan_metadata_results.is_empty(), + "Should have scan metadata" + ); + + for scan_metadata in scan_metadata_results { + let (underlying_data, selection_vector) = scan_metadata.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + let filtered_batch = + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); + + let stats_parsed = get_column!(filtered_batch, "stats_parsed", StructArray); + let min_values = get_column!(stats_parsed, "minValues", StructArray); + let max_values = get_column!(stats_parsed, "maxValues", StructArray); + let null_count = get_column!(stats_parsed, "nullCount", StructArray); + + // Check minValues/maxValues/nullCount have "id" and "name" + let expected = vec!["id", "name"]; + assert_eq!( + field_names(min_values), + expected, + "minValues should contain 'id' and 'name'" + ); + assert_eq!( + field_names(max_values), + expected, + "maxValues should contain 'id' and 'name'" + ); + assert_eq!( + field_names(null_count), + expected, + "nullCount should contain 'id' and 'name'" + ); + + // Verify "age" and "salary" are NOT present + assert!( + min_values.column_by_name("age").is_none(), + "minValues should NOT contain 'age'" + ); + assert!( + min_values.column_by_name("salary").is_none(), + "minValues should NOT contain 'salary'" + ); + } +} + +/// Test that `with_stats_columns` with a nonexistent column name produces empty stats for that column. 
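+///
+/// Note: `numRecords` is a file-level statistic, so it is still expected in the output even though
+/// the requested column does not exist (see the assertion at the end of the test).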
+#[test] +fn test_scan_metadata_with_nonexistent_stats_columns() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(SyncEngine::new()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + + let scan = snapshot + .scan_builder() + .with_stats_columns(vec![column_name!("nonexistent_column")]) + .build() + .unwrap(); + + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + + assert!( + !scan_metadata_results.is_empty(), + "Should have scan metadata" + ); + + for scan_metadata in scan_metadata_results { + let (underlying_data, selection_vector) = scan_metadata.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(underlying_data) + .unwrap() + .into(); + let filtered_batch = + filter_record_batch(&batch, &BooleanArray::from(selection_vector)).unwrap(); + + let stats_parsed = get_column!(filtered_batch, "stats_parsed", StructArray); + + // Should have numRecords but no minValues/maxValues/nullCount + // (or they exist but are empty structs) + assert!( + stats_parsed.column_by_name("numRecords").is_some(), + "Should still have numRecords" + ); + } +} + +/// A [`ParquetHandler`] that returns an empty iterator for every `read_parquet_files` call. +/// Used to simulate a buggy connector that drops all data for a file. +struct EmptyParquetHandler; + +impl ParquetHandler for EmptyParquetHandler { + fn read_parquet_files( + &self, + _files: &[FileMeta], + _schema: crate::schema::SchemaRef, + _predicate: Option, + ) -> crate::DeltaResult { + Ok(Box::new(std::iter::empty())) + } + + fn read_parquet_footer(&self, _file: &FileMeta) -> crate::DeltaResult { + unimplemented!() + } + + fn write_parquet_file( + &self, + _location: url::Url, + _data: Box>> + Send>, + ) -> crate::DeltaResult<()> { + unimplemented!() + } +} + +/// An [`Engine`] that delegates everything to a [`SyncEngine`] except `parquet_handler`, which +/// returns [`EmptyParquetHandler`]. +struct EmptyParquetEngine(Arc); + +impl Engine for EmptyParquetEngine { + fn evaluation_handler(&self) -> Arc { + self.0.evaluation_handler() + } + + fn json_handler(&self) -> Arc { + self.0.json_handler() + } + + fn parquet_handler(&self) -> Arc { + Arc::new(EmptyParquetHandler) + } + + fn storage_handler(&self) -> Arc { + self.0.storage_handler() + } +} + +/// When a file's Add action stats report `numRecords > 0` and the parquet handler returns an empty +/// iterator, `execute` must surface an error rather than silently producing no rows. 
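+///
+/// The test asserts that the surfaced error message contains "ParquetHandler returned no data"
+/// for the single file in this table.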
+#[test] +fn execute_errors_when_parquet_returns_empty_for_file_with_positive_stats() { + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(EmptyParquetEngine(Arc::new(SyncEngine::new()))); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + + let results: Vec<_> = scan.execute(engine).unwrap().collect(); + assert_eq!(results.len(), 1, "should emit exactly one error item"); + assert!(results[0].is_err(), "the result should be an error, got Ok"); + let err = results[0].as_ref().err().unwrap().to_string(); + assert!( + err.contains("ParquetHandler returned no data"), + "unexpected error message: {err}" + ); +} + +/// When a file's Add action has no stats, an empty iterator from the parquet handler is allowed +/// -- we conservatively treat the file as possibly legitimately empty. +#[test] +fn execute_does_not_error_when_parquet_returns_empty_and_stats_absent() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/table-with-cdf/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = Arc::new(EmptyParquetEngine(Arc::new(SyncEngine::new()))); + + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + + // All Add files in this table have no stats -- empty iterators should be silently ignored. + let results: Vec<_> = scan.execute(engine).unwrap().collect(); + assert!( + results.iter().all(|r| r.is_ok()), + "expected no errors for stats-absent files" + ); +} + +/// Tests for ScanMetadataCompleted event emission +mod scan_metadata_completed_tests { + use std::path::PathBuf; + use std::sync::Arc; + use std::time::Duration; + + use rstest::rstest; + + use crate::engine::default::DefaultEngineBuilder; + use crate::expressions::{column_expr, Expression as Expr, Predicate as Pred}; + use crate::metrics::MetricEvent; + use crate::object_store::local::LocalFileSystem; + use crate::utils::test_utils::CapturingReporter; + use crate::Snapshot; + + fn run_scan(table: &str, predicate: Option>) -> (Arc, usize) { + let path = std::fs::canonicalize(PathBuf::from(table)).unwrap(); + let url = url::Url::from_directory_path(&path).unwrap(); + let reporter = Arc::new(CapturingReporter::default()); + let engine = Arc::new( + DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())) + .with_metrics_reporter(reporter.clone()) + .build(), + ); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + let mut builder = snapshot.scan_builder(); + if let Some(pred) = predicate { + builder = builder.with_predicate(pred); + } + let scan = builder.build().unwrap(); + let results: Vec<_> = scan + .scan_metadata(engine.as_ref()) + .unwrap() + .collect::, _>>() + .unwrap(); + (reporter, results.len()) + } + + fn get_scan_event(reporter: &CapturingReporter) -> MetricEvent { + reporter + .events() + .into_iter() + .find(|e| matches!(e, MetricEvent::ScanMetadataCompleted { .. 
})) + .expect("expected ScanMetadataCompleted event") + } + + #[rstest] + #[case::basic_scan("./tests/data/parsed-stats/", None, 6, 6, 0, 0)] + #[case::static_skip_all( + "./tests/data/parsed-stats/", + Some(Arc::new(Pred::literal(false))), + 0, + 0, + 0, + 0 + )] + #[case::with_removes("./tests/data/table-with-cdf/", None, 1, 0, 2, 0)] + #[case::with_removes("./tests/data/with_checkpoint_no_last_checkpoint/", None, 2, 1, 1, 0)] + #[case::partition_filter( + "./tests/data/basic_partitioned/", + Some(Arc::new(Expr::eq(column_expr!("letter"), Expr::literal("a")))), + 2, 2, 0, 4 + )] + fn test_scan_metrics( + #[case] table: &str, + #[case] predicate: Option>, + #[case] expected_add_seen: u64, + #[case] expected_active: u64, + #[case] expected_removes: u64, + #[case] expected_filtered: u64, + ) { + let (reporter, _) = run_scan(table, predicate); + let MetricEvent::ScanMetadataCompleted { + total_duration, + num_add_files_seen, + num_active_add_files, + num_remove_files_seen, + num_predicate_filtered, + .. + } = get_scan_event(&reporter) + else { + panic!("expected ScanMetadataCompleted"); + }; + assert!(total_duration > Duration::ZERO); + assert_eq!(num_add_files_seen, expected_add_seen); + assert_eq!(num_active_add_files, expected_active); + assert_eq!(num_remove_files_seen, expected_removes); + assert_eq!(num_predicate_filtered, expected_filtered); + } + + #[test] + fn test_no_metrics_on_early_drop() { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/parsed-stats/")).unwrap(); + let url = url::Url::from_directory_path(&path).unwrap(); + let reporter = Arc::new(CapturingReporter::default()); + let engine = Arc::new( + DefaultEngineBuilder::new(Arc::new(LocalFileSystem::new())) + .with_metrics_reporter(reporter.clone()) + .build(), + ); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); + let scan = snapshot.scan_builder().build().unwrap(); + { + let mut iter = scan.scan_metadata(engine.as_ref()).unwrap(); + let _ = iter.next(); + } + assert!(reporter + .events() + .iter() + .all(|e| !matches!(e, MetricEvent::ScanMetadataCompleted { .. }))); + } +} diff --git a/kernel/src/transforms.rs b/kernel/src/scan/transform_spec.rs similarity index 92% rename from kernel/src/transforms.rs rename to kernel/src/scan/transform_spec.rs index 571c861309..8682e1a8a7 100644 --- a/kernel/src/transforms.rs +++ b/kernel/src/scan/transform_spec.rs @@ -8,11 +8,11 @@ use std::collections::HashMap; use std::sync::Arc; use itertools::Itertools; +use serde::{Deserialize, Serialize}; -use crate::expressions::{ - BinaryExpressionOp, Expression, ExpressionRef, Scalar, Transform, VariadicExpressionOp, -}; +use crate::expressions::{BinaryExpressionOp, Expression, ExpressionRef, Scalar, Transform}; use crate::schema::{DataType, SchemaRef, StructType}; +use crate::table_features::ColumnMappingMode; use crate::{DeltaResult, Error}; /// A list of field transforms that describes a transform expression to be created at scan time. @@ -22,7 +22,7 @@ pub(crate) type TransformSpec = Vec; /// /// These transformations are "sparse" - they only specify what changes, while unchanged fields /// pass through implicitly in their original order. 
-#[derive(Debug)] +#[derive(Debug, Serialize, Deserialize, PartialEq)] pub(crate) enum FieldTransformSpec { /// Insert the given expression after the named input column (None = prepend instead) // NOTE: It's quite likely we will sometimes need to reorder columns for one reason or another, @@ -70,13 +70,14 @@ pub(crate) fn parse_partition_value( field_idx: usize, logical_schema: &SchemaRef, partition_values: &HashMap, + column_mapping_mode: ColumnMappingMode, ) -> DeltaResult<(usize, (String, Scalar))> { let Some(field) = logical_schema.field_at_index(field_idx) else { return Err(Error::InternalError(format!( "out of bounds partition column field index {field_idx}" ))); }; - let name = field.physical_name(); + let name = field.physical_name(column_mapping_mode); let partition_value = parse_partition_value_raw(partition_values.get(name), field.data_type())?; Ok((field_idx, (name.to_string(), partition_value))) } @@ -86,13 +87,19 @@ pub(crate) fn parse_partition_values( logical_schema: &SchemaRef, transform_spec: &TransformSpec, partition_values: &HashMap, + column_mapping_mode: ColumnMappingMode, ) -> DeltaResult> { transform_spec .iter() .filter_map(|field_transform| match field_transform { - FieldTransformSpec::MetadataDerivedColumn { field_index, .. } => Some( - parse_partition_value(*field_index, logical_schema, partition_values), - ), + FieldTransformSpec::MetadataDerivedColumn { field_index, .. } => { + Some(parse_partition_value( + *field_index, + logical_schema, + partition_values, + column_mapping_mode, + )) + } FieldTransformSpec::DynamicColumn { .. } | FieldTransformSpec::StaticInsert { .. } | FieldTransformSpec::GenerateRowId { .. } @@ -128,17 +135,14 @@ pub(crate) fn get_transform_expr( let base_row_id = base_row_id.ok_or_else(|| { Error::generic("Asked to generate RowIds, but no baseRowId found.") })?; - let expr = Arc::new(Expression::variadic( - VariadicExpressionOp::Coalesce, - vec![ - Expression::column([field_name]), - Expression::binary( - BinaryExpressionOp::Plus, - Expression::literal(base_row_id), - Expression::column([row_index_field_name]), - ), - ], - )); + let expr = Arc::new(Expression::coalesce([ + Expression::column([field_name]), + Expression::binary( + BinaryExpressionOp::Plus, + Expression::literal(base_row_id), + Expression::column([row_index_field_name]), + ), + ])); transform.with_replaced_field(field_name.clone(), expr) } MetadataDerivedColumn { @@ -176,8 +180,7 @@ pub(crate) fn get_transform_expr( // Column doesn't exist physically - treat as partition column let Some((_, partition_value)) = metadata_values.remove(field_index) else { return Err(Error::MissingData(format!( - "missing partition value for dynamic column '{}' at index {}", - physical_name, field_index + "missing partition value for dynamic column '{physical_name}' at index {field_index}" ))); }; @@ -221,7 +224,7 @@ mod tests { )])); let partition_values = HashMap::new(); - let result = parse_partition_value(5, &schema, &partition_values); + let result = parse_partition_value(5, &schema, &partition_values, ColumnMappingMode::None); assert_result_error_with_message(result, "out of bounds"); } @@ -256,7 +259,13 @@ mod tests { partition_values.insert("id".to_string(), "test".to_string()); partition_values.insert("_change_type".to_string(), "insert".to_string()); - let result = parse_partition_values(&schema, &transform_spec, &partition_values).unwrap(); + let result = parse_partition_values( + &schema, + &transform_spec, + &partition_values, + ColumnMappingMode::None, + ) + .unwrap(); 
assert_eq!(result.len(), 2); assert!(result.contains_key(&0)); assert!(result.contains_key(&1)); @@ -276,7 +285,13 @@ mod tests { let transform_spec = vec![]; let partition_values = HashMap::new(); - let result = parse_partition_values(&schema, &transform_spec, &partition_values).unwrap(); + let result = parse_partition_values( + &schema, + &transform_spec, + &partition_values, + ColumnMappingMode::None, + ) + .unwrap(); assert!(result.is_empty()); } @@ -301,7 +316,7 @@ mod tests { #[test] fn test_parse_partition_value_raw_null() { let result = parse_partition_value_raw(None, &DataType::STRING).unwrap(); - assert!(matches!(result, Scalar::Null(_))); + assert!(result.is_null()); } #[test] @@ -571,17 +586,14 @@ mod tests { .expect("Should have row_id_col transform"); assert!(row_id_transform.is_replace); - let expeceted_expr = Arc::new(Expression::variadic( - VariadicExpressionOp::Coalesce, - vec![ - Expression::column(["row_id_col"]), - Expression::binary( - BinaryExpressionOp::Plus, - Expression::literal(4i64), - Expression::column(["row_index_col"]), - ), - ], - )); + let expeceted_expr = Arc::new(Expression::coalesce([ + Expression::column(["row_id_col"]), + Expression::binary( + BinaryExpressionOp::Plus, + Expression::literal(4i64), + Expression::column(["row_index_col"]), + ), + ])); assert_eq!(row_id_transform.exprs.len(), 1); let expr = &row_id_transform.exprs[0]; assert_eq!(expr, &expeceted_expr); diff --git a/kernel/src/schema/compare.rs b/kernel/src/schema/compare.rs index 9b352c398a..a392cc807d 100644 --- a/kernel/src/schema/compare.rs +++ b/kernel/src/schema/compare.rs @@ -95,9 +95,8 @@ impl SchemaComparison for StructType { /// 2. For each field in this struct, you can read it as the `read_type`'s field. See /// [`StructField::can_read_as`]. /// 3. If a field in `read_type` is not present in this struct, then it must be nullable. - /// 4. Both [`StructTypes`] must be valid schemas. No two fields of a structs may share a - /// name that only differs by case. TODO: This check should be moved into the constructor - /// for [`StructType`]. + /// 4. Both [`StructTypes`] must be valid schemas. No two fields of a struct may share a + /// name that only differs by case. fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult { let lowercase_field_map: HashMap = self .fields @@ -139,13 +138,13 @@ impl SchemaComparison for StructType { impl SchemaComparison for DataType { /// Returns `Ok` if this [`DataType`] can be read as `read_type`. This is the case when: - /// 1. The data types are the same. Note: This condition will be relaxed to include - /// compatible data types with type widening. See issue [`#623`] + /// 1. The data types are the same, OR the source type can be widened to the target type + /// (see [`PrimitiveType::can_widen_to`]) /// 2. For complex data types, the nested types must be compatible as defined by [`SchemaComparison`] /// 3. For array data types, the nullability may not be tightened in the `read_type`. See /// [`Nullable::can_read_as`] /// - /// [`#623`]: + /// [`PrimitiveType::can_widen_to`]: super::PrimitiveType::can_widen_to fn can_read_as(&self, read_type: &Self) -> SchemaComparisonResult { match (self, read_type) { (Self::Array(self_array), Self::Array(read_array)) => { @@ -164,11 +163,12 @@ impl SchemaComparison for DataType { self_map.key_type().can_read_as(read_map.key_type())?; self_map.value_type().can_read_as(read_map.value_type())?; } - (a, b) => { - // TODO: In the future, we will change this to support type widening. 
- // See: #623 - require!(a == b, Error::TypeMismatch); - } + // Exact match + (a, b) if a == b => {} + // Type widening: smaller primitive types can be read as larger ones + (Self::Primitive(a), Self::Primitive(b)) if a.can_widen_to(b) => {} + // Any other type change is incompatible + _ => return Err(Error::TypeMismatch), }; Ok(()) } @@ -176,8 +176,10 @@ impl SchemaComparison for DataType { #[cfg(test)] mod tests { + use rstest::rstest; + use crate::schema::compare::{Error, SchemaComparison}; - use crate::schema::{ArrayType, DataType, MapType, StructField, StructType}; + use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; #[test] fn can_read_is_reflexive() { @@ -402,4 +404,114 @@ mod tests { Err(Error::InvalidSchema) )); } + + #[test] + fn type_widening_integer() { + // byte -> short -> int -> long + assert!(DataType::BYTE.can_read_as(&DataType::SHORT).is_ok()); + assert!(DataType::BYTE.can_read_as(&DataType::INTEGER).is_ok()); + assert!(DataType::BYTE.can_read_as(&DataType::LONG).is_ok()); + assert!(DataType::SHORT.can_read_as(&DataType::INTEGER).is_ok()); + assert!(DataType::SHORT.can_read_as(&DataType::LONG).is_ok()); + assert!(DataType::INTEGER.can_read_as(&DataType::LONG).is_ok()); + + // Cannot narrow types + assert!(matches!( + DataType::LONG.can_read_as(&DataType::INTEGER), + Err(Error::TypeMismatch) + )); + assert!(matches!( + DataType::INTEGER.can_read_as(&DataType::SHORT), + Err(Error::TypeMismatch) + )); + assert!(matches!( + DataType::SHORT.can_read_as(&DataType::BYTE), + Err(Error::TypeMismatch) + )); + } + + #[rstest] + // Physical type reinterpretation (not general widening -- can_read_as rejects these) + #[case::integer_to_date(PrimitiveType::Integer, PrimitiveType::Date, true)] + #[case::long_to_timestamp(PrimitiveType::Long, PrimitiveType::Timestamp, true)] + #[case::long_to_timestamp_ntz(PrimitiveType::Long, PrimitiveType::TimestampNtz, true)] + // Reverse (narrowing) is rejected + #[case::date_to_integer(PrimitiveType::Date, PrimitiveType::Integer, false)] + #[case::timestamp_to_long(PrimitiveType::Timestamp, PrimitiveType::Long, false)] + #[case::timestamp_ntz_to_long(PrimitiveType::TimestampNtz, PrimitiveType::Long, false)] + // Cross-type combinations are rejected + #[case::long_to_date(PrimitiveType::Long, PrimitiveType::Date, false)] + #[case::integer_to_timestamp(PrimitiveType::Integer, PrimitiveType::Timestamp, false)] + #[case::integer_to_timestamp_ntz(PrimitiveType::Integer, PrimitiveType::TimestampNtz, false)] + #[case::byte_to_date(PrimitiveType::Byte, PrimitiveType::Date, false)] + // Identity + #[case::date_identity(PrimitiveType::Date, PrimitiveType::Date, true)] + #[case::timestamp_identity(PrimitiveType::Timestamp, PrimitiveType::Timestamp, true)] + #[case::long_identity(PrimitiveType::Long, PrimitiveType::Long, true)] + // Widening (superset of can_widen_to) + #[case::byte_to_long(PrimitiveType::Byte, PrimitiveType::Long, true)] + #[case::short_to_integer(PrimitiveType::Short, PrimitiveType::Integer, true)] + #[case::float_to_double(PrimitiveType::Float, PrimitiveType::Double, true)] + #[case::timestamp_to_ntz(PrimitiveType::Timestamp, PrimitiveType::TimestampNtz, true)] + fn stats_type_compatibility( + #[case] source: PrimitiveType, + #[case] target: PrimitiveType, + #[case] expected: bool, + ) { + assert_eq!( + source.is_stats_type_compatible_with(&target), + expected, + "{source:?} -> {target:?} should be {expected}" + ); + } + + #[test] + fn type_widening_float() { + // float -> double + 
assert!(DataType::FLOAT.can_read_as(&DataType::DOUBLE).is_ok()); + + // Cannot narrow + assert!(matches!( + DataType::DOUBLE.can_read_as(&DataType::FLOAT), + Err(Error::TypeMismatch) + )); + } + + #[test] + fn type_widening_in_struct() { + let source = StructType::new_unchecked([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("value", DataType::FLOAT, true), + ]); + let target = StructType::new_unchecked([ + StructField::new("id", DataType::LONG, false), + StructField::new("value", DataType::DOUBLE, true), + ]); + + // Can widen types in struct fields + assert!(source.can_read_as(&target).is_ok()); + + // Cannot narrow + assert!(matches!( + target.can_read_as(&source), + Err(Error::TypeMismatch) + )); + } + + #[test] + fn incompatible_type_change() { + // Cannot change between incompatible types + assert!(matches!( + DataType::STRING.can_read_as(&DataType::INTEGER), + Err(Error::TypeMismatch) + )); + assert!(matches!( + DataType::INTEGER.can_read_as(&DataType::STRING), + Err(Error::TypeMismatch) + )); + assert!(matches!( + DataType::BOOLEAN.can_read_as(&DataType::INTEGER), + Err(Error::TypeMismatch) + )); + } } diff --git a/kernel/src/schema/derive_macro_utils.rs b/kernel/src/schema/derive_macro_utils.rs index e193898a67..d25d84da31 100644 --- a/kernel/src/schema/derive_macro_utils.rs +++ b/kernel/src/schema/derive_macro_utils.rs @@ -66,6 +66,13 @@ impl ToDataType for HashMap { } } +// ToDataType impl for maps with nullable values +impl ToDataType for HashMap> { + fn to_data_type() -> DataType { + MapType::new(K::to_data_type(), V::to_data_type(), true).into() + } +} + /// The [`delta_kernel_derive::ToSchema`] macro uses this to convert a struct field's name + type /// into a `StructField` definition. A blanket impl for `Option` supports nullable /// struct fields, which otherwise default to non-nullable. @@ -116,3 +123,10 @@ impl GetNullableContainerStructField for T { StructField::not_null(name, T::to_nullable_container_type()) } } + +// Optional container types produce nullable fields with nullable values. +impl GetNullableContainerStructField for Option { + fn get_nullable_container_struct_field(name: impl Into) -> StructField { + StructField::nullable(name, T::to_nullable_container_type()) + } +} diff --git a/kernel/src/schema/diff.rs b/kernel/src/schema/diff.rs new file mode 100644 index 0000000000..54f62efa96 --- /dev/null +++ b/kernel/src/schema/diff.rs @@ -0,0 +1,2734 @@ +//! Schema diffing implementation for Delta Lake schemas +//! +//! This module provides functionality to compute differences between two schemas +//! using field IDs as the primary mechanism for identifying fields across schema versions. +//! Supports nested field comparison within structs, arrays, and maps. + +// This module is not used outside the diff subsystem; suppress unused-code warnings. 
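+//
+// A minimal usage sketch (illustrative only; `before_schema` and `after_schema` are
+// placeholders, everything else is the API defined in this module):
+//
+//     let diff = SchemaDiff::new(&before_schema, &after_schema)?;
+//     if diff.has_breaking_changes() {
+//         // e.g. reject the evolution, or require an explicit opt-in from the caller
+//     }
+//     for added in &diff.added_fields {
+//         println!("added field '{}' at path '{}'", added.field.name(), added.path);
+//     }
+//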
+#![allow(dead_code)] + +use super::{ColumnMetadataKey, ColumnName, DataType, MetadataValue, StructField, StructType}; +use std::collections::{HashMap, HashSet}; + +/// Represents the difference between two schemas +#[derive(Debug, Clone, PartialEq)] +pub(crate) struct SchemaDiff { + /// Fields that were added in the new schema + pub(crate) added_fields: Vec, + /// Fields that were removed from the original schema + pub(crate) removed_fields: Vec, + /// Fields that were modified between schemas + pub(crate) updated_fields: Vec, + /// Whether the diff contains breaking changes (computed once during construction) + breaking_changes: bool, +} + +/// Represents a field change (added or removed) at any nesting level +#[derive(Debug, Clone, PartialEq)] +pub(crate) struct FieldChange { + /// The field that was added or removed + pub(crate) field: StructField, + /// The path to this field (e.g., ColumnName::new(["user", "address", "street"])) + pub(crate) path: ColumnName, +} + +/// Represents an update to a field between two schema versions +#[derive(Debug, Clone, PartialEq)] +pub(crate) struct FieldUpdate { + /// The field as it existed in the original schema + pub(crate) before: StructField, + /// The field as it exists in the new schema + pub(crate) after: StructField, + /// The path to this field (e.g., ColumnName::new(["user", "address", "street"])) + pub(crate) path: ColumnName, + /// The types of changes that occurred (can be multiple, e.g. renamed + nullability changed) + pub(crate) change_types: Vec, +} + +/// The types of changes that can occur to a field +#[derive(Debug, Clone, PartialEq)] +pub(crate) enum FieldChangeType { + /// Field was renamed (logical name changed, but field ID stayed the same) + Renamed, + /// Field nullability was loosened (non-nullable -> nullable) - safe change + NullabilityLoosened, + /// Field nullability was tightened (nullable -> non-nullable) - breaking change + NullabilityTightened, + /// Field data type was changed + TypeChanged, + /// Field metadata was changed (excluding column mapping metadata) + MetadataChanged, + /// The container nullability was loosened (safe change) + ContainerNullabilityLoosened, + /// The container nullability was tightened (breaking change) + ContainerNullabilityTightened, +} + +/// Errors that can occur during schema diffing +#[derive(Debug, thiserror::Error)] +pub(crate) enum SchemaDiffError { + #[error("Schema diffing is not yet implemented")] + Unsupported, + #[error("Field at path '{path}' is missing column mapping ID")] + MissingFieldId { path: ColumnName }, + #[error("Duplicate field ID {id} found at paths '{path1}' and '{path2}'")] + DuplicateFieldId { + id: i64, + path1: ColumnName, + path2: ColumnName, + }, + #[error( + "Field at path '{path}' is missing physical name (required when column mapping is enabled)" + )] + MissingPhysicalName { path: ColumnName }, + #[error("Field with ID {field_id} at path '{path}' has inconsistent physical names: '{before}' -> '{after}'. 
Physical names must not change for the same field ID.")] + PhysicalNameChanged { + field_id: i64, + path: ColumnName, + before: String, + after: String, + }, +} + +impl SchemaDiff { + /// Compute the difference between two schemas using field IDs + pub(crate) fn new(before: &StructType, after: &StructType) -> Result { + compute_schema_diff(before, after) + } + + /// Returns true if there are no differences between the schemas + pub(crate) fn is_empty(&self) -> bool { + self.added_fields.is_empty() + && self.removed_fields.is_empty() + && self.updated_fields.is_empty() + } + + /// Returns the total number of changes + pub(crate) fn change_count(&self) -> usize { + self.added_fields.len() + self.removed_fields.len() + self.updated_fields.len() + } + + /// Returns true if there are any breaking changes (removed fields, type changes, or tightened nullability) + pub(crate) fn has_breaking_changes(&self) -> bool { + self.breaking_changes + } + + /// Get all changes (both top-level and nested) + pub(crate) fn all_changes(&self) -> (&[FieldChange], &[FieldChange], &[FieldUpdate]) { + ( + &self.added_fields, + &self.removed_fields, + &self.updated_fields, + ) + } + + /// Get all changes at the top level only (fields with path length of 1) + pub(crate) fn top_level_changes( + &self, + ) -> (Vec<&FieldChange>, Vec<&FieldChange>, Vec<&FieldUpdate>) { + let added = self + .added_fields + .iter() + .filter(|f| f.path.path().len() == 1) + .collect(); + let removed = self + .removed_fields + .iter() + .filter(|f| f.path.path().len() == 1) + .collect(); + let updated = self + .updated_fields + .iter() + .filter(|f| f.path.path().len() == 1) + .collect(); + (added, removed, updated) + } + + /// Get all changes at nested levels only (fields with path length > 1) + pub(crate) fn nested_changes( + &self, + ) -> (Vec<&FieldChange>, Vec<&FieldChange>, Vec<&FieldUpdate>) { + let added = self + .added_fields + .iter() + .filter(|f| f.path.path().len() > 1) + .collect(); + let removed = self + .removed_fields + .iter() + .filter(|f| f.path.path().len() > 1) + .collect(); + let updated = self + .updated_fields + .iter() + .filter(|f| f.path.path().len() > 1) + .collect(); + (added, removed, updated) + } +} + +/// Internal representation of a field with its path and ID +#[derive(Debug, Clone)] +struct FieldWithPath { + field: StructField, + path: ColumnName, + field_id: i64, +} + +/// Computes the difference between two schemas using field IDs for identification +/// +/// This function requires that both schemas have column mapping enabled and all fields +/// have valid field IDs. Fields are matched by their field ID rather than name, +/// allowing detection of renames at any nesting level within structs, arrays, and maps. 
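+///
+/// For example, a field whose logical name changes from `name` to `full_name` while its
+/// `delta.columnMapping.id` stays the same is reported as a single `Renamed` update rather
+/// than as a removal plus an addition.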
+/// +/// # Note +/// It's recommended to use `SchemaDiff::new()` instead of calling this function directly: +/// +/// ```rust,ignore +/// let diff = SchemaDiff::new(&old_schema, &new_schema)?; +/// ``` +/// +/// # Arguments +/// * `before` - The before/original schema +/// * `after` - The after/new schema to compare against +/// +/// # Returns +/// A `SchemaDiff` describing all changes including nested fields, or an error if the schemas are invalid +fn compute_schema_diff( + before: &StructType, + after: &StructType, +) -> Result { + // Collect all fields with their paths from both schemas + let empty_path: Vec = vec![]; + let mut before_fields = Vec::new(); + collect_all_fields_with_paths( + before, + &ColumnName::new(empty_path.clone()), + &mut before_fields, + )?; + let mut after_fields = Vec::new(); + collect_all_fields_with_paths(after, &ColumnName::new(empty_path), &mut after_fields)?; + + // Build maps by field ID + let before_by_id = build_field_map_by_id(&before_fields)?; + let after_by_id = build_field_map_by_id(&after_fields)?; + + let before_field_ids: HashSet = before_by_id.keys().cloned().collect(); + let after_field_ids: HashSet = after_by_id.keys().cloned().collect(); + + // Find added, removed, and potentially updated fields + let added_ids: Vec = after_field_ids + .difference(&before_field_ids) + .cloned() + .collect(); + let removed_ids: Vec = before_field_ids + .difference(&after_field_ids) + .cloned() + .collect(); + let common_ids: Vec = before_field_ids + .intersection(&after_field_ids) + .cloned() + .collect(); + + // Collect added fields + let added_fields: Vec = added_ids + .into_iter() + .map(|id| { + let field_with_path = &after_by_id[&id]; + FieldChange { + field: field_with_path.field.clone(), + path: field_with_path.path.clone(), + } + }) + .collect(); + + // Filter out nested fields whose parent was also added + // Example: If "user" struct was added, don't also report "user.name", "user.email", etc. + let added_fields = filter_ancestor_fields(added_fields); + + // Collect removed fields + let removed_fields: Vec = removed_ids + .into_iter() + .map(|id| { + let field_with_path = &before_by_id[&id]; + FieldChange { + field: field_with_path.field.clone(), + path: field_with_path.path.clone(), + } + }) + .collect(); + + // Filter out nested fields whose parent was also removed + // Example: If "user" struct was removed, don't also report "user.name", "user.email", etc. + let removed_fields = filter_ancestor_fields(removed_fields); + + // Check for updates in common fields + let mut updated_fields = Vec::new(); + for id in common_ids { + let before_field_with_path = &before_by_id[&id]; + let after_field_with_path = &after_by_id[&id]; + + // Invariant: A field in common_ids must have existed in both schemas, which means + // its parent path must also have existed in both schemas. Therefore, neither an + // added nor removed ancestor should be a parent of an updated field. + #[cfg(debug_assertions)] + { + let added_paths: HashSet = + added_fields.iter().map(|f| f.path.clone()).collect(); + let removed_paths: HashSet = + removed_fields.iter().map(|f| f.path.clone()).collect(); + + debug_assert!( + !has_added_ancestor(&after_field_with_path.path, &added_paths), + "Field with ID {} at path '{}' is in common_ids but has an added ancestor. 
\ + This violates the invariant that common fields must have existed in both schemas.", + id, + after_field_with_path.path + ); + debug_assert!( + !has_added_ancestor(&after_field_with_path.path, &removed_paths), + "Field with ID {} at path '{}' is in common_ids but has a removed ancestor. \ + This violates the invariant that common fields must have existed in both schemas.", + id, + after_field_with_path.path + ); + } + + if let Some(field_update) = + compute_field_update(before_field_with_path, after_field_with_path)? + { + updated_fields.push(field_update); + } + } + + // Compute whether there are breaking changes + let has_breaking_changes = + compute_has_breaking_changes(&added_fields, &removed_fields, &updated_fields); + + Ok(SchemaDiff { + added_fields, + removed_fields, + updated_fields, + breaking_changes: has_breaking_changes, + }) +} + +/// Helper function to check if a change type is breaking +fn is_breaking_change_type(change_type: &FieldChangeType) -> bool { + matches!( + change_type, + FieldChangeType::TypeChanged + | FieldChangeType::NullabilityTightened + | FieldChangeType::ContainerNullabilityTightened + | FieldChangeType::MetadataChanged + ) +} + +/// Computes whether the diff contains breaking changes +fn compute_has_breaking_changes( + added_fields: &[FieldChange], + _removed_fields: &[FieldChange], + updated_fields: &[FieldUpdate], +) -> bool { + // Removed fields are safe - existing data files remain valid, queries referencing + // removed fields will fail at query time but data integrity is maintained + // Adding a non-nullable (required) field is breaking - existing data won't have values + added_fields.iter().any(|add| !add.field.nullable) + // Certain update types are breaking (type changes, nullability tightening, etc.) + || updated_fields.iter().any(|update| { + update + .change_types + .iter() + .any(is_breaking_change_type) + }) +} + +/// Filters field changes by removing descendants of already-reported ancestors. +/// +/// A field is dropped if any of its ancestors is also present in the input: +/// reporting both would be redundant. +/// Two fields that merely share a common ancestor not present in the input +/// are both kept. +/// +/// The algorithm is O(n * d) where n is the number of fields and d is max path depth: +/// 1. Put all paths in a HashSet for O(1) lookup +/// 2. For each field, walk up its parent chain +/// 3. Keep only fields with no ancestor present in the set +fn filter_ancestor_fields(fields: Vec) -> Vec { + // Build a set of all paths for O(1) lookup (owned to avoid lifetime issues) + let all_paths: HashSet = fields.iter().map(|f| f.path.clone()).collect(); + + // Filter to keep only fields whose ancestors are NOT in the set + fields + .into_iter() + .filter(|field_change| !has_added_ancestor(&field_change.path, &all_paths)) + .collect() +} + +/// Checks if a field path has a parent in the given set of ancestor paths. +/// +/// Returns true if any path in `added_ancestor_paths` is a prefix of `path`. +/// For example, "user" is an ancestor of "user.name" and "user.address.street". +/// +/// This implementation walks up the parent chain of `path`, checking at each level +/// if that parent exists in the set. This is O(depth) instead of O(N * depth) where +/// N is the number of ancestor paths. 
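+///
+/// Conversely, `username` has no ancestor in `{user}`: matching is done on whole path
+/// segments via `ColumnName::parent`, never on string prefixes.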
+fn has_added_ancestor(path: &ColumnName, added_ancestor_paths: &HashSet) -> bool { + let mut curr = path.parent(); + while let Some(parent) = curr { + if added_ancestor_paths.contains(&parent) { + return true; + } + curr = parent.parent(); + } + false +} + +/// Gets the physical name of a field if present +fn physical_name(field: &StructField) -> Option<&str> { + match field.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) { + Some(MetadataValue::String(s)) => Some(s.as_str()), + _ => None, + } +} + +/// Validates that physical names are consistent between two versions of the same field. +/// +/// Since schema diffing requires column mapping (field IDs), physical names must be present +/// and must not change for the same field ID across schema versions. +/// +/// # Errors +/// - `PhysicalNameChanged`: Physical name differs between before and after +/// - `MissingPhysicalName`: Physical name is missing in either version +fn validate_physical_name( + before: &FieldWithPath, + after: &FieldWithPath, +) -> Result<(), SchemaDiffError> { + let before_physical = physical_name(&before.field); + let after_physical = physical_name(&after.field); + + match (before_physical, after_physical) { + (Some(b), Some(a)) if b == a => { + // Valid: physical name is present and unchanged + Ok(()) + } + (Some(b), Some(a)) => { + // Invalid: physical name changed for the same field ID + Err(SchemaDiffError::PhysicalNameChanged { + field_id: before.field_id, + path: after.path.clone(), + before: b.to_string(), + after: a.to_string(), + }) + } + (Some(_), None) | (None, Some(_)) => { + // Invalid: physical name was added or removed + Err(SchemaDiffError::MissingPhysicalName { + path: after.path.clone(), + }) + } + (None, None) => { + // Invalid: physical name must be present when column mapping is enabled + Err(SchemaDiffError::MissingPhysicalName { + path: after.path.clone(), + }) + } + } +} + +/// Recursively collects all reachable struct fields from a data type with their paths +/// +/// This helper function handles deep nesting like `array>>` or +/// `map>, array>>` by recursing through container layers. +fn collect_fields_from_datatype( + data_type: &DataType, + parent_path: &ColumnName, + out: &mut Vec, +) -> Result<(), SchemaDiffError> { + match data_type { + DataType::Struct(struct_type) => { + // Collect fields from this struct + collect_all_fields_with_paths(struct_type, parent_path, out)?; + } + DataType::Array(array_type) => { + // TODO: Add IcebergCompatV2 support - check that array nested field IDs remain stable + // For IcebergCompatV2, arrays should have a field ID on the array itself and nested + // field IDs must not be added or removed (they must stay the same across versions). + // See: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-icebergcompatv2 + + // For arrays, we use "element" as the path segment and recurse into element type + let element_path = parent_path.join(&ColumnName::new(["element"])); + collect_fields_from_datatype(array_type.element_type(), &element_path, out)?; + } + DataType::Map(map_type) => { + // TODO: Add IcebergCompatV2 support - check that map nested field IDs remain stable + // For IcebergCompatV2, maps should have field IDs on key/value and nested field IDs + // must not be added or removed (they must stay the same across versions). 
+ // See: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-icebergcompatv2 + + // For maps, we use "key" and "value" as path segments and recurse into both types + let key_path = parent_path.join(&ColumnName::new(["key"])); + collect_fields_from_datatype(map_type.key_type(), &key_path, out)?; + + let value_path = parent_path.join(&ColumnName::new(["value"])); + collect_fields_from_datatype(map_type.value_type(), &value_path, out)?; + } + _ => { + // Primitive types don't have nested fields + } + } + + Ok(()) +} + +/// Recursively collects all struct fields with their paths from a schema +fn collect_all_fields_with_paths( + schema: &StructType, + parent_path: &ColumnName, + out: &mut Vec, +) -> Result<(), SchemaDiffError> { + for field in schema.fields() { + let field_path = parent_path.join(&ColumnName::new([field.name()])); + + // Only struct fields can have field IDs in column mapping + let field_id = get_field_id_for_path(field, &field_path)?; + + out.push(FieldWithPath { + field: field.clone(), + path: field_path.clone(), + field_id, + }); + + // Recursively collect nested struct fields from the field's data type + collect_fields_from_datatype(field.data_type(), &field_path, out)?; + } + + Ok(()) +} + +/// Builds a map from field ID to FieldWithPath +fn build_field_map_by_id( + fields: &[FieldWithPath], +) -> Result, SchemaDiffError> { + let mut field_map = HashMap::new(); + + for field_with_path in fields { + let field_id = field_with_path.field_id; + + if let Some(existing) = field_map.insert(field_id, field_with_path.clone()) { + return Err(SchemaDiffError::DuplicateFieldId { + id: field_id, + path1: existing.path, + path2: field_with_path.path.clone(), + }); + } + } + + Ok(field_map) +} + +/// Extracts the field ID from a StructField's metadata with path for error reporting +fn get_field_id_for_path(field: &StructField, path: &ColumnName) -> Result { + match field.get_config_value(&ColumnMetadataKey::ColumnMappingId) { + Some(MetadataValue::Number(id)) => Ok(*id), + _ => Err(SchemaDiffError::MissingFieldId { path: path.clone() }), + } +} + +/// Computes the update for two fields with the same ID, if they differ +fn compute_field_update( + before: &FieldWithPath, + after: &FieldWithPath, +) -> Result, SchemaDiffError> { + let mut changes = Vec::new(); + + // Check for name change (rename) + if before.field.name() != after.field.name() { + changes.push(FieldChangeType::Renamed); + } + + // Check for nullability change - distinguish between tightening and loosening + if let Some(change) = + check_field_nullability_change(before.field.nullable, after.field.nullable) + { + changes.push(change); + } + + // Validate physical name consistency + validate_physical_name(before, after)?; + + // Check for type change (including container changes) + changes.extend(classify_data_type_change( + before.field.data_type(), + after.field.data_type(), + )); + + // Check for metadata changes (excluding column mapping metadata) + if has_metadata_changes(&before.field, &after.field) { + changes.push(FieldChangeType::MetadataChanged); + } + + // If no changes detected, return None + if changes.is_empty() { + return Ok(None); + } + + Ok(Some(FieldUpdate { + before: before.field.clone(), + after: after.field.clone(), + path: after.path.clone(), // Use the new path in case of renames + change_types: changes, + })) +} + +/// Checks for field nullability changes. 
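+///
+/// Loosening (required -> optional) is safe for readers of existing data; tightening
+/// (optional -> required) is breaking, since already-written rows may contain nulls.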
+/// +/// Returns: +/// - `Some(FieldChangeType::NullabilityLoosened)` if nullability was relaxed (false -> true) +/// - `Some(FieldChangeType::NullabilityTightened)` if nullability was restricted (true -> false) +/// - `None` if nullability didn't change +fn check_field_nullability_change( + before_nullable: bool, + after_nullable: bool, +) -> Option { + match (before_nullable, after_nullable) { + (false, true) => Some(FieldChangeType::NullabilityLoosened), + (true, false) => Some(FieldChangeType::NullabilityTightened), + (true, true) | (false, false) => None, + } +} + +/// Checks for container nullability changes. +/// +/// Returns: +/// - `Some(FieldChangeType::ContainerNullabilityLoosened)` if nullability was relaxed (false -> true) +/// - `Some(FieldChangeType::ContainerNullabilityTightened)` if nullability was restricted (true -> false) +/// - `None` if nullability didn't change +fn check_container_nullability_change( + before_nullable: bool, + after_nullable: bool, +) -> Option { + match (before_nullable, after_nullable) { + (false, true) => Some(FieldChangeType::ContainerNullabilityLoosened), + (true, false) => Some(FieldChangeType::ContainerNullabilityTightened), + (true, true) | (false, false) => None, + } +} + +/// Classifies a type change between two data types. +/// +/// Returns: +/// - A `Vec` containing all changes detected (type changed or container nullability changed) +/// - An empty vec if the types are the same container with nested changes handled elsewhere +/// +/// This function handles the following cases: +/// 1. **Struct containers**: Changes to nested fields are captured via field IDs, so return empty vec +/// 2. **Array containers**: +/// - If element types match and only nullability changed, return the specific nullability change +/// - If element types are both structs with same nullability, nested changes handled via field IDs (return empty vec) +/// - Otherwise, it's a type change +/// 3. **Map containers**: Similar logic to arrays, but for both key and value types +/// 4. 
**Different container types or primitives**: Type change +fn classify_data_type_change(before: &DataType, after: &DataType) -> Vec { + // Early return if types are identical - no change to report + if before == after { + return Vec::new(); + } + + match (before, after) { + // Struct-to-struct: nested field changes are handled separately via field IDs + (DataType::Struct(_), DataType::Struct(_)) => Vec::new(), + + // Array-to-array: check element types and nullability + (DataType::Array(before_array), DataType::Array(after_array)) => { + // Recursively check for element type changes + let element_type_changes = + match (before_array.element_type(), after_array.element_type()) { + // Both have struct elements - nested changes handled via field IDs + (DataType::Struct(_), DataType::Struct(_)) => Vec::new(), + // For non-struct elements, recurse to check for changes + (e1, e2) => classify_data_type_change(e1, e2), + }; + + // Check container nullability change + let nullability_change = check_container_nullability_change( + before_array.contains_null(), + after_array.contains_null(), + ); + + // Combine both changes if present + let mut changes = element_type_changes; + if let Some(null_change) = nullability_change { + changes.push(null_change); + } + changes + } + + // Map-to-map: check key types, value types, and nullability + (DataType::Map(before_map), DataType::Map(after_map)) => { + // Recursively check for key type changes + let key_type_changes = match (before_map.key_type(), after_map.key_type()) { + // Both have struct keys - nested changes handled via field IDs + (DataType::Struct(_), DataType::Struct(_)) => Vec::new(), + // For non-struct keys (including arrays/maps containing structs), recurse + (k1, k2) => classify_data_type_change(k1, k2), + }; + + // Recursively check for value type changes + let value_type_changes = match (before_map.value_type(), after_map.value_type()) { + // Both have struct values - nested changes handled via field IDs + (DataType::Struct(_), DataType::Struct(_)) => Vec::new(), + // For non-struct values (including arrays/maps containing structs), recurse + (v1, v2) => classify_data_type_change(v1, v2), + }; + + // Check container nullability change + let nullability_change = check_container_nullability_change( + before_map.value_contains_null(), + after_map.value_contains_null(), + ); + + // Combine all changes if present + let mut changes = key_type_changes; + changes.extend(value_type_changes); + if let Some(null_change) = nullability_change { + changes.push(null_change); + } + changes + } + + // Different container types or primitive type changes + _ => vec![FieldChangeType::TypeChanged], + } +} + +/// Checks if two fields have different metadata (excluding column mapping metadata) +fn has_metadata_changes(before: &StructField, after: &StructField) -> bool { + // Instead of returning a HashMap of references, we'll compare directly + let before_filtered: HashMap = before + .metadata + .iter() + .filter(|(key, _)| { + !key.starts_with("delta.columnMapping") && !key.starts_with("parquet.field") + }) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + let after_filtered: HashMap = after + .metadata + .iter() + .filter(|(key, _)| { + !key.starts_with("delta.columnMapping") && !key.starts_with("parquet.field") + }) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + + before_filtered != after_filtered +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::{ArrayType, DataType, MapType, StructField, StructType}; + use 
std::collections::HashSet; + + fn create_field_with_id( + name: &str, + data_type: DataType, + nullable: bool, + id: i64, + ) -> StructField { + StructField::new(name, data_type, nullable).add_metadata([ + ("delta.columnMapping.id", MetadataValue::Number(id)), + ( + "delta.columnMapping.physicalName", + MetadataValue::String(format!("col_{id}")), + ), + ]) + } + + fn updated_paths(diff: &SchemaDiff) -> HashSet { + diff.updated_fields.iter().map(|u| u.path.clone()).collect() + } + + #[test] + fn test_identical_schemas() { + let schema = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id("name", DataType::STRING, false, 2), + ]); + + let diff = SchemaDiff::new(&schema, &schema).unwrap(); + assert!(diff.is_empty()); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_change_count() { + let before = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id("name", DataType::STRING, false, 2), + ]); + + let after = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, true, 1), // Changed + create_field_with_id("email", DataType::STRING, false, 3), // Added + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // 1 removed (name), 1 added (email), 1 updated (id) + assert_eq!(diff.change_count(), 3); + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.updated_fields.len(), 1); + } + + #[test] + fn test_top_level_added_field() { + let before = + StructType::new_unchecked([create_field_with_id("id", DataType::LONG, false, 1)]); + + let after = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id("name", DataType::STRING, false, 2), + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 0); + assert_eq!(diff.added_fields[0].path, ColumnName::new(["name"])); + assert_eq!(diff.added_fields[0].field.name(), "name"); + assert!(diff.has_breaking_changes()); // Adding non-nullable field is breaking + } + + #[test] + fn test_added_required_field_is_breaking() { + // Adding a non-nullable (required) field is breaking + let before = + StructType::new_unchecked([create_field_with_id("id", DataType::LONG, false, 1)]); + + let after = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id("required_field", DataType::STRING, false, 2), // Non-nullable + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 0); + assert!(diff.has_breaking_changes()); + } + + #[test] + fn test_added_nullable_field_is_not_breaking() { + // Adding a nullable (optional) field is NOT breaking + let before = + StructType::new_unchecked([create_field_with_id("id", DataType::LONG, false, 1)]); + + let after = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id("optional_field", DataType::STRING, true, 2), // Nullable + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 0); + assert!(!diff.has_breaking_changes()); // Not breaking + } + + #[test] + fn 
test_physical_name_validation() { + // Test: Physical names present and unchanged - valid schema evolution (just a rename) + let before = StructType::new_unchecked([StructField::new("name", DataType::STRING, false) + .add_metadata([ + ("delta.columnMapping.id", MetadataValue::Number(1)), + ( + "delta.columnMapping.physicalName", + MetadataValue::String("col_1".to_string()), + ), + ])]); + let after = + StructType::new_unchecked([StructField::new("full_name", DataType::STRING, false) + .add_metadata([ + ("delta.columnMapping.id", MetadataValue::Number(1)), + ( + "delta.columnMapping.physicalName", + MetadataValue::String("col_1".to_string()), + ), + ])]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + assert!(!diff.has_breaking_changes()); // Rename is not breaking + + // Test: Physical name changed - INVALID (returns error) + let before = StructType::new_unchecked([StructField::new("name", DataType::STRING, false) + .add_metadata([ + ("delta.columnMapping.id", MetadataValue::Number(1)), + ( + "delta.columnMapping.physicalName", + MetadataValue::String("col_001".to_string()), + ), + ])]); + let after = StructType::new_unchecked([StructField::new("name", DataType::STRING, false) + .add_metadata([ + ("delta.columnMapping.id", MetadataValue::Number(1)), + ( + "delta.columnMapping.physicalName", + MetadataValue::String("col_002".to_string()), + ), + ])]); + + let result = SchemaDiff::new(&before, &after); + assert!(matches!( + result, + Err(SchemaDiffError::PhysicalNameChanged { .. }) + )); + + // Test: Missing physical name in one schema - INVALID (returns error) + let before = StructType::new_unchecked([StructField::new("name", DataType::STRING, false) + .add_metadata([ + ("delta.columnMapping.id", MetadataValue::Number(1)), + ( + "delta.columnMapping.physicalName", + MetadataValue::String("col_1".to_string()), + ), + ])]); + let after = StructType::new_unchecked([StructField::new("name", DataType::STRING, false) + .add_metadata([("delta.columnMapping.id", MetadataValue::Number(1))])]); + + let result = SchemaDiff::new(&before, &after); + assert!(matches!( + result, + Err(SchemaDiffError::MissingPhysicalName { .. 
}) + )); + } + + #[test] + fn test_multiple_change_types() { + // Test that a field with multiple simultaneous changes produces FieldChangeType::Multiple + let before = StructType::new_unchecked([create_field_with_id( + "user_name", + DataType::STRING, + false, + 1, + ) + .add_metadata([("custom", MetadataValue::String("old_value".to_string()))])]); + + let after = StructType::new_unchecked([ + create_field_with_id("userName", DataType::STRING, true, 1) // Renamed + nullability loosened + .add_metadata([("custom", MetadataValue::String("new_value".to_string()))]), // Metadata changed + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + let update = &diff.updated_fields[0]; + + // Should have 3 changes + assert_eq!(update.change_types.len(), 3); + assert!(update.change_types.contains(&FieldChangeType::Renamed)); + assert!(update + .change_types + .contains(&FieldChangeType::NullabilityLoosened)); + assert!(update + .change_types + .contains(&FieldChangeType::MetadataChanged)); + + // Breaking because metadata changed (metadata changes can be unsafe, e.g., row tracking) + assert!(diff.has_breaking_changes()); + } + + #[test] + fn test_multiple_with_breaking_change() { + // Test that Multiple changes are correctly identified as breaking when they contain breaking changes + let before = StructType::new_unchecked([create_field_with_id( + "user_name", + DataType::STRING, + true, + 1, + ) + .add_metadata([("custom", MetadataValue::String("old_value".to_string()))])]); + + let after = StructType::new_unchecked([ + create_field_with_id("userName", DataType::STRING, false, 1) // Renamed + nullability TIGHTENED + .add_metadata([("custom", MetadataValue::String("new_value".to_string()))]), // Metadata changed + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + let update = &diff.updated_fields[0]; + + assert_eq!(update.change_types.len(), 3); + assert!(update.change_types.contains(&FieldChangeType::Renamed)); + assert!(update + .change_types + .contains(&FieldChangeType::NullabilityTightened)); + assert!(update + .change_types + .contains(&FieldChangeType::MetadataChanged)); + + // Breaking because nullability was tightened + assert!(diff.has_breaking_changes()); + } + + #[test] + fn test_duplicate_field_id_error() { + // Test that duplicate field IDs in the same schema produce an error + let schema_with_duplicates = StructType::new_unchecked([ + create_field_with_id("field1", DataType::STRING, false, 1), + create_field_with_id("field2", DataType::STRING, false, 1), // Same ID! + ]); + + let result = SchemaDiff::new(&schema_with_duplicates, &schema_with_duplicates); + + assert!(result.is_err()); + match result { + Err(SchemaDiffError::DuplicateFieldId { id, path1, path2 }) => { + assert_eq!(id, 1); + assert_eq!(path1, ColumnName::new(["field1"])); + assert_eq!(path2, ColumnName::new(["field2"])); + } + _ => panic!("Expected DuplicateFieldId error"), + } + } + + #[test] + fn test_top_level_and_nested_change_filters() { + // Test that top_level_changes and nested_changes correctly filter by path depth. + // This test manually constructs a SchemaDiff to exercise the filtering logic. 
+ + let top_level_field = create_field_with_id("name", DataType::STRING, false, 1); + let nested_field = create_field_with_id("street", DataType::STRING, false, 2); + let deeply_nested_field = create_field_with_id("city", DataType::STRING, false, 3); + + // Create a diff with mixed top-level and nested changes + let diff = SchemaDiff { + added_fields: vec![ + FieldChange { + field: top_level_field.clone(), + path: ColumnName::new(["name"]), // Top-level (depth 1) + }, + FieldChange { + field: nested_field.clone(), + path: ColumnName::new(["address", "street"]), // Nested (depth 2) + }, + ], + removed_fields: vec![FieldChange { + field: deeply_nested_field.clone(), + path: ColumnName::new(["user", "address", "city"]), // Deeply nested (depth 3) + }], + updated_fields: vec![], + breaking_changes: false, + }; + + // Test top_level_changes - should only return depth 1 fields + let (top_added, top_removed, top_updated) = diff.top_level_changes(); + assert_eq!(top_added.len(), 1); + assert_eq!(top_added[0].path, ColumnName::new(["name"])); + assert_eq!(top_removed.len(), 0); + assert_eq!(top_updated.len(), 0); + + // Test nested_changes - should only return depth > 1 fields + let (nested_added, nested_removed, nested_updated) = diff.nested_changes(); + assert_eq!(nested_added.len(), 1); + assert_eq!(nested_added[0].path, ColumnName::new(["address", "street"])); + assert_eq!(nested_removed.len(), 1); + assert_eq!( + nested_removed[0].path, + ColumnName::new(["user", "address", "city"]) + ); + assert_eq!(nested_updated.len(), 0); + } + + #[test] + fn test_ancestor_filtering() { + // Test that when a parent struct is added/removed, its children aren't reported separately + let without_user = + StructType::new_unchecked([create_field_with_id("id", DataType::LONG, false, 1)]); + + let with_user = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("name", DataType::STRING, false, 3), + create_field_with_id("email", DataType::STRING, true, 4), + create_field_with_id( + "address", + DataType::try_struct_type([ + create_field_with_id("street", DataType::STRING, false, 6), + create_field_with_id("city", DataType::STRING, false, 7), + ]) + .unwrap(), + true, + 5, + ), + ]) + .unwrap(), + false, + 2, + ), + ]); + + // CASE 1: Adding a parent struct - only parent should be reported, not nested fields + let diff = SchemaDiff::new(&without_user, &with_user).unwrap(); + + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.added_fields[0].path, ColumnName::new(["user"])); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 0); + // The filtered paths would have been: user.name, user.email, user.address, user.address.street, user.address.city + assert!(diff.has_breaking_changes()); // Adding non-nullable struct field is breaking + + // CASE 2: Removing a parent struct - only parent should be reported, not nested fields + let diff = SchemaDiff::new(&with_user, &without_user).unwrap(); + + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.removed_fields[0].path, ColumnName::new(["user"])); + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 0); + assert!(!diff.has_breaking_changes()); // Removing fields is safe + } + + #[test] + fn test_array_of_struct_addition_reports_only_ancestor_field() { + // Before: no fields. 
After: items: array> + // Expected: added_fields == [items], not [items, items.element.name] + let before = StructType::new_unchecked([]); + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "name", + DataType::STRING, + false, + 2, + )]) + .unwrap(), + false, + ))), + true, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.added_fields[0].path, ColumnName::new(["items"])); + + let (nested_added, nested_removed, nested_updated) = diff.nested_changes(); + assert_eq!(nested_added.len(), 0); + assert_eq!(nested_removed.len(), 0); + assert_eq!(nested_updated.len(), 0); + } + + #[test] + fn test_container_with_nested_changes_not_reported_as_type_change() { + // Test that when a struct's nested fields change, the struct itself isn't reported as TypeChanged + let before = StructType::new_unchecked([create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("name", DataType::STRING, false, 2), + create_field_with_id("email", DataType::STRING, true, 3), + ]) + .unwrap(), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("full_name", DataType::STRING, false, 2), // Renamed + create_field_with_id("email", DataType::STRING, true, 3), + create_field_with_id("age", DataType::INTEGER, true, 4), // Added + ]) + .unwrap(), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // Should see the nested field changes but NOT a type change on the parent struct + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.added_fields[0].path, ColumnName::new(["user", "age"])); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["user", "full_name"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + + // Crucially, there should be NO update reported for the "user" field itself + // even though its DataType::Struct contains different nested fields + let top_level_updates: Vec<_> = diff + .updated_fields + .iter() + .filter(|u| u.path.path().len() == 1) + .collect(); + assert_eq!(top_level_updates.len(), 0); + + // Not a breaking change since it's just a rename and an added nullable field + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_actual_struct_type_change_still_reported() { + // Test that actual type changes (not just nested content changes) are still reported + let before = + StructType::new_unchecked([create_field_with_id("data", DataType::STRING, false, 1)]); + + let after = StructType::new_unchecked([ + create_field_with_id( + "data", + DataType::try_struct_type([create_field_with_id( + "nested", + DataType::STRING, + false, + 2, + )]) + .unwrap(), + false, + 1, + ), // Changed from STRING to STRUCT + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // This IS a real type change from primitive to struct + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["data"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::TypeChanged] + ); + assert!(diff.has_breaking_changes()); + + // The new nested field should also be 
reported as added + assert_eq!( + diff.added_fields[0].path, + ColumnName::new(["data", "nested"]) + ); + assert!(diff.has_breaking_changes()); + } + + #[test] + fn test_array_with_struct_element_changes() { + // Test that array containers aren't reported as changed when their struct elements change + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "name", + DataType::STRING, + false, + 2, + )]) + .unwrap(), + true, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("title", DataType::STRING, false, 2), // Renamed + ]) + .unwrap(), + true, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // Should only see the nested field rename, not a change to the array container + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["items", "element", "title"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + + // No change should be reported for the "items" array itself + let array_updates: Vec<_> = diff + .updated_fields + .iter() + .filter(|u| u.path == ColumnName::new(["items"])) + .collect(); + assert_eq!(array_updates.len(), 0); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_ancestor_filtering_with_mixed_changes() { + let before = StructType::new_unchecked([ + create_field_with_id("existing", DataType::STRING, false, 1), + create_field_with_id( + "existing_struct", + DataType::try_struct_type([create_field_with_id( + "old_name", + DataType::STRING, + false, + 3, + )]) + .unwrap(), + false, + 2, + ), + ]); + + let after = StructType::new_unchecked([ + create_field_with_id("existing", DataType::STRING, true, 1), // Changed nullability + create_field_with_id( + "existing_struct", + DataType::try_struct_type([ + create_field_with_id("new_name", DataType::STRING, false, 3), // Renamed + ]) + .unwrap(), + true, // Changed nullability + 2, + ), + create_field_with_id( + "new_struct", // Completely new struct + DataType::try_struct_type([create_field_with_id( + "nested_field", + DataType::INTEGER, + false, + 5, + )]) + .unwrap(), + false, + 4, + ), + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // Should see: existing changed, existing_struct changed, existing_struct.old_name->new_name renamed, new_struct added + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.added_fields[0].path, ColumnName::new(["new_struct"])); + + assert_eq!(diff.updated_fields.len(), 3); + let paths = updated_paths(&diff); + assert!(paths.contains(&ColumnName::new(["existing"]))); + assert!(paths.contains(&ColumnName::new(["existing_struct"]))); + assert!(paths.contains(&ColumnName::new(["existing_struct", "new_name"]))); + + // Added a non-nullable struct "new_struct" + assert!(diff.has_breaking_changes()); + + // nested_field should NOT appear as added since new_struct is its ancestor + } + + #[test] + fn test_nested_field_rename() { + let before = StructType::new_unchecked([create_field_with_id( + "user", + DataType::try_struct_type([create_field_with_id("name", DataType::STRING, false, 2)]) + .unwrap(), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( 
+ "user", + DataType::try_struct_type([ + create_field_with_id("full_name", DataType::STRING, false, 2), // Renamed! + ]) + .unwrap(), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + + let update = &diff.updated_fields[0]; + assert_eq!(update.path, ColumnName::new(["user", "full_name"])); + assert_eq!(update.change_types, vec![FieldChangeType::Renamed]); + assert!(!diff.has_breaking_changes()); // Rename is not breaking + } + + #[test] + fn test_nested_field_added() { + let before = StructType::new_unchecked([create_field_with_id( + "user", + DataType::try_struct_type([create_field_with_id("name", DataType::STRING, false, 2)]) + .unwrap(), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("name", DataType::STRING, false, 2), + create_field_with_id("age", DataType::INTEGER, true, 3), // Added! + ]) + .unwrap(), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 0); + + let added = &diff.added_fields[0]; + assert_eq!(added.path, ColumnName::new(["user", "age"])); + assert_eq!(added.field.name(), "age"); + assert!(!diff.has_breaking_changes()); // Adding nullable field is not breaking + } + + #[test] + fn test_deeply_nested_changes() { + let before = StructType::new_unchecked([create_field_with_id( + "level1", + DataType::try_struct_type([create_field_with_id( + "level2", + DataType::try_struct_type([create_field_with_id( + "deep_field", + DataType::STRING, + false, + 3, + )]) + .unwrap(), + false, + 2, + )]) + .unwrap(), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "level1", + DataType::try_struct_type([create_field_with_id( + "level2", + DataType::try_struct_type([ + create_field_with_id("very_deep_field", DataType::STRING, false, 3), // Renamed! 
+ ]) + .unwrap(), + false, + 2, + )]) + .unwrap(), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + + let update = &diff.updated_fields[0]; + assert_eq!( + update.path, + ColumnName::new(["level1", "level2", "very_deep_field"]) + ); + assert_eq!(update.change_types, vec![FieldChangeType::Renamed]); + } + + #[test] + fn test_top_level_vs_nested_filtering() { + let before = StructType::new_unchecked([ + create_field_with_id("top_field", DataType::STRING, false, 1), + create_field_with_id( + "user", + DataType::try_struct_type([create_field_with_id( + "name", + DataType::STRING, + false, + 3, + )]) + .unwrap(), + false, + 2, + ), + ]); + + let after = StructType::new_unchecked([ + create_field_with_id("renamed_top", DataType::STRING, false, 1), // Renamed top-level + create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("full_name", DataType::STRING, false, 3), // Renamed nested + create_field_with_id("age", DataType::INTEGER, true, 4), // Added nested + ]) + .unwrap(), + false, + 2, + ), + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + let (top_added, _, top_updated) = diff.top_level_changes(); + let (nested_added, _, nested_updated) = diff.nested_changes(); + + assert_eq!(top_added.len(), 0); + assert_eq!(top_updated.len(), 1); + assert_eq!(top_updated[0].path, ColumnName::new(["renamed_top"])); + + assert_eq!(nested_added.len(), 1); + assert_eq!(nested_added[0].path, ColumnName::new(["user", "age"])); + assert_eq!(nested_updated.len(), 1); + assert_eq!( + nested_updated[0].path, + ColumnName::new(["user", "full_name"]) + ); + } + + #[test] + fn test_mixed_changes() { + let before = StructType::new_unchecked([ + create_field_with_id("id", DataType::LONG, false, 1), + create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("name", DataType::STRING, false, 3), + create_field_with_id("email", DataType::STRING, true, 4), + ]) + .unwrap(), + false, + 2, + ), + ]); + + let after = StructType::new_unchecked([ + create_field_with_id("identifier", DataType::LONG, false, 1), // Renamed top-level + create_field_with_id( + "user", + DataType::try_struct_type([ + create_field_with_id("full_name", DataType::STRING, false, 3), // Renamed nested + // email removed (id=4) + create_field_with_id("age", DataType::INTEGER, true, 5), // Added nested + ]) + .unwrap(), + false, + 2, + ), + create_field_with_id("created_at", DataType::TIMESTAMP, false, 6), // Added top-level + ]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // Check totals + assert_eq!(diff.added_fields.len(), 2); + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.updated_fields.len(), 2); + + // Check specific changes + let added_paths: HashSet = + diff.added_fields.iter().map(|f| f.path.clone()).collect(); + assert!(added_paths.contains(&ColumnName::new(["user", "age"]))); + assert!(added_paths.contains(&ColumnName::new(["created_at"]))); + + let removed_paths: HashSet = + diff.removed_fields.iter().map(|f| f.path.clone()).collect(); + assert!(removed_paths.contains(&ColumnName::new(["user", "email"]))); + + let paths = updated_paths(&diff); + assert!(paths.contains(&ColumnName::new(["identifier"]))); + assert!(paths.contains(&ColumnName::new(["user", "full_name"]))); + } + + #[test] + fn test_array_element_struct_field_changes() { + let before = 
StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("name", DataType::STRING, false, 2), + create_field_with_id("removed_field", DataType::INTEGER, true, 3), + ]) + .unwrap(), + true, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("title", DataType::STRING, false, 2), // Renamed! + create_field_with_id("added_field", DataType::STRING, true, 4), // Added! + ]) + .unwrap(), + true, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.updated_fields.len(), 1); + + // Check added field + assert_eq!( + diff.added_fields[0].path, + ColumnName::new(["items", "element", "added_field"]) + ); + + // Check removed field + assert_eq!( + diff.removed_fields[0].path, + ColumnName::new(["items", "element", "removed_field"]) + ); + + // Check updated field (rename) + let update = &diff.updated_fields[0]; + assert_eq!(update.path, ColumnName::new(["items", "element", "title"])); + assert_eq!(update.change_types, vec![FieldChangeType::Renamed]); + + assert!(!diff.has_breaking_changes()); // Removal is safe, rename is safe + } + + #[test] + fn test_doubly_nested_array_type_change() { + // Test that we can detect type changes in doubly nested arrays: array> -> array> + let before = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::DOUBLE, false))), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // The entire field should be reported as TypeChanged since we can't recurse into + // non-struct array elements (no field IDs at intermediate levels) + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["matrix"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::TypeChanged] + ); + + assert!(diff.has_breaking_changes()); // Type change is breaking + } + + #[test] + fn test_array_primitive_element_type_change() { + // Test direct primitive element type change: array -> array + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["items"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::TypeChanged] + ); + assert!(diff.has_breaking_changes()); // Type change is breaking + } + + #[test] + fn test_nested_array_nullability_loosened() { + // 
Test: array not null> -> array> + // Outer array element nullability loosened (safe change) + let before = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + false, // Outer array elements are non-nullable + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + true, // Outer array elements now nullable + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["matrix"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityLoosened] + ); + assert!(!diff.has_breaking_changes()); // Loosening is safe + } + + #[test] + fn test_nested_array_nullability_tightened() { + // Test: array> -> array not null> + // Outer array element nullability tightened (breaking change) + let before = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + true, // Outer array elements are nullable + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), + false, // Outer array elements now non-nullable + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["matrix"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityTightened] + ); + assert!(diff.has_breaking_changes()); // Tightening is breaking + } + + #[test] + fn test_nested_array_inner_nullability_loosened() { + // Test: array> -> array> + // Inner array element nullability loosened (safe change) + let before = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), // Inner elements non-nullable + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, true))), // Inner elements now nullable + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["matrix"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityLoosened] + ); + assert!(!diff.has_breaking_changes()); // Loosening is safe + } + + #[test] + fn test_nested_array_inner_nullability_tightened() { + // Test: array> -> array> + // Inner array element nullability tightened (breaking change) + let before = StructType::new_unchecked([create_field_with_id( + "matrix", + 
DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, true))), // Inner elements nullable + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))), // Inner elements now non-nullable + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["matrix"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityTightened] + ); + assert!(diff.has_breaking_changes()); // Tightening is breaking + } + + #[test] + fn test_array_nullability_loosened() { + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), // Non-nullable elements + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, true))), // Nullable elements now + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["items"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityLoosened] + ); + assert!(!diff.has_breaking_changes()); // Loosening is safe + } + + #[test] + fn test_array_nullability_tightened() { + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, true))), // Nullable elements + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), // Non-nullable now + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["items"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityTightened] + ); + assert!(diff.has_breaking_changes()); // Tightening is breaking + } + #[test] + fn test_map_value_struct_field_changes() { + let before = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::try_struct_type([ + create_field_with_id("value", DataType::INTEGER, false, 2), + create_field_with_id("removed_field", DataType::STRING, true, 3), + ]) + .unwrap(), + true, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::try_struct_type([ + create_field_with_id("count", DataType::INTEGER, false, 2), // Renamed! + create_field_with_id("added_field", DataType::STRING, true, 4), // Added! 
+ ]) + .unwrap(), + true, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.updated_fields.len(), 1); + + // Check added field + assert_eq!( + diff.added_fields[0].path, + ColumnName::new(["lookup", "value", "added_field"]) + ); + + // Check removed field + assert_eq!( + diff.removed_fields[0].path, + ColumnName::new(["lookup", "value", "removed_field"]) + ); + + // Check updated field (rename) + let update = &diff.updated_fields[0]; + assert_eq!(update.path, ColumnName::new(["lookup", "value", "count"])); + assert_eq!(update.change_types, vec![FieldChangeType::Renamed]); + + assert!(!diff.has_breaking_changes()); // Removal is safe, rename is safe + } + + #[test] + fn test_array_struct_element_nullability_loosened() { + // Test: array not null> -> array> + // Struct element nullability loosened (safe change) + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "name", + DataType::STRING, + false, + 2, + )]) + .unwrap(), + false, // Struct elements non-nullable + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "name", + DataType::STRING, + false, + 2, + )]) + .unwrap(), + true, // Struct elements now nullable + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["items"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityLoosened] + ); + assert!(!diff.has_breaking_changes()); // Loosening is safe + } + + #[test] + fn test_map_nullability_loosened() { + let before = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::INTEGER, + false, // Non-nullable values + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::INTEGER, + true, // Nullable values now + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["lookup"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityLoosened] + ); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_array_struct_element_nullability_tightened() { + // Test: array> -> array not null> + // Struct element nullability tightened (breaking change) + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "name", + DataType::STRING, + false, + 2, + )]) + .unwrap(), + true, // Struct elements nullable + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "name", + 
DataType::STRING, + false, + 2, + )]) + .unwrap(), + false, // Struct elements now non-nullable + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["items"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityTightened] + ); + assert!(diff.has_breaking_changes()); // Tightening is breaking + } + + #[test] + fn test_map_nullability_tightened() { + let before = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::INTEGER, + true, // Nullable values + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::INTEGER, + false, // Non-nullable now + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["lookup"])); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityTightened] + ); + assert!(diff.has_breaking_changes()); + } + + #[test] + fn test_array_combined_nullability_and_type_change() { + // Test that both nullability and type change can be exercised at once in a single diff. + // Before: items: array not null (elements non-nullable) + // After: items: array (elements nullable, type changed) + let before = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::STRING, false))), // Non-nullable elements + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, true))), // Nullable, type changed + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!(diff.updated_fields[0].path, ColumnName::new(["items"])); + // Should have both TypeChanged and ContainerNullabilityLoosened + let change_types = &diff.updated_fields[0].change_types; + assert!(change_types.contains(&FieldChangeType::TypeChanged)); + assert!(change_types.contains(&FieldChangeType::ContainerNullabilityLoosened)); + assert!(diff.has_breaking_changes()); // Type change is breaking + } + + #[test] + fn test_map_with_struct_key() { + // Test that maps with struct keys can be diffed + let before = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::try_struct_type([create_field_with_id( + "id", + DataType::INTEGER, + false, + 2, + )]) + .unwrap(), + DataType::STRING, + true, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::try_struct_type([create_field_with_id( + "identifier", // Renamed key field + DataType::INTEGER, + false, + 2, + )]) + .unwrap(), + DataType::STRING, + true, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + // Should see the nested key field renamed + 
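+        // (Struct keys of a map are addressed through the synthetic "key" path segment,
+        // so the rename is expected to surface at ["lookup", "key", "identifier"].)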
assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["lookup", "key", "identifier"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + } + + #[test] + fn test_nested_struct_in_array_in_struct_field_changes() { + // struct>>> + let before = StructType::new_unchecked([create_field_with_id( + "data", + DataType::try_struct_type([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "inner", + DataType::try_struct_type([ + create_field_with_id("a", DataType::INTEGER, false, 3), + create_field_with_id("removed", DataType::STRING, true, 4), + ]) + .unwrap(), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 5, + )]) + .unwrap(), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "data", + DataType::try_struct_type([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "inner", + DataType::try_struct_type([ + create_field_with_id("renamed_a", DataType::INTEGER, false, 3), // Renamed! + create_field_with_id("added", DataType::LONG, true, 6), // Added! + ]) + .unwrap(), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 5, + )]) + .unwrap(), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.added_fields[0].path, + ColumnName::new(["data", "items", "element", "inner", "added"]) + ); + assert_eq!( + diff.removed_fields[0].path, + ColumnName::new(["data", "items", "element", "inner", "removed"]) + ); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["data", "items", "element", "inner", "renamed_a"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_nested_map_within_struct_within_map() { + // map>>> + let before = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::try_struct_type([create_field_with_id( + "nested", + DataType::Map(Box::new(MapType::new( + DataType::INTEGER, + DataType::try_struct_type([create_field_with_id( + "x", + DataType::INTEGER, + false, + 3, + )]) + .unwrap(), + false, + ))), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "lookup", + DataType::Map(Box::new(MapType::new( + DataType::STRING, + DataType::try_struct_type([create_field_with_id( + "nested", + DataType::Map(Box::new(MapType::new( + DataType::INTEGER, + DataType::try_struct_type([ + create_field_with_id("renamed_x", DataType::INTEGER, false, 3), // Renamed! 
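+                            // Field id 3 is unchanged while the name differs, so the diff is
+                            // expected to report a rename rather than a remove + add pair.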
+ ]) + .unwrap(), + false, + ))), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["lookup", "value", "nested", "value", "renamed_x"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_doubly_nested_array_with_struct_elements() { + // array>> + let before = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "x", + DataType::INTEGER, + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "matrix", + DataType::Array(Box::new(ArrayType::new( + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("renamed_x", DataType::INTEGER, false, 2), // Renamed! + create_field_with_id("y", DataType::INTEGER, true, 3), // Added! + ]) + .unwrap(), + false, + ))), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.added_fields[0].path, + ColumnName::new(["matrix", "element", "element", "y"]) + ); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["matrix", "element", "element", "renamed_x"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::Renamed] + ); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_map_with_array_of_struct_key_and_value() { + // map>, array>> + let before = StructType::new_unchecked([create_field_with_id( + "complex_map", + DataType::Map(Box::new(MapType::new( + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "key_field", + DataType::INTEGER, + false, + 2, + )]) + .unwrap(), + false, + ))), + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "value_field", + DataType::STRING, + false, + 3, + )]) + .unwrap(), + false, + ))), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "complex_map", + DataType::Map(Box::new(MapType::new( + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("renamed_key_field", DataType::INTEGER, false, 2), // Renamed! + ]) + .unwrap(), + false, + ))), + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("renamed_value_field", DataType::STRING, false, 3), // Renamed! 
+ ]) + .unwrap(), + false, + ))), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 2); + + let paths = updated_paths(&diff); + assert!(paths.contains(&ColumnName::new([ + "complex_map", + "key", + "element", + "renamed_key_field" + ]))); + assert!(paths.contains(&ColumnName::new([ + "complex_map", + "value", + "element", + "renamed_value_field" + ]))); + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_map_struct_key_nested_map_value() { + // map, map, struct>> + let before = StructType::new_unchecked([create_field_with_id( + "nested_maps", + DataType::Map(Box::new(MapType::new( + DataType::try_struct_type([create_field_with_id( + "outer_key", + DataType::INTEGER, + false, + 2, + )]) + .unwrap(), + DataType::Map(Box::new(MapType::new( + DataType::try_struct_type([create_field_with_id( + "inner_key", + DataType::INTEGER, + false, + 3, + )]) + .unwrap(), + DataType::try_struct_type([ + create_field_with_id("data", DataType::STRING, false, 4), + create_field_with_id("removed", DataType::INTEGER, true, 5), + ]) + .unwrap(), + false, + ))), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "nested_maps", + DataType::Map(Box::new(MapType::new( + DataType::try_struct_type([create_field_with_id( + "renamed_outer_key", // Renamed! + DataType::INTEGER, + false, + 2, + )]) + .unwrap(), + DataType::Map(Box::new(MapType::new( + DataType::try_struct_type([create_field_with_id( + "renamed_inner_key", // Renamed! + DataType::INTEGER, + false, + 3, + )]) + .unwrap(), + DataType::try_struct_type([ + create_field_with_id("renamed_data", DataType::STRING, false, 4), // Renamed! + create_field_with_id("added", DataType::LONG, true, 6), // Added! 
+ ]) + .unwrap(), + false, + ))), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 1); + assert_eq!(diff.removed_fields.len(), 1); + assert_eq!(diff.updated_fields.len(), 3); + + assert_eq!( + diff.added_fields[0].path, + ColumnName::new(["nested_maps", "value", "value", "added"]) + ); + assert_eq!( + diff.removed_fields[0].path, + ColumnName::new(["nested_maps", "value", "value", "removed"]) + ); + + let paths = updated_paths(&diff); + assert!(paths.contains(&ColumnName::new([ + "nested_maps", + "key", + "renamed_outer_key" + ]))); + assert!(paths.contains(&ColumnName::new([ + "nested_maps", + "value", + "key", + "renamed_inner_key" + ]))); + assert!(paths.contains(&ColumnName::new([ + "nested_maps", + "value", + "value", + "renamed_data" + ]))); + + assert!(!diff.has_breaking_changes()); + } + + #[test] + fn test_deeply_nested_nullability_tightening_is_breaking() { + // array>>> -> array>>> + let before = StructType::new_unchecked([create_field_with_id( + "wrapper", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("value", DataType::INTEGER, true, 3), // Nullable + ]) + .unwrap(), + false, + ))), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "wrapper", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([ + create_field_with_id("value", DataType::INTEGER, false, 3), // Non-nullable now - BREAKING! + ]) + .unwrap(), + false, + ))), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["wrapper", "element", "items", "element", "value"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::NullabilityTightened] + ); + assert!(diff.has_breaking_changes()); + } + + #[test] + fn test_deeply_nested_container_nullability_tightening_is_breaking() { + // array nullable>>> -> array not null>>> + let before = StructType::new_unchecked([create_field_with_id( + "wrapper", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "value", + DataType::INTEGER, + false, + 3, + )]) + .unwrap(), + true, // Array elements are nullable + ))), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 1, + )]); + + let after = StructType::new_unchecked([create_field_with_id( + "wrapper", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "items", + DataType::Array(Box::new(ArrayType::new( + DataType::try_struct_type([create_field_with_id( + "value", + DataType::INTEGER, + false, + 3, + )]) + .unwrap(), + false, // Array elements now non-nullable - BREAKING! 
+ ))), + false, + 2, + )]) + .unwrap(), + false, + ))), + false, + 1, + )]); + + let diff = SchemaDiff::new(&before, &after).unwrap(); + + assert_eq!(diff.added_fields.len(), 0); + assert_eq!(diff.removed_fields.len(), 0); + assert_eq!(diff.updated_fields.len(), 1); + assert_eq!( + diff.updated_fields[0].path, + ColumnName::new(["wrapper", "element", "items"]) + ); + assert_eq!( + diff.updated_fields[0].change_types, + vec![FieldChangeType::ContainerNullabilityTightened] + ); + assert!(diff.has_breaking_changes()); + } +} diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs index 8c1c4302a3..5eaabc780c 100644 --- a/kernel/src/schema/mod.rs +++ b/kernel/src/schema/mod.rs @@ -1,8 +1,8 @@ //! Definitions and functions to create and manipulate kernel schema use std::borrow::Cow; -use std::collections::HashMap; -use std::fmt::{Display, Formatter}; +use std::collections::{HashMap, HashSet}; +use std::fmt::{Debug, Display, Formatter}; use std::iter::{DoubleEndedIterator, FusedIterator}; use std::str::FromStr; use std::sync::{Arc, LazyLock}; @@ -14,12 +14,17 @@ use tracing::warn; // re-export because many call sites that use schemas do not necessarily use expressions pub(crate) use crate::expressions::{column_name, ColumnName}; +use crate::reserved_field_ids::FILE_NAME; +use crate::table_features::get_field_column_mapping_info; use crate::table_features::ColumnMappingMode; -use crate::utils::{require, CowExt as _}; +use crate::transforms::SchemaTransform; +use crate::utils::require; use crate::{DeltaResult, Error}; use delta_kernel_derive::internal_api; pub(crate) mod compare; +#[cfg(feature = "schema-diff")] +pub(crate) mod diff; #[cfg(feature = "internal-api")] pub mod derive_macro_utils; @@ -111,6 +116,9 @@ impl AsRef for ColumnMetadataKey { match self { Self::ColumnMappingId => "delta.columnMapping.id", Self::ColumnMappingPhysicalName => "delta.columnMapping.physicalName", + // "parquet.field.id" is not defined by the Delta protocol, but follows the convention + // established by delta-spark and other Delta ecosystem implementations for storing + // Parquet field IDs in StructField metadata. Self::ParquetFieldId => "parquet.field.id", Self::GenerationExpression => "delta.generationExpression", Self::IdentityAllowExplicitInsert => "delta.identity.allowExplicitInsert", @@ -132,6 +140,7 @@ pub enum MetadataColumnSpec { RowIndex, RowId, RowCommitVersion, + FilePath, } impl MetadataColumnSpec { @@ -141,6 +150,7 @@ impl MetadataColumnSpec { Self::RowIndex => "row_index", Self::RowId => "row_id", Self::RowCommitVersion => "row_commit_version", + Self::FilePath => "_file", } } @@ -150,6 +160,7 @@ impl MetadataColumnSpec { Self::RowIndex => DataType::LONG, Self::RowId => DataType::LONG, Self::RowCommitVersion => DataType::LONG, + Self::FilePath => DataType::STRING, } } @@ -159,6 +170,15 @@ impl MetadataColumnSpec { Self::RowIndex => false, Self::RowId => false, Self::RowCommitVersion => false, + Self::FilePath => false, + } + } + + /// The reserved field ID for the specified metadata column, if any. 
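+    ///
+    /// Illustrative sketch (only `FilePath` currently has a reserved ID):
+    ///
+    /// ```ignore
+    /// assert_eq!(MetadataColumnSpec::FilePath.reserved_field_id(), Some(FILE_NAME));
+    /// assert_eq!(MetadataColumnSpec::RowIndex.reserved_field_id(), None);
+    /// ```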
+ pub fn reserved_field_id(&self) -> Option { + match self { + Self::FilePath => Some(FILE_NAME), + _ => None, } } } @@ -171,6 +191,7 @@ impl FromStr for MetadataColumnSpec { "row_index" => Ok(Self::RowIndex), "row_id" => Ok(Self::RowId), "row_commit_version" => Ok(Self::RowCommitVersion), + "_file" => Ok(Self::FilePath), _ => Err(Error::Schema(format!("Unknown metadata column spec: {s}"))), } } @@ -314,20 +335,49 @@ impl StructField { /// Get the physical name for this field as it should be read from parquet. /// + /// When `column_mapping_mode` is `None`, always returns the logical name (even if physical + /// name metadata is present). When mode is `Id` or `Name`, returns the physical name from + /// metadata if present, otherwise returns the logical name. + /// /// NOTE: Caller affirms that the schema was already validated by - /// [`crate::table_features::validate_schema_column_mapping`], to ensure that annotations are + /// [`crate::table_configuration::TableConfiguration::try_new`], to ensure that annotations are /// always and only present when column mapping mode is enabled. #[internal_api] - pub(crate) fn physical_name(&self) -> &str { - match self - .metadata - .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) - { - Some(MetadataValue::String(physical_name)) => physical_name, - _ => &self.name, + pub(crate) fn physical_name(&self, column_mapping_mode: ColumnMappingMode) -> &str { + match column_mapping_mode { + ColumnMappingMode::None => &self.name, + ColumnMappingMode::Id | ColumnMappingMode::Name => { + match self + .metadata + .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) + { + Some(MetadataValue::String(physical_name)) => physical_name, + _ => &self.name, + } + } } } + /// Returns true if this field has a physical name annotation + /// in its column mapping metadata. + pub(crate) fn has_physical_name_annotation(&self) -> bool { + matches!( + self.metadata + .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()), + Some(MetadataValue::String(_)) + ) + } + + /// Returns true if this field has a column mapping ID annotation + /// in its column mapping metadata. + pub(crate) fn has_id_annotation(&self) -> bool { + matches!( + self.metadata + .get(ColumnMetadataKey::ColumnMappingId.as_ref()), + Some(MetadataValue::Number(_)) + ) + } + /// Change the name of a field. The field will preserve its data type and nullability. Note that /// this allocates a new field. pub fn with_name(&self, new_name: impl Into) -> Self { @@ -382,67 +432,17 @@ impl StructField { /// `Id` or `Name`, this is specified in [`ColumnMetadataKey::ColumnMappingPhysicalName`]. /// Otherwise, the field's logical name is used. /// - /// If the `column_mapping_mode` is `None`, then all column mapping metadata is removed. - /// If the `column_mapping_mode` is `Name`, then all Id mode column mapping metadata is - /// removed. - /// - /// NOTE: The caller must ensure that the schema has been validated by - /// [`crate::table_features::validate_schema_column_mapping`] to ensure that annotations are - /// present only when column mapping mode is enabled. + /// Returns an error if a field has invalid or inconsistent column mapping annotations (e.g. + /// missing when column mapping is enabled, present when disabled, or wrong type), or if a + /// metadata column is encountered (metadata columns should not participate in column mapping). 
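+    ///
+    /// Illustrative sketch (assuming a field whose metadata carries
+    /// `delta.columnMapping.physicalName = "col-123"`): under `ColumnMappingMode::Name` or
+    /// `ColumnMappingMode::Id` the returned field is named `col-123`, while under
+    /// `ColumnMappingMode::None` the logical name is kept.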
/// /// [`read_parquet_files`]: crate::ParquetHandler::read_parquet_files #[internal_api] - pub(crate) fn make_physical(&self, column_mapping_mode: ColumnMappingMode) -> Self { - struct MakePhysical { - column_mapping_mode: ColumnMappingMode, - } - impl<'a> SchemaTransform<'a> for MakePhysical { - fn transform_struct_field( - &mut self, - field: &'a StructField, - ) -> Option> { - let field = self.recurse_into_struct_field(field)?; - - let metadata = field.logical_to_physical_metadata(self.column_mapping_mode); - let name = match self.column_mapping_mode { - ColumnMappingMode::None => field.name().to_owned(), - ColumnMappingMode::Id | ColumnMappingMode::Name => { - // Assert that the physical name is present - match field.is_metadata_column() { - true => { - debug_assert!( - false, - "Metadata column should not have a physical name" - ); - } - false => { - debug_assert!(field - .metadata - .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) - .is_some_and(|x| matches!(x, MetadataValue::String(_)))); - } - } - field.physical_name().to_owned() - } - }; - - Some(Cow::Owned(field.with_name(name).with_metadata(metadata))) - } - - fn transform_variant(&mut self, stype: &'a StructType) -> Option> { - // There is no column mapping metadata inside the struct fields of a variant, so - // we do not recurse into the variant fields - Some(Cow::Borrowed(stype)) - } - } - // NOTE: unwrap is safe because the transformer is incapable of returning None - #[allow(clippy::unwrap_used)] - MakePhysical { - column_mapping_mode, - } - .transform_struct_field(self) - .unwrap() - .into_owned() + pub(crate) fn make_physical( + &self, + column_mapping_mode: ColumnMappingMode, + ) -> DeltaResult { + MakePhysical::new(column_mapping_mode).run_field(self) } fn has_invariants(&self) -> bool { @@ -453,8 +453,10 @@ impl StructField { /// Converts logical schema StructField metadata to physical schema metadata /// based on the specified `column_mapping_mode`. /// - /// NOTE: Caller affirms that the schema was already validated by - /// [`crate::table_features::validate_schema_column_mapping`], to ensure that annotations are + /// NOTE: Must not be called on metadata columns, which are not subject to column mapping. + /// + /// NOTE: Caller affirms that `self` was already validated by + /// [`crate::table_features::get_field_column_mapping_info`], to ensure that annotations are /// always and only present when column mapping mode is enabled. fn logical_to_physical_metadata( &self, @@ -468,7 +470,7 @@ impl StructField { match column_mapping_mode { ColumnMappingMode::Id => { let Some(MetadataValue::Number(fid)) = field_id else { - // `validate_schema_column_mapping` should have verified that this has a field Id + // `get_field_column_mapping_info` should have verified that this has a field Id warn!("StructField with name {} is missing field id in the Id column mapping mode", self.name()); debug_assert!(false); return base_metadata; @@ -486,9 +488,18 @@ impl StructField { debug_assert!(base_metadata.contains_key(physical_name_key)); debug_assert!(base_metadata.contains_key(field_id_key)); - // Remove all id mode related metadata keys - base_metadata.remove(field_id_key); - base_metadata.remove(parquet_field_id_key); + // Retain column mapping id and insert parquet field id so that + // Parquet files carry field IDs in Name mode as well (matching + // the Delta protocol requirement and Delta Spark behaviour). 
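+                // For example, a field carrying `delta.columnMapping.id = 5` keeps that entry
+                // and additionally gains `parquet.field.id = 5` in the physical metadata.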
+ let Some(MetadataValue::Number(fid)) = field_id else { + warn!("StructField with name {} is missing field id in the Name column mapping mode", self.name()); + debug_assert!(false); + return base_metadata; + }; + base_metadata.insert( + parquet_field_id_key.to_string(), + MetadataValue::Number(*fid), + ); // TODO(#1070): Remove nested column ids when they are supported in kernel } ColumnMappingMode::None => { @@ -502,6 +513,26 @@ impl StructField { } } +impl Display for StructField { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut metadata_str = String::from("{"); + let mut first = true; + for (k, v) in self.metadata.iter() { + if !first { + metadata_str.push_str(", "); + } + first = false; + metadata_str.push_str(&format!("{k}: {v:?}")); + } + metadata_str.push('}'); + write!( + f, + "{}: {} (is nullable: {}, metadata: {})", + self.name, self.data_type, self.nullable, metadata_str, + ) + } +} + /// A struct is used to represent both the top-level schema of the table /// as well as struct columns that contain nested columns. #[derive(Debug, PartialEq, Clone, Eq)] @@ -518,16 +549,55 @@ pub struct StructType { metadata_columns: HashMap, } +pub struct StructTypeBuilder { + fields: IndexMap, +} + +impl Default for StructTypeBuilder { + fn default() -> Self { + Self::new() + } +} + +impl StructTypeBuilder { + pub fn new() -> Self { + Self { + fields: IndexMap::new(), + } + } + + pub fn from_schema(schema: &StructType) -> Self { + Self { + fields: schema.fields.clone(), + } + } + + pub fn add_field(mut self, field: StructField) -> Self { + self.fields.insert(field.name.clone(), field); + self + } + + pub fn build(self) -> DeltaResult { + StructType::try_new(self.fields.into_values()) + } + + pub fn build_arc_unchecked(self) -> Arc { + Arc::new(StructType::new_unchecked(self.fields.into_values())) + } +} + impl StructType { /// Creates a new [`StructType`] from the given fields. /// /// Returns an error if: - /// - the schema contains duplicate field names + /// - the schema contains duplicate field names (case-insensitive; Delta column names are + /// case-insensitive per the protocol) /// - the schema contains duplicate metadata columns /// - the schema contains nested metadata columns pub fn try_new(fields: impl IntoIterator) -> DeltaResult { let mut field_map = IndexMap::new(); let mut metadata_columns = HashMap::new(); + let mut seen_lowercase_names = HashSet::new(); // Validate each field during insertion for (i, field) in fields.into_iter().enumerate() { @@ -545,10 +615,16 @@ impl StructType { } } - // Check for duplicate field names - if let Some(dup) = field_map.insert(field.name.clone(), field) { - return Err(Error::schema(format!("Duplicate field name: {}", dup.name))); + // Delta column names are case-insensitive; reject schemas with duplicates that differ only by case. + let key = field.name.to_lowercase(); + if !seen_lowercase_names.insert(key) { + return Err(Error::schema(format!( + "Duplicate field name (case-insensitive): '{}'", + field.name + ))); } + + field_map.insert(field.name.clone(), field); } Ok(Self { @@ -571,6 +647,10 @@ impl StructType { .process_results(|iter| Self::try_new(iter))? } + pub fn builder() -> StructTypeBuilder { + StructTypeBuilder::new() + } + /// Creates a new [`StructType`] from the given fields without validating them. /// /// This should only be used when you are sure that the fields are valid. 
@@ -654,6 +734,58 @@ impl StructType { self.fields.get(name.as_ref()) } + /// Resolves a column path through nested structs, returning references to all + /// [`StructField`]s along the path. The last element is the leaf field. + /// + /// Each element of the path must resolve to a field in the current struct. All intermediate + /// (non-leaf) fields must be struct types. + /// + /// Returns an error if the path is empty, a field is not found, or an intermediate + /// field is not a struct type. + pub(crate) fn walk_column_fields<'a>( + &'a self, + col: &ColumnName, + ) -> DeltaResult> { + self.walk_column_fields_by(col, |s, name| s.field(name)) + } + + /// Helper to walk through nested columns. For each path component in `col`, calls + /// `find_field(current_struct, component)` to locate the matching field, then descends + /// into the next nested struct. Returns references to all [`StructField`]s along the path. + pub(crate) fn walk_column_fields_by<'a, F>( + &'a self, + col: &ColumnName, + find_field: F, + ) -> DeltaResult> + where + F: for<'b> Fn(&'b StructType, &str) -> Option<&'b StructField>, + { + let path = col.path(); + if path.is_empty() { + return Err(Error::generic("Column path cannot be empty")); + } + let mut current_struct = self; + let mut fields = Vec::with_capacity(path.len()); + for (i, field_name) in path.iter().enumerate() { + let field = find_field(current_struct, field_name).ok_or_else(|| { + Error::generic(format!( + "Could not resolve column '{col}': field '{field_name}' not found in schema" + )) + })?; + fields.push(field); + if i < path.len() - 1 { + let DataType::Struct(inner) = field.data_type() else { + return Err(Error::generic(format!( + "Cannot resolve column '{col}': intermediate field '{field_name}' \ + is not a struct type" + ))); + }; + current_struct = inner; + } + } + Ok(fields) + } + /// Gets the field with the given name and its index. pub fn field_with_index(&self, name: impl AsRef) -> Option<(usize, &StructField)> { self.fields @@ -691,6 +823,30 @@ impl StructType { self.fields.len() } + /// Recursively counts all [`StructField`] nodes in this schema tree. + /// + /// This includes nested struct fields (inside Struct, Array, and Map types) but does not + /// count Array/Map containers themselves. This matches the traversal pattern used by + /// `assign_column_mapping_metadata` when assigning column IDs, so the result equals the + /// expected `delta.columnMapping.maxColumnId` for a newly created table. + #[allow(unused)] // Only used by integration tests (create_table/column_mapping.rs) + #[internal_api] + pub(crate) fn total_struct_fields(&self) -> usize { + fn count_data_type(dt: &DataType) -> usize { + match dt { + DataType::Struct(inner) => inner.total_struct_fields(), + DataType::Array(array) => count_data_type(array.element_type()), + DataType::Map(map) => { + count_data_type(map.key_type()) + count_data_type(map.value_type()) + } + _ => 0, + } + } + self.fields() + .map(|field| 1 + count_data_type(field.data_type())) + .sum() + } + /// Gets a reference to the metadata column with the given spec. 
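+    ///
+    /// Illustrative sketch: `schema.metadata_column(&MetadataColumnSpec::RowIndex)` returns
+    /// the declared row-index field, or `None` if the schema does not declare one.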
pub fn metadata_column(&self, spec: &MetadataColumnSpec) -> Option<&StructField> { self.metadata_columns @@ -715,7 +871,7 @@ impl StructType { #[internal_api] pub(crate) fn leaves<'s>(&self, own_name: impl Into>) -> ColumnNamesAndTypes { let mut get_leaves = GetSchemaLeaves::new(own_name.into()); - let _ = get_leaves.transform_struct(self); + get_leaves.transform_struct(self); (get_leaves.names, get_leaves.types).into() } @@ -723,16 +879,19 @@ impl StructType { /// [`ColumnMappingMode::Id`], then each StructField will have its parquet field id in the /// [`ColumnMetadataKey::ParquetFieldId`] metadata field. /// - /// NOTE: Caller affirms that the schema was already validated by - /// [`crate::table_features::validate_schema_column_mapping`], to ensure that annotations are - /// always and only present when column mapping mode is enabled. - #[allow(unused)] + /// Uses a single transformer so duplicate column mapping IDs are detected across all + /// fields in this struct, not just within each field's subtree. #[internal_api] - pub(crate) fn make_physical(&self, column_mapping_mode: ColumnMappingMode) -> Self { - let fields = self + pub(crate) fn make_physical( + &self, + column_mapping_mode: ColumnMappingMode, + ) -> DeltaResult { + let mut transformer = MakePhysical::new(column_mapping_mode); + let fields: Vec = self .fields() - .map(|field| field.make_physical(column_mapping_mode)); - Self::new_unchecked(fields) + .map(|field| transformer.run_field(field)) + .try_collect()?; + Self::try_new(fields) } /// Validates that there are no metadata columns in the given fields. @@ -779,6 +938,167 @@ impl StructType { Ok(()) } + + /// Returns a StructType with `new_field` inserted after the field named `after`. + /// If `new_field` already presents in the schema, an error is returned. + /// If `after` is None, `new_field` is appended to the end. + /// If `after` is not found, an error is returned. + pub fn with_field_inserted_after( + mut self, + after: Option<&str>, + new_field: StructField, + ) -> DeltaResult { + // TODO: Upgrade to a case-insensitive duplicate check when this method is used for + // user-facing operations like ALTER TABLE ADD COLUMN. Currently only used internally + // for inserting protocol-defined fields (e.g. stats_parsed) where exact-name matching + // is sufficient. + if self.fields.contains_key(&new_field.name) { + return Err(Error::generic(format!( + "Field {} already exists", + new_field.name + ))); + } + + let insert_index = after + .map(|after| { + self.fields + .get_index_of(after) + .map(|index| index + 1) + .ok_or_else(|| Error::generic(format!("Field {after} not found"))) + }) + .unwrap_or_else(|| Ok(self.fields.len()))?; + + self.fields + .insert_before(insert_index, new_field.name.clone(), new_field); + Ok(self) + } + + /// Returns a StructType with `new_field` inserted before the field named `before`. + /// If `new_field` already presents in the schema, an error is returned. + /// If `before` is None, `new_field` is inserted at the beginning. + /// If `before` is not found, an error is returned. + pub fn with_field_inserted_before( + mut self, + before: Option<&str>, + new_field: StructField, + ) -> DeltaResult { + // TODO: Upgrade to a case-insensitive duplicate check when this method is used for + // user-facing operations like ALTER TABLE ADD COLUMN. Currently only used internally + // for inserting protocol-defined fields where exact-name matching is sufficient. 
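+        // Illustrative: inserting `c` before "b" in schema [a, b] yields [a, c, b]; with
+        // `before = None` the new field goes first, yielding [c, a, b].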
+ if self.fields.contains_key(&new_field.name) { + return Err(Error::generic(format!( + "Field {} already exists", + new_field.name + ))); + } + + let index_of_before = before + .map(|before| { + self.fields + .get_index_of(before) + .ok_or_else(|| Error::generic(format!("Field {before} not found"))) + }) + .unwrap_or_else(|| Ok(0))?; + + self.fields + .insert_before(index_of_before, new_field.name.clone(), new_field); + Ok(self) + } + + /// Returns a StructType with the named field removed. + /// Returns self unchanged if field doesn't exist. + pub fn with_field_removed(mut self, name: &str) -> Self { + self.fields.shift_remove(name); + self + } + + /// Returns a new [`StructType`] containing only the top-level fields for which `predicate` + /// returns `true`. This does not recurse into nested [`StructType`] fields. + pub fn with_fields_filtered( + &self, + predicate: impl Fn(&StructField) -> bool, + ) -> DeltaResult { + Self::try_new(self.fields().filter(|f| predicate(f)).cloned()) + } + + /// Returns an optional [`StructType`] containing only the top-level fields for which + /// `predicate` returns `true`. + /// + /// This is a convenience wrapper around [`StructType::with_fields_filtered`] for callers + /// that treat an empty top-level struct as "no schema". + pub fn with_fields_filtered_nonempty( + &self, + predicate: impl Fn(&StructField) -> bool, + ) -> DeltaResult> { + let filtered = self.with_fields_filtered(predicate)?; + if filtered.num_fields() == 0 { + Ok(None) + } else { + Ok(Some(filtered)) + } + } + + /// Returns a StructType with the named field replaced. + /// Returns an error if field doesn't exist. + pub fn with_field_replaced( + mut self, + name: &str, + new_field: StructField, + ) -> DeltaResult { + let replace_field = self + .fields + .get_mut(name) + .ok_or_else(|| Error::generic(format!("Field {name} not found")))?; + + *replace_field = new_field; + Ok(self) + } +} + +fn write_indent(f: &mut Formatter<'_>, levels: &[bool]) -> std::fmt::Result { + let mut it = levels.iter().peekable(); + + while let Some(is_last) = it.next() { + // Final level → draw branch + if it.peek().is_none() { + write!(f, "{}", if *is_last { "└─" } else { "├─" })?; + } + // Parent levels → vertical line or empty space + else { + write!(f, "{}", if *is_last { " " } else { "│ " })?; + } + } + + Ok(()) +} + +fn write_struct_type( + st: &StructType, + f: &mut Formatter<'_>, + levels: &mut Vec, +) -> std::fmt::Result { + let len = st.fields.len(); + + for (i, (_, field)) in st.fields.iter().enumerate() { + let is_last = i + 1 == len; + levels.push(is_last); + + write_indent(f, levels)?; + writeln!(f, "{field}")?; + + field.data_type.fmt_recursive(f, levels)?; + + levels.pop(); + } + Ok(()) +} + +impl Display for StructType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + writeln!(f, "{}:", self.type_name)?; + let mut levels = Vec::new(); + write_struct_type(self, f, &mut levels) + } } impl IntoIterator for StructType { @@ -942,32 +1262,27 @@ impl DoubleEndedIterator for StructFieldRefIter<'_> { } } -#[derive(Debug, Default)] -pub(crate) struct InvariantChecker { - has_invariants: bool, -} +struct InvariantChecker(bool); impl<'a> SchemaTransform<'a> for InvariantChecker { fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { if field.has_invariants() { - self.has_invariants = true; - } else if !self.has_invariants { + self.0 = true; + } else if !self.0 { let _ = self.recurse_into_struct_field(field); } Some(Cow::Borrowed(field)) } } -impl InvariantChecker { - 
/// Checks if any column in the schema (including nested columns) has invariants defined. - /// - /// This traverses the entire schema to check for the presence of the "delta.invariants" - /// metadata key. - pub(crate) fn has_invariants(schema: &Schema) -> bool { - let mut checker = InvariantChecker::default(); - let _ = checker.transform_struct(schema); - checker.has_invariants - } +/// Checks if any column in the schema (including nested columns) has invariants defined. +/// +/// This traverses the entire schema to check for the presence of the `delta.invariants` +/// metadata key. +pub(crate) fn schema_has_invariants(schema: &Schema) -> bool { + let mut checker = InvariantChecker(false); + let _ = checker.transform_struct(schema); + checker.0 } /// Helper for RowVisitor implementations @@ -979,11 +1294,6 @@ impl ColumnNamesAndTypes { pub(crate) fn as_ref(&self) -> (&[ColumnName], &[DataType]) { (&self.0, &self.1) } - - pub(crate) fn extend(&mut self, other: ColumnNamesAndTypes) { - self.0.extend(other.0); - self.1.extend(other.1); - } } impl From<(Vec, Vec)> for ColumnNamesAndTypes { @@ -1111,7 +1421,7 @@ fn default_true() -> bool { true } -#[derive(Debug, Clone, Copy, Eq, PartialEq)] +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] pub struct DecimalType { precision: u8, scale: u8, @@ -1144,7 +1454,7 @@ impl DecimalType { } } -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[derive(Debug, Serialize, PartialEq, Clone, Eq)] #[serde(rename_all = "camelCase")] pub enum PrimitiveType { /// UTF-8 encoded string of characters @@ -1169,11 +1479,7 @@ pub enum PrimitiveType { Timestamp, #[serde(rename = "timestamp_ntz")] TimestampNtz, - #[serde( - serialize_with = "serialize_decimal", - deserialize_with = "deserialize_decimal", - untagged - )] + #[serde(serialize_with = "serialize_decimal", untagged)] Decimal(DecimalType), } @@ -1181,6 +1487,64 @@ impl PrimitiveType { pub fn decimal(precision: u8, scale: u8) -> DeltaResult { Ok(DecimalType::try_new(precision, scale)?.into()) } + + /// Returns `true` if this primitive type can be widened to the `target` type. + /// + /// Widening rules: + /// - Integer widening: byte -> short -> int -> long (Delta protocol type widening) + /// - Float widening: float -> double (Delta protocol type widening) + /// - Timestamp interchangeability: Timestamp <-> TimestampNtz (both are i64 microseconds + /// since epoch, differing only in timezone semantics; this is a physical read + /// accommodation, not a Delta protocol type widening rule) + pub(crate) fn can_widen_to(&self, target: &Self) -> bool { + use PrimitiveType::*; + matches!( + (self, target), + // Integer widening: smaller types can be read as larger ones + (Byte, Short | Integer | Long) + | (Short, Integer | Long) + | (Integer, Long) + // Float widening: float can be read as double + | (Float, Double) + // Timestamp equivalence: both are i64 microseconds since epoch, differing only + // in timezone semantics. The parquet representation is identical, so reading + // one as the other is safe at the data layer. + | (Timestamp, TimestampNtz) + | (TimestampNtz, Timestamp) + ) + } + + /// Returns `true` if `self` is a physical integer type that some checkpoint writers + /// produce when they omit Parquet logical type annotations for date or timestamp columns. 
+ /// + /// Specifically: + /// - Integer -> Date (int32 stored without DATE annotation) + /// - Long -> Timestamp/TimestampNtz (int64 stored without TIMESTAMP annotation) + /// + /// These are **not** Delta protocol type widening rules and must not be used outside of + /// checkpoint compatibility checks. + /// + /// NOTE: The Arrow-level equivalent lives in `check_cast_compat` in + /// `engine/ensure_data_types.rs`. Changes here must be mirrored there. + pub(crate) fn is_checkpoint_cast_compatible(&self, target: &Self) -> bool { + matches!( + (self, target), + (Self::Integer, Self::Date) | (Self::Long, Self::Timestamp | Self::TimestampNtz) + ) + } + + /// Returns `true` if this primitive type is compatible with `target` for reading + /// `stats_parsed` columns from checkpoint parquet files. + /// + /// This is a superset of [`can_widen_to`]: it includes all Delta protocol type widening + /// rules plus physical Parquet encoding accommodations for checkpoint interop (see + /// [`is_checkpoint_cast_compatible`]). + /// + /// [`can_widen_to`]: PrimitiveType::can_widen_to + /// [`is_checkpoint_cast_compatible`]: PrimitiveType::is_checkpoint_cast_compatible + pub(crate) fn is_stats_type_compatible_with(&self, target: &Self) -> bool { + self == target || self.can_widen_to(target) || self.is_checkpoint_cast_compatible(target) + } } fn serialize_decimal( @@ -1190,32 +1554,6 @@ fn serialize_decimal( serializer.serialize_str(&format!("decimal({},{})", dtype.precision(), dtype.scale())) } -fn deserialize_decimal<'de, D>(deserializer: D) -> Result -where - D: serde::Deserializer<'de>, -{ - let str_value = String::deserialize(deserializer)?; - require!( - str_value.starts_with("decimal(") && str_value.ends_with(')'), - serde::de::Error::custom(format!("Invalid decimal: {str_value}")) - ); - - let mut parts = str_value[8..str_value.len() - 1].split(','); - let precision = parts - .next() - .and_then(|part| part.trim().parse::().ok()) - .ok_or_else(|| { - serde::de::Error::custom(format!("Invalid precision in decimal: {str_value}")) - })?; - let scale = parts - .next() - .and_then(|part| part.trim().parse::().ok()) - .ok_or_else(|| { - serde::de::Error::custom(format!("Invalid scale in decimal: {str_value}")) - })?; - DecimalType::try_new(precision, scale).map_err(serde::de::Error::custom) -} - fn serialize_variant( _: &StructType, serializer: S, @@ -1223,21 +1561,60 @@ fn serialize_variant( serializer.serialize_str("variant") } -fn deserialize_variant<'de, D>(deserializer: D) -> Result, D::Error> -where - D: serde::Deserializer<'de>, -{ - let str_value = String::deserialize(deserializer)?; - require!( - str_value == "variant", - serde::de::Error::custom(format!("Invalid variant: {str_value}")) - ); - match DataType::unshredded_variant() { - DataType::Variant(st) => Ok(st), - _ => Err(serde::de::Error::custom( - "Issue in DataType::unshredded_variant(). Please raise an issue at ".to_string() - + "delta-io/delta-kernel-rs.", - )), +// Custom Deserialize to provide clear error messages for unsupported types. +// The derived impl would produce: "unknown variant `interval second`, expected one of ..." 
+// This impl produces: "Unsupported Delta table type: 'interval second'" +impl<'de> serde::Deserialize<'de> for PrimitiveType { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let str_value = String::deserialize(deserializer)?; + + match str_value.as_str() { + "string" => Ok(PrimitiveType::String), + "long" => Ok(PrimitiveType::Long), + "integer" => Ok(PrimitiveType::Integer), + "short" => Ok(PrimitiveType::Short), + "byte" => Ok(PrimitiveType::Byte), + "float" => Ok(PrimitiveType::Float), + "double" => Ok(PrimitiveType::Double), + "boolean" => Ok(PrimitiveType::Boolean), + "binary" => Ok(PrimitiveType::Binary), + "date" => Ok(PrimitiveType::Date), + "timestamp" => Ok(PrimitiveType::Timestamp), + "timestamp_ntz" => Ok(PrimitiveType::TimestampNtz), + decimal_str if decimal_str.starts_with("decimal(") && decimal_str.ends_with(')') => { + // Parse decimal type + let mut parts = decimal_str[8..decimal_str.len() - 1].split(','); + let precision = parts + .next() + .and_then(|part| part.trim().parse::().ok()) + .ok_or_else(|| { + serde::de::Error::custom(format!( + "Invalid precision in decimal: {decimal_str}" + )) + })?; + let scale = parts + .next() + .and_then(|part| part.trim().parse::().ok()) + .ok_or_else(|| { + serde::de::Error::custom(format!("Invalid scale in decimal: {decimal_str}")) + })?; + // Reject extra parts (e.g., decimal(10,2,99)) + if parts.next().is_some() { + return Err(serde::de::Error::custom(format!( + "Invalid decimal format (expected 2 parts): {decimal_str}" + ))); + } + DecimalType::try_new(precision, scale) + .map(PrimitiveType::Decimal) + .map_err(serde::de::Error::custom) + } + unsupported => Err(serde::de::Error::custom(format!( + "Unsupported Delta table type: '{unsupported}'" + ))), + } } } @@ -1263,7 +1640,7 @@ impl Display for PrimitiveType { } } -#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[derive(Debug, Serialize, PartialEq, Clone, Eq)] #[serde(untagged, rename_all = "camelCase")] pub enum DataType { /// UTF-8 encoded string of characters @@ -1278,10 +1655,7 @@ pub enum DataType { Map(Box), /// The Variant data type. The physical representation can be flexible to support shredded /// reads. The unshredded schema is `Variant(StructType)`. - #[serde( - serialize_with = "serialize_variant", - deserialize_with = "deserialize_variant" - )] + #[serde(serialize_with = "serialize_variant")] Variant(Box), } @@ -1324,6 +1698,61 @@ impl From for DataType { } } +// Custom Deserialize to preserve error messages from PrimitiveType. +// Serde's untagged enum only reports the last variant's error, discarding PrimitiveType's +// clear "Unsupported Delta table type: 'X'" message. We deserialize to Value first, then +// dispatch based on structure (string -> Primitive/Variant, object -> Array/Struct/Map). 
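+// Illustrative dispatch (not exhaustive): "long" parses as a primitive, "variant" as the
+// unshredded variant type, and objects such as {"type": "array", ...}, {"type": "struct", ...},
+// or {"type": "map", ...} as the corresponding complex types; anything else surfaces a
+// descriptive error.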
+impl<'de> serde::Deserialize<'de> for DataType { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + use serde::de::Error; + use serde_json::Value; + + let value = Value::deserialize(deserializer)?; + + // String values are either primitive types or "variant" + if let Value::String(s) = &value { + if s == "variant" { + return match DataType::unshredded_variant() { + DataType::Variant(st) => Ok(DataType::Variant(st)), + _ => Err(Error::custom("Failed to create variant type")), + }; + } + + // Try PrimitiveType - this will give us good error messages for unsupported types + return PrimitiveType::deserialize(value.clone()) + .map(DataType::Primitive) + .map_err(|e| Error::custom(e.to_string())); + } + + // Object values are complex types - dispatch based on "type" field + if let Value::Object(map) = &value { + if let Some(Value::String(type_str)) = map.get("type") { + return match type_str.as_str() { + "array" => ArrayType::deserialize(value) + .map(|at| DataType::Array(Box::new(at))) + .map_err(|e| Error::custom(e.to_string())), + "struct" => StructType::deserialize(value) + .map(|st| DataType::Struct(Box::new(st))) + .map_err(|e| Error::custom(e.to_string())), + "map" => MapType::deserialize(value) + .map(|mt| DataType::Map(Box::new(mt))) + .map_err(|e| Error::custom(e.to_string())), + _ => Err(Error::custom(format!("Unknown complex type: '{type_str}'"))), + }; + } + } + + // Fallback error with the actual value that failed + Err(Error::custom(format!( + "Invalid data type: {}", + serde_json::to_string(&value).unwrap_or_else(|_| format!("{value:?}")) + ))) + } +} + /// cbindgen:ignore impl DataType { pub const STRING: Self = DataType::Primitive(PrimitiveType::String); @@ -1396,6 +1825,40 @@ impl DataType { _ => None, } } + + fn fmt_recursive(&self, f: &mut Formatter<'_>, levels: &mut Vec) -> std::fmt::Result { + match self { + DataType::Struct(inner) => write_struct_type(inner, f, levels), + + DataType::Array(inner) => { + levels.push(true); // only one child → last + write_indent(f, levels)?; + writeln!(f, "array_element: {}", inner.element_type)?; + inner.element_type.fmt_recursive(f, levels)?; + levels.pop(); + Ok(()) + } + + DataType::Map(inner) => { + // key + levels.push(false); // map_key is NOT last + write_indent(f, levels)?; + writeln!(f, "map_key: {}", inner.key_type)?; + inner.key_type.fmt_recursive(f, levels)?; + levels.pop(); + + // value + levels.push(true); // map_value IS last at this level + write_indent(f, levels)?; + writeln!(f, "map_value: {}", inner.value_type)?; + inner.value_type.fmt_recursive(f, levels)?; + levels.pop(); + Ok(()) + } + + _ => Ok(()), + } + } } impl Display for DataType { @@ -1419,271 +1882,129 @@ impl Display for DataType { } } -/// Generic framework for describing recursive bottom-up schema transforms. Transformations return -/// `Option` with the following semantics: -/// * `Some(Cow::Owned)` -- The schema element was transformed and should propagate to its parent. -/// * `Some(Cow::Borrowed)` -- The schema element was not transformed. -/// * `None` -- The schema element was filtered out and the parent should no longer reference it. -/// -/// The transform can start from whatever schema element is available -/// (e.g. [`Self::transform_struct`] to start with [`StructType`]), or it can start from the generic -/// [`Self::transform`]. 
-/// -/// The provided `transform_xxx` methods all default to no-op, and implementations should -/// selectively override specific `transform_xxx` methods as needed for the task at hand. -/// -/// The provided `recurse_into_xxx` methods encapsulate the boilerplate work of recursing into the -/// child schema elements of each schema element. Implementations can call these as needed but will -/// generally not need to override them. -pub trait SchemaTransform<'a> { - /// Called for each primitive encountered during the schema traversal. - fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { - Some(Cow::Borrowed(ptype)) +struct GetSchemaLeaves { + path: Vec, + names: Vec, + types: Vec, +} +impl GetSchemaLeaves { + fn new(own_name: Option<&str>) -> Self { + Self { + path: own_name.into_iter().map(|s| s.to_string()).collect(), + names: vec![], + types: vec![], + } } +} - /// Called for each struct encountered during the schema traversal. Implementations can call - /// [`Self::recurse_into_struct`] if they wish to recursively transform the struct's fields. - fn transform_struct(&mut self, stype: &'a StructType) -> Option> { - self.recurse_into_struct(stype) +impl<'a> SchemaTransform<'a> for GetSchemaLeaves { + fn transform_struct_field(&mut self, field: &StructField) -> Option> { + self.path.push(field.name.clone()); + if let DataType::Struct(_) = field.data_type { + self.recurse_into_struct_field(field); + } else { + self.names.push(ColumnName::new(&self.path)); + self.types.push(field.data_type.clone()); + } + self.path.pop(); + None } +} - /// Called for each struct field encountered during the schema traversal. Implementations can - /// call [`Self::recurse_into_struct_field`] if they wish to recursively transform the field's - /// data type. - fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - self.recurse_into_struct_field(field) +struct MakePhysical<'a> { + column_mapping_mode: ColumnMappingMode, + path: Vec<&'a str>, + seen: HashMap, + err: Option, +} +impl<'a> MakePhysical<'a> { + fn new(column_mapping_mode: ColumnMappingMode) -> Self { + Self { + column_mapping_mode, + path: vec![], + seen: HashMap::new(), + err: None, + } } - /// Called for each array encountered during the schema traversal. Implementations can call - /// [`Self::recurse_into_array`] if they wish to recursively transform the array's element type. - fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { - self.recurse_into_array(atype) + /// Transforms a single [`StructField`] from logical to physical. Returns the physical + /// field on success, or the first error encountered during the recursive transformation. + fn run_field(&mut self, field: &'a StructField) -> DeltaResult { + let result = self.transform_struct_field(field); + match (self.err.take(), result) { + (Some(err), _) => Err(err), + // Theoretically impossible: MakePhysical only returns None when it sets an error + (None, None) => Err(Error::internal_error( + "make_physical: transform returned None without setting an error", + )), + (None, Some(field)) => Ok(field.into_owned()), + } } - /// Called for each array element encountered during the schema traversal. Implementations can - /// call [`Self::transform`] if they wish to recursively transform the array element type. - fn transform_array_element(&mut self, etype: &'a DataType) -> Option> { - self.transform(etype) - } - - /// Called for each map encountered during the schema traversal. 
Implementations can call - /// [`Self::recurse_into_map`] if they wish to recursively transform the map's key and/or value - /// types. - fn transform_map(&mut self, mtype: &'a MapType) -> Option> { - self.recurse_into_map(mtype) - } - - /// Called for each map key encountered during the schema traversal. Implementations can call - /// [`Self::transform`] if they wish to recursively transform the map key type. - fn transform_map_key(&mut self, etype: &'a DataType) -> Option> { - self.transform(etype) - } - - /// Called for each map value encountered during the schema traversal. Implementations can call - /// [`Self::transform`] if they wish to recursively transform the map value type. - fn transform_map_value(&mut self, etype: &'a DataType) -> Option> { - self.transform(etype) - } - - /// Called for each variant value encountered. By default, recurses into the fields of the - /// variant struct type. - fn transform_variant(&mut self, stype: &'a StructType) -> Option> { - self.recurse_into_struct(stype) - } - - /// General entry point for a recursive traversal over any data type. Also invoked internally to - /// dispatch on nested data types encountered during the traversal. - fn transform(&mut self, data_type: &'a DataType) -> Option> { - use DataType::*; - let result = match data_type { - Primitive(ptype) => self - .transform_primitive(ptype)? - .map_owned_or_else(data_type, DataType::from), - Array(atype) => self - .transform_array(atype)? - .map_owned_or_else(data_type, DataType::from), - Struct(stype) => self - .transform_struct(stype)? - .map_owned_or_else(data_type, DataType::from), - Map(mtype) => self - .transform_map(mtype)? - .map_owned_or_else(data_type, DataType::from), - Variant(stype) => self - .transform_variant(stype)? - .map_owned_or_else(data_type, |s| DataType::Variant(Box::new(s))), - }; - Some(result) - } - - /// Recursively transforms a struct field's data type. If the data type changes, update the - /// field to reference it. Otherwise, no-op. - fn recurse_into_struct_field( + fn transform_inner( &mut self, - field: &'a StructField, - ) -> Option> { - let result = self.transform(&field.data_type)?; - let f = |new_data_type| StructField { - name: field.name.clone(), - data_type: new_data_type, - nullable: field.nullable, - metadata: field.metadata.clone(), - }; - Some(result.map_owned_or_else(field, f)) - } - - /// Recursively transforms a struct's fields. If one or more fields were changed or removed, - /// update the struct to reference all surviving fields. Otherwise, no-op. - fn recurse_into_struct(&mut self, stype: &'a StructType) -> Option> { - use Cow::*; - let mut num_borrowed = 0; - let fields: Vec<_> = stype - .fields() - .filter_map(|field| self.transform_struct_field(field)) - .inspect(|field| { - if let Borrowed(_) = field { - num_borrowed += 1; - } - }) - .collect(); - - if fields.is_empty() { - None - } else if num_borrowed < stype.fields.len() { - // At least one field was changed or filtered out, so make a new struct - Some(Owned(StructType::new_unchecked( - fields.into_iter().map(|f| f.into_owned()), - ))) - } else { - Some(Borrowed(stype)) - } - } - - /// Recursively transforms an array's element type. If the element type changes, update the - /// array to reference it. Otherwise, no-op. 
- fn recurse_into_array(&mut self, atype: &'a ArrayType) -> Option> { - let result = self.transform_array_element(&atype.element_type)?; - let f = |element_type| ArrayType { - type_name: atype.type_name.clone(), - element_type, - contains_null: atype.contains_null, - }; - Some(result.map_owned_or_else(atype, f)) - } - - /// Recursively transforms a map's key and value types. If either one changes, update the map to - /// reference them. If either one is removed, remove the map as well. Otherwise, no-op. - fn recurse_into_map(&mut self, mtype: &'a MapType) -> Option> { - let key_type = self.transform_map_key(&mtype.key_type)?; - let value_type = self.transform_map_value(&mtype.value_type)?; - let f = |(key_type, value_type)| MapType { - type_name: mtype.type_name.clone(), - key_type, - value_type, - value_contains_null: mtype.value_contains_null, - }; - Some((key_type, value_type).map_owned_or_else(mtype, f)) - } -} - -struct GetSchemaLeaves { - path: Vec, - names: Vec, - types: Vec, -} -impl GetSchemaLeaves { - fn new(own_name: Option<&str>) -> Self { - Self { - path: own_name.into_iter().map(|s| s.to_string()).collect(), - names: vec![], - types: vec![], - } - } -} - -impl<'a> SchemaTransform<'a> for GetSchemaLeaves { - fn transform_struct_field(&mut self, field: &StructField) -> Option> { - self.path.push(field.name.clone()); - if let DataType::Struct(_) = field.data_type { - let _ = self.recurse_into_struct_field(field); - } else { - self.names.push(ColumnName::new(&self.path)); - self.types.push(field.data_type.clone()); + field_name: &'a str, + transform: impl FnOnce(&mut Self) -> Option, + ) -> Option { + if self.err.is_some() { + return None; } + self.path.push(field_name); + let result = transform(self); self.path.pop(); - None + result } } - -/// A schema "transform" that doesn't actually change the schema at all. Instead, it measures the -/// maximum depth of a schema, with a depth limit to prevent stack overflow. Useful for verifying -/// that a schema has reasonable depth before attempting to work with it. -pub struct SchemaDepthChecker { - depth_limit: usize, - max_depth_seen: usize, - current_depth: usize, - call_count: usize, -} -impl SchemaDepthChecker { - /// Depth-checks the given data type against a given depth limit. The return value is the - /// largest depth seen, which is capped at one more than the depth limit (indicating the - /// recursion was terminated). - pub fn check(data_type: &DataType, depth_limit: usize) -> usize { - Self::check_with_call_count(data_type, depth_limit).0 - } - - // Exposed for testing - fn check_with_call_count(data_type: &DataType, depth_limit: usize) -> (usize, usize) { - let mut checker = Self { - depth_limit, - max_depth_seen: 0, - current_depth: 0, - call_count: 0, - }; - checker.transform(data_type); - (checker.max_depth_seen, checker.call_count) +impl<'a> SchemaTransform<'a> for MakePhysical<'a> { + fn transform_array_element(&mut self, etype: &'a DataType) -> Option> { + self.transform_inner("", |this| this.transform(etype)) } - - // Triggers the requested recursion only doing so would not exceed the depth limit. 
- fn depth_limited<'a, T: Clone + std::fmt::Debug>( - &mut self, - recurse: impl FnOnce(&mut Self, &'a T) -> Option>, - arg: &'a T, - ) -> Option> { - self.call_count += 1; - if self.max_depth_seen < self.current_depth { - self.max_depth_seen = self.current_depth; - if self.depth_limit < self.current_depth { - tracing::warn!("Max schema depth {} exceeded by {arg:?}", self.depth_limit); - } - } - if self.max_depth_seen <= self.depth_limit { - self.current_depth += 1; - let _ = recurse(self, arg); - self.current_depth -= 1; - } - None + fn transform_map_key(&mut self, ktype: &'a DataType) -> Option> { + self.transform_inner("", |this| this.transform(ktype)) } -} -impl<'a> SchemaTransform<'a> for SchemaDepthChecker { - fn transform_struct(&mut self, stype: &'a StructType) -> Option> { - self.depth_limited(Self::recurse_into_struct, stype) + fn transform_map_value(&mut self, vtype: &'a DataType) -> Option> { + self.transform_inner("", |this| this.transform(vtype)) } fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - self.depth_limited(Self::recurse_into_struct_field, field) - } - fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { - self.depth_limited(Self::recurse_into_array, atype) + self.transform_inner(field.name(), |this| { + let (physical_name, _id) = get_field_column_mapping_info( + field, + this.column_mapping_mode, + &this.path, + Some(&mut this.seen), + ) + .map_err(|e| this.err = Some(e)) + .ok()?; + + if field.is_metadata_column() { + return Some(Cow::Borrowed(field)); + } + + let field = this.recurse_into_struct_field(field)?; + + let metadata = field.logical_to_physical_metadata(this.column_mapping_mode); + let name = physical_name.to_owned(); + + Some(Cow::Owned(field.with_name(name).with_metadata(metadata))) + }) } - fn transform_map(&mut self, mtype: &'a MapType) -> Option> { - self.depth_limited(Self::recurse_into_map, mtype) + + fn transform_variant(&mut self, stype: &'a StructType) -> Option> { + // There is no column mapping metadata inside the struct fields of a variant, so + // we do not recurse into the variant fields + Some(Cow::Borrowed(stype)) } } #[cfg(test)] mod tests { - use crate::utils::test_utils::assert_result_error_with_message; + use crate::table_features::ColumnMappingMode; + use crate::utils::test_utils::{ + assert_result_error_with_message, test_deep_nested_schema_missing_leaf_cm, + }; use super::*; + use rstest::rstest; use serde_json; fn example_schema_metadata() -> &'static str { @@ -1852,25 +2173,153 @@ mod tests { } } + #[rstest] + #[case("interval second")] + #[case("interval day")] + #[case("money")] + fn test_unsupported_type_error_message(#[case] unsupported_type: &str) { + let data = format!( + r#"{{ + "name": "test_field", + "type": "{unsupported_type}", + "nullable": false, + "metadata": {{}} + }}"# + ); + let result: Result = serde_json::from_str(&data); + assert!(result.is_err()); + let err = result.unwrap_err(); + let expected_msg = format!("Unsupported Delta table type: '{unsupported_type}'"); + assert!( + err.to_string().contains(&expected_msg), + "Expected error message about unsupported type '{unsupported_type}', got: {err}" + ); + } + + #[rstest] + #[case("string", DataType::STRING)] + #[case("long", DataType::LONG)] + #[case("integer", DataType::INTEGER)] + #[case("short", DataType::SHORT)] + #[case("byte", DataType::BYTE)] + #[case("float", DataType::FLOAT)] + #[case("double", DataType::DOUBLE)] + #[case("boolean", DataType::BOOLEAN)] + #[case("binary", DataType::BINARY)] + #[case("date", 
DataType::DATE)] + #[case("timestamp", DataType::TIMESTAMP)] + #[case("timestamp_ntz", DataType::TIMESTAMP_NTZ)] + fn test_primitive_type_deserialization_still_works( + #[case] type_str: &str, + #[case] expected_type: DataType, + ) { + let data = format!( + r#"{{ + "name": "test_field", + "type": "{type_str}", + "nullable": false, + "metadata": {{}} + }}"# + ); + let field: StructField = serde_json::from_str(&data).unwrap(); + assert_eq!(field.data_type, expected_type); + } + + #[rstest] + #[case(10, 2)] + #[case(16, 4)] + #[case(38, 10)] + fn test_decimal_with_primitive_deserializer(#[case] precision: u8, #[case] scale: u8) { + let data = format!( + r#"{{ + "name": "test_decimal", + "type": "decimal({precision},{scale})", + "nullable": false, + "metadata": {{}} + }}"# + ); + let field: StructField = serde_json::from_str(&data).unwrap(); + assert_eq!( + field.data_type, + DataType::decimal(precision, scale).unwrap() + ); + } + + #[rstest] + #[case("decimal(invalid)", "Invalid precision in decimal")] + #[case("decimal(10)", "Invalid scale in decimal")] + #[case("decimal()", "Invalid precision in decimal")] + #[case("decimal(10,2,99)", "Invalid decimal format (expected 2 parts)")] + fn test_invalid_decimal_format(#[case] invalid_type: &str, #[case] expected_error: &str) { + let data = format!( + r#"{{ + "name": "invalid", + "type": "{invalid_type}", + "nullable": false, + "metadata": {{}} + }}"# + ); + let result: Result = serde_json::from_str(&data); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!( + err.to_string().contains(expected_error), + "Expected error containing '{expected_error}', got: {err}" + ); + } + + #[rstest] + #[case( + r#"{"type": "array", "elementType": "integer", "containsNull": false}"#, + DataType::Array(Box::new(ArrayType::new(DataType::INTEGER, false))) + )] + #[case( + r#"{"type": "struct", "fields": [{"name": "a", "type": "integer", "nullable": false, "metadata": {}}, {"name": "b", "type": "string", "nullable": true, "metadata": {}}]}"#, + DataType::Struct(Box::new(StructType::new_unchecked([ + StructField::new("a", DataType::INTEGER, false), + StructField::new("b", DataType::STRING, true), + ]))) + )] + #[case( + r#"{"type": "map", "keyType": "string", "valueType": "integer", "valueContainsNull": true}"#, + DataType::Map(Box::new(MapType::new(DataType::STRING, DataType::INTEGER, true))) + )] + #[case("\"string\"", DataType::STRING)] + #[case("\"long\"", DataType::LONG)] + #[case("\"integer\"", DataType::INTEGER)] + #[case("\"short\"", DataType::SHORT)] + #[case("\"byte\"", DataType::BYTE)] + #[case("\"float\"", DataType::FLOAT)] + #[case("\"double\"", DataType::DOUBLE)] + #[case("\"boolean\"", DataType::BOOLEAN)] + #[case("\"binary\"", DataType::BINARY)] + #[case("\"date\"", DataType::DATE)] + #[case("\"timestamp\"", DataType::TIMESTAMP)] + #[case("\"timestamp_ntz\"", DataType::TIMESTAMP_NTZ)] + #[case("\"variant\"", DataType::unshredded_variant())] + fn test_data_type_deserialization(#[case] type_json: &str, #[case] expected: DataType) { + let data_type: DataType = serde_json::from_str(type_json).unwrap(); + assert_eq!(data_type, expected); + } + #[test] fn test_make_physical_no_column_mapping() { - let data = example_schema_metadata(); - let field: StructField = serde_json::from_str(data).unwrap(); - let physical_field = field.make_physical(ColumnMappingMode::None); - - let assert_field_metadata_is_wiped = |field: &StructField| { - assert!(field - .get_config_value(&ColumnMetadataKey::ColumnMappingId) - .is_none()); - assert!(field - 
.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) - .is_none()); - assert!(field - .get_config_value(&ColumnMetadataKey::ParquetFieldId) - .is_none()); - }; + let field = StructField::nullable( + "e", + ArrayType::new( + StructType::new_unchecked([StructField::not_null("d", DataType::INTEGER)]).into(), + true, + ), + ); + let physical_field = field.make_physical(ColumnMappingMode::None).unwrap(); + assert_eq!(physical_field.name, "e"); - assert_field_metadata_is_wiped(&physical_field); + assert!(physical_field + .get_config_value(&ColumnMetadataKey::ColumnMappingId) + .is_none()); + assert!(physical_field + .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + .is_none()); let DataType::Array(atype) = physical_field.data_type else { panic!("Expected an Array"); @@ -1880,7 +2329,63 @@ mod tests { }; let struct_field = stype.fields.get_index(0).unwrap().1; assert_eq!(struct_field.name, "d"); - assert_field_metadata_is_wiped(struct_field); + } + + #[test] + fn test_make_physical_rejects_annotated_fields_when_column_mapping_disabled() { + let data = example_schema_metadata(); + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(field.make_physical(ColumnMappingMode::None).is_err()); + } + + #[test] + fn test_make_physical_rejects_unannotated_leaf_in_deep_nesting() { + let schema = test_deep_nested_schema_missing_leaf_cm(); + let field = schema.fields().next().unwrap(); + let err = field + .make_physical(ColumnMappingMode::Name) + .unwrap_err() + .to_string(); + assert!( + err.contains("top.``.mid_field.``.leaf"), + "Expected full nested path in error, got: {err}" + ); + } + + #[test] + fn test_make_physical_rejects_duplicate_column_mapping_ids() { + use crate::schema::ColumnMetadataKey; + + fn cm_field(name: &str, id: i64, data_type: impl Into) -> StructField { + StructField::not_null(name, data_type).with_metadata([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(id), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String(format!("col-{name}")), + ), + ]) + } + + let inner = StructType::new_unchecked([ + cm_field("x", 3, DataType::INTEGER), + cm_field("y", 4, DataType::STRING), + ]); + let schema = StructType::new_unchecked([ + cm_field("a", 1, DataType::INTEGER), + cm_field( + "b", + 2, + ArrayType::new(DataType::Struct(Box::new(inner)), true), + ), + cm_field("c", 3, DataType::STRING), + ]); + assert_result_error_with_message( + schema.make_physical(ColumnMappingMode::Id), + "Duplicate column mapping ID", + ); } #[test] @@ -1901,10 +2406,10 @@ mod tests { assert!(matches!(col_id, MetadataValue::Number(num) if *num == 4)); assert!(matches!(id_start, MetadataValue::Number(num) if *num == 2147483648i64)); assert_eq!( - field.physical_name(), + field.physical_name(mode), "col-5f422f40-de70-45b2-88ab-1d5c90e94db1" ); - let physical_field = field.make_physical(mode); + let physical_field = field.make_physical(mode).unwrap(); // Parquet field id should only be present in id column mapping mode match mode { @@ -1920,12 +2425,14 @@ mod tests { )); } ColumnMappingMode::Name => { - assert!(physical_field - .get_config_value(&ColumnMetadataKey::ParquetFieldId) - .is_none()); - assert!(physical_field - .get_config_value(&ColumnMetadataKey::ColumnMappingId) - .is_none(),); + assert!(matches!( + physical_field.get_config_value(&ColumnMetadataKey::ParquetFieldId), + Some(MetadataValue::Number(4)) + )); + assert!(matches!( + physical_field.get_config_value(&ColumnMetadataKey::ColumnMappingId), + 
Some(MetadataValue::Number(4)) + )); } ColumnMappingMode::None => panic!("unexpected column mapping mode"), } @@ -1960,18 +2467,53 @@ mod tests { )); } ColumnMappingMode::Name => { - assert!(struct_field - .get_config_value(&ColumnMetadataKey::ParquetFieldId) - .is_none()); - assert!(struct_field - .get_config_value(&ColumnMetadataKey::ColumnMappingId) - .is_none()); + assert!(matches!( + struct_field.get_config_value(&ColumnMetadataKey::ParquetFieldId), + Some(MetadataValue::Number(5)) + )); + assert!(matches!( + struct_field.get_config_value(&ColumnMetadataKey::ColumnMappingId), + Some(MetadataValue::Number(5)) + )); } ColumnMappingMode::None => panic!("unexpected column mapping mode"), } }); } + #[test] + fn test_make_physical_passes_metadata_column_through() { + let field = StructField::create_metadata_column( + "_metadata.row_index", + MetadataColumnSpec::RowIndex, + ); + for mode in [ + ColumnMappingMode::None, + ColumnMappingMode::Name, + ColumnMappingMode::Id, + ] { + let physical = field.make_physical(mode).unwrap(); + assert_eq!(physical.name(), "_metadata.row_index"); + assert!(physical.is_metadata_column()); + } + } + + #[test] + fn test_make_physical_rejects_metadata_column_with_cm_annotations() { + let field = StructField::create_metadata_column( + "_metadata.row_index", + MetadataColumnSpec::RowIndex, + ) + .add_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String("phys".to_string()), + )]); + assert_result_error_with_message( + field.make_physical(ColumnMappingMode::Name), + "must not have column mapping annotations", + ); + } + #[test] fn test_read_schemas() { let file = std::fs::File::open("./tests/serde/schema.json").unwrap(); @@ -2006,97 +2548,6 @@ mod tests { assert!(serde_json::from_str::(data).is_err()); } - #[test] - fn test_depth_checker() { - let schema = DataType::try_struct_type([ - StructField::nullable( - "a", - ArrayType::new( - DataType::try_struct_type([ - StructField::nullable("w", DataType::LONG), - StructField::nullable("x", ArrayType::new(DataType::LONG, true)), - StructField::nullable( - "y", - MapType::new(DataType::LONG, DataType::STRING, true), - ), - StructField::nullable( - "z", - DataType::try_struct_type([ - StructField::nullable("n", DataType::LONG), - StructField::nullable("m", DataType::STRING), - ]) - .unwrap(), - ), - ]) - .unwrap(), - true, - ), - ), - StructField::nullable( - "b", - DataType::try_struct_type([ - StructField::nullable("o", ArrayType::new(DataType::LONG, true)), - StructField::nullable( - "p", - MapType::new(DataType::LONG, DataType::STRING, true), - ), - StructField::nullable( - "q", - DataType::try_struct_type([ - StructField::nullable( - "s", - DataType::try_struct_type([ - StructField::nullable("u", DataType::LONG), - StructField::nullable("v", DataType::LONG), - ]) - .unwrap(), - ), - StructField::nullable("t", DataType::LONG), - ]) - .unwrap(), - ), - StructField::nullable("r", DataType::LONG), - ]) - .unwrap(), - ), - StructField::nullable( - "c", - MapType::new( - DataType::LONG, - DataType::try_struct_type([ - StructField::nullable("f", DataType::LONG), - StructField::nullable("g", DataType::STRING), - ]) - .unwrap(), - true, - ), - ), - ]) - .unwrap(); - - // Similar to SchemaDepthChecker::check, but also returns call count - let check_with_call_count = - |depth_limit| SchemaDepthChecker::check_with_call_count(&schema, depth_limit); - - // Hit depth limit at "a" but still have to look at "b" "c" "d" - assert_eq!(check_with_call_count(1), (2, 5)); - 
assert_eq!(check_with_call_count(2), (3, 6)); - - // Hit depth limit at "w" but still have to look at "x" "y" "z" - assert_eq!(check_with_call_count(3), (4, 10)); - assert_eq!(check_with_call_count(4), (5, 11)); - - // Depth limit hit at "n" but still have to look at "m" - assert_eq!(check_with_call_count(5), (6, 15)); - - // Depth limit not hit until "u" - assert_eq!(check_with_call_count(6), (7, 28)); - - // Depth limit not hit (full traversal required) - assert_eq!(check_with_call_count(7), (7, 32)); - assert_eq!(check_with_call_count(8), (7, 32)); - } - #[test] fn test_metadata_value_to_string() { assert_eq!(MetadataValue::Number(0).to_string(), "0"); @@ -2145,7 +2596,7 @@ mod tests { StructField::nullable("a", DataType::STRING), StructField::nullable("b", DataType::INTEGER), ]); - assert!(!InvariantChecker::has_invariants(&schema)); + assert!(!schema_has_invariants(&schema)); // Schema with top-level invariant let mut field = StructField::nullable("c", DataType::STRING); @@ -2156,7 +2607,7 @@ mod tests { let schema = StructType::new_unchecked([StructField::nullable("a", DataType::STRING), field]); - assert!(InvariantChecker::has_invariants(&schema)); + assert!(schema_has_invariants(&schema)); // Schema with nested invariant in a struct let nested_field = StructField::nullable( @@ -2177,7 +2628,7 @@ mod tests { StructField::nullable("b", DataType::INTEGER), nested_field, ]); - assert!(InvariantChecker::has_invariants(&schema)); + assert!(schema_has_invariants(&schema)); // Schema with nested invariant in an array of structs let array_field = StructField::nullable( @@ -2201,7 +2652,7 @@ mod tests { StructField::nullable("b", DataType::INTEGER), array_field, ]); - assert!(InvariantChecker::has_invariants(&schema)); + assert!(schema_has_invariants(&schema)); // Schema with nested invariant in a map value that's a struct let map_field = StructField::nullable( @@ -2226,7 +2677,7 @@ mod tests { StructField::nullable("b", DataType::INTEGER), map_field, ]); - assert!(InvariantChecker::has_invariants(&schema)); + assert!(schema_has_invariants(&schema)); } #[test] @@ -2415,7 +2866,7 @@ mod tests { 1 => assert_eq!(field.name, "required_int"), 2 => assert_eq!(field.name, "nullable_bool"), 3 => assert_eq!(field.name, "required_long"), - _ => panic!("Unexpected field index: {}", index), + _ => panic!("Unexpected field index: {index}"), } } } @@ -2620,6 +3071,7 @@ mod tests { MetadataColumnSpec::RowCommitVersion.text_value(), "row_commit_version" ); + assert_eq!(MetadataColumnSpec::FilePath.text_value(), "_file"); // Test data_type assert_eq!(MetadataColumnSpec::RowIndex.data_type(), DataType::LONG); @@ -2628,11 +3080,25 @@ mod tests { MetadataColumnSpec::RowCommitVersion.data_type(), DataType::LONG ); + assert_eq!(MetadataColumnSpec::FilePath.data_type(), DataType::STRING); // Test nullable assert!(!MetadataColumnSpec::RowIndex.nullable()); assert!(!MetadataColumnSpec::RowId.nullable()); assert!(!MetadataColumnSpec::RowCommitVersion.nullable()); + assert!(!MetadataColumnSpec::FilePath.nullable()); + + // Test reserved_field_id + assert_eq!(MetadataColumnSpec::RowIndex.reserved_field_id(), None); + assert_eq!(MetadataColumnSpec::RowId.reserved_field_id(), None); + assert_eq!( + MetadataColumnSpec::RowCommitVersion.reserved_field_id(), + None + ); + assert_eq!( + MetadataColumnSpec::FilePath.reserved_field_id(), + Some(crate::reserved_field_ids::FILE_NAME) + ); // Test from_str assert_eq!( @@ -2647,6 +3113,10 @@ mod tests { MetadataColumnSpec::from_str("row_commit_version")?, 
MetadataColumnSpec::RowCommitVersion ); + assert_eq!( + MetadataColumnSpec::from_str("_file")?, + MetadataColumnSpec::FilePath + ); // Test invalid from_str assert!(MetadataColumnSpec::from_str("invalid").is_err()); @@ -2728,6 +3198,26 @@ mod tests { Ok(()) } + #[test] + fn test_duplicate_field_name_case_insensitive() { + // Delta column names are case-insensitive per protocol; (Value, value) is invalid + let result = StructType::try_new([ + StructField::nullable("Value", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ]); + assert_result_error_with_message(result, "Duplicate field name (case-insensitive)"); + } + + #[test] + fn test_duplicate_field_name_exact() { + // Exact duplicate (same name twice) is rejected via the case-insensitive check + let result = StructType::try_new([ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("id", DataType::STRING), + ]); + assert_result_error_with_message(result, "Duplicate field name (case-insensitive)"); + } + #[test] fn test_nested_metadata_columns_validation_struct() -> DeltaResult<()> { // Test that metadata columns in nested structs are rejected @@ -2887,4 +3377,440 @@ mod tests { ); Ok(()) } + + #[test] + fn test_physical_name_with_mode_none() { + let field_json = r#"{ + "name": "logical_name", + "type": "string", + "nullable": true, + "metadata": { + "delta.columnMapping.physicalName": "physical_name_col123" + } + }"#; + let field: StructField = serde_json::from_str(field_json).unwrap(); + + // With ColumnMappingMode::None, should return logical name even though physical name exists + assert_eq!(field.physical_name(ColumnMappingMode::None), "logical_name"); + } + + #[test] + fn test_physical_name_with_mode_id() { + let field_json = r#"{ + "name": "logical_name", + "type": "string", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 5, + "delta.columnMapping.physicalName": "physical_name_col123" + } + }"#; + let field: StructField = serde_json::from_str(field_json).unwrap(); + + // With ColumnMappingMode::Id, should return physical name + assert_eq!( + field.physical_name(ColumnMappingMode::Id), + "physical_name_col123" + ); + } + + #[test] + fn test_physical_name_with_mode_name() { + let field_json = r#"{ + "name": "logical_name", + "type": "string", + "nullable": true, + "metadata": { + "delta.columnMapping.physicalName": "physical_name_col456" + } + }"#; + let field: StructField = serde_json::from_str(field_json).unwrap(); + + // With ColumnMappingMode::Name, should return physical name + assert_eq!( + field.physical_name(ColumnMappingMode::Name), + "physical_name_col456" + ); + } + + #[test] + fn test_physical_name_fallback_id() { + let field_json = r#"{ + "name": "logical_name", + "type": "string", + "nullable": true, + "metadata": {} + }"#; + let field: StructField = serde_json::from_str(field_json).unwrap(); + + // With ColumnMappingMode::Id but no physical name, should fallback to logical name + assert_eq!(field.physical_name(ColumnMappingMode::Id), "logical_name"); + } + + #[test] + fn test_physical_name_fallback_name() { + let field_json = r#"{ + "name": "logical_name", + "type": "string", + "nullable": true, + "metadata": {} + }"#; + let field: StructField = serde_json::from_str(field_json).unwrap(); + + // With ColumnMappingMode::Name but no physical name, should fallback to logical name + assert_eq!(field.physical_name(ColumnMappingMode::Name), "logical_name"); + } + + #[test] + fn test_display_struct_type_stable_output() -> DeltaResult<()> { + let 
nested_field_with_metadata = + StructField::create_metadata_column("nested_row_index", MetadataColumnSpec::RowIndex); + let inner_struct = + StructType::new_unchecked([StructField::new("q", DataType::LONG, false)]); + let nested_struct = StructType::new_unchecked([ + nested_field_with_metadata, + StructField::new("x", DataType::DOUBLE, true), + StructField::new( + "inner_struct", + DataType::Struct(Box::new(inner_struct)), + false, + ), + ]); + let array_type = ArrayType::new(DataType::Struct(Box::new(nested_struct.clone())), true); + let map_type = MapType::new( + DataType::Struct(Box::new(nested_struct.clone())), + DataType::Struct(Box::new(nested_struct.clone())), // kek + true, + ); + let fields = vec![ + StructField::new("x", DataType::DOUBLE, true), + StructField::new("y", DataType::FLOAT, false), + StructField::new("z", DataType::LONG, true), + StructField::new("s", nested_struct.clone(), false), + StructField::nullable("array_col", DataType::Array(Box::new(array_type))), + StructField::nullable("map_col", DataType::Map(Box::new(map_type))), + StructField::new("a", DataType::LONG, true), + ]; + + let struct_type = StructType::new_unchecked(fields); + assert_eq!( + struct_type.to_string(), + "struct: +├─x: double (is nullable: true, metadata: {}) +├─y: float (is nullable: false, metadata: {}) +├─z: long (is nullable: true, metadata: {}) +├─s: struct> (is nullable: false, metadata: {}) +│ ├─nested_row_index: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_index\")}) +│ ├─x: double (is nullable: true, metadata: {}) +│ └─inner_struct: struct (is nullable: false, metadata: {}) +│ └─q: long (is nullable: false, metadata: {}) +├─array_col: array>> (is nullable: true, metadata: {}) +│ └─array_element: struct> +│ ├─nested_row_index: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_index\")}) +│ ├─x: double (is nullable: true, metadata: {}) +│ └─inner_struct: struct (is nullable: false, metadata: {}) +│ └─q: long (is nullable: false, metadata: {}) +├─map_col: map>, struct>> (is nullable: true, metadata: {}) +│ ├─map_key: struct> +│ │ ├─nested_row_index: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_index\")}) +│ │ ├─x: double (is nullable: true, metadata: {}) +│ │ └─inner_struct: struct (is nullable: false, metadata: {}) +│ │ └─q: long (is nullable: false, metadata: {}) +│ └─map_value: struct> +│ ├─nested_row_index: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_index\")}) +│ ├─x: double (is nullable: true, metadata: {}) +│ └─inner_struct: struct (is nullable: false, metadata: {}) +│ └─q: long (is nullable: false, metadata: {}) +└─a: long (is nullable: true, metadata: {}) +" + ); + + let schema = StructType::try_new([StructField::nullable("regular_col", DataType::STRING)])?; + let schema = schema + .add_metadata_column("row_index", MetadataColumnSpec::RowIndex)? + .add_metadata_column("row_id", MetadataColumnSpec::RowId)? 
+ .add_metadata_column("row_commit_version", MetadataColumnSpec::RowCommitVersion)?; + assert_eq!(schema.to_string(), "struct: +├─regular_col: string (is nullable: true, metadata: {}) +├─row_index: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_index\")}) +├─row_id: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_id\")}) +└─row_commit_version: long (is nullable: false, metadata: {delta.metadataSpec: String(\"row_commit_version\")}) +"); + Ok(()) + } + + #[test] + fn test_builder_empty() { + let schema = StructType::builder().build().unwrap(); + assert_eq!(schema.num_fields(), 0) + } + + #[test] + fn test_builder_add_fields() { + let schema = StructType::builder() + .add_field(StructField::new("id", DataType::INTEGER, false)) + .add_field(StructField::new("name", DataType::STRING, true)) + .build() + .unwrap(); + + assert_eq!(schema.num_fields(), 2); + assert_eq!(schema.field_at_index(0).unwrap().name(), "id"); + assert_eq!(schema.field_at_index(1).unwrap().name(), "name"); + } + + #[test] + fn test_builder_from_schema() { + let base_schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + + let extended_schema = StructTypeBuilder::from_schema(&base_schema) + .add_field(StructField::new("name", DataType::STRING, true)) + .build() + .unwrap(); + + assert_eq!(extended_schema.num_fields(), 2); + assert_eq!(extended_schema.field_at_index(0).unwrap().name(), "id"); + assert_eq!(extended_schema.field_at_index(1).unwrap().name(), "name"); + } + + #[test] + fn test_parquet_field_id_key_value() { + // Verify the string value of ColumnMetadataKey::ParquetFieldId matches the convention + // used by delta-spark and other Delta ecosystem implementations. This is not part of + // the Delta protocol spec, so we pin the value here to catch accidental changes. 
+ assert_eq!( + ColumnMetadataKey::ParquetFieldId.as_ref(), + "parquet.field.id" + ); + } + + #[test] + fn test_with_field_inserted_empty_struct() { + let schema = StructType::try_new([]).unwrap(); + let schema = schema + .with_field_inserted_after(None, StructField::new("age", DataType::STRING, true)) + .expect("with field inserted should produce a valid schema"); + assert_eq!(schema.num_fields(), 1); + assert_eq!(schema.field_at_index(0).unwrap().name(), "age"); + } + + #[test] + fn test_with_field_inserted() { + let schema = StructType::try_new([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]) + .unwrap(); + let schema = schema + .with_field_inserted_after(Some("id"), StructField::new("age", DataType::STRING, true)) + .expect("with field inserted should produce a valid schema"); + assert_eq!(schema.num_fields(), 3); + assert_eq!(schema.field_at_index(0).unwrap().name(), "id"); + assert_eq!(schema.field_at_index(1).unwrap().name(), "age"); + assert_eq!(schema.field_at_index(2).unwrap().name(), "name"); + } + + #[test] + fn test_with_field_inserted_append_to_end() { + let schema = StructType::try_new([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]) + .unwrap(); + let schema = schema + .with_field_inserted_after(None, StructField::new("age", DataType::STRING, true)) + .expect("with field inserted should produce a valid schema"); + + assert_eq!(schema.num_fields(), 3); + assert_eq!(schema.field_at_index(0).unwrap().name(), "id"); + assert_eq!(schema.field_at_index(1).unwrap().name(), "name"); + assert_eq!(schema.field_at_index(2).unwrap().name(), "age"); + } + + #[test] + fn test_with_field_inserted_after_non_existent_field() { + let schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + let new_schema = schema.with_field_inserted_after( + Some("nonexistent"), + StructField::new("name", DataType::STRING, true), + ); + assert!(new_schema.is_err()); + } + + #[test] + fn test_with_field_inserted_after_duplicate_field() { + let schema = StructType::try_new([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]) + .unwrap(); + let new_schema = schema.with_field_inserted_after( + Some("name"), + StructField::new("id", DataType::STRING, true), + ); + assert!(new_schema.is_err()); + assert_result_error_with_message(new_schema, "Field id already exists"); + } + + #[test] + fn test_with_field_inserted_before() { + let schema = StructType::try_new([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]) + .unwrap(); + let schema = schema + .with_field_inserted_before( + Some("name"), + StructField::new("age", DataType::STRING, true), + ) + .expect("with field inserted before should produce a valid schema"); + assert_eq!(schema.num_fields(), 3); + assert_eq!(schema.field_at_index(0).unwrap().name(), "id"); + assert_eq!(schema.field_at_index(1).unwrap().name(), "age"); + assert_eq!(schema.field_at_index(2).unwrap().name(), "name"); + } + + #[test] + fn test_with_field_inserted_before_duplicate_field() { + let schema = StructType::try_new([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]) + .unwrap(); + let new_schema = schema.with_field_inserted_before( + Some("name"), + StructField::new("id", DataType::STRING, true), + ); + assert!(new_schema.is_err()); + 
assert_result_error_with_message(new_schema, "Field id already exists"); + } + + #[test] + fn test_with_field_inserted_before_at_beginning() { + let schema = StructType::try_new([ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]) + .unwrap(); + let schema = schema + .with_field_inserted_before(None, StructField::new("age", DataType::STRING, true)) + .expect("with field inserted before should produce a valid schema"); + assert_eq!(schema.num_fields(), 3); + assert_eq!(schema.field_at_index(0).unwrap().name(), "age"); + assert_eq!(schema.field_at_index(1).unwrap().name(), "id"); + assert_eq!(schema.field_at_index(2).unwrap().name(), "name"); + } + + #[test] + fn test_with_field_inserted_before_non_existent_field() { + let schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + let new_schema = schema.with_field_inserted_before( + Some("nonexistent"), + StructField::new("name", DataType::STRING, true), + ); + assert!(new_schema.is_err()); + } + + #[test] + fn test_with_field_inserted_before_empty_struct() { + let schema = StructType::try_new([]).unwrap(); + let schema = schema + .with_field_inserted_before(None, StructField::new("age", DataType::STRING, true)) + .expect("with field inserted before on empty struct should succeed"); + assert_eq!(schema.num_fields(), 1); + assert_eq!(schema.field_at_index(0).unwrap().name(), "age"); + } + + #[test] + fn test_with_field_removed() { + let schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + let new_schema = schema.with_field_removed("id"); + assert_eq!(new_schema.num_fields(), 0); + } + + #[test] + fn test_with_field_removed_non_existent_field() { + let schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + let new_schema = schema.with_field_removed("nonexistent"); + assert_eq!(new_schema.num_fields(), 1); + assert_eq!(new_schema.field_at_index(0).unwrap().name(), "id"); + } + + #[test] + fn test_with_field_replaced() { + let schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + let new_schema = schema + .with_field_replaced("id", StructField::new("name", DataType::STRING, true)) + .unwrap(); + + assert_eq!(new_schema.num_fields(), 1); + assert_eq!(new_schema.field_at_index(0).unwrap().name(), "name"); + } + + #[test] + fn test_with_field_replaced_non_existent_field() { + let schema = + StructType::try_new([StructField::new("id", DataType::INTEGER, false)]).unwrap(); + let new_schema = schema.with_field_replaced( + "nonexistent", + StructField::new("name", DataType::STRING, true), + ); + assert!(new_schema.is_err(), "Expected error for non-existent field"); + } + + /// Schema: { a: { b: { c: double } } } — supports walks at depths 1, 2, and 3. 
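As a usage sketch for the column-walk tests that follow (illustrative only, not part of the patch), walking a nested column path over the helper schema defined just below returns every field along that path, ending at the leaf:

    let schema = walk_test_schema(); // the { a: { b: { c: double } } } helper below
    let fields = schema
        .walk_column_fields(&ColumnName::new(["a", "b", "c"]))
        .unwrap();
    assert_eq!(fields.len(), 3);
    assert_eq!(fields.last().unwrap().data_type(), &DataType::DOUBLE);
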
+ fn walk_test_schema() -> StructType { + let l3 = StructType::new_unchecked([StructField::new("c", DataType::DOUBLE, false)]); + let l2 = StructType::new_unchecked([StructField::new( + "b", + DataType::Struct(Box::new(l3)), + false, + )]); + StructType::new_unchecked([StructField::new("a", DataType::Struct(Box::new(l2)), false)]) + } + + #[rstest::rstest] + #[case::single_level(vec!["a"], vec!["a"], DataType::Struct(Box::new( + StructType::new_unchecked([StructField::new("b", DataType::Struct(Box::new( + StructType::new_unchecked([StructField::new("c", DataType::DOUBLE, false)]) + )), false)]) + )))] + #[case::nested_2(vec!["a", "b"], vec!["a", "b"], DataType::Struct(Box::new( + StructType::new_unchecked([StructField::new("c", DataType::DOUBLE, false)]) + )))] + #[case::nested_3(vec!["a", "b", "c"], vec!["a", "b", "c"], DataType::DOUBLE)] + #[test] + fn test_walk_column_fields_happy( + #[case] col_path: Vec<&str>, + #[case] expected_names: Vec<&str>, + #[case] expected_leaf_type: DataType, + ) { + let schema = walk_test_schema(); + let fields = schema + .walk_column_fields(&ColumnName::new(col_path.iter().copied())) + .unwrap(); + assert_eq!(fields.len(), expected_names.len()); + for (field, name) in fields.iter().zip(expected_names.iter()) { + assert_eq!(field.name(), *name); + } + assert_eq!(fields.last().unwrap().data_type(), &expected_leaf_type); + } + + #[rstest::rstest] + #[case::empty_path(vec![], "Column path cannot be empty")] + #[case::not_found_top(vec!["x"], "not found in schema")] + #[case::not_found_nested(vec!["a", "x"], "not found in schema")] + #[case::intermediate_not_struct(vec!["a", "b", "c", "d"], "not a struct type")] + #[test] + fn test_walk_column_fields_error(#[case] col_path: Vec<&str>, #[case] expected_error: &str) { + let schema = walk_test_schema(); + let result = schema.walk_column_fields(&ColumnName::new(col_path.iter().copied())); + assert_result_error_with_message(result, expected_error); + } } diff --git a/kernel/src/schema/variant_utils.rs b/kernel/src/schema/variant_utils.rs index c9bb88bd8e..03a678fbd8 100644 --- a/kernel/src/schema/variant_utils.rs +++ b/kernel/src/schema/variant_utils.rs @@ -1,15 +1,16 @@ //! Utility functions for the variant type and variant-related table features. -use crate::actions::Protocol; -use crate::schema::{Schema, SchemaTransform, StructType}; -use crate::table_features::{ReaderFeature, WriterFeature}; +use crate::schema::{Schema, StructType}; +use crate::table_configuration::TableConfiguration; +use crate::table_features::TableFeature; +use crate::transforms::SchemaTransform; use crate::utils::require; use crate::{DeltaResult, Error}; use std::borrow::Cow; /// Schema visitor that checks if any column in the schema uses VARIANT type #[derive(Debug, Default)] -pub(crate) struct UsesVariant(pub(crate) bool); +pub(crate) struct UsesVariant(bool); impl<'a> SchemaTransform<'a> for UsesVariant { fn transform_variant(&mut self, _: &'a StructType) -> Option> { @@ -18,21 +19,22 @@ impl<'a> SchemaTransform<'a> for UsesVariant { } } -pub(crate) fn validate_variant_type_feature_support( - schema: &Schema, - protocol: &Protocol, -) -> DeltaResult<()> { +/// Checks if any column in the schema (including nested columns) has VARIANT type. 
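A minimal usage sketch of the helper defined just below (illustrative only, not part of the patch), mirroring the schemas used in the tests later in this file:

    let with_variant = StructType::new_unchecked([
        StructField::new("id", DataType::INTEGER, false),
        StructField::new("v", DataType::unshredded_variant(), true),
    ]);
    assert!(schema_contains_variant_type(&with_variant));

    let without_variant =
        StructType::new_unchecked([StructField::new("id", DataType::INTEGER, false)]);
    assert!(!schema_contains_variant_type(&without_variant));
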
+pub(crate) fn schema_contains_variant_type(schema: &Schema) -> bool { + let mut visitor = UsesVariant(false); + let _ = visitor.transform_struct(schema); + visitor.0 +} + +pub(crate) fn validate_variant_type_feature_support(tc: &TableConfiguration) -> DeltaResult<()> { // Both the reader and writer need to have either the VariantType or the VariantTypePreview // features. - if (!protocol.has_reader_feature(&ReaderFeature::VariantType) - && !protocol.has_reader_feature(&ReaderFeature::VariantTypePreview)) - || (!protocol.has_writer_feature(&WriterFeature::VariantType) - && !protocol.has_writer_feature(&WriterFeature::VariantTypePreview)) + let protocol = tc.protocol(); + if !protocol.has_table_feature(&TableFeature::VariantType) + && !protocol.has_table_feature(&TableFeature::VariantTypePreview) { - let mut uses_variant = UsesVariant::default(); - let _ = uses_variant.transform_struct(schema); require!( - !uses_variant.0, + !schema_contains_variant_type(&tc.logical_schema()), Error::unsupported( "Table contains VARIANT columns but does not have the required 'variantType' feature in reader and writer features" ) @@ -43,11 +45,12 @@ pub(crate) fn validate_variant_type_feature_support( #[cfg(test)] mod tests { - use super::*; use crate::actions::Protocol; use crate::schema::{DataType, StructField, StructType}; - use crate::table_features::{ReaderFeature, WriterFeature}; - use crate::utils::test_utils::assert_result_error_with_message; + use crate::table_features::TableFeature; + use crate::utils::test_utils::{ + assert_result_error_with_message, assert_schema_feature_validation, + }; #[test] fn test_is_unshredded_variant() { @@ -73,25 +76,15 @@ mod tests { #[test] fn test_variant_feature_validation() { - let features = [ - (ReaderFeature::VariantType, WriterFeature::VariantType), - ( - ReaderFeature::VariantTypePreview, - WriterFeature::VariantTypePreview, - ), - ]; - let schema_with_variant = StructType::new_unchecked([ + let schema_with = StructType::new_unchecked([ StructField::new("id", DataType::INTEGER, false), StructField::new("v", DataType::unshredded_variant(), true), ]); - - let schema_without_variant = StructType::new_unchecked([ + let schema_without = StructType::new_unchecked([ StructField::new("id", DataType::INTEGER, false), StructField::new("name", DataType::STRING, true), ]); - - // Nested schema with VARIANT - let nested_schema_with_variant = StructType::new_unchecked([ + let nested_schema_with = StructType::new_unchecked([ StructField::new("id", DataType::INTEGER, false), StructField::new( "nested", @@ -103,80 +96,37 @@ mod tests { true, ), ]); - features - .iter() - .for_each(|(variant_reader, variant_writer)| { - // Protocol with variantType features - let protocol_with_features = - Protocol::try_new(3, 7, Some([variant_reader]), Some([variant_writer])) - .unwrap(); - - // Protocol without variantType features - let protocol_without_features = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some::>(vec![]), - ) - .unwrap(); - - // Protocol without variantType writer feature - let protocol_without_writer_feature = - Protocol::try_new(3, 7, Some([variant_reader]), Some::>(vec![])) - .unwrap(); - - // Protocol without variantType reader feature - let protocol_without_reader_feature = - Protocol::try_new(3, 7, Some::>(vec![]), Some([variant_writer])) - .unwrap(); + let protocol_without = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(); + let err_msg = "Table contains VARIANT columns but does not have the required 'variantType' feature 
in reader and writer features"; - // Schema with VARIANT + Protocol with features = OK - validate_variant_type_feature_support( - &schema_with_variant, - &protocol_with_features, - ) - .expect("Should succeed when features are present"); - - // Schema without VARIANT + Protocol without features = OK - validate_variant_type_feature_support( - &schema_without_variant, - &protocol_without_features, - ) - .expect("Should succeed when no VARIANT columns are present"); - - // Schema without VARIANT + Protocol with features = OK - validate_variant_type_feature_support( - &schema_without_variant, - &protocol_with_features, - ) - .expect("Should succeed when no VARIANT columns are present, even with features"); - - // Schema with VARIANT + Protocol without features = ERROR - let result = validate_variant_type_feature_support( - &schema_with_variant, - &protocol_without_features, - ); - assert_result_error_with_message(result, "Unsupported: Table contains VARIANT columns but does not have the required 'variantType' feature in reader and writer features"); - - let result = validate_variant_type_feature_support( - &nested_schema_with_variant, - &protocol_without_features, - ); - assert_result_error_with_message(result, "Unsupported: Table contains VARIANT columns but does not have the required 'variantType' feature in reader and writer features"); - - // Schema with VARIANT + Protocol without writer feature = ERROR - let result = validate_variant_type_feature_support( - &schema_with_variant, - &protocol_without_writer_feature, - ); - assert_result_error_with_message(result, "Unsupported: Table contains VARIANT columns but does not have the required 'variantType' feature in reader and writer features"); - - // Schema with VARIANT + Protocol without reader feature = ERROR - let result = validate_variant_type_feature_support( - &schema_with_variant, - &protocol_without_reader_feature, - ); - assert_result_error_with_message(result, "Unsupported: Table contains VARIANT columns but does not have the required 'variantType' feature in reader and writer features"); - }); + for (reader, writer) in [ + (TableFeature::VariantType, TableFeature::VariantType), + ( + TableFeature::VariantTypePreview, + TableFeature::VariantTypePreview, + ), + ] { + let protocol_with = Protocol::try_new_modern([&reader], [&writer]).unwrap(); + + // ReaderWriter features must be listed on both sides + assert_result_error_with_message( + Protocol::try_new_modern([&reader], TableFeature::EMPTY_LIST), + "Reader features must contain only ReaderWriter features that are also listed in writer features", + ); + assert_result_error_with_message( + Protocol::try_new_modern(TableFeature::EMPTY_LIST, [&writer]), + "Writer features must be Writer-only or also listed in reader features", + ); + + assert_schema_feature_validation( + &schema_with, + &schema_without, + &protocol_with, + &protocol_without, + &[&nested_schema_with], + err_msg, + ); + } } } diff --git a/kernel/src/snapshot.rs b/kernel/src/snapshot.rs index de6c3e3628..c20fb64409 100644 --- a/kernel/src/snapshot.rs +++ b/kernel/src/snapshot.rs @@ -1,45 +1,91 @@ //! In-memory representation of snapshots of tables (snapshot is a table at given point in time, it //! has schema etc.) 
+use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use std::time::Instant; + +use delta_kernel_derive::internal_api; +use tracing::{debug, info, instrument, warn}; +use url::Url; use crate::action_reconciliation::calculate_transaction_expiration_timestamp; -use crate::actions::domain_metadata::domain_metadata_configuration; -use crate::actions::set_transaction::SetTransactionScanner; -use crate::actions::INTERNAL_DOMAIN_PREFIX; +use crate::actions::set_transaction::{is_set_txn_expired, SetTransactionScanner}; +use crate::actions::{DomainMetadata, INTERNAL_DOMAIN_PREFIX}; use crate::checkpoint::CheckpointWriter; -use crate::committer::Committer; -use crate::listed_log_files::ListedLogFiles; -use crate::log_segment::LogSegment; +use crate::clustering::{parse_clustering_columns, CLUSTERING_DOMAIN_NAME}; +use crate::committer::{Committer, PublishMetadata}; +#[cfg(any(test, feature = "test-utils"))] +use crate::crc::Crc; +use crate::crc::{try_write_crc_file, CrcDelta, LazyCrc}; +use crate::expressions::ColumnName; +use crate::log_segment::{DomainMetadataMap, LogSegment}; +use crate::log_segment_files::LogSegmentFiles; +use crate::metrics::{MetricEvent, MetricId}; use crate::path::ParsedLogPath; use crate::scan::ScanBuilder; use crate::schema::SchemaRef; use crate::table_configuration::{InCommitTimestampEnablement, TableConfiguration}; +use crate::table_features::{physical_to_logical_column_name, ColumnMappingMode, TableFeature}; use crate::table_properties::TableProperties; use crate::transaction::Transaction; +use crate::utils::require; use crate::LogCompactionWriter; use crate::{DeltaResult, Engine, Error, Version}; -use delta_kernel_derive::internal_api; mod builder; pub use builder::SnapshotBuilder; -use tracing::debug; -use url::Url; - +/// A shared, thread-safe reference to a [`Snapshot`]. pub type SnapshotRef = Arc; +/// File-level statistics for a table version. +/// +/// NOTE: This is an unstable API expected to change in future releases. +#[allow(unused)] +#[internal_api] +pub(crate) type FileStats = crate::crc::FileStats; + +/// Result of attempting to write a version checksum (CRC) file. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ChecksumWriteResult { + /// A CRC file already exists at this version. Per the Delta protocol, writers MUST NOT + /// overwrite existing version checksum files. + AlreadyExists, + /// The CRC file was successfully written to storage. + Written, +} + +/// Result of attempting to write a checkpoint file. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CheckpointWriteResult { + /// A checkpoint already exists at this version. + AlreadyExists, + /// The checkpoint was successfully written to storage. + Written, +} + // TODO expose methods for accessing the files of a table (with file pruning). /// In-memory representation of a specific snapshot of a Delta table. While a `DeltaTable` exists /// throughout time, `Snapshot`s represent a view of a table at a specific point in time; they /// have a defined schema (which may change over time for any given table), specific version, and /// frozen log segment. 
-#[derive(PartialEq, Eq)] pub struct Snapshot { + span: tracing::Span, log_segment: LogSegment, table_configuration: TableConfiguration, + lazy_crc: Arc, +} + +impl PartialEq for Snapshot { + fn eq(&self, other: &Self) -> bool { + self.log_segment == other.log_segment + && self.table_configuration == other.table_configuration + } } +impl Eq for Snapshot {} + impl Drop for Snapshot { fn drop(&mut self) { debug!("Dropping snapshot"); @@ -52,6 +98,7 @@ impl std::fmt::Debug for Snapshot { .field("path", &self.log_segment.log_root.as_str()) .field("version", &self.version()) .field("metadata", &self.table_configuration().metadata()) + .field("log_segment", &self.log_segment) .finish() } } @@ -60,7 +107,7 @@ impl Snapshot { /// Create a new [`SnapshotBuilder`] to build a new [`Snapshot`] for a given table root. If you /// instead have an existing [`Snapshot`] you would like to do minimal work to update, consider /// using - pub fn builder_for(table_root: Url) -> SnapshotBuilder { + pub fn builder_for(table_root: impl AsRef) -> SnapshotBuilder { SnapshotBuilder::new_for(table_root) } @@ -68,54 +115,80 @@ impl Snapshot { /// version. /// /// We implement a simple heuristic: - /// 1. if the new version == existing version, just return the existing snapshot + /// 1. if the caller explicitly requests the existing version, just return the existing + /// snapshot /// 2. if the new version < existing version, error: there is no optimization to do here - /// 3. list from (existing checkpoint version + 1) onward (or just existing snapshot version if - /// no checkpoint) - /// 4. a. if new checkpoint is found: just create a new snapshot from that checkpoint (and - /// commits after it) - /// b. if no new checkpoint is found: do lightweight P+M replay on the latest commits (after - /// ensuring we only retain commits > any checkpoints) + /// 3. list from (existing checkpoint version + 1) onward (or from version 1 if there is no + /// checkpoint yet) + /// 4. if a newer or newly discovered checkpoint is found while refreshing to the latest + /// version, create a new snapshot from that checkpoint (and commits after it), even if the + /// table version itself did not advance + /// 5. if no new checkpoint is found and the table version did not advance, return the + /// existing snapshot + /// 6. if no new checkpoint is found, do lightweight P+M replay on the latest commits after + /// ensuring we only retain commits > any checkpoints /// /// # Parameters /// /// - `existing_snapshot`: reference to an existing [`Snapshot`] /// - `engine`: Implementation of [`Engine`] apis. - /// - `version`: target version of the [`Snapshot`]. None will create a snapshot at the latest - /// version of the table. + /// - `target_version`: target version of the [`Snapshot`]. None will create a snapshot at the + /// latest version of the table. pub fn builder_from(existing_snapshot: SnapshotRef) -> SnapshotBuilder { SnapshotBuilder::new_from(existing_snapshot) } + /// Create a new [`Snapshot`] from a [`LogSegment`] and [`TableConfiguration`]. #[internal_api] pub(crate) fn new(log_segment: LogSegment, table_configuration: TableConfiguration) -> Self { + Self::new_with_crc( + log_segment, + table_configuration, + Arc::new(LazyCrc::new(None)), + ) + } + + /// Internal constructor that accepts an explicit [`LazyCrc`]. 
+ pub(crate) fn new_with_crc( + log_segment: LogSegment, + table_configuration: TableConfiguration, + lazy_crc: Arc, + ) -> Self { + let span = tracing::info_span!( + parent: tracing::Span::none(), + "snap", + path = %table_configuration.table_root(), + version = table_configuration.version(), + ); + info!(parent: &span, "Created snapshot"); Self { + span, log_segment, table_configuration, + lazy_crc, } } - /// Create a new [`Snapshot`] instance from an existing [`Snapshot`]. This is useful when you - /// already have a [`Snapshot`] lying around and want to do the minimal work to 'update' the - /// snapshot to a later version. - fn try_new_from( + /// Implementation of snapshot creation from existing snapshot. + fn try_new_from_impl( existing_snapshot: Arc, log_tail: Vec, engine: &dyn Engine, - version: impl Into>, + target_version: impl Into>, + operation_id: MetricId, ) -> DeltaResult> { let old_log_segment = &existing_snapshot.log_segment; let old_version = existing_snapshot.version(); - let new_version = version.into(); - if let Some(new_version) = new_version { - if new_version == old_version { + let requested_version = target_version.into(); + if let Some(requested_version) = requested_version { + if requested_version == old_version { // Re-requesting the same version return Ok(existing_snapshot.clone()); } - if new_version < old_version { + if requested_version < old_version { // Hint is too new: error since this is effectively an incorrect optimization return Err(Error::Generic(format!( - "Requested snapshot version {new_version} is older than snapshot hint version {old_version}" + "Requested snapshot version {requested_version} is older than snapshot hint version {old_version}" ))); } } @@ -123,29 +196,29 @@ impl Snapshot { let log_root = old_log_segment.log_root.clone(); let storage = engine.storage_handler(); - // Start listing just after the previous segment's checkpoint, if any + // Start listing just after the previous segment's checkpoint, if any. let listing_start = old_log_segment.checkpoint_version.unwrap_or(0) + 1; // Check for new commits (and CRC) - let new_listed_files = ListedLogFiles::list( + let new_listed_files = LogSegmentFiles::list( storage.as_ref(), &log_root, log_tail, Some(listing_start), - new_version, + requested_version, )?; // NB: we need to check both checkpoints and commits since we filter commits at and below // the checkpoint version. Example: if we have a checkpoint + commit at version 1, the log // listing above will only return the checkpoint and not the commit. 
- if new_listed_files.ascending_commit_files.is_empty() - && new_listed_files.checkpoint_parts.is_empty() + if new_listed_files.ascending_commit_files().is_empty() + && new_listed_files.checkpoint_parts().is_empty() { - match new_version { - Some(new_version) if new_version != old_version => { + match requested_version { + Some(requested_version) if requested_version != old_version => { // No new commits, but we are looking for a new version return Err(Error::Generic(format!( - "Requested snapshot version {new_version} is newer than the latest version {old_version}" + "Requested snapshot version {requested_version} is newer than the latest version {old_version}" ))); } _ => { @@ -158,9 +231,11 @@ impl Snapshot { // create a log segment just from existing_checkpoint.version -> new_version // OR could be from 1 -> new_version // Save the latest_commit before moving new_listed_files - let new_latest_commit_file = new_listed_files.latest_commit_file.clone(); + let new_latest_commit_file = new_listed_files.latest_commit_file().clone(); + // Note: new_log_segment won't have checkpoint_schema since we're listing without a hint. + // If it has a checkpoint, we use it as-is. Otherwise, we preserve the old checkpoint_schema. let mut new_log_segment = - LogSegment::try_new(new_listed_files, log_root.clone(), new_version)?; + LogSegment::try_new(new_listed_files, log_root.clone(), requested_version, None)?; let new_end_version = new_log_segment.end_version; if new_end_version < old_version { @@ -169,21 +244,24 @@ impl Snapshot { return Err(Error::Generic(format!( "Unexpected state: The newest version in the log {new_end_version} is older than the old version {old_version}"))); } - if new_end_version == old_version { - // No new commits, just return the same snapshot - return Ok(existing_snapshot.clone()); - } - if new_log_segment.checkpoint_version.is_some() { - // we have a checkpoint in the new LogSegment, just construct a new snapshot from that - let snapshot = Self::try_new_from_log_segment( + // We found a checkpoint in the new log segment, so build a fresh snapshot from it. + // TODO(#2217): reuse old LazyCrc when CRC file matches. + // TODO(#2218): consider incremental P&M replay instead of full rebuild. + let snapshot = Self::try_new_from_log_segment_impl( existing_snapshot.table_root().clone(), new_log_segment, engine, + operation_id, ); return Ok(Arc::new(snapshot?)); } + if new_end_version == old_version { + // No new commits and no newly discovered checkpoint, just return the same snapshot. + return Ok(existing_snapshot.clone()); + } + // after this point, we incrementally update the snapshot with the new log segment. // first we remove the 'overlap' in commits, example: // @@ -197,12 +275,29 @@ impl Snapshot { // 2. new logsegment [commit4] // 3. new logsegment [checkpoint2-commit3] -> caught above new_log_segment + .listed .ascending_commit_files .retain(|log_path| old_version < log_path.version); + // Deduplicate compaction files the same way: the new listing re-lists from + // checkpoint_version, so it includes compaction files already in the old segment. + // Note: This removes all _new_ compaction files that start at or before `old_version`, + // which may drop useful compaction files that span across the old/new boundary + // (e.g. a new compaction(1, 3) when old_version=2). This is conservative but safe. 
+ new_log_segment + .listed + .ascending_compaction_files + .retain(|log_path| old_version < log_path.version); // we have new commits and no new checkpoint: we replay new commits for P+M and then // create a new snapshot by combining LogSegments and building a new TableConfiguration - let (new_metadata, new_protocol) = new_log_segment.protocol_and_metadata(engine)?; + let (crc_file, lazy_crc) = Self::resolve_crc( + &new_log_segment, + old_log_segment, + &existing_snapshot.lazy_crc, + ); + + let (new_metadata, new_protocol) = + new_log_segment.read_protocol_metadata_opt(engine, &lazy_crc)?; let table_configuration = TableConfiguration::try_new_from( existing_snapshot.table_configuration(), new_metadata, @@ -211,66 +306,257 @@ impl Snapshot { )?; // NB: we must add the new log segment to the existing snapshot's log segment - let mut ascending_commit_files = old_log_segment.ascending_commit_files.clone(); - ascending_commit_files.extend(new_log_segment.ascending_commit_files); - let mut ascending_compaction_files = old_log_segment.ascending_compaction_files.clone(); - ascending_compaction_files.extend(new_log_segment.ascending_compaction_files); - - // Note that we _could_ go backwards if someone deletes a CRC: - // old listing: 1, 2, 2.crc, 3, 3.crc (latest is 3.crc) - // new listing: 1, 2, 2.crc, 3 (latest is 2.crc) - // and we would still pick the new listing's (older) CRC file since it ostensibly still - // exists - let latest_crc_file = new_log_segment - .latest_crc_file - .or_else(|| old_log_segment.latest_crc_file.clone()); + let mut ascending_commit_files = old_log_segment.listed.ascending_commit_files.clone(); + ascending_commit_files.extend(new_log_segment.listed.ascending_commit_files); + let mut ascending_compaction_files = + old_log_segment.listed.ascending_compaction_files.clone(); + ascending_compaction_files.extend(new_log_segment.listed.ascending_compaction_files); // Use the new latest_commit if available, otherwise use the old one // This handles the case where the new listing returned no commits let latest_commit_file = - new_latest_commit_file.or_else(|| old_log_segment.latest_commit_file.clone()); + new_latest_commit_file.or_else(|| old_log_segment.listed.latest_commit_file.clone()); // we can pass in just the old checkpoint parts since by the time we reach this line, we // know there are no checkpoints in the new log segment. let combined_log_segment = LogSegment::try_new( - ListedLogFiles { + LogSegmentFiles { ascending_commit_files, ascending_compaction_files, - checkpoint_parts: old_log_segment.checkpoint_parts.clone(), - latest_crc_file, + checkpoint_parts: old_log_segment.listed.checkpoint_parts.clone(), + latest_crc_file: crc_file, latest_commit_file, + max_published_version: new_log_segment + .listed + .max_published_version + .max(old_log_segment.listed.max_published_version), }, log_root, - new_version, + requested_version, + // Preserve checkpoint schema from old segment + old_log_segment.checkpoint_schema.clone(), )?; - Ok(Arc::new(Snapshot::new( + + Ok(Arc::new(Snapshot::new_with_crc( combined_log_segment, table_configuration, + lazy_crc, ))) } - /// Create a new [`Snapshot`] instance. - pub(crate) fn try_new_from_log_segment( + /// Determine the CRC file and LazyCrc for an incremental snapshot update. + /// + /// Prefers the new segment's CRC file, falls back to the old segment's. If the resolved + /// CRC version matches the existing snapshot's LazyCrc, reuses it to avoid redundant I/O + /// (it may already be loaded in memory). 
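+    ///
+    /// Illustrative scenario: if the old snapshot already tracks (and may have loaded) `2.crc`
+    /// and the new listing finds no newer CRC file, the resolved version is still 2 and the
+    /// existing `LazyCrc` is reused; if the new listing finds `3.crc`, a fresh `LazyCrc` is
+    /// created for it instead.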
+ fn resolve_crc( + new_log_segment: &LogSegment, + old_log_segment: &LogSegment, + existing_lazy_crc: &Arc, + ) -> (Option, Arc) { + let new_crc_file = new_log_segment.listed.latest_crc_file.clone(); + let old_crc_file = old_log_segment.listed.latest_crc_file.clone(); + let crc_file = new_crc_file.or(old_crc_file); + let crc_version = crc_file.as_ref().map(|f| f.version); + let lazy_crc = if crc_version == existing_lazy_crc.crc_version() { + existing_lazy_crc.clone() + } else { + Arc::new(LazyCrc::new(crc_file.clone())) + }; + (crc_file, lazy_crc) + } + + /// Implementation of snapshot creation from log segment. + /// + /// Reports metrics: `ProtocolMetadataLoaded`. + fn try_new_from_log_segment_impl( location: Url, log_segment: LogSegment, engine: &dyn Engine, + operation_id: MetricId, ) -> DeltaResult { - let (metadata, protocol) = log_segment.read_metadata(engine)?; + let reporter = engine.get_metrics_reporter(); + + // Create lazy CRC loader for P&M optimization + let lazy_crc = Arc::new(LazyCrc::new(log_segment.listed.latest_crc_file.clone())); + + // Read protocol and metadata (may use CRC if available) + let start = Instant::now(); + let (metadata, protocol) = log_segment.read_protocol_metadata(engine, &lazy_crc)?; + let read_metadata_duration = start.elapsed(); + + reporter.as_ref().inspect(|r| { + r.report(MetricEvent::ProtocolMetadataLoaded { + operation_id, + duration: read_metadata_duration, + }); + }); + let table_configuration = TableConfiguration::try_new(metadata, protocol, location, log_segment.end_version)?; - Ok(Self { + + Ok(Self::new_with_crc( log_segment, table_configuration, - }) + lazy_crc, + )) + } + + /// Creates a new [`Snapshot`] representing the table state immediately after a commit. + /// + /// Appends the newly committed file to this snapshot's log segment and bumps the version, + /// producing a post-commit snapshot without a full log replay from storage. + /// + /// The `crc_delta` captures the CRC-relevant changes from the committed transaction + /// (file stats, domain metadata, ICT, etc.). If the pre-commit snapshot had a loaded CRC + /// at its version, the delta is applied to produce a precomputed in-memory CRC for the new + /// version -- this CRC contains all important table metadata (protocol, metadata, domain + /// metadata, set transactions, ICT) and avoids re-reading them from storage. CREATE TABLE + /// always produces a CRC at v0. If no CRC was available on the pre-commit snapshot, the + /// existing lazy CRC is carried forward unchanged. + /// + /// TODO: Handle Protocol changes in CrcDelta (when Kernel-RS supports protocol changes) + /// TODO: Handle Metadata changes in CrcDelta (when Kernel-RS supports metadata changes) + pub(crate) fn new_post_commit( + &self, + commit: ParsedLogPath, + crc_delta: CrcDelta, + ) -> DeltaResult { + require!( + commit.is_commit(), + Error::internal_error(format!( + "Cannot create post-commit Snapshot. Log file is not a commit file. \ + Path: {}, Type: {:?}.", + commit.location.location, commit.file_type + )) + ); + let read_version = self.version(); + let new_version = commit.version; + require!( + new_version == read_version.wrapping_add(1), + Error::internal_error(format!( + "Cannot create post-commit Snapshot. Log file version ({new_version}) does not \ + equal Snapshot version ({read_version}) + 1." 
+ )) + ); + + let new_table_configuration = + TableConfiguration::new_post_commit(self.table_configuration(), new_version); + + let new_log_segment = self.log_segment.new_with_commit_appended(commit)?; + + let new_lazy_crc = self.compute_post_commit_crc(new_version, crc_delta); + + Ok(Snapshot::new_with_crc( + new_log_segment, + new_table_configuration, + new_lazy_crc, + )) + } + + /// Compute the lazy CRC for a post-commit snapshot by applying a [`CrcDelta`]. + /// + /// For CREATE TABLE, builds a fresh CRC from the `crc_delta`. For existing tables, applies + /// the `crc_delta` to the current CRC if loaded, otherwise carries forward the existing lazy CRC. + fn compute_post_commit_crc(&self, new_version: Version, crc_delta: CrcDelta) -> Arc { + let crc = if self.version() == crate::PRE_COMMIT_VERSION { + crc_delta.into_crc_for_version_zero() + } else { + self.lazy_crc + .get_if_loaded_at_version(self.version()) + .map(|base| { + let mut crc = base.as_ref().clone(); + crc.apply(crc_delta); + crc + }) + }; + + match crc { + Some(c) => Arc::new(LazyCrc::new_precomputed(c, new_version)), + None => self.lazy_crc.clone(), + } } /// Creates a [`CheckpointWriter`] for generating a checkpoint from this snapshot. /// /// See the [`crate::checkpoint`] module documentation for more details on checkpoint types /// and the overall checkpoint process. - pub fn checkpoint(self: Arc) -> DeltaResult { + pub fn create_checkpoint_writer(self: Arc) -> DeltaResult { CheckpointWriter::try_new(self) } + /// Performs a complete checkpoint of this snapshot using the provided engine. + /// + /// If a checkpoint already exists at this version, returns + /// [`CheckpointWriteResult::AlreadyExists`] with the original snapshot unchanged. + /// Otherwise, writes a checkpoint parquet file and the `_last_checkpoint` file and returns + /// [`CheckpointWriteResult::Written`] with an updated [`SnapshotRef`] whose log segment + /// reflects the new checkpoint. Commits and compaction files subsumed by the checkpoint are + /// dropped from the returned snapshot. + /// + /// Note: + /// - It is still possible that an existing checkpoint gets overwritten if that + /// checkpoint was written by a concurrent writer. + /// - This function uses [`crate::ParquetHandler::write_parquet_file`] and + /// [`crate::StorageHandler::head`], which may not be implemented by all engines. + /// If you are using the default engine, make sure to build it with the multi-threaded + /// executor if you want to use this method. 
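+    ///
+    /// # Example
+    ///
+    /// A minimal sketch, assuming `snapshot` is a [`SnapshotRef`] and `engine` is an
+    /// already-constructed [`Engine`] whose handlers support parquet writes and `head`:
+    ///
+    /// ```ignore
+    /// let (result, snapshot) = snapshot.checkpoint(&engine)?;
+    /// match result {
+    ///     CheckpointWriteResult::Written => println!("checkpoint written at v{}", snapshot.version()),
+    ///     CheckpointWriteResult::AlreadyExists => println!("checkpoint already present"),
+    /// }
+    /// ```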
+ #[instrument(parent = &self.span, name = "snap.checkpoint", skip_all, err)] + pub fn checkpoint( + self: &SnapshotRef, + engine: &dyn Engine, + ) -> DeltaResult<(CheckpointWriteResult, SnapshotRef)> { + if self.log_segment.checkpoint_version == Some(self.log_segment.end_version) { + info!( + "Checkpoint already exists for snapshot version {}", + self.version() + ); + return Ok((CheckpointWriteResult::AlreadyExists, Arc::clone(self))); + } + + let writer = Arc::clone(self).create_checkpoint_writer()?; + let checkpoint_path = writer.checkpoint_path()?; + let data_iter = writer.checkpoint_data(engine)?; + let state = data_iter.state(); + let lazy_data = data_iter.map(|r| r.and_then(|f| f.apply_selection_vector())); + match engine + .parquet_handler() + .write_parquet_file(checkpoint_path.clone(), Box::new(lazy_data)) + { + Ok(()) => (), + Err(Error::FileAlreadyExists(_)) => { + // NOTE: Per write_parquet_file's documentation, it should silently overwrite existing files, + // so we log a warning but still return the correct result. + warn!( + "ParquetHandler::write_parquet_file unexpectedly failed on \ + FileAlreadyExists for version {}", + self.version() + ); + return Ok((CheckpointWriteResult::AlreadyExists, Arc::clone(self))); + } + Err(e) => return Err(e), + } + + let file_meta = engine.storage_handler().head(&checkpoint_path)?; + + // Finalize the checkpoint (writes `_last_checkpoint` file). + writer.finalize(engine, &file_meta, &state)?; + + let checkpoint_log_path = ParsedLogPath::try_from(file_meta)?.ok_or_else(|| { + Error::internal_error("Checkpoint path could not be parsed as a log path") + })?; + let new_log_segment = self + .log_segment + .try_new_with_checkpoint(checkpoint_log_path)?; + Ok(( + CheckpointWriteResult::Written, + Arc::new(Snapshot::new_with_crc( + new_log_segment, + self.table_configuration().clone(), + self.lazy_crc.clone(), + )), + )) + } + /// Creates a [`LogCompactionWriter`] for generating a log compaction file. /// /// Log compaction aggregates commit files in a version range into a single compacted file, @@ -309,7 +595,7 @@ impl Snapshot { /// /// [`Schema`]: crate::schema::Schema pub fn schema(&self) -> SchemaRef { - self.table_configuration.schema() + self.table_configuration.logical_schema() } /// Get the [`TableProperties`] for this [`Snapshot`]. @@ -317,6 +603,54 @@ impl Snapshot { self.table_configuration().table_properties() } + /// Returns the protocol-derived table properties as a map of key-value pairs. + /// + /// This includes: + /// - `delta.minReaderVersion` and `delta.minWriterVersion` + /// - `delta.feature. = "supported"` for each reader and writer feature (when using + /// table features protocol, i.e. reader version 3 / writer version 7) + #[allow(unused)] + #[internal_api] + pub(crate) fn get_protocol_derived_properties(&self) -> HashMap { + let protocol = self.table_configuration().protocol(); + + let mut properties = HashMap::from([ + ( + "delta.minReaderVersion".into(), + protocol.min_reader_version().to_string(), + ), + ( + "delta.minWriterVersion".into(), + protocol.min_writer_version().to_string(), + ), + ]); + + let features = protocol + .reader_features() + .into_iter() + .flatten() + .chain(protocol.writer_features().into_iter().flatten()); + + for feature in features { + properties + .entry(format!("delta.feature.{}", feature.as_ref())) + .or_insert_with(|| "supported".to_string()); + } + + properties + } + + /// Get the raw metadata configuration for this table. 
+ /// + /// This returns the `Metadata.configuration` map as stored in the Delta log, containing + /// user-defined properties, delta table properties (e.g., `delta.enableInCommitTimestamps`), + /// and application-specific properties (e.g., `io.unitycatalog.tableId`). + #[allow(unused)] + #[internal_api] + pub(crate) fn metadata_configuration(&self) -> &HashMap { + self.table_configuration().metadata().configuration() + } + /// Get the [`TableConfiguration`] for this [`Snapshot`]. #[internal_api] pub(crate) fn table_configuration(&self) -> &TableConfiguration { @@ -329,21 +663,45 @@ impl Snapshot { } /// Create a [`Transaction`] for this `SnapshotRef`. With the specified [`Committer`]. - pub fn transaction(self: Arc, committer: Box) -> DeltaResult { - Transaction::try_new(self, committer) + /// + /// Note: For tables with clustering enabled, this performs log replay to read clustering + /// columns from domain metadata, which may have a performance cost. + pub fn transaction( + self: Arc, + committer: Box, + engine: &dyn Engine, + ) -> DeltaResult { + Transaction::try_new_existing_table(self, committer, engine) } - /// Fetch the latest version of the provided `application_id` for this snapshot. Filters the txn based on the SetTransactionRetentionDuration property and lastUpdated + /// Fetch the latest version of the provided `application_id` for this snapshot. Filters the + /// txn based on the delta.setTransactionRetentionDuration property and lastUpdated. /// - /// Note that this method performs log replay (fetches and processes metadata from storage). + /// Uses the CRC fast path when available, otherwise falls back to log replay. // TODO: add a get_app_id_versions to fetch all at once using SetTransactionScanner::get_all + #[instrument(parent = &self.span, name = "snap.get_app_id_version", skip_all, err)] pub fn get_app_id_version( - self: Arc, + &self, application_id: &str, engine: &dyn Engine, ) -> DeltaResult> { let expiration_timestamp = calculate_transaction_expiration_timestamp(self.table_properties())?; + + // Fast path: serve from CRC if it tracks set transactions at this version. + if let Some(crc) = self + .lazy_crc + .get_or_load_if_at_version(engine, self.version()) + { + if let Some(txn_map) = &crc.set_transactions { + return Ok(txn_map + .get(application_id) + .filter(|txn| !is_set_txn_expired(expiration_timestamp, txn.last_updated)) + .map(|txn| txn.version)); + } + } + + // Fallback: full log replay. let txn = SetTransactionScanner::get_one( self.log_segment(), application_id, @@ -368,7 +726,327 @@ impl Snapshot { )); } - domain_metadata_configuration(self.log_segment(), domain, engine) + self.get_domain_metadata_internal(domain, engine) + } + + /// Get the logical clustering columns for this snapshot, if clustering is enabled. + /// + /// Returns `Ok(Some(columns))` if the ClusteredTable feature is enabled and clustering + /// columns are defined, `Ok(None)` if clustering is not enabled, or an error if the + /// clustering metadata is malformed. + /// + /// The columns are returned as logical [`ColumnName`]s. When column mapping is enabled, + /// this converts the physical names stored in domain metadata back to logical names using + /// the table schema. + /// + /// Note that this method performs log replay (fetches and processes metadata from storage). + /// + /// # Errors + /// + /// Returns an error if the clustering domain metadata is malformed, or if a physical + /// column name cannot be resolved to a logical name in the schema. 
+ /// + /// [`ColumnName`]: crate::expressions::ColumnName + #[allow(unused)] + #[internal_api] + pub(crate) fn get_logical_clustering_columns( + &self, + engine: &dyn Engine, + ) -> DeltaResult>> { + let physical_columns = match self.get_physical_clustering_columns(engine)? { + Some(cols) => cols, + None => return Ok(None), + }; + let column_mapping_mode = self.table_configuration.column_mapping_mode(); + if column_mapping_mode == ColumnMappingMode::None { + // No column mapping: physical = logical + return Ok(Some(physical_columns)); + } + // Convert physical column names to logical names by walking the schema + let logical_schema = self.table_configuration.logical_schema(); + let logical_columns = physical_columns + .iter() + .map(|physical_col| { + physical_to_logical_column_name(&logical_schema, physical_col, column_mapping_mode) + }) + .collect::>>()?; + Ok(Some(logical_columns)) + } + + /// Get the clustering columns for this snapshot, if the table has clustering enabled. + /// + /// Returns `Ok(Some(columns))` if the ClusteredTable feature is enabled and clustering + /// columns are defined, `Ok(None)` if clustering is not enabled, or an error if the + /// clustering metadata is malformed. + /// + /// The columns are returned as physical column names, respecting the column mapping mode. + /// Note that this method performs log replay (fetches and processes metadata from storage). + #[internal_api] + pub(crate) fn get_physical_clustering_columns( + &self, + engine: &dyn Engine, + ) -> DeltaResult>> { + if !self + .table_configuration + .protocol() + .has_table_feature(&TableFeature::ClusteredTable) + { + return Ok(None); + } + match self.get_domain_metadata_internal(CLUSTERING_DOMAIN_NAME, engine)? { + Some(config) => Ok(Some(parse_clustering_columns(&config)?)), + None => Ok(None), + } + } + + /// Load domain metadata from this snapshot. If `domains` is `Some`, only load the specified + /// domains (with early termination). If `None`, load all domains. + /// + /// This is the single entry point for all domain metadata reads on a snapshot. All public + /// and internal domain metadata APIs delegate to this method. + #[internal_api] + pub(crate) fn get_domain_metadatas_internal( + &self, + engine: &dyn Engine, + domains: Option<&HashSet<&str>>, + ) -> DeltaResult { + // Fast path: serve from CRC if it tracks domain metadata at this version. + if let Some(crc) = self + .lazy_crc + .get_or_load_if_at_version(engine, self.version()) + { + if let Some(dm_map) = &crc.domain_metadata { + return Ok(match domains { + None => dm_map.clone(), + Some(filter) => dm_map + .iter() + .filter(|(k, _)| filter.contains(k.as_str())) + .map(|(k, v)| (k.clone(), v.clone())) + .collect(), + }); + } + } + // Fallback: full log replay. + self.log_segment().scan_domain_metadatas(domains, engine) + } + + /// Returns file-level statistics, or `None` if no CRC with valid stats exists at this + /// snapshot's version. + /// + /// NOTE: This is an unstable API expected to change in future releases. + #[allow(unused)] + #[internal_api] + pub(crate) fn get_or_load_file_stats(&self, engine: &dyn Engine) -> Option { + let crc = self + .lazy_crc + .get_or_load_if_at_version(engine, self.version())?; + crc.file_stats() + } + + /// Returns file-level statistics, or `None` if CRC is not loaded, not at this + /// version, or has no valid file stats. + /// + /// NOTE: This API is purely opportunistic, no I/O. 
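+    ///
+    /// Illustrative sketch (no I/O in either branch of this snippet):
+    ///
+    /// ```ignore
+    /// if let Some(stats) = snapshot.get_file_stats_if_loaded() {
+    ///     // Serve size / file-count questions from the cached CRC.
+    /// } else {
+    ///     // Not loaded at this version; use get_or_load_file_stats (may read the CRC file).
+    /// }
+    /// ```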
+ #[internal_api] + pub(crate) fn get_file_stats_if_loaded(&self) -> Option { + let crc = self.lazy_crc.get_if_loaded_at_version(self.version())?; + crc.file_stats() + } + + /// Returns the CRC if one has been loaded at this snapshot's version (no I/O). + /// + /// This is a test-only helper for integration tests to inspect the CRC state. + #[cfg(any(test, feature = "test-utils"))] + pub fn get_current_crc_if_loaded_for_testing(&self) -> Option<&Crc> { + if self.lazy_crc.crc_version() != Some(self.version()) { + return None; + } + self.lazy_crc.cached.get()?.get().map(|arc| arc.as_ref()) + } + + /// Returns the CRC version tracked by this snapshot's LazyCrc, if any. + /// + /// This is a test-only helper for integration tests to inspect the CRC version. + #[cfg(any(test, feature = "test-utils"))] + pub fn crc_version_for_testing(&self) -> Option { + self.lazy_crc.crc_version() + } + + /// Writes a version checksum (CRC) file for this snapshot. Writers should call this after + /// every commit because checksums enable faster snapshot loading and table state validation. + /// + /// Currently only supports writing from a post-commit snapshot that has pre-computed CRC + /// information in memory (i.e. the snapshot returned by + /// [`CommittedTransaction::post_commit_snapshot`]). + /// + /// Returns a tuple of [`ChecksumWriteResult`] and a [`SnapshotRef`]. On + /// [`ChecksumWriteResult::Written`], the returned snapshot has the CRC file recorded in + /// its log segment. On [`ChecksumWriteResult::AlreadyExists`], the original snapshot is + /// returned unchanged. + /// + /// # Errors + /// + /// - [`Error::ChecksumWriteUnsupported`] if no in-memory CRC is available at this + /// snapshot's version (e.g. a snapshot loaded from disk that has no CRC file), or if + /// the CRC's file stats are not valid. File stats can be invalid for two reasons: + /// (a) a non-incremental operation like ANALYZE STATS was encountered, which is + /// recoverable with a full state reconstruction in the future; (b) a file action had a + /// missing size (e.g. `remove.size` is null), which is permanently unrecoverable. + /// - I/O errors from the engine's storage handler if the write fails. + /// + /// [`CommittedTransaction::post_commit_snapshot`]: crate::transaction::CommittedTransaction::post_commit_snapshot + #[instrument(parent = &self.span, name = "snap.write_checksum", skip_all, err)] + pub fn write_checksum( + self: &SnapshotRef, + engine: &dyn Engine, + ) -> DeltaResult<(ChecksumWriteResult, SnapshotRef)> { + let has_crc_on_disk = self + .log_segment + .listed + .latest_crc_file + .as_ref() + .is_some_and(|f| f.version == self.version()); + + if has_crc_on_disk { + info!( + "CRC file already exists on disk at version {}", + self.version() + ); + return Ok((ChecksumWriteResult::AlreadyExists, Arc::clone(self))); + } + + let crc = self + .lazy_crc + .get_if_loaded_at_version(self.version()) + .ok_or_else(|| { + Error::ChecksumWriteUnsupported( + "No in-memory CRC available at this snapshot version.".to_string(), + ) + })?; + + let crc_path = ParsedLogPath::new_crc(self.table_root(), self.version())?; + + // Note: try_write_crc_file validates file stats validity before writing. 
+ match try_write_crc_file(engine, &crc_path.location, crc) { + Ok(()) => { + info!("Wrote CRC file at {}", crc_path.location); + let new_log_segment = self.log_segment.try_new_with_crc_file(crc_path)?; + let new_snapshot = Arc::new(Snapshot::new_with_crc( + new_log_segment, + self.table_configuration().clone(), + self.lazy_crc.clone(), + )); + Ok((ChecksumWriteResult::Written, new_snapshot)) + } + Err(Error::FileAlreadyExists(_)) => { + info!( + "Another writer beat us to writing CRC file at {}", + crc_path.location + ); + Ok((ChecksumWriteResult::AlreadyExists, Arc::clone(self))) + } + Err(e) => Err(e), + } + } + + /// Publishes all catalog commits at this table version. Applicable only to catalog-managed + /// tables. This method is a no-op for filesystem-managed tables or if there are no catalog + /// commits to publish. + /// + /// Publishing copies ratified catalog commits to the Delta log as published Delta files, + /// reducing catalog storage requirements and enabling some table maintenance operations, + /// like checkpointing. + /// + /// # Parameters + /// + /// - `engine`: The engine to use for publishing commits + /// + /// # Errors + /// + /// Returns an error if the publish operation fails, or if there are catalog commits that need + /// publishing but the table or committer don't support publishing. + /// + /// # See Also + /// + /// - [`Committer::publish`] + #[instrument(parent = &self.span, name = "snap.publish", skip_all, err)] + pub fn publish( + self: &SnapshotRef, + engine: &dyn Engine, + committer: &dyn Committer, + ) -> DeltaResult { + let unpublished_catalog_commits = self.log_segment().get_unpublished_catalog_commits()?; + + if unpublished_catalog_commits.is_empty() { + return Ok(Arc::clone(self)); + } + + require!( + unpublished_catalog_commits + .windows(2) + .all(|commits| commits[0].version() + 1 == commits[1].version()), + Error::generic(format!( + "Expected ordered and contiguous unpublished catalog commits. \ + Got: {unpublished_catalog_commits:?}" + )) + ); + + require!( + self.table_configuration().is_catalog_managed(), + Error::generic( + "There are catalog commits that need publishing, but the table is not catalog-managed.", + ) + ); + + require!( + committer.is_catalog_committer(), + Error::generic( + "There are catalog commits that need publishing, but the committer is not a catalog committer.", + ) + ); + + let publish_metadata = + PublishMetadata::try_new(self.version(), unpublished_catalog_commits)?; + + committer.publish(engine, publish_metadata)?; + + Ok(Arc::new(Snapshot::new_with_crc( + self.log_segment().new_as_published()?, + self.table_configuration().clone(), + self.lazy_crc.clone(), + ))) + } + + /// Fetch both user-controlled and system-controlled domain metadata for a specific domain + /// in this snapshot. + /// + /// Returns the latest configuration for the domain, or `None` if the domain does not exist + /// (or was removed). Unlike [`Snapshot::get_domain_metadata`], this does not reject `delta.*` domains. + #[allow(unused)] + #[internal_api] + pub(crate) fn get_domain_metadata_internal( + &self, + domain: &str, + engine: &dyn Engine, + ) -> DeltaResult> { + let mut map = self.get_domain_metadatas_internal(engine, Some(&HashSet::from([domain])))?; + Ok(map.remove(domain).map(|dm| dm.configuration().to_owned())) + } + + /// Fetch all non-internal domain metadata for this snapshot as a `Vec`. + /// + /// Internal (`delta.*`) domains are filtered out. 
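+    ///
+    /// Illustrative sketch, assuming `snapshot` and `engine` are already constructed:
+    ///
+    /// ```ignore
+    /// for dm in snapshot.get_all_domain_metadata(&engine)? {
+    ///     println!("{}: {}", dm.domain(), dm.configuration());
+    /// }
+    /// ```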
+ #[allow(unused)] + #[internal_api] + pub(crate) fn get_all_domain_metadata( + &self, + engine: &dyn Engine, + ) -> DeltaResult> { + let all_metadata = self.get_domain_metadatas_internal(engine, None)?; + Ok(all_metadata + .into_values() + .filter(|domain| !domain.is_internal()) + .collect()) } /// Get the In-Commit Timestamp (ICT) for this snapshot. @@ -379,6 +1057,8 @@ impl Snapshot { /// - `Ok(Some(timestamp))` - ICT is enabled and available for this version /// - `Ok(None)` - ICT is not enabled /// - `Err(...)` - ICT is enabled but cannot be read, or enablement version is invalid + #[instrument(parent = &self.span, name = "snap.get_ict", skip_all, err)] + #[internal_api] pub(crate) fn get_in_commit_timestamp(&self, engine: &dyn Engine) -> DeltaResult> { // Get ICT enablement info and check if we should read ICT for this version let enablement = self @@ -404,8 +1084,24 @@ impl Snapshot { } } - // Read the ICT from latest_commit_file - match &self.log_segment.latest_commit_file { + // Fast path: try reading ICT from CRC file (if it is at this snapshot version) + if let Some(crc) = self + .lazy_crc + .get_or_load_if_at_version(engine, self.version()) + { + match crc.in_commit_timestamp_opt { + Some(ict) => return Ok(Some(ict)), + None => { + return Err(Error::generic(format!( + "In-Commit Timestamp not found in CRC file at version {}", + self.version() + ))); + } + } + } + + // Fallback: read the ICT from latest_commit_file + match &self.log_segment.listed.latest_commit_file { Some(commit_file_meta) => { let ict = commit_file_meta.read_in_commit_timestamp(engine)?; Ok(Some(ict)) @@ -413,6 +1109,55 @@ impl Snapshot { None => Err(Error::generic("Last commit file not found in log segment")), } } + + /// Get the timestamp for this snapshot's version, in milliseconds since the Unix epoch. + /// + /// When In-Commit Timestamp (ICT) are enabled, returns the In-Commit Timestamp value. + /// Otherwise, falls back to the filesystem last-modified time of the latest commit file. + /// + /// Returns an error if the commit file is missing, the ICT configuration is invalid, or the + /// ICT value cannot be read. + /// + /// See also [`get_in_commit_timestamp`] for ICT-only semantics. + /// + /// [`get_in_commit_timestamp`]: Self::get_in_commit_timestamp + #[allow(unused)] + #[instrument(parent = &self.span, name = "snap.get_ts", skip_all, err)] + pub fn get_timestamp(&self, engine: &dyn Engine) -> DeltaResult { + match self + .table_configuration() + .in_commit_timestamp_enablement()? + { + InCommitTimestampEnablement::NotEnabled => { + match &self.log_segment.listed.latest_commit_file { + Some(commit_file_meta) => { + let ts = commit_file_meta.location.last_modified; + Ok(ts) + } + None => Err(Error::generic(format!( + "Last commit file not found in log segment for version {} \ + (ICT disabled): cannot read filesystem modification timestamp", + self.version() + ))), + } + } + InCommitTimestampEnablement::Enabled { .. } => self + .get_in_commit_timestamp(engine) + .map_err(|e| { + Error::generic(format!( + "Unable to read in-commit timestamp for version {}: {e}", + self.version() + )) + })? 
+ .ok_or_else(|| { + Error::internal_error(format!( + "Invalid state: version {}, ICT is enabled \ + but get_in_commit_timestamp returned None", + self.version() + )) + }), + } + } } #[cfg(test)] @@ -422,26 +1167,36 @@ mod tests { use std::path::PathBuf; use std::sync::Arc; - use object_store::local::LocalFileSystem; - use object_store::memory::InMemory; - use object_store::path::Path; - use object_store::ObjectStore; + use rstest::rstest; use serde_json::json; use test_utils::{add_commit, delta_path_for_version}; - use crate::actions::Protocol; + use crate::actions::{DomainMetadata, Protocol}; use crate::arrow::array::StringArray; use crate::arrow::record_batch::RecordBatch; + use crate::committer::FileSystemCommitter; use crate::engine::arrow_data::ArrowEngineData; - use crate::engine::default::executor::tokio::TokioBackgroundExecutor; + use crate::engine::default::executor::tokio::{ + TokioBackgroundExecutor, TokioMultiThreadExecutor, + }; use crate::engine::default::filesystem::ObjectStoreStorageHandler; - use crate::engine::default::DefaultEngine; + use crate::engine::default::{DefaultEngine, DefaultEngineBuilder}; use crate::engine::sync::SyncEngine; use crate::last_checkpoint_hint::LastCheckpointHint; - use crate::listed_log_files::ListedLogFiles; use crate::log_segment::LogSegment; + use crate::log_segment_files::LogSegmentFiles; + use crate::object_store::local::LocalFileSystem; + use crate::object_store::memory::InMemory; + use crate::object_store::path::Path; + use crate::object_store::ObjectStore; use crate::parquet::arrow::ArrowWriter; - use crate::path::ParsedLogPath; + use crate::path::{LogPathFileType, ParsedLogPath}; + use crate::schema::{DataType, StructField, StructType}; + use crate::table_features::{ + TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION, + }; + use crate::table_properties::ENABLE_IN_COMMIT_TIMESTAMPS; + use crate::transaction::create_table::create_table; use crate::utils::test_utils::{assert_result_error_with_message, string_array_to_engine_data}; /// Helper function to create a commitInfo action with optional ICT @@ -467,13 +1222,13 @@ mod tests { let mut protocol = json!({ "protocol": { "minReaderVersion": reader_version, - "minWriterVersion": 7, + "minWriterVersion": TABLE_FEATURES_MIN_WRITER_VERSION, "writerFeatures": ["inCommitTimestamp"] } }); - // Only include readerFeatures if minReaderVersion >= 3 - if reader_version >= 3 { + // Only include readerFeatures if minReaderVersion >= table-features minimum. + if reader_version >= TABLE_FEATURES_MIN_READER_VERSION as u32 { protocol["protocol"]["readerFeatures"] = json!([]); } @@ -525,7 +1280,31 @@ mod tests { fn create_basic_commit(ict_enabled: bool, ict_config: Option<(String, String)>) -> String { let protocol = create_protocol(ict_enabled, None); let metadata = create_metadata(None, None, None, ict_config, false); - format!("{}\n{}", protocol, metadata) + format!("{protocol}\n{metadata}") + } + + fn create_snapshot_with_commit_file_absent_from_log_segment( + url: &Url, + table_cfg: TableConfiguration, + ) -> DeltaResult { + // Create a log segment with only checkpoint and no commit file (simulating scenario + // where a checkpoint exists but the commit file has been cleaned up) + let checkpoint_parts = vec![ParsedLogPath::try_from(crate::FileMeta { + location: url.join("_delta_log/00000000000000000000.checkpoint.parquet")?, + last_modified: 0, + size: 100, + })? 
+ .unwrap()]; + + let listed_files = LogSegmentFiles { + checkpoint_parts, + ..Default::default() + }; + + let log_segment = + LogSegment::try_new(listed_files, url.join("_delta_log/")?, Some(0), None)?; + + Ok(Snapshot::new(log_segment, table_cfg)) } #[test] @@ -540,8 +1319,7 @@ mod tests { .build(&engine) .unwrap(); - let expected = - Protocol::try_new(3, 7, Some(["deletionVectors"]), Some(["deletionVectors"])).unwrap(); + let expected = Protocol::try_new_modern(["deletionVectors"], ["deletionVectors"]).unwrap(); assert_eq!(snapshot.table_configuration().protocol(), &expected); let schema_string = r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#; @@ -558,8 +1336,7 @@ mod tests { let engine = SyncEngine::new(); let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); - let expected = - Protocol::try_new(3, 7, Some(["deletionVectors"]), Some(["deletionVectors"])).unwrap(); + let expected = Protocol::try_new_modern(["deletionVectors"], ["deletionVectors"]).unwrap(); assert_eq!(snapshot.table_configuration().protocol(), &expected); let schema_string = r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#; @@ -567,14 +1344,21 @@ mod tests { assert_eq!(snapshot.schema(), expected); } - // TODO: unify this and lots of stuff in LogSegment tests and test_utils - async fn commit(store: &InMemory, version: Version, commit: Vec) { + // TODO: unify this and lots of stuff in LogSegment tests and test_utils. + async fn commit( + table_root: impl AsRef, + store: &InMemory, + version: Version, + commit: Vec, + ) { let commit_data = commit .iter() .map(ToString::to_string) .collect::>() .join("\n"); - add_commit(store, version, commit_data).await.unwrap(); + add_commit(table_root, store, version, commit_data) + .await + .unwrap(); } // interesting cases for testing Snapshot::new_from: @@ -626,15 +1410,15 @@ mod tests { // // in each test we will modify versions 1 and 2 to test different scenarios fn test_new_from(store: Arc) -> DeltaResult<()> { - let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new(store, Arc::new(TokioBackgroundExecutor::new())); - let base_snapshot = Snapshot::builder_for(url.clone()) + let table_root = "memory:///"; + let engine = DefaultEngineBuilder::new(store).build(); + let base_snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; let snapshot = Snapshot::builder_from(base_snapshot.clone()) .at_version(1) .build(&engine)?; - let expected = Snapshot::builder_for(url.clone()) + let expected = Snapshot::builder_for(table_root) .at_version(1) .build(&engine)?; assert_eq!(snapshot, expected); @@ -673,19 +1457,16 @@ mod tests { } }), ]; - commit(store.as_ref(), 0, commit0.clone()).await; + let table_root = "memory:///"; + commit(table_root, store.as_ref(), 0, commit0.clone()).await; // 3. new version > existing version // a. 
no new log segment - let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new( - Arc::new(store.fork()), - Arc::new(TokioBackgroundExecutor::new()), - ); - let base_snapshot = Snapshot::builder_for(url.clone()) + let engine = DefaultEngineBuilder::new(Arc::new(store.fork())).build(); + let base_snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; let snapshot = Snapshot::builder_from(base_snapshot.clone()).build(&engine)?; - let expected = Snapshot::builder_for(url.clone()) + let expected = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; assert_eq!(snapshot, expected); @@ -698,7 +1479,7 @@ mod tests { // b. log segment for old..=new version has a checkpoint (with new protocol/metadata) let store_3a = store.fork(); let mut checkpoint1 = commit0.clone(); - commit(&store_3a, 1, commit0.clone()).await; + commit(table_root, &store_3a, 1, commit0.clone()).await; checkpoint1[1] = json!({ "protocol": { "minReaderVersion": 2, @@ -748,13 +1529,12 @@ mod tests { } }); commit1[2]["partitionColumns"] = serde_json::to_value(["some_partition_column"])?; - commit(store_3c_i.as_ref(), 1, commit1).await; + commit(table_root, store_3c_i.as_ref(), 1, commit1).await; test_new_from(store_3c_i.clone())?; // new commits AND request version > end of log - let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new(store_3c_i, Arc::new(TokioBackgroundExecutor::new())); - let base_snapshot = Snapshot::builder_for(url.clone()) + let engine = DefaultEngineBuilder::new(store_3c_i).build(); + let base_snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; assert!(matches!( @@ -772,7 +1552,7 @@ mod tests { } }); commit1.remove(2); // remove metadata - commit(&store_3c_ii, 1, commit1).await; + commit(table_root, &store_3c_ii, 1, commit1).await; test_new_from(store_3c_ii.into())?; // iii. commits have (no protocol, new metadata) @@ -780,13 +1560,13 @@ mod tests { let mut commit1 = commit0.clone(); commit1[2]["partitionColumns"] = serde_json::to_value(["some_partition_column"])?; commit1.remove(1); // remove protocol - commit(&store_3c_iii, 1, commit1).await; + commit(table_root, &store_3c_iii, 1, commit1).await; test_new_from(store_3c_iii.into())?; // iv. commits have (no protocol, no metadata) let store_3c_iv = store.fork(); let commit1 = vec![commit0[0].clone()]; - commit(&store_3c_iv, 1, commit1).await; + commit(table_root, &store_3c_iv, 1, commit1).await; test_new_from(store_3c_iv.into())?; Ok(()) @@ -796,8 +1576,8 @@ mod tests { #[tokio::test] async fn test_snapshot_new_from_crc() -> Result<(), Box> { let store = Arc::new(InMemory::new()); - let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let table_root = "memory:///"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); let protocol = |reader_version, writer_version| { json!({ "protocol": { @@ -844,11 +1624,13 @@ mod tests { ]; // commit 0 and 1 jsons - commit(&store, 0, commit0.clone()).await; - commit(&store, 1, commit1).await; + commit(table_root, &store, 0, commit0.clone()).await; + commit(table_root, &store, 1, commit1).await; - // a) CRC: old one has 0.crc, no new one (expect 0.crc) - // b) CRC: old one has 0.crc, new one has 1.crc (expect 1.crc) + // Test CRC handling during incremental snapshot update (v0 -> v1). + // The new log listing starts at v1, so the new log segment doesn't find 0.crc. + // a) Only 0.crc exists: resolve_crc falls back to old segment's 0.crc. 
+ // b) Both 0.crc and 1.crc exist: resolve_crc picks up 1.crc. let crc = json!({ "table_size_bytes": 100, "num_files": 1, @@ -863,21 +1645,18 @@ mod tests { store.put(&path, crc.to_string().into()).await?; // base snapshot is at version 0 - let base_snapshot = Snapshot::builder_for(url.clone()) + let base_snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; - // first test: no new crc + // a) only 0.crc exists -- falls back to old segment's 0.crc let snapshot = Snapshot::builder_from(base_snapshot.clone()) .at_version(1) .build(&engine)?; - let expected = Snapshot::builder_for(url.clone()) - .at_version(1) - .build(&engine)?; - assert_eq!(snapshot, expected); assert_eq!( snapshot .log_segment + .listed .latest_crc_file .as_ref() .unwrap() @@ -885,8 +1664,7 @@ mod tests { 0 ); - // second test: new crc - // put the new crc + // b) both 0.crc and 1.crc exist -- resolve_crc picks up 1.crc let path = delta_path_for_version(1, "crc"); let crc = json!({ "table_size_bytes": 100, @@ -900,13 +1678,14 @@ mod tests { let snapshot = Snapshot::builder_from(base_snapshot.clone()) .at_version(1) .build(&engine)?; - let expected = Snapshot::builder_for(url.clone()) + let expected = Snapshot::builder_for(table_root) .at_version(1) .build(&engine)?; assert_eq!(snapshot, expected); assert_eq!( snapshot .log_segment + .listed .latest_crc_file .as_ref() .unwrap() @@ -928,17 +1707,50 @@ mod tests { let store = Arc::new(LocalFileSystem::new()); let executor = Arc::new(TokioBackgroundExecutor::new()); - let storage = ObjectStoreStorageHandler::new(store, executor); + let storage = ObjectStoreStorageHandler::new(store, executor, None); let cp = LastCheckpointHint::try_read(&storage, &url).unwrap(); assert!(cp.is_none()); } - fn valid_last_checkpoint() -> Vec { - r#"{"size":8,"sizeInBytes":21857,"version":1}"#.as_bytes().to_vec() + fn valid_last_checkpoint() -> (Vec, LastCheckpointHint) { + let checkpoint = LastCheckpointHint { + version: 1, + size: 8, + parts: None, + size_in_bytes: Some(21857), + num_of_add_files: None, + checkpoint_schema: None, + checksum: None, + tags: None, + }; + let data = checkpoint.to_json_bytes(); + (data, checkpoint) } - #[test] - fn test_read_table_with_empty_last_checkpoint() { + fn valid_last_checkpoint_with_tags() -> (Vec, LastCheckpointHint) { + use std::collections::HashMap; + + let (_, base_checkpoint) = valid_last_checkpoint(); + + let mut tags = HashMap::new(); + tags.insert( + "author".to_string(), + "test_read_table_with_last_checkpoint".to_string(), + ); + tags.insert("environment".to_string(), "snapshot_tests".to_string()); + tags.insert("created_by".to_string(), "delta-kernel-rs".to_string()); + + let checkpoint = LastCheckpointHint { + tags: Some(tags), + ..base_checkpoint + }; + + let data = checkpoint.to_json_bytes(); + (data, checkpoint) + } + + #[tokio::test] + async fn test_read_table_with_empty_last_checkpoint() { // in memory file system let store = Arc::new(InMemory::new()); @@ -946,63 +1758,52 @@ mod tests { let empty = "{}".as_bytes().to_vec(); let invalid_path = Path::from("invalid/_last_checkpoint"); - tokio::runtime::Runtime::new() - .expect("create tokio runtime") - .block_on(async { - store - .put(&invalid_path, empty.into()) - .await - .expect("put _last_checkpoint"); - }); + store + .put(&invalid_path, empty.into()) + .await + .expect("put _last_checkpoint"); let executor = Arc::new(TokioBackgroundExecutor::new()); - let storage = ObjectStoreStorageHandler::new(store, executor); + let storage = 
ObjectStoreStorageHandler::new(store, executor, None); let url = Url::parse("memory:///invalid/").expect("valid url"); let invalid = LastCheckpointHint::try_read(&storage, &url).expect("read last checkpoint"); assert!(invalid.is_none()) } - #[test] - fn test_read_table_with_last_checkpoint() { + #[tokio::test] + async fn test_read_table_with_last_checkpoint() { // in memory file system let store = Arc::new(InMemory::new()); - // put a valid/invalid _last_checkpoint file - let data = valid_last_checkpoint(); - let invalid_data = "invalid".as_bytes().to_vec(); - let path = Path::from("valid/_last_checkpoint"); - let invalid_path = Path::from("invalid/_last_checkpoint"); + // Define test cases: (path, data, expected_result) + let (data, expected) = valid_last_checkpoint(); + let (data_with_tags, expected_with_tags) = valid_last_checkpoint_with_tags(); + let test_cases = vec![ + ("valid", data, Some(expected)), + ("invalid", "invalid".as_bytes().to_vec(), None), + ("valid_with_tags", data_with_tags, Some(expected_with_tags)), + ]; - tokio::runtime::Runtime::new() - .expect("create tokio runtime") - .block_on(async { - store - .put(&path, data.into()) - .await - .expect("put _last_checkpoint"); - store - .put(&invalid_path, invalid_data.into()) - .await - .expect("put _last_checkpoint"); - }); + // Write all test files to the in memory file system + for (path_prefix, data, _) in &test_cases { + let path = Path::from(format!("{path_prefix}/_last_checkpoint")); + store + .put(&path, data.clone().into()) + .await + .expect("put _last_checkpoint"); + } let executor = Arc::new(TokioBackgroundExecutor::new()); - let storage = ObjectStoreStorageHandler::new(store, executor); - let url = Url::parse("memory:///valid/").expect("valid url"); - let valid = LastCheckpointHint::try_read(&storage, &url).expect("read last checkpoint"); - let url = Url::parse("memory:///invalid/").expect("valid url"); - let invalid = LastCheckpointHint::try_read(&storage, &url).expect("read last checkpoint"); - let expected = LastCheckpointHint { - version: 1, - size: 8, - parts: None, - size_in_bytes: Some(21857), - num_of_add_files: None, - checkpoint_schema: None, - checksum: None, - }; - assert_eq!(valid.unwrap(), expected); - assert!(invalid.is_none()); + let storage = ObjectStoreStorageHandler::new(store, executor, None); + + // Test reading all checkpoints from the in memory file system for cases where the data is valid, invalid and + // valid with tags. 
+ for (path_prefix, _, expected_result) in test_cases { + let url = Url::parse(&format!("memory:///{path_prefix}/")).expect("valid url"); + let result = + LastCheckpointHint::try_read(&storage, &url).expect("read last checkpoint"); + assert_eq!(result, expected_result); + } } #[test_log::test] @@ -1015,18 +1816,22 @@ mod tests { let engine = SyncEngine::new(); let snapshot = Snapshot::builder_for(location).build(&engine).unwrap(); - assert_eq!(snapshot.log_segment.checkpoint_parts.len(), 1); + assert_eq!(snapshot.log_segment.listed.checkpoint_parts.len(), 1); assert_eq!( - ParsedLogPath::try_from(snapshot.log_segment.checkpoint_parts[0].location.clone()) - .unwrap() - .unwrap() - .version, + ParsedLogPath::try_from( + snapshot.log_segment.listed.checkpoint_parts[0] + .location + .clone() + ) + .unwrap() + .unwrap() + .version, 2, ); - assert_eq!(snapshot.log_segment.ascending_commit_files.len(), 1); + assert_eq!(snapshot.log_segment.listed.ascending_commit_files.len(), 1); assert_eq!( ParsedLogPath::try_from( - snapshot.log_segment.ascending_commit_files[0] + snapshot.log_segment.listed.ascending_commit_files[0] .location .clone() ) @@ -1039,9 +1844,9 @@ mod tests { #[tokio::test] async fn test_domain_metadata() -> DeltaResult<()> { - let url = Url::parse("memory:///")?; + let table_root = "memory:///test_table/"; let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // commit0 // - domain1: not removed @@ -1087,7 +1892,9 @@ mod tests { ] .map(|json| json.to_string()) .join("\n"); - add_commit(store.clone().as_ref(), 0, commit).await.unwrap(); + add_commit(table_root, store.clone().as_ref(), 0, commit) + .await + .unwrap(); // commit1 // - domain1: removed @@ -1118,9 +1925,13 @@ mod tests { ] .map(|json| json.to_string()) .join("\n"); - add_commit(store.as_ref(), 1, commit).await.unwrap(); + add_commit(table_root, store.as_ref(), 1, commit) + .await + .unwrap(); + + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; - let snapshot = Snapshot::builder_for(url.clone()).build(&engine)?; + // Test get_domain_metadata assert_eq!(snapshot.get_domain_metadata("domain1", &engine)?, None); assert_eq!( @@ -1136,6 +1947,25 @@ mod tests { .unwrap_err(); assert!(matches!(err, Error::Generic(msg) if msg == "User DomainMetadata are not allowed to use system-controlled 'delta.*' domain")); + + // Test get_domain_metadata_internal + assert_eq!( + snapshot.get_domain_metadata_internal("delta.domain3", &engine)?, + Some("domain3_commit1".to_string()) + ); + + // Test get_all_domain_metadata + let mut metadata = snapshot.get_all_domain_metadata(&engine)?; + metadata.sort_by(|a, b| a.domain().cmp(b.domain())); + + let mut expected = vec![ + DomainMetadata::new("domain2".to_string(), "domain2_commit1".to_string()), + DomainMetadata::new("domain3".to_string(), "domain3_commit0".to_string()), + ]; + expected.sort_by(|a, b| a.domain().cmp(b.domain())); + + assert_eq!(metadata, expected); + Ok(()) } @@ -1168,14 +1998,14 @@ mod tests { #[tokio::test] async fn test_timestamp_with_ict_disabled() -> Result<(), Box> { let store = Arc::new(InMemory::new()); - let url = url::Url::parse("memory://test/")?; - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let table_root = "memory://test/"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create a basic commit without ICT enabled let commit0 = 
create_basic_commit(false, None); - add_commit(store.as_ref(), 0, commit0).await?; + add_commit(table_root, store.as_ref(), 0, commit0).await?; - let snapshot = Snapshot::builder_for(url).build(&engine)?; + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; // When ICT is disabled, get_timestamp should return None let result = snapshot.get_in_commit_timestamp(&engine)?; @@ -1188,27 +2018,27 @@ mod tests { async fn test_timestamp_with_ict_enablement_timeline() -> Result<(), Box> { let store = Arc::new(InMemory::new()); - let url = url::Url::parse("memory://test/")?; - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let table_root = "memory://test/"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create initial commit without ICT let commit0 = create_basic_commit(false, None); - add_commit(store.as_ref(), 0, commit0).await?; + add_commit(table_root, store.as_ref(), 0, commit0).await?; // Create commit that enables ICT (version 1 = enablement version) let commit1 = create_basic_commit(true, Some(("1".to_string(), "1587968586154".to_string()))); - add_commit(store.as_ref(), 1, commit1).await?; + add_commit(table_root, store.as_ref(), 1, commit1).await?; // Create commit with ICT enabled let expected_timestamp = 1587968586200i64; let commit2 = format!( r#"{{"commitInfo":{{"timestamp":1587968586154,"inCommitTimestamp":{expected_timestamp},"operation":"WRITE"}}}}"#, ); - add_commit(store.as_ref(), 2, commit2.to_string()).await?; + add_commit(table_root, store.as_ref(), 2, commit2.to_string()).await?; // Read snapshot at version 0 (before ICT enablement) - let snapshot_v0 = Snapshot::builder_for(url.clone()) + let snapshot_v0 = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; // This snapshot version predates ICT enablement, so ICT is not available @@ -1216,7 +2046,9 @@ mod tests { assert_eq!(result_v0, None); // Read snapshot at version 2 (after ICT enabled) - let snapshot_v2 = Snapshot::builder_for(url).at_version(2).build(&engine)?; + let snapshot_v2 = Snapshot::builder_for(table_root) + .at_version(2) + .build(&engine)?; // When ICT is enabled and available, timestamp() should return inCommitTimestamp let result_v2 = snapshot_v2.get_in_commit_timestamp(&engine)?; assert_eq!(result_v2, Some(expected_timestamp)); @@ -1227,15 +2059,15 @@ mod tests { #[tokio::test] async fn test_get_timestamp_enablement_version_in_future() -> DeltaResult<()> { // Test invalid state where snapshot has enablement version in the future - should error - let url = Url::parse("memory:///table2")?; + let table_root = "memory:///test_table/"; let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); let commit_data = [ json!({ "protocol": { - "minReaderVersion": 3, - "minWriterVersion": 7, + "minReaderVersion": TABLE_FEATURES_MIN_READER_VERSION, + "minWriterVersion": TABLE_FEATURES_MIN_WRITER_VERSION, "readerFeatures": [], "writerFeatures": ["inCommitTimestamp"] } @@ -1255,13 +2087,15 @@ mod tests { } }), ]; - commit(store.as_ref(), 0, commit_data.to_vec()).await; + commit(table_root, store.as_ref(), 0, commit_data.to_vec()).await; // Create commit that predates ICT enablement (no inCommitTimestamp) let commit_predates = [create_commit_info(1234567890, None)]; - commit(store.as_ref(), 1, commit_predates.to_vec()).await; + commit(table_root, store.as_ref(), 1, 
commit_predates.to_vec()).await; - let snapshot_predates = Snapshot::builder_for(url).at_version(1).build(&engine)?; + let snapshot_predates = Snapshot::builder_for(table_root) + .at_version(1) + .build(&engine)?; let result_predates = snapshot_predates.get_in_commit_timestamp(&engine); // Version 1 with enablement at version 5 is invalid - should error @@ -1276,12 +2110,12 @@ mod tests { #[tokio::test] async fn test_get_timestamp_missing_ict_when_enabled() -> DeltaResult<()> { // Test missing ICT when it should be present - should error - let url = Url::parse("memory:///table3")?; + let table_root = "memory:///test_table/"; let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); let commit_data = [ - create_protocol(true, Some(3)), + create_protocol(true, Some(TABLE_FEATURES_MIN_READER_VERSION as u32)), create_metadata( Some("test_id"), Some("{\"type\":\"struct\",\"fields\":[]}"), @@ -1290,13 +2124,15 @@ mod tests { false, ), ]; - commit(store.as_ref(), 0, commit_data.to_vec()).await; // ICT enabled from version 0 + commit(table_root, store.as_ref(), 0, commit_data.to_vec()).await; // ICT enabled from version 0 // Create commit without ICT despite being enabled (corrupt case) let commit_missing_ict = [create_commit_info(1234567890, None)]; - commit(store.as_ref(), 1, commit_missing_ict.to_vec()).await; + commit(table_root, store.as_ref(), 1, commit_missing_ict.to_vec()).await; - let snapshot_missing = Snapshot::builder_for(url).at_version(1).build(&engine)?; + let snapshot_missing = Snapshot::builder_for(table_root) + .at_version(1) + .build(&engine)?; let result = snapshot_missing.get_in_commit_timestamp(&engine); assert_result_error_with_message(result, "In-Commit Timestamp not found"); @@ -1308,13 +2144,13 @@ mod tests { // When ICT is enabled but commit file is not found in log segment, // get_in_commit_timestamp should return an error - let url = Url::parse("memory:///missing_commit_test")?; + let url = Url::parse("memory:///")?; let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create initial commit with ICT enabled let commit_data = [ - create_protocol(true, Some(3)), + create_protocol(true, Some(TABLE_FEATURES_MIN_READER_VERSION as u32)), create_metadata( Some("test_id"), Some("{\"type\":\"struct\",\"fields\":[]}"), @@ -1323,35 +2159,17 @@ mod tests { false, ), ]; - commit(store.as_ref(), 0, commit_data.to_vec()).await; + commit(url.as_str(), store.as_ref(), 0, commit_data.to_vec()).await; // Build snapshot to get table configuration - let snapshot = Snapshot::builder_for(url.clone()) + let snapshot = Snapshot::builder_for(url.as_str()) .at_version(0) .build(&engine)?; - // Create a log segment with only checkpoint and no commit file (simulating scenario - // where a checkpoint exists but the commit file has been cleaned up) - let checkpoint_parts = vec![ParsedLogPath::try_from(crate::FileMeta { - location: url.join("_delta_log/00000000000000000000.checkpoint.parquet")?, - last_modified: 0, - size: 100, - })? 
- .unwrap()]; - - let listed_files = ListedLogFiles { - ascending_commit_files: vec![], - ascending_compaction_files: vec![], - checkpoint_parts, - latest_crc_file: None, - latest_commit_file: None, // No commit file - }; - - let log_segment = LogSegment::try_new(listed_files, url.join("_delta_log/")?, Some(0))?; - let table_config = snapshot.table_configuration().clone(); - - // Create snapshot without commit file in log segment - let snapshot_no_commit = Snapshot::new(log_segment, table_config); + let snapshot_no_commit = create_snapshot_with_commit_file_absent_from_log_segment( + &url, + snapshot.table_configuration().clone(), + )?; // Should return an error when commit file is missing let result = snapshot_no_commit.get_in_commit_timestamp(&engine); @@ -1363,14 +2181,14 @@ mod tests { #[tokio::test] async fn test_get_timestamp_with_checkpoint_and_commit_same_version() -> DeltaResult<()> { // Test the scenario where both checkpoint and commit exist at the same version with ICT enabled. - let url = Url::parse("memory:///checkpoint_commit_test")?; + let table_root = "memory:///test_table/"; let store = Arc::new(InMemory::new()); - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create 00000000000000000000.json with ICT enabled let commit0_data = [ create_commit_info(1587968586154, None), - create_protocol(true, Some(3)), + create_protocol(true, Some(TABLE_FEATURES_MIN_READER_VERSION as u32)), create_metadata( Some("5fba94ed-9794-4965-ba6e-6ee3c0d22af9"), Some("{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"val\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}"), @@ -1379,12 +2197,12 @@ mod tests { false, ), ]; - commit(store.as_ref(), 0, commit0_data.to_vec()).await; + commit(table_root, store.as_ref(), 0, commit0_data.to_vec()).await; // Create 00000000000000000001.checkpoint.parquet let checkpoint_data = [ create_commit_info(1587968586154, None), - create_protocol(true, Some(3)), + create_protocol(true, Some(TABLE_FEATURES_MIN_READER_VERSION as u32)), create_metadata( Some("5fba94ed-9794-4965-ba6e-6ee3c0d22af9"), Some("{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"val\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}"), @@ -1418,10 +2236,12 @@ mod tests { // Create 00000000000000000001.json with ICT let expected_ict = 1587968586200i64; let commit1_data = [create_commit_info(1587968586200, Some(expected_ict))]; - commit(store.as_ref(), 1, commit1_data.to_vec()).await; + commit(table_root, store.as_ref(), 1, commit1_data.to_vec()).await; // Build snapshot - LogSegment will filter out the commit file because checkpoint exists at same version - let snapshot = Snapshot::builder_for(url).at_version(1).build(&engine)?; + let snapshot = Snapshot::builder_for(table_root) + .at_version(1) + .build(&engine)?; // We should successfully read ICT by falling back to storage let timestamp = snapshot.get_in_commit_timestamp(&engine)?; @@ -1430,11 +2250,132 @@ mod tests { Ok(()) } + #[rstest] + #[case::ict_disabled(false)] + #[case::ict_enabled(true)] + fn test_get_timestamp_returns_valid_timestamp(#[case] ict_enabled: bool) -> DeltaResult<()> { + let temp_dir = tempfile::tempdir().unwrap(); + let table_path = Url::from_directory_path(temp_dir.path()) + .unwrap() + .to_string(); + let store = Arc::new(LocalFileSystem::new()); + let engine 
= DefaultEngineBuilder::new(store).build(); + + let schema = Arc::new(StructType::try_new(vec![StructField::new( + "id", + DataType::INTEGER, + true, + )])?); + + let mut create_table_builder = create_table(&table_path, schema, "Test/1.0"); + if ict_enabled { + create_table_builder = create_table_builder + .with_table_properties(vec![(ENABLE_IN_COMMIT_TIMESTAMPS, "true")]); + } + + let _ = create_table_builder + .build(&engine, Box::new(FileSystemCommitter::new()))? + .commit(&engine)?; + + let snapshot = Snapshot::builder_for(&table_path).build(&engine)?; + let ts = snapshot.get_timestamp(&engine)?; + let now_ms = chrono::Utc::now().timestamp_millis(); + let two_days_ms = 2 * 24 * 60 * 60 * 1000_i64; + assert!( + (now_ms - two_days_ms..=now_ms).contains(&ts), + "timestamp {ts} not within 2 days of now ({now_ms})" + ); + + if ict_enabled { + let ict_ts = snapshot.get_in_commit_timestamp(&engine)?.unwrap(); + assert_eq!(ts, ict_ts); + } + Ok(()) + } + + #[rstest] + #[case::ict_enabled(true)] + #[case::ict_disabled(false)] + #[tokio::test] + async fn test_get_timestamp_errors_when_commit_file_missing( + #[case] ict_enabled: bool, + ) -> DeltaResult<()> { + let url = Url::parse("memory:///")?; + let store = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + // TODO: refactor `ict_config` from a raw tuple to a dedicated ICTConfig struct so the + // enablement version and enablement timestamp fields are named and self-documenting. + // The ict_config tuple is (inCommitTimestampEnablementVersion, inCommitTimestampEnablementTimestamp): + // if ICT is enabled, the enablement version is 0 with an arbitrary enablement timestamp. + let ict_config = ict_enabled.then(|| ("0".to_string(), "1612345678".to_string())); + let reader_version = ict_enabled.then_some(TABLE_FEATURES_MIN_READER_VERSION as u32); + + let mut commit_data = vec![]; + // When ICT is enabled, commitInfo must be the first action (protocol requirement) + if ict_enabled { + commit_data.push(create_commit_info(1677811175819, Some(1677811175999))); + } + commit_data.extend([ + create_protocol(ict_enabled, reader_version), + create_metadata( + Some("test_id"), + Some("{\"type\":\"struct\",\"fields\":[]}"), + Some(1677811175819), + ict_config, + false, + ), + ]); + commit(url.as_str(), store.as_ref(), 0, commit_data).await; + + let snapshot = Snapshot::builder_for(url.as_str()) + .at_version(0) + .build(&engine)?; + + let snapshot_no_commit = create_snapshot_with_commit_file_absent_from_log_segment( + &url, + snapshot.table_configuration().clone(), + )?; + + let result = snapshot_no_commit.get_timestamp(&engine); + assert_result_error_with_message(result, "Last commit file not found in log segment"); + + Ok(()) + } + + #[tokio::test] + async fn test_get_timestamp_errors_when_ict_missing_from_commit_info() -> DeltaResult<()> { + // ICT is enabled and commit file IS present in the log segment, but the commitInfo + // action does not carry an inCommitTimestamp value (corrupt/incomplete commit). 
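// A minimal sketch of the two commitInfo shapes the ICT tests above and below exercise,
// assuming only serde_json; `sketch_commit_info` is a hypothetical helper, not the kernel's
// `create_commit_info` test utility.
fn sketch_commit_info(timestamp: i64, in_commit_timestamp: Option<i64>) -> serde_json::Value {
    match in_commit_timestamp {
        // With ICT enabled, writers record `inCommitTimestamp` alongside the wall-clock timestamp.
        Some(ict) => serde_json::json!({
            "commitInfo": {"timestamp": timestamp, "inCommitTimestamp": ict, "operation": "WRITE"}
        }),
        // Without it (the corrupt/incomplete case), reading the timestamp of an ICT-enabled
        // table errors, as the test below asserts.
        None => serde_json::json!({
            "commitInfo": {"timestamp": timestamp, "operation": "WRITE"}
        }),
    }
}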
+ let store = Arc::new(InMemory::new()); + let table_root = "memory:///test_table/"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + let commit0_data = vec![ + create_commit_info(1677811175819, None), // commitInfo without inCommitTimestamp + create_protocol(true, Some(TABLE_FEATURES_MIN_READER_VERSION as u32)), + create_metadata( + Some("test_id"), + Some("{\"type\":\"struct\",\"fields\":[]}"), + Some(1677811175819), + Some(("0".to_string(), "1612345678".to_string())), // ict enabled at version 0, and an arbitrary timestamp + false, + ), + ]; + commit(table_root, store.as_ref(), 0, commit0_data).await; + + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; + let result = snapshot.get_timestamp(&engine); + assert_result_error_with_message(result, "In-Commit Timestamp not found in commit file"); + + Ok(()) + } + #[tokio::test] async fn test_try_new_from_empty_log_tail() -> DeltaResult<()> { let store = Arc::new(InMemory::new()); - let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let table_root = "memory:///test_table/"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create initial commit let commit0 = vec![ @@ -1455,14 +2396,20 @@ mod tests { } }), ]; - commit(store.as_ref(), 0, commit0).await; + commit(table_root, store.as_ref(), 0, commit0).await; - let base_snapshot = Snapshot::builder_for(url.clone()) + let base_snapshot = Snapshot::builder_for(table_root) .at_version(0) .build(&engine)?; // Test with empty log tail - should return same snapshot - let result = Snapshot::try_new_from(base_snapshot.clone(), vec![], &engine, None)?; + let result = Snapshot::try_new_from_impl( + base_snapshot.clone(), + vec![], + &engine, + None, + MetricId::default(), + )?; assert_eq!(result, base_snapshot); Ok(()) @@ -1472,7 +2419,7 @@ mod tests { async fn test_try_new_from_latest_commit_preservation() -> DeltaResult<()> { let store = Arc::new(InMemory::new()); let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create commits 0-2 let base_commit = vec![ @@ -1489,21 +2436,23 @@ mod tests { }), ]; - commit(store.as_ref(), 0, base_commit.clone()).await; + commit(url.as_str(), store.as_ref(), 0, base_commit.clone()).await; commit( + url.as_str(), store.as_ref(), 1, vec![json!({"commitInfo": {"timestamp": 1234}})], ) .await; commit( + url.as_str(), store.as_ref(), 2, vec![json!({"commitInfo": {"timestamp": 5678}})], ) .await; - let base_snapshot = Snapshot::builder_for(url.clone()) + let base_snapshot = Snapshot::builder_for(url.as_str()) .at_version(1) .build(&engine)?; @@ -1511,6 +2460,7 @@ mod tests { assert_eq!( base_snapshot .log_segment + .listed .latest_commit_file .as_ref() .map(|f| f.version), @@ -1528,14 +2478,20 @@ mod tests { .ok_or_else(|| Error::Generic("Failed to parse log path".to_string()))?; let log_tail = vec![parsed_path]; - // Create new snapshot from base to version 2 using try_new_from directly - let new_snapshot = - Snapshot::try_new_from(base_snapshot.clone(), log_tail, &engine, Some(2))?; + // Create new snapshot from base to version 2 using try_new_from_impl directly + let new_snapshot = Snapshot::try_new_from_impl( + base_snapshot.clone(), + log_tail, + &engine, + Some(2), + MetricId::default(), + )?; // Latest commit should now be version 2 assert_eq!( new_snapshot .log_segment + .listed 
.latest_commit_file .as_ref() .map(|f| f.version), @@ -1548,8 +2504,8 @@ mod tests { #[tokio::test] async fn test_try_new_from_version_boundary_cases() -> DeltaResult<()> { let store = Arc::new(InMemory::new()); - let url = Url::parse("memory:///")?; - let engine = DefaultEngine::new(store.clone(), Arc::new(TokioBackgroundExecutor::new())); + let table_root = "memory:///test_table/"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); // Create commits let base_commit = vec![ @@ -1566,24 +2522,37 @@ mod tests { }), ]; - commit(store.as_ref(), 0, base_commit).await; + commit(table_root, store.as_ref(), 0, base_commit).await; commit( + table_root, store.as_ref(), 1, vec![json!({"commitInfo": {"timestamp": 1234}})], ) .await; - let base_snapshot = Snapshot::builder_for(url.clone()) + let base_snapshot = Snapshot::builder_for(table_root) .at_version(1) .build(&engine)?; // Test requesting same version - should return same snapshot - let same_version = Snapshot::try_new_from(base_snapshot.clone(), vec![], &engine, Some(1))?; + let same_version = Snapshot::try_new_from_impl( + base_snapshot.clone(), + vec![], + &engine, + Some(1), + MetricId::default(), + )?; assert!(Arc::ptr_eq(&same_version, &base_snapshot)); // Test requesting older version - should error - let older_version = Snapshot::try_new_from(base_snapshot.clone(), vec![], &engine, Some(0)); + let older_version = Snapshot::try_new_from_impl( + base_snapshot.clone(), + vec![], + &engine, + Some(0), + MetricId::default(), + ); assert!(matches!( older_version, Err(Error::Generic(msg)) if msg.contains("older than snapshot hint version") @@ -1591,4 +2560,409 @@ mod tests { Ok(()) } + + #[test] + fn test_new_post_commit_simple() { + // GIVEN + let path = std::fs::canonicalize(PathBuf::from("./tests/data/basic_partitioned/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let engine = SyncEngine::new(); + let base_snapshot = Snapshot::builder_for(url.clone()).build(&engine).unwrap(); + let next_version = base_snapshot.version() + 1; + + // WHEN + let fake_new_commit = ParsedLogPath::create_parsed_published_commit(&url, next_version); + let post_commit_snapshot = base_snapshot + .new_post_commit(fake_new_commit, CrcDelta::default()) + .unwrap(); + + // THEN + assert_eq!(post_commit_snapshot.version(), next_version); + assert_eq!(post_commit_snapshot.log_segment().end_version, next_version); + } + + // Helper: create a minimal test table with commits 0-N + async fn setup_test_table_with_commits( + table_root: impl AsRef, + store: &InMemory, + num_commits: u64, + ) -> DeltaResult<()> { + // Commit 0: protocol + metadata + first file + let commit0 = vec![ + json!({"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}}), + json!({ + "metaData": { + "id": "test-id", + "format": {"provider": "parquet", "options": {}}, + "schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 1587968585495i64 + } + }), + json!({"add": {"path": "file1.parquet", "partitionValues": {}, "size": 100, "modificationTime": 1000, "dataChange": true}}), + ]; + commit(table_root.as_ref(), store, 0, commit0).await; + + // Additional commits with just add actions + for i in 1..num_commits { + let commit_i = vec![json!({ + "add": { + "path": format!("file{}.parquet", i + 1), + "partitionValues": {}, + "size": (i + 1) * 100, + "modificationTime": (i + 1) * 1000, + "dataChange": true + } + })]; + 
commit(table_root.as_ref(), store, i, commit_i).await; + } + Ok(()) + } + + // Helper: write a compaction file + async fn write_compaction_file(store: &InMemory, start: u64, end: u64) -> DeltaResult<()> { + let content = r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#; + store + .put( + &test_utils::compacted_log_path_for_versions(start, end, "json"), + content.into(), + ) + .await?; + Ok(()) + } + + struct IncrementalSnapshotTestContext { + store: Arc, + url: Url, + engine: Arc>, + } + + fn setup_incremental_snapshot_test() -> DeltaResult { + let store = Arc::new(InMemory::new()); + let url = Url::parse("memory:///")?; + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(), + ); + + Ok(IncrementalSnapshotTestContext { store, url, engine }) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_incremental_snapshot_picks_up_checkpoint_written_at_current_version( + ) -> DeltaResult<()> { + let ctx = setup_incremental_snapshot_test()?; + + setup_test_table_with_commits(ctx.url.as_str(), &ctx.store, 2).await?; + + let snapshot_v1 = Snapshot::builder_for(ctx.url.as_str()) + .at_version(1) + .build(ctx.engine.as_ref())?; + assert_eq!(snapshot_v1.log_segment.checkpoint_version, None); + + snapshot_v1.clone().checkpoint(ctx.engine.as_ref())?; + + let fresh = Snapshot::builder_for(ctx.url.as_str()).build(ctx.engine.as_ref())?; + assert_eq!(fresh.version(), 1); + assert_eq!(fresh.log_segment.checkpoint_version, Some(1)); + + let updated = Snapshot::builder_from(snapshot_v1).build(ctx.engine.as_ref())?; + assert_eq!(updated.version(), 1); + assert_eq!(updated.log_segment.checkpoint_version, Some(1)); + assert_eq!(updated, fresh); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_incremental_snapshot_picks_up_newer_checkpoint_below_current_version( + ) -> DeltaResult<()> { + let ctx = setup_incremental_snapshot_test()?; + + setup_test_table_with_commits(ctx.url.as_str(), &ctx.store, 4).await?; + + Snapshot::builder_for(ctx.url.as_str()) + .at_version(1) + .build(ctx.engine.as_ref())? + .checkpoint(ctx.engine.as_ref())?; + + let snapshot_v3 = Snapshot::builder_for(ctx.url.as_str()) + .at_version(3) + .build(ctx.engine.as_ref())?; + assert_eq!(snapshot_v3.log_segment.checkpoint_version, Some(1)); + + Snapshot::builder_for(ctx.url.as_str()) + .at_version(2) + .build(ctx.engine.as_ref())? 
+ .checkpoint(ctx.engine.as_ref())?; + + let fresh = Snapshot::builder_for(ctx.url.as_str()).build(ctx.engine.as_ref())?; + assert_eq!(fresh.version(), 3); + assert_eq!(fresh.log_segment.checkpoint_version, Some(2)); + + let updated = Snapshot::builder_from(snapshot_v3).build(ctx.engine.as_ref())?; + assert_eq!(updated.version(), 3); + assert_eq!(updated.log_segment.checkpoint_version, Some(2)); + assert_eq!(updated, fresh); + + Ok(()) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_explicit_same_version_request_keeps_existing_snapshot_after_checkpoint_write( + ) -> DeltaResult<()> { + let ctx = setup_incremental_snapshot_test()?; + + setup_test_table_with_commits(ctx.url.as_str(), &ctx.store, 2).await?; + + let snapshot_v1 = Snapshot::builder_for(ctx.url.as_str()) + .at_version(1) + .build(ctx.engine.as_ref())?; + assert_eq!(snapshot_v1.log_segment.checkpoint_version, None); + + snapshot_v1.clone().checkpoint(ctx.engine.as_ref())?; + + let refreshed = Snapshot::builder_for(ctx.url.as_str()).build(ctx.engine.as_ref())?; + assert_eq!(refreshed.log_segment.checkpoint_version, Some(1)); + + let pinned = Snapshot::builder_from(snapshot_v1.clone()) + .at_version(1) + .build(ctx.engine.as_ref())?; + assert!(Arc::ptr_eq(&pinned, &snapshot_v1)); + assert_eq!(pinned.log_segment.checkpoint_version, None); + + Ok(()) + } + /// The incremental snapshot path (try_new_from_impl) re-lists files from the checkpoint + /// version onwards. We must ensure that it deduplicates compaction files, since producing + /// duplicates violated the sort invariant in LogSegmentFilesBuilder::build(). + #[tokio::test] + async fn test_incremental_snapshot_with_compaction_files() -> DeltaResult<()> { + let store = Arc::new(InMemory::new()); + let table_root = "memory:///"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + // Create commits 0-3 and compaction files (1,1) and (1,2) + setup_test_table_with_commits(table_root, &store, 3).await?; + write_compaction_file(&store, 1, 1).await?; + write_compaction_file(&store, 1, 2).await?; + + // Build snapshot at v2 (includes both compaction files) + let snapshot_v2 = Snapshot::builder_for(table_root) + .at_version(2) + .build(&engine)?; + assert_eq!( + snapshot_v2 + .log_segment + .listed + .ascending_compaction_files + .len(), + 2 + ); + + // Add commit 3 + commit( + table_root, + &store, + 3, + vec![json!({"add": {"path": "file4.parquet", "partitionValues": {}, "size": 400, "modificationTime": 4000, "dataChange": true}})], + ) + .await; + + // Build v3 incrementally - before the fix, this panicked due to duplicate compaction files + let snapshot_v3 = Snapshot::builder_from(snapshot_v2) + .at_version(3) + .build(&engine)?; + + assert_eq!(snapshot_v3.version(), 3); + assert_eq!( + snapshot_v3 + .log_segment + .listed + .ascending_compaction_files + .len(), + 2 + ); + + Ok(()) + } + + /// This test documents a limitation: When deduplicating compactions, the deduplication logic + /// only checks the start version (lo), not the hi version. So a new compaction file (1,3) + /// added after building the base snapshot at v2 gets filtered out because its start version + /// (1) <= old_version (2). 
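// An illustrative restatement (not the kernel's implementation) of the start-version-only
// deduplication described above: a newly listed compaction file survives the incremental merge
// only if its low version is greater than the base snapshot's version; its high version is
// never consulted.
fn dedup_new_compactions(newly_listed: &[(u64, u64)], old_version: u64) -> Vec<(u64, u64)> {
    newly_listed
        .iter()
        .copied()
        .filter(|(lo, _hi)| *lo > old_version) // only the start (lo) version is checked
        .collect()
}
// With old_version = 2, a late-arriving (1, 3) compaction is dropped:
// dedup_new_compactions(&[(1, 3)], 2) == vec![], leaving only the base snapshot's (1, 2) and (2, 2).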
+ #[tokio::test] + async fn test_incremental_snapshot_with_new_compaction_files() -> DeltaResult<()> { + let store = Arc::new(InMemory::new()); + let table_root = "memory:///"; + let engine = DefaultEngineBuilder::new(store.clone()).build(); + + // Create commits 0-3 and compaction files (1,2) and (2,2) + setup_test_table_with_commits(table_root, &store, 4).await?; + write_compaction_file(&store, 1, 2).await?; + write_compaction_file(&store, 2, 2).await?; + + // Build snapshot at v2 + let snapshot_v2 = Snapshot::builder_for(table_root) + .at_version(2) + .build(&engine)?; + assert_eq!( + snapshot_v2 + .log_segment + .listed + .ascending_compaction_files + .len(), + 2 + ); + + // Add new compaction file (1,3) after building the base snapshot + write_compaction_file(&store, 1, 3).await?; + + // Build v3 incrementally - the new (1,3) file gets filtered out because + // the deduplication only looks at start version: 1 <= old_version (2) + let snapshot_v3 = Snapshot::builder_from(snapshot_v2) + .at_version(3) + .build(&engine)?; + + assert_eq!(snapshot_v3.version(), 3); + assert_eq!( + snapshot_v3 + .log_segment + .listed + .ascending_compaction_files + .len(), + 2 + ); + + // Verify we still have the original (1,2) and (2,2) files + let versions_and_his: Vec<_> = snapshot_v3 + .log_segment + .listed + .ascending_compaction_files + .iter() + .map(|p| match p.file_type { + LogPathFileType::CompactedCommit { hi } => (p.version, hi), + _ => panic!("Expected CompactedCommit"), + }) + .collect(); + assert_eq!(versions_and_his, vec![(1, 2), (2, 2)]); + + Ok(()) + } + + #[test] + fn test_get_protocol_derived_properties() { + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + + let engine = SyncEngine::new(); + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + + let props = snapshot.get_protocol_derived_properties(); + assert_eq!( + props.get("delta.minReaderVersion").unwrap(), + &TABLE_FEATURES_MIN_READER_VERSION.to_string() + ); + assert_eq!( + props.get("delta.minWriterVersion").unwrap(), + &TABLE_FEATURES_MIN_WRITER_VERSION.to_string() + ); + assert_eq!( + props.get("delta.feature.deletionVectors").unwrap(), + "supported" + ); + } + + #[tokio::test] + async fn test_metadata_configuration() { + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + let engine = DefaultEngineBuilder::new(storage.clone()).build(); + + // Create a commit with custom configuration + let actions = vec![ + json!({"commitInfo": {"timestamp": 123, "operation": "CREATE TABLE"}}), + json!({"protocol": { + "minReaderVersion": 3, + "minWriterVersion": 7, + "readerFeatures": [], + "writerFeatures": [] + }}), + json!({"metaData": { + "id": "test-id", + "format": {"provider": "parquet", "options": {}}, + "schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":false,\"metadata\":{}}]}", + "partitionColumns": [], + "configuration": { + "io.unitycatalog.tableId": "abc-123", + "myapp.setting": "value" + }, + "createdTime": 1234567890 + }}), + ]; + commit(table_root, &storage, 0, actions).await; + + let snapshot = Snapshot::builder_for(table_root).build(&engine).unwrap(); + let config = snapshot.metadata_configuration(); + assert_eq!( + config.get("io.unitycatalog.tableId"), + Some(&"abc-123".to_string()) + ); + assert_eq!(config.get("myapp.setting"), Some(&"value".to_string())); + } + + #[rstest::rstest] + #[case::no_clustering(None, None, None)] 
+ #[case::clustered_no_column_mapping( + Some(vec!["region"]), + None, + Some(vec![ColumnName::new(["region"])]) + )] + #[case::clustered_with_column_mapping( + Some(vec!["region"]), + Some("name"), + Some(vec![ColumnName::new(["region"])]) + )] + fn test_get_logical_clustering_columns( + #[case] clustering_cols: Option>, + #[case] column_mapping_mode: Option<&str>, + #[case] expected: Option>, + ) { + use crate::transaction::create_table::create_table; + use crate::transaction::data_layout::DataLayout; + + let storage = Arc::new(InMemory::new()); + let engine = DefaultEngineBuilder::new(storage).build(); + let schema = Arc::new( + crate::schema::StructType::try_new(vec![ + crate::schema::StructField::new("id", crate::schema::DataType::INTEGER, false), + crate::schema::StructField::new("region", crate::schema::DataType::STRING, true), + ]) + .unwrap(), + ); + let mut builder = create_table("memory:///", schema, "test"); + if let Some(cols) = &clustering_cols { + builder = builder.with_data_layout(DataLayout::clustered(cols.clone())); + } + if let Some(mode) = column_mapping_mode { + builder = builder.with_table_properties([("delta.columnMapping.mode", mode)]); + } + let _ = builder + .build( + &engine, + Box::new(crate::committer::FileSystemCommitter::new()), + ) + .unwrap() + .commit(&engine) + .unwrap(); + let snapshot = Snapshot::builder_for("memory:///").build(&engine).unwrap(); + let result = snapshot.get_logical_clustering_columns(&engine).unwrap(); + assert_eq!(result, expected); + } } diff --git a/kernel/src/snapshot/builder.rs b/kernel/src/snapshot/builder.rs index 860ed443cc..96c64edf8b 100644 --- a/kernel/src/snapshot/builder.rs +++ b/kernel/src/snapshot/builder.rs @@ -1,11 +1,16 @@ //! Builder for creating [`Snapshot`] instances. +use std::sync::Arc; +use std::time::Instant; + +use tracing::{info, instrument}; + use crate::log_path::LogPath; use crate::log_segment::LogSegment; +use crate::metrics::{MetricEvent, MetricId, MetricsReporter}; use crate::snapshot::SnapshotRef; +use crate::utils::try_parse_uri; use crate::{DeltaResult, Engine, Error, Snapshot, Version}; -use url::Url; - /// Builder for creating [`Snapshot`] instances. /// /// # Example @@ -30,16 +35,16 @@ use url::Url; // types/add type state. #[derive(Debug)] pub struct SnapshotBuilder { - table_root: Option, + table_root: Option, existing_snapshot: Option, version: Option, log_tail: Vec, } impl SnapshotBuilder { - pub(crate) fn new_for(table_root: Url) -> Self { + pub(crate) fn new_for(table_root: impl AsRef) -> Self { Self { - table_root: Some(table_root), + table_root: Some(table_root.as_ref().to_string()), existing_snapshot: None, version: None, log_tail: Vec::new(), @@ -67,7 +72,6 @@ impl SnapshotBuilder { /// /// Note that the log tail must be a contiguous sequence of commits from M..=N where N is the /// latest version of the table and 0 <= M <= N. - #[cfg(feature = "catalog-managed")] pub fn with_log_tail(mut self, log_tail: Vec) -> Self { self.log_tail = log_tail; self @@ -77,27 +81,117 @@ impl SnapshotBuilder { /// returning a reference to an existing snapshot if the request to build a new snapshot /// matches the version of an existing snapshot. /// + /// Reports metrics: [`MetricEvent::SnapshotCompleted`] or [`MetricEvent::SnapshotFailed`]. + /// /// # Parameters /// /// - `engine`: Implementation of [`Engine`] apis. 
+ #[instrument( + name = "snap.build", + skip_all, + fields(path = %self.table_path()), + err + )] pub fn build(self, engine: &dyn Engine) -> DeltaResult { + info!( + target = self.target_version_str(), + from_version = ?self.existing_snapshot.as_ref().map(|s| s.version()), + log_tail_len = self.log_tail.len(), + "building snapshot" + ); + let log_tail = self.log_tail.into_iter().map(Into::into).collect(); - if let Some(table_root) = self.table_root { - let log_segment = LogSegment::for_snapshot( - engine.storage_handler().as_ref(), - table_root.join("_delta_log/")?, - log_tail, - self.version, - )?; - Ok(Snapshot::try_new_from_log_segment(table_root, log_segment, engine)?.into()) - } else { - let existing_snapshot = self.existing_snapshot.ok_or_else(|| { - Error::internal_error( - "SnapshotBuilder should have either table_root or existing_snapshot", + let operation_id = MetricId::new(); + let reporter = engine.get_metrics_reporter(); + let start = Instant::now(); + + let result = if let Some(table_root) = self.table_root { + try_parse_uri(table_root).and_then(|table_url| { + let log_segment = LogSegment::for_snapshot( + engine.storage_handler().as_ref(), + table_url.join("_delta_log/")?, + log_tail, + self.version, + reporter.as_ref(), + Some(operation_id), + )?; + Snapshot::try_new_from_log_segment_impl( + table_url, + log_segment, + engine, + operation_id, ) - })?; - Snapshot::try_new_from(existing_snapshot, log_tail, engine, self.version) + .map(Into::into) + }) + } else { + self.existing_snapshot + .ok_or_else(|| { + Error::internal_error( + "SnapshotBuilder should have either table_root or existing_snapshot", + ) + }) + .and_then(|existing_snapshot| { + Snapshot::try_new_from_impl( + existing_snapshot, + log_tail, + engine, + self.version, + operation_id, + ) + }) + }; + + Self::report_snapshot_build_result(result, start, operation_id, reporter.as_ref()) + } + + /// Emit [`MetricEvent::SnapshotCompleted`] or [`MetricEvent::SnapshotFailed`] based on the + /// result, measuring total duration from `start`. 
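// A sketch of a simple reporter in the spirit of the `CapturingReporter` used by the tests below,
// assuming the `MetricsReporter` trait exposes a `report(&self, MetricEvent)` method as called
// above; the type name is illustrative and the real trait may require more than this.
#[derive(Default)]
struct SketchVecReporter {
    events: std::sync::Mutex<Vec<MetricEvent>>,
}

impl MetricsReporter for SketchVecReporter {
    fn report(&self, event: MetricEvent) {
        // Record every event so callers can later assert on SnapshotCompleted / SnapshotFailed.
        self.events.lock().unwrap().push(event);
    }
}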
+ fn report_snapshot_build_result( + result: DeltaResult, + start: Instant, + operation_id: MetricId, + reporter: Option<&Arc>, + ) -> DeltaResult { + let snapshot_duration = start.elapsed(); + match &result { + Ok(snapshot) => { + reporter.inspect(|r| { + r.report(MetricEvent::SnapshotCompleted { + operation_id, + version: snapshot.version(), + total_duration: snapshot_duration, + }); + }); + } + Err(_) => { + reporter.inspect(|r| { + r.report(MetricEvent::SnapshotFailed { + operation_id, + duration: snapshot_duration, + }); + }); + } } + result + } + + // ===== Instrumentation Helpers ===== + + fn table_path(&self) -> &str { + self.table_root + .as_deref() + .or_else(|| { + self.existing_snapshot + .as_ref() + .map(|s| s.table_root().as_str()) + }) + .unwrap_or("unknown") + } + + fn target_version_str(&self) -> String { + self.version + .map(|v| v.to_string()) + .unwrap_or_else(|| "LATEST".into()) } } @@ -105,30 +199,32 @@ impl SnapshotBuilder { mod tests { use std::sync::Arc; - use crate::engine::default::{executor::tokio::TokioBackgroundExecutor, DefaultEngine}; - + use crate::engine::default::{ + executor::tokio::TokioBackgroundExecutor, DefaultEngine, DefaultEngineBuilder, + }; + use crate::metrics::MetricEvent; + use crate::object_store::memory::InMemory; + use crate::object_store::path::Path; + use crate::object_store::{DynObjectStore, ObjectStore as _}; + use crate::utils::test_utils::CapturingReporter; use itertools::Itertools; - use object_store::memory::InMemory; - use object_store::ObjectStore; use serde_json::json; use super::*; fn setup_test() -> ( Arc>, - Arc, - Url, + Arc, + String, ) { - let table_root = Url::parse("memory:///test_table").unwrap(); + let table_root = String::from("memory:///"); let store = Arc::new(InMemory::new()); - let engine = Arc::new(DefaultEngine::new( - store.clone(), - Arc::new(TokioBackgroundExecutor::new()), - )); + let engine = Arc::new(DefaultEngineBuilder::new(store.clone()).build()); (engine, store, table_root) } - fn create_table(store: &Arc, _table_root: &Url) -> DeltaResult<()> { + // TODO (#1990): update this function to properly store the table at table_root + async fn create_table(store: &Arc, _table_root: String) -> DeltaResult<()> { let protocol = json!({ "minReaderVersion": 3, "minWriterVersion": 7, @@ -165,8 +261,8 @@ mod tests { .collect_vec() .join("\n"); - let path = object_store::path::Path::from(format!("_delta_log/{:020}.json", 0).as_str()); - futures::executor::block_on(async { store.put(&path, commit0_data.into()).await })?; + let path = Path::from(format!("_delta_log/{:020}.json", 0).as_str()); + store.put(&path, commit0_data.into()).await?; // Create commit 1 with a single addFile action let commit1 = [json!({ @@ -188,17 +284,17 @@ mod tests { .collect_vec() .join("\n"); - let path = object_store::path::Path::from(format!("_delta_log/{:020}.json", 1).as_str()); - futures::executor::block_on(async { store.put(&path, commit1_data.into()).await })?; + let path = Path::from(format!("_delta_log/{:020}.json", 1).as_str()); + store.put(&path, commit1_data.into()).await?; Ok(()) } - #[test] - fn test_snapshot_builder() -> Result<(), Box> { + #[test_log::test(tokio::test)] + async fn test_snapshot_builder() -> Result<(), Box> { let (engine, store, table_root) = setup_test(); let engine = engine.as_ref(); - create_table(&store, &table_root)?; + create_table(&store, table_root.clone()).await?; let snapshot = SnapshotBuilder::new_for(table_root.clone()).build(engine)?; assert_eq!(snapshot.version(), 1); @@ -210,4 +306,248 @@ mod 
tests { Ok(()) } + + #[test_log::test(tokio::test)] + async fn test_snapshot_with_unsupported_type() -> Result<(), Box> { + let (engine, store, table_root) = setup_test(); + let engine = engine.as_ref(); + + // Create a table with an unsupported type in the schema + let protocol = json!({ + "minReaderVersion": 1, + "minWriterVersion": 2, + }); + + let metadata = json!({ + "id": "test-table-id", + "format": { + "provider": "parquet", + "options": {} + }, + "schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"interval_col\",\"type\":\"interval second\",\"nullable\":true,\"metadata\":{}}]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 1587968585495i64 + }); + + let commit0 = [ + json!({ + "protocol": protocol + }), + json!({ + "metaData": metadata + }), + ]; + + let commit0_data = commit0 + .iter() + .map(ToString::to_string) + .collect_vec() + .join("\n"); + + let path = Path::from("_delta_log/00000000000000000000.json"); + store.put(&path, commit0_data.into()).await?; + + // Try to build a snapshot and expect a clear error message + let result = SnapshotBuilder::new_for(table_root.clone()).build(engine); + assert!(result.is_err()); + + let err = result.unwrap_err(); + let err_msg = err.to_string(); + assert!( + err_msg.contains("Unsupported Delta table type: 'interval second'"), + "Expected clear error message about unsupported type, got: {err_msg}" + ); + + Ok(()) + } + + fn setup_test_with_reporter() -> ( + Arc>, + Arc, + String, + Arc, + ) { + let table_root = String::from("memory:///"); + let store: Arc = Arc::new(InMemory::new()); + let reporter = Arc::new(CapturingReporter::default()); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_metrics_reporter(reporter.clone()) + .build(), + ); + (engine, store, table_root, reporter) + } + + fn assert_has_event(reporter: &CapturingReporter, pred: fn(&MetricEvent) -> bool, msg: &str) { + let events = reporter.events(); + assert!(events.iter().any(pred), "{msg}"); + } + + fn assert_no_event(reporter: &CapturingReporter, pred: fn(&MetricEvent) -> bool, msg: &str) { + let events = reporter.events(); + assert!(!events.iter().any(pred), "{msg}"); + } + + fn is_snapshot_completed(e: &MetricEvent) -> bool { + matches!(e, MetricEvent::SnapshotCompleted { .. }) + } + + fn is_snapshot_failed(e: &MetricEvent) -> bool { + matches!(e, MetricEvent::SnapshotFailed { .. 
}) + } + + #[test_log::test(tokio::test)] + async fn snapshot_failed_emits_metric_on_error() { + let (engine, store, table_root, reporter) = setup_test_with_reporter(); + + // Write a commit with an unsupported schema type to force a build failure + let commit0_data = [ + json!({"protocol": {"minReaderVersion": 1, "minWriterVersion": 2}}), + json!({"metaData": { + "id": "test-table-id", + "format": {"provider": "parquet", "options": {}}, + "schemaString": "{\"type\":\"struct\",\"fields\":[{\"name\":\"x\",\"type\":\"interval second\",\"nullable\":true,\"metadata\":{}}]}", + "partitionColumns": [], + "configuration": {}, + "createdTime": 1587968585495i64 + }}), + ] + .iter() + .map(ToString::to_string) + .collect_vec() + .join("\n"); + + let path = Path::from("_delta_log/00000000000000000000.json"); + store.put(&path, commit0_data.into()).await.unwrap(); + + let result = SnapshotBuilder::new_for(table_root).build(engine.as_ref()); + assert!(result.is_err()); + + assert_has_event( + &reporter, + is_snapshot_failed, + "expected a SnapshotFailed event", + ); + assert_no_event( + &reporter, + is_snapshot_completed, + "should not emit SnapshotCompleted on failure", + ); + } + + #[test_log::test(tokio::test)] + async fn snapshot_update_from_existing_emits_metric() { + let (engine, store, table_root, reporter) = setup_test_with_reporter(); + create_table(&store, table_root.clone()).await.unwrap(); + + // Build an initial snapshot at version 0 + let base = SnapshotBuilder::new_for(table_root) + .at_version(0) + .build(engine.as_ref()) + .unwrap(); + + // Clear events from the initial build + reporter.clear(); + + // Incrementally update to the latest version via the else branch + let updated = SnapshotBuilder::new_from(base) + .build(engine.as_ref()) + .unwrap(); + assert_eq!(updated.version(), 1); + + let events = reporter.events(); + let snapshot_completed = events.iter().find_map(|e| match e { + MetricEvent::SnapshotCompleted { + version, + total_duration, + .. 
+ } => Some((*version, *total_duration)), + _ => None, + }); + + let (version, duration) = snapshot_completed.expect("expected SnapshotCompleted event"); + assert_eq!(version, 1); + assert!( + !duration.is_zero(), + "SnapshotCompleted.total_duration should be non-zero" + ); + } + + #[test_log::test(tokio::test)] + async fn snapshot_update_to_earlier_version_emits_failed_metric() { + let (engine, store, table_root, reporter) = setup_test_with_reporter(); + create_table(&store, table_root.clone()).await.unwrap(); + + // Build a snapshot at version 1 + let base = SnapshotBuilder::new_for(table_root) + .build(engine.as_ref()) + .unwrap(); + assert_eq!(base.version(), 1); + + // Clear events from the initial build + reporter.clear(); + + // Attempt to update to version 0 (earlier than base version 1) + let result = SnapshotBuilder::new_from(base) + .at_version(0) + .build(engine.as_ref()); + assert!(result.is_err()); + + assert_has_event( + &reporter, + is_snapshot_failed, + "expected a SnapshotFailed event when updating to an earlier version", + ); + assert_no_event( + &reporter, + is_snapshot_completed, + "should not emit SnapshotCompleted on failure", + ); + } + + #[test_log::test(tokio::test)] + async fn snapshot_completed_duration_includes_log_segment_loading() { + let (engine, store, table_root, reporter) = setup_test_with_reporter(); + create_table(&store, table_root.clone()).await.unwrap(); + + let _snapshot = SnapshotBuilder::new_for(table_root) + .build(engine.as_ref()) + .unwrap(); + + assert_has_event( + &reporter, + is_snapshot_completed, + "expected a SnapshotCompleted event", + ); + + let events = reporter.events(); + + let log_segment_duration = events + .iter() + .find_map(|e| match e { + MetricEvent::LogSegmentLoaded { duration, .. } => Some(*duration), + _ => None, + }) + .expect("expected LogSegmentLoaded event"); + let snapshot_duration = events + .iter() + .find_map(|e| match e { + MetricEvent::SnapshotCompleted { total_duration, .. 
} => Some(*total_duration), + _ => None, + }) + .expect("expected SnapshotCompleted event"); + + assert!( + snapshot_duration >= log_segment_duration, + "SnapshotCompleted.total_duration ({snapshot_duration:?}) should be >= \ + LogSegmentLoaded.duration ({log_segment_duration:?})" + ); + + let snapshot_completed_count = events.iter().filter(|e| is_snapshot_completed(e)).count(); + assert_eq!( + snapshot_completed_count, 1, + "expected exactly one SnapshotCompleted event" + ); + } } diff --git a/kernel/src/table_changes/log_replay.rs b/kernel/src/table_changes/log_replay.rs index 63f7878106..6b27e118e9 100644 --- a/kernel/src/table_changes/log_replay.rs +++ b/kernel/src/table_changes/log_replay.rs @@ -5,27 +5,29 @@ use std::collections::{HashMap, HashSet}; use std::slice; use std::sync::{Arc, LazyLock}; -use crate::actions::visitors::{visit_deletion_vector_at, visit_protocol_at}; +use crate::actions::visitors::visit_deletion_vector_at; +use crate::actions::visitors::InCommitTimestampVisitor; use crate::actions::{ - get_log_add_schema, Add, Cdc, Metadata, Protocol, Remove, ADD_NAME, CDC_NAME, METADATA_NAME, - PROTOCOL_NAME, REMOVE_NAME, + get_log_add_schema, Add, Cdc, Metadata, Protocol, Remove, ADD_NAME, CDC_NAME, COMMIT_INFO_NAME, + METADATA_NAME, PROTOCOL_NAME, REMOVE_NAME, }; use crate::engine_data::{GetData, TypedGetData}; -use crate::expressions::{column_name, ColumnName}; -use crate::path::ParsedLogPath; +use crate::expressions::{column_expr, column_expr_ref, column_name, ColumnName, Expression}; +use crate::path::{AsUrl, ParsedLogPath}; +use crate::scan::data_skipping::stats_schema::build_stats_schema; use crate::scan::data_skipping::DataSkippingFilter; use crate::scan::state::DvInfo; use crate::schema::{ - ArrayType, ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType, - ToSchema as _, + ColumnNamesAndTypes, DataType, SchemaRef, StructField, StructType, ToSchema as _, }; use crate::table_changes::scan_file::{cdf_scan_row_expression, cdf_scan_row_schema}; -use crate::table_changes::{check_cdf_table_properties, ensure_cdf_read_supported}; -use crate::table_properties::TableProperties; +use crate::table_configuration::TableConfiguration; +use crate::table_features::{format_features, Operation, TableFeature}; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, PredicateRef, RowVisitor}; use itertools::Itertools; +use tracing::info; #[cfg(test)] mod tests; @@ -52,15 +54,45 @@ pub(crate) struct TableChangesScanMetadata { /// (JSON) commit files. pub(crate) fn table_changes_action_iter( engine: Arc, + start_table_configuration: &TableConfiguration, commit_files: impl IntoIterator, table_schema: SchemaRef, physical_predicate: Option<(PredicateRef, SchemaRef)>, ) -> DeltaResult>> { - let filter = DataSkippingFilter::new(engine.as_ref(), physical_predicate).map(Arc::new); + let filter = physical_predicate + .and_then(|(predicate, ref_schema)| { + let stats_schema = build_stats_schema(&ref_schema)?; + + // Parse JSON stats from the raw action batch's `add.stats` column. Unlike the scan + // path (which transforms first and reads pre-parsed stats), table_changes must + // resolve deletion vector pairs before filtering, so it operates on raw batches. 
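// For context, a sketch of the `add.stats` JSON payload that the expression built below parses
// (illustrative field values; the real stats schema is derived from the referenced read schema
// via build_stats_schema):
fn sketch_add_stats() -> serde_json::Value {
    serde_json::json!({
        "numRecords": 3,
        "minValues": {"id": 1},
        "maxValues": {"id": 7},
        "nullCount": {"id": 0}
    })
}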
+ let stats_expr = Arc::new(Expression::parse_json( + column_expr!("add.stats"), + stats_schema.clone(), + )); + DataSkippingFilter::new( + engine.as_ref(), + Some(predicate), + Some(&stats_schema), + stats_expr, + None, // no partition columns for table changes (partition_expr unused) + column_expr_ref!("partitionValues_parsed"), + get_log_add_schema().clone(), + None, // Table changes doesn't use metrics yet + ) + }) + .map(Arc::new); + + let mut current_configuration = start_table_configuration.clone(); let result = commit_files .into_iter() .map(move |commit_file| -> DeltaResult<_> { - let scanner = LogReplayScanner::try_new(engine.as_ref(), commit_file, &table_schema)?; + let scanner = LogReplayScanner::try_new( + engine.as_ref(), + &mut current_configuration, + commit_file, + &table_schema, + )?; scanner.into_scan_batches(engine.clone(), filter.clone()) }) //Iterator-Result-Iterator-Result .flatten_ok() // Iterator-Result-Result @@ -144,6 +176,7 @@ impl LogReplayScanner { /// For more details, see the documentation for [`LogReplayScanner`]. fn try_new( engine: &dyn Engine, + table_configuration: &mut TableConfiguration, commit_file: ParsedLogPath, table_schema: &SchemaRef, ) -> DeltaResult { @@ -158,15 +191,26 @@ impl LogReplayScanner { // As a result, we would read the file path for the remove action, which is unnecessary because // all of the rows will be filtered by the predicate. Instead, we wait until deletion // vectors are resolved so that we can skip both actions in the pair. - let action_iter = engine.json_handler().read_json_files( - slice::from_ref(&commit_file.location), - visitor_schema, - None, // not safe to apply data skipping yet - )?; + let mut action_iter = engine + .json_handler() + .read_json_files( + slice::from_ref(&commit_file.location), + visitor_schema, + None, // not safe to apply data skipping yet + )? + .peekable(); + + let mut in_commit_timestamp_opt = None; + if let Some(Ok(actions)) = action_iter.peek() { + let mut visitor = InCommitTimestampVisitor::default(); + visitor.visit_rows_of(actions.as_ref())?; + in_commit_timestamp_opt = visitor.in_commit_timestamp; + } let mut remove_dvs = HashMap::default(); let mut add_paths = HashSet::default(); let mut has_cdc_action = false; + for actions in action_iter { let actions = actions?; @@ -174,17 +218,16 @@ impl LogReplayScanner { add_paths: &mut add_paths, remove_dvs: &mut remove_dvs, has_cdc_action: &mut has_cdc_action, - protocol: None, - metadata_info: None, }; visitor.visit_rows_of(actions.as_ref())?; - if let Some(protocol) = visitor.protocol { - ensure_cdf_read_supported(&protocol) - .map_err(|_| Error::change_data_feed_unsupported(commit_file.version))?; - } - if let Some((schema, configuration)) = visitor.metadata_info { - let schema: StructType = serde_json::from_str(&schema)?; + let metadata_opt = Metadata::try_new_from_data(actions.as_ref())?; + let has_metadata_update = metadata_opt.is_some(); + let protocol_opt = Protocol::try_new_from_data(actions.as_ref())?; + let has_protocol_update = protocol_opt.is_some(); + + if let Some(ref metadata) = metadata_opt { + let schema = metadata.parse_schema()?; // Currently, schema compatibility is defined as having equal schema types. In the // future, more permisive schema evolution will be supported. 
// See: https://github.com/delta-io/delta-kernel-rs/issues/523 @@ -192,8 +235,48 @@ impl LogReplayScanner { table_schema.as_ref() == &schema, Error::change_data_feed_incompatible_schema(table_schema, &schema) ); - let table_properties = TableProperties::from(configuration); - check_cdf_table_properties(&table_properties) + } + + // Update table configuration with any new Protocol or Metadata from this commit + if has_metadata_update || has_protocol_update { + *table_configuration = TableConfiguration::try_new_from( + table_configuration, + metadata_opt, + protocol_opt, + commit_file.version, + )?; + + let writer_features_str = table_configuration + .protocol() + .writer_features() + .map(format_features) + .unwrap_or_else(|| "[]".to_string()); + + info!( + version = commit_file.version, + id = table_configuration.metadata().id(), + // Writer features are always a superset of reader features, so we log writer features to trace the full set of table features. + writerFeatures = %writer_features_str, + minReaderVersion = table_configuration.protocol().min_reader_version(), + minWriterVersion = table_configuration.protocol().min_writer_version(), + schemaString = %table_configuration.metadata().schema_string(), + configuration = ?table_configuration.metadata().configuration(), + "Table configuration updated during CDF query" + ); + } + + // If metadata is updated, check if Change Data Feed is enabled + if has_metadata_update { + require!( + table_configuration.is_feature_enabled(&TableFeature::ChangeDataFeed), + Error::change_data_feed_unsupported(commit_file.version) + ); + } + + // If protocol is updated, check if Change Data Feed is supported + if has_protocol_update { + table_configuration + .ensure_operation_supported(Operation::Cdf) .map_err(|_| Error::change_data_feed_unsupported(commit_file.version))?; } } @@ -205,8 +288,33 @@ impl LogReplayScanner { // same as an `add` action. remove_dvs.retain(|rm_path, _| add_paths.contains(rm_path)); } + + // If ICT is enabled, then set the timestamp to be the ICT; otherwise, default to the last_modified timestamp value + let timestamp = if table_configuration.is_feature_enabled(&TableFeature::InCommitTimestamp) + { + let Some(in_commit_timestamp) = in_commit_timestamp_opt else { + return Err(Error::generic(format!( + "In-commit timestamp is enabled but not found in commit at version {}", + commit_file.version + ))); + }; + in_commit_timestamp + } else { + commit_file.location.last_modified + }; + + info!( + version = commit_file.version, + id = table_configuration.metadata().id(), + remove_dvs_size = remove_dvs.len(), + has_cdc_action = has_cdc_action, + file_path = %commit_file.location.as_url(), + timestamp = timestamp, + "Phase 1 of CDF query processing completed" + ); + Ok(LogReplayScanner { - timestamp: commit_file.location.last_modified, + timestamp, commit_file, has_cdc_action, remove_dvs, @@ -243,7 +351,7 @@ impl LogReplayScanner { get_log_add_schema().clone(), Arc::new(cdf_scan_row_expression(timestamp, commit_version)), cdf_scan_row_schema().into(), - ); + )?; let result = action_iter.map(move |actions| -> DeltaResult<_> { let actions = actions?; @@ -273,8 +381,6 @@ impl LogReplayScanner { // This is a visitor used in the prepare phase of [`LogReplayScanner`]. See // [`LogReplayScanner::try_new`] for details usage. 
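// A condensed restatement of the timestamp rule implemented just above, as a pure helper
// (illustrative; the kernel inlines this logic in `LogReplayScanner::try_new`):
fn cdf_commit_timestamp(
    ict_enabled: bool,
    in_commit_timestamp: Option<i64>,
    file_last_modified: i64,
    version: u64,
) -> Result<i64, String> {
    if ict_enabled {
        // ICT tables must carry the timestamp in commitInfo; a missing value is an error.
        in_commit_timestamp.ok_or_else(|| {
            format!("In-commit timestamp is enabled but not found in commit at version {version}")
        })
    } else {
        // Otherwise CDF falls back to the commit file's modification time.
        Ok(file_last_modified)
    }
}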
struct PreparePhaseVisitor<'a> { - protocol: Option, - metadata_info: Option<(String, HashMap)>, has_cdc_action: &'a mut bool, add_paths: &'a mut HashSet, remove_dvs: &'a mut HashMap, @@ -287,6 +393,14 @@ impl PreparePhaseVisitor<'_> { StructField::nullable(CDC_NAME, Cdc::to_schema()), StructField::nullable(METADATA_NAME, Metadata::to_schema()), StructField::nullable(PROTOCOL_NAME, Protocol::to_schema()), + StructField::nullable( + COMMIT_INFO_NAME, + StructType::new_unchecked([StructField::new( + "inCommitTimestamp", + DataType::LONG, + true, + )]), + ), ])) } } @@ -299,8 +413,6 @@ impl RowVisitor for PreparePhaseVisitor<'_> { const INTEGER: DataType = DataType::INTEGER; const LONG: DataType = DataType::LONG; const BOOLEAN: DataType = DataType::BOOLEAN; - let string_list: DataType = ArrayType::new(STRING, false).into(); - let string_string_map = MapType::new(STRING, STRING, false).into(); let types_and_names = vec![ (STRING, column_name!("add.path")), (BOOLEAN, column_name!("add.dataChange")), @@ -312,12 +424,7 @@ impl RowVisitor for PreparePhaseVisitor<'_> { (INTEGER, column_name!("remove.deletionVector.sizeInBytes")), (LONG, column_name!("remove.deletionVector.cardinality")), (STRING, column_name!("cdc.path")), - (STRING, column_name!("metaData.schemaString")), - (string_string_map, column_name!("metaData.configuration")), - (INTEGER, column_name!("protocol.minReaderVersion")), - (INTEGER, column_name!("protocol.minWriterVersion")), - (string_list.clone(), column_name!("protocol.readerFeatures")), - (string_list, column_name!("protocol.writerFeatures")), + (LONG, column_name!("commitInfo.inCommitTimestamp")), ]; let (types, names) = types_and_names.into_iter().unzip(); (names, types).into() @@ -327,7 +434,7 @@ impl RowVisitor for PreparePhaseVisitor<'_> { fn visit<'b>(&mut self, row_count: usize, getters: &[&'b dyn GetData<'b>]) -> DeltaResult<()> { require!( - getters.len() == 16, + getters.len() == 11, Error::InternalError(format!( "Wrong number of PreparePhaseVisitor getters: {}", getters.len() @@ -348,12 +455,6 @@ impl RowVisitor for PreparePhaseVisitor<'_> { } } else if getters[9].get_str(i, "cdc.path")?.is_some() { *self.has_cdc_action = true; - } else if let Some(schema) = getters[10].get_str(i, "metaData.schemaString")? { - let configuration_map_opt = getters[11].get_opt(i, "metadata.configuration")?; - let configuration = configuration_map_opt.unwrap_or_else(HashMap::new); - self.metadata_info = Some((schema.to_string(), configuration)); - } else if let Some(protocol) = visit_protocol_at(i, &getters[12..])? 
{ - self.protocol = Some(protocol); } } Ok(()) diff --git a/kernel/src/table_changes/log_replay/tests.rs b/kernel/src/table_changes/log_replay/tests.rs index f1278504a1..4b651fa8cc 100644 --- a/kernel/src/table_changes/log_replay/tests.rs +++ b/kernel/src/table_changes/log_replay/tests.rs @@ -1,30 +1,120 @@ use super::table_changes_action_iter; use super::TableChangesScanMetadata; use crate::actions::deletion_vector::{DeletionVectorDescriptor, DeletionVectorStorageType}; -use crate::actions::{Add, Cdc, Metadata, Protocol, Remove}; +use crate::actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}; use crate::engine::sync::SyncEngine; use crate::expressions::{column_expr, BinaryPredicateOp, Scalar}; use crate::log_segment::LogSegment; use crate::path::ParsedLogPath; use crate::scan::state::DvInfo; use crate::scan::PhysicalPredicate; -use crate::schema::{DataType, StructField, StructType}; +use crate::schema::{DataType, SchemaRef, StructField, StructType}; use crate::table_changes::log_replay::LogReplayScanner; -use crate::table_features::ReaderFeature; +use crate::table_configuration::TableConfiguration; +use crate::table_features::{ColumnMappingMode, TableFeature}; use crate::utils::test_utils::{assert_result_error_with_message, Action, LocalMockTable}; use crate::Predicate; use crate::{DeltaResult, Engine, Error, Version}; +use test_utils::LoggingTest; use itertools::Itertools; use std::collections::HashMap; use std::path::Path; use std::sync::Arc; -fn get_schema() -> StructType { - StructType::new_unchecked([ +fn get_schema() -> SchemaRef { + Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::INTEGER), StructField::nullable("value", DataType::STRING), - ]) + ])) +} + +fn get_default_table_config(table_root: &url::Url) -> TableConfiguration { + let metadata = Metadata::try_new( + None, + None, + get_schema(), + vec![], + 0, + HashMap::from([ + ("delta.enableChangeDataFeed".to_string(), "true".to_string()), + ("delta.columnMapping.mode".to_string(), "none".to_string()), + ]), + ) + .unwrap(); + // CDF requires min_writer_version = 4 + let protocol = Protocol::try_new_legacy(1, 4).unwrap(); + TableConfiguration::try_new(metadata, protocol, table_root.clone(), 0).unwrap() +} + +/// Helper to create a Metadata action with the given schema and configuration +fn metadata_action(schema: SchemaRef, configuration: HashMap) -> Action { + Action::Metadata( + Metadata::try_new(None, None, schema.clone(), vec![], 0, configuration).unwrap(), + ) +} + +/// Helper to create a Metadata action with CDF enabled +fn metadata_with_cdf(schema: SchemaRef) -> Action { + metadata_action( + schema, + HashMap::from([("delta.enableChangeDataFeed".to_string(), "true".to_string())]), + ) +} + +/// Helper to create a Protocol action +fn protocol_action( + min_reader: i32, + min_writer: i32, + reader_features: Option>, + writer_features: Option>, +) -> Action { + Action::Protocol( + Protocol::try_new(min_reader, min_writer, reader_features, writer_features).unwrap(), + ) +} + +/// Helper to execute table_changes_action_iter for a specific version range +fn execute_table_changes( + engine: Arc, + mock_table: &LocalMockTable, + start_version: Version, + end_version: Option, +) -> DeltaResult> { + let commits = get_segment( + engine.as_ref(), + mock_table.table_root(), + start_version, + end_version, + )? 
+ .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + table_changes_action_iter(engine, &table_config, commits, get_schema(), None)?.try_collect() +} + +/// Helper to assert midstream failure pattern: +/// - Reading v0 alone succeeds +/// - Reading v0-v1 fails with ChangeDataFeedUnsupported +/// - Reading v1 alone fails with ChangeDataFeedUnsupported +fn assert_midstream_failure(engine: Arc, mock_table: &LocalMockTable) { + // Reading just the first commit (0 to 0) should succeed + let res_v0 = execute_table_changes(engine.clone(), mock_table, 0, Some(0)); + assert!(res_v0.is_ok(), "Reading version 0 alone should succeed"); + + // Reading commits 0-1 should fail + let res_v0_v1 = execute_table_changes(engine.clone(), mock_table, 0, Some(1)); + assert!( + matches!(res_v0_v1, Err(Error::ChangeDataFeedUnsupported(_))), + "Reading versions 0-1 should fail" + ); + + // Reading just commit 1 should also fail + let res_v1 = execute_table_changes(engine, mock_table, 1, Some(1)); + assert!( + matches!(res_v1, Err(Error::ChangeDataFeedUnsupported(_))), + "Reading version 1 alone should fail" + ); } fn get_segment( @@ -41,7 +131,7 @@ fn get_segment( start_version, end_version, )?; - Ok(log_segment.ascending_commit_files) + Ok(log_segment.listed.ascending_commit_files) } fn result_to_sv(iter: impl Iterator>) -> Vec { @@ -76,11 +166,9 @@ async fn metadata_protocol() { .unwrap(), ), Action::Protocol( - Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors]), - Some([ReaderFeature::ColumnMapping]), + Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors, TableFeature::ChangeDataFeed], ) .unwrap(), ), @@ -91,8 +179,10 @@ async fn metadata_protocol() { .unwrap() .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); let scan_batches = - table_changes_action_iter(engine, commits, get_schema().into(), None).unwrap(); + table_changes_action_iter(engine, &table_config, commits, get_schema(), None).unwrap(); let sv = result_to_sv(scan_batches); assert_eq!(sv, &[false, false]); } @@ -100,20 +190,10 @@ async fn metadata_protocol() { async fn cdf_not_enabled() { let engine = Arc::new(SyncEngine::new()); let mut mock_table = LocalMockTable::new(); + // Commit metadata without CDF property to test that CDF is rejected mock_table .commit([Action::Metadata( - Metadata::try_new( - None, - None, - get_schema(), - vec![], - 0, - HashMap::from([( - "delta.enableDeletionVectors".to_string(), - "true".to_string(), - )]), - ) - .unwrap(), + Metadata::try_new(None, None, get_schema(), vec![], 0, HashMap::new()).unwrap(), )]) .await; @@ -121,8 +201,10 @@ async fn cdf_not_enabled() { .unwrap() .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); let res: DeltaResult> = - table_changes_action_iter(engine, commits, get_schema().into(), None) + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .try_collect(); @@ -135,11 +217,16 @@ async fn unsupported_reader_feature() { let mut mock_table = LocalMockTable::new(); mock_table .commit([Action::Protocol( - Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors, ReaderFeature::ColumnMapping]), - Some([""; 0]), + Protocol::try_new_modern( + [ + 
TableFeature::DeletionVectors, + TableFeature::unknown("unsupportedReaderFeature"), + ], + [ + TableFeature::DeletionVectors, + TableFeature::ChangeDataFeed, + TableFeature::unknown("unsupportedReaderFeature"), + ], ) .unwrap(), )]) @@ -149,54 +236,147 @@ async fn unsupported_reader_feature() { .unwrap() .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); let res: DeltaResult> = - table_changes_action_iter(engine, commits, get_schema().into(), None) + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .try_collect(); assert!(matches!(res, Err(Error::ChangeDataFeedUnsupported(_)))); } + #[tokio::test] -async fn column_mapping_should_fail() { +async fn column_mapping_should_succeed() { + use crate::schema::{ColumnMetadataKey, MetadataValue}; + + fn cm_field(name: &str, data_type: DataType, id: i64) -> StructField { + StructField::nullable(name, data_type).with_metadata(HashMap::from([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref().to_string(), + MetadataValue::Number(id), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName + .as_ref() + .to_string(), + MetadataValue::String(name.to_string()), + ), + ])) + } + + let cm_schema = Arc::new(StructType::new_unchecked([ + cm_field("id", DataType::INTEGER, 1), + cm_field("value", DataType::STRING, 2), + ])); + let engine = Arc::new(SyncEngine::new()); let mut mock_table = LocalMockTable::new(); mock_table - .commit([Action::Metadata( - Metadata::try_new( - None, - None, - get_schema(), - vec![], - 0, - HashMap::from([ - ( - "delta.enableDeletionVectors".to_string(), - "true".to_string(), - ), - ("delta.enableChangeDataFeed".to_string(), "true".to_string()), - ("delta.columnMapping.mode".to_string(), "id".to_string()), - ]), - ) - .unwrap(), - )]) + .commit([ + Action::Protocol( + Protocol::try_new_modern( + [TableFeature::DeletionVectors, TableFeature::ColumnMapping], + [ + TableFeature::DeletionVectors, + TableFeature::ColumnMapping, + TableFeature::ChangeDataFeed, + ], + ) + .unwrap(), + ), + Action::Metadata( + Metadata::try_new( + None, + None, + cm_schema.clone(), + vec![], + 0, + HashMap::from([ + ( + "delta.enableDeletionVectors".to_string(), + "true".to_string(), + ), + ("delta.enableChangeDataFeed".to_string(), "true".to_string()), + ("delta.columnMapping.mode".to_string(), "id".to_string()), + ]), + ) + .unwrap(), + ), + ]) .await; let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) .unwrap() .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); let res: DeltaResult> = - table_changes_action_iter(engine, commits, get_schema().into(), None) + table_changes_action_iter(engine, &table_config, commits, cm_schema, None) .unwrap() .try_collect(); - assert!(matches!(res, Err(Error::ChangeDataFeedUnsupported(_)))); + // Column mapping with CDF should now succeed + assert!(res.is_ok(), "CDF should now support column mapping"); +} + +// Test that CDF fails when disabled mid-stream +#[tokio::test] +async fn cdf_disabled_midstream() { + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + // First commit: CDF enabled + mock_table.commit([metadata_with_cdf(get_schema())]).await; + + // Second commit: CDF disabled + mock_table + .commit([metadata_action( + get_schema(), + HashMap::from([( + 
"delta.enableChangeDataFeed".to_string(), + "false".to_string(), + )]), + )]) + .await; + + assert_midstream_failure(engine, &mock_table); +} + +// Test that unsupported protocol features added mid-stream are rejected +#[tokio::test] +async fn unsupported_protocol_feature_midstream() { + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + // First commit: Basic protocol with CDF enabled + mock_table + .commit([ + protocol_action(2, 6, None, None), + metadata_with_cdf(get_schema()), + ]) + .await; + + // Second commit: Protocol update with unsupported feature + mock_table + .commit([protocol_action( + 3, + 7, + Some(vec![TableFeature::unknown("unsupportedFeature")]), + Some(vec![ + TableFeature::unknown("unsupportedFeature"), + TableFeature::ChangeDataFeed, + ]), + )]) + .await; + + assert_midstream_failure(engine, &mock_table); } -// Note: This should be removed once type widening support is added for CDF #[tokio::test] async fn incompatible_schemas_fail() { - async fn assert_incompatible_schema(commit_schema: StructType, cdf_schema: StructType) { + async fn assert_incompatible_schema(commit_schema: SchemaRef, cdf_schema: SchemaRef) { let engine = Arc::new(SyncEngine::new()); let mut mock_table = LocalMockTable::new(); @@ -218,8 +398,10 @@ async fn incompatible_schemas_fail() { .unwrap() .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); let res: DeltaResult> = - table_changes_action_iter(engine, commits, cdf_schema.into(), None) + table_changes_action_iter(engine, &table_config, commits, cdf_schema, None) .unwrap() .try_collect(); @@ -231,60 +413,166 @@ async fn incompatible_schemas_fail() { // The CDF schema has fields: `id: int` and `value: string`. // This commit has schema with fields: `id: long`, `value: string` and `year: int` (nullable). - let schema = StructType::new_unchecked([ + let schema = Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::LONG), StructField::nullable("value", DataType::STRING), StructField::nullable("year", DataType::INTEGER), - ]); + ])); assert_incompatible_schema(schema, get_schema()).await; // The CDF schema has fields: `id: int` and `value: string`. // This commit has schema with fields: `id: long` and `value: string`. - let schema = StructType::new_unchecked([ + let schema = Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::LONG), StructField::nullable("value", DataType::STRING), - ]); + ])); assert_incompatible_schema(schema, get_schema()).await; // NOTE: Once type widening is supported, this should not return an error. // // The CDF schema has fields: `id: long` and `value: string`. // This commit has schema with fields: `id: int` and `value: string`. - let cdf_schema = StructType::new_unchecked([ + let cdf_schema = Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::LONG), StructField::nullable("value", DataType::STRING), - ]); - let commit_schema = StructType::new_unchecked([ + ])); + let commit_schema = Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::INTEGER), StructField::nullable("value", DataType::STRING), - ]); + ])); assert_incompatible_schema(cdf_schema, commit_schema).await; // Note: Once schema evolution is supported, this should not return an error. // // The CDF schema has fields: nullable `id` and nullable `value`. 
// This commit has schema with fields: non-nullable `id` and nullable `value`. - let schema = StructType::new_unchecked([ + let schema = Arc::new(StructType::new_unchecked([ StructField::not_null("id", DataType::LONG), StructField::nullable("value", DataType::STRING), - ]); + ])); assert_incompatible_schema(schema, get_schema()).await; // The CDF schema has fields: `id: int` and `value: string`. // This commit has schema with fields:`id: string` and `value: string`. - let schema = StructType::new_unchecked([ + let schema = Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::STRING), StructField::nullable("value", DataType::STRING), - ]); + ])); assert_incompatible_schema(schema, get_schema()).await; // Note: Once schema evolution is supported, this should not return an error. // The CDF schema has fields: `id` (nullable) and `value` (nullable). // This commit has schema with fields: `id` (nullable). - let schema = get_schema().project_as_struct(&["id"]).unwrap(); + let schema = Arc::new(get_schema().project_as_struct(&["id"]).unwrap()); assert_incompatible_schema(schema, get_schema()).await; } +// Helper function to test schema evolution scenarios. +// Returns an error if schema evolution fails (which is expected currently). +async fn test_schema_evolution( + initial_schema: SchemaRef, + evolved_schema: SchemaRef, +) -> DeltaResult> { + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + // Create initial commit with initial schema + mock_table + .commit([ + metadata_with_cdf(initial_schema.clone()), + protocol_action(1, 1, None, None), + ]) + .await; + + // Add some data with initial schema + mock_table + .commit([Action::Add(Add { + path: "file1.parquet".into(), + data_change: true, + ..Default::default() + })]) + .await; + + // Evolve the schema + mock_table + .commit([metadata_with_cdf(evolved_schema.clone())]) + .await; + + // Add data with evolved schema + mock_table + .commit([Action::Add(Add { + path: "file2.parquet".into(), + data_change: true, + ..Default::default() + })]) + .await; + + let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None)?.into_iter(); + + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + + // Try to read CDF using the evolved schema - this currently fails + table_changes_action_iter(engine, &table_config, commits, evolved_schema, None)?.try_collect() +} + +// This test demonstrates various schema evolution scenarios that currently fail +// but could be supported in the future. 
See: https://github.com/delta-io/delta-kernel-rs/issues/523 +#[tokio::test] +async fn demonstration_schema_evolution_failures() { + // Scenario 1: Adding a nullable column (safe evolution) + // Initial: {id: int, value: string} + // Evolved: {id: int, value: string, new_col: int?} + let initial = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])); + let evolved = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + StructField::nullable("new_col", DataType::INTEGER), + ])); + let res = test_schema_evolution(initial, evolved).await; + assert!( + matches!(res, Err(Error::ChangeDataFeedIncompatibleSchema(_, _))), + "Expected ChangeDataFeedIncompatibleSchema error for adding nullable column" + ); + + // Scenario 2: Type widening (int -> long) - supported by type widening feature + // Initial: {id: int, value: string} + // Evolved: {id: long, value: string} + let initial = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])); + let evolved = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", DataType::LONG), + StructField::nullable("value", DataType::STRING), + ])); + let res = test_schema_evolution(initial, evolved).await; + assert!( + matches!(res, Err(Error::ChangeDataFeedIncompatibleSchema(_, _))), + "Expected ChangeDataFeedIncompatibleSchema error for type widening" + ); + + // Scenario 3: Changing nullability from non-null to nullable (safe evolution) + // Initial: {id: int!, value: string} + // Evolved: {id: int?, value: string} + let initial = Arc::new(StructType::new_unchecked([ + StructField::not_null("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])); + let evolved = Arc::new(StructType::new_unchecked([ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])); + let res = test_schema_evolution(initial, evolved).await; + assert!( + matches!(res, Err(Error::ChangeDataFeedIncompatibleSchema(_, _))), + "Expected ChangeDataFeedIncompatibleSchema error for nullability change" + ); +} + #[tokio::test] async fn add_remove() { let engine = Arc::new(SyncEngine::new()); @@ -308,7 +596,9 @@ async fn add_remove() { .unwrap() .into_iter(); - let sv = table_changes_action_iter(engine, commits, get_schema().into(), None) + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + let sv = table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .flat_map(|scan_metadata| { let scan_metadata = scan_metadata.unwrap(); @@ -358,7 +648,9 @@ async fn filter_data_change() { .unwrap() .into_iter(); - let sv = table_changes_action_iter(engine, commits, get_schema().into(), None) + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + let sv = table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .flat_map(|scan_metadata| { let scan_metadata = scan_metadata.unwrap(); @@ -404,7 +696,9 @@ async fn cdc_selection() { .unwrap() .into_iter(); - let sv = table_changes_action_iter(engine, commits, get_schema().into(), None) + let table_root_url = 
url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + let sv = table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .flat_map(|scan_metadata| { let scan_metadata = scan_metadata.unwrap(); @@ -470,7 +764,9 @@ async fn dv() { }, )]) .into(); - let sv = table_changes_action_iter(engine, commits, get_schema().into(), None) + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + let sv = table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .flat_map(|scan_metadata| { let scan_metadata = scan_metadata.unwrap(); @@ -539,15 +835,18 @@ async fn data_skipping_filter() { Scalar::from(4), ); let logical_schema = get_schema(); - let predicate = match PhysicalPredicate::try_new(&predicate, &logical_schema) { - Ok(PhysicalPredicate::Some(p, s)) => Some((p, s)), - other => panic!("Unexpected result: {other:?}"), - }; + let predicate = + match PhysicalPredicate::try_new(&predicate, &logical_schema, ColumnMappingMode::None) { + Ok(PhysicalPredicate::Some(p, s)) => Some((p, s)), + other => panic!("Unexpected result: {other:?}"), + }; let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) .unwrap() .into_iter(); - let sv = table_changes_action_iter(engine, commits, logical_schema.into(), predicate) + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + let sv = table_changes_action_iter(engine, &table_config, commits, logical_schema, predicate) .unwrap() .flat_map(|scan_metadata| { let scan_metadata = scan_metadata.unwrap(); @@ -564,13 +863,7 @@ async fn failing_protocol() { let engine = Arc::new(SyncEngine::new()); let mut mock_table = LocalMockTable::new(); - let protocol = Protocol::try_new( - 3, - 1, - ["fake_feature".to_string()].into(), - ["fake_feature".to_string()].into(), - ) - .unwrap(); + let protocol = Protocol::try_new_modern(["fake_feature"], ["fake_feature"]).unwrap(); mock_table .commit([ @@ -592,8 +885,10 @@ async fn failing_protocol() { .unwrap() .into_iter(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); let res: DeltaResult> = - table_changes_action_iter(engine, commits, get_schema().into(), None) + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) .unwrap() .try_collect(); @@ -622,6 +917,411 @@ async fn file_meta_timestamp() { let commit = commits.next().unwrap(); let file_meta_ts = commit.location.last_modified; - let scanner = LogReplayScanner::try_new(engine.as_ref(), commit, &get_schema().into()).unwrap(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let mut table_config = get_default_table_config(&table_root_url); + let scanner = + LogReplayScanner::try_new(engine.as_ref(), &mut table_config, commit, &get_schema()) + .unwrap(); assert_eq!(scanner.timestamp, file_meta_ts); } + +#[tokio::test] +async fn print_table_configuration() { + let tracing_guard = LoggingTest::new(); + + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + mock_table + .commit([ + Action::Metadata( + Metadata::try_new( + None, + None, + get_schema(), + vec![], + 0, + HashMap::from([ + ("delta.enableChangeDataFeed".to_string(), 
"true".to_string()), + ( + "delta.enableDeletionVectors".to_string(), + "true".to_string(), + ), + ("delta.columnMapping.mode".to_string(), "none".to_string()), + ]), + ) + .unwrap(), + ), + Action::Protocol( + Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors, TableFeature::ChangeDataFeed], + ) + .unwrap(), + ), + ]) + .await; + + let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + .into_iter(); + + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + + let _scan_batches: DeltaResult> = + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) + .unwrap() + .try_collect(); + + let log_output = tracing_guard.logs(); + + assert!(log_output.contains("Table configuration updated during CDF query")); + assert!(log_output.contains("version=0")); + assert!(log_output.contains("id=")); + assert!(log_output.contains("writerFeatures=[deletionVectors, changeDataFeed]")); + assert!(log_output.contains("minReaderVersion=3")); + assert!(log_output.contains("minWriterVersion=7")); + assert!(log_output.contains("schemaString={\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}")); + assert!(log_output.contains("configuration=")); + assert!(log_output.contains("\"delta.enableChangeDataFeed\": \"true\"")); + assert!(log_output.contains("\"delta.columnMapping.mode\": \"none\"")); + assert!(log_output.contains("\"delta.enableDeletionVectors\": \"true\"")); +} + +#[tokio::test] +async fn print_table_info_post_phase1() { + let tracing_guard = LoggingTest::new(); + + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + // This specific commit (with these actions) isn't necessary to test the tracing for this test, we just need to have one commit with any actions + mock_table + .commit([ + Action::Metadata( + Metadata::try_new( + None, + None, + get_schema(), + vec![], + 0, + HashMap::from([ + ("delta.enableChangeDataFeed".to_string(), "true".to_string()), + ( + "delta.enableDeletionVectors".to_string(), + "true".to_string(), + ), + ("delta.columnMapping.mode".to_string(), "none".to_string()), + ]), + ) + .unwrap(), + ), + Action::Protocol( + Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors, TableFeature::ChangeDataFeed], + ) + .unwrap(), + ), + ]) + .await; + + let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + .into_iter(); + + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + + let _scan_batches: DeltaResult> = + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) + .unwrap() + .try_collect(); + + let log_output = tracing_guard.logs(); + + assert!(log_output.contains("Phase 1 of CDF query processing completed")); + assert!(log_output.contains("id=")); + assert!(log_output.contains("remove_dvs_size=0")); + assert!(log_output.contains("has_cdc_action=false")); + assert!(log_output.contains("file_path=")); + assert!(log_output.contains("version=0")); + assert!(log_output.contains("timestamp=")); +} + +#[tokio::test] +async fn print_table_info_post_phase1_has_cdc() { + let tracing_guard = LoggingTest::new(); + + let engine = 
Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + mock_table + .commit([ + Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + }), + Action::Cdc(Cdc { + path: "fake_path_2".into(), + ..Default::default() + }), + ]) + .await; + + let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + .into_iter(); + + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + + let _scan_batches: DeltaResult> = + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) + .unwrap() + .try_collect(); + + let log_output = tracing_guard.logs(); + + assert!(log_output.contains("Phase 1 of CDF query processing completed")); + assert!(log_output.contains("id=")); + assert!(log_output.contains("remove_dvs_size=0")); + assert!(log_output.contains("has_cdc_action=true")); + assert!(log_output.contains("file_path=")); + assert!(log_output.contains("version=0")); + assert!(log_output.contains("timestamp=")); +} + +#[tokio::test] +async fn print_table_info_post_phase1_has_dv() { + let tracing_guard = LoggingTest::new(); + + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + let deletion_vector1 = DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::PersistedRelative, + path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + }; + let deletion_vector2 = DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::PersistedRelative, + path_or_inline_dv: "U5OWRz5k%CFT.Td}yCPW".to_string(), + offset: Some(1), + size_in_bytes: 38, + cardinality: 3, + }; + // - fake_path_1 undergoes a restore. All rows are restored, so the deletion vector is removed. 
+ // - All remaining rows of fake_path_2 are deleted + mock_table + .commit([ + Action::Remove(Remove { + path: "fake_path_1".into(), + data_change: true, + deletion_vector: Some(deletion_vector1.clone()), + ..Default::default() + }), + Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + }), + Action::Remove(Remove { + path: "fake_path_2".into(), + data_change: true, + deletion_vector: Some(deletion_vector2.clone()), + ..Default::default() + }), + ]) + .await; + + let commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + .into_iter(); + + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let table_config = get_default_table_config(&table_root_url); + let _scan_batches: DeltaResult> = + table_changes_action_iter(engine, &table_config, commits, get_schema(), None) + .unwrap() + .try_collect(); + + let log_output = tracing_guard.logs(); + + let expected_remove_dvs: Arc> = HashMap::from([( + "fake_path_1".to_string(), + DvInfo { + deletion_vector: Some(deletion_vector1.clone()), + }, + )]) + .into(); + + assert!(log_output.contains("Phase 1 of CDF query processing completed")); + assert!(log_output.contains("id=")); + assert!(log_output.contains(&format!("remove_dvs_size={}", expected_remove_dvs.len()))); + assert!(log_output.contains("has_cdc_action=false")); + assert!(log_output.contains("file_path=")); + assert!(log_output.contains("version=0")); + assert!(log_output.contains("timestamp=")); +} + +#[tokio::test] +async fn test_timestamp_with_ict_enabled() { + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + mock_table + .commit([ + Action::CommitInfo(CommitInfo::new(1000, Some(2000), None, None, false)), + Action::Metadata( + Metadata::try_new( + None, + None, + get_schema(), + vec![], + 0, + HashMap::from([ + ("delta.enableChangeDataFeed".to_string(), "true".to_string()), + ( + "delta.enableInCommitTimestamps".to_string(), + "true".to_string(), + ), + ]), + ) + .unwrap(), + ), + Action::Protocol( + Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [ + TableFeature::InCommitTimestamp, + TableFeature::ChangeDataFeed, + TableFeature::DeletionVectors, + ], + ) + .unwrap(), + ), + ]) + .await; + + let mut commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + .into_iter(); + + let commit = commits.next().unwrap(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let mut table_config = get_default_table_config(&table_root_url); + let scanner = + LogReplayScanner::try_new(engine.as_ref(), &mut table_config, commit, &get_schema()) + .unwrap(); + assert_eq!(scanner.timestamp, 2000); +} + +#[tokio::test] +async fn test_timestamp_with_ict_disabled() { + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + mock_table + .commit([ + Action::CommitInfo(CommitInfo::new(1000, Some(2000), None, None, false)), + Action::Metadata( + Metadata::try_new( + None, + None, + get_schema(), + vec![], + 0, + HashMap::from([("delta.enableChangeDataFeed".to_string(), "true".to_string())]), + ) + .unwrap(), + ), + Action::Protocol( + Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [ + TableFeature::InCommitTimestamp, + TableFeature::ChangeDataFeed, + TableFeature::DeletionVectors, + ], + ) + .unwrap(), + ), + ]) + .await; + + let mut commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + 
.into_iter(); + + let commit = commits.next().unwrap(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let mut table_config = get_default_table_config(&table_root_url); + let scanner = LogReplayScanner::try_new( + engine.as_ref(), + &mut table_config, + commit.clone(), + &get_schema(), + ) + .unwrap(); + assert_ne!(scanner.timestamp, 2000); + assert_eq!(scanner.timestamp, commit.location.last_modified); +} + +#[tokio::test] +async fn test_timestamp_with_commit_info_not_first() { + let engine = Arc::new(SyncEngine::new()); + let mut mock_table = LocalMockTable::new(); + + mock_table + .commit([ + Action::Metadata( + Metadata::try_new( + None, + None, + get_schema(), + vec![], + 0, + HashMap::from([ + ("delta.enableChangeDataFeed".to_string(), "true".to_string()), + ( + "delta.enableInCommitTimestamps".to_string(), + "true".to_string(), + ), + ]), + ) + .unwrap(), + ), + Action::Protocol( + Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [ + TableFeature::InCommitTimestamp, + TableFeature::ChangeDataFeed, + TableFeature::DeletionVectors, + ], + ) + .unwrap(), + ), + Action::CommitInfo(CommitInfo::new(1000, Some(2000), None, None, false)), + ]) + .await; + + let mut commits = get_segment(engine.as_ref(), mock_table.table_root(), 0, None) + .unwrap() + .into_iter(); + + let commit = commits.next().unwrap(); + let table_root_url = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let mut table_config = get_default_table_config(&table_root_url); + let result = + LogReplayScanner::try_new(engine.as_ref(), &mut table_config, commit, &get_schema()); + + // Should error because ICT is enabled but not found in the first action + assert_result_error_with_message( + result, + "In-commit timestamp is enabled but not found in commit at version 0", + ); +} diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index 9172dc8834..68b2d12ea2 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -3,14 +3,14 @@ //! # Example //! ```rust //! # use std::sync::Arc; -//! # use test_utils::DefaultEngineExtension; -//! # use delta_kernel::engine::default::DefaultEngine; +//! # use delta_kernel::engine::default::{DefaultEngine, DefaultEngineBuilder}; //! # use delta_kernel::expressions::{column_expr, Scalar}; //! # use delta_kernel::{Predicate, Snapshot, SnapshotRef, Error, Engine}; //! # use delta_kernel::table_changes::TableChanges; //! # let path = "./tests/data/table-with-cdf"; -//! # let engine = DefaultEngine::new_local(); //! let url = delta_kernel::try_parse_uri(path)?; +//! # use delta_kernel::engine::default::storage::store_from_url; +//! # let engine = std::sync::Arc::new(DefaultEngineBuilder::new(store_from_url(&url)?).build()); //! // Get the table changes (change data feed) between version 0 and 1 //! let table_changes = TableChanges::try_new(url, engine.as_ref(), 0, Some(1))?; //! @@ -27,7 +27,7 @@ //! .with_predicate(predicate.clone()) //! .build()?; //! -//! // Execute the table changes scan to get a fallible iterator of `ScanResult`s +//! // Execute the table changes scan to get a fallible iterator of `Box`s //! let table_change_batches = table_changes_scan.execute(engine.clone())?; //! # Ok::<(), Error>(()) //! 
``` @@ -36,14 +36,13 @@ use std::sync::{Arc, LazyLock}; use scan::TableChangesScanBuilder; use url::Url; -use crate::actions::{ensure_supported_features, Protocol}; use crate::log_segment::LogSegment; use crate::path::AsUrl; use crate::schema::{DataType, Schema, StructField, StructType}; use crate::snapshot::{Snapshot, SnapshotRef}; -use crate::table_features::{ColumnMappingMode, ReaderFeature}; -use crate::table_properties::TableProperties; -use crate::utils::require; +use crate::table_configuration::TableConfiguration; +use crate::table_features::Operation; +use crate::table_features::TableFeature; use crate::{DeltaResult, Engine, Error, Version}; mod log_replay; @@ -95,14 +94,13 @@ static CDF_FIELDS: LazyLock<[StructField; 3]> = LazyLock::new(|| { /// # Examples /// Get `TableChanges` for versions 0 to 1 (inclusive) /// ```rust -/// # use delta_kernel::engine::default::DefaultEngine; -/// # use test_utils::DefaultEngineExtension; +/// # use delta_kernel::engine::default::{storage::store_from_url, DefaultEngineBuilder}; /// # use delta_kernel::{SnapshotRef, Error}; /// # use delta_kernel::table_changes::TableChanges; -/// # let engine = DefaultEngine::new_local(); /// # let path = "./tests/data/table-with-cdf"; -/// let url = delta_kernel::try_parse_uri(path).unwrap(); -/// let table_changes = TableChanges::try_new(url, engine.as_ref(), 0, Some(1))?; +/// let url = delta_kernel::try_parse_uri(path)?; +/// # let engine = DefaultEngineBuilder::new(store_from_url(&url)?).build(); +/// let table_changes = TableChanges::try_new(url, &engine, 0, Some(1))?; /// # Ok::<(), Error>(()) /// ```` /// For more details, see the following sections of the protocol: @@ -115,6 +113,7 @@ pub struct TableChanges { end_snapshot: SnapshotRef, start_version: Version, schema: Schema, + start_table_config: TableConfiguration, } impl TableChanges { @@ -147,46 +146,33 @@ impl TableChanges { end_version, )?; - // Both snapshots ensure that reading is supported at the start and end version using - // `ensure_read_supported`. Note that we must still verify that reading is - // supported for every protocol action in the CDF range. let start_snapshot = Snapshot::builder_for(table_root.as_url().clone()) .at_version(start_version) .build(engine)?; + start_snapshot + .table_configuration() + .ensure_operation_supported(Operation::Cdf)?; + let end_snapshot = match end_version { Some(version) => Snapshot::builder_from(start_snapshot.clone()) .at_version(version) .build(engine)?, None => Snapshot::builder_from(start_snapshot.clone()).build(engine)?, }; - - // we block reading catalog-managed tables with CDF for now. note this is best-effort just - // checking that start/end snapshots are not catalog-managed. - // - // TODO: link issue - #[cfg(feature = "catalog-managed")] - require!( - !start_snapshot - .table_configuration() - .protocol() - .is_catalog_managed() - && !end_snapshot - .table_configuration() - .protocol() - .is_catalog_managed(), - Error::unsupported("Change data feed is not supported for catalog-managed tables") - ); + end_snapshot + .table_configuration() + .ensure_operation_supported(Operation::Cdf)?; // Verify CDF is enabled at the beginning and end of the interval using // [`check_cdf_table_properties`] to fail early. This also ensures that column mapping is // disabled. // - // We also check the [`Protocol`] using [`ensure_cdf_read_supported`] to verify that - // we support CDF with those features enabled. - // // Note: We must still check each metadata and protocol action in the CDF range. 
let check_table_config = |snapshot: &Snapshot| { - if snapshot.table_configuration().is_cdf_read_supported() { + if snapshot + .table_configuration() + .is_feature_enabled(&TableFeature::ChangeDataFeed) + { Ok(()) } else { Err(Error::change_data_feed_unsupported(snapshot.version())) @@ -219,6 +205,7 @@ impl TableChanges { log_segment, start_version, schema, + start_table_config: start_snapshot.table_configuration().clone(), }) } @@ -252,42 +239,6 @@ impl TableChanges { } } -/// Ensures that change data feed is enabled in `table_properties`. See the documentation -/// of [`TableChanges`] for more details. -fn check_cdf_table_properties(table_properties: &TableProperties) -> DeltaResult<()> { - require!( - table_properties.enable_change_data_feed.unwrap_or(false), - Error::unsupported("Change data feed is not enabled") - ); - require!( - matches!( - table_properties.column_mapping_mode, - None | Some(ColumnMappingMode::None) - ), - Error::unsupported("Change data feed not supported when column mapping is enabled") - ); - Ok(()) -} - -/// Ensures that Change Data Feed is supported for a table with this [`Protocol`] . -/// See the documentation of [`TableChanges`] for more details. -fn ensure_cdf_read_supported(protocol: &Protocol) -> DeltaResult<()> { - static CDF_SUPPORTED_READER_FEATURES: LazyLock> = - LazyLock::new(|| vec![ReaderFeature::DeletionVectors]); - match &protocol.reader_features() { - // if min_reader_version = 3 and all reader features are subset of supported => OK - Some(reader_features) if protocol.min_reader_version() == 3 => { - ensure_supported_features(reader_features, &CDF_SUPPORTED_READER_FEATURES) - } - // if min_reader_version = 1 and there are no reader features => OK - None if protocol.min_reader_version() == 1 => Ok(()), - // any other protocol is not supported - _ => Err(Error::unsupported( - "Change data feed not supported on this protocol", - )), - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index dd38f8015d..0676cc3fda 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -2,8 +2,8 @@ use std::collections::HashMap; use crate::expressions::Scalar; use crate::scan::state_info::StateInfo; +use crate::scan::transform_spec::{get_transform_expr, parse_partition_values}; use crate::schema::{DataType, SchemaRef, StructField, StructType}; -use crate::transforms::{get_transform_expr, parse_partition_values}; use crate::{DeltaResult, Error, ExpressionRef}; use super::scan_file::{CdfScanFile, CdfScanFileType}; @@ -107,6 +107,7 @@ pub(crate) fn get_cdf_transform_expr( &state_info.logical_schema, transform_spec, &scan_file.partition_values, + state_info.column_mapping_mode, )?; partition_values.extend(parsed_values); @@ -129,9 +130,10 @@ mod tests { use crate::expressions::Expression; use crate::scan::state::DvInfo; use crate::scan::state_info::StateInfo; + use crate::scan::transform_spec::FieldTransformSpec; use crate::scan::PhysicalPredicate; use crate::schema::{DataType, StructField, StructType}; - use crate::transforms::FieldTransformSpec; + use crate::table_features::ColumnMappingMode; use std::collections::HashMap; use std::sync::Arc; @@ -166,6 +168,7 @@ mod tests { commit_timestamp: 1000000000000, dv_info: DvInfo::default(), remove_dv: None, + size: None, } } @@ -180,6 +183,9 @@ mod tests { physical_schema: physical_schema.into(), physical_predicate: PhysicalPredicate::None, transform_spec: 
Some(Arc::new(transform_spec)), + column_mapping_mode: ColumnMappingMode::None, + physical_stats_schema: None, + physical_partition_schema: None, } } @@ -376,6 +382,7 @@ mod tests { commit_timestamp: 1000000000000, dv_info: DvInfo::default(), remove_dv: None, + size: None, }; // Create a simple schema without CDF metadata columns @@ -397,6 +404,9 @@ mod tests { physical_schema: physical_schema.clone().into(), physical_predicate: PhysicalPredicate::None, transform_spec: Some(Arc::new(transform_spec)), + column_mapping_mode: ColumnMappingMode::None, + physical_stats_schema: None, + physical_partition_schema: None, }; let result = get_cdf_transform_expr(&scan_file, &state_info, &physical_schema); diff --git a/kernel/src/table_changes/resolve_dvs.rs b/kernel/src/table_changes/resolve_dvs.rs index 765a4bef3b..c478166c38 100644 --- a/kernel/src/table_changes/resolve_dvs.rs +++ b/kernel/src/table_changes/resolve_dvs.rs @@ -199,6 +199,7 @@ mod tests { partition_values: HashMap::new(), commit_version: 42, commit_timestamp: 1234, + size: None, } } diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index a7a59198b7..629980147d 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -8,9 +8,10 @@ use url::Url; use crate::actions::deletion_vector::split_vector; use crate::scan::field_classifiers::CdfTransformFieldClassifier; use crate::scan::state_info::StateInfo; -use crate::scan::{PhysicalPredicate, ScanResult}; +use crate::scan::PhysicalPredicate; +use crate::scan::StatsOutputMode; use crate::schema::SchemaRef; -use crate::{DeltaResult, Engine, FileMeta, PredicateRef}; +use crate::{DeltaResult, Engine, EngineData, Error, FileMeta, PredicateRef}; use super::log_replay::{table_changes_action_iter, TableChangesScanMetadata}; use super::physical_to_logical::{get_cdf_transform_expr, scan_file_physical_schema}; @@ -42,15 +43,14 @@ pub struct TableChangesScan { /// Construct a [`TableChangesScan`] from `table_changes` with a given schema and predicate /// ```rust /// # use std::sync::Arc; -/// # use test_utils::DefaultEngineExtension; -/// # use delta_kernel::engine::default::DefaultEngine; /// # use delta_kernel::expressions::{column_expr, Scalar}; /// # use delta_kernel::Predicate; /// # use delta_kernel::table_changes::TableChanges; /// # let path = "./tests/data/table-with-cdf"; -/// # let engine = DefaultEngine::new_local(); /// # let url = delta_kernel::try_parse_uri(path).unwrap(); -/// # let table_changes = TableChanges::try_new(url, engine.as_ref(), 0, Some(1)).unwrap(); +/// # use delta_kernel::engine::default::{storage::store_from_url, DefaultEngineBuilder}; +/// # let engine = DefaultEngineBuilder::new(store_from_url(&url).unwrap()).build(); +/// # let table_changes = TableChanges::try_new(url, &engine, 0, Some(1)).unwrap(); /// let schema = table_changes /// .schema() /// .project(&["id", "_commit_version"]) @@ -114,10 +114,12 @@ impl TableChangesScanBuilder { .unwrap_or_else(|| self.table_changes.schema.clone().into()); // Create StateInfo using CDF field classifier + // CDF doesn't support stats output let state_info = StateInfo::try_new( logical_schema, self.table_changes.end_snapshot.table_configuration(), self.predicate, + StatsOutputMode::default(), CdfTransformFieldClassifier, )?; @@ -142,6 +144,7 @@ impl TableChangesScan { let commits = self .table_changes .log_segment + .listed .ascending_commit_files .clone(); // NOTE: This is a cheap arc clone @@ -151,7 +154,13 @@ impl TableChangesScan { PhysicalPredicate::None => None, 
}; let schema = self.table_changes.end_snapshot.schema(); - let it = table_changes_action_iter(engine, commits, schema, physical_predicate)?; + let it = table_changes_action_iter( + engine, + &self.table_changes.start_table_config, + commits, + schema, + physical_predicate, + )?; Ok(Some(it).into_iter().flatten()) } @@ -182,14 +191,13 @@ impl TableChangesScan { } } - /// Perform an "all in one" scan to get the change data feed. This will use the provided `engine` - /// to read and process all the data for the query. Each [`ScanResult`] in the resultant iterator - /// encapsulates the raw data and an optional boolean vector built from the deletion vector if it - /// was present. See the documentation for [`ScanResult`] for more details. + /// Perform an "all in one" scan to get the change data feed. This will use the provided + /// `engine` to read and process all the data for the query. Each [`EngineData`] in the + /// resultant iterator is a portion of the final set of data. pub fn execute( &self, engine: Arc, - ) -> DeltaResult> + use<'_>> { + ) -> DeltaResult>>> { let scan_metadata = self.scan_metadata(engine.clone())?; let scan_files = scan_metadata_to_scan_file(scan_metadata); @@ -197,6 +205,9 @@ impl TableChangesScan { let state_info = self.state_info.clone(); let dv_engine_ref = engine.clone(); + let table_root_copy = self.table_changes.table_root().clone(); + let physical_predicate = self.physical_predicate().clone(); + let result = scan_files .map(move |scan_file| { resolve_scan_file_dv(dv_engine_ref.as_ref(), &table_root, scan_file?) @@ -206,9 +217,9 @@ impl TableChangesScan { read_scan_file( engine.as_ref(), resolved_scan_file?, - self.table_root(), + &table_root_copy, state_info.as_ref(), - self.physical_predicate(), + physical_predicate.clone(), ) }) // Iterator-Result-Iterator-Result .flatten_ok() // Iterator-Result-Result @@ -219,14 +230,14 @@ impl TableChangesScan { } /// Reads the data at the `resolved_scan_file` and transforms the data from physical to logical. -/// The result is a fallible iterator of [`ScanResult`] containing the logical data. +/// The result is a fallible iterator of [`Box`] containing the logical data. fn read_scan_file( engine: &dyn Engine, resolved_scan_file: ResolvedCdfScanFile, table_root: &Url, state_info: &StateInfo, _physical_predicate: Option, -) -> DeltaResult>> { +) -> DeltaResult>>> { let ResolvedCdfScanFile { scan_file, mut selection_vector, @@ -237,20 +248,27 @@ fn read_scan_file( let transform_expr = get_cdf_transform_expr(&scan_file, state_info, physical_schema.as_ref())?; // Only create an evaluator if transformation is needed - let phys_to_logical_eval = transform_expr.map(|expr| { - engine.evaluation_handler().new_expression_evaluator( - physical_schema.clone(), - expr, - state_info.logical_schema.clone().into(), - ) - }); + let phys_to_logical_eval = transform_expr + .map(|expr| { + engine.evaluation_handler().new_expression_evaluator( + physical_schema.clone(), + expr, + state_info.logical_schema.clone().into(), + ) + }) + .transpose()?; // Determine if the scan file was derived from a deletion vector pair let is_dv_resolved_pair = scan_file.remove_dv.is_some(); let location = table_root.join(&scan_file.path)?; let file = FileMeta { last_modified: 0, - size: 0, + size: match scan_file.size { + Some(s) => s + .try_into() + .map_err(|_| Error::generic(format!("invalid file size: {s}")))?, + None => 0, + }, location, }; // TODO(#860): we disable predicate pushdown until we support row indexes. 
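
The `size` handling added to `read_scan_file` above converts an optional signed size carried by the scan file into `FileMeta`'s unsigned size field, rejecting negative values and falling back to 0 when the size is unknown. As an illustration of that conversion only, here is a minimal standalone sketch; `file_size_or_default` is a hypothetical helper name and a plain `String` error stands in for the kernel's `Error` type:

```rust
// Minimal sketch, not the kernel's actual code: map an optional signed size from the
// log onto an unsigned size, treating negative values as errors and a missing size as 0.
fn file_size_or_default(size: Option<i64>) -> Result<u64, String> {
    match size {
        Some(s) => u64::try_from(s).map_err(|_| format!("invalid file size: {s}")),
        None => Ok(0),
    }
}

fn main() {
    assert_eq!(file_size_or_default(Some(100)), Ok(100));
    assert_eq!(file_size_or_default(None), Ok(0));
    assert!(file_size_or_default(Some(-1)).is_err());
}
```
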
@@ -299,12 +317,12 @@ fn read_scan_file( // the selection vector is `None`. let extend = Some(!is_dv_resolved_pair); let rest = split_vector(sv.as_mut(), len, extend); - let result = ScanResult { - raw_data: logical, - raw_mask: sv, + let result = match sv { + Some(sv) => logical.and_then(|data| data.apply_selection_vector(sv)), + None => logical, }; selection_vector = rest; - Ok(result) + result }); Ok(result) } @@ -315,11 +333,11 @@ mod tests { use crate::engine::sync::SyncEngine; use crate::expressions::{column_expr, Scalar}; + use crate::scan::transform_spec::FieldTransformSpec; use crate::scan::PhysicalPredicate; use crate::schema::{DataType, StructField, StructType}; use crate::table_changes::TableChanges; use crate::table_changes::COMMIT_VERSION_COL_NAME; - use crate::transforms::FieldTransformSpec; use crate::Predicate; #[test] diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 5eca45763c..0bf60c7221 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -53,6 +53,8 @@ pub(crate) struct CdfScanFile { pub commit_version: i64, /// The timestamp of the commit that this action was performed in pub commit_timestamp: i64, + /// The size of the file in bytes + pub size: Option, } pub(crate) type CdfScanCallback = fn(context: &mut T, scan_file: CdfScanFile); @@ -126,7 +128,7 @@ struct CdfScanFileVisitor<'a, T> { impl RowVisitor for CdfScanFileVisitor<'_, T> { fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( - getters.len() == 18, + getters.len() == 21, Error::InternalError(format!( "Wrong number of CdfScanFileVisitor getters: {}", getters.len() @@ -137,26 +139,29 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { continue; } - let (scan_type, path, deletion_vector, partition_values) = + let (scan_type, path, deletion_vector, partition_values, size) = if let Some(path) = getters[0].get_opt(row_index, "scanFile.add.path")? { let scan_type = CdfScanFileType::Add; let deletion_vector = visit_deletion_vector_at(row_index, &getters[1..=5])?; let partition_values = getters[6] .get_opt(row_index, "scanFile.add.fileConstantValues.partitionValues")?; - (scan_type, path, deletion_vector, partition_values) - } else if let Some(path) = getters[7].get_opt(row_index, "scanFile.remove.path")? { + let size = getters[7].get_opt(row_index, "scanFile.add.size")?; + (scan_type, path, deletion_vector, partition_values, size) + } else if let Some(path) = getters[8].get_opt(row_index, "scanFile.remove.path")? { let scan_type = CdfScanFileType::Remove; - let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..=12])?; - let partition_values = getters[13].get_opt( + let deletion_vector = visit_deletion_vector_at(row_index, &getters[9..=13])?; + let partition_values = getters[14].get_opt( row_index, "scanFile.remove.fileConstantValues.partitionValues", )?; - (scan_type, path, deletion_vector, partition_values) - } else if let Some(path) = getters[14].get_opt(row_index, "scanFile.cdc.path")? { + let size = getters[15].get_opt(row_index, "scanFile.remove.size")?; + (scan_type, path, deletion_vector, partition_values, size) + } else if let Some(path) = getters[16].get_opt(row_index, "scanFile.cdc.path")? 
{ let scan_type = CdfScanFileType::Cdc; - let partition_values = getters[15] + let partition_values = getters[17] .get_opt(row_index, "scanFile.cdc.fileConstantValues.partitionValues")?; - (scan_type, path, None, partition_values) + let size = getters[18].get_opt(row_index, "scanFile.cdc.size")?; + (scan_type, path, None, partition_values, size) } else { continue; }; @@ -167,8 +172,9 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { path, dv_info: DvInfo { deletion_vector }, partition_values, - commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, - commit_version: getters[17].get(row_index, "scanFile.commit_version")?, + commit_timestamp: getters[19].get(row_index, "scanFile.timestamp")?, + commit_version: getters[20].get(row_index, "scanFile.commit_version")?, + size, }; (self.callback)(&mut self.context, scan_file) } @@ -200,15 +206,18 @@ pub(crate) fn cdf_scan_row_schema() -> SchemaRef { StructField::nullable("path", DataType::STRING), StructField::nullable("deletionVector", deletion_vector.clone()), StructField::nullable("fileConstantValues", file_constant_values.clone()), + StructField::nullable("size", DataType::LONG), ]); let remove = StructType::new_unchecked([ StructField::nullable("path", DataType::STRING), StructField::nullable("deletionVector", deletion_vector), StructField::nullable("fileConstantValues", file_constant_values.clone()), + StructField::nullable("size", DataType::LONG), ]); let cdc = StructType::new_unchecked([ StructField::nullable("path", DataType::STRING), StructField::nullable("fileConstantValues", file_constant_values), + StructField::nullable("size", DataType::LONG), ]); Arc::new(StructType::new_unchecked([ @@ -230,15 +239,18 @@ pub(crate) fn cdf_scan_row_expression(commit_timestamp: i64, commit_number: i64) column_expr!("add.path"), column_expr!("add.deletionVector"), Expression::struct_from([column_expr!("add.partitionValues")]), + column_expr!("add.size"), ]), Expression::struct_from([ column_expr!("remove.path"), column_expr!("remove.deletionVector"), Expression::struct_from([column_expr!("remove.partitionValues")]), + column_expr!("remove.size"), ]), Expression::struct_from([ column_expr!("cdc.path"), Expression::struct_from([column_expr!("cdc.partitionValues")]), + column_expr!("cdc.size"), ]), Expression::literal(commit_timestamp), Expression::literal(commit_number), @@ -281,6 +293,7 @@ mod tests { deletion_vector: Some(dv_info.clone()), partition_values: add_partition_values, data_change: true, + size: 100i64, ..Default::default() }; let remove_paired = Remove { @@ -288,6 +301,7 @@ mod tests { deletion_vector: None, partition_values: None, data_change: true, + size: Some(200i64), ..Default::default() }; @@ -304,6 +318,7 @@ mod tests { deletion_vector: Some(rm_dv), partition_values: rm_partition_values, data_change: true, + size: None, ..Default::default() }; @@ -319,6 +334,7 @@ mod tests { deletion_vector: None, partition_values: None, data_change: true, + size: None, ..Default::default() }; @@ -339,14 +355,38 @@ mod tests { let log_segment = LogSegment::for_table_changes(engine.storage_handler().as_ref(), log_root, 0, None) .unwrap(); - let table_schema = StructType::new_unchecked([ + let table_schema = Arc::new(StructType::new_unchecked([ StructField::nullable("id", DataType::INTEGER), StructField::nullable("value", DataType::STRING), - ]); + ])); + + // Create a TableConfiguration for testing + use crate::actions::{Metadata, Protocol}; + use crate::table_configuration::TableConfiguration; + use 
crate::table_properties::{COLUMN_MAPPING_MODE, ENABLE_CHANGE_DATA_FEED}; + + let metadata = Metadata::try_new( + None, + None, + table_schema.clone(), + vec![], + 0, + HashMap::from([ + (ENABLE_CHANGE_DATA_FEED.to_string(), "true".to_string()), + (COLUMN_MAPPING_MODE.to_string(), "none".to_string()), + ]), + ) + .unwrap(); + // CDF (enableChangeDataFeed) requires min_writer_version = 4 + let protocol = Protocol::try_new_legacy(1, 4).unwrap(); + let table_config = + TableConfiguration::try_new(metadata, protocol, table_root.clone(), 0).unwrap(); + let scan_metadata = table_changes_action_iter( Arc::new(engine), - log_segment.ascending_commit_files.clone(), - table_schema.into(), + &table_config, + log_segment.listed.ascending_commit_files.clone(), + table_schema, None, ) .unwrap(); @@ -356,6 +396,7 @@ mod tests { // Generate the expected [`CdfScanFile`] let timestamps = log_segment + .listed .ascending_commit_files .iter() .map(|commit| commit.location.last_modified) @@ -374,6 +415,7 @@ mod tests { commit_version: 0, commit_timestamp: timestamps[0], remove_dv: Some(expected_remove_dv), + size: Some(add_paired.size), }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -385,6 +427,7 @@ mod tests { commit_version: 0, commit_timestamp: timestamps[0], remove_dv: None, + size: remove.size, }, CdfScanFile { scan_type: CdfScanFileType::Cdc, @@ -396,6 +439,7 @@ mod tests { commit_version: 1, commit_timestamp: timestamps[1], remove_dv: None, + size: Some(cdc.size), }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -407,6 +451,7 @@ mod tests { commit_version: 2, commit_timestamp: timestamps[2], remove_dv: None, + size: remove_no_partition.size, }, ]; diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index b5586d0e47..df901be1aa 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -8,20 +8,43 @@ //! [`TableProperties`]. //! //! 
[`Schema`]: crate::schema::Schema -use std::sync::{Arc, LazyLock}; +use std::borrow::Cow; +use std::collections::HashSet; +use std::sync::{Arc, OnceLock}; use url::Url; -use crate::actions::{ensure_supported_features, Metadata, Protocol}; -use crate::schema::variant_utils::validate_variant_type_feature_support; -use crate::schema::{InvariantChecker, SchemaRef}; +use crate::actions::{Metadata, Protocol}; +use crate::expressions::ColumnName; +use crate::scan::data_skipping::stats_schema::{ + expected_stats_schema, stats_column_names, StatsConfig, StripFieldMetadataTransform, +}; +pub(crate) use crate::schema::variant_utils::validate_variant_type_feature_support; +use crate::schema::{schema_has_invariants, SchemaRef, StructField, StructType}; use crate::table_features::{ - column_mapping_mode, validate_schema_column_mapping, validate_timestamp_ntz_feature_support, - ColumnMappingMode, ReaderFeature, WriterFeature, + column_mapping_mode, get_any_level_column_physical_name, + validate_timestamp_ntz_feature_support, ColumnMappingMode, EnablementCheck, FeatureRequirement, + FeatureType, KernelSupport, Operation, TableFeature, LEGACY_READER_FEATURES, + LEGACY_WRITER_FEATURES, MAX_VALID_READER_VERSION, MAX_VALID_WRITER_VERSION, + MIN_VALID_RW_VERSION, TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION, }; use crate::table_properties::TableProperties; +use crate::transforms::SchemaTransform as _; +use crate::utils::require; use crate::{DeltaResult, Error, Version}; use delta_kernel_derive::internal_api; +use tracing::warn; + +/// Expected schema for file statistics, using physical column names. +/// +/// Wrapped in a struct so it can be extended with a logical-name variant if needed. +#[allow(unused)] +#[derive(Debug, Clone)] +#[internal_api] +pub(crate) struct ExpectedStatsSchemas { + /// Stats schema using physical column names (for storage). + pub physical: SchemaRef, +} /// Information about in-commit timestamp enablement state. #[derive(Debug, Clone, PartialEq, Eq)] @@ -36,22 +59,63 @@ pub(crate) enum InCommitTimestampEnablement { }, } +/// Utility function to strip field metadata from stats schemas. This metadata describes logical +/// table columns, not the stats. Keeping it can cause schema mismatches when combining the parsed +/// stats from a checkpoint written before logical metadata was added. +fn strip_metadata(schema: SchemaRef) -> SchemaRef { + match StripFieldMetadataTransform.transform_struct(&schema) { + Some(Cow::Owned(s)) => Arc::new(s), + _ => schema, + } +} + +/// Physical schema variants for a table. +/// +/// - `full`: physical representations of all columns from [`TableConfiguration::logical_schema`]. +/// - `without_partition`: lazily computed variant that excludes partition columns. +#[derive(Debug, Clone, Eq)] +struct PhysicalSchemas { + full: SchemaRef, + without_partition: OnceLock, +} + +impl PhysicalSchemas { + fn new(full: SchemaRef) -> Self { + Self { + full, + without_partition: OnceLock::new(), + } + } +} + +impl PartialEq for PhysicalSchemas { + fn eq(&self, other: &Self) -> bool { + // `without_partition` is deterministically derived from `full` and partition columns + // (compared via `metadata` in TableConfiguration's PartialEq), so comparing it is + // redundant. Two PhysicalSchemas with the same `full` are considered equal even if + // one has `without_partition` initialized and the other does not. + self.full == other.full + } +} + /// Holds all the configuration for a table at a specific version. 
This includes the supported /// reader and writer features, table properties, schema, version, and table root. This can be used /// to check whether a table supports a feature or has it enabled. For example, deletion vector -/// support can be checked with [`TableConfiguration::is_deletion_vector_supported`] and deletion -/// vector write enablement can be checked with [`TableConfiguration::is_deletion_vector_enabled`]. +/// support can be checked with [`TableConfiguration::is_feature_supported`] and deletion +/// vector write enablement can be checked with [`TableConfiguration::is_feature_enabled`]. /// /// [`TableConfiguration`] performs checks upon construction with `TableConfiguration::try_new` -/// to validate that Metadata and Protocol are correctly formatted and mutually compatible. If -/// `try_new` successfully returns `TableConfiguration`, it is also guaranteed that reading the -/// table is supported. +/// to validate that Metadata and Protocol are correctly formatted and mutually compatible. +/// After construction, call `ensure_operation_supported` to verify that the kernel supports the +/// required operations for the table's protocol features. #[internal_api] #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct TableConfiguration { metadata: Metadata, protocol: Protocol, - schema: SchemaRef, + /// Logical schema: field names are the user-facing (logical) column names. + logical_schema: SchemaRef, + physical_schemas: PhysicalSchemas, table_properties: TableProperties, column_mapping_mode: ColumnMappingMode, table_root: Url, @@ -85,28 +149,29 @@ impl TableConfiguration { table_root: Url, version: Version, ) -> DeltaResult { - protocol.ensure_read_supported()?; - - let schema = Arc::new(metadata.parse_schema()?); + let logical_schema = Arc::new(metadata.parse_schema()?); let table_properties = metadata.parse_table_properties(); let column_mapping_mode = column_mapping_mode(&protocol, &table_properties); - // validate column mapping mode -- all schema fields should be correctly (un)annotated - validate_schema_column_mapping(&schema, column_mapping_mode)?; - - validate_timestamp_ntz_feature_support(&schema, &protocol)?; - - validate_variant_type_feature_support(&schema, &protocol)?; + let physical_schema = Arc::new(logical_schema.make_physical(column_mapping_mode)?); + let physical_schemas = PhysicalSchemas::new(physical_schema); - Ok(Self { - schema, + let table_config = Self { + logical_schema, + physical_schemas, metadata, protocol, table_properties, column_mapping_mode, table_root, version, - }) + }; + + // Validate schema against protocol features now that we have a TC instance. + validate_timestamp_ntz_feature_support(&table_config)?; + validate_variant_type_feature_support(&table_config)?; + + Ok(table_config) } pub(crate) fn try_new_from( @@ -134,6 +199,190 @@ impl TableConfiguration { ) } + /// Creates a new [`TableConfiguration`] representing the table configuration immediately + /// after a commit. + /// + /// This method takes a pre-commit table configuration and produces a post-commit + /// configuration at the committed version. This allows immediate use of the new table + /// configuration without re-reading metadata from storage. 
+ /// + /// TODO: Take in Protocol (when Kernel-RS supports protocol changes) + /// TODO: Take in Metadata (when Kernel-RS supports metadata changes) + pub(crate) fn new_post_commit(table_configuration: &Self, new_version: Version) -> Self { + Self { + version: new_version, + ..table_configuration.clone() + } + } + + /// Generates the expected schema for file statistics. + /// + /// Engines can provide statistics for files written to the delta table, enabling + /// data skipping and other optimizations. Returns the physical stats schema wrapped in + /// an `ExpectedStatsSchemas`. + /// + /// The schema is structured as: + /// ```text + /// { + /// numRecords: long, + /// nullCount: { }, + /// minValues: { }, + /// maxValues: { }, + /// } + /// ``` + /// + /// The schemas are affected by: + /// - **Column mapping mode**: Physical schema field names use physical names from column + /// mapping metadata. + /// - **`delta.dataSkippingStatsColumns`**: If set, only specified columns are included. + /// - **`delta.dataSkippingNumIndexedCols`**: Otherwise, includes the first N leaf columns + /// (default 32). + /// - **Required columns** (e.g. clustering columns): Per the Delta protocol, always included + /// in statistics, regardless of the above settings. + /// - **Requested columns**: Optional output filter that limits which columns appear in the + /// schema without affecting column counting. + /// + /// See the Delta protocol for more details on per-file statistics: + /// + #[allow(unused)] + #[internal_api] + pub(crate) fn build_expected_stats_schemas( + &self, + required_physical_columns: Option<&[ColumnName]>, + requested_physical_columns: Option<&[ColumnName]>, + ) -> DeltaResult { + let physical_data_schema = self.physical_data_schema_without_partition_columns(); + let required_physical_stats_columns = self.required_physical_stats_columns(); + let config = StatsConfig { + data_skipping_stats_columns: required_physical_stats_columns.as_deref(), + data_skipping_num_indexed_cols: self.table_properties().data_skipping_num_indexed_cols, + }; + let physical_stats_schema = Arc::new(expected_stats_schema( + &physical_data_schema, + &config, + required_physical_columns, + requested_physical_columns, + )?); + let physical_stats_schema = strip_metadata(physical_stats_schema); + + Ok(ExpectedStatsSchemas { + physical: physical_stats_schema, + }) + } + + /// Returns the list of physical column names that should have statistics collected. + pub(crate) fn physical_stats_column_names( + &self, + required_columns: Option<&[ColumnName]>, + ) -> Vec { + let physical_stats_columns = self.required_physical_stats_columns(); + let config = StatsConfig { + data_skipping_stats_columns: physical_stats_columns.as_deref(), + data_skipping_num_indexed_cols: self.table_properties().data_skipping_num_indexed_cols, + }; + stats_column_names(&self.physical_schema(), &config, required_columns) + } + + /// Returns the physical partition schema for `partitionValues_parsed`. + /// + /// Field names are physical column names (respecting column mapping mode), + /// and field types are the actual partition column data types with their original nullability. + /// Returns `None` if the table has no partition columns. 
+ pub(crate) fn build_partition_values_parsed_schema(&self) -> Option { + let partition_columns = self.metadata().partition_columns(); + if partition_columns.is_empty() { + return None; + } + let logical_schema = self.logical_schema(); + let column_mapping_mode = self.column_mapping_mode(); + let partition_fields: Vec = partition_columns + .iter() + .filter_map(|col_name| { + let field = logical_schema.field(col_name); + if field.is_none() { + warn!("Partition column '{col_name}' not found in table schema"); + } + field + }) + .map(|field: &StructField| { + StructField::new( + field.physical_name(column_mapping_mode).to_owned(), + field.data_type().clone(), + field.is_nullable(), + ) + }) + .collect(); + Some(Arc::new(StructType::new_unchecked(partition_fields))) + } + + /// Returns the logical schema for data columns (excludes partition columns). + /// + /// Partition columns are excluded because statistics are only collected for data columns + /// that are physically stored in the parquet files. Partition values are stored in the + /// file path, not in the file content, so they don't have file-level statistics. + fn logical_data_schema(&self) -> SchemaRef { + let partition_columns = self.partition_columns(); + Arc::new(StructType::new_unchecked( + self.logical_schema() + .fields() + .filter(|field| !partition_columns.contains(field.name())) + .cloned(), + )) + } + + /// Returns the physical data schema excluding partition columns. + pub(crate) fn physical_data_schema_without_partition_columns(&self) -> SchemaRef { + self.physical_schemas + .without_partition + .get_or_init(|| { + let partition_columns: HashSet<&str> = self + .partition_columns() + .iter() + .map(|s| s.as_str()) + .collect(); + // Safety: subset of an already-valid schema. + Arc::new(StructType::new_unchecked( + self.logical_schema() + .fields() + .zip(self.physical_schemas.full.fields()) + .filter(|(logical_field, _)| { + !partition_columns.contains(logical_field.name().as_str()) + }) + .map(|(_, physical_field)| physical_field.clone()), + )) + }) + .clone() + } + + /// Translates `delta.dataSkippingStatsColumns` entries to physical column names. + /// + /// Returns `None` if the table property is not set. Entries that cannot be resolved + /// (e.g. non-existent columns) are silently skipped with a warning. + fn required_physical_stats_columns(&self) -> Option> { + self.table_properties() + .data_skipping_stats_columns + .as_ref() + .map(|cols| { + let logical_schema = self.logical_data_schema(); + let mode = self.column_mapping_mode(); + cols.iter() + .filter_map(|col| { + get_any_level_column_physical_name(&logical_schema, col, mode) + // Theoretically this should always resolve — if it doesn't, + // the user specified a non-existent column in + // delta.dataSkippingStatsColumns, which is safe to ignore. + .inspect_err(|e| { + warn!( + "Couldn't translate dataSkippingStatsColumns entry '{col}' \ + to physical name: {e}; skipping" + ); + }) + .ok() + }) + .collect() + }) + } + /// The [`Metadata`] for this table at this version. #[internal_api] pub(crate) fn metadata(&self) -> &Metadata { @@ -141,6 +390,7 @@ impl TableConfiguration { } /// The [`Protocol`] of this table at this version. + #[allow(unused)] #[internal_api] pub(crate) fn protocol(&self) -> &Protocol { &self.protocol @@ -148,8 +398,33 @@ impl TableConfiguration { /// The logical schema ([`SchemaRef`]) of this table at this version. 
#[internal_api] - pub(crate) fn schema(&self) -> SchemaRef { - self.schema.clone() + pub(crate) fn logical_schema(&self) -> SchemaRef { + self.logical_schema.clone() + } + + /// The physical schema ([`SchemaRef`]) of this table at this version. + /// + /// When column mapping is disabled, this is identical to [`logical_schema`](Self::logical_schema). + /// Otherwise, field names are replaced with physical column names derived from column + /// mapping metadata. + #[internal_api] + pub(crate) fn physical_schema(&self) -> SchemaRef { + self.physical_schemas.full.clone() + } + + /// The physical schema for writing data files. + /// + /// When [`MaterializePartitionColumns`] is enabled, returns the full physical schema + /// (partition columns are materialized in data files). Otherwise, returns the physical + /// schema with partition columns excluded. + /// + /// [`MaterializePartitionColumns`]: crate::table_features::TableFeature::MaterializePartitionColumns + pub(crate) fn physical_write_schema(&self) -> SchemaRef { + if self.is_feature_enabled(&TableFeature::MaterializePartitionColumns) { + self.physical_schema() + } else { + self.physical_data_schema_without_partition_columns() + } } /// The [`TableProperties`] of this table at this version. @@ -158,12 +433,26 @@ impl TableConfiguration { &self.table_properties } + /// Whether this table is catalog-managed (has the CatalogManaged or CatalogOwnedPreview + /// table feature). + #[internal_api] + pub(crate) fn is_catalog_managed(&self) -> bool { + self.is_feature_supported(&TableFeature::CatalogManaged) + || self.is_feature_supported(&TableFeature::CatalogOwnedPreview) + } + /// The [`ColumnMappingMode`] for this table at this version. #[internal_api] pub(crate) fn column_mapping_mode(&self) -> ColumnMappingMode { self.column_mapping_mode } + /// The partition columns of this table (empty if non-partitioned) + #[internal_api] + pub(crate) fn partition_columns(&self) -> &[String] { + self.metadata().partition_columns() + } + /// The [`Url`] of the table this [`TableConfiguration`] belongs to #[internal_api] pub(crate) fn table_root(&self) -> &Url { @@ -176,158 +465,193 @@ impl TableConfiguration { self.version } - /// Returns `true` if the kernel supports writing to this table. This checks that the - /// protocol's writer features are all supported. - #[internal_api] - pub(crate) fn ensure_write_supported(&self) -> DeltaResult<()> { - self.protocol.ensure_write_supported()?; - - // for now we don't allow invariants so although we support writer version 2 and the - // ColumnInvariant TableFeature we _must_ check here that they are not actually in use - if self.is_invariants_supported() - && InvariantChecker::has_invariants(self.schema().as_ref()) - { - return Err(Error::unsupported( - "Column invariants are not yet supported", - )); - } - - // Fail if row tracking is both enabled and suspended - if self.is_row_tracking_enabled() && self.is_row_tracking_suspended() { - return Err(Error::unsupported( - "Row tracking cannot be both enabled and suspended", - )); + /// Validates that all feature requirements for a given feature are satisfied. 
+ fn validate_feature_requirements(&self, feature: &TableFeature) -> DeltaResult<()> { + for req in feature.info().feature_requirements { + match req { + FeatureRequirement::Supported(dep) => { + require!( + self.is_feature_supported(dep), + Error::invalid_protocol(format!( + "Feature '{feature}' requires '{dep}' to be supported" + )) + ); + } + FeatureRequirement::Enabled(dep) => { + require!( + self.is_feature_enabled(dep), + Error::invalid_protocol(format!( + "Feature '{feature}' requires '{dep}' to be enabled" + )) + ); + } + FeatureRequirement::NotSupported(dep) => { + require!( + !self.is_feature_supported(dep), + Error::invalid_protocol(format!( + "Feature '{feature}' requires '{dep}' to not be supported" + )) + ); + } + FeatureRequirement::NotEnabled(dep) => { + require!( + !self.is_feature_enabled(dep), + Error::invalid_protocol(format!( + "Feature '{feature}' requires '{dep}' to not be enabled" + )) + ); + } + FeatureRequirement::Custom(check) => { + check(&self.protocol, &self.table_properties)?; + } + } } - Ok(()) } - /// Returns `true` if kernel supports reading Change Data Feed on this table. - /// See the documentation of [`TableChanges`] for more details. - /// - /// [`TableChanges`]: crate::table_changes::TableChanges - #[internal_api] - pub(crate) fn is_cdf_read_supported(&self) -> bool { - static CDF_SUPPORTED_READER_FEATURES: LazyLock> = - LazyLock::new(|| vec![ReaderFeature::DeletionVectors]); - let protocol_supported = match self.protocol.reader_features() { - // if min_reader_version = 3 and all reader features are subset of supported => OK - Some(reader_features) if self.protocol.min_reader_version() == 3 => { - ensure_supported_features(reader_features, &CDF_SUPPORTED_READER_FEATURES).is_ok() + /// Checks that kernel supports a feature for the given operation. + /// Returns an error if the feature is unknown, not supported, or fails validation. + fn check_feature_support( + &self, + feature: &TableFeature, + operation: Operation, + ) -> DeltaResult<()> { + let info = feature.info(); + match &info.kernel_support { + KernelSupport::Supported => {} + KernelSupport::NotSupported => { + return Err(Error::unsupported(format!( + "Feature '{feature}' is not supported" + ))) + } + KernelSupport::Custom(check) => { + check(&self.protocol, &self.table_properties, operation)?; } - // if min_reader_version = 1 and there are no reader features => OK - None => self.protocol.min_reader_version() == 1, - // any other protocol is not supported - _ => false, }; - let cdf_enabled = self - .table_properties - .enable_change_data_feed - .unwrap_or(false); - let column_mapping_disabled = matches!( - self.table_properties.column_mapping_mode, - None | Some(ColumnMappingMode::None) - ); - protocol_supported && cdf_enabled && column_mapping_disabled + + self.validate_feature_requirements(feature) } - /// Returns `true` if deletion vectors is supported on this table. To support deletion vectors, - /// a table must support reader version 3, writer version 7, and the deletionVectors feature in - /// both the protocol's readerFeatures and writerFeatures. 
- /// - /// See: - #[internal_api] - #[allow(unused)] // needed to compile w/o default features - pub(crate) fn is_deletion_vector_supported(&self) -> bool { - let read_supported = self - .protocol() - .has_reader_feature(&ReaderFeature::DeletionVectors) - && self.protocol.min_reader_version() == 3; - let write_supported = self - .protocol() - .has_writer_feature(&WriterFeature::DeletionVectors) - && self.protocol.min_writer_version() == 7; - read_supported && write_supported - } - - /// Returns `true` if writing deletion vectors is enabled for this table. This is the case - /// when the deletion vectors is supported on this table and the `delta.enableDeletionVectors` - /// table property is set to `true`. - /// - /// See: - #[internal_api] - #[allow(unused)] // needed to compile w/o default features - pub(crate) fn is_deletion_vector_enabled(&self) -> bool { - self.is_deletion_vector_supported() - && self - .table_properties - .enable_deletion_vectors - .unwrap_or(false) - } - - /// Returns `true` if the table supports the appendOnly table feature. To support this feature: - /// - The table must have a writer version between 2 and 7 (inclusive) - /// - If the table is on writer version 7, it must have the [`WriterFeature::AppendOnly`] - /// writer feature. - pub(crate) fn is_append_only_supported(&self) -> bool { - let protocol = &self.protocol; - match protocol.min_writer_version() { - 7 if protocol.has_writer_feature(&WriterFeature::AppendOnly) => true, - version => (2..=6).contains(&version), + /// Returns all reader features enabled for this table based on protocol version. + /// For table features protocol (v3), returns the explicit reader_features list. + /// For legacy protocol (v1-2), infers features from the version number. + fn get_enabled_reader_features(&self) -> Vec { + match self.protocol.min_reader_version() { + TABLE_FEATURES_MIN_READER_VERSION => { + // Table features reader: use explicit reader_features list + self.protocol + .reader_features() + .map(|f| f.to_vec()) + .unwrap_or_default() + } + v if (1..=2).contains(&v) => { + // Legacy reader: infer features from version + LEGACY_READER_FEATURES + .iter() + .filter(|f| f.is_valid_for_legacy_reader(v)) + .cloned() + .collect() + } + _ => Vec::new(), } } - #[allow(unused)] - pub(crate) fn is_append_only_enabled(&self) -> bool { - self.is_append_only_supported() && self.table_properties.append_only.unwrap_or(false) + /// Returns all writer features enabled for this table based on protocol version. + /// For table features protocol (v7), returns the explicit writer_features list. + /// For legacy protocol (v1-6), infers features from the version number. + fn get_enabled_writer_features(&self) -> Vec { + match self.protocol.min_writer_version() { + TABLE_FEATURES_MIN_WRITER_VERSION => { + // Table features writer: use explicit writer_features list + self.protocol + .writer_features() + .map(|f| f.to_vec()) + .unwrap_or_default() + } + v if (1..=6).contains(&v) => { + // Legacy writer: infer features from version + LEGACY_WRITER_FEATURES + .iter() + .filter(|f| f.is_valid_for_legacy_writer(v)) + .cloned() + .collect() + } + _ => Vec::new(), + } } - /// Returns `true` if the table supports the column invariant table feature. 
- pub(crate) fn is_invariants_supported(&self) -> bool { - let protocol = &self.protocol; - match protocol.min_writer_version() { - 7 if protocol.has_writer_feature(&WriterFeature::Invariants) => true, - version => (2..=6).contains(&version), + /// Returns `Ok` if the kernel supports the given operation on this table. This checks that + /// the protocol's features are all supported for the requested operation type. + /// + /// - For `Scan` and `Cdf` operations: checks reader version and reader features + /// - For `Write` operations: checks writer version and writer features + #[internal_api] + pub(crate) fn ensure_operation_supported(&self, operation: Operation) -> DeltaResult<()> { + match operation { + Operation::Scan | Operation::Cdf => self.ensure_read_supported(operation), + Operation::Write => self.ensure_write_supported(), } } - /// Returns `true` if V2 checkpoint is supported on this table. To support V2 checkpoint, - /// a table must support reader version 3, writer version 7, and the v2Checkpoint feature in - /// both the protocol's readerFeatures and writerFeatures. - /// - /// See: - pub(crate) fn is_v2_checkpoint_write_supported(&self) -> bool { - let read_supported = self - .protocol() - .has_reader_feature(&ReaderFeature::V2Checkpoint); - let write_supported = self - .protocol() - .has_writer_feature(&WriterFeature::V2Checkpoint); - read_supported && write_supported - } - - /// Returns `true` if the table supports writing in-commit timestamps. - /// - /// To support this feature the table must: - /// - Have a min_writer_version of 7 - /// - Have the [`WriterFeature::InCommitTimestamp`] writer feature. - #[allow(unused)] - pub(crate) fn is_in_commit_timestamps_supported(&self) -> bool { - self.protocol().min_writer_version() == 7 - && self - .protocol() - .has_writer_feature(&WriterFeature::InCommitTimestamp) + /// Internal helper for read operations (Scan, Cdf) + fn ensure_read_supported(&self, operation: Operation) -> DeltaResult<()> { + require!( + self.protocol.min_reader_version() >= MIN_VALID_RW_VERSION, + Error::InvalidProtocol(format!( + "min_reader_version must be >= {MIN_VALID_RW_VERSION}, got {}", + self.protocol.min_reader_version() + )) + ); + // Version check: kernel supports reader versions 1..=MAX_VALID_READER_VERSION + if self.protocol.min_reader_version() > MAX_VALID_READER_VERSION { + return Err(Error::unsupported(format!( + "Unsupported minimum reader version {}", + self.protocol.min_reader_version() + ))); + } + + // Check all enabled reader features have kernel support + for feature in self.get_enabled_reader_features() { + self.check_feature_support(&feature, operation)?; + } + + Ok(()) } - /// Returns `true` if in-commit timestamps is supported and it is enabled. In-commit timestamps - /// is enabled when the `delta.enableInCommitTimestamps` configuration is set to `true`. 
- #[allow(unused)] - pub(crate) fn is_in_commit_timestamps_enabled(&self) -> bool { - self.is_in_commit_timestamps_supported() - && self - .table_properties() - .enable_in_commit_timestamps - .unwrap_or(false) + /// Internal helper for write operations + fn ensure_write_supported(&self) -> DeltaResult<()> { + // Version check: kernel supports writer versions MIN_VALID_RW_VERSION..=MAX_VALID_WRITER_VERSION + require!( + self.protocol.min_writer_version() >= MIN_VALID_RW_VERSION, + Error::InvalidProtocol(format!( + "min_writer_version must be >= {MIN_VALID_RW_VERSION}, got {}", + self.protocol.min_writer_version() + )) + ); + // Version check: kernel supports writer versions 1..=MAX_VALID_WRITER_VERSION + if self.protocol.min_writer_version() > MAX_VALID_WRITER_VERSION { + return Err(Error::unsupported(format!( + "Unsupported minimum writer version {}", + self.protocol.min_writer_version() + ))); + } + + // Check all enabled writer features have kernel support + for feature in self.get_enabled_writer_features() { + self.check_feature_support(&feature, Operation::Write)?; + } + + // Schema-dependent validation for Invariants (can't be in FeatureInfo) + // TODO: Better story for schema validation for Invariants and other features + if self.is_feature_supported(&TableFeature::Invariants) + && schema_has_invariants(self.logical_schema.as_ref()) + { + return Err(Error::unsupported( + "Column invariants are not yet supported", + )); + } + + Ok(()) } /// Returns information about in-commit timestamp enablement state. @@ -338,7 +662,7 @@ impl TableConfiguration { pub(crate) fn in_commit_timestamp_enablement( &self, ) -> DeltaResult { - if !self.is_in_commit_timestamps_enabled() { + if !self.is_feature_enabled(&TableFeature::InCommitTimestamp) { return Ok(InCommitTimestampEnablement::NotEnabled); } @@ -365,41 +689,6 @@ impl TableConfiguration { } } - /// Returns `true` if the table supports writing domain metadata. - /// - /// To support this feature the table must: - /// - Have a min_writer_version of 7. - /// - Have the [`WriterFeature::DomainMetadata`] writer feature. - #[allow(unused)] - pub(crate) fn is_domain_metadata_supported(&self) -> bool { - self.protocol().min_writer_version() == 7 - && self - .protocol() - .has_writer_feature(&WriterFeature::DomainMetadata) - } - - /// Returns `true` if the table supports writing row tracking metadata. - /// - /// To support this feature the table must: - /// - Have a min_writer_version of 7. - /// - Have the [`WriterFeature::RowTracking`] writer feature. - pub(crate) fn is_row_tracking_supported(&self) -> bool { - self.protocol().min_writer_version() == 7 - && self - .protocol() - .has_writer_feature(&WriterFeature::RowTracking) - } - - /// Returns `true` if row tracking is enabled for this table. - /// - /// In order to enable row tracking the table must: - /// - Support row tracking (see [`Self::is_row_tracking_supported`]). - /// - Have the `delta.enableRowTracking` table property set to `true`. - pub(crate) fn is_row_tracking_enabled(&self) -> bool { - self.is_row_tracking_supported() - && self.table_properties().enable_row_tracking.unwrap_or(false) - } - /// Returns `true` if row tracking is suspended for this table. /// /// Row tracking is suspended when the `delta.rowTrackingSuspended` table property is set to `true`. @@ -421,52 +710,225 @@ impl TableConfiguration { /// Note: We ignore [`is_row_tracking_enabled`] at this point because Kernel does not /// preserve row IDs and row commit versions yet. 
pub(crate) fn should_write_row_tracking(&self) -> bool { - self.is_row_tracking_supported() && !self.is_row_tracking_suspended() + self.is_feature_supported(&TableFeature::RowTracking) && !self.is_row_tracking_suspended() + } + + /// Returns true if the protocol uses legacy reader version (< 3) + #[allow(dead_code)] + fn is_legacy_reader_version(&self) -> bool { + self.protocol.min_reader_version() < TABLE_FEATURES_MIN_READER_VERSION + } + + /// Returns true if the protocol uses legacy writer version (< 7) + #[allow(dead_code)] + fn is_legacy_writer_version(&self) -> bool { + self.protocol.min_writer_version() < TABLE_FEATURES_MIN_WRITER_VERSION + } + + /// Helper to check if a feature is present in a feature list. + fn has_feature(features: Option<&[TableFeature]>, feature: &TableFeature) -> bool { + features + .map(|features| features.contains(feature)) + .unwrap_or(false) + } + + /// Helper method to check if a feature is supported. + /// This checks protocol versions and feature lists but does NOT check enablement properties. + #[internal_api] + pub(crate) fn is_feature_supported(&self, feature: &TableFeature) -> bool { + let info = feature.info(); + let min_legacy_version = info.min_legacy_version.as_ref(); + let min_reader_version = + min_legacy_version.map_or(TABLE_FEATURES_MIN_READER_VERSION, |v| v.reader); + let min_writer_version = + min_legacy_version.map_or(TABLE_FEATURES_MIN_WRITER_VERSION, |v| v.writer); + match info.feature_type { + FeatureType::WriterOnly => { + if self.is_legacy_writer_version() { + // Legacy writer: protocol writer version meets minimum requirement + self.protocol.min_writer_version() >= min_writer_version + } else { + // Table features writer: feature is in writer_features list + Self::has_feature(self.protocol.writer_features(), feature) + } + } + FeatureType::ReaderWriter => { + let reader_supported = if self.is_legacy_reader_version() { + // Legacy reader: protocol reader version meets minimum requirement + self.protocol.min_reader_version() >= min_reader_version + } else { + // Table features reader: feature is in reader_features list + Self::has_feature(self.protocol.reader_features(), feature) + }; + + let writer_supported = if self.is_legacy_writer_version() { + // Legacy writer: protocol writer version meets minimum requirement + self.protocol.min_writer_version() >= min_writer_version + } else { + // Table features writer: feature is in writer_features list + Self::has_feature(self.protocol.writer_features(), feature) + }; + + reader_supported && writer_supported + } + FeatureType::Unknown => Self::has_feature(self.protocol.writer_features(), feature), + } + } + + /// Generic method to check if a feature is enabled. + /// + /// A feature is enabled if: + /// 1. It is supported in the protocol + /// 2. 
The enablement check passes + #[internal_api] + pub(crate) fn is_feature_enabled(&self, feature: &TableFeature) -> bool { + if !self.is_feature_supported(feature) { + return false; + } + + match feature.info().enablement_check { + EnablementCheck::AlwaysIfSupported => true, + EnablementCheck::EnabledIf(check_fn) => check_fn(&self.table_properties), + } } } #[cfg(test)] mod test { + use std::collections::HashMap; + use std::sync::Arc; use url::Url; use crate::actions::{Metadata, Protocol}; - use crate::schema::{DataType, StructField, StructType}; - use crate::table_features::{ReaderFeature, WriterFeature}; - use crate::table_properties::TableProperties; - use crate::utils::test_utils::assert_result_error_with_message; + use crate::schema::ColumnName; + use crate::schema::{DataType, SchemaRef, StructField, StructType}; + use crate::table_features::ColumnMappingMode; + use crate::table_features::{ + FeatureType, Operation, TableFeature, TABLE_FEATURES_MIN_READER_VERSION, + TABLE_FEATURES_MIN_WRITER_VERSION, + }; + use crate::table_properties::{ + TableProperties, COLUMN_MAPPING_MODE, ENABLE_IN_COMMIT_TIMESTAMPS, + }; + use crate::utils::test_utils::{ + assert_result_error_with_message, test_schema_flat, test_schema_flat_with_column_mapping, + test_schema_nested, test_schema_nested_with_column_mapping, test_schema_with_array, + test_schema_with_array_and_column_mapping, test_schema_with_map, + test_schema_with_map_and_column_mapping, + }; use crate::Error; + use rstest::rstest; use super::{InCommitTimestampEnablement, TableConfiguration}; + fn create_mock_table_config( + props_to_enable: &[(&str, &str)], + features: &[TableFeature], + ) -> TableConfiguration { + create_mock_table_config_with_version( + props_to_enable, + Some(features), + TABLE_FEATURES_MIN_READER_VERSION, + TABLE_FEATURES_MIN_WRITER_VERSION, + ) + } + + fn create_mock_table_config_with_version( + props_to_enable: &[(&str, &str)], + features_opt: Option<&[TableFeature]>, + min_reader_version: i32, + min_writer_version: i32, + ) -> TableConfiguration { + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); + let metadata = Metadata::try_new( + None, + None, + schema, + vec![], + 0, + HashMap::from_iter( + props_to_enable + .iter() + .map(|(key, value)| (key.to_string(), value.to_string())), + ), + ) + .unwrap(); + + let (reader_features_opt, writer_features_opt) = if let Some(features) = features_opt { + // This helper only handles known features. Unknown features would need + // explicit placement on reader vs writer lists. 
+ assert!( + features + .iter() + .all(|f| f.feature_type() != FeatureType::Unknown), + "Test helper does not support unknown features" + ); + let reader_features = features + .iter() + .filter(|f| f.feature_type() == FeatureType::ReaderWriter); + ( + // Only add reader_features if reader >= 3 (non-legacy reader mode) + (min_reader_version >= TABLE_FEATURES_MIN_READER_VERSION) + .then_some(reader_features), + // Only add writer_features if writer >= 7 (non-legacy writer mode) + (min_writer_version >= TABLE_FEATURES_MIN_WRITER_VERSION).then_some(features), + ) + } else { + (None, None) + }; + + let protocol = Protocol::try_new( + min_reader_version, + min_writer_version, + reader_features_opt, + writer_features_opt, + ) + .unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap() + } + #[test] fn dv_supported_not_enabled() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::ENABLE_CHANGE_DATA_FEED; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, schema, vec![], 0, - HashMap::from_iter([("delta.enableChangeDataFeed".to_string(), "true".to_string())]), + HashMap::from_iter([(ENABLE_CHANGE_DATA_FEED.to_string(), "true".to_string())]), ) .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors]), - Some([WriterFeature::DeletionVectors]), + let protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors, TableFeature::ChangeDataFeed], ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(table_config.is_deletion_vector_supported()); - assert!(!table_config.is_deletion_vector_enabled()); + assert!(table_config.is_feature_supported(&TableFeature::DeletionVectors)); + assert!(!table_config.is_feature_enabled(&TableFeature::DeletionVectors)); } + #[test] fn dv_enabled() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::{ENABLE_CHANGE_DATA_FEED, ENABLE_DELETION_VECTORS}; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, @@ -474,52 +936,146 @@ mod test { vec![], 0, HashMap::from_iter([ - ("delta.enableChangeDataFeed".to_string(), "true".to_string()), - ( - "delta.enableDeletionVectors".to_string(), - "true".to_string(), - ), + (ENABLE_CHANGE_DATA_FEED.to_string(), "true".to_string()), + (ENABLE_DELETION_VECTORS.to_string(), "true".to_string()), ]), ) .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors]), - Some([WriterFeature::DeletionVectors]), + let protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors, TableFeature::ChangeDataFeed], ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(table_config.is_deletion_vector_supported()); - assert!(table_config.is_deletion_vector_enabled()); + assert!(table_config.is_feature_supported(&TableFeature::DeletionVectors)); + assert!(table_config.is_feature_enabled(&TableFeature::DeletionVectors)); + } + + 
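// Editorial note, not part of this patch: a minimal usage sketch of the generic
// feature API that this change introduces in place of the per-feature predicates
// (is_deletion_vector_supported, is_append_only_enabled, etc.). It reuses only
// constructors and methods visible in this diff; the function name
// `feature_api_sketch` is hypothetical, and because these APIs are pub(crate) the
// sketch would only compile as an in-crate test alongside the tests above.

use std::collections::HashMap;
use std::sync::Arc;
use url::Url;

use crate::actions::{Metadata, Protocol};
use crate::schema::{DataType, StructField, StructType};
use crate::table_configuration::TableConfiguration;
use crate::table_features::{Operation, TableFeature};
use crate::table_properties::ENABLE_DELETION_VECTORS;

fn feature_api_sketch() {
    let schema = Arc::new(StructType::new_unchecked([StructField::nullable(
        "value",
        DataType::INTEGER,
    )]));
    // Setting `delta.enableDeletionVectors` is what turns protocol *support* into *enablement*.
    let metadata = Metadata::try_new(
        None,
        None,
        schema,
        vec![],
        0,
        HashMap::from_iter([(ENABLE_DELETION_VECTORS.to_string(), "true".to_string())]),
    )
    .unwrap();
    // Modern (3, 7) protocol listing deletionVectors as both a reader and a writer feature.
    let protocol = Protocol::try_new_modern(
        [TableFeature::DeletionVectors],
        [TableFeature::DeletionVectors],
    )
    .unwrap();
    let table_root = Url::try_from("file:///").unwrap();
    let tc = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap();

    // Supported: the feature appears in the protocol (or is implied by a legacy version).
    assert!(tc.is_feature_supported(&TableFeature::DeletionVectors));
    // Enabled: supported *and* the feature's enablement check (here, the table property) passes.
    assert!(tc.is_feature_enabled(&TableFeature::DeletionVectors));
    // Operation gate: every feature enabled for the operation must be kernel-supported.
    assert!(tc.ensure_operation_supported(Operation::Scan).is_ok());
}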
#[rstest] + #[case(-1, 2, Operation::Scan)] + #[case(1, -1, Operation::Write)] + fn reject_protocol_version_below_minimum( + #[case] rv: i32, + #[case] wv: i32, + #[case] op: Operation, + ) { + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + let protocol = + Protocol::new_unchecked(rv, wv, TableFeature::NO_LIST, TableFeature::NO_LIST); + let table_root = Url::try_from("file:///").unwrap(); + let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + let expected = if rv < 1 { + format!("Invalid protocol action in the delta log: min_reader_version must be >= 1, got {rv}") + } else { + format!("Invalid protocol action in the delta log: min_writer_version must be >= 1, got {wv}") + }; + assert_result_error_with_message(table_config.ensure_operation_supported(op), &expected); + } + + #[test] + fn write_with_cdf() { + use crate::table_properties::{APPEND_ONLY, ENABLE_CHANGE_DATA_FEED}; + use TableFeature::*; + let cases = [ + ( + // Writing to CDF-enabled table is supported for writes + create_mock_table_config(&[(ENABLE_CHANGE_DATA_FEED, "true")], &[ChangeDataFeed]), + Ok(()), + ), + ( + // Should succeed even if AppendOnly is supported but not enabled + create_mock_table_config( + &[(ENABLE_CHANGE_DATA_FEED, "true")], + &[ChangeDataFeed, AppendOnly], + ), + Ok(()), + ), + ( + // Should succeed since AppendOnly is enabled + create_mock_table_config( + &[(ENABLE_CHANGE_DATA_FEED, "true"), (APPEND_ONLY, "true")], + &[ChangeDataFeed, AppendOnly], + ), + Ok(()), + ), + ( + // Writer version > 7 is not supported + create_mock_table_config_with_version( + &[(ENABLE_CHANGE_DATA_FEED, "true")], + None, + 1, + 8, + ), + Err(Error::unsupported("Unsupported minimum writer version 8")), + ), + // Column mapping is now supported for writes. 
+ ( + // CDF + column mapping: both supported, should succeed + create_mock_table_config( + &[(ENABLE_CHANGE_DATA_FEED, "true"), (APPEND_ONLY, "true")], + &[ChangeDataFeed, ColumnMapping, AppendOnly], + ), + Ok(()), + ), + ( + // Column mapping + AppendOnly, no CDF enabled: should succeed + create_mock_table_config( + &[(APPEND_ONLY, "true")], + &[ChangeDataFeed, ColumnMapping, AppendOnly], + ), + Ok(()), + ), + ( + // Should succeed since change data feed is not enabled + create_mock_table_config(&[(APPEND_ONLY, "true")], &[AppendOnly]), + Ok(()), + ), + ]; + + for (table_configuration, result) in cases { + match ( + table_configuration.ensure_operation_supported(Operation::Write), + result, + ) { + (Ok(()), Ok(())) => { /* Correct result */ } + (actual_result, Err(expected)) => { + assert_result_error_with_message(actual_result, &expected.to_string()); + } + (Err(actual_result), Ok(())) => { + panic!("Expected Ok but got error: {actual_result}"); + } + } + } } #[test] fn ict_enabled_from_table_creation() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::ENABLE_IN_COMMIT_TIMESTAMPS; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, schema, vec![], 0, // Table creation version - HashMap::from_iter([( - "delta.enableInCommitTimestamps".to_string(), - "true".to_string(), - )]), - ) - .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some([WriterFeature::InCommitTimestamp]), + HashMap::from_iter([(ENABLE_IN_COMMIT_TIMESTAMPS.to_string(), "true".to_string())]), ) .unwrap(); + let protocol = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, [TableFeature::InCommitTimestamp]) + .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(table_config.is_in_commit_timestamps_supported()); - assert!(table_config.is_in_commit_timestamps_enabled()); + assert!(table_config.is_feature_supported(&TableFeature::InCommitTimestamp)); + assert!(table_config.is_feature_enabled(&TableFeature::InCommitTimestamp)); // When ICT is enabled from table creation (version 0), it's perfectly normal // for enablement properties to be missing let info = table_config.in_commit_timestamp_enablement().unwrap(); @@ -530,7 +1086,15 @@ mod test { } #[test] fn ict_supported_and_enabled() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::{ + ENABLE_IN_COMMIT_TIMESTAMPS, IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP, + IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION, + }; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, @@ -538,32 +1102,25 @@ mod test { vec![], 0, HashMap::from_iter([ + (ENABLE_IN_COMMIT_TIMESTAMPS.to_string(), "true".to_string()), ( - "delta.enableInCommitTimestamps".to_string(), - "true".to_string(), - ), - ( - "delta.inCommitTimestampEnablementVersion".to_string(), + IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION.to_string(), "5".to_string(), ), ( - "delta.inCommitTimestampEnablementTimestamp".to_string(), + IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP.to_string(), "100".to_string(), ), ]), ) .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some([WriterFeature::InCommitTimestamp]), - ) - .unwrap(); 
+ let protocol = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, [TableFeature::InCommitTimestamp]) + .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(table_config.is_in_commit_timestamps_supported()); - assert!(table_config.is_in_commit_timestamps_enabled()); + assert!(table_config.is_feature_supported(&TableFeature::InCommitTimestamp)); + assert!(table_config.is_feature_enabled(&TableFeature::InCommitTimestamp)); let info = table_config.in_commit_timestamp_enablement().unwrap(); assert_eq!( info, @@ -574,7 +1131,14 @@ mod test { } #[test] fn ict_enabled_with_partial_enablement_info() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::{ + ENABLE_IN_COMMIT_TIMESTAMPS, IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION, + }; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, @@ -582,29 +1146,22 @@ mod test { vec![], 0, HashMap::from_iter([ + (ENABLE_IN_COMMIT_TIMESTAMPS.to_string(), "true".to_string()), ( - "delta.enableInCommitTimestamps".to_string(), - "true".to_string(), - ), - ( - "delta.inCommitTimestampEnablementVersion".to_string(), + IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION.to_string(), "5".to_string(), ), // Missing enablement timestamp ]), ) .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some([WriterFeature::InCommitTimestamp]), - ) - .unwrap(); + let protocol = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, [TableFeature::InCommitTimestamp]) + .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(table_config.is_in_commit_timestamps_supported()); - assert!(table_config.is_in_commit_timestamps_enabled()); + assert!(table_config.is_feature_supported(&TableFeature::InCommitTimestamp)); + assert!(table_config.is_feature_enabled(&TableFeature::InCommitTimestamp)); assert!(matches!( table_config.in_commit_timestamp_enablement(), Err(Error::Generic(msg)) if msg.contains("In-commit timestamp enabled, but enablement timestamp is missing") @@ -612,80 +1169,95 @@ mod test { } #[test] fn ict_supported_and_not_enabled() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some([WriterFeature::InCommitTimestamp]), - ) - .unwrap(); + let protocol = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, [TableFeature::InCommitTimestamp]) + .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(table_config.is_in_commit_timestamps_supported()); - assert!(!table_config.is_in_commit_timestamps_enabled()); + assert!(table_config.is_feature_supported(&TableFeature::InCommitTimestamp)); + assert!(!table_config.is_feature_enabled(&TableFeature::InCommitTimestamp)); let info = table_config.in_commit_timestamp_enablement().unwrap(); assert_eq!(info, InCommitTimestampEnablement::NotEnabled); } #[test] fn fails_on_unsupported_feature() { - let schema = 
StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); - let protocol = Protocol::try_new(3, 7, Some(["unknown"]), Some(["unknown"])).unwrap(); + let protocol = Protocol::try_new_modern(["unknown"], ["unknown"]).unwrap(); let table_root = Url::try_from("file:///").unwrap(); - TableConfiguration::try_new(metadata, protocol, table_root, 0) + let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + table_config + .ensure_operation_supported(Operation::Scan) .expect_err("Unknown feature is not supported in kernel"); } #[test] fn dv_not_supported() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::ENABLE_CHANGE_DATA_FEED; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, schema, vec![], 0, - HashMap::from_iter([("delta.enableChangeDataFeed".to_string(), "true".to_string())]), + HashMap::from_iter([(ENABLE_CHANGE_DATA_FEED.to_string(), "true".to_string())]), ) .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::TimestampWithoutTimezone]), - Some([WriterFeature::TimestampWithoutTimezone]), + let protocol = Protocol::try_new_modern( + [TableFeature::TimestampWithoutTimezone], + [ + TableFeature::TimestampWithoutTimezone, + TableFeature::ChangeDataFeed, + ], ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - assert!(!table_config.is_deletion_vector_supported()); - assert!(!table_config.is_deletion_vector_enabled()); + assert!(!table_config.is_feature_supported(&TableFeature::DeletionVectors)); + assert!(!table_config.is_feature_enabled(&TableFeature::DeletionVectors)); } #[test] fn test_try_new_from() { - let schema = StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + use crate::table_properties::{ENABLE_CHANGE_DATA_FEED, ENABLE_DELETION_VECTORS}; + + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let metadata = Metadata::try_new( None, None, schema, vec![], 0, - HashMap::from_iter([("delta.enableChangeDataFeed".to_string(), "true".to_string())]), + HashMap::from_iter([(ENABLE_CHANGE_DATA_FEED.to_string(), "true".to_string())]), ) .unwrap(); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors]), - Some([WriterFeature::DeletionVectors]), + let protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors, TableFeature::ChangeDataFeed], ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); let table_config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); - let new_schema = - StructType::new_unchecked([StructField::nullable("value", DataType::INTEGER)]); + let new_schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); let new_metadata = Metadata::try_new( None, None, @@ -693,26 +1265,19 @@ mod test { vec![], 0, HashMap::from_iter([ - ( - "delta.enableChangeDataFeed".to_string(), - "false".to_string(), - ), - ( - "delta.enableDeletionVectors".to_string(), - 
"true".to_string(), - ), + (ENABLE_CHANGE_DATA_FEED.to_string(), "false".to_string()), + (ENABLE_DELETION_VECTORS.to_string(), "true".to_string()), ]), ) .unwrap(); - let new_protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors, ReaderFeature::V2Checkpoint]), - Some([ - WriterFeature::DeletionVectors, - WriterFeature::V2Checkpoint, - WriterFeature::AppendOnly, - ]), + let new_protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors, TableFeature::V2Checkpoint], + [ + TableFeature::DeletionVectors, + TableFeature::V2Checkpoint, + TableFeature::AppendOnly, + TableFeature::ChangeDataFeed, + ], ) .unwrap(); let new_version = 1; @@ -727,7 +1292,10 @@ mod test { assert_eq!(new_table_config.version(), new_version); assert_eq!(new_table_config.metadata(), &new_metadata); assert_eq!(new_table_config.protocol(), &new_protocol); - assert_eq!(new_table_config.schema(), table_config.schema()); + assert_eq!( + new_table_config.logical_schema(), + table_config.logical_schema() + ); assert_eq!( new_table_config.table_properties(), &TableProperties { @@ -746,23 +1314,18 @@ mod test { #[test] fn test_timestamp_ntz_validation_integration() { // Schema with TIMESTAMP_NTZ column - let schema = - StructType::new_unchecked([StructField::nullable("ts", DataType::TIMESTAMP_NTZ)]); + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "ts", + DataType::TIMESTAMP_NTZ, + )])); let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); - let protocol_without_timestamp_ntz_features = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some::>(vec![]), - ) - .unwrap(); + let protocol_without_timestamp_ntz_features = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(); - let protocol_with_timestamp_ntz_features = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::TimestampWithoutTimezone]), - Some([WriterFeature::TimestampWithoutTimezone]), + let protocol_with_timestamp_ntz_features = Protocol::try_new_modern( + [TableFeature::TimestampWithoutTimezone], + [TableFeature::TimestampWithoutTimezone], ) .unwrap(); @@ -791,25 +1354,18 @@ mod test { #[test] fn test_variant_validation_integration() { // Schema with VARIANT column - let schema = - StructType::new_unchecked([StructField::nullable("v", DataType::unshredded_variant())]); + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "v", + DataType::unshredded_variant(), + )])); let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); - let protocol_without_variant_features = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some::>(vec![]), - ) - .unwrap(); + let protocol_without_variant_features = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(); - let protocol_with_variant_features = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::VariantType]), - Some([WriterFeature::VariantType]), - ) - .unwrap(); + let protocol_with_variant_features = + Protocol::try_new_modern([TableFeature::VariantType], [TableFeature::VariantType]) + .unwrap(); let table_root = Url::try_from("file:///").unwrap(); @@ -828,4 +1384,686 @@ mod test { "Should succeed when VARIANT is used with required features" ); } + + #[derive(Debug, Clone, Copy)] + enum UnknownFeatureShape { + NotListed, + WriterOnly, + ReaderWriter, + } + + fn create_unknown_feature_config( + shape: UnknownFeatureShape, + ) -> (TableFeature, TableConfiguration) { + const UNKNOWN: &str = "futureFeature"; + let 
metadata = Metadata::try_new( + None, + None, + Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])), + vec![], + 0, + HashMap::new(), + ) + .unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + + let reader_features = match shape { + UnknownFeatureShape::ReaderWriter => vec![UNKNOWN], + _ => vec![], + }; + let writer_features = match shape { + UnknownFeatureShape::NotListed => vec![], + _ => vec![UNKNOWN], + }; + let protocol = Protocol::try_new_modern(reader_features, writer_features).unwrap(); + + let tc = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + (TableFeature::unknown(UNKNOWN), tc) + } + + #[rstest] + #[case::not_listed(UnknownFeatureShape::NotListed, false)] + #[case::writer_only(UnknownFeatureShape::WriterOnly, true)] + #[case::reader_writer(UnknownFeatureShape::ReaderWriter, true)] + fn test_unknown_feature_protocol_support( + #[case] shape: UnknownFeatureShape, + #[case] expected_supported: bool, + ) { + let (unknown, config) = create_unknown_feature_config(shape); + assert_eq!(config.is_feature_supported(&unknown), expected_supported); + } + + #[rstest] + #[case::not_listed(UnknownFeatureShape::NotListed, false)] + #[case::writer_only(UnknownFeatureShape::WriterOnly, true)] + #[case::reader_writer(UnknownFeatureShape::ReaderWriter, true)] + fn test_unknown_feature_protocol_enablement( + #[case] shape: UnknownFeatureShape, + #[case] expected_enabled: bool, + ) { + let (unknown, config) = create_unknown_feature_config(shape); + assert_eq!(config.is_feature_enabled(&unknown), expected_enabled); + } + + #[rstest] + fn test_unknown_feature_capabilities( + #[values( + UnknownFeatureShape::NotListed, + UnknownFeatureShape::WriterOnly, + UnknownFeatureShape::ReaderWriter + )] + shape: UnknownFeatureShape, + #[values(Operation::Scan, Operation::Cdf, Operation::Write)] operation: Operation, + ) { + let (_, config) = create_unknown_feature_config(shape); + let expected_ok = match shape { + UnknownFeatureShape::NotListed => true, + UnknownFeatureShape::WriterOnly => operation != Operation::Write, + UnknownFeatureShape::ReaderWriter => false, + }; + assert_eq!( + config.ensure_operation_supported(operation).is_ok(), + expected_ok + ); + } + + #[test] + fn test_is_feature_supported_writer_only() { + let feature = TableFeature::AppendOnly; + + // Test with legacy protocol writer v2 - should be supported + let config = create_mock_table_config_with_version(&[], None, 1, 2); + assert!(config.is_feature_supported(&feature)); + + // Test with legacy protocol writer v1 - should NOT be supported + let config = create_mock_table_config_with_version(&[], None, 1, 1); + assert!(!config.is_feature_supported(&feature)); + + // reader=2 (legacy), writer=7 (non-legacy) - feature in list, should be supported + let config = + create_mock_table_config_with_version(&[], Some(&[TableFeature::AppendOnly]), 2, 7); + assert!(config.is_feature_supported(&feature)); + + // reader=2 (legacy), writer=7 (non-legacy) - feature NOT in list, should NOT be supported + // Use ChangeDataFeed which is also a WriterOnly feature + let config = + create_mock_table_config_with_version(&[], Some(&[TableFeature::ChangeDataFeed]), 2, 7); + assert!(!config.is_feature_supported(&feature)); + + // Test with protocol reader=3, writer=7 (both non-legacy) - feature in list, should be supported + let config = create_mock_table_config(&[], &[TableFeature::AppendOnly]); + assert!(config.is_feature_supported(&feature)); + + let config = 
create_mock_table_config(&[], &[TableFeature::DeletionVectors]); + assert!(!config.is_feature_supported(&feature)); + } + + #[test] + fn test_is_feature_supported_reader_writer() { + let feature = TableFeature::ColumnMapping; + + // Test with sufficient versions (legacy mode) - should be supported + let config = create_mock_table_config_with_version(&[], None, 2, 5); + assert!(config.is_feature_supported(&feature)); + + // Test with insufficient reader version - should NOT be supported + let config = create_mock_table_config_with_version(&[], None, 1, 5); + assert!(!config.is_feature_supported(&feature)); + + // Test with insufficient writer version - should NOT be supported + let config = create_mock_table_config_with_version(&[], None, 2, 4); + assert!(!config.is_feature_supported(&feature)); + + // Test with asymmetric: reader=2 (legacy), writer=7 (non-legacy) + // ReaderWriter features CANNOT be enabled in this protocol state (protocol validation) + // But we still need to test that the code correctly identifies them as NOT supported + // Create a table with only WriterOnly features (e.g., AppendOnly) + let config = + create_mock_table_config_with_version(&[], Some(&[TableFeature::AppendOnly]), 2, 7); + // ColumnMapping (ReaderWriter) should NOT be supported because: + // - reader=2 (legacy) checks version: 2 >= 2 (reader_supported = true) + // - writer=7 (non-legacy) checks list: ColumnMapping not in writer_features (writer_supported = false) + // - Result: false (requires BOTH to be true) + assert!(!config.is_feature_supported(&feature)); + + // Test with non-legacy mode (3,7) - feature in list, should be supported + let config = create_mock_table_config(&[], &[TableFeature::ColumnMapping]); + assert!(config.is_feature_supported(&feature)); + + // Test with non-legacy mode (3,7) - feature NOT in list, should NOT be supported + let config = create_mock_table_config(&[], &[TableFeature::DeletionVectors]); + assert!(!config.is_feature_supported(&feature)); + } + + #[test] + fn test_is_feature_enabled_with_property_check() { + use crate::table_properties::APPEND_ONLY; + + let feature = TableFeature::AppendOnly; + + // Test when property check fails - should be supported but not enabled + let config = create_mock_table_config_with_version(&[], None, 1, 2); + assert!(config.is_feature_supported(&feature)); + assert!(!config.is_feature_enabled(&feature)); + + // Test when property check passes - should be both supported and enabled + let config = create_mock_table_config_with_version(&[(APPEND_ONLY, "true")], None, 1, 2); + assert!(config.is_feature_supported(&feature)); + assert!(config.is_feature_enabled(&feature)); + + // Test when property is set but feature is not supported by protocol versions. 
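+        // With writer version 1 the protocol predates appendOnly, so setting the table
+        // property alone makes the feature neither supported nor enabled.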
+ // TODO: Reject this orphaned metadata + let config = create_mock_table_config_with_version(&[(APPEND_ONLY, "true")], None, 1, 1); + assert!(!config.is_feature_supported(&feature)); + assert!(!config.is_feature_enabled(&feature)); + } + + #[test] + fn test_is_feature_enabled_always_if_supported() { + let feature = TableFeature::V2Checkpoint; + + // Test when supported - should be both supported and enabled + let config = create_mock_table_config(&[], &[TableFeature::V2Checkpoint]); + assert!(config.is_feature_supported(&feature)); + assert!(config.is_feature_enabled(&feature)); + + // Test when not supported - should be neither supported nor enabled + let config = create_mock_table_config(&[], &[TableFeature::DeletionVectors]); + assert!(!config.is_feature_supported(&feature)); + assert!(!config.is_feature_enabled(&feature)); + } + + #[test] + fn test_ensure_operation_supported_reads() { + let config = create_mock_table_config(&[], &[]); + assert!(config.ensure_operation_supported(Operation::Scan).is_ok()); + + let config = create_mock_table_config(&[], &[TableFeature::V2Checkpoint]); + assert!(config.ensure_operation_supported(Operation::Scan).is_ok()); + + let config = create_mock_table_config_with_version(&[], None, 1, 2); + assert!(config.ensure_operation_supported(Operation::Scan).is_ok()); + + let config = create_mock_table_config_with_version( + &[], + Some(&[TableFeature::InCommitTimestamp]), + 2, + 7, + ); + assert!(config.ensure_operation_supported(Operation::Scan).is_ok()); + } + + #[test] + fn test_ensure_operation_supported_writes() { + let config = create_mock_table_config( + &[], + &[ + TableFeature::AppendOnly, + TableFeature::DeletionVectors, + TableFeature::DomainMetadata, + TableFeature::Invariants, + TableFeature::RowTracking, + ], + ); + assert!(config.ensure_operation_supported(Operation::Write).is_ok()); + + // Type Widening is not supported for writes + let config = create_mock_table_config(&[], &[TableFeature::TypeWidening]); + assert_result_error_with_message( + config.ensure_operation_supported(Operation::Write), + r#"Feature 'typeWidening' is not supported for writes"#, + ); + } + + #[test] + fn test_illegal_writer_feature_combination() { + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + let protocol = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, vec![TableFeature::RowTracking]) + .unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + let config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + assert_result_error_with_message( + config.ensure_operation_supported(Operation::Write), + "Feature 'rowTracking' requires 'domainMetadata' to be supported", + ); + } + + #[test] + fn test_row_tracking_with_domain_metadata_requirement() { + let schema = Arc::new(StructType::new_unchecked([StructField::nullable( + "value", + DataType::INTEGER, + )])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + let protocol = Protocol::try_new_modern( + TableFeature::EMPTY_LIST, + vec![TableFeature::RowTracking, TableFeature::DomainMetadata], + ) + .unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + let config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + assert!( + config.ensure_operation_supported(Operation::Write).is_ok(), + "RowTracking with DomainMetadata should be supported for 
writes" + ); + } + + #[test] + fn test_catalog_managed_writes() { + // CatalogManaged requires ICT to be supported and enabled + let config = create_mock_table_config( + &[(ENABLE_IN_COMMIT_TIMESTAMPS, "true")], + &[ + TableFeature::CatalogManaged, + TableFeature::InCommitTimestamp, + ], + ); + assert!(config.ensure_operation_supported(Operation::Write).is_ok()); + + let config = create_mock_table_config( + &[(ENABLE_IN_COMMIT_TIMESTAMPS, "true")], + &[ + TableFeature::CatalogOwnedPreview, + TableFeature::InCommitTimestamp, + ], + ); + assert!(config.ensure_operation_supported(Operation::Write).is_ok()); + } + + /// Helper to create a schema with column mapping metadata using JSON deserialization + fn schema_with_column_mapping() -> SchemaRef { + let field_a: StructField = serde_json::from_str( + r#"{ + "name": "col_a", + "type": "long", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 1, + "delta.columnMapping.physicalName": "phys_col_a" + } + }"#, + ) + .unwrap(); + + let field_b: StructField = serde_json::from_str( + r#"{ + "name": "col_b", + "type": "string", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 2, + "delta.columnMapping.physicalName": "phys_col_b" + } + }"#, + ) + .unwrap(); + + Arc::new(StructType::new_unchecked([field_a, field_b])) + } + + fn create_table_config_with_column_mapping( + schema: SchemaRef, + column_mapping_mode: &str, + ) -> TableConfiguration { + create_table_config_with_column_mapping_and_props(schema, column_mapping_mode, []) + } + + fn create_table_config_with_column_mapping_and_props( + schema: SchemaRef, + column_mapping_mode: &str, + extra_props: impl IntoIterator, + ) -> TableConfiguration { + let mut props: HashMap = extra_props + .into_iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + props.insert( + COLUMN_MAPPING_MODE.to_string(), + column_mapping_mode.to_string(), + ); + + let metadata = Metadata::try_new(None, None, schema, vec![], 0, props).unwrap(); + + // Use reader version 2 which supports column mapping + let protocol = Protocol::try_new_legacy(2, 5).unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap() + } + + #[test] + fn test_build_expected_stats_schemas_no_column_mapping() { + let schema = Arc::new(StructType::new_unchecked([ + StructField::nullable("col_a", DataType::LONG), + StructField::nullable("col_b", DataType::STRING), + ])); + let metadata = Metadata::try_new(None, None, schema, vec![], 0, HashMap::new()).unwrap(); + let protocol = Protocol::try_new_legacy(1, 2).unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + let config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + + assert_eq!(config.column_mapping_mode(), ColumnMappingMode::None); + + let stats_schemas = config.build_expected_stats_schemas(None, None).unwrap(); + + // Verify field names are logical names + let min_values = stats_schemas + .physical + .field("minValues") + .unwrap() + .data_type(); + if let DataType::Struct(inner) = min_values { + assert!(inner.field("col_a").is_some()); + assert!(inner.field("col_b").is_some()); + } else { + panic!("Expected minValues to be a struct"); + } + } + + #[test] + fn test_build_expected_stats_schemas_with_column_mapping() { + // With column mapping, physical schema should have physical names + let schema = schema_with_column_mapping(); + let config = create_table_config_with_column_mapping(schema, "name"); + + 
assert_eq!(config.column_mapping_mode(), ColumnMappingMode::Name); + + let stats_schemas = config.build_expected_stats_schemas(None, None).unwrap(); + + // Verify physical schema has physical names + let physical_min_values = stats_schemas + .physical + .field("minValues") + .unwrap() + .data_type(); + if let DataType::Struct(inner) = physical_min_values { + assert!( + inner.field("phys_col_a").is_some(), + "Physical schema should have phys_col_a" + ); + assert!( + inner.field("phys_col_b").is_some(), + "Physical schema should have phys_col_b" + ); + assert!(inner.field("col_a").is_none()); + } else { + panic!("Expected minValues to be a struct"); + } + } + + #[test] + fn test_build_expected_stats_schemas_id_mode_has_no_parquet_field_ids() { + // With column mapping mode `id`, make_physical() injects ParquetFieldId metadata for + // data file reading. But the physical stats schema must NOT contain these field IDs + // because stats are read from JSON commit files or checkpoint Parquet files, neither of + // which use parquet field IDs. + use crate::schema::{ColumnMetadataKey, MetadataValue}; + + let schema = schema_with_column_mapping(); + let config = create_table_config_with_column_mapping(schema, "id"); + + assert_eq!(config.column_mapping_mode(), ColumnMappingMode::Id); + + let stats_schemas = config.build_expected_stats_schemas(None, None).unwrap(); + + // Verify physical schema has physical names + let physical_min_values = stats_schemas + .physical + .field("minValues") + .unwrap() + .data_type(); + let DataType::Struct(inner) = physical_min_values else { + panic!("Expected minValues to be a struct"); + }; + assert!( + inner.field("phys_col_a").is_some(), + "Physical schema should have phys_col_a" + ); + assert!( + inner.field("phys_col_b").is_some(), + "Physical schema should have phys_col_b" + ); + assert!(inner.field("col_a").is_none()); + + // Verify no field has ParquetFieldId metadata + for field in inner.fields() { + assert!( + field + .get_config_value(&ColumnMetadataKey::ParquetFieldId) + .is_none(), + "Physical stats schema field '{}' should not have ParquetFieldId metadata", + field.name() + ); + } + + // Verify that make_physical on the same schema DOES produce ParquetFieldId (sanity check) + let data_schema = schema_with_column_mapping(); + let physical_data = data_schema.make_physical(ColumnMappingMode::Id).unwrap(); + let data_field = physical_data.field("phys_col_a").unwrap(); + assert!( + matches!( + data_field.get_config_value(&ColumnMetadataKey::ParquetFieldId), + Some(MetadataValue::Number(_)) + ), + "make_physical should inject ParquetFieldId for data schemas in Id mode" + ); + } + + #[test] + fn test_build_expected_stats_schemas_excludes_partition_columns() { + let field_a: StructField = serde_json::from_str( + r#"{ + "name": "data_col", + "type": "long", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 1, + "delta.columnMapping.physicalName": "phys_data" + } + }"#, + ) + .unwrap(); + + let field_b: StructField = serde_json::from_str( + r#"{ + "name": "part_col", + "type": "string", + "nullable": true, + "metadata": { + "delta.columnMapping.id": 2, + "delta.columnMapping.physicalName": "phys_part" + } + }"#, + ) + .unwrap(); + + let schema = Arc::new(StructType::new_unchecked([field_a, field_b])); + let mut props = HashMap::new(); + props.insert(COLUMN_MAPPING_MODE.to_string(), "name".to_string()); + let metadata = + Metadata::try_new(None, None, schema, vec!["part_col".to_string()], 0, props).unwrap(); + let protocol = 
Protocol::try_new_legacy(2, 5).unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + let config = TableConfiguration::try_new(metadata, protocol, table_root, 0).unwrap(); + + let stats_schemas = config.build_expected_stats_schemas(None, None).unwrap(); + + let DataType::Struct(inner) = stats_schemas + .physical + .field("minValues") + .unwrap() + .data_type() + else { + panic!("Expected minValues to be a struct"); + }; + assert!( + inner.field("phys_data").is_some(), + "Data column should be present with physical name" + ); + assert!( + inner.field("phys_part").is_none(), + "Partition column should be excluded" + ); + assert!( + inner.field("part_col").is_none(), + "Partition column logical name should also be absent" + ); + } + + #[test] + fn test_physical_stats_column_names_returns_physical_names() { + // physical_stats_column_names should return physical column names + let schema = schema_with_column_mapping(); + let config = create_table_config_with_column_mapping(schema, "name"); + + let column_names = config.physical_stats_column_names(None /* required_columns */); + + // Should return physical names, not logical names + assert_eq!( + column_names, + vec![ + ColumnName::new(["phys_col_a"]), + ColumnName::new(["phys_col_b"]), + ], + "Expected physical column names, not logical names" + ); + } + + #[test] + fn test_physical_stats_column_names_with_data_skipping_stats_columns() { + let config = create_table_config_with_column_mapping_and_props( + test_schema_nested_with_column_mapping(), + "name", + [("delta.dataSkippingStatsColumns", "id,info.name")], + ); + let column_names = config.physical_stats_column_names(None); + assert_eq!( + column_names, + vec![ + ColumnName::new(["phys_id"]), + ColumnName::new(["phys_info", "phys_name"]), + ], + ); + } + + #[test] + fn test_physical_stats_column_names_skips_nonexistent_data_skipping_stats_column() { + let config = create_table_config_with_column_mapping_and_props( + test_schema_nested_with_column_mapping(), + "name", + [("delta.dataSkippingStatsColumns", "id,nonexistent")], + ); + let column_names = config.physical_stats_column_names(None); + assert_eq!(column_names, vec![ColumnName::new(["phys_id"])],); + } + + #[rstest] + // --- flat schema --- + #[case::flat_none( + test_schema_flat(), + "none", + vec![ColumnName::new(["id"]), ColumnName::new(["name"])], + )] + #[case::flat_name( + test_schema_flat_with_column_mapping(), + "name", + vec![ColumnName::new(["phys_id"]), ColumnName::new(["phys_name"])], + )] + #[case::flat_id( + test_schema_flat_with_column_mapping(), + "id", + vec![ColumnName::new(["phys_id"]), ColumnName::new(["phys_name"])], + )] + // --- nested schema --- + #[case::nested_none( + test_schema_nested(), + "none", + vec![ + ColumnName::new(["id"]), + ColumnName::new(["info", "name"]), + ColumnName::new(["info", "age"]), + ], + )] + #[case::nested_name( + test_schema_nested_with_column_mapping(), + "name", + vec![ + ColumnName::new(["phys_id"]), + ColumnName::new(["phys_info", "phys_name"]), + ColumnName::new(["phys_info", "phys_age"]), + ], + )] + #[case::nested_id( + test_schema_nested_with_column_mapping(), + "id", + vec![ + ColumnName::new(["phys_id"]), + ColumnName::new(["phys_info", "phys_name"]), + ColumnName::new(["phys_info", "phys_age"]), + ], + )] + // --- schema with map (map fields excluded from stats) --- + #[case::map_none( + test_schema_with_map(), + "none", + vec![ColumnName::new(["id"]), ColumnName::new(["name"])], + )] + #[case::map_name( + test_schema_with_map_and_column_mapping(), + "name", + 
vec![ColumnName::new(["phys_id"]), ColumnName::new(["phys_name"])], + )] + #[case::map_id( + test_schema_with_map_and_column_mapping(), + "id", + vec![ColumnName::new(["phys_id"]), ColumnName::new(["phys_name"])], + )] + // --- schema with array (array fields excluded from stats) --- + #[case::array_none( + test_schema_with_array(), + "none", + vec![ColumnName::new(["id"]), ColumnName::new(["name"])], + )] + #[case::array_name( + test_schema_with_array_and_column_mapping(), + "name", + vec![ColumnName::new(["phys_id"]), ColumnName::new(["phys_name"])], + )] + #[case::array_id( + test_schema_with_array_and_column_mapping(), + "id", + vec![ColumnName::new(["phys_id"]), ColumnName::new(["phys_name"])], + )] + fn test_physical_stats_column_names_all_schemas( + #[case] schema: SchemaRef, + #[case] mode: &str, + #[case] expected_physical: Vec, + ) { + let config = create_table_config_with_column_mapping(schema, mode); + let physical_names = config.physical_stats_column_names(None); + assert_eq!( + physical_names, expected_physical, + "Incorrect physical column names for mode '{mode}'" + ); + } + + #[test] + fn test_clustered_table_writes() { + // ClusteredTable requires DomainMetadata to be supported + let config = create_mock_table_config( + &[], + &[TableFeature::ClusteredTable, TableFeature::DomainMetadata], + ); + assert!( + config.ensure_operation_supported(Operation::Write).is_ok(), + "ClusteredTable with DomainMetadata should be supported for writes" + ); + } } diff --git a/kernel/src/table_features/column_mapping.rs b/kernel/src/table_features/column_mapping.rs index cf92d6952f..9130297930 100644 --- a/kernel/src/table_features/column_mapping.rs +++ b/kernel/src/table_features/column_mapping.rs @@ -1,16 +1,25 @@ //! Code to handle column mapping, including modes and schema transforms -use super::ReaderFeature; +//! +//! This module provides: +//! - Read-side: Mode detection and schema validation +//! - Write-side: Schema transformation for assigning IDs and physical names +use super::TableFeature; use crate::actions::Protocol; use crate::schema::{ - ColumnName, DataType, MetadataValue, Schema, SchemaTransform, StructField, StructType, + ArrayType, ColumnMetadataKey, ColumnName, DataType, MapType, MetadataValue, Schema, + StructField, StructType, }; -use crate::table_properties::TableProperties; + +use crate::table_properties::{TableProperties, COLUMN_MAPPING_MODE}; +use crate::transforms::SchemaTransform; use crate::{DeltaResult, Error}; use std::borrow::Cow; +use std::collections::HashMap; use serde::{Deserialize, Serialize}; use strum::EnumString; +use uuid::Uuid; /// Modes of column mapping a table can be in #[derive(Debug, EnumString, Serialize, Deserialize, Copy, Clone, PartialEq, Eq)] @@ -38,17 +47,19 @@ pub(crate) fn column_mapping_mode( // (but should be ignored) even when the feature is not supported. For details see // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#column-mapping (Some(mode), 2) => mode, - (Some(mode), 3) if protocol.has_reader_feature(&ReaderFeature::ColumnMapping) => mode, + (Some(mode), 3) if protocol.has_table_feature(&TableFeature::ColumnMapping) => mode, _ => ColumnMappingMode::None, } } /// When column mapping mode is enabled, verify that each field in the schema is annotated with a -/// physical name and field_id; when not enabled, verify that no fields are annotated. +/// physical name and field_id, and that no two fields share the same `delta.columnMapping.id` +/// value. When not enabled, verifies that no fields are annotated. 
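+///
+/// Minimal usage sketch (illustrative addition, not part of the original change; `schema` here
+/// is any plain, un-annotated schema):
+///
+/// ```ignore
+/// // Un-annotated fields fail once column mapping is enabled, but pass in `None` mode.
+/// let schema = StructType::new_unchecked([StructField::nullable("id", DataType::INTEGER)]);
+/// assert!(validate_schema_column_mapping(&schema, ColumnMappingMode::Name).is_err());
+/// assert!(validate_schema_column_mapping(&schema, ColumnMappingMode::None).is_ok());
+/// ```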
pub fn validate_schema_column_mapping(schema: &Schema, mode: ColumnMappingMode) -> DeltaResult<()> { let mut validator = ValidateColumnMappings { mode, path: vec![], + seen: HashMap::new(), err: None, }; let _ = validator.transform_struct(schema); @@ -58,98 +69,144 @@ pub fn validate_schema_column_mapping(schema: &Schema, mode: ColumnMappingMode) } } +/// Validates a field's column mapping annotations and extracts the physical name and column +/// mapping id. If `seen` is provided, also checks for duplicate column mapping IDs. +/// +/// Metadata columns are not subject to column mapping and must not carry column mapping +/// annotations. Returns the logical field name and `None` for such fields. +/// +/// When column mapping is enabled (`Id` or `Name`), the field must have a +/// `delta.columnMapping.physicalName` (string) and `delta.columnMapping.id` (number) annotation. +/// Returns the physical name and `Some(id)`. +/// +/// When disabled (`None`), neither annotation should be present. Returns the logical field name +/// and `None`. +/// +/// `path` identifies the field in error messages (e.g. `&["a", "b"]` renders as `a.b`). +pub(crate) fn get_field_column_mapping_info<'a>( + field: &'a StructField, + mode: ColumnMappingMode, + path: &[&str], + seen: Option<&mut HashMap>, +) -> DeltaResult<(&'a str, Option)> { + let field_path = || ColumnName::new(path.iter().copied()); + let physical_name_meta = field + .metadata + .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()); + let id_meta = field + .metadata + .get(ColumnMetadataKey::ColumnMappingId.as_ref()); + + if field.is_metadata_column() { + if physical_name_meta.is_some() || id_meta.is_some() { + return Err(Error::internal_error(format!( + "Metadata column '{}' must not have column mapping annotations", + field.name() + ))); + } + return Ok((field.name(), None)); + } + + let annotation = ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(); + let physical_name = match (mode, physical_name_meta) { + (ColumnMappingMode::None, None) => field.name(), + (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(MetadataValue::String(s))) => s, + (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(_)) => { + return Err(Error::schema(format!( + "The {annotation} annotation on field '{}' must be a string", + field_path(), + ))); + } + (ColumnMappingMode::Name | ColumnMappingMode::Id, None) => { + return Err(Error::schema(format!( + "Column mapping is enabled but field '{}' lacks the {annotation} annotation", + field_path(), + ))); + } + (ColumnMappingMode::None, Some(_)) => { + return Err(Error::schema(format!( + "Column mapping is not enabled but field '{}' is annotated with {annotation}", + field_path(), + ))); + } + }; + + let annotation = ColumnMetadataKey::ColumnMappingId.as_ref(); + let id = match (mode, id_meta) { + (ColumnMappingMode::None, None) => None, + (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(MetadataValue::Number(n))) => { + Some(*n) + } + (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(_)) => { + return Err(Error::schema(format!( + "The {annotation} annotation on field '{}' must be a number", + field_path(), + ))); + } + (ColumnMappingMode::Name | ColumnMappingMode::Id, None) => { + return Err(Error::schema(format!( + "Column mapping is enabled but field '{}' lacks the {annotation} annotation", + field_path(), + ))); + } + (ColumnMappingMode::None, Some(_)) => { + return Err(Error::schema(format!( + "Column mapping is not enabled but field '{}' is annotated with {annotation}", + field_path(), + ))); 
+ } + }; + + if let (Some(id), Some(seen)) = (id, seen) { + seen.insert(id, field.name()).map_or(Ok(()), |prev| { + Err(Error::schema(format!( + "Duplicate column mapping ID {id} assigned to both '{prev}' and '{}'", + field.name() + ))) + })?; + } + + Ok((physical_name, id)) +} + struct ValidateColumnMappings<'a> { mode: ColumnMappingMode, path: Vec<&'a str>, + seen: HashMap, // column mapping id -> first field name that claimed it err: Option, } impl<'a> ValidateColumnMappings<'a> { - fn transform_inner_type( - &mut self, - data_type: &'a DataType, - name: &'a str, - ) -> Option> { + fn transform_inner(&mut self, field_name: &'a str, validate: impl FnOnce(&mut Self) -> R) { if self.err.is_none() { - self.path.push(name); - let _ = self.transform(data_type); + self.path.push(field_name); + let _ = validate(self); self.path.pop(); } - None - } - fn check_annotations(&mut self, field: &StructField) { - // The iterator yields `&&str` but `ColumnName::new` needs `&str` - let column_name = || ColumnName::new(self.path.iter().copied()); - let annotation = "delta.columnMapping.physicalName"; - match (self.mode, field.metadata.get(annotation)) { - // Both Id and Name modes require a physical name annotation; None mode forbids it. - (ColumnMappingMode::None, None) => {} - (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(MetadataValue::String(_))) => {} - (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(_)) => { - self.err = Some(Error::invalid_column_mapping_mode(format!( - "The {annotation} annotation on field '{}' must be a string", - column_name() - ))); - } - (ColumnMappingMode::Name | ColumnMappingMode::Id, None) => { - self.err = Some(Error::invalid_column_mapping_mode(format!( - "Column mapping is enabled but field '{}' lacks the {annotation} annotation", - column_name() - ))); - } - (ColumnMappingMode::None, Some(_)) => { - self.err = Some(Error::invalid_column_mapping_mode(format!( - "Column mapping is not enabled but field '{annotation}' is annotated with {}", - column_name() - ))); - } - } - - let annotation = "delta.columnMapping.id"; - match (self.mode, field.metadata.get(annotation)) { - // Both Id and Name modes require a field ID annotation; None mode forbids it. 
- (ColumnMappingMode::None, None) => {} - (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(MetadataValue::Number(_))) => {} - (ColumnMappingMode::Name | ColumnMappingMode::Id, Some(_)) => { - self.err = Some(Error::invalid_column_mapping_mode(format!( - "The {annotation} annotation on field '{}' must be a number", - column_name() - ))); - } - (ColumnMappingMode::Name | ColumnMappingMode::Id, None) => { - self.err = Some(Error::invalid_column_mapping_mode(format!( - "Column mapping is enabled but field '{}' lacks the {annotation} annotation", - column_name() - ))); - } - (ColumnMappingMode::None, Some(_)) => { - self.err = Some(Error::invalid_column_mapping_mode(format!( - "Column mapping is not enabled but field '{}' is annotated with {annotation}", - column_name() - ))); - } - } } } impl<'a> SchemaTransform<'a> for ValidateColumnMappings<'a> { // Override array element and map key/value for better error messages fn transform_array_element(&mut self, etype: &'a DataType) -> Option> { - self.transform_inner_type(etype, "") + self.transform_inner("", |this| this.transform(etype)); + Some(Cow::Borrowed(etype)) } fn transform_map_key(&mut self, ktype: &'a DataType) -> Option> { - self.transform_inner_type(ktype, "") + self.transform_inner("", |this| this.transform(ktype)); + Some(Cow::Borrowed(ktype)) } fn transform_map_value(&mut self, vtype: &'a DataType) -> Option> { - self.transform_inner_type(vtype, "") + self.transform_inner("", |this| this.transform(vtype)); + Some(Cow::Borrowed(vtype)) } fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - if self.err.is_none() { - self.path.push(&field.name); - self.check_annotations(field); - let _ = self.recurse_into_struct_field(field); - self.path.pop(); - } + self.transform_inner(field.name(), |this| { + get_field_column_mapping_info(field, this.mode, &this.path, Some(&mut this.seen)) + .map_err(|e| this.err = Some(e)) + .ok()?; + this.recurse_into_struct_field(field) + }); None } fn transform_variant(&mut self, _: &'a StructType) -> Option> { @@ -160,100 +217,273 @@ impl<'a> SchemaTransform<'a> for ValidateColumnMappings<'a> { } } +// ============================================================================ +// Write-side column mapping functions +// ============================================================================ + +/// Get the column mapping mode from a table properties map. +/// +/// This is used during table creation when we have raw properties from the builder, +/// not yet converted to [`TableProperties`]. +/// +/// Returns `ColumnMappingMode::None` if the property is not set. +pub(crate) fn get_column_mapping_mode_from_properties( + properties: &HashMap, +) -> DeltaResult { + match properties.get(COLUMN_MAPPING_MODE) { + Some(mode_str) => mode_str.parse::().map_err(|_| { + Error::generic(format!( + "Invalid column mapping mode '{mode_str}'. Must be one of: none, name, id" + )) + }), + None => Ok(ColumnMappingMode::None), + } +} + +/// Assigns column mapping metadata (id and physicalName) to all fields in a schema. +/// +/// This function recursively processes all fields in the schema, including nested structs, +/// arrays, and maps. Each field is assigned a new unique ID and physical name. +/// +/// Fields with pre-existing column mapping metadata (id or physicalName) are rejected +/// to avoid conflicts. ALTER TABLE will need different handling in the future. +/// +/// # Arguments +/// +/// * `schema` - The schema to transform +/// * `max_id` - Tracks the highest column ID assigned. 
Updated in place. Should be initialized +/// to 0 for a new table. +/// +/// # Returns +/// +/// A new schema with column mapping metadata on all fields. +pub(crate) fn assign_column_mapping_metadata( + schema: &StructType, + max_id: &mut i64, +) -> DeltaResult { + let new_fields: Vec = schema + .fields() + .map(|field| assign_field_column_mapping(field, max_id)) + .collect::>>()?; + + StructType::try_new(new_fields) +} + +/// Assigns column mapping metadata to a single field, recursively processing nested types. +/// +/// Rejects fields with pre-existing column mapping metadata. Otherwise, assigns a new +/// unique ID and physical name (incrementing `max_id`). +fn assign_field_column_mapping(field: &StructField, max_id: &mut i64) -> DeltaResult { + let has_id = field + .get_config_value(&ColumnMetadataKey::ColumnMappingId) + .is_some(); + let has_physical_name = field + .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + .is_some(); + + // For CREATE TABLE, reject any pre-existing column mapping metadata. + // This avoids conflicts between user-provided IDs/physical names and the ones we assign. + // ALTER TABLE (adding columns) will need different handling in the future. + // TODO: Also check for nested column IDs (`delta.columnMapping.nested.ids`) once + // Iceberg compatibility (IcebergCompatV2+) is supported. See issue #1125. + if has_id || has_physical_name { + return Err(Error::generic(format!( + "Field '{}' already has column mapping metadata. \ + Pre-existing column mapping metadata is not supported for CREATE TABLE.", + field.name + ))); + } + + // Start with the existing field and assign new ID + let mut new_field = field.clone(); + *max_id += 1; + new_field.metadata.insert( + ColumnMetadataKey::ColumnMappingId.as_ref().to_string(), + MetadataValue::Number(*max_id), + ); + + // Assign physical name + let physical_name = format!("col-{}", Uuid::new_v4()); + new_field.metadata.insert( + ColumnMetadataKey::ColumnMappingPhysicalName + .as_ref() + .to_string(), + MetadataValue::String(physical_name), + ); + + // Recursively process nested types + new_field.data_type = process_nested_data_type(&field.data_type, max_id)?; + + Ok(new_field) +} + +/// Process nested data types to assign column mapping metadata to any nested struct fields. +fn process_nested_data_type(data_type: &DataType, max_id: &mut i64) -> DeltaResult { + match data_type { + DataType::Struct(inner) => { + let new_inner = assign_column_mapping_metadata(inner, max_id)?; + Ok(DataType::Struct(Box::new(new_inner))) + } + DataType::Array(array_type) => { + let new_element_type = process_nested_data_type(array_type.element_type(), max_id)?; + Ok(DataType::Array(Box::new(ArrayType::new( + new_element_type, + array_type.contains_null(), + )))) + } + DataType::Map(map_type) => { + let new_key_type = process_nested_data_type(map_type.key_type(), max_id)?; + let new_value_type = process_nested_data_type(map_type.value_type(), max_id)?; + Ok(DataType::Map(Box::new(MapType::new( + new_key_type, + new_value_type, + map_type.value_contains_null(), + )))) + } + // Primitive and Variant types don't contain nested struct fields - return as-is + DataType::Primitive(_) | DataType::Variant(_) => Ok(data_type.clone()), + } +} + +/// Translates a logical [`ColumnName`] to physical. It can be top level or nested. +/// +/// Uses `StructType::walk_column_fields` to walk the column path through nested structs, +/// then maps each field to its physical name based on the column mapping mode. 
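+///
+/// Illustrative sketch (mirrors `test_get_any_level_column_physical_name_success` below; the
+/// `col-outer-a`/`col-inner-y` physical names come from that test, not from the API):
+///
+/// ```ignore
+/// // Logical path `a.y` resolves to the annotated physical names under `Name` mode.
+/// let physical = get_any_level_column_physical_name(
+///     &schema,
+///     &ColumnName::new(["a", "y"]),
+///     ColumnMappingMode::Name,
+/// )?;
+/// assert_eq!(physical, ColumnName::new(["col-outer-a", "col-inner-y"]));
+/// ```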
+/// +/// Returns an error if the column name cannot be resolved in the schema, or if column mapping is +/// enabled but any field in the path lacks the required +/// [`ColumnMetadataKey::ColumnMappingPhysicalName`] or [`ColumnMetadataKey::ColumnMappingId`] +/// annotations. +#[delta_kernel_derive::internal_api] +pub(crate) fn get_any_level_column_physical_name( + schema: &StructType, + col_name: &ColumnName, + column_mapping_mode: ColumnMappingMode, +) -> DeltaResult { + let fields = schema.walk_column_fields(col_name)?; + let physical_path: Vec = fields + .iter() + .map(|field| -> DeltaResult { + if column_mapping_mode != ColumnMappingMode::None { + if !field.has_physical_name_annotation() { + return Err(Error::Schema(format!( + "Column mapping is enabled but field '{}' lacks the {} annotation", + field.name, + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref() + ))); + } + if !field.has_id_annotation() { + return Err(Error::Schema(format!( + "Column mapping is enabled but field '{}' lacks the {} annotation", + field.name, + ColumnMetadataKey::ColumnMappingId.as_ref() + ))); + } + } + + Ok(field.physical_name(column_mapping_mode).to_string()) + }) + .collect::>>()?; + Ok(ColumnName::new(physical_path)) +} + +/// Convert a physical column name to a logical column name by walking the schema. +/// +/// For each path component in the physical column, finds the field in the schema whose +/// `physical_name(mode)` matches, and returns the field's logical name instead. +pub(crate) fn physical_to_logical_column_name( + logical_schema: &StructType, + physical_col: &ColumnName, + column_mapping_mode: ColumnMappingMode, +) -> DeltaResult { + let fields = logical_schema.walk_column_fields_by(physical_col, |s, phys_name| { + s.fields() + .find(|f| f.physical_name(column_mapping_mode) == phys_name) + })?; + Ok(ColumnName::new(fields.iter().map(|f| f.name.clone()))) +} + #[cfg(test)] mod tests { use super::*; - use crate::schema::StructType; - use std::collections::HashMap; + use crate::expressions::ColumnName; + use crate::schema::{DataType, MetadataValue, StructField, StructType}; + use crate::utils::test_utils::make_test_tc; + use std::collections::{HashMap, HashSet}; + + use crate::utils::test_utils::test_deep_nested_schema_missing_leaf_cm; #[test] fn test_column_mapping_mode() { - let table_properties: HashMap<_, _> = - [("delta.columnMapping.mode".to_string(), "id".to_string())] - .into_iter() - .collect(); - let table_properties = TableProperties::from(table_properties.iter()); - let empty_table_properties = TableProperties::from([] as [(String, String); 0]); - - let protocol = Protocol::try_new(2, 5, None::>, None::>).unwrap(); - - assert_eq!( - column_mapping_mode(&protocol, &table_properties), - ColumnMappingMode::Id - ); + let annotated = create_schema("5", "\"col-a7f4159c\"", "4", "\"col-5f422f40\""); + let plain = create_schema(None, None, None, None); + let cmm_id = HashMap::from([("delta.columnMapping.mode".to_string(), "id".to_string())]); + let no_props = HashMap::new(); - assert_eq!( - column_mapping_mode(&protocol, &empty_table_properties), - ColumnMappingMode::None - ); - - let empty_features = Some::<[String; 0]>([]); - let protocol = - Protocol::try_new(3, 7, empty_features.clone(), empty_features.clone()).unwrap(); - - assert_eq!( - column_mapping_mode(&protocol, &table_properties), - ColumnMappingMode::None - ); - - assert_eq!( - column_mapping_mode(&protocol, &empty_table_properties), - ColumnMappingMode::None - ); + // v2 legacy + mode=id => Id (annotated schema required) + 
let tc = make_test_tc( + annotated.clone(), + Protocol::try_new_legacy(2, 5).unwrap(), + cmm_id.clone(), + ) + .unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::Id); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::ColumnMapping]), - empty_features.clone(), + // v2 legacy + no mode => None + let tc = make_test_tc( + plain.clone(), + Protocol::try_new_legacy(2, 5).unwrap(), + no_props.clone(), ) .unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); - assert_eq!( - column_mapping_mode(&protocol, &table_properties), - ColumnMappingMode::Id - ); + // v3 + empty features + mode=id => None (mode ignored without CM feature) + let protocol = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(); + let tc = make_test_tc(plain.clone(), protocol.clone(), cmm_id.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); - assert_eq!( - column_mapping_mode(&protocol, &empty_table_properties), - ColumnMappingMode::None - ); + // v3 + empty features + no mode => None + let tc = make_test_tc(plain.clone(), protocol, no_props.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); + + // v3 + CM feature + mode=id => Id + let protocol = + Protocol::try_new_modern([TableFeature::ColumnMapping], [TableFeature::ColumnMapping]) + .unwrap(); + let tc = make_test_tc(annotated.clone(), protocol.clone(), cmm_id.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::Id); + + // v3 + CM feature + no mode => None + let tc = make_test_tc(plain.clone(), protocol, no_props.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors]), - empty_features.clone(), + // v3 + DV feature (no CM) + mode=id => None (mode ignored) + let protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors], + [TableFeature::DeletionVectors], ) .unwrap(); + let tc = make_test_tc(plain.clone(), protocol.clone(), cmm_id.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); - assert_eq!( - column_mapping_mode(&protocol, &table_properties), - ColumnMappingMode::None - ); - - assert_eq!( - column_mapping_mode(&protocol, &empty_table_properties), - ColumnMappingMode::None - ); + // v3 + DV feature + no mode => None + let tc = make_test_tc(plain.clone(), protocol, no_props.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); - let protocol = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::DeletionVectors, ReaderFeature::ColumnMapping]), - empty_features, + // v3 + DV + CM features + mode=id => Id + let protocol = Protocol::try_new_modern( + [TableFeature::DeletionVectors, TableFeature::ColumnMapping], + [TableFeature::DeletionVectors, TableFeature::ColumnMapping], ) .unwrap(); + let tc = make_test_tc(annotated.clone(), protocol.clone(), cmm_id.clone()).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::Id); - assert_eq!( - column_mapping_mode(&protocol, &table_properties), - ColumnMappingMode::Id - ); - - assert_eq!( - column_mapping_mode(&protocol, &empty_table_properties), - ColumnMappingMode::None - ); + // v3 + DV + CM features + no mode => None + let tc = make_test_tc(plain.clone(), protocol, no_props).unwrap(); + assert_eq!(tc.column_mapping_mode(), ColumnMappingMode::None); } // Creates optional schema field annotations for column mapping id and physical name, as a string. 
@@ -354,4 +584,727 @@ mod tests { let schema = create_schema(None, None, None, "\"col-5f422f40\""); validate_schema_column_mapping(&schema, ColumnMappingMode::None).expect_err("field name"); } + + #[test] + fn test_annotation_validation_reaches_struct_fields_in_map_value() { + let unannotated = + StructType::new_unchecked([StructField::new("x", DataType::INTEGER, false)]); + let schema = StructType::new_unchecked([make_cm_field( + "b", + 1, + MapType::new( + DataType::STRING, + DataType::Struct(Box::new(unannotated)), + false, + ), + )]); + validate_schema_column_mapping(&schema, ColumnMappingMode::Id) + .expect_err("missing annotation on struct field inside map value"); + } + + fn make_cm_field(name: &str, id: i64, data_type: impl Into) -> StructField { + StructField::new(name, data_type, false).with_metadata([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(id), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String(format!("col-{name}")), + ), + ]) + } + + fn cm_schema_same_level_duplicates() -> StructType { + StructType::new_unchecked([ + make_cm_field("a", 1, DataType::INTEGER), + make_cm_field("b", 1, DataType::INTEGER), + ]) + } + + fn cm_schema_nested_duplicates() -> StructType { + let nested = StructType::new_unchecked([ + make_cm_field("x", 5, DataType::INTEGER), + make_cm_field("y", 5, DataType::INTEGER), + ]); + StructType::new_unchecked([make_cm_field( + "outer", + 10, + DataType::Struct(Box::new(nested)), + )]) + } + + fn cm_schema_cross_level_duplicates() -> StructType { + let nested = StructType::new_unchecked([make_cm_field("inner", 1, DataType::INTEGER)]); + StructType::new_unchecked([ + make_cm_field("a", 1, DataType::INTEGER), + make_cm_field("b", 2, DataType::Struct(Box::new(nested))), + ]) + } + + fn cm_schema_array_duplicates() -> StructType { + let element = StructType::new_unchecked([make_cm_field("x", 1, DataType::INTEGER)]); + StructType::new_unchecked([ + make_cm_field("a", 1, DataType::INTEGER), + make_cm_field( + "b", + 2, + ArrayType::new(DataType::Struct(Box::new(element)), false), + ), + ]) + } + + fn cm_schema_map_duplicates() -> StructType { + let value = StructType::new_unchecked([make_cm_field("x", 1, DataType::INTEGER)]); + StructType::new_unchecked([ + make_cm_field("a", 1, DataType::INTEGER), + make_cm_field( + "b", + 2, + MapType::new(DataType::STRING, DataType::Struct(Box::new(value)), false), + ), + ]) + } + + #[rstest::rstest] + #[case::same_level(cm_schema_same_level_duplicates())] + #[case::nested_struct(cm_schema_nested_duplicates())] + #[case::across_nesting_levels(cm_schema_cross_level_duplicates())] + #[case::across_array(cm_schema_array_duplicates())] + #[case::across_map(cm_schema_map_duplicates())] + fn test_duplicate_column_mapping_ids_rejected(#[case] schema: StructType) { + crate::utils::test_utils::assert_result_error_with_message( + validate_schema_column_mapping(&schema, ColumnMappingMode::Id), + "Duplicate column mapping ID", + ); + } + + #[test] + fn test_duplicate_column_mapping_ids_rejected_in_name_mode() { + crate::utils::test_utils::assert_result_error_with_message( + validate_schema_column_mapping( + &cm_schema_same_level_duplicates(), + ColumnMappingMode::Name, + ), + "Duplicate column mapping ID", + ); + } + + // ========================================================================= + // Tests for write-side column mapping functions + // ========================================================================= + + #[rstest::rstest] + #[case::no_property(None, 
Some(ColumnMappingMode::None))] + #[case::mode_name(Some("name"), Some(ColumnMappingMode::Name))] + #[case::mode_id(Some("id"), Some(ColumnMappingMode::Id))] + #[case::mode_none_explicit(Some("none"), Some(ColumnMappingMode::None))] + #[case::invalid_mode(Some("invalid"), None)] + fn test_get_column_mapping_mode_from_properties( + #[case] mode_str: Option<&str>, + #[case] expected: Option, + ) { + let mut properties = HashMap::new(); + if let Some(mode) = mode_str { + properties.insert(COLUMN_MAPPING_MODE.to_string(), mode.to_string()); + } + match expected { + Some(mode) => assert_eq!( + get_column_mapping_mode_from_properties(&properties).unwrap(), + mode + ), + None => assert!(get_column_mapping_mode_from_properties(&properties).is_err()), + } + } + + #[test] + fn test_assign_column_mapping_metadata_simple() { + let schema = StructType::new_unchecked([ + StructField::new("a", DataType::INTEGER, false), + StructField::new("b", DataType::STRING, true), + ]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id).unwrap(); + + // Should have assigned IDs 1 and 2 + assert_eq!(max_id, 2); + assert_eq!(result.fields().count(), 2); + + // Check both fields have metadata + for (i, field) in result.fields().enumerate() { + let expected_id = (i + 1) as i64; + assert_eq!( + field.get_config_value(&ColumnMetadataKey::ColumnMappingId), + Some(&MetadataValue::Number(expected_id)) + ); + assert!(field + .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + .is_some()); + + // Verify physical name format (col-{uuid}) + if let Some(MetadataValue::String(name)) = + field.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + { + assert!( + name.starts_with("col-"), + "Physical name should start with 'col-'" + ); + } + } + } + + #[test] + fn test_assign_column_mapping_metadata_rejects_existing_id() { + // Schema with pre-existing column mapping metadata should be rejected + let schema = StructType::new_unchecked([ + StructField::new("a", DataType::INTEGER, false).add_metadata([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(100), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String("existing-physical".to_string()), + ), + ]), + StructField::new("b", DataType::STRING, true), + ]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id); + + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("already has column mapping metadata"), + "Expected error about existing column mapping metadata, got: {err_msg}" + ); + } + + #[test] + fn test_assign_column_mapping_metadata_nested_struct() { + let inner = StructType::new_unchecked([ + StructField::new("x", DataType::INTEGER, false), + StructField::new("y", DataType::STRING, true), + ]); + + let schema = StructType::new_unchecked([ + StructField::new("a", DataType::INTEGER, false), + StructField::new("nested", DataType::Struct(Box::new(inner)), true), + ]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id).unwrap(); + + // Should have assigned IDs to all 4 fields + assert_eq!(max_id, 4); + + let mut seen_ids = HashSet::new(); + let mut seen_physical_names = HashSet::new(); + + // Check outer field 'a' + let field_a = result.field("a").unwrap(); + assert_has_column_mapping_metadata(field_a, &mut seen_ids, &mut seen_physical_names); + + // Check outer field 'nested' + let field_nested = 
result.field("nested").unwrap(); + assert_has_column_mapping_metadata(field_nested, &mut seen_ids, &mut seen_physical_names); + + // Check nested fields + let inner = unwrap_struct(&field_nested.data_type, "nested"); + let field_x = inner.field("x").unwrap(); + assert_has_column_mapping_metadata(field_x, &mut seen_ids, &mut seen_physical_names); + let field_y = inner.field("y").unwrap(); + assert_has_column_mapping_metadata(field_y, &mut seen_ids, &mut seen_physical_names); + + // All 4 fields should have unique IDs and physical names + assert_eq!(seen_ids.len(), 4); + assert_eq!(seen_physical_names.len(), 4); + } + + // ======================================================================== + // "Cursed" nested type tests - verify column mapping metadata is assigned + // correctly for complex nested structures (arrays, maps, deeply nested) + // ======================================================================== + + /// Helper to verify a struct field has column mapping metadata (id and physical name). + /// Also collects the id and physical name into the provided sets for uniqueness checking. + fn assert_has_column_mapping_metadata( + field: &StructField, + seen_ids: &mut HashSet, + seen_physical_names: &mut HashSet, + ) { + let id = field + .get_config_value(&ColumnMetadataKey::ColumnMappingId) + .unwrap_or_else(|| panic!("Field '{}' should have column mapping ID", field.name)); + let MetadataValue::Number(id_val) = id else { + panic!( + "Field '{}' column mapping ID should be a number", + field.name + ); + }; + assert!( + seen_ids.insert(*id_val), + "Duplicate column mapping ID {} on field '{}'", + id_val, + field.name + ); + + let physical = field + .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + .unwrap_or_else(|| panic!("Field '{}' should have physical name", field.name)); + let MetadataValue::String(physical_name) = physical else { + panic!("Field '{}' physical name should be a string", field.name); + }; + assert!( + seen_physical_names.insert(physical_name.clone()), + "Duplicate physical name '{}' on field '{}'", + physical_name, + field.name + ); + } + + /// Helper to extract struct from a DataType, panicking with context if not a struct + fn unwrap_struct<'a>(data_type: &'a DataType, context: &str) -> &'a StructType { + match data_type { + DataType::Struct(s) => s, + _ => panic!("Expected Struct for {context}, got {data_type:?}"), + } + } + + #[test] + fn test_assign_column_mapping_metadata_map_with_struct_key_and_value() { + // Test: map, struct> + // Both key and value are structs that need column mapping metadata + + let key_struct = + StructType::new_unchecked([StructField::new("k", DataType::INTEGER, false)]); + let value_struct = + StructType::new_unchecked([StructField::new("v", DataType::INTEGER, false)]); + + let map_type = MapType::new( + DataType::Struct(Box::new(key_struct)), + DataType::Struct(Box::new(value_struct)), + true, + ); + + let schema = StructType::new_unchecked([StructField::new( + "my_map", + DataType::Map(Box::new(map_type)), + true, + )]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id).unwrap(); + + // Should assign IDs to: my_map (1), k (2), v (3) + assert_eq!(max_id, 3); + + let mut seen_ids = HashSet::new(); + let mut seen_physical_names = HashSet::new(); + + // Check top-level map field + let map_field = result.field("my_map").unwrap(); + assert_has_column_mapping_metadata(map_field, &mut seen_ids, &mut seen_physical_names); + + // Check key struct field + if let 
DataType::Map(inner_map) = &map_field.data_type { + let key_struct = unwrap_struct(inner_map.key_type(), "map key"); + let field_k = key_struct.field("k").unwrap(); + assert_has_column_mapping_metadata(field_k, &mut seen_ids, &mut seen_physical_names); + + // Check value struct field + let value_struct = unwrap_struct(inner_map.value_type(), "map value"); + let field_v = value_struct.field("v").unwrap(); + assert_has_column_mapping_metadata(field_v, &mut seen_ids, &mut seen_physical_names); + } else { + panic!("Expected map type"); + } + + assert_eq!(seen_ids.len(), 3); + assert_eq!(seen_physical_names.len(), 3); + } + + #[test] + fn test_assign_column_mapping_metadata_array_with_struct_element() { + // Test: array> + + let elem_struct = + StructType::new_unchecked([StructField::new("elem", DataType::INTEGER, false)]); + + let array_type = ArrayType::new(DataType::Struct(Box::new(elem_struct)), true); + + let schema = StructType::new_unchecked([StructField::new( + "my_array", + DataType::Array(Box::new(array_type)), + true, + )]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id).unwrap(); + + // Should assign IDs to: my_array (1), elem (2) + assert_eq!(max_id, 2); + + let mut seen_ids = HashSet::new(); + let mut seen_physical_names = HashSet::new(); + + // Check top-level array field + let array_field = result.field("my_array").unwrap(); + assert_has_column_mapping_metadata(array_field, &mut seen_ids, &mut seen_physical_names); + + // Check element struct field + if let DataType::Array(inner_array) = &array_field.data_type { + let elem_struct = unwrap_struct(inner_array.element_type(), "array element"); + let field_elem = elem_struct.field("elem").unwrap(); + assert_has_column_mapping_metadata(field_elem, &mut seen_ids, &mut seen_physical_names); + } else { + panic!("Expected array type"); + } + + assert_eq!(seen_ids.len(), 2); + assert_eq!(seen_physical_names.len(), 2); + } + + #[test] + fn test_assign_column_mapping_metadata_double_nested_array() { + // Test: array>> + + let deep_struct = + StructType::new_unchecked([StructField::new("deep", DataType::INTEGER, false)]); + + let inner_array = ArrayType::new(DataType::Struct(Box::new(deep_struct)), true); + let outer_array = ArrayType::new(DataType::Array(Box::new(inner_array)), true); + + let schema = StructType::new_unchecked([StructField::new( + "nested_arrays", + DataType::Array(Box::new(outer_array)), + true, + )]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id).unwrap(); + + // Should assign IDs to: nested_arrays (1), deep (2) + assert_eq!(max_id, 2); + + let mut seen_ids = HashSet::new(); + let mut seen_physical_names = HashSet::new(); + + // Check top-level field + let outer_field = result.field("nested_arrays").unwrap(); + assert_has_column_mapping_metadata(outer_field, &mut seen_ids, &mut seen_physical_names); + + // Navigate: array -> array -> struct -> field + let DataType::Array(outer) = &outer_field.data_type else { + panic!("Expected outer array type"); + }; + let DataType::Array(inner) = outer.element_type() else { + panic!("Expected inner array type"); + }; + let deep_struct = unwrap_struct(inner.element_type(), "inner array element"); + let field_deep = deep_struct.field("deep").unwrap(); + assert_has_column_mapping_metadata(field_deep, &mut seen_ids, &mut seen_physical_names); + + assert_eq!(seen_ids.len(), 2); + assert_eq!(seen_physical_names.len(), 2); + } + + #[test] + fn 
test_assign_column_mapping_metadata_array_map_array_struct_nesting() { + // Test: array>, array>>> + // Deeply nested array-map-array-struct combination + + let key_struct = + StructType::new_unchecked([StructField::new("k", DataType::INTEGER, false)]); + let value_struct = + StructType::new_unchecked([StructField::new("v", DataType::INTEGER, false)]); + + let key_array = ArrayType::new(DataType::Struct(Box::new(key_struct)), true); + let value_array = ArrayType::new(DataType::Struct(Box::new(value_struct)), true); + + let inner_map = MapType::new( + DataType::Array(Box::new(key_array)), + DataType::Array(Box::new(value_array)), + true, + ); + + let outer_array = ArrayType::new(DataType::Map(Box::new(inner_map)), true); + + let schema = StructType::new_unchecked([StructField::new( + "cursed", + DataType::Array(Box::new(outer_array)), + true, + )]); + + let mut max_id = 0; + let result = assign_column_mapping_metadata(&schema, &mut max_id).unwrap(); + + // Should assign IDs to: cursed (1), k (2), v (3) + assert_eq!(max_id, 3); + + let mut seen_ids = HashSet::new(); + let mut seen_physical_names = HashSet::new(); + + // Check top-level field + let cursed_field = result.field("cursed").unwrap(); + assert_has_column_mapping_metadata(cursed_field, &mut seen_ids, &mut seen_physical_names); + + // Navigate: array -> map -> key array -> struct -> field + // -> value array -> struct -> field + let DataType::Array(outer) = &cursed_field.data_type else { + panic!("Expected outer array type"); + }; + let DataType::Map(inner_map) = outer.element_type() else { + panic!("Expected map inside outer array"); + }; + + // Check key path: array> + let DataType::Array(key_arr) = inner_map.key_type() else { + panic!("Expected array for map key"); + }; + let key_struct = unwrap_struct(key_arr.element_type(), "key array element"); + let field_k = key_struct.field("k").unwrap(); + assert_has_column_mapping_metadata(field_k, &mut seen_ids, &mut seen_physical_names); + + // Check value path: array> + let DataType::Array(val_arr) = inner_map.value_type() else { + panic!("Expected array for map value"); + }; + let val_struct = unwrap_struct(val_arr.element_type(), "value array element"); + let field_v = val_struct.field("v").unwrap(); + assert_has_column_mapping_metadata(field_v, &mut seen_ids, &mut seen_physical_names); + + assert_eq!(seen_ids.len(), 3); + assert_eq!(seen_physical_names.len(), 3); + } + + #[test] + fn test_get_any_level_column_physical_name_success() { + let inner = StructType::new_unchecked([StructField::new("y", DataType::INTEGER, false) + .add_metadata([ + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String("col-inner-y".to_string()), + ), + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(2), + ), + ])]); + + let schema = StructType::new_unchecked([StructField::new( + "a", + DataType::Struct(Box::new(inner)), + true, + ) + .add_metadata([ + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String("col-outer-a".to_string()), + ), + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(1), + ), + ])]); + + // Top-level column + let result = get_any_level_column_physical_name( + &schema, + &ColumnName::new(["a"]), + ColumnMappingMode::Name, + ) + .unwrap(); + assert_eq!(result, ColumnName::new(["col-outer-a"])); + assert_eq!(result.path().len(), 1); + + // Nested column + let result = get_any_level_column_physical_name( + &schema, + &ColumnName::new(["a", "y"]), + ColumnMappingMode::Name, + ) + 
.unwrap(); + assert_eq!(result, ColumnName::new(["col-outer-a", "col-inner-y"])); + assert_eq!(result.path().len(), 2); + + // No mapping mode returns logical names (annotations are ignored) + let result = get_any_level_column_physical_name( + &schema, + &ColumnName::new(["a", "y"]), + ColumnMappingMode::None, + ) + .unwrap(); + assert_eq!(result, ColumnName::new(["a", "y"])); + assert_eq!(result.path().len(), 2); + } + + #[test] + fn test_get_any_level_column_physical_name_errors() { + let schema = StructType::new_unchecked([StructField::new("a", DataType::INTEGER, false)]); + + // Non-existent top-level column + let result = get_any_level_column_physical_name( + &schema, + &ColumnName::new(["nonexistent"]), + ColumnMappingMode::None, + ); + assert!(result.is_err()); + + // Nested path on a non-struct field + let result = get_any_level_column_physical_name( + &schema, + &ColumnName::new(["a", "b"]), + ColumnMappingMode::None, + ); + assert!(result.is_err()); + } + + #[rstest::rstest] + // physicalName present, id missing → id error + #[case::missing_id(true, false, "delta.columnMapping.id")] + // id present, physicalName missing → physicalName error + #[case::missing_physical_name(false, true, "delta.columnMapping.physicalName")] + // both missing → physicalName checked first, so physicalName error + #[case::missing_both(false, false, "delta.columnMapping.physicalName")] + fn test_get_any_level_column_physical_name_missing_annotations( + #[case] has_physical_name: bool, + #[case] has_id: bool, + #[case] expected_err: &str, + ) { + let mut inner_field = StructField::new("y", DataType::INTEGER, false); + if has_physical_name { + inner_field = inner_field.add_metadata([( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String("col-inner-y".to_string()), + )]); + } + if has_id { + inner_field = inner_field.add_metadata([( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(2), + )]); + } + + let inner = StructType::new_unchecked([inner_field]); + let schema = StructType::new_unchecked([StructField::new( + "a", + DataType::Struct(Box::new(inner)), + true, + ) + .add_metadata([ + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String("col-outer-a".to_string()), + ), + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(1), + ), + ])]); + + let err = get_any_level_column_physical_name( + &schema, + &ColumnName::new(["a", "y"]), + ColumnMappingMode::Name, + ) + .unwrap_err() + .to_string(); + assert!( + err.contains(expected_err), + "Expected error containing '{expected_err}', got: {err}" + ); + } + + #[test] + fn validate_schema_column_mapping_error_includes_full_path() { + let schema = test_deep_nested_schema_missing_leaf_cm(); + let err = validate_schema_column_mapping(&schema, ColumnMappingMode::Name) + .unwrap_err() + .to_string(); + assert!( + err.contains("top.``.mid_field.``.leaf"), + "Expected full nested path in error, got: {err}" + ); + } + + #[test] + fn physical_to_logical_no_mapping() { + let schema = StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ]); + let physical_col = ColumnName::new(["id"]); + let result = + physical_to_logical_column_name(&schema, &physical_col, ColumnMappingMode::None) + .unwrap(); + assert_eq!(result, ColumnName::new(["id"])); + } + + #[test] + fn physical_to_logical_with_name_mapping() { + let field = StructField::new("user_id", DataType::INTEGER, false).with_metadata([( + 
"delta.columnMapping.physicalName".to_string(), + MetadataValue::String("col-abc-123".to_string()), + )]); + let schema = StructType::new_unchecked(vec![field]); + + let physical_col = ColumnName::new(["col-abc-123"]); + let result = + physical_to_logical_column_name(&schema, &physical_col, ColumnMappingMode::Name) + .unwrap(); + assert_eq!(result, ColumnName::new(["user_id"])); + } + + #[test] + fn physical_to_logical_not_found() { + let schema = + StructType::new_unchecked(vec![StructField::new("id", DataType::INTEGER, false)]); + let physical_col = ColumnName::new(["nonexistent"]); + let result = + physical_to_logical_column_name(&schema, &physical_col, ColumnMappingMode::None); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("not found in schema")); + } + + #[test] + fn physical_to_logical_nested_struct_with_mapping() { + let inner_field = StructField::new("city", DataType::STRING, true).with_metadata([( + "delta.columnMapping.physicalName".to_string(), + MetadataValue::String("col-inner-456".to_string()), + )]); + let inner_struct = StructType::new_unchecked(vec![inner_field]); + let outer_field = + StructField::new("address", DataType::Struct(Box::new(inner_struct)), true) + .with_metadata([( + "delta.columnMapping.physicalName".to_string(), + MetadataValue::String("col-outer-123".to_string()), + )]); + let schema = StructType::new_unchecked(vec![outer_field]); + + let physical_col = ColumnName::new(["col-outer-123", "col-inner-456"]); + let result = + physical_to_logical_column_name(&schema, &physical_col, ColumnMappingMode::Name) + .unwrap(); + assert_eq!(result, ColumnName::new(["address", "city"])); + } + + #[test] + fn physical_to_logical_non_struct_intermediate_errors() { + let schema = + StructType::new_unchecked(vec![StructField::new("id", DataType::INTEGER, false)]); + let physical_col = ColumnName::new(["id", "nested"]); + let result = + physical_to_logical_column_name(&schema, &physical_col, ColumnMappingMode::None); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("is not a struct type")); + } } diff --git a/kernel/src/table_features/mod.rs b/kernel/src/table_features/mod.rs index 189eaf856b..dc98706e7a 100644 --- a/kernel/src/table_features/mod.rs +++ b/kernel/src/table_features/mod.rs @@ -1,84 +1,67 @@ -use std::sync::LazyLock; - +use itertools::Itertools; use serde::{Deserialize, Serialize}; -use strum::{AsRefStr, Display as StrumDisplay, EnumCount, EnumString}; +use strum::{AsRefStr, Display as StrumDisplay, EnumCount, EnumIter, EnumString}; +use crate::actions::Protocol; use crate::expressions::Scalar; use crate::schema::derive_macro_utils::ToDataType; use crate::schema::DataType; +use crate::table_properties::TableProperties; +use crate::{DeltaResult, Error}; use delta_kernel_derive::internal_api; -pub(crate) use column_mapping::column_mapping_mode; -pub use column_mapping::{validate_schema_column_mapping, ColumnMappingMode}; -pub(crate) use timestamp_ntz::validate_timestamp_ntz_feature_support; +#[internal_api] +pub(crate) use column_mapping::get_any_level_column_physical_name; +pub(crate) use column_mapping::physical_to_logical_column_name; +#[deprecated = "Enable internal-api and use TableConfiguration instead"] +pub use column_mapping::validate_schema_column_mapping; +pub use column_mapping::ColumnMappingMode; +pub(crate) use column_mapping::{ + assign_column_mapping_metadata, column_mapping_mode, get_column_mapping_mode_from_properties, + get_field_column_mapping_info, +}; 
+pub(crate) use timestamp_ntz::{ + schema_contains_timestamp_ntz, validate_timestamp_ntz_feature_support, +}; mod column_mapping; mod timestamp_ntz; -/// Reader features communicate capabilities that must be implemented in order to correctly read a -/// given table. That is, readers must implement and respect all features listed in a table's -/// `ReaderFeatures`. Note that any feature listed as a `ReaderFeature` must also have a -/// corresponding `WriterFeature`. -/// -/// The kernel currently supports all reader features except for V2Checkpoints. -#[derive( - Serialize, - Deserialize, - Debug, - Clone, - Eq, - PartialEq, - EnumString, - StrumDisplay, - AsRefStr, - EnumCount, - Hash, -)] -#[strum(serialize_all = "camelCase")] -#[serde(rename_all = "camelCase")] -#[internal_api] -pub(crate) enum ReaderFeature { - /// CatalogManaged tables: - /// - CatalogManaged, - #[strum(serialize = "catalogOwned-preview")] - #[serde(rename = "catalogOwned-preview")] - CatalogOwnedPreview, - /// Mapping of one column to another - ColumnMapping, - /// Deletion vectors for merge, update, delete - DeletionVectors, - /// timestamps without timezone support - #[strum(serialize = "timestampNtz")] - #[serde(rename = "timestampNtz")] - TimestampWithoutTimezone, - // Allow columns to change type - TypeWidening, - #[strum(serialize = "typeWidening-preview")] - #[serde(rename = "typeWidening-preview")] - TypeWideningPreview, - /// version 2 of checkpointing - V2Checkpoint, - /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer - /// protocol checks during VACUUM operations - VacuumProtocolCheck, - /// This feature enables support for the variant data type, which stores semi-structured data. - VariantType, - #[strum(serialize = "variantType-preview")] - #[serde(rename = "variantType-preview")] - VariantTypePreview, - #[strum(serialize = "variantShredding-preview")] - #[serde(rename = "variantShredding-preview")] - VariantShreddingPreview, - #[serde(untagged)] - #[strum(default)] - Unknown(String), -} +/// Minimum reader/writer protocol version that the kernel can handle. +pub const MIN_VALID_RW_VERSION: i32 = 1; + +/// Maximum reader protocol version that the kernel can handle. +pub const MAX_VALID_READER_VERSION: i32 = 3; + +/// Maximum writer protocol version that the kernel can handle. +pub const MAX_VALID_WRITER_VERSION: i32 = 7; -/// Similar to reader features, writer features communicate capabilities that must be implemented -/// in order to correctly write to a given table. That is, writers must implement and respect all -/// features listed in a table's `WriterFeatures`. +/// Minimum reader version for tables that use table features. +/// When set to 3, the protocol requires an explicit `readerFeatures` array. +pub const TABLE_FEATURES_MIN_READER_VERSION: i32 = 3; + +/// Minimum writer version for tables that use table features. +/// When set to 7, the protocol requires an explicit `writerFeatures` array. +pub const TABLE_FEATURES_MIN_WRITER_VERSION: i32 = 7; + +/// Prefix for table feature override properties. +/// Properties with this prefix (e.g., `delta.feature.deletionVectors`) are used to +/// explicitly turn on support for the feature in the protocol. +pub const SET_TABLE_FEATURE_SUPPORTED_PREFIX: &str = "delta.feature."; + +/// Value to add support for a table feature when used with [`SET_TABLE_FEATURE_SUPPORTED_PREFIX`]. 
+/// Example: `"delta.feature.deletionVectors" -> "supported"` +pub const SET_TABLE_FEATURE_SUPPORTED_VALUE: &str = "supported"; + +/// Table features represent protocol capabilities required to correctly read or write a given table. +/// - Readers must implement all features required for correct table reads. +/// - Writers must implement all features required for correct table writes. +/// +/// Each variant corresponds to one such feature. A feature is either: +/// - **ReaderWriter** (must be supported by both readers and writers), or +/// - **WriterOnly** (applies only to writers). +/// There are no ReaderOnly features. See `TableFeature::feature_type` for the category of each. /// -/// Kernel write support is currently in progress and as such these are not supported. +/// The kernel currently supports all reader features except `V2Checkpoint`. #[derive( Serialize, Deserialize, @@ -92,16 +75,20 @@ pub(crate) enum ReaderFeature { EnumCount, Hash, )] -#[strum(serialize_all = "camelCase")] +#[strum( + serialize_all = "camelCase", + parse_err_fn = xxx__not_needed__default_variant_means_parsing_is_infallible__xxx, + parse_err_ty = Infallible // ignored, sadly: https://github.com/Peternator7/strum/issues/430 +)] #[serde(rename_all = "camelCase")] #[internal_api] -pub(crate) enum WriterFeature { - /// CatalogManaged tables: - /// - CatalogManaged, - #[strum(serialize = "catalogOwned-preview")] - #[serde(rename = "catalogOwned-preview")] - CatalogOwnedPreview, +#[derive(EnumIter)] +// ^^ We must derive EnumIter only after internal_api adjusts visibility. Otherwise, internal-api +// builds will fail because the now-public `TableFeature::iter()` returns a pub(crate) type. +pub(crate) enum TableFeature { + ////////////////////////// + // Writer-only features // + ////////////////////////// /// Append Only Tables AppendOnly, /// Table invariants @@ -112,16 +99,39 @@ pub(crate) enum WriterFeature { ChangeDataFeed, /// Columns with generated values GeneratedColumns, - /// Mapping of one column to another - ColumnMapping, /// ID Columns IdentityColumns, /// Monotonically increasing timestamps in the CommitInfo InCommitTimestamp, - /// Deletion vectors for merge, update, delete - DeletionVectors, /// Row tracking on tables RowTracking, + /// domain specific metadata + DomainMetadata, + /// Iceberg compatibility support + IcebergCompatV1, + /// Iceberg compatibility support + IcebergCompatV2, + /// The Clustered Table feature facilitates the physical clustering of rows + /// that share similar values on a predefined set of clustering columns. + #[strum(serialize = "clustering")] + #[serde(rename = "clustering")] + ClusteredTable, + /// Materialize partition columns in parquet data files. 
+ MaterializePartitionColumns, + + /////////////////////////// + // ReaderWriter features // + /////////////////////////// + /// CatalogManaged tables: + /// + CatalogManaged, + #[strum(serialize = "catalogOwned-preview")] + #[serde(rename = "catalogOwned-preview")] + CatalogOwnedPreview, + /// Mapping of one column to another + ColumnMapping, + /// Deletion vectors for merge, update, delete + DeletionVectors, /// timestamps without timezone support #[strum(serialize = "timestampNtz")] #[serde(rename = "timestampNtz")] @@ -131,22 +141,11 @@ pub(crate) enum WriterFeature { #[strum(serialize = "typeWidening-preview")] #[serde(rename = "typeWidening-preview")] TypeWideningPreview, - /// domain specific metadata - DomainMetadata, /// version 2 of checkpointing V2Checkpoint, - /// Iceberg compatibility support - IcebergCompatV1, - /// Iceberg compatibility support - IcebergCompatV2, /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer /// protocol checks during VACUUM operations VacuumProtocolCheck, - /// The Clustered Table feature facilitates the physical clustering of rows - /// that share similar values on a predefined set of clustering columns. - #[strum(serialize = "clustering")] - #[serde(rename = "clustering")] - ClusteredTable, /// This feature enables support for the variant data type, which stores semi-structured data. VariantType, #[strum(serialize = "variantType-preview")] @@ -155,93 +154,566 @@ pub(crate) enum WriterFeature { #[strum(serialize = "variantShredding-preview")] #[serde(rename = "variantShredding-preview")] VariantShreddingPreview, + #[serde(untagged)] #[strum(default)] Unknown(String), } -impl ToDataType for ReaderFeature { - fn to_data_type() -> DataType { - DataType::STRING +/// ReaderWriter features that can be supported by legacy readers (min_reader_version < 3). +/// Only ColumnMapping qualifies with min_reader_version = 2. +pub(crate) static LEGACY_READER_FEATURES: [TableFeature; 1] = [TableFeature::ColumnMapping]; + +/// Writer and ReaderWriter features that can be supported by legacy writers (min_writer_version < 7). +/// These are features with min_writer_version in range [1, 6]. 
+pub(crate) static LEGACY_WRITER_FEATURES: [TableFeature; 7] = [ + // Writer-only features (min_writer < 7) + TableFeature::AppendOnly, // min_writer = 2 + TableFeature::Invariants, // min_writer = 2 + TableFeature::CheckConstraints, // min_writer = 3 + TableFeature::ChangeDataFeed, // min_writer = 4 + TableFeature::GeneratedColumns, // min_writer = 4 + TableFeature::IdentityColumns, // min_writer = 6 + // ReaderWriter features (min_writer < 7) + TableFeature::ColumnMapping, // min_writer = 5 +]; + +/// Classifies table features by their type +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum FeatureType { + /// Feature only affects write operations + WriterOnly, + /// Feature affects both read and write operations (must appear in both feature lists) + ReaderWriter, + /// Unknown feature type (for forward compatibility) + Unknown, +} + +/// Defines how a feature's enablement is determined +#[derive(Debug, Clone, Copy)] +pub(crate) enum EnablementCheck { + /// Feature is enabled if it's supported (appears in protocol feature lists) + AlwaysIfSupported, + /// Feature is enabled if supported AND the provided function returns true when checking table properties + EnabledIf(fn(&TableProperties) -> bool), +} + +/// Represents the type of operation being performed on a table +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[internal_api] +pub(crate) enum Operation { + /// Read operations on regular table data + Scan, + /// Read operations on change data feed data + Cdf, + /// Write operations on regular table data + Write, +} + +/// Defines whether the Rust kernel has implementation support for a feature's operation +pub(crate) enum KernelSupport { + /// Kernel has full support for any operation on this feature + Supported, + /// Kernel does not support this operation on this feature + NotSupported, + /// Custom logic to determine support based on operation type and table properties. + /// For example: Column Mapping may support Scan but not CDF, or CDF writes may only + /// be supported when AppendOnly is true. + Custom(fn(&Protocol, &TableProperties, Operation) -> DeltaResult<()>), +} + +/// Types of requirements for feature dependencies +#[derive(Debug)] +pub(crate) enum FeatureRequirement { + /// Feature must be supported (in protocol) + Supported(TableFeature), + /// Feature must be enabled (supported + property set) + Enabled(TableFeature), + /// Feature must NOT be supported + NotSupported(TableFeature), + /// Feature must NOT be enabled (may be supported but property must not activate it) + NotEnabled(TableFeature), + /// Custom validation logic + Custom(fn(&Protocol, &TableProperties) -> DeltaResult<()>), +} + +/// Minimum protocol versions for legacy (pre-feature-list) inference. +pub(crate) struct MinReaderWriterVersion { + pub reader: i32, + pub writer: i32, +} + +impl MinReaderWriterVersion { + pub(crate) const fn new(reader: i32, writer: i32) -> Self { + Self { reader, writer } } } -impl ToDataType for WriterFeature { - fn to_data_type() -> DataType { - DataType::STRING +/// Rich metadata about a table feature including version requirements, dependencies, and support status +pub(crate) struct FeatureInfo { + /// The type of feature (WriterOnly, ReaderWriter, or Unknown) + pub feature_type: FeatureType, + /// Minimum legacy protocol versions for version-based feature inference. + /// `Some` for features that predate feature lists and can be inferred from protocol version. + /// `None` for features that require explicit feature lists (reader v3+ / writer v7+). 
+ pub min_legacy_version: Option, + /// Requirements this feature has (features + custom validations) + pub feature_requirements: &'static [FeatureRequirement], + /// Rust kernel's support for this feature (may vary by Operation type) + /// + /// Note: `kernel_support` validation depends on `feature_type`: + /// WriterOnly features: Only checked during `Operation::Write` + /// ReaderWriter features: Checked during all operations (Scan/Write/CDF) + /// Read operations (Scan/CDF) only validate reader features, so `kernel_support` for + /// WriterOnly features are never invoked for Scan/CDF regardless of the custom check logic. + pub kernel_support: KernelSupport, + /// How to check if this feature is enabled in a table + pub enablement_check: EnablementCheck, +} + +// Static FeatureInfo instances for each table feature +static APPEND_ONLY_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: Some(MinReaderWriterVersion::new(1, 2)), + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::EnabledIf(|props| props.append_only == Some(true)), +}; + +// Although kernel marks invariants as "Supported", invariants must NOT actually be present in the table schema. +// Kernel will fail to write to any table that actually uses invariants (see check in TableConfiguration::ensure_write_supported). +// This is to allow legacy tables with the Invariants feature enabled but not in use. +static INVARIANTS_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: Some(MinReaderWriterVersion::new(1, 2)), + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static CHECK_CONSTRAINTS_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: Some(MinReaderWriterVersion::new(1, 3)), + feature_requirements: &[], + kernel_support: KernelSupport::NotSupported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static CHANGE_DATA_FEED_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: Some(MinReaderWriterVersion::new(1, 4)), + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::EnabledIf(|props| { + props.enable_change_data_feed == Some(true) + }), +}; + +static GENERATED_COLUMNS_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: Some(MinReaderWriterVersion::new(1, 4)), + feature_requirements: &[], + kernel_support: KernelSupport::NotSupported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static IDENTITY_COLUMNS_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: Some(MinReaderWriterVersion::new(1, 6)), + feature_requirements: &[], + kernel_support: KernelSupport::NotSupported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static IN_COMMIT_TIMESTAMP_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Custom(|_protocol, _properties, operation| match operation { + Operation::Scan | Operation::Write | Operation::Cdf => Ok(()), + }), + enablement_check: EnablementCheck::EnabledIf(|props| { + props.enable_in_commit_timestamps == Some(true) + }), +}; + +static ROW_TRACKING_INFO: FeatureInfo = FeatureInfo { + feature_type: 
FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[FeatureRequirement::Supported(TableFeature::DomainMetadata)], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::EnabledIf(|props| { + props.enable_row_tracking == Some(true) && props.row_tracking_suspended != Some(true) + }), +}; + +static DOMAIN_METADATA_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +// TODO(#1125): IcebergCompatV1 requires schema type validation to block Map, Array, and Void types. +// This validation is not yet implemented. The feature is marked as NotSupported for writes until proper validation is added. +// See Delta Spark: IcebergCompat.scala CheckNoListMapNullType (lines 422-433) +// See Java Kernel: IcebergWriterCompatMetadataValidatorAndUpdater.java UNSUPPORTED_TYPES_CHECK +// See https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-icebergcompatv1 for more requirements to support +static ICEBERG_COMPAT_V1_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[ + FeatureRequirement::Enabled(TableFeature::ColumnMapping), + FeatureRequirement::Custom(|_protocol, properties| { + let mode = properties.column_mapping_mode; + if !matches!( + mode, + Some(ColumnMappingMode::Name) | Some(ColumnMappingMode::Id) + ) { + return Err(Error::generic( + "IcebergCompatV1 requires Column Mapping in 'name' or 'id' mode", + )); + } + Ok(()) + }), + FeatureRequirement::NotSupported(TableFeature::DeletionVectors), + ], + kernel_support: KernelSupport::NotSupported, + enablement_check: EnablementCheck::EnabledIf(|props| { + props.enable_iceberg_compat_v1 == Some(true) + }), +}; + +// TODO(#1125): IcebergCompatV2 requires schema type validation. Unlike V1, V2 allows Map and Array +// types but needs validation against an allowlist of supported types. +// This validation is not yet implemented. The feature is marked as NotSupported for writes until proper validation is added. +// See Delta Spark: IcebergCompat.scala CheckTypeInV2AllowList (lines 450-459) +// See Java Kernel: IcebergCompatMetadataValidatorAndUpdater.java V2_SUPPORTED_TYPES +// See https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-icebergcompatv2 for more requirements to support. 
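Editor's note: the ICEBERG_COMPAT_V2_INFO static directly below is a good example of how feature dependencies are declared as a FeatureRequirement list (ColumnMapping must be enabled, IcebergCompatV1 and DeletionVectors must not be enabled, plus a custom column-mapping-mode check). As a hedged sketch — not necessarily how the kernel's own validation is written — such a list could be evaluated like this, with is_supported and is_enabled as hypothetical callbacks over the protocol and table properties:

    /// Sketch only: walk a requirement list and fail on the first violated entry.
    fn check_requirements_sketch(
        requirements: &[FeatureRequirement],
        is_supported: impl Fn(&TableFeature) -> bool,
        is_enabled: impl Fn(&TableFeature) -> bool,
        protocol: &Protocol,
        properties: &TableProperties,
    ) -> DeltaResult<()> {
        for requirement in requirements {
            match requirement {
                FeatureRequirement::Supported(f) if !is_supported(f) => {
                    return Err(Error::generic(format!("feature {f} must be supported")));
                }
                FeatureRequirement::Enabled(f) if !is_enabled(f) => {
                    return Err(Error::generic(format!("feature {f} must be enabled")));
                }
                FeatureRequirement::NotSupported(f) if is_supported(f) => {
                    return Err(Error::generic(format!("feature {f} must not be supported")));
                }
                FeatureRequirement::NotEnabled(f) if is_enabled(f) => {
                    return Err(Error::generic(format!("feature {f} must not be enabled")));
                }
                // Custom checks receive the full protocol and properties.
                FeatureRequirement::Custom(check) => check(protocol, properties)?,
                // A satisfied requirement falls through.
                _ => {}
            }
        }
        Ok(())
    }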
+static ICEBERG_COMPAT_V2_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[ + FeatureRequirement::Enabled(TableFeature::ColumnMapping), + FeatureRequirement::Custom(|_protocol, properties| { + let mode = properties.column_mapping_mode; + if !matches!( + mode, + Some(ColumnMappingMode::Name) | Some(ColumnMappingMode::Id) + ) { + return Err(Error::generic( + "IcebergCompatV2 requires Column Mapping in 'name' or 'id' mode", + )); + } + Ok(()) + }), + FeatureRequirement::NotEnabled(TableFeature::IcebergCompatV1), + FeatureRequirement::NotEnabled(TableFeature::DeletionVectors), + ], + kernel_support: KernelSupport::NotSupported, + enablement_check: EnablementCheck::EnabledIf(|props| { + props.enable_iceberg_compat_v2 == Some(true) + }), +}; + +static CLUSTERED_TABLE_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[FeatureRequirement::Supported(TableFeature::DomainMetadata)], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static MATERIALIZE_PARTITION_COLUMNS_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::WriterOnly, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static CATALOG_MANAGED_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Custom(|_, _, op| match op { + Operation::Scan | Operation::Write => Ok(()), + Operation::Cdf => Err(Error::unsupported( + "Feature 'catalogManaged' is not supported for CDF", + )), + }), + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static CATALOG_OWNED_PREVIEW_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Custom(|_, _, op| match op { + Operation::Scan | Operation::Write => Ok(()), + Operation::Cdf => Err(Error::unsupported( + "Feature 'catalogOwned-preview' is not supported for CDF", + )), + }), + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static COLUMN_MAPPING_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: Some(MinReaderWriterVersion::new(2, 5)), + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::EnabledIf(|props| { + props.column_mapping_mode.is_some() + && props.column_mapping_mode != Some(ColumnMappingMode::None) + }), +}; + +static DELETION_VECTORS_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + // We support writing to tables with DeletionVectors enabled, but we never write DV files + // ourselves (no DML). The kernel only performs append operations. 
+ kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::EnabledIf(|props| { + props.enable_deletion_vectors == Some(true) + }), +}; + +static TIMESTAMP_WITHOUT_TIMEZONE_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static TYPE_WIDENING_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Custom(|_, _, op| match op { + Operation::Scan | Operation::Cdf => Ok(()), + Operation::Write => Err(Error::unsupported( + "Feature 'typeWidening' is not supported for writes", + )), + }), + enablement_check: EnablementCheck::EnabledIf(|props| props.enable_type_widening == Some(true)), +}; + +static TYPE_WIDENING_PREVIEW_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Custom(|_, _, op| match op { + Operation::Scan | Operation::Cdf => Ok(()), + Operation::Write => Err(Error::unsupported( + "Feature 'typeWidening-preview' is not supported for writes", + )), + }), + enablement_check: EnablementCheck::EnabledIf(|props| props.enable_type_widening == Some(true)), +}; + +static V2_CHECKPOINT_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static VACUUM_PROTOCOL_CHECK_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static VARIANT_TYPE_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static VARIANT_TYPE_PREVIEW_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +static VARIANT_SHREDDING_PREVIEW_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::ReaderWriter, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::Supported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +/// By definition, kernel cannot know how to handle unknown features and must assume they're always +/// enabled if supported in protocol. However, the read path ignores all writer-only features, +/// including unknown ones. Unknown features are never inferred from legacy protocol versions. 
+static UNKNOWN_FEATURE_INFO: FeatureInfo = FeatureInfo { + feature_type: FeatureType::Unknown, + min_legacy_version: None, + feature_requirements: &[], + kernel_support: KernelSupport::NotSupported, + enablement_check: EnablementCheck::AlwaysIfSupported, +}; + +impl TableFeature { + #[cfg(test)] + pub(crate) const NO_LIST: Option> = None; + #[cfg(test)] + pub(crate) const EMPTY_LIST: Vec = vec![]; + + pub(crate) fn feature_type(&self) -> FeatureType { + match self { + TableFeature::CatalogManaged + | TableFeature::CatalogOwnedPreview + | TableFeature::ColumnMapping + | TableFeature::DeletionVectors + | TableFeature::TimestampWithoutTimezone + | TableFeature::TypeWidening + | TableFeature::TypeWideningPreview + | TableFeature::V2Checkpoint + | TableFeature::VacuumProtocolCheck + | TableFeature::VariantType + | TableFeature::VariantTypePreview + | TableFeature::VariantShreddingPreview => FeatureType::ReaderWriter, + TableFeature::AppendOnly + | TableFeature::DomainMetadata + | TableFeature::Invariants + | TableFeature::RowTracking + | TableFeature::CheckConstraints + | TableFeature::ChangeDataFeed + | TableFeature::GeneratedColumns + | TableFeature::IdentityColumns + | TableFeature::InCommitTimestamp + | TableFeature::IcebergCompatV1 + | TableFeature::IcebergCompatV2 + | TableFeature::ClusteredTable + | TableFeature::MaterializePartitionColumns => FeatureType::WriterOnly, + TableFeature::Unknown(_) => FeatureType::Unknown, + } + } + + /// Returns true if this feature can be inferred from a legacy reader protocol version. + /// Always returns false for modern features (use feature lists instead). + pub(crate) fn is_valid_for_legacy_reader(&self, reader_version: i32) -> bool { + matches!(&self.info().min_legacy_version, Some(v) if reader_version >= v.reader) + } + + /// Returns true if this feature can be inferred from a legacy writer protocol version. + /// Always returns false for modern features (use feature lists instead). + pub(crate) fn is_valid_for_legacy_writer(&self, writer_version: i32) -> bool { + matches!(&self.info().min_legacy_version, Some(v) if writer_version >= v.writer) + } + + /// Returns rich metadata about this table feature including protocol version requirements, + /// dependencies, and kernel support status. 
+ pub(crate) fn info(&self) -> &FeatureInfo { + match self { + // Writer-only features + TableFeature::AppendOnly => &APPEND_ONLY_INFO, + TableFeature::Invariants => &INVARIANTS_INFO, + TableFeature::CheckConstraints => &CHECK_CONSTRAINTS_INFO, + TableFeature::ChangeDataFeed => &CHANGE_DATA_FEED_INFO, + TableFeature::GeneratedColumns => &GENERATED_COLUMNS_INFO, + TableFeature::IdentityColumns => &IDENTITY_COLUMNS_INFO, + TableFeature::InCommitTimestamp => &IN_COMMIT_TIMESTAMP_INFO, + TableFeature::RowTracking => &ROW_TRACKING_INFO, + TableFeature::DomainMetadata => &DOMAIN_METADATA_INFO, + TableFeature::IcebergCompatV1 => &ICEBERG_COMPAT_V1_INFO, + TableFeature::IcebergCompatV2 => &ICEBERG_COMPAT_V2_INFO, + TableFeature::ClusteredTable => &CLUSTERED_TABLE_INFO, + TableFeature::MaterializePartitionColumns => &MATERIALIZE_PARTITION_COLUMNS_INFO, + + // ReaderWriter features + TableFeature::CatalogManaged => &CATALOG_MANAGED_INFO, + TableFeature::CatalogOwnedPreview => &CATALOG_OWNED_PREVIEW_INFO, + TableFeature::ColumnMapping => &COLUMN_MAPPING_INFO, + TableFeature::DeletionVectors => &DELETION_VECTORS_INFO, + TableFeature::TimestampWithoutTimezone => &TIMESTAMP_WITHOUT_TIMEZONE_INFO, + TableFeature::TypeWidening => &TYPE_WIDENING_INFO, + TableFeature::TypeWideningPreview => &TYPE_WIDENING_PREVIEW_INFO, + TableFeature::V2Checkpoint => &V2_CHECKPOINT_INFO, + TableFeature::VacuumProtocolCheck => &VACUUM_PROTOCOL_CHECK_INFO, + TableFeature::VariantType => &VARIANT_TYPE_INFO, + TableFeature::VariantTypePreview => &VARIANT_TYPE_PREVIEW_INFO, + TableFeature::VariantShreddingPreview => &VARIANT_SHREDDING_PREVIEW_INFO, + + // Unknown features: not supported by kernel, no legacy version inference. + TableFeature::Unknown(_) => &UNKNOWN_FEATURE_INFO, + } } } -impl From for Scalar { - fn from(feature: ReaderFeature) -> Self { - Scalar::String(feature.to_string()) +impl ToDataType for TableFeature { + fn to_data_type() -> DataType { + DataType::STRING } } -impl From for Scalar { - fn from(feature: WriterFeature) -> Self { +impl From for Scalar { + fn from(feature: TableFeature) -> Self { Scalar::String(feature.to_string()) } } #[cfg(test)] // currently only used in tests -impl ReaderFeature { +impl TableFeature { pub(crate) fn unknown(s: impl ToString) -> Self { - ReaderFeature::Unknown(s.to_string()) + TableFeature::Unknown(s.to_string()) } } -#[cfg(test)] // currently only used in tests -impl WriterFeature { - pub(crate) fn unknown(s: impl ToString) -> Self { - WriterFeature::Unknown(s.to_string()) +/// Like `Into`, but avoids collisions between strum's derived `EnumString` and the +/// blanket impl `TryFrom<&str>` that `From<&str> for TableFeature` would trigger. +/// +/// Parsing is infallible: the `Unknown` default variant catches any unrecognized feature name. If +/// https://github.com/Peternator7/strum/pull/432 merges, use impl From for TableFeature instead. 
+pub(crate) trait IntoTableFeature { + fn into_table_feature(self) -> TableFeature; +} + +impl IntoTableFeature for TableFeature { + fn into_table_feature(self) -> TableFeature { + self } } -pub(crate) static SUPPORTED_READER_FEATURES: LazyLock> = LazyLock::new(|| { - vec![ - #[cfg(feature = "catalog-managed")] - ReaderFeature::CatalogManaged, - #[cfg(feature = "catalog-managed")] - ReaderFeature::CatalogOwnedPreview, - ReaderFeature::ColumnMapping, - ReaderFeature::DeletionVectors, - ReaderFeature::TimestampWithoutTimezone, - ReaderFeature::TypeWidening, - ReaderFeature::TypeWideningPreview, - ReaderFeature::VacuumProtocolCheck, - ReaderFeature::V2Checkpoint, - ReaderFeature::VariantType, - ReaderFeature::VariantTypePreview, - // The default engine currently DOES NOT support shredded Variant reads and the parquet - // reader will reject the read if it sees a shredded schema in the parquet file. That being - // said, kernel does permit reconstructing shredded variants into the - // `STRUCT` representation if parquet readers of - // third-party engines support it. - ReaderFeature::VariantShreddingPreview, - ] -}); - -/// The writer features have the following limitations: -/// - We 'support' Invariants only insofar as we check that they are not present. -/// - We support writing to tables that have Invariants enabled but not used. -/// - We only support DeletionVectors in that we never write them (no DML). -/// - We support writing to existing tables with row tracking, but we don't support creating -/// tables with row tracking yet. -pub(crate) static SUPPORTED_WRITER_FEATURES: LazyLock> = LazyLock::new(|| { - vec![ - WriterFeature::AppendOnly, - WriterFeature::DeletionVectors, - WriterFeature::DomainMetadata, - WriterFeature::InCommitTimestamp, - WriterFeature::Invariants, - WriterFeature::RowTracking, - WriterFeature::TimestampWithoutTimezone, - WriterFeature::VariantType, - WriterFeature::VariantTypePreview, - WriterFeature::VariantShreddingPreview, - ] -}); +impl IntoTableFeature for &TableFeature { + fn into_table_feature(self) -> TableFeature { + self.clone() + } +} + +/// Parsing is infallible thanks to `TableFeature::Unknown` default variant +impl IntoTableFeature for &str { + fn into_table_feature(self) -> TableFeature { + #[allow(clippy::unwrap_used)] // infallible, see strum parse_err_fn + self.parse().unwrap() + } +} + +impl IntoTableFeature for String { + fn into_table_feature(self) -> TableFeature { + self.as_str().into_table_feature() + } +} + +/// Formats a slice of table features using Delta's standard serialization (camelCase). 
+pub(crate) fn format_features(features: &[TableFeature]) -> String { + let feature_strings: Vec<&str> = features.iter().map(|f| f.as_ref()).collect_vec(); + format!("[{}]", feature_strings.join(", ")) +} #[cfg(test)] mod tests { @@ -250,14 +722,14 @@ mod tests { #[test] fn test_unknown_features() { let mixed_reader = &[ - ReaderFeature::DeletionVectors, - ReaderFeature::unknown("cool_feature"), - ReaderFeature::ColumnMapping, + TableFeature::DeletionVectors, + TableFeature::unknown("cool_feature"), + TableFeature::ColumnMapping, ]; let mixed_writer = &[ - WriterFeature::DeletionVectors, - WriterFeature::unknown("cool_feature"), - WriterFeature::AppendOnly, + TableFeature::DeletionVectors, + TableFeature::unknown("cool_feature"), + TableFeature::AppendOnly, ]; let reader_string = serde_json::to_string(mixed_reader).unwrap(); @@ -272,8 +744,8 @@ mod tests { "[\"deletionVectors\",\"cool_feature\",\"appendOnly\"]" ); - let typed_reader: Vec = serde_json::from_str(&reader_string).unwrap(); - let typed_writer: Vec = serde_json::from_str(&writer_string).unwrap(); + let typed_reader: Vec = serde_json::from_str(&reader_string).unwrap(); + let typed_writer: Vec = serde_json::from_str(&writer_string).unwrap(); assert_eq!(typed_reader.len(), 3); assert_eq!(&typed_reader, mixed_reader); @@ -282,86 +754,49 @@ mod tests { } #[test] - fn test_roundtrip_reader_features() { - let cases = [ - (ReaderFeature::CatalogManaged, "catalogManaged"), - (ReaderFeature::CatalogOwnedPreview, "catalogOwned-preview"), - (ReaderFeature::ColumnMapping, "columnMapping"), - (ReaderFeature::DeletionVectors, "deletionVectors"), - (ReaderFeature::TimestampWithoutTimezone, "timestampNtz"), - (ReaderFeature::TypeWidening, "typeWidening"), - (ReaderFeature::TypeWideningPreview, "typeWidening-preview"), - (ReaderFeature::V2Checkpoint, "v2Checkpoint"), - (ReaderFeature::VacuumProtocolCheck, "vacuumProtocolCheck"), - (ReaderFeature::VariantType, "variantType"), - (ReaderFeature::VariantTypePreview, "variantType-preview"), - ( - ReaderFeature::VariantShreddingPreview, - "variantShredding-preview", - ), - (ReaderFeature::unknown("something"), "something"), - ]; + fn test_roundtrip_table_features() { + use strum::IntoEnumIterator as _; - assert_eq!(ReaderFeature::COUNT, cases.len()); + for feature in TableFeature::iter() { + let expected = match feature { + TableFeature::AppendOnly => "appendOnly", + TableFeature::Invariants => "invariants", + TableFeature::CheckConstraints => "checkConstraints", + TableFeature::ChangeDataFeed => "changeDataFeed", + TableFeature::GeneratedColumns => "generatedColumns", + TableFeature::IdentityColumns => "identityColumns", + TableFeature::InCommitTimestamp => "inCommitTimestamp", + TableFeature::RowTracking => "rowTracking", + TableFeature::DomainMetadata => "domainMetadata", + TableFeature::IcebergCompatV1 => "icebergCompatV1", + TableFeature::IcebergCompatV2 => "icebergCompatV2", + TableFeature::ClusteredTable => "clustering", + TableFeature::MaterializePartitionColumns => "materializePartitionColumns", + TableFeature::CatalogManaged => "catalogManaged", + TableFeature::CatalogOwnedPreview => "catalogOwned-preview", + TableFeature::ColumnMapping => "columnMapping", + TableFeature::DeletionVectors => "deletionVectors", + TableFeature::TimestampWithoutTimezone => "timestampNtz", + TableFeature::TypeWidening => "typeWidening", + TableFeature::TypeWideningPreview => "typeWidening-preview", + TableFeature::V2Checkpoint => "v2Checkpoint", + TableFeature::VacuumProtocolCheck => "vacuumProtocolCheck", + 
TableFeature::VariantType => "variantType", + TableFeature::VariantTypePreview => "variantType-preview", + TableFeature::VariantShreddingPreview => "variantShredding-preview", + TableFeature::Unknown(_) => continue, // tested in test_unknown_features + }; - for (feature, expected) in cases { + // strum assert_eq!(feature.to_string(), expected); - let serialized = serde_json::to_string(&feature).unwrap(); - assert_eq!(serialized, format!("\"{expected}\"")); - - let deserialized: ReaderFeature = serde_json::from_str(&serialized).unwrap(); - assert_eq!(deserialized, feature); - - let from_str: ReaderFeature = expected.parse().unwrap(); - assert_eq!(from_str, feature); - } - } + assert_eq!(feature, expected.into_table_feature()); - #[test] - fn test_roundtrip_writer_features() { - let cases = [ - (WriterFeature::AppendOnly, "appendOnly"), - (WriterFeature::CatalogManaged, "catalogManaged"), - (WriterFeature::CatalogOwnedPreview, "catalogOwned-preview"), - (WriterFeature::Invariants, "invariants"), - (WriterFeature::CheckConstraints, "checkConstraints"), - (WriterFeature::ChangeDataFeed, "changeDataFeed"), - (WriterFeature::GeneratedColumns, "generatedColumns"), - (WriterFeature::ColumnMapping, "columnMapping"), - (WriterFeature::IdentityColumns, "identityColumns"), - (WriterFeature::InCommitTimestamp, "inCommitTimestamp"), - (WriterFeature::DeletionVectors, "deletionVectors"), - (WriterFeature::RowTracking, "rowTracking"), - (WriterFeature::TimestampWithoutTimezone, "timestampNtz"), - (WriterFeature::TypeWidening, "typeWidening"), - (WriterFeature::TypeWideningPreview, "typeWidening-preview"), - (WriterFeature::DomainMetadata, "domainMetadata"), - (WriterFeature::V2Checkpoint, "v2Checkpoint"), - (WriterFeature::IcebergCompatV1, "icebergCompatV1"), - (WriterFeature::IcebergCompatV2, "icebergCompatV2"), - (WriterFeature::VacuumProtocolCheck, "vacuumProtocolCheck"), - (WriterFeature::ClusteredTable, "clustering"), - (WriterFeature::VariantType, "variantType"), - (WriterFeature::VariantTypePreview, "variantType-preview"), - ( - WriterFeature::VariantShreddingPreview, - "variantShredding-preview", - ), - (WriterFeature::unknown("something"), "something"), - ]; - - assert_eq!(WriterFeature::COUNT, cases.len()); - - for (feature, expected) in cases { - assert_eq!(feature.to_string(), expected); + // json let serialized = serde_json::to_string(&feature).unwrap(); assert_eq!(serialized, format!("\"{expected}\"")); - let deserialized: WriterFeature = serde_json::from_str(&serialized).unwrap(); + let deserialized: TableFeature = serde_json::from_str(&serialized).unwrap(); assert_eq!(deserialized, feature); - - let from_str: WriterFeature = expected.parse().unwrap(); - assert_eq!(from_str, feature); } } } diff --git a/kernel/src/table_features/timestamp_ntz.rs b/kernel/src/table_features/timestamp_ntz.rs index 32c1488edc..0230bbe2af 100644 --- a/kernel/src/table_features/timestamp_ntz.rs +++ b/kernel/src/table_features/timestamp_ntz.rs @@ -1,8 +1,9 @@ //! 
Validation for TIMESTAMP_NTZ feature support -use super::{ReaderFeature, WriterFeature}; -use crate::actions::Protocol; -use crate::schema::{PrimitiveType, Schema, SchemaTransform}; +use super::TableFeature; +use crate::schema::{PrimitiveType, Schema}; +use crate::table_configuration::TableConfiguration; +use crate::transforms::SchemaTransform; use crate::utils::require; use crate::{DeltaResult, Error}; @@ -10,17 +11,11 @@ use std::borrow::Cow; /// Validates that if a table schema contains TIMESTAMP_NTZ columns, the table must have the /// TimestampWithoutTimezone feature in both reader and writer features. -pub(crate) fn validate_timestamp_ntz_feature_support( - schema: &Schema, - protocol: &Protocol, -) -> DeltaResult<()> { - if !protocol.has_reader_feature(&ReaderFeature::TimestampWithoutTimezone) - || !protocol.has_writer_feature(&WriterFeature::TimestampWithoutTimezone) - { - let mut uses_timestamp_ntz = UsesTimestampNtz(false); - let _ = uses_timestamp_ntz.transform_struct(schema); +pub(crate) fn validate_timestamp_ntz_feature_support(tc: &TableConfiguration) -> DeltaResult<()> { + let protocol = tc.protocol(); + if !protocol.has_table_feature(&TableFeature::TimestampWithoutTimezone) { require!( - !uses_timestamp_ntz.0, + !schema_contains_timestamp_ntz(&tc.logical_schema()), Error::unsupported( "Table contains TIMESTAMP_NTZ columns but does not have the required 'timestampNtz' feature in reader and writer features" ) @@ -29,7 +24,14 @@ pub(crate) fn validate_timestamp_ntz_feature_support( Ok(()) } -/// Schema visitor that checks if any column in the schema uses TIMESTAMP_NTZ type +/// Checks if any column in the schema (including nested structs, arrays, maps) uses +/// the TIMESTAMP_NTZ primitive type. +pub(crate) fn schema_contains_timestamp_ntz(schema: &Schema) -> bool { + let mut uses_timestamp_ntz = UsesTimestampNtz(false); + let _ = uses_timestamp_ntz.transform_struct(schema); + uses_timestamp_ntz.0 +} + struct UsesTimestampNtz(bool); impl<'a> SchemaTransform<'a> for UsesTimestampNtz { @@ -43,69 +45,22 @@ impl<'a> SchemaTransform<'a> for UsesTimestampNtz { #[cfg(test)] mod tests { - use super::*; use crate::actions::Protocol; use crate::schema::{DataType, PrimitiveType, StructField, StructType}; - use crate::table_features::{ReaderFeature, WriterFeature}; - use crate::utils::test_utils::assert_result_error_with_message; + use crate::table_features::TableFeature; + use crate::utils::test_utils::assert_schema_feature_validation; #[test] fn test_timestamp_ntz_feature_validation() { - let schema_with_timestamp_ntz = StructType::new_unchecked([ + let schema_with = StructType::new_unchecked([ StructField::new("id", DataType::INTEGER, false), StructField::new("ts", DataType::Primitive(PrimitiveType::TimestampNtz), true), ]); - - let schema_without_timestamp_ntz = StructType::new_unchecked([ + let schema_without = StructType::new_unchecked([ StructField::new("id", DataType::INTEGER, false), StructField::new("name", DataType::STRING, true), ]); - - // Protocol with TimestampWithoutTimezone features - let protocol_with_features = Protocol::try_new( - 3, - 7, - Some([ReaderFeature::TimestampWithoutTimezone]), - Some([WriterFeature::TimestampWithoutTimezone]), - ) - .unwrap(); - - // Protocol without TimestampWithoutTimezone features - let protocol_without_features = Protocol::try_new( - 3, - 7, - Some::>(vec![]), - Some::>(vec![]), - ) - .unwrap(); - - // Schema with TIMESTAMP_NTZ + Protocol with features = OK - validate_timestamp_ntz_feature_support(&schema_with_timestamp_ntz, 
&protocol_with_features) - .expect("Should succeed when features are present"); - - // Schema without TIMESTAMP_NTZ + Protocol without features = OK - validate_timestamp_ntz_feature_support( - &schema_without_timestamp_ntz, - &protocol_without_features, - ) - .expect("Should succeed when no TIMESTAMP_NTZ columns are present"); - - // Schema without TIMESTAMP_NTZ + Protocol with features = OK - validate_timestamp_ntz_feature_support( - &schema_without_timestamp_ntz, - &protocol_with_features, - ) - .expect("Should succeed when no TIMESTAMP_NTZ columns are present, even with features"); - - // Schema with TIMESTAMP_NTZ + Protocol without features = ERROR - let result = validate_timestamp_ntz_feature_support( - &schema_with_timestamp_ntz, - &protocol_without_features, - ); - assert_result_error_with_message(result, "Unsupported: Table contains TIMESTAMP_NTZ columns but does not have the required 'timestampNtz' feature in reader and writer features"); - - // Nested schema with TIMESTAMP_NTZ - let nested_schema_with_timestamp_ntz = StructType::new_unchecked([ + let nested_schema_with = StructType::new_unchecked([ StructField::new("id", DataType::INTEGER, false), StructField::new( "nested", @@ -117,11 +72,21 @@ mod tests { true, ), ]); - - let result = validate_timestamp_ntz_feature_support( - &nested_schema_with_timestamp_ntz, - &protocol_without_features, + let protocol_with = Protocol::try_new_modern( + [TableFeature::TimestampWithoutTimezone], + [TableFeature::TimestampWithoutTimezone], + ) + .unwrap(); + let protocol_without = + Protocol::try_new_modern(TableFeature::EMPTY_LIST, TableFeature::EMPTY_LIST).unwrap(); + + assert_schema_feature_validation( + &schema_with, + &schema_without, + &protocol_with, + &protocol_without, + &[&nested_schema_with], + "Table contains TIMESTAMP_NTZ columns but does not have the required 'timestampNtz' feature in reader and writer features", ); - assert_result_error_with_message(result, "Unsupported: Table contains TIMESTAMP_NTZ columns but does not have the required 'timestampNtz' feature in reader and writer features"); } } diff --git a/kernel/src/table_properties.rs b/kernel/src/table_properties.rs index bf5754c5ec..947ea5e7d7 100644 --- a/kernel/src/table_properties.rs +++ b/kernel/src/table_properties.rs @@ -23,6 +23,47 @@ use strum::EnumString; mod deserialize; pub use deserialize::ParseIntervalError; +/// Prefix for delta table properties (e.g., `delta.enableChangeDataFeed`, `delta.appendOnly`). 
+pub const DELTA_PROPERTY_PREFIX: &str = "delta."; + +// Table property key constants +pub(crate) const APPEND_ONLY: &str = "delta.appendOnly"; +pub(crate) const AUTO_COMPACT: &str = "delta.autoOptimize.autoCompact"; +pub(crate) const OPTIMIZE_WRITE: &str = "delta.autoOptimize.optimizeWrite"; +pub(crate) const CHECKPOINT_INTERVAL: &str = "delta.checkpointInterval"; +pub(crate) const CHECKPOINT_WRITE_STATS_AS_JSON: &str = "delta.checkpoint.writeStatsAsJson"; +pub(crate) const CHECKPOINT_WRITE_STATS_AS_STRUCT: &str = "delta.checkpoint.writeStatsAsStruct"; +pub(crate) const COLUMN_MAPPING_MODE: &str = "delta.columnMapping.mode"; +pub(crate) const COLUMN_MAPPING_MAX_COLUMN_ID: &str = "delta.columnMapping.maxColumnId"; +pub(crate) const DATA_SKIPPING_NUM_INDEXED_COLS: &str = "delta.dataSkippingNumIndexedCols"; +pub(crate) const DATA_SKIPPING_STATS_COLUMNS: &str = "delta.dataSkippingStatsColumns"; +pub(crate) const DELETED_FILE_RETENTION_DURATION: &str = "delta.deletedFileRetentionDuration"; +pub(crate) const ENABLE_CHANGE_DATA_FEED: &str = "delta.enableChangeDataFeed"; +pub(crate) const ENABLE_DELETION_VECTORS: &str = "delta.enableDeletionVectors"; +pub(crate) const ENABLE_TYPE_WIDENING: &str = "delta.enableTypeWidening"; +pub(crate) const ENABLE_ICEBERG_COMPAT_V1: &str = "delta.enableIcebergCompatV1"; +pub(crate) const ENABLE_ICEBERG_COMPAT_V2: &str = "delta.enableIcebergCompatV2"; +pub(crate) const ISOLATION_LEVEL: &str = "delta.isolationLevel"; +pub(crate) const LOG_RETENTION_DURATION: &str = "delta.logRetentionDuration"; +pub(crate) const ENABLE_EXPIRED_LOG_CLEANUP: &str = "delta.enableExpiredLogCleanup"; +pub(crate) const RANDOMIZE_FILE_PREFIXES: &str = "delta.randomizeFilePrefixes"; +pub(crate) const RANDOM_PREFIX_LENGTH: &str = "delta.randomPrefixLength"; +pub(crate) const SET_TRANSACTION_RETENTION_DURATION: &str = "delta.setTransactionRetentionDuration"; +pub(crate) const TARGET_FILE_SIZE: &str = "delta.targetFileSize"; +pub(crate) const TUNE_FILE_SIZES_FOR_REWRITES: &str = "delta.tuneFileSizesForRewrites"; +pub(crate) const CHECKPOINT_POLICY: &str = "delta.checkpointPolicy"; +pub(crate) const ENABLE_ROW_TRACKING: &str = "delta.enableRowTracking"; +pub(crate) const MATERIALIZED_ROW_ID_COLUMN_NAME: &str = + "delta.rowTracking.materializedRowIdColumnName"; +pub(crate) const MATERIALIZED_ROW_COMMIT_VERSION_COLUMN_NAME: &str = + "delta.rowTracking.materializedRowCommitVersionColumnName"; +pub(crate) const ROW_TRACKING_SUSPENDED: &str = "delta.rowTrackingSuspended"; +pub(crate) const ENABLE_IN_COMMIT_TIMESTAMPS: &str = "delta.enableInCommitTimestamps"; +pub(crate) const IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION: &str = + "delta.inCommitTimestampEnablementVersion"; +pub(crate) const IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP: &str = + "delta.inCommitTimestampEnablementTimestamp"; + /// Delta table properties. These are parsed from the 'configuration' map in the most recent /// 'Metadata' action of a table. /// @@ -89,6 +130,18 @@ pub struct TableProperties { /// true to enable deletion vectors and predictive I/O for updates. pub enable_deletion_vectors: Option, + /// Whether widening the type of an existing column or field is allowed, either manually using + /// ALTER TABLE CHANGE COLUMN or automatically if automatic schema evolution is enabled. + pub enable_type_widening: Option, + + /// Whether Iceberg compatibility V1 is enabled for this table. When enabled, Delta Lake + /// ensures compatibility with Apache Iceberg V1 table format. 
+ pub enable_iceberg_compat_v1: Option, + + /// Whether Iceberg compatibility V2 is enabled for this table. When enabled, Delta Lake + /// ensures compatibility with Apache Iceberg V2 table format. + pub enable_iceberg_compat_v2: Option, + /// The degree to which a transaction must be isolated from modifications made by concurrent /// transactions. /// @@ -167,12 +220,36 @@ pub struct TableProperties { pub unknown_properties: HashMap, } +impl TableProperties { + /// Returns whether to write file statistics as JSON in checkpoints. + /// Default: `true` per the Delta protocol. + pub fn should_write_stats_as_json(&self) -> bool { + self.checkpoint_write_stats_as_json.unwrap_or(true) + } + + /// Returns whether to write file statistics as parsed structs in checkpoints. + /// Default: `false` per the Delta protocol. + pub fn should_write_stats_as_struct(&self) -> bool { + self.checkpoint_write_stats_as_struct.unwrap_or(false) + } +} + +/// Default number of leaf columns to collect statistics on when `dataSkippingNumIndexedCols` +/// is not specified. +pub const DEFAULT_NUM_INDEXED_COLS: u64 = 32; + #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum DataSkippingNumIndexedCols { AllColumns, NumColumns(u64), } +impl Default for DataSkippingNumIndexedCols { + fn default() -> Self { + DataSkippingNumIndexedCols::NumColumns(DEFAULT_NUM_INDEXED_COLS) + } +} + impl TryFrom<&str> for DataSkippingNumIndexedCols { type Error = Error; @@ -234,12 +311,121 @@ mod tests { use crate::expressions::column_name; use std::collections::HashMap; + #[test] + fn test_property_key_constants() { + // Verify all property key constants have the correct string values. + // This also ensures coverage tools recognize these lines as exercised. + assert_eq!(APPEND_ONLY, "delta.appendOnly"); + assert_eq!(AUTO_COMPACT, "delta.autoOptimize.autoCompact"); + assert_eq!(OPTIMIZE_WRITE, "delta.autoOptimize.optimizeWrite"); + assert_eq!(CHECKPOINT_INTERVAL, "delta.checkpointInterval"); + assert_eq!( + CHECKPOINT_WRITE_STATS_AS_JSON, + "delta.checkpoint.writeStatsAsJson" + ); + assert_eq!( + CHECKPOINT_WRITE_STATS_AS_STRUCT, + "delta.checkpoint.writeStatsAsStruct" + ); + assert_eq!(COLUMN_MAPPING_MODE, "delta.columnMapping.mode"); + assert_eq!( + DATA_SKIPPING_NUM_INDEXED_COLS, + "delta.dataSkippingNumIndexedCols" + ); + assert_eq!( + DATA_SKIPPING_STATS_COLUMNS, + "delta.dataSkippingStatsColumns" + ); + assert_eq!( + DELETED_FILE_RETENTION_DURATION, + "delta.deletedFileRetentionDuration" + ); + assert_eq!(ENABLE_CHANGE_DATA_FEED, "delta.enableChangeDataFeed"); + assert_eq!(ENABLE_DELETION_VECTORS, "delta.enableDeletionVectors"); + assert_eq!(ENABLE_TYPE_WIDENING, "delta.enableTypeWidening"); + assert_eq!(ENABLE_ICEBERG_COMPAT_V1, "delta.enableIcebergCompatV1"); + assert_eq!(ENABLE_ICEBERG_COMPAT_V2, "delta.enableIcebergCompatV2"); + assert_eq!(ISOLATION_LEVEL, "delta.isolationLevel"); + assert_eq!(LOG_RETENTION_DURATION, "delta.logRetentionDuration"); + assert_eq!(ENABLE_EXPIRED_LOG_CLEANUP, "delta.enableExpiredLogCleanup"); + assert_eq!(RANDOMIZE_FILE_PREFIXES, "delta.randomizeFilePrefixes"); + assert_eq!(RANDOM_PREFIX_LENGTH, "delta.randomPrefixLength"); + assert_eq!( + SET_TRANSACTION_RETENTION_DURATION, + "delta.setTransactionRetentionDuration" + ); + assert_eq!(TARGET_FILE_SIZE, "delta.targetFileSize"); + assert_eq!( + TUNE_FILE_SIZES_FOR_REWRITES, + "delta.tuneFileSizesForRewrites" + ); + assert_eq!(CHECKPOINT_POLICY, "delta.checkpointPolicy"); + assert_eq!(ENABLE_ROW_TRACKING, "delta.enableRowTracking"); + assert_eq!( + 
MATERIALIZED_ROW_ID_COLUMN_NAME, + "delta.rowTracking.materializedRowIdColumnName" + ); + assert_eq!( + MATERIALIZED_ROW_COMMIT_VERSION_COLUMN_NAME, + "delta.rowTracking.materializedRowCommitVersionColumnName" + ); + assert_eq!(ROW_TRACKING_SUSPENDED, "delta.rowTrackingSuspended"); + assert_eq!( + ENABLE_IN_COMMIT_TIMESTAMPS, + "delta.enableInCommitTimestamps" + ); + assert_eq!( + IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION, + "delta.inCommitTimestampEnablementVersion" + ); + assert_eq!( + IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP, + "delta.inCommitTimestampEnablementTimestamp" + ); + } + + #[test] + fn test_parse_type_widening() { + let properties = HashMap::from([(ENABLE_TYPE_WIDENING.to_string(), "true".to_string())]); + let table_properties = TableProperties::from(properties.iter()); + assert_eq!(table_properties.enable_type_widening, Some(true)); + + let properties = HashMap::from([(ENABLE_TYPE_WIDENING.to_string(), "false".to_string())]); + let table_properties = TableProperties::from(properties.iter()); + assert_eq!(table_properties.enable_type_widening, Some(false)); + } + + #[test] + fn test_parse_iceberg_compat_v1() { + let properties = + HashMap::from([(ENABLE_ICEBERG_COMPAT_V1.to_string(), "true".to_string())]); + let table_properties = TableProperties::from(properties.iter()); + assert_eq!(table_properties.enable_iceberg_compat_v1, Some(true)); + + let properties = + HashMap::from([(ENABLE_ICEBERG_COMPAT_V1.to_string(), "false".to_string())]); + let table_properties = TableProperties::from(properties.iter()); + assert_eq!(table_properties.enable_iceberg_compat_v1, Some(false)); + } + + #[test] + fn test_parse_iceberg_compat_v2() { + let properties = + HashMap::from([(ENABLE_ICEBERG_COMPAT_V2.to_string(), "true".to_string())]); + let table_properties = TableProperties::from(properties.iter()); + assert_eq!(table_properties.enable_iceberg_compat_v2, Some(true)); + + let properties = + HashMap::from([(ENABLE_ICEBERG_COMPAT_V2.to_string(), "false".to_string())]); + let table_properties = TableProperties::from(properties.iter()); + assert_eq!(table_properties.enable_iceberg_compat_v2, Some(false)); + } + #[test] fn known_key_unknown_val() { - let properties = HashMap::from([("delta.appendOnly".to_string(), "wack".to_string())]); + let properties = HashMap::from([(APPEND_ONLY.to_string(), "wack".to_string())]); let table_properties = TableProperties::from(properties.iter()); - let unknown_properties = - HashMap::from([("delta.appendOnly".to_string(), "wack".to_string())]); + let unknown_properties = HashMap::from([(APPEND_ONLY.to_string(), "wack".to_string())]); let expected = TableProperties { unknown_properties, ..Default::default() @@ -269,43 +455,40 @@ mod tests { #[test] fn test_parse_table_properties() { let properties = [ - ("delta.appendOnly", "true"), - ("delta.autoOptimize.optimizeWrite", "true"), - ("delta.autoOptimize.autoCompact", "true"), - ("delta.checkpointInterval", "101"), - ("delta.checkpoint.writeStatsAsJson", "true"), - ("delta.checkpoint.writeStatsAsStruct", "true"), - ("delta.columnMapping.mode", "id"), - ("delta.dataSkippingNumIndexedCols", "-1"), - ("delta.dataSkippingStatsColumns", "col1,col2"), - ("delta.deletedFileRetentionDuration", "interval 1 second"), - ("delta.enableChangeDataFeed", "true"), - ("delta.enableDeletionVectors", "true"), - ("delta.isolationLevel", "snapshotIsolation"), - ("delta.logRetentionDuration", "interval 2 seconds"), - ("delta.enableExpiredLogCleanup", "true"), - ("delta.randomizeFilePrefixes", "true"), - ("delta.randomPrefixLength", 
"1001"), - ( - "delta.setTransactionRetentionDuration", - "interval 60 seconds", - ), - ("delta.targetFileSize", "1000000000"), - ("delta.tuneFileSizesForRewrites", "true"), - ("delta.checkpointPolicy", "v2"), - ("delta.enableRowTracking", "true"), - ( - "delta.rowTracking.materializedRowIdColumnName", - "_row-id-col-some_uuid", - ), + (APPEND_ONLY, "true"), + (OPTIMIZE_WRITE, "true"), + (AUTO_COMPACT, "true"), + (CHECKPOINT_INTERVAL, "101"), + (CHECKPOINT_WRITE_STATS_AS_JSON, "true"), + (CHECKPOINT_WRITE_STATS_AS_STRUCT, "true"), + (COLUMN_MAPPING_MODE, "id"), + (DATA_SKIPPING_NUM_INDEXED_COLS, "-1"), + (DATA_SKIPPING_STATS_COLUMNS, "col1,col2"), + (DELETED_FILE_RETENTION_DURATION, "interval 1 second"), + (ENABLE_CHANGE_DATA_FEED, "true"), + (ENABLE_DELETION_VECTORS, "true"), + (ENABLE_TYPE_WIDENING, "true"), + (ENABLE_ICEBERG_COMPAT_V1, "true"), + (ENABLE_ICEBERG_COMPAT_V2, "true"), + (ISOLATION_LEVEL, "snapshotIsolation"), + (LOG_RETENTION_DURATION, "interval 2 seconds"), + (ENABLE_EXPIRED_LOG_CLEANUP, "true"), + (RANDOMIZE_FILE_PREFIXES, "true"), + (RANDOM_PREFIX_LENGTH, "1001"), + (SET_TRANSACTION_RETENTION_DURATION, "interval 60 seconds"), + (TARGET_FILE_SIZE, "1000000000"), + (TUNE_FILE_SIZES_FOR_REWRITES, "true"), + (CHECKPOINT_POLICY, "v2"), + (ENABLE_ROW_TRACKING, "true"), + (MATERIALIZED_ROW_ID_COLUMN_NAME, "_row-id-col-some_uuid"), ( - "delta.rowTracking.materializedRowCommitVersionColumnName", + MATERIALIZED_ROW_COMMIT_VERSION_COLUMN_NAME, "_row-commit-version-col-some_uuid", ), - ("delta.rowTrackingSuspended", "false"), - ("delta.enableInCommitTimestamps", "true"), - ("delta.inCommitTimestampEnablementVersion", "15"), - ("delta.inCommitTimestampEnablementTimestamp", "1612345678"), + (ROW_TRACKING_SUSPENDED, "false"), + (ENABLE_IN_COMMIT_TIMESTAMPS, "true"), + (IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION, "15"), + (IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP, "1612345678"), ]; let actual = TableProperties::from(properties.into_iter()); let expected = TableProperties { @@ -321,6 +504,9 @@ mod tests { deleted_file_retention_duration: Some(Duration::new(1, 0)), enable_change_data_feed: Some(true), enable_deletion_vectors: Some(true), + enable_type_widening: Some(true), + enable_iceberg_compat_v1: Some(true), + enable_iceberg_compat_v2: Some(true), isolation_level: Some(IsolationLevel::SnapshotIsolation), log_retention_duration: Some(Duration::new(2, 0)), enable_expired_log_cleanup: Some(true), diff --git a/kernel/src/table_properties/deserialize.rs b/kernel/src/table_properties/deserialize.rs index e91d816c88..bd4f38a03d 100644 --- a/kernel/src/table_properties/deserialize.rs +++ b/kernel/src/table_properties/deserialize.rs @@ -37,62 +37,61 @@ where // attempt to parse a key-value pair into a `TableProperties` struct. Returns Some(()) if the key // was successfully parsed, and None otherwise. fn try_parse(props: &mut TableProperties, k: &str, v: &str) -> Option<()> { + // Table property key constants are imported via `use super::*` at the top of this file. + // NOTE!! we do Some(parse(v)?) instead of just parse(v) because we want to return None if the // parsing fails. If we simply call 'parse(v)', then we would (incorrectly) return Some(()) and // just set the property to None. 
match k { - "delta.appendOnly" => props.append_only = Some(parse_bool(v)?), - "delta.autoOptimize.autoCompact" => props.auto_compact = Some(parse_bool(v)?), - "delta.autoOptimize.optimizeWrite" => props.optimize_write = Some(parse_bool(v)?), - "delta.checkpointInterval" => props.checkpoint_interval = Some(parse_positive_int(v)?), - "delta.checkpoint.writeStatsAsJson" => { + APPEND_ONLY => props.append_only = Some(parse_bool(v)?), + AUTO_COMPACT => props.auto_compact = Some(parse_bool(v)?), + OPTIMIZE_WRITE => props.optimize_write = Some(parse_bool(v)?), + CHECKPOINT_INTERVAL => props.checkpoint_interval = Some(parse_positive_int(v)?), + CHECKPOINT_WRITE_STATS_AS_JSON => { props.checkpoint_write_stats_as_json = Some(parse_bool(v)?) } - "delta.checkpoint.writeStatsAsStruct" => { + CHECKPOINT_WRITE_STATS_AS_STRUCT => { props.checkpoint_write_stats_as_struct = Some(parse_bool(v)?) } - "delta.columnMapping.mode" => { - props.column_mapping_mode = ColumnMappingMode::try_from(v).ok() - } - "delta.dataSkippingNumIndexedCols" => { + COLUMN_MAPPING_MODE => props.column_mapping_mode = ColumnMappingMode::try_from(v).ok(), + DATA_SKIPPING_NUM_INDEXED_COLS => { props.data_skipping_num_indexed_cols = DataSkippingNumIndexedCols::try_from(v).ok() } - "delta.dataSkippingStatsColumns" => { + DATA_SKIPPING_STATS_COLUMNS => { props.data_skipping_stats_columns = Some(parse_column_names(v)?) } - "delta.deletedFileRetentionDuration" => { + DELETED_FILE_RETENTION_DURATION => { props.deleted_file_retention_duration = Some(parse_interval(v)?) } - "delta.enableChangeDataFeed" => props.enable_change_data_feed = Some(parse_bool(v)?), - "delta.enableDeletionVectors" => props.enable_deletion_vectors = Some(parse_bool(v)?), - "delta.isolationLevel" => props.isolation_level = IsolationLevel::try_from(v).ok(), - "delta.logRetentionDuration" => props.log_retention_duration = Some(parse_interval(v)?), - "delta.enableExpiredLogCleanup" => props.enable_expired_log_cleanup = Some(parse_bool(v)?), - "delta.randomizeFilePrefixes" => props.randomize_file_prefixes = Some(parse_bool(v)?), - "delta.randomPrefixLength" => props.random_prefix_length = Some(parse_positive_int(v)?), - "delta.setTransactionRetentionDuration" => { + ENABLE_CHANGE_DATA_FEED => props.enable_change_data_feed = Some(parse_bool(v)?), + ENABLE_DELETION_VECTORS => props.enable_deletion_vectors = Some(parse_bool(v)?), + ENABLE_TYPE_WIDENING => props.enable_type_widening = Some(parse_bool(v)?), + ENABLE_ICEBERG_COMPAT_V1 => props.enable_iceberg_compat_v1 = Some(parse_bool(v)?), + ENABLE_ICEBERG_COMPAT_V2 => props.enable_iceberg_compat_v2 = Some(parse_bool(v)?), + ISOLATION_LEVEL => props.isolation_level = IsolationLevel::try_from(v).ok(), + LOG_RETENTION_DURATION => props.log_retention_duration = Some(parse_interval(v)?), + ENABLE_EXPIRED_LOG_CLEANUP => props.enable_expired_log_cleanup = Some(parse_bool(v)?), + RANDOMIZE_FILE_PREFIXES => props.randomize_file_prefixes = Some(parse_bool(v)?), + RANDOM_PREFIX_LENGTH => props.random_prefix_length = Some(parse_positive_int(v)?), + SET_TRANSACTION_RETENTION_DURATION => { props.set_transaction_retention_duration = Some(parse_interval(v)?) } - "delta.targetFileSize" => props.target_file_size = Some(parse_positive_int(v)?), - "delta.tuneFileSizesForRewrites" => { - props.tune_file_sizes_for_rewrites = Some(parse_bool(v)?) 
- } - "delta.checkpointPolicy" => props.checkpoint_policy = CheckpointPolicy::try_from(v).ok(), - "delta.enableRowTracking" => props.enable_row_tracking = Some(parse_bool(v)?), - "delta.rowTracking.materializedRowIdColumnName" => { + TARGET_FILE_SIZE => props.target_file_size = Some(parse_positive_int(v)?), + TUNE_FILE_SIZES_FOR_REWRITES => props.tune_file_sizes_for_rewrites = Some(parse_bool(v)?), + CHECKPOINT_POLICY => props.checkpoint_policy = CheckpointPolicy::try_from(v).ok(), + ENABLE_ROW_TRACKING => props.enable_row_tracking = Some(parse_bool(v)?), + MATERIALIZED_ROW_ID_COLUMN_NAME => { props.materialized_row_id_column_name = Some(v.to_string()) } - "delta.rowTracking.materializedRowCommitVersionColumnName" => { + MATERIALIZED_ROW_COMMIT_VERSION_COLUMN_NAME => { props.materialized_row_commit_version_column_name = Some(v.to_string()) } - "delta.rowTrackingSuspended" => props.row_tracking_suspended = Some(parse_bool(v)?), - "delta.enableInCommitTimestamps" => { - props.enable_in_commit_timestamps = Some(parse_bool(v)?) - } - "delta.inCommitTimestampEnablementVersion" => { + ROW_TRACKING_SUSPENDED => props.row_tracking_suspended = Some(parse_bool(v)?), + ENABLE_IN_COMMIT_TIMESTAMPS => props.enable_in_commit_timestamps = Some(parse_bool(v)?), + IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION => { props.in_commit_timestamp_enablement_version = Some(parse_non_negative(v)?) } - "delta.inCommitTimestampEnablementTimestamp" => { + IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP => { props.in_commit_timestamp_enablement_timestamp = Some(parse_non_negative(v)?) } _ => return None, diff --git a/kernel/src/transaction/builder/create_table.rs b/kernel/src/transaction/builder/create_table.rs new file mode 100644 index 0000000000..b0c385b771 --- /dev/null +++ b/kernel/src/transaction/builder/create_table.rs @@ -0,0 +1,1445 @@ +//! Builder for creating new Delta tables. +//! +//! This module contains [`CreateTableTransactionBuilder`], which validates and constructs a +//! [`CreateTableTransaction`] from user-provided schema, properties, and data layout options. +//! +//! Use [`create_table()`](super::super::create_table::create_table) as the entry point rather +//! than constructing the builder directly. 
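
Taken together, a rough end-to-end usage sketch of this builder might look like the following (editorial illustration: the `Committer` re-export path and the `Engine` import are assumed from this file's internal `use` statements, and the commit step itself is out of scope here):

```rust
use std::sync::Arc;

use delta_kernel::committer::Committer; // path assumed from `crate::committer::Committer`
use delta_kernel::schema::{DataType, StructField, StructType};
use delta_kernel::transaction::create_table::create_table;
use delta_kernel::transaction::data_layout::DataLayout;
use delta_kernel::{DeltaResult, Engine};

/// Hypothetical caller: the engine and committer come from whatever embeds the kernel.
fn create_events_table(engine: &dyn Engine, committer: Box<dyn Committer>) -> DeltaResult<()> {
    let schema = Arc::new(StructType::try_new(vec![
        StructField::new("id", DataType::LONG, false),
        StructField::new("date", DataType::STRING, false),
    ])?);

    // Validate schema, properties, and layout, and assemble the CREATE TABLE transaction.
    let txn = create_table("/tmp/events", schema, "MyApp/1.0")
        .with_table_properties([("delta.enableChangeDataFeed", "true")])
        .with_data_layout(DataLayout::partitioned(["date"]))
        .build(engine, committer)?;

    // Committing `txn` (not shown here) would write version 0 of the new table.
    let _ = txn;
    Ok(())
}
```
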
+ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use itertools::Itertools; +use url::Url; + +use crate::actions::{DomainMetadata, Metadata, Protocol}; +use crate::clustering::{create_clustering_domain_metadata, validate_clustering_columns}; +use crate::committer::Committer; +use crate::expressions::ColumnName; +use crate::log_segment::LogSegment; +use crate::schema::variant_utils::schema_contains_variant_type; +use crate::schema::{DataType, SchemaRef, StructType}; +use crate::snapshot::Snapshot; +use crate::table_configuration::TableConfiguration; +use crate::table_features::{ + assign_column_mapping_metadata, get_any_level_column_physical_name, + get_column_mapping_mode_from_properties, schema_contains_timestamp_ntz, ColumnMappingMode, + EnablementCheck, FeatureType, TableFeature, SET_TABLE_FEATURE_SUPPORTED_PREFIX, + SET_TABLE_FEATURE_SUPPORTED_VALUE, +}; +use crate::table_properties::{ + TableProperties, APPEND_ONLY, CHECKPOINT_WRITE_STATS_AS_JSON, CHECKPOINT_WRITE_STATS_AS_STRUCT, + COLUMN_MAPPING_MAX_COLUMN_ID, COLUMN_MAPPING_MODE, DELTA_PROPERTY_PREFIX, + ENABLE_CHANGE_DATA_FEED, ENABLE_DELETION_VECTORS, ENABLE_IN_COMMIT_TIMESTAMPS, + ENABLE_TYPE_WIDENING, SET_TRANSACTION_RETENTION_DURATION, +}; +use crate::transaction::create_table::CreateTableTransaction; +use crate::transaction::data_layout::DataLayout; +use crate::transaction::Transaction; +use crate::utils::{current_time_ms, try_parse_uri}; +use crate::{DeltaResult, Engine, Error, StorageHandler, PRE_COMMIT_VERSION}; + +/// Table features allowed to be enabled via `delta.feature.*=supported` during CREATE TABLE. +/// +/// Feature signals (`delta.feature.X=supported`) are validated against this list. +/// Only features in this list can be enabled via feature signals. +const ALLOWED_DELTA_FEATURES: &[TableFeature] = &[ + // DomainMetadata is required for clustering and other system domain operations + TableFeature::DomainMetadata, + // ColumnMapping enables column mapping (name/id mode) + TableFeature::ColumnMapping, + // InCommitTimestamp enables in-commit timestamps (writer-only) + TableFeature::InCommitTimestamp, + // VacuumProtocolCheck ensures consistent protocol checks during VACUUM + TableFeature::VacuumProtocolCheck, + // CatalogManaged enables catalog-managed table support + TableFeature::CatalogManaged, + // Note: Clustering is NOT included here. Users should not enable clustering via + // `delta.feature.clustering = supported`. Instead, clustering is enabled by + // specifying clustering columns via `with_data_layout()`. + TableFeature::DeletionVectors, + TableFeature::V2Checkpoint, + // Simple protocol-only features: enabling these only updates the protocol action. + // They can also be auto-enabled via their enablement properties (e.g. delta.appendOnly=true) + // through `maybe_auto_enable_property_driven_features`. + TableFeature::AppendOnly, + TableFeature::ChangeDataFeed, + TableFeature::TypeWidening, +]; + +/// Delta properties allowed to be set during CREATE TABLE. +/// +/// This list will expand as more features are supported. +/// The allow list will be deprecated once auto feature enablement is implemented +/// like the Java Kernel. 
+const ALLOWED_DELTA_PROPERTIES: &[&str] = &[ + // ColumnMapping mode property: triggers column mapping transform + COLUMN_MAPPING_MODE, + // InCommitTimestamp enablement property: triggers ICT auto-enablement + ENABLE_IN_COMMIT_TIMESTAMPS, + // Checkpoint stats format properties + CHECKPOINT_WRITE_STATS_AS_JSON, + CHECKPOINT_WRITE_STATS_AS_STRUCT, + // Property-driven feature enablement properties + ENABLE_DELETION_VECTORS, + ENABLE_CHANGE_DATA_FEED, + ENABLE_TYPE_WIDENING, + APPEND_ONLY, + // Set transaction retention duration: controls expiration of txn identifiers + SET_TRANSACTION_RETENTION_DURATION, +]; + +/// Ensures that no Delta table exists at the given path. +/// +/// This function checks the `_delta_log` directory to determine if a table already exists. +/// It handles various storage backend behaviors gracefully: +/// - If the directory doesn't exist (FileNotFound), returns Ok (new table can be created) +/// - If the directory exists but is empty, returns Ok (new table can be created) +/// - If the directory contains files, returns an error (table already exists) +/// - For other errors (permissions, network), propagates the error +/// +/// # Arguments +/// * `storage` - The storage handler to use for listing +/// * `delta_log_url` - URL to the `_delta_log` directory +/// * `table_path` - Original table path (for error messages) +fn ensure_table_does_not_exist( + storage: &dyn StorageHandler, + delta_log_url: &Url, + table_path: &str, +) -> DeltaResult<()> { + match storage.list_from(delta_log_url) { + Ok(mut files) => { + // files.next() returns Option> + // - Some(Ok(_)) means a file exists -> table exists + // - Some(Err(FileNotFound)) means path doesn't exist -> OK for new table + // - Some(Err(other)) means real error -> propagate + // - None means empty iterator -> OK for new table + match files.next() { + Some(Ok(_)) => Err(Error::generic(format!( + "Table already exists at path: {table_path}" + ))), + Some(Err(Error::FileNotFound(_))) | None => { + // Path doesn't exist or empty - OK for new table + Ok(()) + } + Some(Err(e)) => { + // Real error (permissions, network, etc.) - propagate + Err(e) + } + } + } + Err(Error::FileNotFound(_)) => { + // Directory doesn't exist - this is expected for a new table. + // The storage layer will create the full path (including _delta_log/) + // when the commit writes the first log file via write_json_file(). + Ok(()) + } + Err(e) => { + // Real error - propagate + Err(e) + } + } +} + +/// Result of validating and transforming table properties. +struct ValidatedTableProperties { + /// Table properties with feature signals removed (to be stored in metadata) + properties: HashMap, + /// Reader features extracted from feature signals (for ReaderWriter features) + reader_features: Vec, + /// Writer features extracted from feature signals (for all features) + writer_features: Vec, +} + +/// Adds a feature to the appropriate reader/writer feature lists based on its type. +/// +/// - ReaderWriter features are added to both reader and writer lists +/// - Writer and Unknown features are added only to the writer list +/// +/// This function is idempotent - it won't add duplicate features. 
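+/// For example, calling this twice with `TableFeature::ColumnMapping` (a ReaderWriter feature)
+/// leaves it listed exactly once in both the reader and writer lists, while a writer-only
+/// feature such as `TableFeature::DomainMetadata` only ever appears in the writer list.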
+fn add_feature_to_lists(
+    feature: TableFeature,
+    reader_features: &mut Vec<TableFeature>,
+    writer_features: &mut Vec<TableFeature>,
+) {
+    match feature.feature_type() {
+        FeatureType::ReaderWriter => {
+            if !reader_features.contains(&feature) {
+                reader_features.push(feature.clone());
+            }
+            if !writer_features.contains(&feature) {
+                writer_features.push(feature);
+            }
+        }
+        FeatureType::WriterOnly | FeatureType::Unknown => {
+            if !writer_features.contains(&feature) {
+                writer_features.push(feature);
+            }
+        }
+    }
+}
+
+/// Configures clustering support for table creation (used by unit tests).
+///
+/// Validates clustering columns, adds required features (DomainMetadata, ClusteredTable),
+/// and creates the domain metadata action.
+fn apply_clustering_for_table_create(
+    logical_schema: &SchemaRef,
+    logical_columns: &[ColumnName],
+    reader_features: &mut Vec<TableFeature>,
+    writer_features: &mut Vec<TableFeature>,
+) -> DeltaResult<DomainMetadata> {
+    validate_clustering_columns(logical_schema, logical_columns)?;
+
+    // Add required features
+    add_feature_to_lists(
+        TableFeature::DomainMetadata,
+        reader_features,
+        writer_features,
+    );
+    add_feature_to_lists(
+        TableFeature::ClusteredTable,
+        reader_features,
+        writer_features,
+    );
+
+    Ok(create_clustering_domain_metadata(logical_columns))
+}
+
+/// Result of applying data layout configuration during table creation.
+///
+/// Contains all outputs needed by `build()` from the data layout processing step.
+#[derive(Debug, Default)]
+struct DataLayoutResult {
+    /// Domain metadata actions (clustering stores `delta.clustering` domain metadata).
+    system_domain_metadata: Vec<DomainMetadata>,
+    /// Clustering columns for stats schema (physical names, `None` if not clustered).
+    clustering_columns: Option<Vec<ColumnName>>,
+    /// Partition columns (logical names, `None` if not partitioned).
+    partition_columns: Option<Vec<ColumnName>>,
+}
+
+/// Validates partition columns against the table schema.
+///
+/// Similar to [`validate_clustering_columns`] (duplicate check, schema lookup), but with
+/// stricter constraints: partition columns must be top-level and primitive-typed, while
+/// clustering columns may be nested and accept all stats-eligible types.
+///
+/// Partition columns must be:
+/// 1. Top-level columns (nested paths are not supported)
+/// 2. Present in the schema
+/// 3. Not duplicated
+/// 4. Of a primitive type (Struct, Array, Map are rejected because partition values
+///    must be representable as directory-path strings)
+/// 5.
A strict subset of the schema columns (at least one non-partition column required) +fn validate_partition_columns( + schema: &StructType, + partition_columns: &[ColumnName], +) -> DeltaResult<()> { + if partition_columns.is_empty() { + return Err(Error::generic("Partitioning requires at least one column")); + } + if partition_columns.len() >= schema.fields().len() { + return Err(Error::generic( + "Table must have at least one non-partition column", + )); + } + + let mut seen = HashSet::new(); + for col in partition_columns { + let path = col.path(); + if path.len() != 1 { + return Err(Error::generic(format!( + "Partition column '{}' must be a top-level column (nested paths are not supported)", + col + ))); + } + + if !seen.insert(col) { + return Err(Error::generic(format!( + "Duplicate partition column: '{col}'" + ))); + } + + // Safety: path.len() == 1 is enforced by the top-level check above + let col_name = &path[0]; + let field = schema.field(col_name).ok_or_else(|| { + Error::generic(format!("Partition column '{col}' not found in schema")) + })?; + + if !matches!(field.data_type(), DataType::Primitive(_)) { + return Err(Error::generic(format!( + "Partition column '{col}' has non-primitive type '{}'. \ + Partition columns must have primitive types.", + field.data_type() + ))); + } + } + Ok(()) +} + +/// Applies data layout configuration for table creation. +/// +/// Handles all [`DataLayout`] variants: +/// +/// - **None**: Returns defaults (no domain metadata, no clustering/partition columns). +/// - **Clustered**: Validates clustering columns, resolves to physical names, adds +/// `DomainMetadata` + `ClusteredTable` features, creates clustering domain metadata. +/// - **Partitioned**: Validates partition columns and stores logical names. No domain +/// metadata or special features are needed (partitioning is a core Delta feature). +fn apply_data_layout( + data_layout: &DataLayout, + effective_schema: &SchemaRef, + column_mapping_mode: ColumnMappingMode, + validated: &mut ValidatedTableProperties, +) -> DeltaResult { + match data_layout { + DataLayout::None => Ok(DataLayoutResult::default()), + + DataLayout::Clustered { columns } => { + validate_clustering_columns(effective_schema, columns)?; + + let physical_columns: Vec = columns + .iter() + .map(|c| { + get_any_level_column_physical_name(effective_schema, c, column_mapping_mode) + }) + .try_collect()?; + + add_feature_to_lists( + TableFeature::DomainMetadata, + &mut validated.reader_features, + &mut validated.writer_features, + ); + add_feature_to_lists( + TableFeature::ClusteredTable, + &mut validated.reader_features, + &mut validated.writer_features, + ); + + let dm = create_clustering_domain_metadata(&physical_columns); + + Ok(DataLayoutResult { + system_domain_metadata: vec![dm], + clustering_columns: Some(physical_columns), + partition_columns: None, + }) + } + + DataLayout::Partitioned { columns } => { + validate_partition_columns(effective_schema, columns)?; + + Ok(DataLayoutResult { + system_domain_metadata: vec![], + clustering_columns: None, + partition_columns: Some(columns.clone()), + }) + } + } +} + +/// Conditionally adds the `variantType` feature to the protocol when the schema contains Variant +/// columns anywhere in the schema tree (top-level, nested structs, arrays, maps). 
+fn maybe_enable_variant_type(schema: &SchemaRef, validated: &mut ValidatedTableProperties) { + if schema_contains_variant_type(schema) { + add_feature_to_lists( + TableFeature::VariantType, + &mut validated.reader_features, + &mut validated.writer_features, + ); + } +} + +/// Conditionally adds the `timestampNtz` feature to the protocol when the schema contains +/// TimestampNTZ columns anywhere in the schema tree (top-level, nested structs, arrays, maps). +fn maybe_enable_timestamp_ntz(schema: &SchemaRef, validated: &mut ValidatedTableProperties) { + if schema_contains_timestamp_ntz(schema) { + add_feature_to_lists( + TableFeature::TimestampWithoutTimezone, + &mut validated.reader_features, + &mut validated.writer_features, + ); + } +} + +/// Auto-enables allowed features whose [`EnablementCheck::EnabledIf`] check is satisfied by the +/// table properties. Features with [`EnablementCheck::AlwaysIfSupported`] are skipped since they +/// don't require property-driven enablement. +fn maybe_auto_enable_property_driven_features(validated: &mut ValidatedTableProperties) { + let table_properties = TableProperties::from(validated.properties.iter()); + for feature in ALLOWED_DELTA_FEATURES { + if let EnablementCheck::EnabledIf(check) = feature.info().enablement_check { + if check(&table_properties) { + add_feature_to_lists( + feature.clone(), + &mut validated.reader_features, + &mut validated.writer_features, + ); + } + } + } +} + +/// Ensures that `inCommitTimestamp` is enabled when `catalogManaged` is present. Adds the ICT +/// feature to the protocol and sets the enablement property if not already present. +fn maybe_enable_ict_for_catalog_managed( + validated: &mut ValidatedTableProperties, +) -> DeltaResult<()> { + let has_catalog_managed = validated + .writer_features + .contains(&TableFeature::CatalogManaged); + if has_catalog_managed { + if validated + .properties + .get(ENABLE_IN_COMMIT_TIMESTAMPS) + .is_some_and(|v| v != "true") + { + return Err(Error::generic(format!( + "Catalog-managed tables require '{ENABLE_IN_COMMIT_TIMESTAMPS}=true', \ + but it was explicitly set to '{}'", + validated.properties[ENABLE_IN_COMMIT_TIMESTAMPS] + ))); + } + add_feature_to_lists( + TableFeature::InCommitTimestamp, + &mut validated.reader_features, + &mut validated.writer_features, + ); + validated + .properties + .entry(ENABLE_IN_COMMIT_TIMESTAMPS.to_string()) + .or_insert_with(|| "true".to_string()); + } + Ok(()) +} + +/// Conditionally applies column mapping for table creation based on the mode in properties. +/// +/// If `delta.columnMapping.mode` is set to `name` or `id`, this function: +/// 1. Adds the ColumnMapping feature to the protocol +/// 2. Transforms the schema to assign IDs and physical names to all fields +/// 3. Sets `delta.columnMapping.maxColumnId` in properties +/// 4. Returns the transformed schema +/// +/// If mode is `none` or not set, returns the original schema unchanged. +/// +/// # Arguments +/// +/// * `schema` - The table schema to potentially transform +/// * `validated` - The validated table properties (may be modified to add maxColumnId) +/// +/// # Returns +/// +/// A tuple of (effective_schema, column_mapping_mode). 
+fn maybe_apply_column_mapping_for_table_create( + schema: &SchemaRef, + validated: &mut ValidatedTableProperties, +) -> DeltaResult<(SchemaRef, ColumnMappingMode)> { + let column_mapping_mode = get_column_mapping_mode_from_properties(&validated.properties)?; + + let effective_schema = match column_mapping_mode { + ColumnMappingMode::Name | ColumnMappingMode::Id => { + // Add ColumnMapping feature to protocol (it's a ReaderWriter feature) + add_feature_to_lists( + TableFeature::ColumnMapping, + &mut validated.reader_features, + &mut validated.writer_features, + ); + + // Transform schema: assign IDs and physical names to all fields + let mut max_id = 0i64; + let transformed_schema = assign_column_mapping_metadata(schema, &mut max_id)?; + + // Add maxColumnId to properties + validated + .properties + .insert(COLUMN_MAPPING_MAX_COLUMN_ID.to_string(), max_id.to_string()); + + Arc::new(transformed_schema) + } + ColumnMappingMode::None => schema.clone(), + }; + + Ok((effective_schema, column_mapping_mode)) +} + +/// Validates and transforms table properties for CREATE TABLE. +/// +/// This function: +/// 1. Validates feature signals (`delta.feature.*`) against `ALLOWED_DELTA_FEATURES` +/// 2. Validates delta properties (`delta.*`) against `ALLOWED_DELTA_PROPERTIES` +/// 3. Removes feature signals from properties (they shouldn't be stored in metadata) +/// 4. Extracts reader/writer features from validated feature signals +/// +/// Non-delta properties (user/application properties) are always allowed. +/// +/// Note: This function does not auto-set enablement properties. A feature signal like +/// `delta.feature.deletionVectors=supported` adds the feature to the protocol but does +/// not insert `delta.enableDeletionVectors=true` into the properties. Property-driven +/// auto-enablement is handled separately by [`maybe_auto_enable_property_driven_features`] +/// called after validation. +fn validate_extract_table_features_and_properties( + properties: HashMap, +) -> DeltaResult { + let mut reader_features = Vec::new(); + let mut writer_features = Vec::new(); + + // Partition properties into feature signals and regular properties + // Feature signals (delta.feature.X=supported) are processed but not stored in metadata + // Feature signals are removed from the properties map. + let (feature_signals, properties): (HashMap<_, _>, HashMap<_, _>) = properties + .into_iter() + .partition(|(k, _)| k.starts_with(SET_TABLE_FEATURE_SUPPORTED_PREFIX)); + + // Process and validate feature signals + for (key, value) in &feature_signals { + // Safe: we partitioned for keys starting with this prefix above + let Some(feature_name) = key.strip_prefix(SET_TABLE_FEATURE_SUPPORTED_PREFIX) else { + continue; + }; + + // Validate that the value is "supported" + if value != SET_TABLE_FEATURE_SUPPORTED_VALUE { + return Err(Error::generic(format!( + "Invalid value '{value}' for '{key}'. Only '{SET_TABLE_FEATURE_SUPPORTED_VALUE}' is allowed." 
+ ))); + } + + // Parse feature name to TableFeature (unknown features become TableFeature::Unknown) + let feature: TableFeature = feature_name + .parse() + .unwrap_or_else(|_| TableFeature::Unknown(feature_name.to_string())); + + if !ALLOWED_DELTA_FEATURES.contains(&feature) { + return Err(Error::generic(format!( + "Enabling feature '{feature_name}' via '{key}' is not supported during CREATE TABLE" + ))); + } + + // Add to appropriate feature lists based on feature type + add_feature_to_lists(feature, &mut reader_features, &mut writer_features); + } + + // Validate remaining delta.* properties against allow list + for key in properties.keys() { + if key.starts_with(DELTA_PROPERTY_PREFIX) + && !ALLOWED_DELTA_PROPERTIES.contains(&key.as_str()) + { + return Err(Error::generic(format!( + "Setting delta property '{key}' is not supported during CREATE TABLE" + ))); + } + } + + Ok(ValidatedTableProperties { + properties, + reader_features, + writer_features, + }) +} + +/// Builder for configuring a new Delta table. +/// +/// Use this to configure table properties before building a [`CreateTableTransaction`]. +/// If the table build fails, no transaction will be created. +/// +/// Created via [`create_table()`](super::super::create_table::create_table). +pub struct CreateTableTransactionBuilder { + path: String, + schema: SchemaRef, + engine_info: String, + table_properties: HashMap, + data_layout: DataLayout, +} + +impl CreateTableTransactionBuilder { + /// Creates a new CreateTableTransactionBuilder. + /// + /// This is typically called via + /// [`create_table()`](super::super::create_table::create_table) rather than directly. + pub fn new(path: impl AsRef, schema: SchemaRef, engine_info: impl Into) -> Self { + Self { + path: path.as_ref().to_string(), + schema, + engine_info: engine_info.into(), + table_properties: HashMap::new(), + data_layout: DataLayout::None, + } + } + + /// Sets table properties for the new Delta table. + /// + /// Custom application properties (those not starting with `delta.`) are always allowed. + /// Delta properties (`delta.*`) are validated against an allow list during [`build()`]. + /// Feature flags (`delta.feature.*`) are not supported during CREATE TABLE. + /// + /// This method can be called multiple times. If a property key already exists from a + /// previous call, the new value will overwrite the old one. + /// + /// # Arguments + /// + /// * `properties` - A map of table property names to their values + /// + /// # Example + /// + /// ```rust,no_run + /// # use delta_kernel::transaction::create_table::create_table; + /// # use delta_kernel::schema::{StructType, DataType, StructField}; + /// # use std::sync::Arc; + /// # fn example() -> delta_kernel::DeltaResult<()> { + /// # let schema = Arc::new(StructType::try_new(vec![StructField::new("id", DataType::INTEGER, false)])?); + /// let builder = create_table("/path/to/table", schema, "MyApp/1.0") + /// .with_table_properties([ + /// ("myapp.version", "1.0"), + /// ("myapp.author", "test"), + /// ]); + /// # Ok(()) + /// # } + /// ``` + /// + /// [`build()`]: CreateTableTransactionBuilder::build + pub fn with_table_properties(mut self, properties: I) -> Self + where + I: IntoIterator, + K: Into, + V: Into, + { + self.table_properties + .extend(properties.into_iter().map(|(k, v)| (k.into(), v.into()))); + self + } + + /// Sets the data layout for the new Delta table. 
+ /// + /// The data layout determines how data files are organized within the table: + /// + /// - [`DataLayout::None`]: No special organization (default) + /// - [`DataLayout::Clustered`]: Data files are optimized for queries on clustering columns + /// - [`DataLayout::Partitioned`]: Data files are organized into directories by partition + /// column values + /// + /// Partitioning and clustering are mutually exclusive. + /// + /// Calling this method multiple times replaces the previous layout. Only the last + /// `with_data_layout()` call takes effect. + /// + /// # Example + /// + /// ```rust,no_run + /// # use delta_kernel::transaction::create_table::create_table; + /// # use delta_kernel::transaction::data_layout::DataLayout; + /// # use delta_kernel::schema::{StructType, DataType, StructField}; + /// # use std::sync::Arc; + /// # fn example() -> delta_kernel::DeltaResult<()> { + /// # let schema = Arc::new(StructType::try_new(vec![ + /// # StructField::new("id", DataType::INTEGER, false), + /// # StructField::new("date", DataType::STRING, false), + /// # ])?); + /// // Clustered layout: + /// let builder = create_table("/path/to/table", schema.clone(), "MyApp/1.0") + /// .with_data_layout(DataLayout::clustered(["id"])); + /// + /// // Partitioned layout: + /// let builder = create_table("/path/to/table", schema, "MyApp/1.0") + /// .with_data_layout(DataLayout::partitioned(["date"])); + /// # Ok(()) + /// # } + /// ``` + pub fn with_data_layout(mut self, layout: DataLayout) -> Self { + self.data_layout = layout; + self + } + + /// Builds a [`CreateTableTransaction`] that can be committed to create the table. + /// + /// The returned [`CreateTableTransaction`] only exposes operations that are valid for + /// table creation. Operations like removing files, removing domain metadata, or updating + /// deletion vectors are not available, preventing misuse at compile time. 
+ /// + /// This method performs validation: + /// - Checks that the table path is valid + /// - Verifies the table doesn't already exist + /// - Validates the schema is non-empty + /// - Validates the data layout is valid + /// - Validates table properties against the allow list + /// + /// # Arguments + /// + /// * `engine` - The engine instance to use for validation + /// * `committer` - The committer to use for the transaction + /// + /// # Errors + /// + /// Returns an error if: + /// - The table path is invalid + /// - A table already exists at the given path + /// - The schema is empty + /// - The data layout is invalid + /// - Unsupported delta properties or feature flags are specified + pub fn build( + self, + engine: &dyn Engine, + committer: Box, + ) -> DeltaResult { + // Validate path + let table_url = try_parse_uri(&self.path)?; + + // Validate schema is non-empty + if self.schema.fields().len() == 0 { + return Err(Error::generic("Schema cannot be empty")); + } + // Check if table already exists by looking for _delta_log directory + let delta_log_url = table_url.join("_delta_log/")?; + let storage = engine.storage_handler(); + ensure_table_does_not_exist(storage.as_ref(), &delta_log_url, &self.path)?; + + // Validate and transform table properties + // - Extracts and validates feature signals + // - Removes feature signals from properties (they shouldn't be stored in metadata) + // - Returns reader/writer features to add to protocol + let mut validated = validate_extract_table_features_and_properties(self.table_properties)?; + + // Apply column mapping if mode is name or id (must happen BEFORE data layout) + let (effective_schema, column_mapping_mode) = + maybe_apply_column_mapping_for_table_create(&self.schema, &mut validated)?; + + // Validate data layout and resolve column names (physical for clustering, logical + // for partitioning). Adds required table features for clustering. + let data_layout_result = apply_data_layout( + &self.data_layout, + &effective_schema, + column_mapping_mode, + &mut validated, + )?; + + // Schema-driven auto-enablement: detect types that require a feature + maybe_enable_variant_type(&effective_schema, &mut validated); + maybe_enable_timestamp_ntz(&effective_schema, &mut validated); + + // Property-driven auto-enablement: check enablement properties + maybe_auto_enable_property_driven_features(&mut validated); + + // Auto-enable inCommitTimestamp for catalogManaged tables + maybe_enable_ict_for_catalog_managed(&mut validated)?; + + // Create Protocol action with table features support + let protocol = + Protocol::try_new_modern(validated.reader_features, validated.writer_features)?; + + // Create Metadata action with filtered properties (feature signals removed) + // Use effective_schema which includes column mapping annotations if enabled + // Partition columns are validated to be top-level, so each ColumnName has + // exactly one path component. Extract it with remove(0). 
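+        // For example, DataLayout::partitioned(["date", "region"]) arrives here as
+        // Some(vec![ColumnName(["date"]), ColumnName(["region"])]) and becomes
+        // vec!["date".to_string(), "region".to_string()] for Metadata::try_new below.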
+ let partition_columns: Vec = data_layout_result + .partition_columns + .map(|cols| cols.into_iter().map(|c| c.into_inner().remove(0)).collect()) + .unwrap_or_default(); + let metadata = Metadata::try_new( + None, // name + None, // description + effective_schema.clone(), + partition_columns, + current_time_ms()?, + validated.properties, + )?; + + // Create pre-commit snapshot from protocol/metadata + let log_root = table_url.join("_delta_log/")?; + let log_segment = LogSegment::for_pre_commit(log_root); + let table_configuration = + TableConfiguration::try_new(metadata, protocol, table_url, PRE_COMMIT_VERSION)?; + + // Create Transaction with pre-commit snapshot + Transaction::try_new_create_table( + Arc::new(Snapshot::new(log_segment, table_configuration)), + self.engine_info, + committer, + data_layout_result.system_domain_metadata, + data_layout_result.clustering_columns, + ) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + use crate::expressions::ColumnName; + use crate::schema::{DataType, StructField, StructType}; + use crate::table_features::FeatureType; + use crate::table_properties::ENABLE_ICEBERG_COMPAT_V1; + use crate::utils::test_utils::assert_result_error_with_message; + + fn test_schema() -> SchemaRef { + Arc::new(StructType::new_unchecked(vec![StructField::new( + "id", + DataType::INTEGER, + false, + )])) + } + + #[test] + fn test_basic_builder_creation() { + let schema = test_schema(); + let builder = + CreateTableTransactionBuilder::new("/path/to/table", schema.clone(), "TestApp/1.0"); + + assert_eq!(builder.path, "/path/to/table"); + assert_eq!(builder.engine_info, "TestApp/1.0"); + assert!(builder.table_properties.is_empty()); + } + + #[test] + fn test_nested_path_builder_creation() { + let schema = test_schema(); + let builder = CreateTableTransactionBuilder::new( + "/path/to/table/nested", + schema.clone(), + "TestApp/1.0", + ); + + assert_eq!(builder.path, "/path/to/table/nested"); + } + + #[test] + fn test_with_table_properties() { + let schema = test_schema(); + + let builder = CreateTableTransactionBuilder::new("/path/to/table", schema, "TestApp/1.0") + .with_table_properties([("key1", "value1")]); + + assert_eq!( + builder.table_properties.get("key1"), + Some(&"value1".to_string()) + ); + } + + #[test] + fn test_with_multiple_table_properties() { + let schema = test_schema(); + + let builder = CreateTableTransactionBuilder::new("/path/to/table", schema, "TestApp/1.0") + .with_table_properties([("key1", "value1")]) + .with_table_properties([("key2", "value2")]); + + assert_eq!( + builder.table_properties.get("key1"), + Some(&"value1".to_string()) + ); + assert_eq!( + builder.table_properties.get("key2"), + Some(&"value2".to_string()) + ); + } + + #[test] + fn test_validate_supported_properties() { + // Empty properties are allowed + let properties = HashMap::new(); + let result = validate_extract_table_features_and_properties(properties); + assert!(result.is_ok()); + let validated = result.unwrap(); + assert!(validated.properties.is_empty()); + assert!(validated.reader_features.is_empty()); + assert!(validated.writer_features.is_empty()); + + // User/application properties are allowed and preserved + let mut properties = HashMap::new(); + properties.insert("myapp.version".to_string(), "1.0".to_string()); + properties.insert("custom.setting".to_string(), "value".to_string()); + let result = validate_extract_table_features_and_properties(properties); + assert!(result.is_ok()); + let validated = result.unwrap(); + 
assert_eq!(validated.properties.len(), 2); + assert_eq!( + validated.properties.get("myapp.version"), + Some(&"1.0".to_string()) + ); + assert_eq!( + validated.properties.get("custom.setting"), + Some(&"value".to_string()) + ); + } + + #[test] + fn test_validate_unsupported_properties() { + // Delta properties not on allow list are rejected + let mut properties = HashMap::new(); + properties.insert(ENABLE_ICEBERG_COMPAT_V1.to_string(), "true".to_string()); + assert_result_error_with_message( + validate_extract_table_features_and_properties(properties), + "Setting delta property 'delta.enableIcebergCompatV1' is not supported", + ); + + // Feature signals for features not in ALLOWED_DELTA_FEATURES are rejected + let properties = HashMap::from([( + "delta.feature.identityColumns".to_string(), + "supported".to_string(), + )]); + assert_result_error_with_message( + validate_extract_table_features_and_properties(properties), + "Enabling feature 'identityColumns' via 'delta.feature.identityColumns' is not supported", + ); + + // Clustering feature signal is rejected - users must use with_clustering_columns() instead + let properties = HashMap::from([( + "delta.feature.clustering".to_string(), + "supported".to_string(), + )]); + assert_result_error_with_message( + validate_extract_table_features_and_properties(properties), + "Enabling feature 'clustering' via 'delta.feature.clustering' is not supported", + ); + + // Mixed properties with unsupported delta property are rejected + let mut properties = HashMap::new(); + properties.insert("myapp.version".to_string(), "1.0".to_string()); + properties.insert(ENABLE_ICEBERG_COMPAT_V1.to_string(), "true".to_string()); + assert_result_error_with_message( + validate_extract_table_features_and_properties(properties), + "Setting delta property 'delta.enableIcebergCompatV1' is not supported", + ); + } + + #[test] + fn test_clustering_support_valid() { + use crate::clustering::CLUSTERING_DOMAIN_NAME; + use crate::expressions::ColumnName; + + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ])); + + let mut reader_features = vec![]; + let mut writer_features = vec![]; + + let dm = apply_clustering_for_table_create( + &schema, + &[ColumnName::new(["id"])], + &mut reader_features, + &mut writer_features, + ) + .unwrap(); + + assert_eq!(dm.domain(), CLUSTERING_DOMAIN_NAME); + assert!(writer_features.contains(&TableFeature::DomainMetadata)); + assert!(writer_features.contains(&TableFeature::ClusteredTable)); + // DomainMetadata is a writer-only feature, ClusteredTable is also writer-only + // So reader_features should be empty + assert!(reader_features.is_empty()); + } + + #[test] + fn test_clustering_support_multiple_columns() { + use crate::expressions::ColumnName; + + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("date", DataType::STRING, true), + StructField::new("region", DataType::STRING, true), + ])); + + let mut reader_features = vec![]; + let mut writer_features = vec![]; + + let dm = apply_clustering_for_table_create( + &schema, + &[ColumnName::new(["id"]), ColumnName::new(["date"])], + &mut reader_features, + &mut writer_features, + ) + .unwrap(); + + // Verify domain metadata contains both columns with correct names + let config: serde_json::Value = serde_json::from_str(dm.configuration()).unwrap(); + let clustering_cols = 
config["clusteringColumns"].as_array().unwrap(); + assert_eq!(clustering_cols.len(), 2); + assert_eq!(clustering_cols[0], serde_json::json!(["id"])); + assert_eq!(clustering_cols[1], serde_json::json!(["date"])); + } + + #[test] + fn test_clustering_column_not_in_schema() { + use crate::expressions::ColumnName; + + let schema = Arc::new(StructType::new_unchecked(vec![StructField::new( + "id", + DataType::INTEGER, + false, + )])); + + let mut reader_features = vec![]; + let mut writer_features = vec![]; + + let result = apply_clustering_for_table_create( + &schema, + &[ColumnName::new(["nonexistent"])], + &mut reader_features, + &mut writer_features, + ); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("not found in schema")); + } + + #[test] + fn test_clustering_nested_column_accepted() { + use crate::clustering::CLUSTERING_DOMAIN_NAME; + use crate::expressions::ColumnName; + + let address_struct = StructType::new_unchecked(vec![ + StructField::new("city", DataType::STRING, true), + StructField::new("zip", DataType::STRING, true), + ]); + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("address", DataType::Struct(Box::new(address_struct)), true), + ])); + + let mut reader_features = vec![]; + let mut writer_features = vec![]; + + let nested_col = ColumnName::new(["address", "city"]); + let dm = apply_clustering_for_table_create( + &schema, + &[nested_col], + &mut reader_features, + &mut writer_features, + ) + .unwrap(); + + assert_eq!(dm.domain(), CLUSTERING_DOMAIN_NAME); + assert!(writer_features.contains(&TableFeature::ClusteredTable)); + } + + #[rstest::rstest] + #[case::clustered(DataLayout::clustered(["id"]), true, false)] + #[case::partitioned(DataLayout::partitioned(["id"]), false, true)] + #[case::none(DataLayout::default(), false, false)] + fn test_with_data_layout( + #[case] layout: DataLayout, + #[case] expect_clustered: bool, + #[case] expect_partitioned: bool, + ) { + let schema = test_schema(); + + let builder = CreateTableTransactionBuilder::new("/path/to/table", schema, "TestApp/1.0") + .with_data_layout(layout); + + assert_eq!(builder.data_layout.is_clustered(), expect_clustered); + assert_eq!(builder.data_layout.is_partitioned(), expect_partitioned); + } + + #[rstest::rstest] + #[case::variant_top_level( + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("v", DataType::unshredded_variant(), true), + ])), + &[TableFeature::VariantType], + )] + #[case::variant_nested( + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new( + "nested", + DataType::Struct(Box::new(StructType::new_unchecked(vec![ + StructField::new("inner_v", DataType::unshredded_variant(), true), + ]))), + true, + ), + ])), + &[TableFeature::VariantType], + )] + #[case::ntz_top_level( + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("ts", DataType::TIMESTAMP_NTZ, true), + ])), + &[TableFeature::TimestampWithoutTimezone], + )] + #[case::ntz_nested( + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new( + "nested", + DataType::Struct(Box::new(StructType::new_unchecked(vec![ + StructField::new("inner_ts", DataType::TIMESTAMP_NTZ, true), + ]))), + true, + ), + ])), + &[TableFeature::TimestampWithoutTimezone], + )] + #[case::both_variant_and_ntz( + 
Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("v", DataType::unshredded_variant(), true), + StructField::new("ts", DataType::TIMESTAMP_NTZ, true), + ])), + &[TableFeature::VariantType, TableFeature::TimestampWithoutTimezone], + )] + #[case::no_special_types( + test_schema(), + &[], + )] + fn test_schema_driven_feature_auto_enablement( + #[case] schema: SchemaRef, + #[case] expected_features: &[TableFeature], + ) { + let mut validated = ValidatedTableProperties { + properties: HashMap::new(), + reader_features: vec![], + writer_features: vec![], + }; + + maybe_enable_variant_type(&schema, &mut validated); + maybe_enable_timestamp_ntz(&schema, &mut validated); + + for feature in expected_features { + assert!( + validated.reader_features.contains(feature), + "Expected {feature:?} in reader_features" + ); + assert!( + validated.writer_features.contains(feature), + "Expected {feature:?} in writer_features" + ); + } + assert_eq!( + validated.reader_features.len(), + expected_features.len(), + "Unexpected extra reader features: {:?}", + validated.reader_features + ); + assert_eq!( + validated.writer_features.len(), + expected_features.len(), + "Unexpected extra writer features: {:?}", + validated.writer_features + ); + } + + #[rstest::rstest] + #[case::property_true(&[("delta.enableInCommitTimestamps", "true")], true, true)] + #[case::property_false(&[("delta.enableInCommitTimestamps", "false")], false, true)] + #[case::property_absent(&[], false, false)] + #[case::feature_signal(&[("delta.feature.inCommitTimestamp", "supported")], true, false)] + fn test_ict_support_and_enablement( + #[case] properties: &[(&str, &str)], + #[case] expect_in_writer_features: bool, + #[case] expect_property_preserved: bool, + ) { + let properties: HashMap = properties + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + let mut validated = validate_extract_table_features_and_properties(properties).unwrap(); + + maybe_auto_enable_property_driven_features(&mut validated); + + assert_eq!( + validated + .writer_features + .contains(&TableFeature::InCommitTimestamp), + expect_in_writer_features, + ); + assert_eq!( + validated + .properties + .contains_key(ENABLE_IN_COMMIT_TIMESTAMPS), + expect_property_preserved, + ); + assert!( + validated.reader_features.is_empty(), + "InCommitTimestamp is writer-only, reader_features should always be empty" + ); + } + + #[rstest::rstest] + #[case::vacuum_protocol_check(TableFeature::VacuumProtocolCheck, "vacuumProtocolCheck")] + #[case::domain_metadata(TableFeature::DomainMetadata, "domainMetadata")] + #[case::column_mapping(TableFeature::ColumnMapping, "columnMapping")] + #[case::in_commit_timestamp(TableFeature::InCommitTimestamp, "inCommitTimestamp")] + #[case::deletion_vectors(TableFeature::DeletionVectors, "deletionVectors")] + #[case::v2_checkpoint(TableFeature::V2Checkpoint, "v2Checkpoint")] + #[case::append_only(TableFeature::AppendOnly, "appendOnly")] + #[case::change_data_feed(TableFeature::ChangeDataFeed, "changeDataFeed")] + #[case::type_widening(TableFeature::TypeWidening, "typeWidening")] + #[case::catalog_managed(TableFeature::CatalogManaged, "catalogManaged")] + fn test_feature_signal_accepted(#[case] feature: TableFeature, #[case] feature_name: &str) { + let key = format!("delta.feature.{feature_name}"); + let properties = HashMap::from([(key, "supported".to_string())]); + let validated = validate_extract_table_features_and_properties(properties).unwrap(); + + assert!( + 
validated.properties.is_empty(), + "Feature signal should be removed from properties" + ); + assert!( + validated.writer_features.contains(&feature), + "{feature:?} should be in writer_features" + ); + match feature.feature_type() { + FeatureType::ReaderWriter => assert!( + validated.reader_features.contains(&feature), + "{feature:?} is ReaderWriter but missing from reader_features" + ), + _ => assert!( + validated.reader_features.is_empty(), + "{feature:?} is WriterOnly but reader_features is not empty" + ), + } + } + + fn multi_column_schema() -> SchemaRef { + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + StructField::new("date", DataType::DATE, true), + ])) + } + + struct DataLayoutExpectation { + layout: DataLayout, + has_domain_metadata: bool, + has_clustering_columns: bool, + expected_partition_columns: Option>, + expected_writer_features: Vec, + } + + #[rstest::rstest] + #[case::none(DataLayoutExpectation { + layout: DataLayout::default(), + has_domain_metadata: false, + has_clustering_columns: false, + expected_partition_columns: None, + expected_writer_features: vec![], + })] + #[case::clustered(DataLayoutExpectation { + layout: DataLayout::clustered(["id"]), + has_domain_metadata: true, + has_clustering_columns: true, + expected_partition_columns: None, + expected_writer_features: vec![TableFeature::DomainMetadata, TableFeature::ClusteredTable], + })] + #[case::partitioned_single(DataLayoutExpectation { + layout: DataLayout::partitioned(["date"]), + has_domain_metadata: false, + has_clustering_columns: false, + expected_partition_columns: Some(vec![ColumnName::new(["date"])]), + expected_writer_features: vec![], + })] + #[case::partitioned_multiple(DataLayoutExpectation { + layout: DataLayout::partitioned(["id", "date"]), + has_domain_metadata: false, + has_clustering_columns: false, + expected_partition_columns: Some(vec![ColumnName::new(["id"]), ColumnName::new(["date"])]), + expected_writer_features: vec![], + })] + fn test_apply_data_layout(#[case] expectation: DataLayoutExpectation) { + let schema = multi_column_schema(); + let mut validated = ValidatedTableProperties { + properties: HashMap::new(), + reader_features: vec![], + writer_features: vec![], + }; + + let result = apply_data_layout( + &expectation.layout, + &schema, + ColumnMappingMode::None, + &mut validated, + ) + .unwrap(); + + assert_eq!( + !result.system_domain_metadata.is_empty(), + expectation.has_domain_metadata + ); + assert_eq!( + result.clustering_columns.is_some(), + expectation.has_clustering_columns + ); + assert_eq!( + result.partition_columns, + expectation.expected_partition_columns + ); + + for feature in &expectation.expected_writer_features { + assert!( + validated.writer_features.contains(feature), + "Expected {feature:?} in writer_features" + ); + } + } + + #[rstest::rstest] + #[case::clustered_invalid_col(DataLayout::clustered(["nonexistent"]), "not found in schema")] + #[case::partitioned_invalid_col(DataLayout::partitioned(["nonexistent"]), "not found in schema")] + #[case::partitioned_duplicate(DataLayout::partitioned(["id", "id"]), "Duplicate partition column")] + #[case::partitioned_empty(DataLayout::Partitioned { columns: vec![] }, "at least one column")] + #[case::partitioned_all_columns(DataLayout::partitioned(["id", "name", "date"]), "at least one non-partition column")] + fn test_apply_data_layout_validation_errors( + #[case] layout: DataLayout, + #[case] expected_error: &str, + ) { 
+ let schema = multi_column_schema(); + let mut validated = ValidatedTableProperties { + properties: HashMap::new(), + reader_features: vec![], + writer_features: vec![], + }; + + let result = apply_data_layout(&layout, &schema, ColumnMappingMode::None, &mut validated); + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains(expected_error), + "Expected error containing '{expected_error}'" + ); + } + + #[test] + fn test_validate_partition_columns_nested_rejected() { + let address_struct = + StructType::new_unchecked(vec![StructField::new("city", DataType::STRING, true)]); + let schema = StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("address", DataType::Struct(Box::new(address_struct)), true), + ]); + + let columns = vec![ColumnName::new(["address", "city"])]; + let result = validate_partition_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("must be a top-level column")); + } + + #[rstest::rstest] + #[case::struct_type( + "struct_col", + DataType::Struct(Box::new(StructType::new_unchecked(vec![ + StructField::new("inner", DataType::STRING, false), + ]))), + )] + #[case::array_type( + "array_col", + DataType::Array(Box::new(crate::schema::ArrayType::new(DataType::INTEGER, false))) + )] + #[case::map_type( + "map_col", + DataType::Map(Box::new(crate::schema::MapType::new( + DataType::STRING, + DataType::INTEGER, + false + ))) + )] + fn test_validate_partition_columns_complex_types_rejected( + #[case] col_name: &str, + #[case] data_type: DataType, + ) { + let schema = StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new(col_name, data_type, false), + ]); + let columns = vec![ColumnName::new([col_name])]; + let result = validate_partition_columns(&schema, &columns); + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("non-primitive type")); + } + + #[rstest::rstest] + #[case::integer(DataType::INTEGER)] + #[case::string(DataType::STRING)] + #[case::date(DataType::DATE)] + #[case::timestamp(DataType::TIMESTAMP)] + #[case::boolean(DataType::BOOLEAN)] + #[case::long(DataType::LONG)] + fn test_validate_partition_columns_primitive_types_accepted(#[case] data_type: DataType) { + let schema = StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("col", data_type, false), + ]); + let columns = vec![ColumnName::new(["col"])]; + assert!(validate_partition_columns(&schema, &columns).is_ok()); + } + + #[test] + fn test_catalog_managed_auto_enables_ict() { + let properties = HashMap::from([( + "delta.feature.catalogManaged".to_string(), + "supported".to_string(), + )]); + let mut validated = validate_extract_table_features_and_properties(properties).unwrap(); + maybe_auto_enable_property_driven_features(&mut validated); + maybe_enable_ict_for_catalog_managed(&mut validated).unwrap(); + + assert!( + validated + .writer_features + .contains(&TableFeature::InCommitTimestamp), + "ICT should be auto-added to writer_features" + ); + assert_eq!( + validated.properties.get(ENABLE_IN_COMMIT_TIMESTAMPS), + Some(&"true".to_string()), + "delta.enableInCommitTimestamps should be set to true" + ); + } + + #[test] + fn test_catalog_managed_with_ict_true_succeeds() { + let properties = HashMap::from([ + ( + "delta.feature.catalogManaged".to_string(), + "supported".to_string(), + ), + ( + "delta.enableInCommitTimestamps".to_string(), + 
"true".to_string(), + ), + ]); + let mut validated = validate_extract_table_features_and_properties(properties).unwrap(); + maybe_auto_enable_property_driven_features(&mut validated); + maybe_enable_ict_for_catalog_managed(&mut validated).unwrap(); + + assert!(validated + .writer_features + .contains(&TableFeature::InCommitTimestamp)); + assert_eq!( + validated.properties.get(ENABLE_IN_COMMIT_TIMESTAMPS), + Some(&"true".to_string()), + ); + } + + #[test] + fn test_catalog_managed_with_ict_false_fails() { + let properties = HashMap::from([ + ( + "delta.feature.catalogManaged".to_string(), + "supported".to_string(), + ), + ( + "delta.enableInCommitTimestamps".to_string(), + "false".to_string(), + ), + ]); + let mut validated = validate_extract_table_features_and_properties(properties).unwrap(); + maybe_auto_enable_property_driven_features(&mut validated); + let err = maybe_enable_ict_for_catalog_managed(&mut validated).unwrap_err(); + assert!( + err.to_string().contains("enableInCommitTimestamps"), + "expected ICT conflict error, got: {err}" + ); + } +} diff --git a/kernel/src/transaction/builder/mod.rs b/kernel/src/transaction/builder/mod.rs new file mode 100644 index 0000000000..c6785cdd8a --- /dev/null +++ b/kernel/src/transaction/builder/mod.rs @@ -0,0 +1,8 @@ +//! Builder modules for transaction construction. + +// Allow `pub` items in this module even though the module itself may be `pub(crate)`. +// The module visibility controls external access; items are `pub` for use within the crate +// and for tests. Also allow dead_code since these are used by integration tests. +#![allow(unreachable_pub, dead_code)] + +pub mod create_table; diff --git a/kernel/src/transaction/commit_info.rs b/kernel/src/transaction/commit_info.rs new file mode 100644 index 0000000000..36d880de8a --- /dev/null +++ b/kernel/src/transaction/commit_info.rs @@ -0,0 +1,502 @@ +use std::sync::Arc; + +use crate::actions::{get_log_commit_info_schema, CommitInfo, COMMIT_INFO_NAME}; +use crate::expressions::{MapData, Scalar, Transform}; +use crate::schema::{MapType, StructField, StructType, ToSchema}; +use crate::{DataType, Engine, EngineData, Error, Expression, ExpressionRef, IntoEngineData}; + +use super::Transaction; + +/// Builds a list of `(field_name, literal_expression)` pairs covering every [`CommitInfo`] +/// field. Field names match the camelCase schema names produced by the `ToSchema` derive macro. +/// The returned vec preserves CommitInfo schema field order, which callers rely on when +/// inserting kernel-only fields after the last engine field. 
+fn commit_info_literal_exprs(
+    commit_info: CommitInfo,
+) -> Result<Vec<(&'static str, ExpressionRef)>, Error> {
+    let op_params_map_type = MapType::new(DataType::STRING, DataType::STRING, true);
+    let literal_exprs = vec![
+        (
+            "timestamp",
+            Arc::new(Expression::literal(commit_info.timestamp)),
+        ),
+        (
+            "inCommitTimestamp",
+            Arc::new(Expression::literal(commit_info.in_commit_timestamp)),
+        ),
+        (
+            "operation",
+            Arc::new(Expression::literal(commit_info.operation)),
+        ),
+        (
+            "operationParameters",
+            Arc::new(Expression::literal(
+                match commit_info.operation_parameters {
+                    Some(map) => Scalar::Map(MapData::try_new(
+                        op_params_map_type,
+                        map.into_iter()
+                            .map(|(k, v)| (Scalar::String(k), Scalar::String(v))),
+                    )?),
+                    None => Scalar::Null(DataType::Map(Box::new(op_params_map_type))),
+                },
+            )),
+        ),
+        (
+            "kernelVersion",
+            Arc::new(Expression::literal(commit_info.kernel_version)),
+        ),
+        (
+            "isBlindAppend",
+            Arc::new(Expression::literal(commit_info.is_blind_append)),
+        ),
+        (
+            "engineInfo",
+            Arc::new(Expression::literal(commit_info.engine_info)),
+        ),
+        ("txnId", Arc::new(Expression::literal(commit_info.txn_id))),
+    ];
+    let expected_expr_len = CommitInfo::to_schema().fields().len();
+    if literal_exprs.len() != expected_expr_len {
+        return Err(Error::Generic(format!("expected commit_info_literal_exprs to return {expected_expr_len} expressions, but got {} expressions. \
+            If a CommitInfo field was added or removed, please update the Expression::literal entries in this function and the with_commit_info doc comment", literal_exprs.len())));
+    }
+    Ok(literal_exprs)
+}
+
+impl<S> Transaction<S> {
+    pub(super) fn generate_commit_info(
+        &self,
+        engine: &dyn Engine,
+        kernel_commit_info: CommitInfo,
+    ) -> Result<Box<dyn EngineData>, Error> {
+        match &self.engine_commit_info {
+            Some((engine_commit_info, engine_commit_info_schema)) => {
+                let kernel_schema = CommitInfo::to_schema();
+
+                // Step 1: Build output schema - all engine fields first, then any kernel-only
+                // fields that are not already present in the engine schema, appended at the end.
+                let output_fields: Vec<_> = engine_commit_info_schema
+                    .fields()
+                    .map(|field| kernel_schema.field(field.name()).unwrap_or(field))
+                    .cloned()
+                    .chain(
+                        kernel_schema
+                            .fields()
+                            .filter(|field| !engine_commit_info_schema.contains(field.name()))
+                            .cloned(),
+                    )
+                    .collect();
+
+                let output_schema = StructType::new_unchecked(output_fields);
+
+                // Step 2: Build literal expressions for each CommitInfo field.
+                let literal_exprs = commit_info_literal_exprs(kernel_commit_info)?;
+
+                // Step 3: Build Transform. Replacements must be registered before insertions so
+                // that for the last engine field (which may itself be replaced), exprs is ordered
+                // as [replace_expr, insert_exprs...]. The evaluator emits exprs in declaration
+                // order, so the replace value must come first.
+                let last_engine_field = engine_commit_info_schema.field_names().last().cloned();
+                let mut transform = Transform::new_top_level();
+
+                // First pass: replace fields that already exist in the engine schema.
+                for (field_name, expr_ref) in &literal_exprs {
+                    if engine_commit_info_schema.contains(*field_name) {
+                        transform = transform.with_replaced_field(*field_name, expr_ref.clone());
+                    }
+                }
+                // Second pass: append kernel-only fields after the last engine field.
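+                // Illustrative example (not from the spec): with an engine schema of
+                // [customApp, operation], the output is [customApp, operation (kernel value),
+                // timestamp, inCommitTimestamp, ...remaining kernel-only fields in CommitInfo
+                // schema order], since replaced fields keep their engine positions and
+                // kernel-only fields are appended after the last engine field.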
+ for (field_name, expr_ref) in &literal_exprs { + if !engine_commit_info_schema.contains(*field_name) { + transform = transform + .with_inserted_field(last_engine_field.as_deref(), expr_ref.clone()); + } + } + + // Step 4: Wrap the transform in a struct expression so the output matches the + // Delta log action format `{ "commitInfo": { merged fields... } }`, consistent + // with the None branch which uses `get_log_commit_info_schema()`. + let wrapped_expr = + Expression::struct_from([Arc::new(Expression::transform(transform))]); + let wrapped_schema = Arc::new(StructType::new_unchecked([StructField::nullable( + COMMIT_INFO_NAME, + output_schema, + )])); + let evaluator = engine.evaluation_handler().new_expression_evaluator( + engine_commit_info_schema.clone(), + Arc::new(wrapped_expr), + wrapped_schema.into(), + )?; + evaluator.evaluate(engine_commit_info.as_ref()) + } + None => { + kernel_commit_info.into_engine_data(get_log_commit_info_schema().clone(), engine) + } + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::actions::CommitInfo; + use crate::arrow::array::{ + Array, ArrayRef, BooleanArray, Int64Array, MapArray, MapBuilder, StringArray, + StringBuilder, StructArray, + }; + use crate::arrow::datatypes::{ + DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, + }; + use crate::arrow::record_batch::RecordBatch; + use crate::committer::FileSystemCommitter; + use crate::engine::arrow_conversion::TryIntoKernel; + use crate::engine::arrow_data::ArrowEngineData; + use crate::schema::{Schema, SchemaRef, StructField, StructType, ToSchema}; + use crate::transaction::Transaction; + use crate::utils::test_utils::load_test_table; + use crate::{DeltaResult, Engine, EngineData}; + + // ── build_commit_info tests ──────────────────────────────────────────────── + + /// Helper: create a kernel `CommitInfo` that mirrors what `Transaction::commit` produces. + fn make_kernel_commit_info() -> CommitInfo { + CommitInfo::new( + 1_700_000_000_000i64, + Some(134_000_000i64), + Some("WRITE".to_string()), + Some("test_engine/1.0".to_string()), + false, + ) + } + + /// Helper: build an Arrow RecordBatch + kernel SchemaRef for use as engine_commit_info. + fn make_engine_commit_info( + arrow_fields: Vec, + columns: Vec, + ) -> (Box, SchemaRef) { + let arrow_schema = ArrowSchema::new(arrow_fields); + #[cfg(all(feature = "arrow-56", not(feature = "arrow-57")))] + let kernel_schema: Schema = (&arrow_schema).try_into_kernel().unwrap(); + #[cfg(any(not(feature = "arrow-56"), feature = "arrow-57"))] + let kernel_schema: Schema = arrow_schema.as_ref().try_into_kernel().unwrap(); + let batch = + RecordBatch::try_new(Arc::new(arrow_schema), columns).expect("valid RecordBatch"); + ( + Box::new(ArrowEngineData::new(batch)), + Arc::new(kernel_schema), + ) + } + + /// Helper: extract the inner "commitInfo" StructArray from a top-level RecordBatch. + /// Both branches of `build_commit_info` produce `{ "commitInfo": { ... } }`. + fn commit_info_struct(result: &ArrowEngineData) -> &StructArray { + let batch = result.record_batch(); + assert_eq!( + batch.num_columns(), + 1, + "expected single 'commitInfo' column" + ); + assert_eq!(batch.schema().field(0).name(), "commitInfo"); + batch + .column(0) + .as_any() + .downcast_ref::() + .expect("commitInfo column should be a StructArray") + } + + /// Helper: pull a non-null string value from a named column in a StructArray. 
+    fn get_str<'a>(s: &'a StructArray, col: &str) -> &'a str {
+        s.column_by_name(col)
+            .unwrap_or_else(|| panic!("field '{col}' not found"))
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap_or_else(|| panic!("field '{col}' is not a StringArray"))
+            .value(0)
+    }
+
+    /// Helper: pull a non-null i64 value from a named column in a StructArray.
+    fn get_i64(s: &StructArray, col: &str) -> i64 {
+        s.column_by_name(col)
+            .unwrap_or_else(|| panic!("field '{col}' not found"))
+            .as_any()
+            .downcast_ref::<Int64Array>()
+            .unwrap_or_else(|| panic!("field '{col}' is not an Int64Array"))
+            .value(0)
+    }
+
+    /// Helper: pull the map value at row 0 from a named MapArray column in a StructArray.
+    /// Returns the key-value pairs as a StructArray.
+    fn get_map(s: &StructArray, col: &str) -> StructArray {
+        s.column_by_name(col)
+            .unwrap_or_else(|| panic!("field '{col}' not found"))
+            .as_any()
+            .downcast_ref::<MapArray>()
+            .unwrap_or_else(|| panic!("field '{col}' is not a MapArray"))
+            .value(0)
+    }
+
+    /// Helper: pull a non-null boolean value from a named column in a StructArray.
+    fn get_bool(s: &StructArray, col: &str) -> bool {
+        s.column_by_name(col)
+            .unwrap_or_else(|| panic!("field '{col}' not found"))
+            .as_any()
+            .downcast_ref::<BooleanArray>()
+            .unwrap_or_else(|| panic!("field '{col}' is not a BooleanArray"))
+            .value(0)
+    }
+
+    /// Create a transaction with the given engine_commit_info, using the shared test table.
+    fn make_txn(
+        engine_commit_info: Option<(Box<dyn EngineData>, SchemaRef)>,
+    ) -> DeltaResult<(Arc<dyn Engine>, Transaction)> {
+        let (engine, snapshot, _tempdir) = load_test_table("table-without-dv-small")?;
+        let mut txn = snapshot
+            .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?
+            .with_operation("WRITE".to_string());
+        if let Some((engine_commit_info_data, engine_commit_info_schema)) = engine_commit_info {
+            txn = txn.with_commit_info(engine_commit_info_data, engine_commit_info_schema);
+        }
+        Ok((engine, txn))
+    }
+
+    /// no engine_commit_info -- output is the kernel CommitInfo wrapped in a "commitInfo"
+    /// outer struct, matching the Delta log action format produced by `get_log_commit_info_schema`.
+    #[test]
+    fn test_build_commit_info_none_branch() -> DeltaResult<()> {
+        let (engine, txn) = make_txn(None)?;
+        let result = ArrowEngineData::try_from_engine_data(
+            txn.generate_commit_info(engine.as_ref(), make_kernel_commit_info())?,
+        )?;
+        let ci = commit_info_struct(&result);
+
+        let kernel_schema = CommitInfo::to_schema();
+        assert_eq!(ci.num_columns(), kernel_schema.fields().count());
+        assert_eq!(get_str(ci, "operation"), "WRITE");
+        assert!(!get_str(ci, "kernelVersion").is_empty());
+        assert!(!get_str(ci, "txnId").is_empty());
+        Ok(())
+    }
+
+    /// engine schema has fields that are fully disjoint from CommitInfo -- all CommitInfo
+    /// fields are appended after the engine-only fields, in CommitInfo schema order.
+    #[test]
+    fn test_build_commit_info_disjoint_schemas() -> DeltaResult<()> {
+        let (data, schema) = make_engine_commit_info(
+            vec![
+                ArrowField::new("customApp", ArrowDataType::Utf8, false),
+                ArrowField::new("customVersion", ArrowDataType::Int64, false),
+            ],
+            vec![
+                Arc::new(StringArray::from(vec!["myApp"])) as ArrayRef,
+                Arc::new(Int64Array::from(vec![42i64])) as ArrayRef,
+            ],
+        );
+        let (engine, txn) = make_txn(Some((data, schema)))?;
+
+        let result = ArrowEngineData::try_from_engine_data(
+            txn.generate_commit_info(engine.as_ref(), make_kernel_commit_info())?,
+        )?;
+        let commit_info = commit_info_struct(&result);
+
+        // All CommitInfo fields are appended -- total = 2 engine + 8 CommitInfo.
+ assert_eq!( + commit_info.num_columns(), + 2 + CommitInfo::to_schema().fields().count() + ); + + // Engine fields are first and their values pass through unchanged. + assert_eq!(commit_info.fields()[0].name(), "customApp"); + assert_eq!(commit_info.fields()[1].name(), "customVersion"); + assert_eq!(get_str(commit_info, "customApp"), "myApp"); + assert_eq!(get_i64(commit_info, "customVersion"), 42); + + assert_eq!(get_str(commit_info, "operation"), "WRITE"); + assert!(!get_str(commit_info, "kernelVersion").is_empty()); + assert!(get_map(commit_info, "operationParameters").len() == 0); + assert!(uuid::Uuid::parse_str(get_str(commit_info, "txnId")).is_ok()); + assert!(get_i64(commit_info, "timestamp") > 0); + assert_eq!(get_i64(commit_info, "inCommitTimestamp"), 134_000_000); + assert_eq!(get_str(commit_info, "engineInfo"), "test_engine/1.0"); + assert!(!get_bool(commit_info, "isBlindAppend")); + + Ok(()) + } + + /// engine schema contains every kernel's CommitInfo field. + /// All overlapping fields must be replaced by kernel values, no new fields added. + #[test] + fn test_build_commit_info_full_overlap() -> DeltaResult<()> { + let mut map_builder = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + map_builder.keys().append_value("stale_key"); + map_builder.values().append_value("stale_value"); + map_builder.append(true).unwrap(); + let stale_op_params = Arc::new(map_builder.finish()) as ArrayRef; + + let (data, schema) = make_engine_commit_info( + vec![ + ArrowField::new("timestamp", ArrowDataType::Int64, true), + ArrowField::new("inCommitTimestamp", ArrowDataType::Int64, true), + ArrowField::new("operation", ArrowDataType::Utf8, true), + ArrowField::new( + "operationParameters", + stale_op_params.data_type().clone(), + true, + ), + ArrowField::new("kernelVersion", ArrowDataType::Utf8, true), + ArrowField::new("isBlindAppend", ArrowDataType::Boolean, true), + ArrowField::new("engineInfo", ArrowDataType::Utf8, true), + ArrowField::new("txnId", ArrowDataType::Utf8, true), + ], + vec![ + Arc::new(Int64Array::from(vec![Some(0i64)])) as ArrayRef, + Arc::new(Int64Array::from(vec![None::])) as ArrayRef, + Arc::new(StringArray::from(vec!["STALE_OP"])) as ArrayRef, + stale_op_params, + Arc::new(StringArray::from(vec!["v0.0.0"])) as ArrayRef, + Arc::new(BooleanArray::from(vec![None::])) as ArrayRef, + Arc::new(StringArray::from(vec!["stale_engine"])) as ArrayRef, + Arc::new(StringArray::from(vec!["stale_txn"])) as ArrayRef, + ], + ); + let (engine, txn) = make_txn(Some((data, schema)))?; + + let result = ArrowEngineData::try_from_engine_data( + txn.generate_commit_info(engine.as_ref(), make_kernel_commit_info())?, + )?; + let commit_info = commit_info_struct(&result); + + // All 8 CommitInfo fields are present in the engine schema -- no fields appended. 
+ assert_eq!(commit_info.num_columns(), 8); + + assert_eq!(get_str(commit_info, "operation"), "WRITE"); + assert!(!get_str(commit_info, "kernelVersion").is_empty()); + assert_eq!(get_map(commit_info, "operationParameters").len(), 0); + assert!(uuid::Uuid::parse_str(get_str(commit_info, "txnId")).is_ok()); + assert!(get_i64(commit_info, "timestamp") > 0); + assert_eq!(get_i64(commit_info, "inCommitTimestamp"), 134_000_000); + assert_eq!(get_str(commit_info, "engineInfo"), "test_engine/1.0"); + assert!(!get_bool(commit_info, "isBlindAppend")); + + Ok(()) + } + + /// engine schema has partial overlap -- overlapping fields are replaced, engine-only + /// fields pass through, and remaining CommitInfo fields are appended after the last engine field. + #[test] + fn test_build_commit_info_partial_overlap() -> DeltaResult<()> { + let (data, schema) = make_engine_commit_info( + vec![ + ArrowField::new("timestamp", ArrowDataType::Int64, true), + ArrowField::new("operation", ArrowDataType::Utf8, true), + ArrowField::new("myCustomField", ArrowDataType::Utf8, false), + ], + vec![ + Arc::new(Int64Array::from(vec![Some(0i64)])) as ArrayRef, + Arc::new(StringArray::from(vec!["STALE_OP"])) as ArrayRef, + Arc::new(StringArray::from(vec!["keep_me"])) as ArrayRef, + ], + ); + let (engine, txn) = make_txn(Some((data, schema)))?; + + let result = ArrowEngineData::try_from_engine_data( + txn.generate_commit_info(engine.as_ref(), make_kernel_commit_info())?, + )?; + let ci = commit_info_struct(&result); + + // Engine-only field passes through unchanged. + assert_eq!(get_str(ci, "myCustomField"), "keep_me"); + + // Overlapping fields are replaced with kernel values. + assert_ne!(get_str(ci, "operation"), "STALE_OP"); + assert_eq!(get_str(ci, "operation"), "WRITE"); + + // Engine fields keep their original schema positions (first 3 columns). + assert_eq!(ci.fields()[0].name(), "timestamp"); + assert_eq!(ci.fields()[1].name(), "operation"); + assert_eq!(ci.fields()[2].name(), "myCustomField"); + + // Remaining CommitInfo fields (6 not in engine schema) are appended after myCustomField. + // Total = 3 engine fields + 6 kernel-only fields. + assert_eq!( + ci.num_columns(), + 3 + CommitInfo::to_schema().fields().count() - 2 + ); + Ok(()) + } + + /// engine schema has overlapping fields with different DataTypes than kernel expects. + /// Kernel replacement must win, so each output field has the kernel's type. + #[test] + fn test_build_commit_info_type_conflict_replaced_by_kernel() -> DeltaResult<()> { + let (data, schema) = make_engine_commit_info( + vec![ + ArrowField::new("timestamp", ArrowDataType::Utf8, true), + ArrowField::new("inCommitTimestamp", ArrowDataType::Utf8, true), + ArrowField::new("operation", ArrowDataType::Int64, true), + ArrowField::new("isBlindAppend", ArrowDataType::Utf8, true), + ArrowField::new("myCustomField", ArrowDataType::Utf8, false), + ], + vec![ + Arc::new(StringArray::from(vec!["not-a-timestamp"])) as ArrayRef, + Arc::new(StringArray::from(vec!["not-a-timestamp"])) as ArrayRef, + Arc::new(Int64Array::from(vec![0i64])) as ArrayRef, + Arc::new(StringArray::from(vec!["not-a-bool"])) as ArrayRef, + Arc::new(StringArray::from(vec!["keep_me"])) as ArrayRef, + ], + ); + let (engine, txn) = make_txn(Some((data, schema)))?; + + let result = ArrowEngineData::try_from_engine_data( + txn.generate_commit_info(engine.as_ref(), make_kernel_commit_info())?, + )?; + let ci = commit_info_struct(&result); + + // Each kernel-owned field has the kernel's type, not the engine's. 
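+        // This follows from Step 1 of generate_commit_info: overlapping names take the kernel
+        // field definition, and the Transform replaces the engine value with a kernel literal,
+        // so the engine-declared Arrow type never reaches the output.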
+ let field_type = |name: &str| { + ci.fields() + .iter() + .find(|f| f.name() == name) + .unwrap_or_else(|| panic!("field '{name}' must be present")) + .data_type() + .clone() + }; + assert_eq!(field_type("timestamp"), ArrowDataType::Int64); + assert_eq!(field_type("inCommitTimestamp"), ArrowDataType::Int64); + assert_eq!(field_type("operation"), ArrowDataType::Utf8); + assert_eq!(field_type("isBlindAppend"), ArrowDataType::Boolean); + + // Engine-only field passes through with its original type and value unchanged. + assert_eq!(field_type("myCustomField"), ArrowDataType::Utf8); + assert_eq!(get_str(ci, "myCustomField"), "keep_me"); + Ok(()) + } + + /// engine schema is empty -- all CommitInfo fields are prepended (which, with no engine + /// fields preceding them, is equivalent to producing the full CommitInfo schema). + #[test] + fn test_build_commit_info_empty_engine_schema() -> DeltaResult<()> { + // A 0-row, 0-column RecordBatch with an empty kernel schema. + let empty_batch = RecordBatch::new_empty(Arc::new(ArrowSchema::empty())); + let empty_schema = Arc::new(StructType::new_unchecked(Vec::::new())); + let (engine, txn) = make_txn(Some(( + Box::new(ArrowEngineData::new(empty_batch)), + empty_schema, + )))?; + + let result = ArrowEngineData::try_from_engine_data( + txn.generate_commit_info(engine.as_ref(), make_kernel_commit_info())?, + )?; + let ci = commit_info_struct(&result); + + // With no engine fields, the inner schema matches CommitInfo::to_schema(). + let kernel_schema = CommitInfo::to_schema(); + assert_eq!(ci.num_columns(), kernel_schema.fields().count()); + + // Column order matches CommitInfo schema field order. + for (i, field) in kernel_schema.fields().enumerate() { + assert_eq!(ci.fields()[i].name(), field.name()); + } + Ok(()) + } +} diff --git a/kernel/src/transaction/create_table.rs b/kernel/src/transaction/create_table.rs new file mode 100644 index 0000000000..f85e90bb35 --- /dev/null +++ b/kernel/src/transaction/create_table.rs @@ -0,0 +1,179 @@ +//! Create table transaction types and entry point (internal API). +//! +//! This module defines the [`CreateTableTransaction`] type alias and the [`create_table`] +//! entry point function. The builder logic lives in +//! [`builder::create_table`](super::builder::create_table). +//! +//! # Example +//! +//! ```rust,no_run +//! use delta_kernel::transaction::create_table::create_table; +//! use delta_kernel::schema::{StructType, StructField, DataType}; +//! use delta_kernel::committer::FileSystemCommitter; +//! use std::sync::Arc; +//! # use delta_kernel::Engine; +//! # fn example(engine: &dyn Engine) -> delta_kernel::DeltaResult<()> { +//! +//! let schema = Arc::new(StructType::try_new(vec![ +//! StructField::new("id", DataType::INTEGER, false), +//! ])?); +//! +//! let result = create_table("/path/to/table", schema, "MyApp/1.0") +//! .with_table_properties([("myapp.version", "1.0")]) +//! .build(engine, Box::new(FileSystemCommitter::new()))? +//! .commit(engine)?; +//! # Ok(()) +//! # } +//! ``` + +// Allow `pub` items in this module even though the module itself may be `pub(crate)`. +// The module visibility controls external access; items are `pub` for use within the crate +// and for tests. Also allow dead_code since these are used by integration tests. 
+#![allow(unreachable_pub, dead_code)] + +use std::marker::PhantomData; + +use crate::actions::DomainMetadata; +use crate::committer::Committer; +use crate::expressions::ColumnName; +use crate::schema::SchemaRef; +use crate::snapshot::SnapshotRef; +use crate::transaction::{CreateTable, Transaction}; +use crate::utils::current_time_ms; +use crate::DeltaResult; + +// Re-export the builder so callers can still access it from this module path. +pub use super::builder::create_table::CreateTableTransactionBuilder; + +/// A type alias for create-table transactions. +/// +/// This provides a restricted API surface that only exposes operations valid during table +/// creation. Operations like removing files, removing domain metadata, updating deletion +/// vectors, and setting blind append are not available at compile time. +/// +/// # Operations NOT available on create-table transactions +/// +/// - **`with_domain_metadata_removed()`** — Cannot remove domain metadata from a table +/// that doesn't exist yet. +/// - **`remove_files()`** — Cannot remove files from a table that has no files. +/// - **`with_blind_append()`** — Blind append semantics don't apply to table creation. +/// - **`update_deletion_vectors()`** — Deletion vectors require an existing table. +/// - **`with_transaction_id()`** — Transaction ID (app_id) tracking is for existing tables. +/// - **`with_operation()`** — The operation is fixed to `"CREATE TABLE"`. +/// +/// # Example +/// +/// ```rust,no_run +/// use delta_kernel::transaction::create_table::create_table; +/// use delta_kernel::schema::{StructType, StructField, DataType}; +/// use delta_kernel::committer::FileSystemCommitter; +/// use std::sync::Arc; +/// # use delta_kernel::Engine; +/// # fn example(engine: &dyn Engine) -> delta_kernel::DeltaResult<()> { +/// +/// let schema = Arc::new(StructType::try_new(vec![ +/// StructField::new("id", DataType::INTEGER, false), +/// ])?); +/// +/// let result = create_table("/path/to/table", schema, "MyApp/1.0") +/// .build(engine, Box::new(FileSystemCommitter::new()))? +/// .commit(engine)?; +/// # Ok(()) +/// # } +/// ``` +pub type CreateTableTransaction = Transaction; + +/// Creates a builder for creating a new Delta table. +/// +/// This function returns a [`CreateTableTransactionBuilder`] that can be configured with table +/// properties and other options before building a [`CreateTableTransaction`]. 
+///
+/// # Arguments
+///
+/// * `path` - The file system path where the Delta table will be created
+/// * `schema` - The schema for the new table
+/// * `engine_info` - Information about the engine creating the table (e.g., "MyApp/1.0")
+///
+/// # Example
+///
+/// ```no_run
+/// use std::sync::Arc;
+/// use delta_kernel::transaction::create_table::create_table;
+/// use delta_kernel::schema::{DataType, StructField, StructType};
+/// use delta_kernel::committer::FileSystemCommitter;
+/// use delta_kernel::engine::default::DefaultEngineBuilder;
+/// use delta_kernel::engine::default::storage::store_from_url;
+///
+/// # fn main() -> delta_kernel::DeltaResult<()> {
+/// let schema = Arc::new(StructType::new_unchecked(vec![
+///     StructField::new("id", DataType::INTEGER, false),
+///     StructField::new("name", DataType::STRING, true),
+/// ]));
+///
+/// let url = url::Url::parse("file:///tmp/my_table")?;
+/// let engine = DefaultEngineBuilder::new(store_from_url(&url)?).build();
+///
+/// let transaction = create_table("/tmp/my_table", schema, "MyApp/1.0")
+///     .build(&engine, Box::new(FileSystemCommitter::new()))?;
+///
+/// // Commit the transaction to create the table
+/// transaction.commit(&engine)?;
+/// # Ok(())
+/// # }
+/// ```
+pub fn create_table(
+    path: impl AsRef<str>,
+    schema: SchemaRef,
+    engine_info: impl Into<String>,
+) -> CreateTableTransactionBuilder {
+    CreateTableTransactionBuilder::new(path, schema, engine_info)
+}
+
+impl CreateTableTransaction {
+    /// Create a new transaction for creating a new table. This is used when the table doesn't
+    /// exist yet and we need to create it with Protocol and Metadata actions.
+    ///
+    /// The `pre_commit_snapshot` is a synthetic snapshot created from the protocol and metadata
+    /// that will be committed. It uses `PRE_COMMIT_VERSION` as a sentinel to indicate no
+    /// version exists yet on disk.
+    ///
+    /// This is typically called via `CreateTableTransactionBuilder::build()` rather than directly.
+    pub(crate) fn try_new_create_table(
+        pre_commit_snapshot: SnapshotRef,
+        engine_info: String,
+        committer: Box<dyn Committer>,
+        system_domain_metadata: Vec<DomainMetadata>,
+        clustering_columns: Option<Vec<ColumnName>>,
+    ) -> DeltaResult<Self> {
+        // TODO(sanuj) Today transactions expect a read snapshot to be passed in, and we pass
+        // in the pre_commit_snapshot for CREATE. To support other operations such as ALTERs
+        // there might be cleaner alternatives which can clearly disambiguate between a snapshot
+        // that was read vs. the effective snapshot we will use for the commit.
+        let span = tracing::info_span!(
+            "txn",
+            path = %pre_commit_snapshot.table_root(),
+            operation = "CREATE",
+        );
+
+        Ok(Transaction {
+            span,
+            read_snapshot: pre_commit_snapshot,
+            committer,
+            operation: Some("CREATE TABLE".to_string()),
+            engine_info: Some(engine_info),
+            add_files_metadata: vec![],
+            remove_files_metadata: vec![],
+            set_transactions: vec![],
+            commit_timestamp: current_time_ms()?,
+            user_domain_metadata_additions: vec![],
+            system_domain_metadata_additions: system_domain_metadata,
+            user_domain_removals: vec![],
+            data_change: true,
+            engine_commit_info: None,
+            is_blind_append: false,
+            dv_matched_files: vec![],
+            physical_clustering_columns: clustering_columns,
+            _state: PhantomData,
+        })
+    }
+}
diff --git a/kernel/src/transaction/data_layout.rs b/kernel/src/transaction/data_layout.rs
new file mode 100644
index 0000000000..99b8ff9e49
--- /dev/null
+++ b/kernel/src/transaction/data_layout.rs
@@ -0,0 +1,167 @@
+//! Data layout configuration for Delta tables.
+//!
This module defines [`DataLayout`] which specifies how data files are organized +//! within a Delta table. Supported layouts are: +//! +//! - **None**: No special organization (default) +//! - **Clustered**: Data files optimized for queries on clustering columns +//! - **Partitioned**: Data files organized into directories by partition column values + +// Allow unreachable_pub because this module is pub when internal-api is enabled +// but pub(crate) otherwise. The items need to be pub for the public API. +#![allow(unreachable_pub)] +#![allow(dead_code)] + +use crate::expressions::ColumnName; + +/// Data layout configuration for a Delta table. +/// +/// Determines how data files are organized within the table: +/// +/// - [`DataLayout::None`]: No special organization (default) +/// - [`DataLayout::Clustered`]: Data files optimized for queries on clustering columns +/// - [`DataLayout::Partitioned`]: Data files organized into directories by partition column values +/// +/// Partitioning and clustering are mutually exclusive -- only one variant can be active at a time. +#[derive(Debug, Clone, Default)] +pub enum DataLayout { + /// No special data organization (default). + #[default] + None, + + /// Data files optimized for queries on clustering columns. + /// Both top-level and nested columns are supported. Each column's leaf field must + /// have a stats-eligible primitive type. + Clustered { + /// Columns to cluster by (in order). + columns: Vec, + }, + + /// Data files organized into directories by partition column values. + /// Only top-level columns are supported. Partition column values are stored + /// in the directory path rather than in the data files themselves. + Partitioned { + /// Columns to partition by (in order). + columns: Vec, + }, +} + +impl DataLayout { + /// Create a clustered layout with the given top-level column names. + /// + /// Each string is treated as a single top-level column name. For nested columns, + /// construct the [`DataLayout::Clustered`] variant directly with multi-segment + /// [`ColumnName`] values. + /// + /// This method constructs the layout without validation. Full validation + /// (duplicates, schema compatibility, data types) is performed during + /// `CreateTableTransactionBuilder::build()` via `validate_clustering_columns()`. + /// + /// # Examples + /// + /// Top-level columns: + /// + /// ```ignore + /// let layout = DataLayout::clustered(["id", "timestamp"]); + /// ``` + /// + /// Nested columns (construct the variant directly): + /// + /// ```ignore + /// let layout = DataLayout::Clustered { + /// columns: vec![ColumnName::new(["user", "address", "city"])], + /// }; + /// ``` + pub fn clustered(columns: I) -> Self + where + I: IntoIterator, + S: AsRef, + { + let columns: Vec = columns + .into_iter() + .map(|s| ColumnName::new([s.as_ref()])) + .collect(); + + DataLayout::Clustered { columns } + } + + /// Create a partitioned layout with the given top-level column names. + /// + /// Each string is treated as a single top-level column name. Partition columns + /// must be top-level columns in the schema (nested columns are not supported). + /// + /// This method constructs the layout without validation. Full validation + /// (duplicates, schema compatibility, data types) is performed during + /// `CreateTableTransactionBuilder::build()` via `validate_partition_columns()`. 
+ /// + /// # Example + /// + /// ```ignore + /// let layout = DataLayout::partitioned(["year", "month"]); + /// ``` + pub fn partitioned(columns: I) -> Self + where + I: IntoIterator, + S: AsRef, + { + let columns: Vec = columns + .into_iter() + .map(|s| ColumnName::new([s.as_ref()])) + .collect(); + + DataLayout::Partitioned { columns } + } + + /// Returns true if this layout specifies clustering. + #[cfg(test)] + pub fn is_clustered(&self) -> bool { + matches!(self, DataLayout::Clustered { .. }) + } + + /// Returns true if this layout specifies partitioning. + #[cfg(test)] + pub fn is_partitioned(&self) -> bool { + matches!(self, DataLayout::Partitioned { .. }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[rstest::rstest] + #[case::clustered(DataLayout::clustered(["id"]), true, false)] + #[case::partitioned(DataLayout::partitioned(["id"]), false, true)] + #[case::none(DataLayout::default(), false, false)] + fn test_data_layout_predicates( + #[case] layout: DataLayout, + #[case] expect_clustered: bool, + #[case] expect_partitioned: bool, + ) { + assert_eq!(layout.is_clustered(), expect_clustered); + assert_eq!(layout.is_partitioned(), expect_partitioned); + } + + #[test] + fn test_clustered_layout_construction() { + let layout = DataLayout::clustered(["col1", "col2"]); + if let DataLayout::Clustered { columns } = layout { + assert_eq!(columns.len(), 2); + } else { + panic!("Expected Clustered variant"); + } + } + + #[test] + fn test_partitioned_layout_construction() { + let layout = DataLayout::partitioned(["year", "month"]); + if let DataLayout::Partitioned { columns } = layout { + assert_eq!(columns.len(), 2); + } else { + panic!("Expected Partitioned variant"); + } + } + + // Note: Validation tests (duplicates, schema compatibility, data types) are in + // clustering.rs and builder/create_table.rs since validation is performed at build time. +} diff --git a/kernel/src/transaction/domain_metadata.rs b/kernel/src/transaction/domain_metadata.rs new file mode 100644 index 0000000000..3d89cf269b --- /dev/null +++ b/kernel/src/transaction/domain_metadata.rs @@ -0,0 +1,243 @@ +use std::collections::HashSet; + +use crate::actions::{get_log_domain_metadata_schema, DomainMetadata, INTERNAL_DOMAIN_PREFIX}; +use crate::error::Error; +use crate::row_tracking::{RowTrackingDomainMetadata, ROW_TRACKING_DOMAIN_NAME}; +use crate::table_features::TableFeature; +use crate::{DeltaResult, Engine, IntoEngineData}; + +use super::{EngineDataResultIterator, Transaction}; + +impl Transaction { + /// Validate domain metadata operations for both create-table and existing-table transactions. 
+ /// + /// Enforces the following rules: + /// - DomainMetadata feature must be supported if any domain operations are present + /// - System domains (in system_domain_metadata_additions) must correspond to a known feature + /// - User domains cannot use the delta.* prefix (system-reserved) + /// - Domain removals are not allowed in create-table transactions + /// - No duplicate domains within a single transaction (across both user and system) + pub(super) fn validate_domain_metadata_operations(&self) -> DeltaResult<()> { + // Feature validation (applies to all transactions with domain operations) + let has_domain_ops = !self.system_domain_metadata_additions.is_empty() + || !self.user_domain_metadata_additions.is_empty() + || !self.user_domain_removals.is_empty(); + + // Early return if no domain operations to validate + if !has_domain_ops { + return Ok(()); + } + + if !self + .read_snapshot + .table_configuration() + .is_feature_supported(&TableFeature::DomainMetadata) + { + return Err(Error::unsupported( + "Domain metadata operations require writer version 7 and the 'domainMetadata' writer feature", + )); + } + + let is_create = self.is_create_table(); + let mut seen_domains = HashSet::with_capacity( + self.system_domain_metadata_additions.len() + + self.user_domain_metadata_additions.len() + + self.user_domain_removals.len(), + ); + + // Validate SYSTEM domain additions (from transforms, e.g., clustering) + // System domains are only populated during create-table + for dm in &self.system_domain_metadata_additions { + let domain = dm.domain(); + + // Validate the system domain corresponds to a known feature + self.validate_system_domain_feature(domain)?; + + // Check for duplicates + if !seen_domains.insert(domain) { + return Err(Error::generic(format!( + "Metadata for domain {domain} already specified in this transaction" + ))); + } + } + + // Validate USER domain additions (via with_domain_metadata API) + for dm in &self.user_domain_metadata_additions { + let domain = dm.domain(); + + // Users cannot add system domains via the public API + if domain.starts_with(INTERNAL_DOMAIN_PREFIX) { + return Err(Error::generic( + "Cannot modify domains that start with 'delta.' as those are system controlled", + )); + } + + // Check for duplicates (spans both system and user domains) + if !seen_domains.insert(domain) { + return Err(Error::generic(format!( + "Metadata for domain {domain} already specified in this transaction" + ))); + } + } + + // No removals allowed for create-table. + // Note: CreateTableTransaction does not expose with_domain_metadata_removed(), + // so this is a defensive check. See #1768. + if is_create && !self.user_domain_removals.is_empty() { + return Err(Error::unsupported( + "Domain metadata removals are not supported in create-table transactions", + )); + } + + // Validate domain removals (for non-create-table) + for domain in &self.user_domain_removals { + // Cannot remove system domains + if domain.starts_with(INTERNAL_DOMAIN_PREFIX) { + return Err(Error::generic( + "Cannot modify domains that start with 'delta.' as those are system controlled", + )); + } + + // Check for duplicates + if !seen_domains.insert(domain.as_str()) { + return Err(Error::generic(format!( + "Metadata for domain {domain} already specified in this transaction" + ))); + } + } + + Ok(()) + } + + /// Validate that a system domain corresponds to a known feature and that the feature is supported. + /// + /// This prevents arbitrary `delta.*` domains from being added during table creation. 
+ /// Each known system domain must have its corresponding feature enabled in the protocol. + fn validate_system_domain_feature(&self, domain: &str) -> DeltaResult<()> { + let table_config = self.read_snapshot.table_configuration(); + + // Map domain to its required feature + let required_feature = match domain { + ROW_TRACKING_DOMAIN_NAME => Some(TableFeature::RowTracking), + // Will be changed to a constant in a follow up clustering create table feature PR + "delta.clustering" => Some(TableFeature::ClusteredTable), + _ => { + return Err(Error::generic(format!( + "Unknown system domain '{domain}'. Only known system domains are allowed." + ))); + } + }; + + // If the domain requires a feature, validate it's supported + if let Some(feature) = required_feature { + if !table_config.is_feature_supported(&feature) { + return Err(Error::generic(format!( + "System domain '{domain}' requires the '{feature}' feature to be enabled" + ))); + } + } + + Ok(()) + } + + /// Generate removal actions for user domain metadata by scanning the log. + /// + /// This performs an expensive log replay operation to fetch the previous configuration + /// value for each domain being removed, as required by the Delta spec for tombstones. + /// Returns an empty vector if there are no domain removals. + pub(super) fn generate_user_domain_removal_actions( + &self, + engine: &dyn Engine, + ) -> DeltaResult> { + if self.user_domain_removals.is_empty() { + return Ok(vec![]); + } + + // Scan log to fetch existing configurations for tombstones. + // Pass the specific set of domains to remove so that log replay can terminate early + // once all target domains have been found, instead of replaying the entire log. + let domains: HashSet<&str> = self + .user_domain_removals + .iter() + .map(String::as_str) + .collect(); + let existing_domains = self + .read_snapshot + .get_domain_metadatas_internal(engine, Some(&domains))?; + + // Create removal tombstones with pre-image configurations + Ok(self + .user_domain_removals + .iter() + .filter_map(|domain| { + // If domain doesn't exist in the log, this is a no-op (filter it out) + existing_domains.get(domain).map(|existing| { + DomainMetadata::remove(domain.clone(), existing.configuration().to_owned()) + }) + }) + .collect()) + } + + /// Generate domain metadata actions with validation. Handle both user and system domains. + /// + /// Returns a tuple of `(action_iter, domain_metadata_vec)`. + /// - The action iterator contains EngineData to be written to the commit file (`00N.json`). + /// - The `Vec` is used to construct a [`CrcDelta`](crate::crc::CrcDelta), + /// which feeds the post-commit snapshot with the domain metadata written in this transaction + /// and powers CRC file writes. + /// + /// This function may perform an expensive log replay operation if there are any domain removals. + /// The log replay is required to fetch the previous configuration value for the domain to preserve + /// in removal tombstones as mandated by the Delta spec. + pub(super) fn generate_domain_metadata_actions<'a>( + &'a self, + engine: &'a dyn Engine, + row_tracking_high_watermark: Option, + ) -> DeltaResult<(EngineDataResultIterator<'a>, Vec)> { + let is_create = self.is_create_table(); + + // Validate domain operations (includes feature validation) + self.validate_domain_metadata_operations()?; + + // TODO(sanuj) Create-table must not have row tracking or removals + // Defensive. Needs to be updated when row tracking support is added. 
+ if is_create { + if row_tracking_high_watermark.is_some() { + return Err(Error::internal_error( + "CREATE TABLE cannot have row tracking domain metadata", + )); + } + // user_domain_removals already validated above, but be explicit + debug_assert!(self.user_domain_removals.is_empty()); + } + + // Generate removal actions (empty for create-table due to validation above) + let removal_actions = self.generate_user_domain_removal_actions(engine)?; + + // Generate row tracking domain action (None for create-table) + let row_tracking_domain_action = row_tracking_high_watermark + .map(DomainMetadata::try_from) + .transpose()? + .into_iter(); + + // Chain all domain actions: system domains, row tracking, user domains, removals + let dm_actions_vec: Vec = self + .system_domain_metadata_additions + .iter() + .cloned() + .chain(row_tracking_domain_action) + .chain(self.user_domain_metadata_additions.iter().cloned()) + .chain(removal_actions) + .collect(); + + let schema = get_log_domain_metadata_schema().clone(); + + let dm_actions_iter: Vec<_> = dm_actions_vec + .iter() + .cloned() + .map(|dm| dm.into_engine_data(schema.clone(), engine)) + .collect(); + + Ok((Box::new(dm_actions_iter.into_iter()), dm_actions_vec)) + } +} diff --git a/kernel/src/transaction/mod.rs b/kernel/src/transaction/mod.rs index f647ba0e80..2f86ccca31 100644 --- a/kernel/src/transaction/mod.rs +++ b/kernel/src/transaction/mod.rs @@ -1,31 +1,64 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::iter; +use std::marker::PhantomData; use std::ops::Deref; use std::sync::{Arc, LazyLock}; +use tracing::{info, instrument}; use url::Url; +use crate::actions::deletion_vector::DeletionVectorPath; use crate::actions::{ - as_log_add_schema, domain_metadata::scan_domain_metadatas, get_log_commit_info_schema, - get_log_domain_metadata_schema, get_log_txn_schema, CommitInfo, DomainMetadata, SetTransaction, + as_log_add_schema, get_commit_schema, get_log_remove_schema, get_log_txn_schema, CommitInfo, + DomainMetadata, Metadata, Protocol, SetTransaction, METADATA_NAME, PROTOCOL_NAME, }; -#[cfg(feature = "catalog-managed")] -use crate::committer::FileSystemCommitter; -use crate::committer::{CommitMetadata, CommitResponse, Committer}; +use crate::committer::{ + CommitMetadata, CommitProtocolMetadata, CommitResponse, CommitType, Committer, +}; +use crate::crc::{CrcDelta, FileStatsDelta}; use crate::engine_data::FilteredEngineData; use crate::error::Error; +use crate::expressions::ColumnName; use crate::expressions::{ArrayData, Transform, UnaryExpressionOp::ToJson}; -use crate::path::LogRoot; +use crate::path::{LogRoot, ParsedLogPath}; use crate::row_tracking::{RowTrackingDomainMetadata, RowTrackingVisitor}; -use crate::schema::{ArrayType, MapType, SchemaRef, StructField, StructType}; +use crate::scan::data_skipping::stats_schema::schema_with_all_fields_nullable; +use crate::scan::log_replay::{ + BASE_ROW_ID_NAME, DEFAULT_ROW_COMMIT_VERSION_NAME, FILE_CONSTANT_VALUES_NAME, TAGS_NAME, +}; +use crate::scan::scan_row_schema; +use crate::schema::{ArrayType, MapType, SchemaRef, StructField, StructType, StructTypeBuilder}; use crate::snapshot::SnapshotRef; -use crate::utils::current_time_ms; +use crate::table_features::{ColumnMappingMode, TableFeature}; +use crate::utils::require; +use crate::FileMeta; use crate::{ DataType, DeltaResult, Engine, EngineData, Expression, ExpressionRef, IntoEngineData, - RowVisitor, Version, + RowVisitor, Version, PRE_COMMIT_VERSION, }; use delta_kernel_derive::internal_api; 
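+// The builder, create_table, and data_layout modules are `pub` only when the `internal-api`
+// feature is enabled; otherwise they remain crate-private.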
+#[cfg(feature = "internal-api")] +pub mod builder; +#[cfg(not(feature = "internal-api"))] +pub(crate) mod builder; + +#[cfg(feature = "internal-api")] +pub mod create_table; +#[cfg(not(feature = "internal-api"))] +pub(crate) mod create_table; + +#[cfg(feature = "internal-api")] +pub mod data_layout; +#[cfg(not(feature = "internal-api"))] +pub(crate) mod data_layout; + +mod commit_info; +mod domain_metadata; +mod stats_verifier; +mod update; +use stats_verifier::StatsVerifier; + /// Type alias for an iterator of [`EngineData`] results. pub(crate) type EngineDataResultIterator<'a> = Box>> + Send + 'a>; @@ -52,16 +85,37 @@ pub(crate) fn mandatory_add_file_schema() -> &'static SchemaRef { &MANDATORY_ADD_FILE_SCHEMA } -/// The static instance referenced by [`add_files_schema`]. +/// The base schema for add file metadata, referenced by [`Transaction::add_files_schema`]. +/// +/// The `stats` field represents the minimum structure. The actual stats written by +/// [`DefaultEngine::write_parquet`] include additional fields computed from the data: +/// - `nullCount`: nested struct mirroring the data schema (all fields LONG) +/// - `minValues`: nested struct with min/max eligible column types +/// - `maxValues`: nested struct with min/max eligible column types +/// +/// The nested structures within nullCount/minValues/maxValues depend on the table's data schema +/// and which columns have statistics enabled. Use [`Transaction::stats_schema`] to get the +/// expected stats schema for a specific table. +/// +/// [`DefaultEngine::write_parquet`]: crate::engine::default::DefaultEngine::write_parquet pub(crate) static BASE_ADD_FILES_SCHEMA: LazyLock = LazyLock::new(|| { let stats = StructField::nullable( "stats", - DataType::struct_type_unchecked(vec![StructField::nullable("numRecords", DataType::LONG)]), + DataType::struct_type_unchecked(vec![ + StructField::nullable("numRecords", DataType::LONG), + // nullCount, minValues, maxValues are dynamic based on data schema. + // Empty struct placeholders indicate these fields exist but their inner + // structure depends on the table schema and stats column configuration. + StructField::nullable("nullCount", DataType::struct_type_unchecked(vec![])), + StructField::nullable("minValues", DataType::struct_type_unchecked(vec![])), + StructField::nullable("maxValues", DataType::struct_type_unchecked(vec![])), + StructField::nullable("tightBounds", DataType::BOOLEAN), + ]), ); - Arc::new(StructType::new_unchecked( - mandatory_add_file_schema().fields().cloned().chain([stats]), - )) + StructTypeBuilder::from_schema(mandatory_add_file_schema()) + .add_field(stats) + .build_arc_unchecked() }); static DATA_CHANGE_COLUMN: LazyLock = @@ -79,36 +133,53 @@ static ADD_FILES_SCHEMA_WITH_DATA_CHANGE: LazyLock = LazyLock::new(|| Arc::new(StructType::new_unchecked(fields.into_iter().cloned())) }); -// NOTE: The following two methods are a workaround for the fact that we do not have a proper SchemaBuilder yet. -// See https://github.com/delta-io/delta-kernel-rs/issues/1284 /// Extend a schema with a statistics column and return a new SchemaRef. /// /// The stats column is of type string as required by the spec. /// /// Note that this method is only useful to extend an Add action schema. 
fn with_stats_col(schema: &SchemaRef) -> SchemaRef { - let fields = schema - .fields() - .cloned() - .chain([StructField::nullable("stats", DataType::STRING)]); - Arc::new(StructType::new_unchecked(fields)) + StructTypeBuilder::from_schema(schema) + .add_field(StructField::nullable("stats", DataType::STRING)) + .build_arc_unchecked() } /// Extend a schema with row tracking columns and return a new SchemaRef. /// /// Note that this method is only useful to extend an Add action schema. fn with_row_tracking_cols(schema: &SchemaRef) -> SchemaRef { - let fields = schema.fields().cloned().chain([ - StructField::nullable("baseRowId", DataType::LONG), - StructField::nullable("defaultRowCommitVersion", DataType::LONG), - ]); - Arc::new(StructType::new_unchecked(fields)) + StructTypeBuilder::from_schema(schema) + .add_field(StructField::nullable("baseRowId", DataType::LONG)) + .add_field(StructField::nullable( + "defaultRowCommitVersion", + DataType::LONG, + )) + .build_arc_unchecked() } +/// Marker type for transactions on existing tables. +/// +/// This is the default state for [`Transaction`] and provides the full set of operations +/// including file removal, deletion vector updates, and blind append semantics. +#[derive(Debug)] +pub struct ExistingTable; + +/// Marker type for create-table transactions. +/// +/// Transactions in this state have a restricted API surface — operations that are semantically +/// invalid for table creation (e.g. file removal, domain metadata removal) are not available. +#[derive(Debug)] +pub struct CreateTable; + /// A transaction represents an in-progress write to a table. After creating a transaction, changes /// to the table may be staged via the transaction methods before calling `commit` to commit the /// changes to the table. /// +/// The type parameter `S` controls which operations are available: +/// - [`ExistingTable`] (default): Full API for modifying existing tables. +/// - [`CreateTable`]: Restricted API for table creation (see +/// [`CreateTableTransaction`](create_table::CreateTableTransaction)). +/// /// # Examples /// /// ```rust,ignore @@ -119,12 +190,17 @@ fn with_row_tracking_cols(schema: &SchemaRef) -> SchemaRef { /// // commit! (consume the transaction) /// txn.commit(&engine)?; /// ``` -pub struct Transaction { +pub struct Transaction { + span: tracing::Span, + // The snapshot this transaction is based on. For create-table transactions, + // this is a pre-commit snapshot with PRE_COMMIT_VERSION. read_snapshot: SnapshotRef, committer: Box, operation: Option, engine_info: Option, + engine_commit_info: Option<(Box, SchemaRef)>, add_files_metadata: Vec>, + remove_files_metadata: Vec, // NB: hashmap would require either duplicating the appid or splitting SetTransaction // key/payload. HashSet requires Borrow<&str> with matching Eq, Ord, and Hash. Plus, // HashSet::insert drops the to-be-inserted value without returning the existing one, which @@ -134,80 +210,77 @@ pub struct Transaction { // commit-wide timestamp (in milliseconds since epoch) - used in ICT, `txn` action, etc. to // keep all timestamps within the same commit consistent. commit_timestamp: i64, - domain_metadatas: Vec, + // User-provided domain metadata additions (via with_domain_metadata API). + user_domain_metadata_additions: Vec, + // System-generated domain metadata (from transforms, e.g., clustering). + // TODO(#1779): Currently only populated during CREATE TABLE. 
For inserts, row tracking + // domain metadata is handled separately via `row_tracking_high_watermark` parameter in + // `generate_domain_metadata_actions`. Consider unifying system domain handling. + system_domain_metadata_additions: Vec, + // Domain names to remove in this transaction. The configuration values are fetched during + // commit from the log to preserve the pre-image in tombstones. + user_domain_removals: Vec, // Whether this transaction contains any logical data changes. data_change: bool, + // Whether this transaction should be marked as a blind append. + is_blind_append: bool, + // Files matched by update_deletion_vectors() with new DV descriptors appended. These are used + // to generate remove/add action pairs during commit, ensuring file statistics are preserved. + dv_matched_files: Vec, + // Clustering columns from domain metadata. Only populated if the ClusteredTable feature is + // enabled. Used for determining which columns require statistics collection. Expected to be + // physical column names. + physical_clustering_columns: Option>, + // PhantomData marker for transaction state (ExistingTable or CreateTable). + // Zero-sized; only affects the type system. + _state: PhantomData, } -impl std::fmt::Debug for Transaction { +impl std::fmt::Debug for Transaction { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let version_info = if self.is_create_table() { + "create_table".to_string() + } else { + format!("{}", self.read_snapshot.version()) + }; f.write_str(&format!( "Transaction {{ read_snapshot version: {}, engine_info: {} }}", - self.read_snapshot.version(), + version_info, self.engine_info.is_some() )) } } -impl Transaction { - /// Create a new transaction from a snapshot. The snapshot will be used to read the current - /// state of the table (e.g. to read the current version). - /// - /// Instead of using this API, the more typical (user-facing) API is - /// [Snapshot::transaction](crate::snapshot::Snapshot::transaction) to create a transaction from - /// a snapshot. - pub(crate) fn try_new( - snapshot: impl Into, - committer: Box, - ) -> DeltaResult { - let read_snapshot = snapshot.into(); - - // important! before a read/write to the table we must check it is supported - read_snapshot - .table_configuration() - .ensure_write_supported()?; - - let commit_timestamp = current_time_ms()?; - - Ok(Transaction { - read_snapshot: read_snapshot.clone(), - committer, - operation: None, - engine_info: None, - add_files_metadata: vec![], - set_transactions: vec![], - commit_timestamp, - domain_metadatas: vec![], - data_change: true, - }) - } - - /// Set the committer that will be used to commit this transaction. If not set, the default - /// filesystem-based committer will be used. Note that the default committer is only allowed - /// for non-catalog-managed tables. That is, you _must_ provide a committer via this API in - /// order to write to catalog-managed tables. - /// - /// See [`committer`] module for more details. - /// - /// [`committer`]: crate::committer - #[cfg(feature = "catalog-managed")] - pub fn with_committer(mut self, committer: Box) -> Self { - self.committer = committer; - self - } - +// ============================================================================= +// Shared methods available on ALL transaction types +// ============================================================================= +impl Transaction { /// Consume the transaction and commit it to the table. 
The result is a result of /// [CommitResult] with the following semantics: /// - Ok(CommitResult) for either success or a recoverable error (includes the failed /// transaction in case of a conflict so the user can retry, etc.) /// - Err(Error) indicates a non-retryable error (e.g. logic/validation error). - pub fn commit(self, engine: &dyn Engine) -> DeltaResult { + #[instrument( + parent = &self.span, + name = "txn.commit", + skip_all, + fields( + commit_version = self.get_commit_version(), + ), + err + )] + pub fn commit(self, engine: &dyn Engine) -> DeltaResult> { + info!( + num_add_files = self.add_files_metadata.len(), + num_remove_files = self.remove_files_metadata.len(), + num_dv_updates = self.dv_matched_files.len(), + ); // Step 1: Check for duplicate app_ids and generate set transactions (`txn`) // Note: The commit info must always be the first action in the commit but we generate it in // step 2 to fail early on duplicate transaction appIds // TODO(zach): we currently do this in two passes - can we do it in one and still keep refs // in the HashSet? - let mut app_ids = HashSet::new(); + let mut app_ids = HashSet::with_capacity(self.set_transactions.len()); if let Some(dup) = self .set_transactions .iter() @@ -218,6 +291,38 @@ impl Transaction { dup.app_id ))); } + + self.validate_blind_append_semantics()?; + + // CDF check only applies to existing tables (not create table) + // If there are add and remove files with data change in the same transaction, we block it. + // This is because kernel does not yet have a way to discern DML operations. For DML + // operations that perform updates on rows, ChangeDataFeed requires that a `cdc` file be + // written to the delta log. + if !self.is_create_table() + && !self.add_files_metadata.is_empty() + && !self.remove_files_metadata.is_empty() + && self.data_change + { + let cdf_enabled = self + .read_snapshot + .table_configuration() + .table_properties() + .enable_change_data_feed + .unwrap_or(false); + require!( + !cdf_enabled, + Error::generic( + "Cannot add and remove data in the same transaction when Change Data Feed is enabled (delta.enableChangeDataFeed = true). \ + This would require writing CDC files for DML operations, which is not yet supported. \ + Consider using separate transactions: one to add files, another to remove files." + ) + ); + } + + // Validate clustering column stats if ClusteredTable feature is enabled + self.validate_add_files_stats(&self.add_files_metadata)?; + // Step 1: Generate SetTransaction actions let set_transaction_actions = self .set_transactions @@ -226,62 +331,91 @@ impl Transaction { .map(|txn| txn.into_engine_data(get_log_txn_schema().clone(), engine)); // Step 2: Construct commit info with ICT if enabled - let in_commit_timestamp = - self.read_snapshot - .get_in_commit_timestamp(engine)? 
- .map(|prev_ict| { - // The Delta protocol requires the timestamp to be "the larger of two values": - // - The time at which the writer attempted the commit (current_time) - // - One millisecond later than the previous commit's inCommitTimestamp (last_commit_timestamp + 1) - self.commit_timestamp.max(prev_ict + 1) - }); - let commit_info = CommitInfo::new( + let in_commit_timestamp = self.get_in_commit_timestamp(engine)?; + let kernel_commit_info = CommitInfo::new( self.commit_timestamp, in_commit_timestamp, self.operation.clone(), self.engine_info.clone(), + self.is_blind_append, ); - let commit_info_action = - commit_info.into_engine_data(get_log_commit_info_schema().clone(), engine); + let commit_info_action = self.generate_commit_info(engine, kernel_commit_info); + + // Step 3: Generate Protocol and Metadata actions for create-table + let (protocol_action, metadata_action, protocol, metadata) = if self.is_create_table() { + let table_config = self.read_snapshot.table_configuration(); + let protocol = table_config.protocol().clone(); + let metadata = table_config.metadata().clone(); + + let protocol_schema = get_commit_schema().project(&[PROTOCOL_NAME])?; + let metadata_schema = get_commit_schema().project(&[METADATA_NAME])?; + + let protocol_data = protocol.clone().into_engine_data(protocol_schema, engine)?; + let metadata_data = metadata.clone().into_engine_data(metadata_schema, engine)?; + + ( + Some(protocol_data), + Some(metadata_data), + Some(protocol), + Some(metadata), + ) + } else { + (None, None, None, None) + }; - // Step 3: Generate add actions and get data for domain metadata actions (e.g. row tracking high watermark) - let commit_version = self.read_snapshot.version() + 1; + // Step 4: Generate add actions and get data for domain metadata actions (e.g. 
row tracking high watermark) + let commit_version = self.get_commit_version(); let (add_actions, row_tracking_domain_metadata) = self.generate_adds(engine, commit_version)?; - // Step 4: Generate all domain metadata actions (user and system domains) - let domain_metadata_actions = + // Step 4b: Generate all domain metadata actions (user and system domains) + let (domain_metadata_actions, dm_changes) = self.generate_domain_metadata_actions(engine, row_tracking_domain_metadata)?; - // Step 5: Chain all our actions to be handed off to the Committer + // Step 5: Generate DV update actions (remove/add pairs) if any DV updates are present + let dv_update_actions = self.generate_dv_update_actions(engine)?; + + // Step 6: Generate remove actions (collect to avoid borrowing self) + let remove_actions = + self.generate_remove_actions(engine, self.remove_files_metadata.iter(), &[])?; + + // Build the action chain + // For create-table: CommitInfo -> Protocol -> Metadata -> adds -> txns -> domain_metadata -> removes + // For existing table: CommitInfo -> adds -> txns -> domain_metadata -> removes let actions = iter::once(commit_info_action) + .chain(protocol_action.map(Ok)) + .chain(metadata_action.map(Ok)) .chain(add_actions) .chain(set_transaction_actions) .chain(domain_metadata_actions); - // Convert EngineData to FilteredEngineData with all rows selected + let filtered_actions = actions - .map(|action_result| action_result.map(FilteredEngineData::with_all_rows_selected)); + .map(|action_result| action_result.map(FilteredEngineData::with_all_rows_selected)) + .chain(remove_actions) + .chain(dv_update_actions); - // Step 6: Commit via the committer - #[cfg(feature = "catalog-managed")] - if self.committer.any_ref().is::() - && self - .read_snapshot - .table_configuration() - .protocol() - .is_catalog_managed() - { - return Err(Error::generic("The FileSystemCommitter cannot be used to commit to catalog-managed tables. Please provide a committer for your catalog via Transaction::with_committer().")); - } - let log_root = LogRoot::new(self.read_snapshot.table_root().clone())?; - let commit_metadata = CommitMetadata::new(log_root, commit_version); + // Step 7: Commit via the committer + let commit_metadata = + self.create_commit_metadata(commit_version, protocol, metadata, dm_changes.clone())?; match self .committer .commit(engine, Box::new(filtered_actions), commit_metadata) { - Ok(CommitResponse::Committed { version }) => Ok(CommitResult::CommittedTransaction( - self.into_committed(version), - )), + Ok(CommitResponse::Committed { file_meta }) => { + let bin_boundaries = self + .read_snapshot + .get_file_stats_if_loaded() + .and_then(|s| s.file_size_histogram) + .map(|h| h.sorted_bin_boundaries); + let crc_delta = self.build_crc_delta( + in_commit_timestamp, + dm_changes, + bin_boundaries.as_deref(), + )?; + Ok(CommitResult::CommittedTransaction( + self.into_committed(file_meta, crc_delta)?, + )) + } Ok(CommitResponse::Conflict { version }) => Ok(CommitResult::ConflictedTransaction( self.into_conflicted(version), )), @@ -316,19 +450,32 @@ impl Transaction { self.data_change = data_change; } - /// Set the operation that this transaction is performing. This string will be persisted in the - /// commit and visible to anyone who describes the table history. - pub fn with_operation(mut self, operation: String) -> Self { - self.operation = Some(operation); - self - } - /// Set the engine info field of this transaction's commit info action. This field is optional. 
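Since `commit` consumes the transaction and returns `Ok` for both success and recoverable failures, engines typically match on the result. A hedged sketch only — the `txn` and `engine` bindings are assumed, and accessor names not shown in this hunk (e.g. `commit_version()`) are assumed to exist on `CommittedTransaction`:

```rust,ignore
match txn.commit(engine)? {
    CommitResult::CommittedTransaction(committed) => {
        println!("committed at version {}", committed.commit_version());
    }
    CommitResult::ConflictedTransaction(conflicted) => {
        // Another writer won the race; rebase/retry APIs are still TODO upstream.
        println!("conflict at version {}", conflicted.conflict_version());
    }
    CommitResult::RetryableTransaction(retryable) => {
        // Transient (e.g. IO) failure: the transaction is preserved and can be retried as-is.
        eprintln!("retryable commit failure: {}", retryable.error);
        let _txn_to_retry = retryable.transaction;
    }
}
```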
pub fn with_engine_info(mut self, engine_info: impl Into) -> Self { self.engine_info = Some(engine_info.into()); self } + /// Set the content of the commitInfo action for this transaction. Note that kernel will _always_ write a commitInfo, + /// this function simply allows engines to add their own data into that action if they wish. + /// Note that the following fields in `engine_commit_info` will be overridden by kernel if they are set (meaning you should not set them): + /// - timestamp + /// - inCommitTimestamp + /// - operation + /// - operationParameters + /// - kernelVersion + /// - isBlindAppend + /// - engineInfo + /// - txnId + pub fn with_commit_info( + mut self, + engine_commit_info: Box, + commit_info_schema: SchemaRef, + ) -> Self { + self.engine_commit_info = Some((engine_commit_info, commit_info_schema)); + self + } + /// Include a SetTransaction (app_id and version) action for this transaction (with an optional /// `last_updated` timestamp). /// Note that each app_id can only appear once per transaction. That is, multiple app_ids with @@ -347,103 +494,176 @@ impl Transaction { /// fail (that is, we don't eagerly check domain validity here). /// Setting metadata for multiple distinct domains is allowed. pub fn with_domain_metadata(mut self, domain: String, configuration: String) -> Self { - self.domain_metadatas + self.user_domain_metadata_additions .push(DomainMetadata::new(domain, configuration)); self } - /// Remove domain metadata from the Delta log. - /// If the domain exists in the Delta log, this creates a tombstone to logically delete - /// the domain. The tombstone preserves the previous configuration value. - /// If the domain does not exist in the Delta log, this is a no-op. - /// Note that each domain can only appear once per transaction. That is, multiple operations - /// on the same domain are disallowed in a single transaction, as well as setting and removing - /// the same domain in a single transaction. If a duplicate domain is included, the `commit` will - /// fail (that is, we don't eagerly check domain validity here). - /// Removing metadata for multiple distinct domains is allowed. - pub fn with_domain_metadata_removed(mut self, domain: String) -> Self { - // actual configuration value determined during commit - self.domain_metadatas - .push(DomainMetadata::remove(domain, String::new())); - self + /// Determines the commit type based on whether this is a create-table operation and whether + /// the table is catalog-managed. + fn determine_commit_type( + is_create: bool, + table_config: &crate::table_configuration::TableConfiguration, + ) -> CommitType { + let is_catalog_managed = table_config.is_catalog_managed(); + + // TODO: Handle UpgradeToCatalogManaged and DowngradeToPathBased when ALTER TABLE + // SET TBLPROPERTIES is supported. + match (is_create, is_catalog_managed) { + (true, true) => CommitType::CatalogManagedCreate, + (true, false) => CommitType::PathBasedCreate, + (false, true) => CommitType::CatalogManagedWrite, + (false, false) => CommitType::PathBasedWrite, + } } - /// Generate domain metadata actions with validation. Handle both user and system domains. + /// Validates that the committer type matches the commit type. A catalog committer must be + /// used for catalog-managed operations, and a non-catalog committer for path-based operations. 
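Taken together, these builder-style setters are meant to be chained before staging any data. An illustrative sketch only — the committer type, engine binding, and domain name are made up, and kernel still writes its own commitInfo fields on top of anything engine-provided:

```rust,ignore
let txn = snapshot
    .transaction(Box::new(MyCatalogCommitter::new()), engine)? // committer type is hypothetical
    .with_engine_info("my-engine/1.2.3")
    .with_domain_metadata(
        "myApp.jobState".to_string(),          // user domain; "delta."-prefixed domains are rejected
        r#"{"jobId":"etl-42"}"#.to_string(),
    );
```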
+ fn validate_commit_type( + is_catalog_committer: bool, + commit_type: &CommitType, + ) -> DeltaResult<()> { + match ( + is_catalog_committer, + commit_type.requires_catalog_committer(), + ) { + (true, true) | (false, false) => Ok(()), + (false, true) => Err(Error::generic( + "This table is catalog-managed and requires a catalog committer. \ + Please provide a catalog committer via Snapshot::transaction().", + )), + (true, false) => Err(Error::generic( + "This table is path-based and cannot be committed to with a catalog committer.", + )), + } + } + + /// Builds the [`CommitMetadata`] for this transaction. Determines the commit type, + /// validates the committer, and assembles the protocol/metadata state. + fn create_commit_metadata( + &self, + commit_version: Version, + new_protocol: Option, + new_metadata: Option, + domain_metadata_changes: Vec, + ) -> DeltaResult { + let log_root = LogRoot::new(self.read_snapshot.table_root().clone())?; + let table_config = self.read_snapshot.table_configuration(); + let is_create = self.is_create_table(); + let commit_type = Self::determine_commit_type(is_create, table_config); + Self::validate_commit_type(self.committer.is_catalog_committer(), &commit_type)?; + // For create-table: read P&M is None (no previous table), new P&M is set. + // For existing table: read P&M is from the snapshot, new P&M is None. + let (read_protocol, read_metadata) = if is_create { + (None, None) + } else { + ( + Some(table_config.protocol().clone()), + Some(table_config.metadata().clone()), + ) + }; + let protocol_metadata = CommitProtocolMetadata::try_new( + read_protocol, + read_metadata, + new_protocol, + new_metadata, + )?; + Ok(CommitMetadata::new( + log_root, + commit_version, + commit_type, + self.commit_timestamp, + self.read_snapshot + .log_segment() + .listed + .max_published_version, + protocol_metadata, + domain_metadata_changes, + )) + } + + /// Validate that the transaction is eligible to be marked as a blind append. /// - /// This function may perform an expensive log replay operation if there are any domain removals. - /// The log replay is required to fetch the previous configuration value for the domain to preserve - /// in removal tombstones as mandated by the Delta spec. - fn generate_domain_metadata_actions<'a>( - &'a self, - engine: &'a dyn Engine, - row_tracking_high_watermark: Option, - ) -> DeltaResult>> + 'a> { - // if there are domain metadata actions, the table must support it - if !self.domain_metadatas.is_empty() - && !self - .read_snapshot - .table_configuration() - .is_domain_metadata_supported() - { - return Err(Error::unsupported( - "Domain metadata operations require writer version 7 and the 'domainMetadata' writer feature" - )); + /// Note: Domain metadata additions/removals are allowed; blind append only constrains + /// data-file operations and read predicates. Conflict resolution determines whether + /// metadata changes are problematic. 
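For reference, the constraints above translate into a small API surface on the engine side. A sketch of a blind-append write, assuming `snapshot`, `committer`, `engine`, and a `file_meta_batch` of EngineData matching `add_files_schema()` already exist:

```rust,ignore
let mut txn = snapshot
    .transaction(committer, engine)?
    .with_engine_info("my-engine")
    .with_blind_append();          // declares: adds only, data_change = true, no removes/DV updates
txn.add_files(file_meta_batch);    // at least one added file is required for a blind append
let result = txn.commit(engine)?;  // validate_blind_append_semantics() runs during commit
assert!(result.is_committed());
```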
+ fn validate_blind_append_semantics(&self) -> DeltaResult<()> { + if !self.is_blind_append { + return Ok(()); } + require!( + !self.is_create_table(), + Error::invalid_transaction_state( + "Blind append is not supported for create-table transactions", + ) + ); + require!( + !self.add_files_metadata.is_empty(), + Error::invalid_transaction_state("Blind append requires at least one added data file") + ); + require!( + self.data_change, + Error::invalid_transaction_state("Blind append requires data_change to be true") + ); + require!( + self.remove_files_metadata.is_empty(), + Error::invalid_transaction_state("Blind append cannot remove files") + ); + require!( + self.dv_matched_files.is_empty(), + Error::invalid_transaction_state("Blind append cannot update deletion vectors") + ); - // validate user domain metadata and check if we have removals - let mut seen_domains = HashSet::new(); - let mut has_removals = false; - for dm in &self.domain_metadatas { - if dm.is_internal() { - return Err(Error::Generic( - "Cannot modify domains that start with 'delta.' as those are system controlled" - .to_string(), - )); - } + Ok(()) + } - if !seen_domains.insert(dm.domain()) { - return Err(Error::Generic(format!( - "Metadata for domain {} already specified in this transaction", - dm.domain() - ))); - } + /// Returns true if this is a create-table transaction. + /// A create-table transaction has operation "CREATE TABLE" and a pre-commit snapshot + /// with PRE_COMMIT_VERSION. + fn is_create_table(&self) -> bool { + let is_create = self.operation.as_deref() == Some("CREATE TABLE"); + debug_assert!( + !is_create || self.read_snapshot.version() == PRE_COMMIT_VERSION, + "CREATE TABLE transaction must have PRE_COMMIT_VERSION snapshot" + ); + is_create + } - if dm.is_removed() { - has_removals = true; - } - } + /// Computes the in-commit timestamp for this transaction if ICT is enabled. + /// Returns `None` if ICT is not enabled on the table. A feature being in the protocol + /// (`is_feature_supported`) is not sufficient -- the `delta.enableInCommitTimestamps` + /// property must also be `true` (`is_feature_enabled`). + fn get_in_commit_timestamp(&self, engine: &dyn Engine) -> DeltaResult> { + let has_ict = self + .read_snapshot + .table_configuration() + .is_feature_enabled(&TableFeature::InCommitTimestamp); - // fetch previous configuration values (requires log replay) - let existing_domains = if has_removals { - scan_domain_metadatas(self.read_snapshot.log_segment(), None, engine)? - } else { - HashMap::new() - }; + if !has_ict { + return Ok(None); + } - let user_domains = self - .domain_metadatas - .iter() - .filter_map(move |dm: &DomainMetadata| { - if dm.is_removed() { - existing_domains.get(dm.domain()).map(|existing| { - DomainMetadata::remove( - dm.domain().to_string(), - existing.configuration().to_string(), - ) - }) - } else { - Some(dm.clone()) - } - }); + if self.is_create_table() { + // For CREATE TABLE there are no prior commits -- use the wall-clock time directly. + return Ok(Some(self.commit_timestamp)); + } - let system_domains = row_tracking_high_watermark - .map(DomainMetadata::try_from) - .transpose()? - .into_iter(); + // Existing table: enforce monotonicity per the Delta protocol. The timestamp + // must be the larger of: + // - The time at which the writer attempted the commit + // - One millisecond later than the previous commit's inCommitTimestamp + Ok(self + .read_snapshot + .get_in_commit_timestamp(engine)? 
+ .map(|prev_ict| self.commit_timestamp.max(prev_ict + 1))) + } - Ok(user_domains - .chain(system_domains) - .map(|dm| dm.into_engine_data(get_log_domain_metadata_schema().clone(), engine))) + /// Returns the commit version for this transaction. + /// For existing table transactions, this is snapshot.version() + 1. + /// For create-table transactions (PRE_COMMIT_VERSION + 1 wraps to 0), this is 0. + fn get_commit_version(&self) -> Version { + // PRE_COMMIT_VERSION (u64::MAX) + 1 wraps to 0, which is the correct first version + self.read_snapshot.version().wrapping_add(1) } /// The schema that the [`Engine`]'s [`ParquetHandler`] is expected to use when reporting information about @@ -454,34 +674,94 @@ impl Transaction { /// file to be added to the table. Kernel takes this information and extends it to the full add_file /// action schema, adding internal fields (e.g., baseRowID) as necessary. /// - /// For now, Kernel only supports the number of records as a file statistic. - /// This will change in a future release. + /// The `stats` field contains file-level statistics. The schema returned here shows the base + /// structure; the actual stats written by [`DefaultEngine::write_parquet`] include dynamically + /// computed fields (numRecords, nullCount, minValues, maxValues, tightBounds) based on the + /// data schema and table configuration. See [`stats_schema`] for the table-specific expected + /// stats schema. /// /// Note: While currently static, in the future the schema might change depending on /// options set on the transaction or features enabled on the table. /// /// [`add_files`]: crate::transaction::Transaction::add_files /// [`ParquetHandler`]: crate::ParquetHandler + /// [`DefaultEngine::write_parquet`]: crate::engine::default::DefaultEngine::write_parquet + /// [`stats_schema`]: Transaction::stats_schema pub fn add_files_schema(&self) -> &'static SchemaRef { &BASE_ADD_FILES_SCHEMA } + /// Returns the expected schema for file statistics. + /// + /// The schema structure is derived from table configuration: + /// - `delta.dataSkippingStatsColumns`: Explicit column list (if set) + /// - `delta.dataSkippingNumIndexedCols`: Column count limit (default 32) + /// - Partition columns: Always excluded + /// + /// The returned schema has the following structure: + /// ```ignore + /// { + /// numRecords: long, + /// nullCount: { ... }, // Nested struct mirroring data schema, all fields LONG + /// minValues: { ... }, // Nested struct, only min/max eligible types + /// maxValues: { ... }, // Nested struct, only min/max eligible types + /// tightBounds: boolean, + /// } + /// ``` + /// + /// Engines should collect statistics matching this schema structure when writing files. + /// + /// Per the Delta protocol, required columns (e.g. clustering columns) are always included + /// in statistics, regardless of `dataSkippingStatsColumns` or `dataSkippingNumIndexedCols` + /// settings. + #[allow(unused)] + pub fn stats_schema(&self) -> DeltaResult { + let tc = self.read_snapshot.table_configuration(); + let stats_schemas = + tc.build_expected_stats_schemas(self.physical_clustering_columns.as_deref(), None)?; + Ok(stats_schemas.physical) + } + + /// Returns the list of column names that should have statistics collected. + /// + /// This returns leaf column paths as [`ColumnName`] objects. Each `ColumnName` + /// stores path components separately (e.g., `ColumnName::new(["nested", "field"])`). 
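A small sketch of how a writer might consult these two accessors before producing files; the `txn` binding is assumed, and only accessors documented in this section are used:

```rust,ignore
// Which columns need min/max/nullCount stats, as leaf column paths.
for col in txn.stats_columns() {
    println!("collect stats for column: {col}");
}
// The physical stats struct the engine is expected to populate per file.
let stats_schema = txn.stats_schema()?;
println!("expected stats schema: {stats_schema:#?}");
```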
+ /// See [`ColumnName`'s `Display` implementation][ColumnName#impl-Display-for-ColumnName] + /// for details on string formatting and escaping. + /// + /// Engines can use this to determine which columns need stats during writes. + /// + /// Per the Delta protocol, clustering columns are always included in statistics, + /// regardless of `dataSkippingStatsColumns` or `dataSkippingNumIndexedCols` settings. + #[allow(unused)] + pub fn stats_columns(&self) -> Vec { + self.read_snapshot + .table_configuration() + .physical_stats_column_names(self.physical_clustering_columns.as_deref()) + } + // Generate the logical-to-physical transform expression which must be evaluated on every data // chunk before writing. At the moment, this is a transaction-wide expression. fn generate_logical_to_physical(&self) -> Expression { - // for now, we just pass through all the columns except partition columns. - // note this is _incorrect_ if table config deems we need partition columns. - let partition_columns = self + let partition_cols = self .read_snapshot .table_configuration() - .metadata() - .partition_columns(); - let schema = self.read_snapshot.schema(); - let fields = schema - .fields() - .filter(|f| !partition_columns.contains(f.name())) - .map(|f| Expression::column([f.name()])); - Expression::struct_from(fields) + .partition_columns() + .to_vec(); + // Check if materializePartitionColumns feature is enabled + let materialize_partition_columns = self + .read_snapshot + .table_configuration() + .is_feature_enabled(&TableFeature::MaterializePartitionColumns); + // Build a Transform expression that drops partition columns from the input + // (unless materializePartitionColumns is enabled). + let mut transform = Transform::new_top_level(); + if !materialize_partition_columns { + for col in &partition_cols { + transform = transform.with_dropped_field_if_exists(col); + } + } + Expression::transform(transform) } /// Get the write context for this transaction. At the moment, this is constant for the whole @@ -489,14 +769,27 @@ impl Transaction { // Note: after we introduce metadata updates (modify table schema, etc.), we need to make sure // that engines cannot call this method after a metadata change, since the write context could // have invalid metadata. + // Note: Callers that use get_write_context may be writing data to the table and they might + // have invalid metadata. pub fn get_write_context(&self) -> WriteContext { let target_dir = self.read_snapshot.table_root(); let snapshot_schema = self.read_snapshot.schema(); let logical_to_physical = self.generate_logical_to_physical(); + let table_config = self.read_snapshot.table_configuration(); + let column_mapping_mode = table_config.column_mapping_mode(); + + let physical_schema = table_config.physical_write_schema(); + + // Get stats columns from table configuration + let stats_columns = self.stats_columns(); + WriteContext::new( target_dir.clone(), snapshot_schema, + physical_schema, Arc::new(logical_to_physical), + column_mapping_mode, + stats_columns, ) } @@ -509,7 +802,44 @@ impl Transaction { self.add_files_metadata.push(add_metadata); } + /// Validate that add files have required statistics for clustering columns. + /// + /// Per the Delta protocol, writers MUST collect per-file statistics for clustering columns + /// when the `ClusteredTable` feature is enabled. Other stat columns (e.g. the conventional + /// "first 32 columns") are not validated here because they are not protocol-required. 
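A sketch of the write-side handshake: take the `WriteContext`, run the logical-to-physical transform over each batch, then write the result under `target_dir()`. This mirrors the expression-evaluation pattern used in the tests later in this file; the `engine` and `logical_batch` bindings are assumed:

```rust,ignore
let write_context = txn.get_write_context();
let evaluator = engine.evaluation_handler().new_expression_evaluator(
    write_context.logical_schema().clone(),
    write_context.logical_to_physical(),
    write_context.physical_schema().clone().into(),
)?;
// Partition columns are dropped (unless materializePartitionColumns is enabled) and
// physical column names are applied per write_context.column_mapping_mode().
let physical_batch = evaluator.evaluate(logical_batch.as_ref())?;
// ...write physical_batch as parquet under write_context.target_dir()...
```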
+ /// + /// Only add files are validated — remove files do not carry statistics. + fn validate_add_files_stats(&self, add_files: &[Box]) -> DeltaResult<()> { + if add_files.is_empty() { + return Ok(()); + } + if let Some(ref clustering_cols) = self.physical_clustering_columns { + if !clustering_cols.is_empty() { + let physical_schema = self.read_snapshot.table_configuration().physical_schema(); + let columns_with_types: Vec<(ColumnName, DataType)> = clustering_cols + .iter() + .map(|col| { + let data_type = physical_schema + .walk_column_fields(col)? + .last() + .map(|field| field.data_type().clone()) + .ok_or_else(|| { + Error::internal_error(format!( + "Required column '{col}' not found in table schema" + )) + })?; + Ok((col.clone(), data_type)) + }) + .collect::>()?; + let verifier = StatsVerifier::new(columns_with_types); + verifier.verify(add_files)?; + } + } + Ok(()) + } + /// Generate add actions, handling row tracking internally if needed + #[instrument(name = "txn.gen_adds", skip_all, err)] fn generate_adds<'a>( &'a self, engine: &dyn Engine, @@ -549,7 +879,7 @@ impl Transaction { input_schema.clone(), Arc::new(adds_expr), as_log_add_schema(output_schema.clone()).into(), - ); + )?; adds_evaluator.evaluate(add_files_batch?.deref()) }) } @@ -566,6 +896,13 @@ impl Transaction { .table_configuration() .should_write_row_tracking(); + // Row tracking is not yet supported for create-table with data + if needs_row_tracking && self.is_create_table() { + return Err(Error::unsupported( + "Row tracking is not yet supported for create table with data", + )); + } + if needs_row_tracking { // Read the current rowIdHighWaterMark from the snapshot's row tracking domain metadata let row_id_high_water_mark = @@ -633,8 +970,16 @@ impl Transaction { } } - fn into_committed(self, version: Version) -> CommittedTransaction { - let stats = PostCommitStats { + fn into_committed( + self, + file_meta: FileMeta, + crc_delta: CrcDelta, + ) -> DeltaResult { + let parsed_commit = ParsedLogPath::parse_commit(file_meta)?; + + let commit_version = parsed_commit.version; + + let post_commit_stats = PostCommitStats { commits_since_checkpoint: self.read_snapshot.log_segment().commits_since_checkpoint() + 1, commits_since_log_compaction: self @@ -644,26 +989,151 @@ impl Transaction { + 1, }; - CommittedTransaction { - transaction: self, - commit_version: version, - post_commit_stats: stats, - } + Ok(CommittedTransaction { + commit_version, + post_commit_stats, + post_commit_snapshot: Some(Arc::new( + self.read_snapshot + .new_post_commit(parsed_commit, crc_delta)?, + )), + }) + } + + /// Build a [`CrcDelta`] from the transaction's staged file metadata and commit state. 
+ fn build_crc_delta( + &self, + in_commit_timestamp: Option, + dm_changes: Vec, + bin_boundaries: Option<&[i64]>, + ) -> DeltaResult { + let file_stats = FileStatsDelta::try_compute_for_txn( + &self.add_files_metadata, + &self.remove_files_metadata, + bin_boundaries, + )?; + let is_create = self.is_create_table(); + Ok(CrcDelta { + file_stats, + protocol: is_create + .then(|| self.read_snapshot.table_configuration().protocol().clone()), + metadata: is_create + .then(|| self.read_snapshot.table_configuration().metadata().clone()), + domain_metadata_changes: dm_changes, + set_transaction_changes: self.set_transactions.clone(), + in_commit_timestamp, + operation: self.operation.clone(), + has_missing_file_size: false, // writes always have sizes + }) } - fn into_conflicted(self, conflict_version: Version) -> ConflictedTransaction { + fn into_conflicted(self, conflict_version: Version) -> ConflictedTransaction { ConflictedTransaction { transaction: self, conflict_version, } } - fn into_retryable(self, error: Error) -> RetryableTransaction { + fn into_retryable(self, error: Error) -> RetryableTransaction { RetryableTransaction { transaction: self, error, } } + + /// Generates Remove actions from scan file metadata. + /// + /// This internal method transforms scan row metadata into Remove actions for the Delta log. + /// It's called during commit to process files staged via [`remove_files`] or files being + /// updated with new deletion vectors via [`update_deletion_vectors`]. + /// + /// # Parameters + /// + /// - `engine`: The engine used for expression evaluation + /// - `remove_files_metadata`: Iterator over scan file metadata to transform into Remove actions + /// - `columns_to_drop`: Column names to drop from the scan metadata before transformation. + /// This is used to remove temporary columns like the intermediate deletion vector column + /// added during DV updates. + /// + /// # Returns + /// + /// An iterator of FilteredEngineData containing Remove actions in the log schema format. + /// + /// [`remove_files`]: Transaction::remove_files + /// [`update_deletion_vectors`]: Transaction::update_deletion_vectors + #[instrument(name = "txn.gen_removes", skip_all, err)] + fn generate_remove_actions<'a>( + &'a self, + engine: &dyn Engine, + remove_files_metadata: impl Iterator + Send + 'a, + columns_to_drop: &'a [&str], + ) -> DeltaResult> + Send + 'a> { + // Create-table transactions should not have any remove actions. + // Only error if there are actually files queued for removal. 
+ if self.is_create_table() && !self.remove_files_metadata.is_empty() { + return Err(Error::internal_error( + "CREATE TABLE transaction cannot have remove actions", + )); + } + + let input_schema = scan_row_schema(); + let target_schema = schema_with_all_fields_nullable(get_log_remove_schema())?; + let evaluation_handler = engine.evaluation_handler(); + + // Create the transform expression once, since it only contains literals and column references + let mut transform = Transform::new_top_level() + // deletionTimestamp + .with_inserted_field( + Some("path"), + Expression::literal(self.commit_timestamp).into(), + ) + // dataChange + .with_inserted_field(Some("path"), Expression::literal(self.data_change).into()) + .with_inserted_field( + // extended_file_metadata + Some("path"), + Expression::literal(true).into(), + ) + .with_inserted_field( + Some("path"), + Expression::column([FILE_CONSTANT_VALUES_NAME, "partitionValues"]).into(), + ) + // tags + .with_inserted_field( + Some("stats"), + Expression::column([FILE_CONSTANT_VALUES_NAME, TAGS_NAME]).into(), + ) + .with_inserted_field( + Some("deletionVector"), + Expression::column([FILE_CONSTANT_VALUES_NAME, BASE_ROW_ID_NAME]).into(), + ) + .with_inserted_field( + Some("deletionVector"), + Expression::column([FILE_CONSTANT_VALUES_NAME, DEFAULT_ROW_COMMIT_VERSION_NAME]) + .into(), + ) + .with_dropped_field(FILE_CONSTANT_VALUES_NAME) + .with_dropped_field("modificationTime"); + + // Drop any additional columns specified in columns_to_drop + for column_to_drop in columns_to_drop { + transform = transform.with_dropped_field(*column_to_drop); + } + + let expr = Arc::new(Expression::struct_from([Expression::transform(transform)])); + let file_action_eval = Arc::new(evaluation_handler.new_expression_evaluator( + input_schema.clone(), + expr.clone(), + target_schema.into(), + )?); + + Ok(remove_files_metadata.map(move |file_metadata_batch| { + let updated_engine_data = file_action_eval.evaluate(file_metadata_batch.data())?; + FilteredEngineData::try_new( + updated_engine_data, + file_metadata_batch.selection_vector().to_vec(), + ) + })) + } } /// WriteContext is data derived from a [`Transaction`] that can be provided to writers in order to @@ -672,16 +1142,30 @@ impl Transaction { /// [`Transaction`]: struct.Transaction.html pub struct WriteContext { target_dir: Url, - schema: SchemaRef, + logical_schema: SchemaRef, + physical_schema: SchemaRef, logical_to_physical: ExpressionRef, + column_mapping_mode: ColumnMappingMode, + /// Column names that should have statistics collected during writes. + stats_columns: Vec, } impl WriteContext { - fn new(target_dir: Url, schema: SchemaRef, logical_to_physical: ExpressionRef) -> Self { + fn new( + target_dir: Url, + logical_schema: SchemaRef, + physical_schema: SchemaRef, + logical_to_physical: ExpressionRef, + column_mapping_mode: ColumnMappingMode, + stats_columns: Vec, + ) -> Self { WriteContext { target_dir, - schema, + logical_schema, + physical_schema, logical_to_physical, + column_mapping_mode, + stats_columns, } } @@ -689,13 +1173,53 @@ impl WriteContext { &self.target_dir } - pub fn schema(&self) -> &SchemaRef { - &self.schema + pub fn logical_schema(&self) -> &SchemaRef { + &self.logical_schema + } + + pub fn physical_schema(&self) -> &SchemaRef { + &self.physical_schema } pub fn logical_to_physical(&self) -> ExpressionRef { self.logical_to_physical.clone() } + + /// The [`ColumnMappingMode`] for this table. 
+ pub fn column_mapping_mode(&self) -> ColumnMappingMode { + self.column_mapping_mode + } + + /// Returns the column names that should have statistics collected during writes. + /// + /// Based on table configuration (dataSkippingNumIndexedCols, dataSkippingStatsColumns). + pub fn stats_columns(&self) -> &[ColumnName] { + &self.stats_columns + } + + /// Generate a new unique absolute URL for a deletion vector file. + /// + /// This method generates a unique file name in the table directory. + /// Each call to this method returns a new unique path. + /// + /// # Arguments + /// + /// * `random_prefix` - A random prefix to use for the deletion vector file name. + /// Making this non-empty can help distribute load on object storage when writing/reading + /// to avoid throttling. Typically a random string of 2-4 characters is sufficient + /// for this purpose. + /// + /// + /// # Examples + /// + /// ```rust,ignore + /// let write_context = transaction.get_write_context(); + /// let dv_path = write_context.new_deletion_vector_path(String::from(rand_string())); + /// // dv_path might be: s3://bucket/table/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin + /// ``` + pub fn new_deletion_vector_path(&self, random_prefix: String) -> DeletionVectorPath { + DeletionVectorPath::new(self.target_dir.clone(), random_prefix) + } } /// Kernel exposes information about the state of the table that engines might want to use to @@ -724,7 +1248,7 @@ pub struct PostCommitStats { /// can be retried without rebasing. #[derive(Debug)] #[must_use] -pub enum CommitResult { +pub enum CommitResult { /// The transaction was successfully committed. CommittedTransaction(CommittedTransaction), /// This transaction conflicted with an existing version (see @@ -733,32 +1257,47 @@ pub enum CommitResult { /// conflicted). // TODO(zach): in order to make the returning of a transaction useful, we need to add APIs to // update the transaction to a new version etc. - ConflictedTransaction(ConflictedTransaction), + ConflictedTransaction(ConflictedTransaction), /// An IO (retryable) error occurred during the commit. - RetryableTransaction(RetryableTransaction), + RetryableTransaction(RetryableTransaction), } -impl CommitResult { +impl CommitResult { /// Returns true if the commit was successful. pub fn is_committed(&self) -> bool { matches!(self, CommitResult::CommittedTransaction(_)) } } +impl CommitResult { + /// Unwraps the [`CommittedTransaction`], panicking if the commit was not successful. + #[cfg(any(test, feature = "test-utils"))] + #[allow(clippy::panic)] + pub fn unwrap_committed(self) -> CommittedTransaction { + match self { + CommitResult::CommittedTransaction(c) => c, + other => panic!("Expected CommittedTransaction, got: {other:?}"), + } + } +} + /// This is the result of a successfully committed [Transaction]. One can retrieve the -/// [PostCommitStats] and [commit version] from this struct. In the future a post-commit snapshot -/// can be obtained as well. +/// [post_commit_stats], [commit version], and optionally the [post-commit snapshot] from this struct. /// +/// [post_commit_stats]: Self::post_commit_stats /// [commit version]: Self::commit_version +/// [post-commit snapshot]: Self::post_commit_snapshot #[derive(Debug)] pub struct CommittedTransaction { - // TODO: remove after post-commit snapshot - #[allow(dead_code)] - transaction: Transaction, - /// the version of the table that was just committed + /// The version of the table that was just committed.
commit_version: Version, - /// The [`PostCommitStats`] for this transaction + /// The [`PostCommitStats`] for this transaction. post_commit_stats: PostCommitStats, + /// The [`SnapshotRef`] of the table after this transaction was committed. + /// + /// This is optional to allow incremental development of new features (e.g., table creation, + /// transaction retries) without blocking on implementing post-commit snapshot support. + post_commit_snapshot: Option, } impl CommittedTransaction { @@ -772,7 +1311,10 @@ impl CommittedTransaction { &self.post_commit_stats } - // TODO(#916): post-commit snapshot + /// The [`SnapshotRef`] of the table after this transaction was committed. + pub fn post_commit_snapshot(&self) -> Option<&SnapshotRef> { + self.post_commit_snapshot.as_ref() + } } /// This is the result of a conflicted [Transaction]. One can retrieve the [conflict version] from @@ -780,14 +1322,14 @@ impl CommittedTransaction { /// /// [conflict version]: Self::conflict_version #[derive(Debug)] -pub struct ConflictedTransaction { +pub struct ConflictedTransaction { // TODO: remove after rebase APIs #[allow(dead_code)] - transaction: Transaction, + transaction: Transaction, conflict_version: Version, } -impl ConflictedTransaction { +impl ConflictedTransaction { /// The version attempted commit that yielded a conflict pub fn conflict_version(&self) -> Version { self.conflict_version @@ -798,21 +1340,149 @@ impl ConflictedTransaction { /// can be recovered with `RetryableTransaction::transaction` and retried without rebasing. The /// associated error can be inspected via `RetryableTransaction::error`. #[derive(Debug)] -pub struct RetryableTransaction { +pub struct RetryableTransaction { /// The transaction that failed to commit due to a retryable error. - pub transaction: Transaction, + pub transaction: Transaction, /// Transient error that caused the commit to fail. pub error: Error, } #[cfg(test)] mod tests { + use std::collections::HashMap; + use super::*; + use crate::actions::deletion_vector::DeletionVectorDescriptor; + use crate::actions::CommitInfo; + use crate::arrow::array::{ArrayRef, Int64Array, StringArray}; + use crate::arrow::datatypes::Schema as ArrowSchema; + use crate::arrow::record_batch::RecordBatch; + use crate::committer::{FileSystemCommitter, PublishMetadata}; + use crate::engine::arrow_conversion::TryIntoArrow; + use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::arrow_expression::ArrowEvaluationHandler; use crate::engine::sync::SyncEngine; + use crate::expressions::{MapData, Scalar, StructData}; + use crate::object_store::local::LocalFileSystem; + use crate::object_store::memory::InMemory; + use crate::object_store::path::Path; + use crate::object_store::ObjectStore as _; use crate::schema::MapType; + use crate::table_features::ColumnMappingMode; + use crate::transaction::create_table::create_table; + use crate::utils::test_utils::{ + load_test_table, string_array_to_engine_data, test_schema_flat, test_schema_nested, + test_schema_with_array, test_schema_with_map, + }; + use crate::EvaluationHandler; use crate::Snapshot; + use rstest::rstest; use std::path::PathBuf; + impl Transaction { + /// Set clustering columns for testing purposes without needing a table + /// with the ClusteredTable feature enabled. + fn with_clustering_columns_for_test(mut self, columns: Vec) -> Self { + self.physical_clustering_columns = Some(columns); + self + } + } + + /// A mock committer that always returns an IOError, used to test the retryable error path. 
+ struct IoErrorCommitter; + + impl Committer for IoErrorCommitter { + fn commit( + &self, + _engine: &dyn Engine, + _actions: Box> + Send + '_>, + _commit_metadata: CommitMetadata, + ) -> DeltaResult { + Err(Error::IOError(std::io::Error::other("simulated IO error"))) + } + fn is_catalog_committer(&self) -> bool { + false + } + fn publish( + &self, + _engine: &dyn Engine, + _publish_metadata: PublishMetadata, + ) -> DeltaResult<()> { + Ok(()) + } + } + + /// A mock catalog committer, used to test catalog committer validation. + struct MockCatalogCommitter; + + impl Committer for MockCatalogCommitter { + fn commit( + &self, + _engine: &dyn Engine, + _actions: Box> + Send + '_>, + _commit_metadata: CommitMetadata, + ) -> DeltaResult { + // This won't be reached in tests — the validation error fires before commit. + Ok(CommitResponse::Conflict { version: 0 }) + } + fn is_catalog_committer(&self) -> bool { + true + } + fn publish( + &self, + _engine: &dyn Engine, + _publish_metadata: PublishMetadata, + ) -> DeltaResult<()> { + Ok(()) + } + } + + /// Sets up a snapshot for a table with deletion vector support at version 1 + fn setup_dv_enabled_table() -> (SyncEngine, Arc) { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let snapshot = Snapshot::builder_for(url) + .at_version(1) + .build(&engine) + .unwrap(); + (engine, snapshot) + } + + fn setup_non_dv_table() -> (SyncEngine, Arc) { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + (engine, snapshot) + } + + /// Creates a test deletion vector descriptor with default values (the DV might not exist on disk) + fn create_test_dv_descriptor(path_suffix: &str) -> DeletionVectorDescriptor { + use crate::actions::deletion_vector::{ + DeletionVectorDescriptor, DeletionVectorStorageType, + }; + DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::PersistedRelative, + path_or_inline_dv: format!("dv_{path_suffix}"), + offset: Some(0), + size_in_bytes: 100, + cardinality: 1, + } + } + + fn create_dv_transaction( + snapshot: Arc, + engine: &dyn Engine, + ) -> DeltaResult { + Ok(snapshot + .transaction(Box::new(FileSystemCommitter::new()), engine)? + .with_operation("DELETE".to_string()) + .with_engine_info("test_engine")) + } + // TODO: create a finer-grained unit tests for transactions (issue#1091) #[test] fn test_add_files_schema() -> Result<(), Box> { @@ -825,7 +1495,7 @@ mod tests { .build(&engine) .unwrap(); let txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? 
.with_engine_info("default engine"); let schema = txn.add_files_schema(); @@ -839,13 +1509,827 @@ mod tests { StructField::not_null("modificationTime", DataType::LONG), StructField::nullable( "stats", - DataType::struct_type_unchecked(vec![StructField::nullable( - "numRecords", - DataType::LONG, - )]), + DataType::struct_type_unchecked(vec![ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("nullCount", DataType::struct_type_unchecked(vec![])), + StructField::nullable("minValues", DataType::struct_type_unchecked(vec![])), + StructField::nullable("maxValues", DataType::struct_type_unchecked(vec![])), + StructField::nullable("tightBounds", DataType::BOOLEAN), + ]), ), ]); assert_eq!(*schema, expected.into()); Ok(()) } + + #[test] + fn test_new_deletion_vector_path() -> Result<(), Box> { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let snapshot = Snapshot::builder_for(url.clone()) + .at_version(1) + .build(&engine) + .unwrap(); + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine)? + .with_engine_info("default engine"); + let write_context = txn.get_write_context(); + + // Test with empty prefix + let dv_path1 = write_context.new_deletion_vector_path(String::from("")); + let abs_path1 = dv_path1.absolute_path()?; + assert!(abs_path1.as_str().contains(url.as_str())); + + // Test with non-empty prefix + let prefix = String::from("dv_test"); + let dv_path2 = write_context.new_deletion_vector_path(prefix.clone()); + let abs_path2 = dv_path2.absolute_path()?; + assert!(abs_path2.as_str().contains(url.as_str())); + assert!(abs_path2.as_str().contains(&prefix)); + + // Test that two paths with same prefix are different (unique UUIDs) + let dv_path3 = write_context.new_deletion_vector_path(prefix.clone()); + let abs_path3 = dv_path3.absolute_path()?; + assert_ne!(abs_path2, abs_path3); + + Ok(()) + } + + #[test] + fn test_physical_schema_excludes_partition_columns() -> Result<(), Box> { + let engine = SyncEngine::new(); + let path = std::fs::canonicalize(PathBuf::from("./tests/data/basic_partitioned/")).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let snapshot = Snapshot::builder_for(url).build(&engine).unwrap(); + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine)? + .with_engine_info("default engine"); + + let write_context = txn.get_write_context(); + let logical_schema = write_context.logical_schema(); + let physical_schema = write_context.physical_schema(); + + // Logical schema should include the partition column + assert!( + logical_schema.contains("letter"), + "Logical schema should contain partition column 'letter'" + ); + + // Physical schema should exclude the partition column + assert!( + !physical_schema.contains("letter"), + "Physical schema should not contain partition column 'letter' (stored in path)" + ); + + // Both should contain the non-partition columns + assert!( + logical_schema.contains("number"), + "Logical schema should contain data column 'number'" + ); + + assert!( + physical_schema.contains("number"), + "Physical schema should contain data column 'number'" + ); + + Ok(()) + } + + /// Helper: loads a test table snapshot and returns both the snapshot and its write context. 
+ fn snapshot_and_write_context( + table_path: &str, + ) -> Result<(Arc, WriteContext), Box> { + let engine = SyncEngine::new(); + let path = std::fs::canonicalize(PathBuf::from(table_path)).unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let snapshot = Snapshot::builder_for(url).build(&engine)?; + let txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), &engine)?; + Ok((snapshot, txn.get_write_context())) + } + + /// Helper: evaluates the logical-to-physical transform on the given batch and returns the + /// output RecordBatch. + fn eval_logical_to_physical( + wc: &WriteContext, + batch: RecordBatch, + ) -> Result> { + let logical_schema = wc.logical_schema(); + let physical_schema = wc.physical_schema(); + let l2p = wc.logical_to_physical(); + + let handler = ArrowEvaluationHandler; + let evaluator = handler.new_expression_evaluator( + logical_schema.clone(), + l2p, + physical_schema.clone().into(), + )?; + let result = ArrowEngineData::try_from_engine_data( + evaluator.evaluate(&ArrowEngineData::new(batch))?, + )?; + Ok(result.record_batch().clone()) + } + + #[test] + fn test_materialize_partition_columns_in_write_context( + ) -> Result<(), Box> { + // Without materializePartitionColumns, partition column should be dropped + let (snap_without, wc_without) = + snapshot_and_write_context("./tests/data/basic_partitioned/")?; + let partition_cols = snap_without.table_configuration().partition_columns(); + assert_eq!(partition_cols.len(), 1); + assert_eq!(partition_cols[0], "letter"); + assert!( + !snap_without + .table_configuration() + .protocol() + .has_table_feature(&TableFeature::MaterializePartitionColumns), + "basic_partitioned should not have materializePartitionColumns feature" + ); + let expr_str = format!("{}", wc_without.logical_to_physical()); + assert!( + expr_str.contains("drop letter"), + "Partition column 'letter' should be dropped. Expression: {expr_str}" + ); + + // With materializePartitionColumns, no columns should be dropped (identity transform) + let (snap_with, wc_with) = + snapshot_and_write_context("./tests/data/partitioned_with_materialize_feature/")?; + let partition_cols = snap_with.table_configuration().partition_columns(); + assert_eq!(partition_cols.len(), 1); + assert_eq!(partition_cols[0], "letter"); + assert!( + snap_with + .table_configuration() + .protocol() + .has_table_feature(&TableFeature::MaterializePartitionColumns), + "partitioned_with_materialize_feature should have materializePartitionColumns feature" + ); + let expr_str = format!("{}", wc_with.logical_to_physical()); + assert!( + !expr_str.contains("drop"), + "No columns should be dropped with materializePartitionColumns. Expression: {expr_str}" + ); + + Ok(()) + } + + /// Physical schema should include partition columns when materializePartitionColumns is on. 
+ #[test] + fn test_physical_schema_includes_partition_columns_when_materialized( + ) -> Result<(), Box> { + let engine = SyncEngine::new(); + let path = std::fs::canonicalize(PathBuf::from( + "./tests/data/partitioned_with_materialize_feature/", + )) + .unwrap(); + let url = url::Url::from_directory_path(path).unwrap(); + let snapshot = Snapshot::builder_for(url).at_version(1).build(&engine)?; + + let txn = snapshot.transaction(Box::new(FileSystemCommitter::new()), &engine)?; + let write_context = txn.get_write_context(); + let physical_schema = write_context.physical_schema(); + + assert!( + physical_schema.contains("letter"), + "Partition column 'letter' should be in physical schema when materialized" + ); + assert!( + physical_schema.contains("number"), + "Non-partition column 'number' should be in physical schema" + ); + Ok(()) + } + + /// Tests that update_deletion_vectors validates table protocol requirements. + /// Validates that attempting DV updates on unsupported tables returns protocol error. + #[test] + fn test_update_deletion_vectors_unsupported_table() -> Result<(), Box> { + let (engine, snapshot) = setup_non_dv_table(); + let mut txn = create_dv_transaction(snapshot, &engine)?; + + let dv_map = HashMap::new(); + let result = txn.update_deletion_vectors(dv_map, std::iter::empty()); + + let err = result.expect_err("Should fail on table without DV support"); + let err_msg = err.to_string(); + assert!( + err_msg.contains("Deletion vector") + && (err_msg.contains("require") || err_msg.contains("version")), + "Expected protocol error about DV requirements, got: {err_msg}" + ); + Ok(()) + } + + /// Tests that update_deletion_vectors validates DV descriptors match scan files. + /// Validates detection of mismatch between provided DV descriptors and actual files. + #[test] + fn test_update_deletion_vectors_mismatch_count() -> Result<(), Box> { + let (engine, snapshot) = setup_dv_enabled_table(); + let mut txn = create_dv_transaction(snapshot, &engine)?; + + let mut dv_map = HashMap::new(); + let descriptor = create_test_dv_descriptor("non_existent"); + dv_map.insert("non_existent_file.parquet".to_string(), descriptor); + + let result = txn.update_deletion_vectors(dv_map, std::iter::empty()); + + assert!( + result.is_err(), + "Should fail when DV descriptors don't match scan files" + ); + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("matched") && err_msg.contains("does not match"), + "Expected error about mismatched count (expected 1 descriptor, 0 matched files), got: {err_msg}"); + Ok(()) + } + + /// Tests that update_deletion_vectors handles empty DV updates correctly as a no-op. + /// This edge case occurs when a DELETE operation matches no rows. 
+ #[test] + fn test_update_deletion_vectors_empty_inputs() -> Result<(), Box> { + let (engine, snapshot) = setup_dv_enabled_table(); + let mut txn = create_dv_transaction(snapshot, &engine)?; + + let dv_map = HashMap::new(); + let result = txn.update_deletion_vectors(dv_map, std::iter::empty()); + + assert!( + result.is_ok(), + "Empty DV updates should succeed as no-op, got error: {result:?}" + ); + + Ok(()) + } + + // ============================================================================ + // validate_blind_append tests + // ============================================================================ + fn add_dummy_file(txn: &mut Transaction) { + let data = string_array_to_engine_data(StringArray::from(vec!["dummy"])); + txn.add_files(data); + } + + fn create_existing_table_txn( + ) -> DeltaResult<(Arc, Transaction, Option)> { + let (engine, snapshot, tempdir) = load_test_table("table-without-dv-small")?; + let txn = snapshot.transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?; + Ok((engine, txn, tempdir)) + } + + #[test] + fn test_validate_blind_append_success() -> DeltaResult<()> { + let (_engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + add_dummy_file(&mut txn); + txn.validate_blind_append_semantics()?; + Ok(()) + } + + #[test] + fn test_validate_blind_append_requires_adds() -> DeltaResult<()> { + let (_engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + let result = txn.validate_blind_append_semantics(); + assert!(matches!(result, Err(Error::InvalidTransactionState(_)))); + Ok(()) + } + + #[test] + fn test_validate_blind_append_requires_data_change() -> DeltaResult<()> { + let (_engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + txn.set_data_change(false); + add_dummy_file(&mut txn); + let result = txn.validate_blind_append_semantics(); + assert!(matches!(result, Err(Error::InvalidTransactionState(_)))); + Ok(()) + } + + #[test] + fn test_validate_blind_append_rejects_removes() -> DeltaResult<()> { + let (_engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + add_dummy_file(&mut txn); + let remove_data = FilteredEngineData::with_all_rows_selected(string_array_to_engine_data( + StringArray::from(vec!["remove"]), + )); + txn.remove_files(remove_data); + let result = txn.validate_blind_append_semantics(); + assert!(matches!(result, Err(Error::InvalidTransactionState(_)))); + Ok(()) + } + + #[test] + fn test_validate_blind_append_rejects_dv_updates() -> DeltaResult<()> { + let (_engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + add_dummy_file(&mut txn); + let dv_data = FilteredEngineData::with_all_rows_selected(string_array_to_engine_data( + StringArray::from(vec!["dv"]), + )); + txn.dv_matched_files.push(dv_data); + let result = txn.validate_blind_append_semantics(); + assert!(matches!(result, Err(Error::InvalidTransactionState(_)))); + Ok(()) + } + + #[test] + fn test_validate_blind_append_rejects_create_table() -> DeltaResult<()> { + let tempdir = tempfile::tempdir()?; + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + let store = Arc::new(LocalFileSystem::new()); + let engine = Arc::new(crate::engine::default::DefaultEngineBuilder::new(store).build()); + let mut txn = create_table( + tempdir.path().to_str().expect("valid temp path"), + schema, + "test_engine", + ) + .build(engine.as_ref(), 
Box::new(FileSystemCommitter::new()))?; + // CreateTableTransaction does not expose with_blind_append() (compile-time + // prevention per #1768). Directly set the field to test the runtime check. + txn.is_blind_append = true; + add_dummy_file(&mut txn); + let result = txn.validate_blind_append_semantics(); + assert!(matches!(result, Err(Error::InvalidTransactionState(_)))); + Ok(()) + } + + #[test] + fn test_blind_append_sets_commit_info_flag() -> Result<(), Box> { + let commit_info = CommitInfo::new(1, None, None, None, true); + assert_eq!(commit_info.is_blind_append, Some(true)); + + let commit_info_false = CommitInfo::new(1, None, None, None, false); + assert_eq!(commit_info_false.is_blind_append, None); + Ok(()) + } + + #[test] + fn test_blind_append_commit_rejects_no_adds() -> DeltaResult<()> { + let (_engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + // No files added — commit should fail with blind append validation + let err = txn + .commit(_engine.as_ref()) + .expect_err("Blind append with no adds should fail"); + assert!( + err.to_string() + .contains("Blind append requires at least one added data file"), + "Unexpected error: {err}" + ); + Ok(()) + } + + #[test] + fn test_blind_append_commit_success() -> DeltaResult<()> { + let (engine, mut txn, _tempdir) = create_existing_table_txn()?; + txn = txn.with_blind_append(); + add_dummy_file(&mut txn); + // Blind append with add files should pass validation and proceed to commit. + // The commit itself may fail due to schema mismatch with the dummy data, + // but we verify validation (line 415) passes on the Ok path. + let result = txn.commit(engine.as_ref()); + // If it fails, it should NOT be an InvalidTransactionState error + if let Err(e) = result { + assert!( + !matches!(e, Error::InvalidTransactionState(_)), + "Blind append validation should have passed, got: {e}" + ); + } + Ok(()) + } + + // Note: Additional test coverage for partial file matching (where some files in a scan + // have DV updates but others don't) is provided by the end-to-end integration test + // kernel/tests/dv.rs and kernel/tests/write.rs, which exercises + // the full deletion vector write workflow including the DvMatchVisitor logic. 
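For completeness, a sketch of the assertion pattern the new test-only helper enables. The table setup and `expected_version` are assumed, as are `commit_version()` on `CommittedTransaction` and `version()` on the post-commit snapshot (not shown in this hunk):

```rust,ignore
let committed = txn.commit(engine.as_ref())?.unwrap_committed();
assert_eq!(committed.commit_version(), expected_version);
// The post-commit snapshot is optional while that feature is still being built out.
if let Some(snapshot) = committed.post_commit_snapshot() {
    assert_eq!(snapshot.version(), expected_version);
}
```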
+ + #[test] + fn test_commit_io_error_returns_retryable_transaction() -> DeltaResult<()> { + let (engine, snapshot, _tempdir) = load_test_table("table-without-dv-small")?; + let mut txn = snapshot.transaction(Box::new(IoErrorCommitter), engine.as_ref())?; + add_dummy_file(&mut txn); + let result = txn.commit(engine.as_ref())?; + assert!( + matches!(result, CommitResult::RetryableTransaction(_)), + "Expected RetryableTransaction, got: {result:?}" + ); + if let CommitResult::RetryableTransaction(retryable) = result { + assert!( + retryable.error.to_string().contains("simulated IO error"), + "Unexpected error: {}", + retryable.error + ); + } + Ok(()) + } + + #[test] + fn test_existing_table_txn_debug() -> DeltaResult<()> { + let (_engine, txn, _tempdir) = create_existing_table_txn()?; + let debug_str = format!("{txn:?}"); + // Existing-table transactions should include the snapshot version number + assert!( + debug_str.contains("Transaction") && debug_str.contains("read_snapshot version"), + "Debug output should contain Transaction info: {debug_str}" + ); + // Should NOT contain "create_table" + assert!( + !debug_str.contains("create_table"), + "Existing table debug should not contain create_table: {debug_str}" + ); + Ok(()) + } + + // Input schemas have no CM metadata; create_table automatically assigns IDs and + // physical names when mode is Name or Id. + #[rstest] + #[case::flat_none(test_schema_flat(), ColumnMappingMode::None)] + #[case::flat_name(test_schema_flat(), ColumnMappingMode::Name)] + #[case::flat_id(test_schema_flat(), ColumnMappingMode::Id)] + #[case::nested_none(test_schema_nested(), ColumnMappingMode::None)] + #[case::nested_name(test_schema_nested(), ColumnMappingMode::Name)] + #[case::nested_id(test_schema_nested(), ColumnMappingMode::Id)] + #[case::map_none(test_schema_with_map(), ColumnMappingMode::None)] + #[case::map_name(test_schema_with_map(), ColumnMappingMode::Name)] + #[case::map_id(test_schema_with_map(), ColumnMappingMode::Id)] + #[case::array_none(test_schema_with_array(), ColumnMappingMode::None)] + #[case::array_name(test_schema_with_array(), ColumnMappingMode::Name)] + #[case::array_id(test_schema_with_array(), ColumnMappingMode::Id)] + fn test_physical_schema_column_mapping( + #[case] schema: SchemaRef, + #[case] mode: ColumnMappingMode, + ) -> DeltaResult<()> { + let (_engine, txn) = crate::utils::test_utils::setup_column_mapping_txn(schema, mode)?; + let write_context = txn.get_write_context(); + crate::utils::test_utils::validate_physical_schema_column_mapping( + write_context.logical_schema(), + write_context.physical_schema(), + mode, + ); + Ok(()) + } + + /// Builds two-row [`EngineData`] with logical field names matching [`test_schema_nested`]. 
+ fn build_test_record_batch() -> DeltaResult> { + let schema = test_schema_nested(); + let tag_type = MapType::new(DataType::STRING, DataType::STRING, true); + let score_type = ArrayType::new(DataType::INTEGER, true); + let info_fields = vec![ + StructField::nullable("name", DataType::STRING), + StructField::nullable("age", DataType::INTEGER), + StructField::nullable("tags", tag_type.clone()), + StructField::nullable("scores", score_type.clone()), + ]; + let info1 = Scalar::Struct(StructData::try_new( + info_fields.clone(), + vec![ + "alice".into(), + 30i32.into(), + Scalar::Map(MapData::try_new(tag_type.clone(), [("k1", "v1")])?), + Scalar::Array(ArrayData::try_new(score_type.clone(), [10i32, 20i32])?), + ], + )?); + let info2 = Scalar::Struct(StructData::try_new( + info_fields, + vec![ + "bob".into(), + 25i32.into(), + Scalar::Map(MapData::try_new(tag_type, [("k2", "v2")])?), + Scalar::Array(ArrayData::try_new(score_type, [30i32])?), + ], + )?); + ArrowEvaluationHandler.create_many(schema, &[&[1i64.into(), info1], &[2i64.into(), info2]]) + } + + /// Validates that [`WriteContext::logical_to_physical`] correctly renames fields at all nesting levels. + /// Builds a RecordBatch with logical names, evaluates the transform, and checks that the + /// output uses physical names from the physical schema — including nested struct children. + fn validate_logical_to_physical_transform(mode: ColumnMappingMode) -> DeltaResult<()> { + let schema = test_schema_nested(); + let (_engine, txn) = crate::utils::test_utils::setup_column_mapping_txn(schema, mode)?; + let write_context = txn.get_write_context(); + let logical_schema = write_context.logical_schema(); + let physical_schema = write_context.physical_schema(); + let logical_to_physical_expression = write_context.logical_to_physical(); + + if mode != ColumnMappingMode::None { + assert_ne!( + logical_schema, physical_schema, + "Physical schema should differ from logical schema when column mapping is enabled" + ); + } + + let data = build_test_record_batch()?; + + // Evaluate the logical_to_physical expression + let input_schema: SchemaRef = logical_schema.clone(); + let handler = ArrowEvaluationHandler; + let evaluator = handler.new_expression_evaluator( + input_schema, + logical_to_physical_expression.clone(), + physical_schema.clone().into(), + )?; + let result = evaluator.evaluate(data.as_ref())?; + let result = ArrowEngineData::try_from_engine_data(result)?; + let result_batch = result.record_batch(); + + // Verify: all field names, types, and metadata match the physical schema + let expected_arrow_schema: ArrowSchema = physical_schema.as_ref().try_into_arrow()?; + assert_eq!(result_batch.schema().as_ref(), &expected_arrow_schema); + + // Verify: data is preserved (id values) + let id_col = result_batch + .column(0) + .as_any() + .downcast_ref::() + .expect("id column should be Int64"); + assert_eq!(id_col.values(), &[1i64, 2]); + + Ok(()) + } + + #[rstest] + #[case::name_mode(ColumnMappingMode::Name)] + #[case::id_mode(ColumnMappingMode::Id)] + #[case::none_mode(ColumnMappingMode::None)] + fn test_logical_to_physical_transform(#[case] mode: ColumnMappingMode) -> DeltaResult<()> { + validate_logical_to_physical_transform(mode) + } + + #[rstest] + #[case::dropped("./tests/data/basic_partitioned/", 2, &[])] + #[case::kept("./tests/data/partitioned_with_materialize_feature/", 3, &["letter"])] + fn test_partition_column_in_eval_output( + #[case] table_path: &str, + #[case] expected_cols: usize, + #[case] expected_partition_cols: &[&str], + ) -> 
Result<(), Box> { + use crate::arrow::array::Float64Array; + let (_snap, wc) = snapshot_and_write_context(table_path)?; + let batch = RecordBatch::try_new( + Arc::new(wc.logical_schema().as_ref().try_into_arrow()?), + vec![ + Arc::new(StringArray::from(vec!["x"])) as ArrayRef, + Arc::new(Int64Array::from(vec![42])), + Arc::new(Float64Array::from(vec![1.5])), + ], + )?; + let rb = eval_logical_to_physical(&wc, batch)?; + assert_eq!(rb.num_columns(), expected_cols); + for col in expected_partition_cols { + assert!(rb.schema().fields().iter().any(|f| f.name() == *col)); + } + Ok(()) + } + + // ========================================================================= + // Stats validation tests for clustering columns + // ========================================================================= + + /// Per-file stats configuration for test add file helpers. + enum TestFileStats { + /// No stats (null stats struct) + None, + /// Normal stats with non-null min/max + Present, + /// All-null column: nullCount == numRecords, null min/max + AllNull, + } + + /// Creates test add file metadata with configurable stats for the "value" column. + fn create_test_add_files(paths: Vec<&str>, stats: Vec) -> Box { + let value_fields = vec![StructField::nullable("value", DataType::LONG)]; + let value_struct_type = DataType::struct_type_unchecked(value_fields.clone()); + let stats_type = DataType::struct_type_unchecked(vec![ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("nullCount", value_struct_type.clone()), + StructField::nullable("minValues", value_struct_type.clone()), + StructField::nullable("maxValues", value_struct_type.clone()), + ]); + let stats_fields = vec![ + StructField::nullable("numRecords", DataType::LONG), + StructField::nullable("nullCount", value_struct_type.clone()), + StructField::nullable("minValues", value_struct_type.clone()), + StructField::nullable("maxValues", value_struct_type), + ]; + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::not_null("path", DataType::STRING), + StructField::not_null( + "partitionValues", + MapType::new(DataType::STRING, DataType::STRING, true), + ), + StructField::not_null("size", DataType::LONG), + StructField::not_null("modificationTime", DataType::LONG), + StructField::nullable("stats", stats_type.clone()), + ])); + + let empty_map = Scalar::Map( + MapData::try_new( + MapType::new(DataType::STRING, DataType::STRING, true), + Vec::<(&str, &str)>::new(), + ) + .unwrap(), + ); + + let rows: Vec> = paths + .iter() + .zip(stats.iter()) + .map(|(path, stat)| { + let stats_scalar = match stat { + TestFileStats::None => Scalar::Null(stats_type.clone()), + TestFileStats::Present | TestFileStats::AllNull => { + let value_struct = |v: Option| { + let scalar = v.map_or(Scalar::Null(DataType::LONG), |n| n.into()); + Scalar::Struct( + StructData::try_new(value_fields.clone(), vec![scalar]).unwrap(), + ) + }; + let (null_count, min, max) = match stat { + TestFileStats::Present => ( + value_struct(Some(0)), + value_struct(Some(1)), + value_struct(Some(100)), + ), + _ => ( + value_struct(Some(100)), + value_struct(None), + value_struct(None), + ), + }; + Scalar::Struct( + StructData::try_new( + stats_fields.clone(), + vec![100i64.into(), null_count, min, max], + ) + .unwrap(), + ) + } + }; + vec![ + (*path).into(), + empty_map.clone(), + 1024i64.into(), + 1000000i64.into(), + stats_scalar, + ] + }) + .collect(); + let row_refs: Vec<&[Scalar]> = rows.iter().map(|r| r.as_slice()).collect(); + ArrowEvaluationHandler + 
.create_many(schema, &row_refs) + .unwrap() + } + + #[test] + fn test_stats_validation_allows_all_null_clustering_column() { + let (engine, snapshot) = setup_non_dv_table(); + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine) + .unwrap() + .with_operation("WRITE".to_string()) + .with_clustering_columns_for_test(vec![ColumnName::new(["value"])]); + + let add_files = create_test_add_files(vec!["file1.parquet"], vec![TestFileStats::AllNull]); + + let result = txn.validate_add_files_stats(&[add_files]); + + assert!( + result.is_ok(), + "Stats validation should pass for all-null clustering columns, got: {result:?}", + ); + } + + #[test] + fn test_stats_validation_when_clustering_cols_missing_stats() { + let (engine, snapshot) = setup_non_dv_table(); + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine) + .unwrap() + .with_operation("WRITE".to_string()) + // Enable clustering columns for this test + .with_clustering_columns_for_test(vec![ColumnName::new(["value"])]); + + // Add files WITHOUT stats + let add_files = create_test_add_files(vec!["file1.parquet"], vec![TestFileStats::None]); + + // Directly test the validation method instead of committing + let result = txn.validate_add_files_stats(&[add_files]); + + assert!( + result.is_err(), + "Expected validation to fail when stats are missing for clustering columns" + ); + + let err_msg = result.unwrap_err().to_string(); + assert!( + err_msg.contains("Stats validation error") || err_msg.contains("no stats"), + "Expected stats validation error, got: {err_msg}" + ); + } + + #[test] + fn test_stats_validation_when_clustering_stats_present() { + let (engine, snapshot) = setup_non_dv_table(); + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine) + .unwrap() + .with_operation("WRITE".to_string()) + // Enable clustering columns for this test + .with_clustering_columns_for_test(vec![ColumnName::new(["value"])]); + + // Add files WITH stats + let add_files = create_test_add_files(vec!["file1.parquet"], vec![TestFileStats::Present]); + + // Directly test the validation method + let result = txn.validate_add_files_stats(&[add_files]); + + assert!( + result.is_ok(), + "Stats validation should pass when stats are present, got: {result:?}" + ); + } + + #[test] + fn test_stats_validation_skipped_without_clustering() { + let (engine, snapshot) = setup_non_dv_table(); + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine) + .unwrap() + .with_operation("WRITE".to_string()); + // No clustering columns set (default) + + // Add files WITHOUT stats + let add_files = create_test_add_files(vec!["file1.parquet"], vec![TestFileStats::None]); + + // Directly test the validation method - should pass because no clustering + let result = txn.validate_add_files_stats(&[add_files]); + + assert!( + result.is_ok(), + "Stats validation should be skipped without clustering, got: {result:?}" + ); + } + + #[test] + fn disallow_catalog_committer_for_non_catalog_managed_table() { + let storage = Arc::new(InMemory::new()); + let table_root = url::Url::parse("memory:///").unwrap(); + let engine = crate::engine::default::DefaultEngineBuilder::new(storage.clone()).build(); + + // Create a non-catalog-managed table (no catalogManaged feature) + let actions = [ + r#"{"commitInfo":{"timestamp":12345678900,"inCommitTimestamp":12345678900}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":[],"writerFeatures":["inCommitTimestamp"]}}"#, + 
r#"{"metaData":{"id":"test-id","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{"delta.enableInCommitTimestamps":"true"},"createdTime":1234567890}}"#, + ].join("\n"); + + let commit_path = Path::from("_delta_log/00000000000000000000.json"); + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(storage.put(&commit_path, actions.into())) + .unwrap(); + + let snapshot = Snapshot::builder_for(table_root).build(&engine).unwrap(); + + // Try to commit with a catalog committer to a non-catalog-managed table + let committer = Box::new(MockCatalogCommitter); + let err = snapshot + .transaction(committer, &engine) + .unwrap() + .commit(&engine) + .unwrap_err(); + assert!(matches!( + err, + crate::Error::Generic(e) if e.contains("This table is path-based and cannot be committed to with a catalog committer") + )); + } + + #[test] + fn disallow_catalog_committer_for_non_catalog_managed_create_table() { + let storage = Arc::new(InMemory::new()); + let engine = crate::engine::default::DefaultEngineBuilder::new(storage).build(); + + // Create a non-catalog-managed table using a catalog committer + let schema = Arc::new(crate::schema::StructType::new_unchecked(vec![ + crate::schema::StructField::new("id", crate::schema::DataType::INTEGER, false), + ])); + let committer = Box::new(MockCatalogCommitter); + let err = create_table("memory:///", schema, "test-engine") + .build(&engine, committer) + .unwrap() + .commit(&engine) + .unwrap_err(); + assert!(matches!( + err, + crate::Error::Generic(e) if e.contains("This table is path-based and cannot be committed to with a catalog committer") + )); + } } diff --git a/kernel/src/transaction/stats_verifier.rs b/kernel/src/transaction/stats_verifier.rs new file mode 100644 index 0000000000..a5457906ae --- /dev/null +++ b/kernel/src/transaction/stats_verifier.rs @@ -0,0 +1,704 @@ +//! Validates that add file statistics contain required columns. +//! +//! Per the Delta protocol, writers MUST write per-file statistics (nullCount, minValues, +//! maxValues) for certain required columns. For example, clustering columns require stats when +//! the `ClusteredTable` feature is enabled. This module validates that those stat entries +//! exist for each required column. + +use std::sync::LazyLock; + +use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::error::Error; +use crate::expressions::{column_name, ColumnName}; +use crate::schema::{ColumnNamesAndTypes, DataType, DecimalType, PrimitiveType}; +use crate::utils::require; +use crate::DeltaResult; + +/// Verifies that add file statistics contain required columns. +/// +/// For each required column, validates that `nullCount` is present (non-null) and that +/// `minValues` and `maxValues` are present unless the column is all-null +/// (`nullCount == numRecords`). +pub(crate) struct StatsVerifier { + required_columns: Vec<(ColumnName, DataType)>, +} + +impl StatsVerifier { + /// Create a new verifier that checks statistics for the given required columns and types. + pub(crate) fn new(required_columns: Vec<(ColumnName, DataType)>) -> Self { + Self { required_columns } + } + + /// Verify that all files in the provided batches have required statistics. + /// + /// For each required column, extracts all three stat columns (nullCount, minValues, + /// maxValues) in a single `visit_rows` call per batch. 
+ pub(crate) fn verify(&self, add_files: &[Box]) -> DeltaResult<()> { + if self.required_columns.is_empty() { + return Ok(()); + } + + for (col, data_type) in &self.required_columns { + self.verify_column(add_files, col, data_type)?; + } + + Ok(()) + } + + /// Verify a single required column has nullCount, minValues, and maxValues stats in + /// every file. Extracts all three stat columns in a single `visit_rows` call per batch. + fn verify_column( + &self, + add_files: &[Box], + column: &ColumnName, + data_type: &DataType, + ) -> DeltaResult<()> { + let column_names = vec![ + ColumnName::new(["path"]), + ColumnName::new(["stats", "numRecords"]), + build_stat_path(column, "nullCount"), + build_stat_path(column, "minValues"), + build_stat_path(column, "maxValues"), + ]; + let types = column_types_for(data_type)?; + + let mut missing_null_count: Vec = Vec::new(); + let mut missing_min: Vec = Vec::new(); + let mut missing_max: Vec = Vec::new(); + + for batch in add_files { + let mut visitor = ColumnStatsVisitor { + data_type, + types, + missing_null_count: &mut missing_null_count, + missing_min: &mut missing_min, + missing_max: &mut missing_max, + }; + batch.visit_rows(&column_names, &mut visitor)?; + } + + if !missing_null_count.is_empty() { + return Err(Error::stats_validation(format!( + "Required column '{column}' is missing 'nullCount' statistics for files: [{}]", + missing_null_count.join(", ") + ))); + } + if !missing_min.is_empty() { + return Err(Error::stats_validation(format!( + "Required column '{column}' is missing 'minValues' statistics for files: [{}]", + missing_min.join(", ") + ))); + } + if !missing_max.is_empty() { + return Err(Error::stats_validation(format!( + "Required column '{column}' is missing 'maxValues' statistics for files: [{}]", + missing_max.join(", ") + ))); + } + Ok(()) + } +} + +/// Build a stat column path: `stats.{category}.{column_path}`. +fn build_stat_path(column: &ColumnName, category: &str) -> ColumnName { + let mut path = vec!["stats".to_string(), category.to_string()]; + path.extend(column.iter().map(|s| s.to_string())); + ColumnName::new(path) +} + +// Predefined static type arrays for per-column validation. Each array contains types for +// [path, numRecords, nullCount, minValues, maxValues], where numRecords and nullCount are +// always LONG and min/max use the column's original type. The column names are placeholders +// -- `visit_rows` receives actual column names as a separate parameter. +macro_rules! 
define_column_types { + ($name:ident, $data_type:expr) => { + static $name: LazyLock = LazyLock::new(|| { + let names = vec![ + column_name!("path"), + column_name!("nr"), + column_name!("nc"), + column_name!("min"), + column_name!("max"), + ]; + let types = vec![ + DataType::STRING, + DataType::LONG, + DataType::LONG, + $data_type, + $data_type, + ]; + (names, types).into() + }); + }; +} + +define_column_types!(COL_TYPES_BOOL, DataType::BOOLEAN); +define_column_types!(COL_TYPES_INT, DataType::INTEGER); +define_column_types!(COL_TYPES_LONG, DataType::LONG); +define_column_types!(COL_TYPES_STRING, DataType::STRING); +define_column_types!(COL_TYPES_BINARY, DataType::BINARY); +define_column_types!(COL_TYPES_FLOAT, DataType::FLOAT); +define_column_types!(COL_TYPES_DOUBLE, DataType::DOUBLE); +define_column_types!(COL_TYPES_DATE, DataType::DATE); +define_column_types!(COL_TYPES_TIMESTAMP, DataType::TIMESTAMP); +define_column_types!(COL_TYPES_TIMESTAMP_NTZ, DataType::TIMESTAMP_NTZ); +#[allow(clippy::unwrap_used)] +static COL_TYPES_DECIMAL: LazyLock = LazyLock::new(|| { + let names = vec![ + column_name!("path"), + column_name!("nr"), + column_name!("nc"), + column_name!("min"), + column_name!("max"), + ]; + let types = vec![ + DataType::STRING, + DataType::LONG, + DataType::LONG, + DataType::Primitive(PrimitiveType::Decimal(DecimalType::try_new(38, 0).unwrap())), + DataType::Primitive(PrimitiveType::Decimal(DecimalType::try_new(38, 0).unwrap())), + ]; + (names, types).into() +}); + +/// Select the predefined static type array for a given column data type. +fn column_types_for(dt: &DataType) -> DeltaResult<&'static ColumnNamesAndTypes> { + match dt { + &DataType::BOOLEAN => Ok(&COL_TYPES_BOOL), + &DataType::INTEGER => Ok(&COL_TYPES_INT), + &DataType::LONG => Ok(&COL_TYPES_LONG), + &DataType::STRING => Ok(&COL_TYPES_STRING), + &DataType::BINARY => Ok(&COL_TYPES_BINARY), + &DataType::FLOAT => Ok(&COL_TYPES_FLOAT), + &DataType::DOUBLE => Ok(&COL_TYPES_DOUBLE), + &DataType::DATE => Ok(&COL_TYPES_DATE), + &DataType::TIMESTAMP => Ok(&COL_TYPES_TIMESTAMP), + &DataType::TIMESTAMP_NTZ => Ok(&COL_TYPES_TIMESTAMP_NTZ), + DataType::Primitive(PrimitiveType::Decimal(_)) => Ok(&COL_TYPES_DECIMAL), + _ => Err(Error::internal_error(format!( + "Unsupported data type for stats validation: {dt}" + ))), + } +} + +/// Check if a stat value is present (non-null) using the appropriate typed getter. 
+fn is_stat_present<'b>( + getter: &'b dyn GetData<'b>, + row_idx: usize, + data_type: &DataType, +) -> DeltaResult { + let field_name = "stat"; + match data_type { + &DataType::BOOLEAN => Ok(getter.get_bool(row_idx, field_name)?.is_some()), + &DataType::INTEGER => Ok(getter.get_int(row_idx, field_name)?.is_some()), + &DataType::LONG => Ok(getter.get_long(row_idx, field_name)?.is_some()), + &DataType::FLOAT => Ok(getter.get_float(row_idx, field_name)?.is_some()), + &DataType::DOUBLE => Ok(getter.get_double(row_idx, field_name)?.is_some()), + &DataType::DATE => Ok(getter.get_date(row_idx, field_name)?.is_some()), + &DataType::TIMESTAMP | &DataType::TIMESTAMP_NTZ => { + Ok(getter.get_timestamp(row_idx, field_name)?.is_some()) + } + &DataType::STRING => Ok(getter.get_str(row_idx, field_name)?.is_some()), + &DataType::BINARY => Ok(getter.get_binary(row_idx, field_name)?.is_some()), + DataType::Primitive(PrimitiveType::Decimal(_)) => { + Ok(getter.get_decimal(row_idx, field_name)?.is_some()) + } + _ => Err(Error::internal_error(format!( + "Unsupported data type for stats presence check: {data_type}" + ))), + } +} + +/// Visitor that checks nullCount, minValues, and maxValues for a single column in one pass. +/// Expects 5 getters: [path, numRecords, nullCount, minValues, maxValues]. +struct ColumnStatsVisitor<'a> { + data_type: &'a DataType, + types: &'static ColumnNamesAndTypes, + missing_null_count: &'a mut Vec, + missing_min: &'a mut Vec, + missing_max: &'a mut Vec, +} + +impl RowVisitor for ColumnStatsVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + self.types.as_ref() + } + + fn visit<'b>(&mut self, row_count: usize, getters: &[&'b dyn GetData<'b>]) -> DeltaResult<()> { + require!( + getters.len() == 5, + Error::internal_error(format!( + "Expected 5 getters for column stats validation, got {}", + getters.len() + )) + ); + + for row_idx in 0..row_count { + let path: String = getters[0].get(row_idx, "path")?; + let num_records = getters[1].get_long(row_idx, "numRecords")?; + let null_count = getters[2].get_long(row_idx, "nullCount")?; + + // When all rows are null (or the file is empty), minValues/maxValues are + // expected to be null since there are no non-null values to aggregate. + let all_null = matches!((num_records, null_count), (Some(nr), Some(nc)) if nr == nc); + + if null_count.is_none() { + self.missing_null_count.push(path.clone()); + } + if !(all_null || is_stat_present(getters[3], row_idx, self.data_type)?) { + self.missing_min.push(path.clone()); + } + if !(all_null || is_stat_present(getters[4], row_idx, self.data_type)?) { + self.missing_max.push(path); + } + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use std::sync::Arc; + + use rstest::rstest; + + use crate::arrow::array::{ + Array, ArrayRef, Int64Array, LargeStringArray, RecordBatch, StringArray, StringViewArray, + StructArray, + }; + use crate::arrow::datatypes::{ + DataType as ArrowDataType, Field as ArrowField, Fields, Schema as ArrowSchema, + }; + use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::default::stats::collect_stats; + use crate::expressions::column_name; + use crate::EngineData; + + /// Creates test add file data with stats.numRecords, stats.nullCount.col, + /// stats.minValues.col, and stats.maxValues.col — all of type LONG. 
+ fn create_add_file_batch( + paths: Vec<&str>, + num_records: Vec>, + null_counts: Vec>, + min_values: Vec>, + max_values: Vec>, + ) -> Box { + assert_eq!(paths.len(), num_records.len()); + assert_eq!(paths.len(), null_counts.len()); + assert_eq!(paths.len(), min_values.len()); + assert_eq!(paths.len(), max_values.len()); + + let path_array = StringArray::from(paths.to_vec()); + let col_field = Arc::new(ArrowField::new("col", ArrowDataType::Int64, true)); + + let num_records_array = Int64Array::from(num_records); + let null_count_struct = StructArray::new( + Fields::from(vec![col_field.clone()]), + vec![Arc::new(Int64Array::from(null_counts)) as ArrayRef], + None, + ); + let min_values_struct = StructArray::new( + Fields::from(vec![col_field.clone()]), + vec![Arc::new(Int64Array::from(min_values)) as ArrayRef], + None, + ); + let max_values_struct = StructArray::new( + Fields::from(vec![col_field]), + vec![Arc::new(Int64Array::from(max_values)) as ArrayRef], + None, + ); + + let inner_struct_type = |name: &str| { + ArrowField::new( + name, + ArrowDataType::Struct(Fields::from(vec![ArrowField::new( + "col", + ArrowDataType::Int64, + true, + )])), + true, + ) + }; + + let stats_fields = Fields::from(vec![ + ArrowField::new("numRecords", ArrowDataType::Int64, true), + inner_struct_type("nullCount"), + inner_struct_type("minValues"), + inner_struct_type("maxValues"), + ]); + let stats_struct = StructArray::new( + stats_fields.clone(), + vec![ + Arc::new(num_records_array) as ArrayRef, + Arc::new(null_count_struct) as ArrayRef, + Arc::new(min_values_struct) as ArrayRef, + Arc::new(max_values_struct) as ArrayRef, + ], + None, + ); + + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("path", ArrowDataType::Utf8, false), + ArrowField::new("stats", ArrowDataType::Struct(stats_fields), true), + ])); + + let batch = + RecordBatch::try_new(schema, vec![Arc::new(path_array), Arc::new(stats_struct)]) + .unwrap(); + + Box::new(ArrowEngineData::new(batch)) + } + + #[test] + fn test_verifier_with_empty_add_files() { + let columns = vec![(ColumnName::new(["col"]), DataType::LONG)]; + let verifier = StatsVerifier::new(columns); + let result = verifier.verify(&[]); + assert!(result.is_ok()); + } + + #[test] + fn test_verify_valid_stats() { + let batch = create_add_file_batch( + vec!["file1.parquet", "file2.parquet"], + vec![Some(100), Some(100)], + vec![Some(0), Some(5)], + vec![Some(1), Some(10)], + vec![Some(100), Some(50)], + ); + + let columns = vec![(ColumnName::new(["col"]), DataType::LONG)]; + let verifier = StatsVerifier::new(columns); + let result = verifier.verify(&[batch]); + assert!(result.is_ok()); + } + + #[test] + fn test_verify_missing_stat_category() { + let cases = [ + ("nullCount", vec![None], vec![Some(1)], vec![Some(100)]), + ("minValues", vec![Some(0)], vec![None], vec![Some(100)]), + ("maxValues", vec![Some(0)], vec![Some(1)], vec![None]), + ]; + for (category, null_counts, min_values, max_values) in cases { + let batch = create_add_file_batch( + vec!["file1.parquet"], + vec![Some(100)], + null_counts, + min_values, + max_values, + ); + let verifier = StatsVerifier::new(vec![(ColumnName::new(["col"]), DataType::LONG)]); + let err_msg = verifier.verify(&[batch]).unwrap_err().to_string(); + assert!(err_msg.contains("file1.parquet"), "case: {category}"); + assert!(err_msg.contains(category), "case: {category}"); + } + } + + #[test] + fn test_verify_multiple_batches() { + let batch1 = create_add_file_batch( + vec!["good_file.parquet"], + vec![Some(100)], + vec![Some(0)], + 
vec![Some(1)], + vec![Some(100)], + ); + let batch2 = create_add_file_batch( + vec!["bad_file.parquet"], + vec![Some(100)], + vec![None], + vec![None], + vec![None], + ); + + let columns = vec![(ColumnName::new(["col"]), DataType::LONG)]; + let verifier = StatsVerifier::new(columns); + let result = verifier.verify(&[batch1, batch2]); + + assert!(result.is_err()); + let err_msg = result.unwrap_err().to_string(); + assert!(err_msg.contains("bad_file.parquet")); + assert!(!err_msg.contains("good_file.parquet")); + } + + #[test] + fn test_verify_no_required_columns() { + let batch = create_add_file_batch( + vec!["file1.parquet"], + vec![Some(100)], + vec![None], + vec![None], + vec![None], + ); + + let verifier = StatsVerifier::new(vec![]); + let result = verifier.verify(&[batch]); + assert!(result.is_ok()); + } + + #[test] + fn test_verify_all_null_column_allows_null_min_max() { + // nullCount == numRecords means all rows are null, so null min/max is valid + let batch = create_add_file_batch( + vec!["file1.parquet"], + vec![Some(100)], + vec![Some(100)], + vec![None], + vec![None], + ); + + let columns = vec![(ColumnName::new(["col"]), DataType::LONG)]; + let verifier = StatsVerifier::new(columns); + assert!(verifier.verify(&[batch]).is_ok()); + } + + #[test] + fn test_verify_partial_null_column_requires_min_max() { + // nullCount < numRecords means not all rows are null, so min/max must be present + let batch = create_add_file_batch( + vec!["file1.parquet"], + vec![Some(100)], + vec![Some(50)], + vec![None], + vec![None], + ); + + let columns = vec![(ColumnName::new(["col"]), DataType::LONG)]; + let verifier = StatsVerifier::new(columns); + let result = verifier.verify(&[batch]); + assert!(matches!(result, Err(Error::StatsValidation(_)))); + let err = result.unwrap_err().to_string(); + assert!(err.contains("minValues")); + } + + /// Creates test data with two columns (col_a, col_b) in stats, both LONG. 
+ #[allow(clippy::too_many_arguments)] + fn create_two_column_batch( + paths: Vec<&str>, + num_records: Vec>, + col_a_nullcount: Vec>, + col_a_min: Vec>, + col_a_max: Vec>, + col_b_nullcount: Vec>, + col_b_min: Vec>, + col_b_max: Vec>, + ) -> Box { + let path_array = StringArray::from(paths.to_vec()); + let col_a_field = Arc::new(ArrowField::new("col_a", ArrowDataType::Int64, true)); + let col_b_field = Arc::new(ArrowField::new("col_b", ArrowDataType::Int64, true)); + let both_fields = Fields::from(vec![col_a_field, col_b_field]); + + let make_struct = |a: Vec>, b: Vec>| { + StructArray::new( + both_fields.clone(), + vec![ + Arc::new(Int64Array::from(a)) as ArrayRef, + Arc::new(Int64Array::from(b)) as ArrayRef, + ], + None, + ) + }; + + let num_records_array = Int64Array::from(num_records); + let null_count_struct = make_struct(col_a_nullcount, col_b_nullcount); + let min_values_struct = make_struct(col_a_min, col_b_min); + let max_values_struct = make_struct(col_a_max, col_b_max); + + let inner_type = ArrowDataType::Struct(Fields::from(vec![ + ArrowField::new("col_a", ArrowDataType::Int64, true), + ArrowField::new("col_b", ArrowDataType::Int64, true), + ])); + let stats_fields = Fields::from(vec![ + ArrowField::new("numRecords", ArrowDataType::Int64, true), + ArrowField::new("nullCount", inner_type.clone(), true), + ArrowField::new("minValues", inner_type.clone(), true), + ArrowField::new("maxValues", inner_type, true), + ]); + let stats_struct = StructArray::new( + stats_fields.clone(), + vec![ + Arc::new(num_records_array) as ArrayRef, + Arc::new(null_count_struct) as ArrayRef, + Arc::new(min_values_struct) as ArrayRef, + Arc::new(max_values_struct) as ArrayRef, + ], + None, + ); + + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("path", ArrowDataType::Utf8, false), + ArrowField::new("stats", ArrowDataType::Struct(stats_fields), true), + ])); + let batch = + RecordBatch::try_new(schema, vec![Arc::new(path_array), Arc::new(stats_struct)]) + .unwrap(); + Box::new(ArrowEngineData::new(batch)) + } + + #[test] + fn test_verify_multiple_columns() { + // Both columns have valid stats + let batch = create_two_column_batch( + vec!["file1.parquet"], + vec![Some(100)], + vec![Some(0)], + vec![Some(1)], + vec![Some(10)], + vec![Some(0)], + vec![Some(2)], + vec![Some(20)], + ); + let columns = vec![ + (ColumnName::new(["col_a"]), DataType::LONG), + (ColumnName::new(["col_b"]), DataType::LONG), + ]; + assert!(StatsVerifier::new(columns).verify(&[batch]).is_ok()); + + // col_a valid, col_b missing minValues + let batch = create_two_column_batch( + vec!["file1.parquet"], + vec![Some(100)], + vec![Some(0)], + vec![Some(1)], + vec![Some(10)], + vec![Some(0)], + vec![None], + vec![Some(20)], + ); + let columns = vec![ + (ColumnName::new(["col_a"]), DataType::LONG), + (ColumnName::new(["col_b"]), DataType::LONG), + ]; + let err_msg = StatsVerifier::new(columns) + .verify(&[batch]) + .unwrap_err() + .to_string(); + assert!(err_msg.contains("col_b")); + assert!(err_msg.contains("minValues")); + assert!(!err_msg.contains("col_a")); + } + + /// Verifies that stats collected from non-standard Arrow string representations + /// (LargeUtf8/LargeStringArray, Utf8View/StringViewArray) can be validated by + /// StatsVerifier, which expects Delta's logical STRING type. Engines may use any of + /// these representations, and the stats pipeline must handle them without type errors. 
+ #[rstest] + #[case::large_utf8(Arc::new(LargeStringArray::from(vec!["Austin", "Boston", "Chicago"])) as ArrayRef)] + #[case::utf8_view(Arc::new(StringViewArray::from(vec!["Austin", "Boston", "Chicago"])) as ArrayRef)] + fn test_verify_string_stats(#[case] values: ArrayRef) { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "city", + values.data_type().clone(), + false, + )])); + let batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + + let stats = collect_stats(&batch, &[column_name!("city")]).unwrap(); + + let path_array = StringArray::from(vec!["file1.parquet"]); + let add_file_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("path", ArrowDataType::Utf8, false), + ArrowField::new("stats", stats.data_type().clone(), true), + ])); + let add_file_batch = RecordBatch::try_new( + add_file_schema, + vec![ + Arc::new(path_array) as ArrayRef, + Arc::new(stats) as ArrayRef, + ], + ) + .unwrap(); + + let engine_data: Box = Box::new(ArrowEngineData::new(add_file_batch)); + + let verifier = StatsVerifier::new(vec![(ColumnName::new(["city"]), DataType::STRING)]); + verifier.verify(&[engine_data]).unwrap(); + } + + /// Round-trip test: collect_stats produces stats that pass verification for all null + /// patterns. The all-null and empty cases are regression tests -- collect_stats must keep + /// the field present (with null value) so the verifier's all_null check can run. + #[rstest] + #[case::non_null(vec![Some(1i64), Some(2), Some(3)])] + #[case::all_null(vec![None, None, None])] + #[case::empty(vec![])] + fn test_collected_stats_pass_verification(#[case] values: Vec>) { + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "col", + ArrowDataType::Int64, + true, + )])); + let batch = + RecordBatch::try_new(schema, vec![Arc::new(Int64Array::from(values)) as ArrayRef]) + .unwrap(); + + let stats = collect_stats(&batch, &[column_name!("col")]).unwrap(); + + let path_array = StringArray::from(vec!["file1.parquet"]); + let add_file_schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("path", ArrowDataType::Utf8, false), + ArrowField::new("stats", stats.data_type().clone(), true), + ])); + let add_file_batch = RecordBatch::try_new( + add_file_schema, + vec![ + Arc::new(path_array) as ArrayRef, + Arc::new(stats) as ArrayRef, + ], + ) + .unwrap(); + + let engine_data: Box = Box::new(ArrowEngineData::new(add_file_batch)); + + let verifier = StatsVerifier::new(vec![(ColumnName::new(["col"]), DataType::LONG)]); + verifier.verify(&[engine_data]).unwrap(); + } + + /// Verify collect_stats produces correct stats shape for all-null and empty batches. + /// These cases keep the column in minValues/maxValues with null values (so that + /// StatsVerifier can find the field via visit_rows and check nullCount == numRecords). 
+ #[rstest] + #[case::all_null_values(Arc::new(Int64Array::from(vec![None::, None, None])) as ArrayRef)] + #[case::empty_batch(Arc::new(Int64Array::from(Vec::>::new())) as ArrayRef)] + fn test_collected_stats_shape_for_all_null_and_empty(#[case] values: ArrayRef) { + let num_rows = values.len(); + let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "col", + values.data_type().clone(), + true, + )])); + let batch = RecordBatch::try_new(schema, vec![values]).unwrap(); + + let stats = collect_stats(&batch, &[column_name!("col")]).unwrap(); + + // numRecords should match row count + let num_records = stats + .column_by_name("numRecords") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(num_records.value(0), num_rows as i64); + + // All-null/empty columns are present in minValues/maxValues with null values + let min_values = stats + .column_by_name("minValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!(min_values.column_by_name("col").unwrap().is_null(0)); + + let max_values = stats + .column_by_name("maxValues") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!(max_values.column_by_name("col").unwrap().is_null(0)); + } +} diff --git a/kernel/src/transaction/update.rs b/kernel/src/transaction/update.rs new file mode 100644 index 0000000000..aead0ba5aa --- /dev/null +++ b/kernel/src/transaction/update.rs @@ -0,0 +1,608 @@ +//! Update table transaction methods. +//! +//! This module contains the constructor, public API, and deletion vector update logic for +//! update-table transactions. Each transaction type lives in its own file; +//! see [`mod.rs`](super) for shared commit logic. +//! +//! Includes: +//! - [`try_new_existing_table`](Transaction::try_new_existing_table) constructor +//! - Deletion vector updates +//! - Blind append, operation setting, domain metadata removal, and file removal + +use std::collections::HashMap; +use std::marker::PhantomData; +use std::sync::{Arc, LazyLock}; + +use tracing::instrument; + +use crate::actions::deletion_vector::DeletionVectorDescriptor; +use crate::actions::get_log_add_schema; +use crate::committer::Committer; +use crate::engine_data::{ + FilteredEngineData, FilteredRowVisitor, GetData, RowIndexIterator, TypedGetData, +}; +use crate::error::Error; +use crate::expressions::{column_name, ArrayData, ColumnName, Scalar, StructData, Transform}; +use crate::scan::data_skipping::stats_schema::schema_with_all_fields_nullable; +use crate::scan::log_replay::get_scan_metadata_transform_expr; +use crate::scan::{restored_add_schema, scan_row_schema}; +use crate::schema::{ArrayType, SchemaRef, StructField, StructType, ToSchema}; +use crate::snapshot::SnapshotRef; +use crate::table_features::{Operation, TableFeature}; +use crate::utils::current_time_ms; +use crate::{DataType, DeltaResult, Engine, Expression}; +use delta_kernel_derive::internal_api; + +use super::Transaction; + +// ============================================================================= +// Update table transactions only +// ============================================================================= +impl Transaction { + // ------------------------------------------------------------------------- + // Constructor + // ------------------------------------------------------------------------- + + /// Create a new transaction from a snapshot for an existing table. The snapshot will be used + /// to read the current state of the table (e.g. to read the current version). 
+ /// + /// Instead of using this API, the more typical (user-facing) API is + /// [Snapshot::transaction](crate::snapshot::Snapshot::transaction) to create a transaction from + /// a snapshot. + pub(crate) fn try_new_existing_table( + snapshot: impl Into, + committer: Box, + engine: &dyn Engine, + ) -> DeltaResult { + let read_snapshot = snapshot.into(); + + // important! before writing to the table we must check it is supported + read_snapshot + .table_configuration() + .ensure_operation_supported(Operation::Write)?; + + // Read clustering columns from snapshot (returns None if clustering not enabled) + let clustering_columns = read_snapshot.get_physical_clustering_columns(engine)?; + + let commit_timestamp = current_time_ms()?; + + let span = tracing::info_span!( + "txn", + path = %read_snapshot.table_root(), + read_version = read_snapshot.version(), + ); + + Ok(Transaction { + span, + read_snapshot, + committer, + operation: None, + engine_info: None, + add_files_metadata: vec![], + remove_files_metadata: vec![], + set_transactions: vec![], + commit_timestamp, + user_domain_metadata_additions: vec![], + system_domain_metadata_additions: vec![], + user_domain_removals: vec![], + data_change: true, + engine_commit_info: None, + is_blind_append: false, + dv_matched_files: vec![], + physical_clustering_columns: clustering_columns, + _state: PhantomData, + }) + } + + // ------------------------------------------------------------------------- + // Public API + // ------------------------------------------------------------------------- + + /// Mark this transaction as a blind append. + /// + /// Blind append transactions should only add new files and avoid write operations that + /// depend on existing table state. + pub fn with_blind_append(mut self) -> Self { + self.is_blind_append = true; + self + } + + /// Set the operation that this transaction is performing. This string will be persisted in the + /// commit and visible to anyone who describes the table history. + pub fn with_operation(mut self, operation: String) -> Self { + self.operation = Some(operation); + self + } + + /// Remove domain metadata from the Delta log. + /// If the domain exists in the Delta log, this creates a tombstone to logically delete + /// the domain. The tombstone preserves the previous configuration value. + /// If the domain does not exist in the Delta log, this is a no-op. + /// Note that each domain can only appear once per transaction. That is, multiple operations + /// on the same domain are disallowed in a single transaction, as well as setting and removing + /// the same domain in a single transaction. If a duplicate domain is included, the `commit` will + /// fail (that is, we don't eagerly check domain validity here). + /// Removing metadata for multiple distinct domains is allowed. + pub fn with_domain_metadata_removed(mut self, domain: String) -> Self { + self.user_domain_removals.push(domain); + self + } + + /// Remove files from the table in this transaction. This API generally enables the engine to + /// delete data (at file-level granularity) from the table. Note that this API can be called + /// multiple times to remove multiple batches. + /// + /// The expected schema for `remove_metadata` is given by [`scan_row_schema`]. It is expected + /// this will be the result of passing [`FilteredEngineData`] returned from a scan + /// with the selection vector modified to select rows for removal (selected rows in the selection vector are the ones to be removed). 
+ /// + /// # Example + /// + /// ```no_run + /// # use std::sync::Arc; + /// # use delta_kernel::Engine; + /// # use delta_kernel::snapshot::Snapshot; + /// # use delta_kernel::committer::FileSystemCommitter; + /// # fn example(engine: Arc, table_url: url::Url) -> delta_kernel::DeltaResult<()> { + /// // Create a snapshot and transaction + /// let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + /// let mut txn = snapshot.clone().transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?; + /// + /// // Get file metadata from a scan + /// let scan = snapshot.scan_builder().build()?; + /// let scan_metadata = scan.scan_metadata(engine.as_ref())?; + /// + /// // Remove specific files based on scan metadata + /// for metadata in scan_metadata { + /// let metadata = metadata?; + /// // In practice, you would modify the selection vector to choose which files to remove + /// let files_to_remove = metadata.scan_files; + /// txn.remove_files(files_to_remove); + /// } + /// + /// // Commit the transaction + /// txn.commit(engine.as_ref())?; + /// # Ok(()) + /// # } + /// ``` + pub fn remove_files(&mut self, remove_metadata: FilteredEngineData) { + self.remove_files_metadata.push(remove_metadata); + } + + // ------------------------------------------------------------------------- + // Deletion vector updates + // ------------------------------------------------------------------------- + + /// Helper function to convert scan metadata iterator to filtered engine data iterator. + /// + /// This adapter extracts the `scan_files` field from each [`crate::scan::ScanMetadata`] item, + /// making it easy to pass scan results directly to `update_deletion_vectors`. + /// + /// # Example + /// + /// ```ignore + /// let scan = snapshot.scan_builder().build()?; + /// let metadata = scan.scan_metadata(engine)?; + /// let mut dv_map = HashMap::new(); + /// // ... populate dv_map ... + /// let files_iter = Transaction::scan_metadata_to_engine_data(metadata); + /// txn.update_deletion_vectors(dv_map, files_iter)?; + /// ``` + pub fn scan_metadata_to_engine_data( + scan_metadata: impl Iterator>, + ) -> impl Iterator> { + scan_metadata.map(|result| result.map(|metadata| metadata.scan_files)) + } + + /// Update deletion vectors for files in the table. + /// + /// This method can be called multiple times to update deletion vectors for different sets of files. + /// + /// This method takes a map of file paths to new deletion vector descriptors and an iterator + /// of scan file data. It joins the two together internally and will generate appropriate + /// remove/add actions on commit to update the deletion vectors. + /// + /// # Arguments + /// + /// * `new_dv_descriptors` - A map from data file path (as provided in scan operations) to + /// the new deletion vector descriptor for that file. + /// * `existing_data_files` - An iterator over FilteredEngineData from scan metadata. The + /// selected elements of each FilteredEngineData must be a superset of the paths that key + /// `new_dv_descriptors`. + /// + /// # Errors + /// + /// Returns an error if: + /// - A file path in `new_dv_descriptors` is not found in `existing_data_files` + /// + /// # Examples + /// + /// ```rust,ignore + /// let mut txn = snapshot.clone().transaction(Box::new(FileSystemCommitter::new()))? + /// .with_operation("UPDATE".to_string()); + /// + /// let scan = snapshot.scan_builder().build()?; + /// let files: Vec = scan.scan_metadata(engine)? + /// .collect::, _>>()? 
+ /// .into_iter() + /// .map(|sm| sm.scan_files) + /// .collect(); + /// + /// // Create map of file paths to new deletion vector descriptors + /// let mut dv_map = HashMap::new(); + /// // ... populate dv_map with file paths and their new DV descriptors ... + /// + /// txn.update_deletion_vectors(dv_map, files.into_iter())?; + /// txn.commit(engine)?; + /// ``` + #[internal_api] + #[cfg_attr(not(feature = "internal-api"), allow(dead_code))] + #[instrument( + name = "txn.update_dvs", + skip_all, + fields(num_dv_updates = new_dv_descriptors.len()), + err + )] + pub(crate) fn update_deletion_vectors( + &mut self, + new_dv_descriptors: HashMap, + existing_data_files: impl Iterator>, + ) -> DeltaResult<()> { + if self.is_create_table() { + return Err(Error::generic( + "Deletion vector operations require an existing table", + )); + } + if !self + .read_snapshot + .table_configuration() + .is_feature_supported(&TableFeature::DeletionVectors) + { + return Err(Error::unsupported( + "Deletion vector operations require reader version 3, writer version 7, \ + and the 'deletionVectors' feature in both reader and writer features", + )); + } + + let mut matched_dv_files = 0; + let mut visitor = DvMatchVisitor::new(&new_dv_descriptors); + + // Process each batch of scan file metadata to prepare for DV updates: + // 1. Visit rows to match file paths against the DV descriptor map + // 2. Append new DV descriptors as a temporary column to matched files + // 3. Update selection vector to only keep files that need DV updates + // 4. Cache the result in dv_matched_files for generating remove/add actions during commit + for scan_file_result in existing_data_files { + let scan_file = scan_file_result?; + visitor.new_dv_entries.clear(); + visitor.matched_file_indexes.clear(); + visitor.visit_rows_of(&scan_file)?; + let (data, mut selection_vector) = scan_file.into_parts(); + + // Update selection vector to keep only files that matched DV descriptors. + // This ensures we only generate remove/add actions for files being updated. + let mut current_matched_index = 0; + for (i, selected) in selection_vector.iter_mut().enumerate() { + if current_matched_index < visitor.matched_file_indexes.len() { + if visitor.matched_file_indexes[current_matched_index] != i { + *selected = false; + } else { + current_matched_index += 1; + matched_dv_files += if *selected { 1 } else { 0 }; + } + } else { + // Deselect any files after the last matched file + *selected = false; + } + } + + let new_columns = vec![ArrayData::try_new( + struct_deletion_vector_schema().clone(), + visitor.new_dv_entries.clone(), + )?]; + self.dv_matched_files.push(FilteredEngineData::try_new( + data.append_columns(new_dv_column_schema().clone(), new_columns)?, + selection_vector, + )?); + } + + if matched_dv_files != new_dv_descriptors.len() { + return Err(Error::generic(format!( + "Number of matched DV files does not match number of new DV descriptors: {} != {}", + matched_dv_files, + new_dv_descriptors.len() + ))); + } + + Ok(()) + } +} + +// ============================================================================= +// Deletion vector schemas and commit helpers +// ============================================================================= + +/// Column name for temporary column used during deletion vector updates. +/// This column holds new DV descriptors appended to scan file metadata before transforming to final add actions. 
+static NEW_DELETION_VECTOR_NAME: &str = "newDeletionVector"; + +/// Schema for scan row data with an additional column for new deletion vector descriptors. +/// This is an intermediate schema used during deletion vector updates before transforming to final add actions. +static INTERMEDIATE_DV_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new_unchecked( + scan_row_schema() + .fields() + .cloned() + .chain([StructField::nullable( + NEW_DELETION_VECTOR_NAME.to_string(), + DeletionVectorDescriptor::to_schema(), + )]), + )) +}); + +/// Returns the intermediate schema with deletion vector column appended to scan row schema. +fn intermediate_dv_schema() -> &'static SchemaRef { + &INTERMEDIATE_DV_SCHEMA +} + +/// Schema for scan row data with nullable statistics fields. +/// Used when generating remove actions to ensure statistics can be null if missing. +// Safety: The panic here is acceptable because scan_row_schema() is a known valid schema. +// If transformation fails, it indicates a programmer error in schema construction that should be caught during development. +#[allow(clippy::panic)] +static NULLABLE_SCAN_ROWS_SCHEMA: LazyLock = LazyLock::new(|| { + schema_with_all_fields_nullable(scan_row_schema().as_ref()) + .unwrap_or_else(|_| panic!("Failed to transform scan_row_schema")) + .into() +}); + +/// Returns the nullable scan row schema. +fn nullable_scan_rows_schema() -> &'static SchemaRef { + &NULLABLE_SCAN_ROWS_SCHEMA +} + +/// Schema for restored add actions with nullable statistics fields. +/// Used when transforming scan data back to add actions with potentially missing statistics. +// Safety: The panic here is acceptable because restored_add_schema() is a known valid schema. +// If transformation fails, it indicates a programmer error in schema construction that should be caught during development. +#[allow(clippy::panic)] +static NULLABLE_RESTORED_ADD_SCHEMA: LazyLock = LazyLock::new(|| { + schema_with_all_fields_nullable(restored_add_schema()) + .unwrap_or_else(|_| panic!("Failed to transform restored_add_schema")) + .into() +}); + +/// Returns the nullable restored add action schema. +fn nullable_restored_add_schema() -> &'static SchemaRef { + &NULLABLE_RESTORED_ADD_SCHEMA +} + +/// Schema for add actions that is nullable for use in transforms as as a workaround to avoid issues with null values in required fields +/// that aren't selected. +// Safety: The panic here is acceptable because add_log_schema is a known valid schema. +// If transformation fails, it indicates a programmer error in schema construction that should be caught during development. +#[allow(clippy::panic)] +static NULLABLE_ADD_LOG_SCHEMA: LazyLock = LazyLock::new(|| { + schema_with_all_fields_nullable(get_log_add_schema()) + .unwrap_or_else(|_| panic!("Failed to transform nullable_restored_add_schema")) + .into() +}); + +/// Returns the schema for nullable restored add actions with dataChange field. +/// This schema extends the nullable restored add schema with a dataChange boolean field +/// that indicates whether the add action represents a logical data change. +fn nullable_add_log_schema() -> &'static SchemaRef { + &NULLABLE_ADD_LOG_SCHEMA +} + +/// Schema for an array of deletion vector descriptors. +/// Used when appending DV columns to scan file data. 
+#[cfg_attr(not(feature = "internal-api"), allow(dead_code))] +static STRUCT_DELETION_VECTOR_SCHEMA: LazyLock = + LazyLock::new(|| ArrayType::new(DeletionVectorDescriptor::to_schema().into(), true)); + +/// Returns the schema for an array of deletion vector descriptors. +#[cfg_attr(not(feature = "internal-api"), allow(dead_code))] +fn struct_deletion_vector_schema() -> &'static ArrayType { + &STRUCT_DELETION_VECTOR_SCHEMA +} + +/// Schema for the intermediate column holding new DV descriptors. +/// This temporary column is dropped during transformation to final add actions. +#[cfg_attr(not(feature = "internal-api"), allow(dead_code))] +static NEW_DV_COLUMN_SCHEMA: LazyLock = LazyLock::new(|| { + Arc::new(StructType::new_unchecked(vec![StructField::nullable( + NEW_DELETION_VECTOR_NAME, + DeletionVectorDescriptor::to_schema(), + )])) +}); + +/// Returns the schema for the intermediate column holding new DV descriptors. +#[cfg_attr(not(feature = "internal-api"), allow(dead_code))] +fn new_dv_column_schema() -> &'static SchemaRef { + &NEW_DV_COLUMN_SCHEMA +} + +// These methods are generic over the transaction state `S` because they are called from the +// shared `commit()` path in `mod.rs` (`impl Transaction`). DV updates can only be +// populated on `ExistingTableTransaction`, so the `is_create_table()` guard below is +// defence-in-depth against future misuse. +impl Transaction { + /// Generate remove/add action pairs for files with DV updates. + /// + /// This method processes the cached matched files, generating the necessary Remove and Add actions. + /// For each file: + /// 1. A Remove action is generated for the old file + /// 2. An Add action is generated with the new DV descriptor + pub(super) fn generate_dv_update_actions<'a>( + &'a self, + engine: &'a dyn Engine, + ) -> DeltaResult> + Send + 'a> { + // Create-table transactions should not have any DV update actions + if self.is_create_table() && !self.dv_matched_files.is_empty() { + return Err(crate::error::Error::internal_error( + "CREATE TABLE transaction cannot have DV update actions", + )); + } + + static COLUMNS_TO_DROP: &[&str] = &[NEW_DELETION_VECTOR_NAME]; + let remove_actions = + self.generate_remove_actions(engine, self.dv_matched_files.iter(), COLUMNS_TO_DROP)?; + let add_actions = self.generate_adds_for_dv_update(engine, self.dv_matched_files.iter())?; + Ok(remove_actions.chain(add_actions)) + } + + /// Generates Add actions for files with updated deletion vectors. + /// + /// This transforms scan file metadata with new DV descriptors (appended as a temporary column) + /// into Add actions for the Delta log. + fn generate_adds_for_dv_update<'a>( + &'a self, + engine: &'a dyn Engine, + file_metadata_batch: impl Iterator + Send + 'a, + ) -> DeltaResult> + Send + 'a> { + let evaluation_handler = engine.evaluation_handler(); + // Transform to replace the deletionVector field with the new DV from NEW_DELETION_VECTOR_NAME, + // then drop the NEW_DELETION_VECTOR_NAME column. The engine data has this temporary column + // appended by update_deletion_vectors(), but it is not expected by the transforms used in + // generate_remove_actions() which expect only the scan row schema fields. 
+ let with_new_dv_transform = Expression::transform( + Transform::new_top_level() + .with_replaced_field( + "deletionVector", + Expression::column([NEW_DELETION_VECTOR_NAME]).into(), + ) + .with_dropped_field(NEW_DELETION_VECTOR_NAME), + ); + let with_new_dv_eval = evaluation_handler.new_expression_evaluator( + intermediate_dv_schema().clone(), + Arc::new(with_new_dv_transform), + nullable_scan_rows_schema().clone().into(), + )?; + let restored_add_eval = evaluation_handler.new_expression_evaluator( + nullable_scan_rows_schema().clone(), + get_scan_metadata_transform_expr(), + nullable_restored_add_schema().clone().into(), + )?; + let with_data_change_transform = + Arc::new(Expression::struct_from([Expression::transform( + Transform::new_nested(["add"]).with_inserted_field( + Some("modificationTime"), + Expression::literal(self.data_change).into(), + ), + )])); + let with_data_change_eval = evaluation_handler.new_expression_evaluator( + nullable_restored_add_schema().clone(), + with_data_change_transform, + nullable_add_log_schema().clone().into(), + )?; + Ok(file_metadata_batch.map( + move |file_metadata_batch| -> DeltaResult { + let with_new_dv_data = with_new_dv_eval.evaluate(file_metadata_batch.data())?; + + let as_partial_add_data = restored_add_eval.evaluate(with_new_dv_data.as_ref())?; + + let with_data_change_data = + with_data_change_eval.evaluate(as_partial_add_data.as_ref())?; + + FilteredEngineData::try_new( + with_data_change_data, + file_metadata_batch.selection_vector().to_vec(), + ) + }, + )) + } +} + +// ============================================================================= +// DvMatchVisitor: matches file paths from scan data against new DV descriptors +// ============================================================================= + +/// Visitor that matches file paths from scan data against new deletion vector descriptors. +/// Used by update_deletion_vectors() to attach new DV descriptors to scan file metadata. +#[cfg_attr(not(feature = "internal-api"), allow(dead_code))] +struct DvMatchVisitor<'a> { + /// Map from file path to the new deletion vector descriptor for that file + dv_updates: &'a HashMap, + /// Accumulated DV descriptors (or nulls) for each visited row, in visit order + new_dv_entries: Vec, + /// Indexes of rows that matched a file path in dv_update. These must be in + /// ascending order + matched_file_indexes: Vec, +} + +impl<'a> DvMatchVisitor<'a> { + #[cfg_attr(not(feature = "internal-api"), allow(dead_code))] + const PATH_INDEX: usize = 0; + + /// Creates a new DvMatchVisitor that will match file paths against the provided DV updates map. + #[cfg_attr(not(feature = "internal-api"), allow(dead_code))] + fn new(dv_updates: &'a HashMap) -> Self { + Self { + dv_updates, + new_dv_entries: Vec::new(), + matched_file_indexes: Vec::new(), + } + } +} + +/// A `FilteredRowVisitor` that matches file paths against the provided DV updates map. +impl FilteredRowVisitor for DvMatchVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES_AND_TYPES: LazyLock<(Vec, Vec)> = LazyLock::new(|| { + let names = vec![column_name!("path")]; + let types = vec![DataType::STRING]; + (names, types) + }); + (&NAMES_AND_TYPES.0, &NAMES_AND_TYPES.1) + } + + /// For each selected row checks if the path is in the hash-map and if so, extracts DV + /// details that can be appended back to the EngineData. Also tracks matched row indexes + /// so the selection vector can be updated to contain only DV-matched files. 
+ fn visit_filtered<'a>( + &mut self, + getters: &[&'a dyn GetData<'a>], + rows: RowIndexIterator<'_>, + ) -> DeltaResult<()> { + static NULL_DV: LazyLock = + LazyLock::new(|| Scalar::Null(DataType::from(DeletionVectorDescriptor::to_schema()))); + static DV_SCHEMA_FIELDS: LazyLock> = LazyLock::new(|| { + DeletionVectorDescriptor::to_schema() + .into_fields() + .collect() + }); + let num_rows = rows.num_rows(); + self.new_dv_entries.reserve(num_rows); + for row_index in rows { + // Fill in nulls for any deselected rows before this one. + self.new_dv_entries + .resize_with(row_index, || NULL_DV.clone()); + let path_opt: Option = getters[Self::PATH_INDEX].get_opt(row_index, "path")?; + let Some(path) = path_opt else { + // Null path means a non-add action row (remove, metadata, etc.) + self.new_dv_entries.push(NULL_DV.clone()); + continue; + }; + if let Some(dv_result) = self.dv_updates.get(&path) { + self.new_dv_entries.push(Scalar::Struct(StructData::try_new( + DV_SCHEMA_FIELDS.clone(), + vec![ + Scalar::from(dv_result.storage_type.to_string()), + Scalar::from(dv_result.path_or_inline_dv.clone()), + Scalar::from(dv_result.offset), + Scalar::from(dv_result.size_in_bytes), + Scalar::from(dv_result.cardinality), + ], + )?)); + self.matched_file_indexes.push(row_index); + } else { + self.new_dv_entries.push(NULL_DV.clone()); + } + } + // Pad with trailing nulls for any deselected rows at the end. + self.new_dv_entries + .resize_with(num_rows, || NULL_DV.clone()); + Ok(()) + } +} diff --git a/kernel/src/expressions/transforms.rs b/kernel/src/transforms/expression.rs similarity index 56% rename from kernel/src/expressions/transforms.rs rename to kernel/src/transforms/expression.rs index 326706191d..072c2b7d57 100644 --- a/kernel/src/expressions/transforms.rs +++ b/kernel/src/transforms/expression.rs @@ -1,13 +1,12 @@ use std::borrow::{Cow, ToOwned}; -use std::collections::HashSet; use std::sync::Arc; use crate::expressions::{ BinaryExpression, BinaryPredicate, ColumnName, Expression, ExpressionRef, JunctionPredicate, - OpaqueExpression, OpaquePredicate, Predicate, Scalar, Transform, UnaryExpression, - UnaryPredicate, VariadicExpression, + MapToStructExpression, OpaqueExpression, OpaquePredicate, ParseJsonExpression, Predicate, + Scalar, Transform, UnaryExpression, UnaryPredicate, VariadicExpression, }; -use crate::utils::CowExt as _; +use crate::transforms::{map_owned_children_or_else, map_owned_or_else, map_owned_pair_or_else}; /// Generic framework for recursive bottom-up transforms of expressions and /// predicates. Transformations return `Option` with the following semantics: @@ -16,31 +15,48 @@ use crate::utils::CowExt as _; /// * `Some(Cow::Borrowed)` -- The input was not transformed. /// * `None` -- The input was filtered out and the parent should be updated to not reference it. /// -/// The transform can start from the generic [`Self::transform_expr`] or [`Self::transform_pred`'], -/// or directly from a specific expression/predicate variant (e.g. [`Self::transform_expr_column`] -/// for [`ColumnName`], [`Self::transform_pred_unary`] for [`UnaryPredicate`]). +/// The transform entry point is generally [`Self::transform_expr`] or [`Self::transform_pred`] (for +/// expressions or predicates, respectively), but callers can also directly invoke the transform +/// for a specific expression/predicate variant (e.g. [`Self::transform_expr_column`] for +/// [`ColumnName`] or [`Self::transform_pred_unary`] for [`UnaryPredicate`]). 
/// -/// The provided `transform_xxx` methods all default to no-op (returning their input as -/// `Some(Cow::Borrowed)`), and implementations should selectively override specific `transform_xxx` -/// methods as needed for the task at hand. +/// The provided `transform_xxx` methods all default to no-op (usually by invoking the corresponding +/// recursive helper method), and implementations should selectively override specific +/// `transform_xxx` methods as needed for the task at hand. +/// +/// # Recursive helper methods /// /// The provided `recurse_into_xxx` methods encapsulate the boilerplate work of recursing into the -/// children of each expression or predicate variant. Implementations can call these as needed but -/// will generally not need to override them. +/// child expression of each expression type. Except as specifically noted otherwise, these +/// recursive helpers all behave uniformly, based on the number of children the parent has: +/// +/// * Leaf (no children) - Leaf `transform_xxx` methods simply return their argument unchanged, and +/// no corresponding `recurse_into_xxx` method is provided. +/// +/// * Unary (single child) - If the child was filtered out, filter out the parent. If the child +/// changed, build a new parent around it. Otherwise, return the parent unchanged. +/// +/// * Binary (two children) - If either child was filtered out, filter out the parent. If at least +/// one child changed, build a new parent around them. Otherwise, return the parent unchanged. +/// +/// * Variadic (0+ children) - If no children remain (all filtered out), filter out the +/// parent. Otherwise, if at least one child changed or was filtered out, build a new parent around +/// the children. Otherwise, return the parent unchanged. +/// +/// Implementations can call these as needed but will generally not need to override them. pub trait ExpressionTransform<'a> { - /// Called for each literal encountered during the expression traversal. + /// Called for each literal encountered during the traversal (leaf). fn transform_expr_literal(&mut self, value: &'a Scalar) -> Option> { Some(Cow::Borrowed(value)) } - /// Called for each column reference encountered during the expression traversal. + /// Called for each column reference encountered during the traversal (leaf). fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option> { Some(Cow::Borrowed(name)) } - /// Called for the expression list of each [`Expression::Struct`] encountered during the - /// traversal. Implementations can call [`Self::recurse_into_expr_struct`] if they wish to - /// recursively transform the child expressions. + /// Called for the expression list of each struct expression encountered during the + /// traversal. The provided implementation just forwards to [`Self::recurse_into_expr_struct`]. fn transform_expr_struct( &mut self, fields: &'a [ExpressionRef], @@ -48,8 +64,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_expr_struct(fields) } - /// Called for each [`OpaqueExpression`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_expr_opaque`] if they wish to recursively transform the children. + /// Called for each opaque expression encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_expr_opaque`]. 
fn transform_expr_opaque( &mut self, expr: &'a OpaqueExpression, @@ -57,33 +73,50 @@ pub trait ExpressionTransform<'a> { self.recurse_into_expr_opaque(expr) } - /// Called for each [`Expression::Unknown`] encountered during the traversal. + /// Called for each unknown expression encountered during the traversal (leaf). fn transform_expr_unknown(&mut self, name: &'a String) -> Option> { Some(Cow::Borrowed(name)) } - /// Called for each [`Transform`] encountered during the traversal. By default, it is a no-op - /// that simply returns its argument and does _NOT_ recurse into its children. + /// Called for each transform expression encountered during the traversal (leaf). + /// + /// The provided implementation does _NOT_ recurse into its children. fn transform_expr_transform(&mut self, transform: &'a Transform) -> Option> { Some(Cow::Borrowed(transform)) } - /// Called for the child predicate of each [`Expression::Predicate`] encountered during the - /// traversal. Implementations can call [`Self::recurse_into_expr_pred`] if they wish to - /// recursively transform the child predicate. + /// Called for each parse-json expression encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_expr_parse_json`]. + fn transform_expr_parse_json( + &mut self, + expr: &'a ParseJsonExpression, + ) -> Option> { + self.recurse_into_expr_parse_json(expr) + } + + /// Called for each map-to-struct expression encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_expr_map_to_struct`]. + fn transform_expr_map_to_struct( + &mut self, + expr: &'a MapToStructExpression, + ) -> Option> { + self.recurse_into_expr_map_to_struct(expr) + } + + /// Called for the child of each predicate expression encountered during the + /// traversal. The provided implementation just forwards to [`Self::recurse_into_expr_pred`]. fn transform_expr_pred(&mut self, pred: &'a Predicate) -> Option> { self.recurse_into_expr_pred(pred) } - /// Called for the child predicate of each [`Predicate::Not`] encountered during the - /// traversal. Implementations can call [`Self::recurse_into_pred_not`] if they wish to - /// recursively transform the child expression. + /// Called for the child of each NOT predicate encountered during the + /// traversal. The provided implementation just forwards to [`Self::recurse_into_pred_not`]. fn transform_pred_not(&mut self, pred: &'a Predicate) -> Option> { self.recurse_into_pred_not(pred) } - /// Called for each [`UnaryExpression`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_expr_unary`] if they wish to recursively transform the child. + /// Called for each unary expression encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_expr_unary`]. fn transform_expr_unary( &mut self, expr: &'a UnaryExpression, @@ -91,8 +124,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_expr_unary(expr) } - /// Called for each [`UnaryPredicate`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_pred_unary`] if they wish to recursively transform the child. + /// Called for each unary predicate encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_pred_unary`]. 
fn transform_pred_unary( &mut self, pred: &'a UnaryPredicate, @@ -100,8 +133,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_pred_unary(pred) } - /// Called for each [`BinaryExpression`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_expr_binary`] if they wish to recursively transform the children. + /// Called for each binary expression encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_expr_binary`]. fn transform_expr_binary( &mut self, expr: &'a BinaryExpression, @@ -109,8 +142,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_expr_binary(expr) } - /// Called for each [`BinaryPredicate`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_pred_binary`] if they wish to recursively transform the children. + /// Called for each binary predicate encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_pred_binary`]. fn transform_pred_binary( &mut self, pred: &'a BinaryPredicate, @@ -118,8 +151,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_pred_binary(pred) } - /// Called for each [`VariadicExpression`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_expr_variadic`] if they wish to recursively transform the children. + /// Called for each variadic expression encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_expr_variadic`]. fn transform_expr_variadic( &mut self, expr: &'a VariadicExpression, @@ -127,8 +160,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_expr_variadic(expr) } - /// Called for each [`JunctionPredicate`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_pred_junction`] if they wish to recursively transform the children. + /// Called for each junction predicate encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_pred_junction`]. fn transform_pred_junction( &mut self, pred: &'a JunctionPredicate, @@ -136,8 +169,8 @@ pub trait ExpressionTransform<'a> { self.recurse_into_pred_junction(pred) } - /// Called for each [`OpaquePredicate`] encountered during the traversal. Implementations can - /// call [`Self::recurse_into_pred_opaque`] if they wish to recursively transform the children. + /// Called for each opaque predicate encountered during the traversal. The provided + /// implementation just forwards to [`Self::recurse_into_pred_opaque`]. fn transform_pred_opaque( &mut self, pred: &'a OpaquePredicate, @@ -145,239 +178,221 @@ pub trait ExpressionTransform<'a> { self.recurse_into_pred_opaque(pred) } - /// Called for each [`Predicate::Unknown`] encountered during the traversal. + /// Called for each unknown predicate encountered during the traversal (leaf). fn transform_pred_unknown(&mut self, name: &'a String) -> Option> { Some(Cow::Borrowed(name)) } /// General entry point for transforming an expression. This method will dispatch to the /// specific transform for each expression variant. Also invoked internally in order to recurse - /// on the child(ren) of non-leaf variants. + /// on the child(ren) of non-leaf expressions. fn transform_expr(&mut self, expr: &'a Expression) -> Option> { - let expr = match expr { - Expression::Literal(s) => self - .transform_expr_literal(s)? - .map_owned_or_else(expr, Expression::Literal), - Expression::Column(c) => self - .transform_expr_column(c)? 
- .map_owned_or_else(expr, Expression::Column), - Expression::Predicate(p) => self - .transform_expr_pred(p)? - .map_owned_or_else(expr, Expression::from), - Expression::Struct(s) => self - .transform_expr_struct(s)? - .map_owned_or_else(expr, Expression::Struct), - Expression::Transform(t) => self - .transform_expr_transform(t)? - .map_owned_or_else(expr, Expression::Transform), - Expression::Unary(u) => self - .transform_expr_unary(u)? - .map_owned_or_else(expr, Expression::Unary), - Expression::Binary(b) => self - .transform_expr_binary(b)? - .map_owned_or_else(expr, Expression::Binary), - Expression::Variadic(v) => self - .transform_expr_variadic(v)? - .map_owned_or_else(expr, Expression::Variadic), - Expression::Opaque(o) => self - .transform_expr_opaque(o)? - .map_owned_or_else(expr, Expression::Opaque), - Expression::Unknown(u) => self - .transform_expr_unknown(u)? - .map_owned_or_else(expr, Expression::Unknown), - }; - Some(expr) + match expr { + Expression::Literal(s) => { + let child = self.transform_expr_literal(s); + map_owned_or_else(expr, child, Expression::Literal) + } + Expression::Column(c) => { + let child = self.transform_expr_column(c); + map_owned_or_else(expr, child, Expression::Column) + } + Expression::Predicate(p) => { + let child = self.transform_expr_pred(p); + map_owned_or_else(expr, child, Expression::from) + } + Expression::Struct(s, nullability) => { + let map_owned = |exprs| Expression::Struct(exprs, nullability.clone()); + map_owned_or_else(expr, self.transform_expr_struct(s), map_owned) + } + Expression::Transform(t) => { + let child = self.transform_expr_transform(t); + map_owned_or_else(expr, child, Expression::Transform) + } + Expression::Unary(u) => { + let child = self.transform_expr_unary(u); + map_owned_or_else(expr, child, Expression::Unary) + } + Expression::Binary(b) => { + let child = self.transform_expr_binary(b); + map_owned_or_else(expr, child, Expression::Binary) + } + Expression::Variadic(v) => { + let child = self.transform_expr_variadic(v); + map_owned_or_else(expr, child, Expression::Variadic) + } + Expression::Opaque(o) => { + let child = self.transform_expr_opaque(o); + map_owned_or_else(expr, child, Expression::Opaque) + } + Expression::ParseJson(p) => { + let child = self.transform_expr_parse_json(p); + map_owned_or_else(expr, child, Expression::ParseJson) + } + Expression::MapToStruct(m) => { + let child = self.transform_expr_map_to_struct(m); + map_owned_or_else(expr, child, Expression::MapToStruct) + } + Expression::Unknown(u) => { + let child = self.transform_expr_unknown(u); + map_owned_or_else(expr, child, Expression::Unknown) + } + } } /// General entry point for transforming a predicate. This method will dispatch to the specific /// transform for each predicate variant. Also invoked internally in order to recurse on the /// child(ren) of non-leaf variants. fn transform_pred(&mut self, pred: &'a Predicate) -> Option> { - let pred = match pred { - Predicate::BooleanExpression(e) => self - .transform_expr(e)? - .map_owned_or_else(pred, Predicate::BooleanExpression), - Predicate::Not(p) => self.transform_pred_not(p)?.map_owned_or_else(pred, |p| p), - Predicate::Unary(u) => self - .transform_pred_unary(u)? - .map_owned_or_else(pred, Predicate::Unary), - Predicate::Binary(b) => self - .transform_pred_binary(b)? - .map_owned_or_else(pred, Predicate::Binary), - Predicate::Junction(j) => self - .transform_pred_junction(j)? - .map_owned_or_else(pred, Predicate::Junction), - Predicate::Opaque(o) => self - .transform_pred_opaque(o)? 
- .map_owned_or_else(pred, Predicate::Opaque), - Predicate::Unknown(u) => self - .transform_pred_unknown(u)? - .map_owned_or_else(pred, Predicate::Unknown), - }; - Some(pred) - } - - /// Recursively transforms a struct's child expressions. Returns `None` if all children were - /// removed, `Some(Cow::Owned)` if at least one child was changed or removed, and - /// `Some(Cow::Borrowed)` otherwise. + match pred { + Predicate::BooleanExpression(e) => { + let child = self.transform_expr(e); + map_owned_or_else(pred, child, Predicate::BooleanExpression) + } + Predicate::Not(p) => { + let child = self.transform_pred_not(p); + map_owned_or_else(pred, child, |p| p) + } + Predicate::Unary(u) => { + let child = self.transform_pred_unary(u); + map_owned_or_else(pred, child, Predicate::Unary) + } + Predicate::Binary(b) => { + let child = self.transform_pred_binary(b); + map_owned_or_else(pred, child, Predicate::Binary) + } + // Route through the constructor to normalize in case the transform removed children. + // When `transform_pred` returns `None` for a child, it is filtered out, which may + // reduce the junction to one or zero elements. The constructor normalizes these. + Predicate::Junction(j) => { + let child = self.transform_pred_junction(j); + map_owned_or_else(pred, child, |j| Predicate::junction(j.op, j.preds)) + } + Predicate::Opaque(o) => { + let child = self.transform_pred_opaque(o); + map_owned_or_else(pred, child, Predicate::Opaque) + } + Predicate::Unknown(u) => { + let child = self.transform_pred_unknown(u); + map_owned_or_else(pred, child, Predicate::Unknown) + } + } + } + + /// Recursively transforms a struct's child expressions (variadic). fn recurse_into_expr_struct( &mut self, fields: &'a [ExpressionRef], ) -> Option> { - recurse_into_children(fields, |f| { - self.transform_expr(f) - .map(|cow| cow.map_owned_or_else(f, Arc::new)) - }) + let children = fields.iter().map(|f| -> Option> { + map_owned_or_else(f, self.transform_expr(f), Arc::new) + }); + map_owned_children_or_else(fields, children, |fields| fields) } - /// Recursively transforms the children of an [`OpaqueExpression`]. Returns `None` if all - /// children were removed, `Some(Cow::Owned)` if at least one child was changed or removed, and - /// `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms the child expression of a parse-json expression (unary). + fn recurse_into_expr_parse_json( + &mut self, + expr: &'a ParseJsonExpression, + ) -> Option> { + let f = |json_expr| ParseJsonExpression::new(json_expr, expr.output_schema.clone()); + map_owned_or_else(expr, self.transform_expr(&expr.json_expr), f) + } + + /// Recursively transforms the child expression of a map-to-struct expression (unary). + fn recurse_into_expr_map_to_struct( + &mut self, + expr: &'a MapToStructExpression, + ) -> Option> { + let nested = self.transform_expr(&expr.map_expr); + map_owned_or_else(expr, nested, MapToStructExpression::new) + } + + /// Recursively transforms the children of an opaque expression (variadic). 
fn recurse_into_expr_opaque( &mut self, o: &'a OpaqueExpression, ) -> Option> { - let nested_result = recurse_into_children(&o.exprs, |e| self.transform_expr(e))?; - Some(nested_result.map_owned_or_else(o, |exprs| OpaqueExpression::new(o.op.clone(), exprs))) + let transformed_children = o.exprs.iter().map(|e| self.transform_expr(e)); + let map_owned = |exprs| OpaqueExpression::new(o.op.clone(), exprs); + map_owned_children_or_else(o, transformed_children, map_owned) } - /// Recursively transforms the child of an [`Expression::Predicate`]. Returns `None` if all - /// children were removed, `Some(Cow::Owned)` if at least one child was changed or removed, and - /// `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms the child of a predicate expression (unary). fn recurse_into_expr_pred(&mut self, pred: &'a Predicate) -> Option> { self.transform_pred(pred) } - /// Recursively transforms the child of a [`Predicate::Not`] expression. Returns `None` if the - /// child was removed, `Some(Cow::Owned)` if the child was changed, and `Some(Cow::Borrowed)` - /// otherwise. + /// Recursively transforms the child of a not predicate expression (unary). fn recurse_into_pred_not(&mut self, p: &'a Predicate) -> Option> { - Some(self.transform_pred(p)?.map_owned_or_else(p, Predicate::not)) + map_owned_or_else(p, self.transform_pred(p), Predicate::not) } - /// Recursively transforms a unary predicate's child. Returns `None` if the child was removed, - /// `Some(Cow::Owned)` if the child was changed, and `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms a unary predicate's child (unary). fn recurse_into_pred_unary( &mut self, u: &'a UnaryPredicate, ) -> Option> { - let nested_result = self.transform_expr(&u.expr)?; - Some(nested_result.map_owned_or_else(u, |expr| UnaryPredicate::new(u.op, expr))) + let nested = self.transform_expr(&u.expr); + map_owned_or_else(u, nested, |expr| UnaryPredicate::new(u.op, expr)) } - /// Recursively transforms a binary predicate's children. Returns `None` if at least one child - /// was removed, `Some(Cow::Owned)` if at least one child changed, and `Some(Cow::Borrowed)` - /// otherwise. + /// Recursively transforms a binary predicate's children (binary). fn recurse_into_pred_binary( &mut self, b: &'a BinaryPredicate, ) -> Option> { - let left = self.transform_expr(&b.left)?; - let right = self.transform_expr(&b.right)?; + let left = self.transform_expr(&b.left); + let right = self.transform_expr(&b.right); let f = |(left, right)| BinaryPredicate::new(b.op, left, right); - Some((left, right).map_owned_or_else(b, f)) + map_owned_pair_or_else(b, left, right, f) } - /// Recursively transforms a unary expression's child. Returns `None` if the child was removed, - /// `Some(Cow::Owned)` if the child was changed, and `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms a unary expression's child (unary). fn recurse_into_expr_unary( &mut self, u: &'a UnaryExpression, ) -> Option> { - let nested_result = self.transform_expr(&u.expr)?; - Some(nested_result.map_owned_or_else(u, |expr| UnaryExpression::new(u.op, expr))) + let nested = self.transform_expr(&u.expr); + map_owned_or_else(u, nested, |expr| UnaryExpression::new(u.op, expr)) } - /// Recursively transforms a binary expression's children. Returns `None` if at least one child - /// was removed, `Some(Cow::Owned)` if at least one child changed, and `Some(Cow::Borrowed)` - /// otherwise. + /// Recursively transforms a binary expression's children (binary). 
fn recurse_into_expr_binary( &mut self, b: &'a BinaryExpression, ) -> Option> { - let left = self.transform_expr(&b.left)?; - let right = self.transform_expr(&b.right)?; + let left = self.transform_expr(&b.left); + let right = self.transform_expr(&b.right); let f = |(left, right)| BinaryExpression::new(b.op, left, right); - Some((left, right).map_owned_or_else(b, f)) + map_owned_pair_or_else(b, left, right, f) } - /// Recursively transforms a variadic expression's children. Returns `None` if all children were - /// removed, `Some(Cow::Owned)` if at least one child was changed or removed, and - /// `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms a variadic expression's children (variadic). fn recurse_into_expr_variadic( &mut self, v: &'a VariadicExpression, ) -> Option> { - let nested_result = recurse_into_children(&v.exprs, |e| self.transform_expr(e))?; - Some(nested_result.map_owned_or_else(v, |exprs| VariadicExpression::new(v.op, exprs))) + let children = v.exprs.iter().map(|e| self.transform_expr(e)); + map_owned_children_or_else(v, children, |exprs| VariadicExpression::new(v.op, exprs)) } - /// Recursively transforms a junction predicate's children. Returns `None` if all children were - /// removed, `Some(Cow::Owned)` if at least one child was changed or removed, and - /// `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms a junction predicate's children (variadic). fn recurse_into_pred_junction( &mut self, j: &'a JunctionPredicate, ) -> Option> { - let nested_result = recurse_into_children(&j.preds, |p| self.transform_pred(p))?; - Some(nested_result.map_owned_or_else(j, |preds| JunctionPredicate::new(j.op, preds))) + let children = j.preds.iter().map(|p| self.transform_pred(p)); + map_owned_children_or_else(j, children, |preds| JunctionPredicate::new(j.op, preds)) } - /// Recursively transforms the children of an [`OpaquePredicate`]. Returns `None` if all - /// children were removed, `Some(Cow::Owned)` if at least one child was changed or removed, and - /// `Some(Cow::Borrowed)` otherwise. + /// Recursively transforms an opaque predicate's children (variadic). fn recurse_into_pred_opaque( &mut self, o: &'a OpaquePredicate, ) -> Option> { - let nested_result = recurse_into_children(&o.exprs, |e| self.transform_expr(e))?; - Some(nested_result.map_owned_or_else(o, |exprs| OpaquePredicate::new(o.op.clone(), exprs))) - } -} - -/// Used to recurse into the children of an `Expression::Struct` or `Predicate::Junction`. -fn recurse_into_children<'a, T: Clone>( - children: &'a [T], - recurse_fn: impl FnMut(&'a T) -> Option>, -) -> Option> { - let mut num_borrowed = 0; - let new_children: Vec<_> = children - .iter() - .filter_map(recurse_fn) - .inspect(|f| { - if matches!(f, Cow::Borrowed(_)) { - num_borrowed += 1; - } - }) - .collect(); - - if new_children.is_empty() { - None // all children filtered out - } else if num_borrowed < children.len() { - // At least one child was changed or removed, so make a new child list - let children = new_children.into_iter().map(Cow::into_owned).collect(); - Some(Cow::Owned(children)) - } else { - Some(Cow::Borrowed(children)) - } -} - -/// Retrieves the set of column names referenced by an expression. 
-#[derive(Default)] -pub(crate) struct GetColumnReferences<'a> { - references: HashSet<&'a ColumnName>, -} - -impl<'a> GetColumnReferences<'a> { - pub(crate) fn into_inner(self) -> HashSet<&'a ColumnName> { - self.references - } -} - -impl<'a> ExpressionTransform<'a> for GetColumnReferences<'a> { - fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option> { - self.references.insert(name); - Some(Cow::Borrowed(name)) + let children = o.exprs.iter().map(|e| self.transform_expr(e)); + let map_owned = |exprs| OpaquePredicate::new(o.op.clone(), exprs); + map_owned_children_or_else(o, children, map_owned) } } @@ -409,14 +424,14 @@ impl ExpressionDepthChecker { // Exposed for testing fn check_expr_with_call_count(expr: &Expression, depth_limit: usize) -> (usize, usize) { let mut checker = Self::new(depth_limit); - checker.transform_expr(expr); + let _ = checker.transform_expr(expr); (checker.max_depth_seen, checker.call_count) } // Exposed for testing fn check_pred_with_call_count(pred: &Predicate, depth_limit: usize) -> (usize, usize) { let mut checker = Self::new(depth_limit); - checker.transform_pred(pred); + let _ = checker.transform_pred(pred); (checker.max_depth_seen, checker.call_count) } @@ -511,6 +526,13 @@ impl<'a> ExpressionTransform<'a> for ExpressionDepthChecker { ) -> Option> { self.depth_limited(Self::recurse_into_expr_opaque, expr) } + + fn transform_expr_map_to_struct( + &mut self, + expr: &'a MapToStructExpression, + ) -> Option> { + self.depth_limited(Self::recurse_into_expr_map_to_struct, expr) + } } #[cfg(test)] @@ -518,14 +540,17 @@ mod tests { use super::*; use crate::expressions::VariadicExpressionOp::Coalesce; use crate::expressions::{ - column_expr, column_pred, Expression as Expr, OpaqueExpressionOp, OpaquePredicateOp, - Predicate as Pred, ScalarExpressionEvaluator, + column_expr, column_pred, Expression, Expression as Expr, OpaqueExpressionOp, + OpaquePredicateOp, ParseJsonExpression, Predicate as Pred, Scalar, + ScalarExpressionEvaluator, VariadicExpression, }; use crate::kernel_predicates::{ DirectDataSkippingPredicateEvaluator, DirectPredicateEvaluator, IndirectDataSkippingPredicateEvaluator, }; + use crate::schema::{DataType, StructField, StructType}; use crate::DeltaResult; + use std::sync::Arc; #[derive(Debug, PartialEq)] struct OpaqueTestOp(String); @@ -580,6 +605,17 @@ mod tests { struct NoopTransform; impl ExpressionTransform<'_> for NoopTransform {} + struct ColumnReplacer; + impl<'a> ExpressionTransform<'a> for ColumnReplacer { + fn transform_expr_column(&mut self, name: &'a ColumnName) -> Option> { + if name.len() == 1 && name[0] == "old_col" { + Some(Cow::Owned(ColumnName::new(["new_col"]))) + } else { + Some(Cow::Borrowed(name)) + } + } + } + #[test] fn test_transform_expr_variadic_noop() { // Test default no-op behavior - should return Cow::Borrowed @@ -613,20 +649,6 @@ mod tests { #[test] fn test_transform_expr_variadic_child_transformation() { // Test transformation of child expressions - should return Cow::Owned - struct ColumnReplacer; - impl<'a> ExpressionTransform<'a> for ColumnReplacer { - fn transform_expr_column( - &mut self, - name: &'a ColumnName, - ) -> Option> { - if name.len() == 1 && name[0] == "old_col" { - Some(Cow::Owned(ColumnName::new(["new_col"]))) - } else { - Some(Cow::Borrowed(name)) - } - } - } - let variadic_expr = VariadicExpression::new( Coalesce, vec![ @@ -637,8 +659,7 @@ mod tests { ], ); - let mut transform = ColumnReplacer; - let result = transform.transform_expr_variadic(&variadic_expr); + let result = 
ColumnReplacer.transform_expr_variadic(&variadic_expr); assert!(matches!(result, Some(Cow::Owned(_)))); if let Some(Cow::Owned(result_expr)) = result { @@ -802,6 +823,121 @@ mod tests { } } + fn test_output_schema() -> Arc { + Arc::new(StructType::new_unchecked(vec![ + StructField::new("a", DataType::LONG, true), + StructField::new("b", DataType::STRING, true), + ])) + } + + #[test] + fn test_transform_expr_parse_json_noop() { + // Test default no-op behavior - should return Cow::Borrowed + let parse_json_expr = + ParseJsonExpression::new(column_expr!("json_col"), test_output_schema()); + + let mut transform = NoopTransform; + let result = transform.transform_expr_parse_json(&parse_json_expr); + + assert!(matches!(result, Some(Cow::Borrowed(_)))); + if let Some(Cow::Borrowed(result_expr)) = result { + assert_eq!(result_expr, &parse_json_expr); + } + } + + #[test] + fn test_transform_expr_parse_json_child_transformation() { + // Test transformation of child expression - should return Cow::Owned + let parse_json_expr = + ParseJsonExpression::new(column_expr!("old_col"), test_output_schema()); + + let result = ColumnReplacer.transform_expr_parse_json(&parse_json_expr); + + assert!(matches!(result, Some(Cow::Owned(_)))); + if let Some(Cow::Owned(result_expr)) = result { + // Check that the column was replaced + if let Expr::Column(col) = result_expr.json_expr.as_ref() { + assert_eq!(col.len(), 1); + assert_eq!(col[0], "new_col"); + } else { + panic!("Expected column expression"); + } + // Schema should be preserved + assert_eq!(result_expr.output_schema, test_output_schema()); + } + } + + #[test] + fn test_transform_expr_parse_json_child_unchanged() { + // Test when child column doesn't match replacement criteria - should return Cow::Borrowed + let parse_json_expr = + ParseJsonExpression::new(column_expr!("unchanged_col"), test_output_schema()); + + let result = ColumnReplacer.transform_expr_parse_json(&parse_json_expr); + + // Since "unchanged_col" doesn't match "old_col", nothing changes + assert!(matches!(result, Some(Cow::Borrowed(_)))); + } + + #[test] + fn test_transform_expr_parse_json_child_removal() { + // Test removal of child expression - should return None + struct ColumnRemover; + impl<'a> ExpressionTransform<'a> for ColumnRemover { + fn transform_expr_column( + &mut self, + _name: &'a ColumnName, + ) -> Option> { + None // Remove all column references + } + } + + let parse_json_expr = + ParseJsonExpression::new(column_expr!("json_col"), test_output_schema()); + + let mut transform = ColumnRemover; + let result = transform.transform_expr_parse_json(&parse_json_expr); + + // Child was removed, so the whole ParseJson should be None + assert!(result.is_none()); + } + + #[test] + fn test_transform_expr_parse_json_nested_child() { + // Test with a more complex nested child expression + struct LiteralDoubler; + impl<'a> ExpressionTransform<'a> for LiteralDoubler { + fn transform_expr_literal(&mut self, value: &'a Scalar) -> Option> { + if let Scalar::Integer(n) = value { + Some(Cow::Owned(Scalar::Integer(n * 2))) + } else { + Some(Cow::Borrowed(value)) + } + } + } + + // ParseJson with a binary expression as child: column + 5 + let child_expr = column_expr!("x") + Expr::literal(5); + let parse_json_expr = ParseJsonExpression::new(child_expr, test_output_schema()); + + let mut transform = LiteralDoubler; + let result = transform.transform_expr_parse_json(&parse_json_expr); + + assert!(matches!(result, Some(Cow::Owned(_)))); + if let Some(Cow::Owned(result_expr)) = result { + // The 
literal 5 should have been doubled to 10 + if let Expr::Binary(binary) = result_expr.json_expr.as_ref() { + if let Expr::Literal(Scalar::Integer(n)) = &*binary.right { + assert_eq!(*n, 10); + } else { + panic!("Expected integer literal"); + } + } else { + panic!("Expected binary expression"); + } + } + } + #[test] fn test_depth_checker() { let pred = Pred::or_from([ @@ -951,4 +1087,42 @@ mod tests { assert_eq!(check_with_call_count(6), (6, 16)); assert_eq!(check_with_call_count(7), (6, 16)); } + + #[test] + fn transform_junction_to_single_child_unwraps() { + // A transform that removes one child from AND(a, b) should produce the surviving + // predicate directly, not a degenerate single-element junction AND(a). + struct LiteralRemover; + impl<'a> ExpressionTransform<'a> for LiteralRemover { + fn transform_expr_literal(&mut self, _value: &'a Scalar) -> Option> { + None + } + } + + let pred = Pred::and(column_pred!("x"), Pred::literal(true)); + let mut transform = LiteralRemover; + let result = transform.transform_pred(&pred); + let result = result.map(Cow::into_owned); + assert_eq!(result.as_ref(), Some(&column_pred!("x"))); + assert!(!matches!(result, Some(Pred::Junction(_)))); + } + + #[test] + fn transform_junction_removing_all_children_returns_none() { + // Removing all children propagates None (the junction is dropped entirely), + // rather than producing an empty junction or identity literal. + struct ColumnRemover; + impl<'a> ExpressionTransform<'a> for ColumnRemover { + fn transform_expr_column( + &mut self, + _name: &'a ColumnName, + ) -> Option> { + None + } + } + + let pred = Pred::and(column_pred!("x"), column_pred!("y")); + let mut transform = ColumnRemover; + assert!(transform.transform_pred(&pred).is_none()); + } } diff --git a/kernel/src/transforms/mod.rs b/kernel/src/transforms/mod.rs new file mode 100644 index 0000000000..4254d6fb9a --- /dev/null +++ b/kernel/src/transforms/mod.rs @@ -0,0 +1,84 @@ +use std::borrow::{Cow, ToOwned}; + +mod expression; +mod schema; +pub use self::expression::{ExpressionDepthChecker, ExpressionTransform}; +pub use self::schema::{SchemaDepthChecker, SchemaTransform}; + +/// Rebuilds a parent from transformed children only when needed. +/// +/// Child transforms may filter nodes by returning `None`. If all children are filtered out, this +/// returns `None`. If all original children survive as borrowed values, this returns a borrowed +/// parent. Otherwise, it rebuilds and returns an owned parent. +pub(crate) fn map_owned_children_or_else<'a, Parent, Child>( + parent: &'a Parent, + children: impl ExactSizeIterator>>, + map_owned: impl FnOnce(Vec) -> Parent::Owned, +) -> Option> +where + Parent: ToOwned + ?Sized, + Child: ToOwned + ?Sized + 'a, +{ + let num_children = children.len(); + let mut num_borrowed = 0; + let mut new_children = Vec::with_capacity(num_children); + for child in children.flatten() { + if let Cow::Borrowed(_) = child { + num_borrowed += 1; + } + new_children.push(child); + } + + if new_children.is_empty() { + None + } else if num_borrowed < num_children { + let owned = new_children.into_iter().map(Cow::into_owned).collect(); + Some(Cow::Owned(map_owned(owned))) + } else { + Some(Cow::Borrowed(parent)) + } +} + +/// Rebuilds a two-child parent from transformed children only when needed. +/// +/// If either child is filtered out (`None`), filter out the parent by returning `None`. If both children survive as +/// borrowed values, this returns a borrowed parent. 
Otherwise, it uses the provided `map_owned` function to rebuild and return an owned +/// parent. +pub(crate) fn map_owned_pair_or_else<'a, Parent, Child>( + parent: &'a Parent, + left: Option>, + right: Option>, + map_owned: impl FnOnce((Child::Owned, Child::Owned)) -> Parent, +) -> Option> +where + Parent: Clone, + Child: ToOwned + ?Sized + 'a, +{ + let (Some(left), Some(right)) = (left, right) else { + return None; + }; + Some(match (left, right) { + (Cow::Borrowed(_), Cow::Borrowed(_)) => Cow::Borrowed(parent), + (left, right) => Cow::Owned(map_owned((left.into_owned(), right.into_owned()))), + }) +} + +/// Rebuilds a single-child parent from a transformed child only when needed. +/// +/// If the child is filtered out (`None`), filter out the parent by returning `None`. If the child +/// survives as a borrowed value, this returns a borrowed parent. Otherwise, it uses the provided +/// `map_owned` function to rebuild and return an owned parent. +pub(crate) fn map_owned_or_else<'a, Parent, Child>( + parent: &'a Parent, + child: Option>, + map_owned: impl FnOnce(Child::Owned) -> Parent, +) -> Option> +where + Parent: Clone, + Child: ToOwned + ?Sized + 'a, +{ + Some(match child? { + Cow::Owned(v) => Cow::Owned(map_owned(v)), + Cow::Borrowed(_) => Cow::Borrowed(parent), + }) +} diff --git a/kernel/src/transforms/schema.rs b/kernel/src/transforms/schema.rs new file mode 100644 index 0000000000..30905b2f0f --- /dev/null +++ b/kernel/src/transforms/schema.rs @@ -0,0 +1,325 @@ +use std::borrow::Cow; + +use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; +use crate::transforms::{map_owned_children_or_else, map_owned_or_else, map_owned_pair_or_else}; + +/// Generic framework for describing recursive bottom-up schema transforms. Transformations return +/// `Option` with the following semantics: +/// * `Some(Cow::Owned)` -- The schema element was transformed and should propagate to its parent. +/// * `Some(Cow::Borrowed)` -- The schema element was not transformed. +/// * `None` -- The schema element was filtered out and the parent should no longer reference it. +/// +/// The transform can start from whatever schema element is available +/// (e.g. [`Self::transform_struct`] to start with [`StructType`]), or it can start from the generic +/// [`Self::transform`]. +/// +/// The provided `transform_xxx` methods all default to no-op (usually by invoking the corresponding +/// recursive helper method), and implementations should selectively override specific +/// `transform_xxx` methods as needed for the task at hand. +/// +/// # Recursive helper methods +/// +/// The provided `recurse_into_xxx` methods encapsulate the boilerplate work of recursing into the +/// child schema elements of each schema element. Except as specifically noted otherwise, these +/// recursive helpers all behave uniformly, based on the number of children the schema element has: +/// +/// * Leaf (no children) - Leaf `transform_xxx` methods simply return their argument unchanged, and +/// no corresponding `recurse_into_xxx` method is provided. +/// +/// * Unary (single child) - If the child was filtered out, filter out the parent. If the child +/// changed, build a new parent around it. Otherwise, return the parent unchanged. +/// +/// * Binary (two children) - If either child was filtered out, filter out the parent. If at least +/// one child changed, build a new parent around them. Otherwise, return the parent unchanged. 
+/// +/// * Variadic (0+ children) - If no children remain (all filtered out), filter out the +/// parent. Otherwise, if at least one child changed or was filtered out, build a new parent around +/// the children. Otherwise, return the parent unchanged. +/// +/// Implementations can call these as needed, but will generally not need to override them. +pub trait SchemaTransform<'a> { + /// Called for each primitive encountered during the traversal (leaf). + fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { + Some(Cow::Borrowed(ptype)) + } + + /// Called for each struct encountered during the traversal. The provided implementation just + /// forwards to [`Self::recurse_into_struct`]. + fn transform_struct(&mut self, stype: &'a StructType) -> Option> { + self.recurse_into_struct(stype) + } + + /// Called for each struct field encountered during the traversal. The provided implementation + /// forwards to [`Self::recurse_into_struct_field`]. + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + self.recurse_into_struct_field(field) + } + + /// Called for each array encountered during the traversal. The provided implementation just + /// forwards to [`Self::recurse_into_array`]. + fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { + self.recurse_into_array(atype) + } + + /// Called for each array element type encountered during the traversal. The provided + /// implementation forwards to [`Self::transform`]. + fn transform_array_element(&mut self, etype: &'a DataType) -> Option> { + self.transform(etype) + } + + /// Called for each map encountered during the traversal. The provided implementation just + /// forwards to [`Self::recurse_into_map`]. + fn transform_map(&mut self, mtype: &'a MapType) -> Option> { + self.recurse_into_map(mtype) + } + + /// Called for each map key encountered during the traversal. The provided implementation + /// forwards to [`Self::transform`]. + fn transform_map_key(&mut self, etype: &'a DataType) -> Option> { + self.transform(etype) + } + + /// Called for each map value encountered during the traversal. The provided implementation + /// forwards to [`Self::transform`]. + fn transform_map_value(&mut self, etype: &'a DataType) -> Option> { + self.transform(etype) + } + + /// Called for each variant value encountered. The provided implementation just + /// forwards to [`Self::recurse_into_struct`]. + fn transform_variant(&mut self, stype: &'a StructType) -> Option> { + self.recurse_into_struct(stype) + } + + /// General entry point for a recursive traversal over any data type. Also invoked internally to + /// dispatch on nested data types encountered during the traversal. + fn transform(&mut self, data_type: &'a DataType) -> Option> { + match data_type { + DataType::Primitive(ptype) => { + let child = self.transform_primitive(ptype); + map_owned_or_else(data_type, child, DataType::from) + } + DataType::Array(atype) => { + let child = self.transform_array(atype); + map_owned_or_else(data_type, child, DataType::from) + } + DataType::Struct(stype) => { + let child = self.transform_struct(stype); + map_owned_or_else(data_type, child, DataType::from) + } + DataType::Map(mtype) => { + let child = self.transform_map(mtype); + map_owned_or_else(data_type, child, DataType::from) + } + DataType::Variant(stype) => { + let child = self.transform_variant(stype); + map_owned_or_else(data_type, child, |s| DataType::Variant(Box::new(s))) + } + } + } + + /// Recursively transforms a struct field's data type (unary). 
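As a usage illustration of the trait defined here, a hedged sketch (assuming it lives inside the kernel crate next to these definitions, with the `Cow` return types reconstructed) of a `SchemaTransform` that drops every field named `tmp`; returning `None` filters a child, and the default recursive helpers rebuild a parent only when at least one child actually changed:

```rust
use std::borrow::Cow;

use crate::schema::StructField;
use crate::transforms::SchemaTransform;

/// Sketch: filter out any struct field named "tmp", at any nesting level.
struct DropTmpFields;

impl<'a> SchemaTransform<'a> for DropTmpFields {
    fn transform_struct_field(&mut self, field: &'a StructField) -> Option<Cow<'a, StructField>> {
        if field.name() == "tmp" {
            None // drop this field from its parent struct
        } else {
            // Keep descending so nested "tmp" fields are dropped too.
            self.recurse_into_struct_field(field)
        }
    }
}
```

Driving it via `let mut t = DropTmpFields; t.transform(&data_type)` would then yield `Some(Cow::Borrowed(..))` when nothing matched, `Some(Cow::Owned(..))` when a field was removed, and `None` if every field was dropped.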
+ fn recurse_into_struct_field( + &mut self, + field: &'a StructField, + ) -> Option> { + let child = self.transform(&field.data_type); + map_owned_or_else(field, child, |new_data_type| StructField { + name: field.name.clone(), + data_type: new_data_type, + nullable: field.nullable, + metadata: field.metadata.clone(), + }) + } + + /// Recursively transforms a struct's fields (variadic). + fn recurse_into_struct(&mut self, stype: &'a StructType) -> Option> { + let children = stype.fields().map(|f| self.transform_struct_field(f)); + map_owned_children_or_else(stype, children, StructType::new_unchecked) + } + + /// Recursively transforms an array's element type (unary). + fn recurse_into_array(&mut self, atype: &'a ArrayType) -> Option> { + let child = self.transform_array_element(&atype.element_type); + map_owned_or_else(atype, child, |element_type| ArrayType { + type_name: atype.type_name.clone(), + element_type, + contains_null: atype.contains_null, + }) + } + + /// Recursively transforms a map's key and value types (binary). + fn recurse_into_map(&mut self, mtype: &'a MapType) -> Option> { + let key_type = self.transform_map_key(&mtype.key_type); + let value_type = self.transform_map_value(&mtype.value_type); + let f = |(key_type, value_type)| MapType { + type_name: mtype.type_name.clone(), + key_type, + value_type, + value_contains_null: mtype.value_contains_null, + }; + map_owned_pair_or_else(mtype, key_type, value_type, f) + } +} + +/// A schema "transform" that doesn't actually change the schema at all. Instead, it measures the +/// maximum depth of a schema, with a depth limit to prevent stack overflow. Useful for verifying +/// that a schema has reasonable depth before attempting to work with it. +pub struct SchemaDepthChecker { + depth_limit: usize, + max_depth_seen: usize, + current_depth: usize, + call_count: usize, +} +impl SchemaDepthChecker { + /// Depth-checks the given data type against a given depth limit. The return value is the + /// largest depth seen, which is capped at one more than the depth limit (indicating the + /// recursion was terminated). + pub fn check(data_type: &DataType, depth_limit: usize) -> usize { + Self::check_with_call_count(data_type, depth_limit).0 + } + + // Exposed for testing + fn check_with_call_count(data_type: &DataType, depth_limit: usize) -> (usize, usize) { + let mut checker = Self { + depth_limit, + max_depth_seen: 0, + current_depth: 0, + call_count: 0, + }; + let _ = checker.transform(data_type); + (checker.max_depth_seen, checker.call_count) + } + + // Triggers the requested recursion only doing so would not exceed the depth limit. 
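Before the private recursion guard below, a quick hedged sketch of the public entry point `SchemaDepthChecker::check` (module paths and imports are illustrative; the builder calls are the same ones this file's tests use):

```rust
use crate::schema::{DataType, StructField};
use crate::transforms::SchemaDepthChecker;

fn example() {
    // A struct nested inside a struct: depth 3 counting field and primitive levels.
    let nested = DataType::try_struct_type([StructField::nullable(
        "outer",
        DataType::try_struct_type([StructField::nullable("inner", DataType::LONG)]).unwrap(),
    )])
    .unwrap();

    // With a generous limit the reported depth stays at or below the limit.
    assert!(SchemaDepthChecker::check(&nested, 10) <= 10);
    // With limit 0 the first field already exceeds it, so the result is capped at limit + 1.
    assert_eq!(SchemaDepthChecker::check(&nested, 0), 1);
}
```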
+ fn depth_limited<'a, T: Clone + std::fmt::Debug>( + &mut self, + recurse: impl FnOnce(&mut Self, &'a T) -> Option>, + arg: &'a T, + ) -> Option> { + self.call_count += 1; + if self.max_depth_seen < self.current_depth { + self.max_depth_seen = self.current_depth; + if self.depth_limit < self.current_depth { + tracing::warn!("Max schema depth {} exceeded by {arg:?}", self.depth_limit); + } + } + if self.max_depth_seen <= self.depth_limit { + self.current_depth += 1; + let _ = recurse(self, arg); + self.current_depth -= 1; + } + None + } +} +impl<'a> SchemaTransform<'a> for SchemaDepthChecker { + fn transform_struct(&mut self, stype: &'a StructType) -> Option> { + self.depth_limited(Self::recurse_into_struct, stype) + } + fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { + self.depth_limited(Self::recurse_into_struct_field, field) + } + fn transform_array(&mut self, atype: &'a ArrayType) -> Option> { + self.depth_limited(Self::recurse_into_array, atype) + } + fn transform_map(&mut self, mtype: &'a MapType) -> Option> { + self.depth_limited(Self::recurse_into_map, mtype) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::{DataType, StructField}; + + #[test] + fn test_depth_checker() { + let schema = DataType::try_struct_type([ + StructField::nullable( + "a", + ArrayType::new( + DataType::try_struct_type([ + StructField::nullable("w", DataType::LONG), + StructField::nullable("x", ArrayType::new(DataType::LONG, true)), + StructField::nullable( + "y", + MapType::new(DataType::LONG, DataType::STRING, true), + ), + StructField::nullable( + "z", + DataType::try_struct_type([ + StructField::nullable("n", DataType::LONG), + StructField::nullable("m", DataType::STRING), + ]) + .unwrap(), + ), + ]) + .unwrap(), + true, + ), + ), + StructField::nullable( + "b", + DataType::try_struct_type([ + StructField::nullable("o", ArrayType::new(DataType::LONG, true)), + StructField::nullable( + "p", + MapType::new(DataType::LONG, DataType::STRING, true), + ), + StructField::nullable( + "q", + DataType::try_struct_type([ + StructField::nullable( + "s", + DataType::try_struct_type([ + StructField::nullable("u", DataType::LONG), + StructField::nullable("v", DataType::LONG), + ]) + .unwrap(), + ), + StructField::nullable("t", DataType::LONG), + ]) + .unwrap(), + ), + StructField::nullable("r", DataType::LONG), + ]) + .unwrap(), + ), + StructField::nullable( + "c", + MapType::new( + DataType::LONG, + DataType::try_struct_type([ + StructField::nullable("f", DataType::LONG), + StructField::nullable("g", DataType::STRING), + ]) + .unwrap(), + true, + ), + ), + ]) + .unwrap(); + + // Similar to SchemaDepthChecker::check, but also returns call count + let check_with_call_count = + |depth_limit| SchemaDepthChecker::check_with_call_count(&schema, depth_limit); + + // Hit depth limit at "a" but still have to look at "b" "c" "d" + assert_eq!(check_with_call_count(1), (2, 5)); + assert_eq!(check_with_call_count(2), (3, 6)); + + // Hit depth limit at "w" but still have to look at "x" "y" "z" + assert_eq!(check_with_call_count(3), (4, 10)); + assert_eq!(check_with_call_count(4), (5, 11)); + + // Depth limit hit at "n" but still have to look at "m" + assert_eq!(check_with_call_count(5), (6, 15)); + + // Depth limit not hit until "u" + assert_eq!(check_with_call_count(6), (7, 28)); + + // Depth limit not hit (full traversal required) + assert_eq!(check_with_call_count(7), (7, 32)); + assert_eq!(check_with_call_count(8), (7, 32)); + } +} diff --git a/kernel/src/utils.rs 
b/kernel/src/utils.rs index e77a3c8935..1c274b3165 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -99,7 +99,7 @@ fn resolve_uri_type(table_uri: impl AsRef) -> DeltaResult { pub(crate) fn current_time_duration() -> DeltaResult { SystemTime::now() .duration_since(UNIX_EPOCH) - .map_err(|e| Error::generic(format!("System time before Unix epoch: {}", e))) + .map_err(|e| Error::generic(format!("System time before Unix epoch: {e}"))) } /// Returns the current time in milliseconds since Unix epoch. @@ -109,60 +109,116 @@ pub(crate) fn current_time_ms() -> DeltaResult { .map_err(|_| Error::generic("Current timestamp exceeds i64 millisecond range")) } -// Extension trait for Cow<'_, T> -pub(crate) trait CowExt { - /// The owned type that corresopnds to Self - type Owned; - - /// Propagate the results of nested transforms. If the nested transform made no change (borrowed - /// `self`), then return a borrowed result `s` as well. Otherwise, invoke the provided mapping - /// function `f` to convert the owned nested result into an owned result. - fn map_owned_or_else(self, s: &S, f: impl FnOnce(Self::Owned) -> S) -> Cow<'_, S>; +/// Extension trait for adding completion callbacks to iterators. +pub(crate) trait IteratorExt: Iterator + Sized { + /// Wraps this iterator to call a closure when fully exhausted. + /// + /// The closure is called only when `next()` returns `None`. If the iterator + /// is dropped before exhaustion, a warning is logged but the closure is not called. + fn on_complete(self, f: F) -> OnComplete { + OnComplete { + inner: self, + on_complete: Some(f), + } + } } -// Basic implementation for a single Cow value -impl CowExt for Cow<'_, T> { - type Owned = T::Owned; +impl IteratorExt for I {} - fn map_owned_or_else(self, s: &S, f: impl FnOnce(T::Owned) -> S) -> Cow<'_, S> { - match self { - Cow::Owned(v) => Cow::Owned(f(v)), - Cow::Borrowed(_) => Cow::Borrowed(s), +/// Iterator adaptor that executes a closure when fully exhausted. 
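For orientation, a small hedged sketch of the `on_complete` adaptor described above (it assumes the `IteratorExt` trait from this module is in scope; the wrapper function is purely illustrative):

```rust
use std::cell::Cell;

fn example() {
    let finished = Cell::new(false);
    // `sum` drains the iterator, so `next()` eventually returns `None` and the
    // completion callback fires exactly once.
    let total: i32 = [1, 2, 3]
        .into_iter()
        .on_complete(|| finished.set(true))
        .sum();
    assert_eq!(total, 6);
    assert!(finished.get());
}
```

Dropping the wrapped iterator before exhaustion would instead skip the callback and only log, per the `Drop` impl below.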
+pub(crate) struct OnComplete { + inner: I, + on_complete: Option, +} + +impl Drop for OnComplete { + fn drop(&mut self) { + if self.on_complete.is_some() { + tracing::debug!( + "OnComplete iterator dropped before exhaustion; completion callback not called" + ); } } } -// Additional implementation for a pair of Cow values -impl<'a, T: ToOwned + ?Sized> CowExt<(Cow<'a, T>, Cow<'a, T>)> for (Cow<'a, T>, Cow<'a, T>) { - type Owned = (T::Owned, T::Owned); +impl Iterator for OnComplete +where + I: Iterator, + F: FnOnce(), +{ + type Item = I::Item; + + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } - fn map_owned_or_else(self, s: &S, f: impl FnOnce(Self::Owned) -> S) -> Cow<'_, S> { - match self { - (Cow::Borrowed(_), Cow::Borrowed(_)) => Cow::Borrowed(s), - (left, right) => Cow::Owned(f((left.into_owned(), right.into_owned()))), + fn next(&mut self) -> Option { + match self.inner.next() { + Some(item) => Some(item), + None => { + if let Some(f) = self.on_complete.take() { + f(); + } + None + } } } } #[cfg(test)] pub(crate) mod test_utils { + use std::path::PathBuf; + use std::sync::Mutex; + use std::{path::Path, sync::Arc}; + + use itertools::Itertools; + use serde::Serialize; + use tempfile::TempDir; + use test_utils::{delta_path_for_version, load_test_data}; + use url::Url; + use crate::actions::{ get_all_actions_schema, Add, Cdc, CommitInfo, Metadata, Protocol, Remove, }; use crate::arrow::array::{RecordBatch, StringArray}; use crate::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; + use crate::committer::FileSystemCommitter; use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::default::DefaultEngineBuilder; use crate::engine::sync::SyncEngine; - use crate::Engine; - use crate::EngineData; + use crate::metrics::{MetricEvent, MetricsReporter}; + use crate::object_store::local::LocalFileSystem; + use crate::object_store::memory::InMemory; + use crate::object_store::ObjectStore; + use crate::table_features::ColumnMappingMode; + use crate::transaction::create_table::create_table; + use crate::transaction::{CreateTable, Transaction}; + use crate::{DeltaResult, EngineData, Error, SnapshotRef}; + use crate::{Engine, Snapshot}; + + /// A metrics reporter that captures all events for test assertions. + #[derive(Debug, Default)] + pub(crate) struct CapturingReporter { + events: Mutex>, + } - use itertools::Itertools; - use object_store::local::LocalFileSystem; - use object_store::ObjectStore; - use serde::Serialize; - use std::{path::Path, sync::Arc}; - use tempfile::TempDir; - use test_utils::delta_path_for_version; + impl MetricsReporter for CapturingReporter { + fn report(&self, event: MetricEvent) { + self.events.lock().unwrap().push(event); + } + } + + impl CapturingReporter { + /// Returns a copy of all captured events. + pub(crate) fn events(&self) -> Vec { + self.events.lock().unwrap().clone() + } + + /// Clears all captured events. + pub(crate) fn clear(&self) { + self.events.lock().unwrap().clear(); + } + } #[derive(Serialize)] pub(crate) enum Action { @@ -181,6 +237,11 @@ pub(crate) mod test_utils { CommitInfo(CommitInfo), } + use crate::schema::{ + ArrayType, ColumnMetadataKey, DataType as KernelDataType, MapType, MetadataValue, + PrimitiveType, SchemaRef, StructField, StructType, + }; + /// A mock table that writes commits to a local temporary delta log. This can be used to /// construct a delta log used for testing. 
pub(crate) struct LocalMockTable { @@ -268,12 +329,13 @@ pub(crate) mod test_utils { } // TODO: allow tests to pass in context (issue#1133) + #[track_caller] pub(crate) fn assert_result_error_with_message( res: Result, message: &str, ) { match res { - Ok(_) => panic!("Expected error, but got Ok result"), + Ok(_) => panic!("Expected error with message {message}, but got Ok result"), Err(error) => { let error_str = error.to_string(); assert!( @@ -283,6 +345,503 @@ pub(crate) mod test_utils { } } } + + /// Asserts the 2x2 matrix of (schema_has_feature, protocol_supports_feature) outcomes + /// for schema-level feature validators. The expected pattern is: + /// - schema + protocol => Ok + /// - no schema + no protocol => Ok + /// - no schema + protocol => Ok + /// - schema + no protocol => Err (orphaned schema presence) + /// + /// Additional error schemas (e.g. nested) are also tested against `protocol_without`. + #[track_caller] + pub(crate) fn assert_schema_feature_validation( + schema_with: &StructType, + schema_without: &StructType, + protocol_with: &Protocol, + protocol_without: &Protocol, + extra_err_schemas: &[&StructType], + err_msg: &str, + ) { + make_test_tc(schema_with.clone(), protocol_with.clone(), []) + .expect("feature present + supported"); + make_test_tc(schema_without.clone(), protocol_without.clone(), []) + .expect("feature absent + unsupported"); + make_test_tc(schema_without.clone(), protocol_with.clone(), []) + .expect("feature absent + supported"); + assert_result_error_with_message( + make_test_tc(schema_with.clone(), protocol_without.clone(), []), + err_msg, + ); + for schema in extra_err_schemas { + assert_result_error_with_message( + make_test_tc((*schema).clone(), protocol_without.clone(), []), + err_msg, + ); + } + } + + /// Creates a [`TableConfiguration`] from a schema, protocol, and table properties. + /// Useful for testing validators that need a TC. + pub(crate) fn make_test_tc( + schema: StructType, + protocol: Protocol, + props: impl IntoIterator, + ) -> crate::DeltaResult { + let schema = std::sync::Arc::new(schema); + let metadata = + Metadata::try_new(None, None, schema, vec![], 0, props.into_iter().collect()).unwrap(); + let table_root = Url::try_from("file:///").unwrap(); + crate::table_configuration::TableConfiguration::try_new(metadata, protocol, table_root, 0) + } + + /// Helper to get a field from a StructType by name, panicking if not found. + pub(crate) fn get_schema_field(struct_type: &StructType, name: &str) -> StructField { + struct_type + .fields() + .find(|f| f.name() == name) + .unwrap_or_else(|| panic!("Field '{name}' not found")) + .clone() + } + + /// Validates that a schema has the expected checkpoint structure with top-level action fields + /// and proper nested types for add, metaData, and protocol actions. 
+ pub(crate) fn validate_checkpoint_schema(schema: &SchemaRef) { + // Verify top-level action fields exist and are structs + let top_level_fields = ["txn", "add", "remove", "metaData", "protocol"]; + for field_name in top_level_fields { + let field = get_schema_field(schema, field_name); + assert!( + matches!(field.data_type(), KernelDataType::Struct(_)), + "Field '{field_name}' should be a struct type" + ); + } + + // Verify 'add' struct has expected fields with correct types + let add_field = get_schema_field(schema, "add"); + let add_struct = match add_field.data_type() { + KernelDataType::Struct(s) => s, + _ => panic!("'add' should be a struct"), + }; + assert_eq!( + get_schema_field(add_struct, "path").data_type(), + &KernelDataType::Primitive(PrimitiveType::String) + ); + assert_eq!( + get_schema_field(add_struct, "size").data_type(), + &KernelDataType::Primitive(PrimitiveType::Long) + ); + assert!( + matches!( + get_schema_field(add_struct, "partitionValues").data_type(), + KernelDataType::Map(_) + ), + "'partitionValues' should be a map type" + ); + + // Verify 'metaData' struct has nested 'format' struct + let metadata_field = get_schema_field(schema, "metaData"); + let metadata_struct = match metadata_field.data_type() { + KernelDataType::Struct(s) => s, + _ => panic!("'metaData' should be a struct"), + }; + let format_field = get_schema_field(metadata_struct, "format"); + let format_struct = match format_field.data_type() { + KernelDataType::Struct(s) => s, + _ => panic!("'format' should be a struct"), + }; + assert_eq!( + get_schema_field(format_struct, "provider").data_type(), + &KernelDataType::Primitive(PrimitiveType::String) + ); + + // Verify 'protocol' struct has version fields + let protocol_field = get_schema_field(schema, "protocol"); + let protocol_struct = match protocol_field.data_type() { + KernelDataType::Struct(s) => s, + _ => panic!("'protocol' should be a struct"), + }; + assert_eq!( + get_schema_field(protocol_struct, "minReaderVersion").data_type(), + &KernelDataType::Primitive(PrimitiveType::Integer) + ); + assert_eq!( + get_schema_field(protocol_struct, "minWriterVersion").data_type(), + &KernelDataType::Primitive(PrimitiveType::Integer) + ); + } + + // ==================== Test schema helpers ==================== + // + // Reusable test schemas + // Each variant exists with and without column mapping metadata. + + /// Helper to add column mapping metadata to a [`StructField`]. + fn with_column_mapping(field: StructField, id: i64, physical_name: &str) -> StructField { + field.with_metadata([ + ( + ColumnMetadataKey::ColumnMappingId.as_ref(), + MetadataValue::Number(id), + ), + ( + ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(), + MetadataValue::String(physical_name.into()), + ), + ]) + } + + /// Flat schema: `[id: long, name: string]` + pub(crate) fn test_schema_flat() -> SchemaRef { + Arc::new(StructType::new_unchecked([ + StructField::new("id", KernelDataType::LONG, false), + StructField::nullable("name", KernelDataType::STRING), + ])) + } + + /// Flat schema with column mapping metadata. 
+ pub(crate) fn test_schema_flat_with_column_mapping() -> SchemaRef { + Arc::new(StructType::new_unchecked([ + with_column_mapping( + StructField::new("id", KernelDataType::LONG, false), + 1, + "phys_id", + ), + with_column_mapping( + StructField::nullable("name", KernelDataType::STRING), + 2, + "phys_name", + ), + ])) + } + + /// Nested struct schema with array and map inside the struct + pub(crate) fn test_schema_nested() -> SchemaRef { + Arc::new(StructType::new_unchecked([ + StructField::new("id", KernelDataType::LONG, false), + StructField::nullable( + "info", + StructType::new_unchecked([ + StructField::nullable("name", KernelDataType::STRING), + StructField::nullable("age", KernelDataType::INTEGER), + StructField::nullable( + "tags", + MapType::new(KernelDataType::STRING, KernelDataType::STRING, true), + ), + StructField::nullable("scores", ArrayType::new(KernelDataType::INTEGER, true)), + ]), + ), + ])) + } + + /// Nested struct schema with column mapping metadata. + pub(crate) fn test_schema_nested_with_column_mapping() -> SchemaRef { + Arc::new(StructType::new_unchecked([ + with_column_mapping( + StructField::new("id", KernelDataType::LONG, false), + 1, + "phys_id", + ), + with_column_mapping( + StructField::nullable( + "info", + StructType::new_unchecked([ + with_column_mapping( + StructField::nullable("name", KernelDataType::STRING), + 3, + "phys_name", + ), + with_column_mapping( + StructField::nullable("age", KernelDataType::INTEGER), + 4, + "phys_age", + ), + with_column_mapping( + StructField::nullable( + "tags", + MapType::new(KernelDataType::STRING, KernelDataType::STRING, true), + ), + 5, + "phys_tags", + ), + with_column_mapping( + StructField::nullable( + "scores", + ArrayType::new(KernelDataType::INTEGER, true), + ), + 6, + "phys_scores", + ), + ]), + ), + 2, + "phys_info", + ), + ])) + } + + /// Schema with a map + pub(crate) fn test_schema_with_map() -> SchemaRef { + let value_struct = StructType::new_unchecked([ + StructField::nullable("key", KernelDataType::STRING), + StructField::nullable("value", KernelDataType::INTEGER), + ]); + Arc::new(StructType::new_unchecked([ + StructField::new("id", KernelDataType::LONG, false), + StructField::nullable( + "entries", + MapType::new( + KernelDataType::STRING, + KernelDataType::Struct(Box::new(value_struct)), + true, + ), + ), + StructField::nullable("name", KernelDataType::STRING), + ])) + } + + /// Schema with a map and column mapping metadata. 
+ pub(crate) fn test_schema_with_map_and_column_mapping() -> SchemaRef { + let value_struct = StructType::new_unchecked([ + with_column_mapping( + StructField::nullable("key", KernelDataType::STRING), + 4, + "phys_key", + ), + with_column_mapping( + StructField::nullable("value", KernelDataType::INTEGER), + 5, + "phys_value", + ), + ]); + Arc::new(StructType::new_unchecked([ + with_column_mapping( + StructField::new("id", KernelDataType::LONG, false), + 1, + "phys_id", + ), + with_column_mapping( + StructField::nullable( + "entries", + MapType::new( + KernelDataType::STRING, + KernelDataType::Struct(Box::new(value_struct)), + true, + ), + ), + 2, + "phys_entries", + ), + with_column_mapping( + StructField::nullable("name", KernelDataType::STRING), + 3, + "phys_name", + ), + ])) + } + + /// Schema with an array + pub(crate) fn test_schema_with_array() -> SchemaRef { + let item_struct = StructType::new_unchecked([ + StructField::nullable("label", KernelDataType::STRING), + StructField::nullable("count", KernelDataType::INTEGER), + ]); + Arc::new(StructType::new_unchecked([ + StructField::new("id", KernelDataType::LONG, false), + StructField::nullable( + "items", + ArrayType::new(KernelDataType::Struct(Box::new(item_struct)), true), + ), + StructField::nullable("name", KernelDataType::STRING), + ])) + } + + /// Schema with an array and column mapping metadata. + pub(crate) fn test_schema_with_array_and_column_mapping() -> SchemaRef { + let item_struct = StructType::new_unchecked([ + with_column_mapping( + StructField::nullable("label", KernelDataType::STRING), + 4, + "phys_label", + ), + with_column_mapping( + StructField::nullable("count", KernelDataType::INTEGER), + 5, + "phys_count", + ), + ]); + Arc::new(StructType::new_unchecked([ + with_column_mapping( + StructField::new("id", KernelDataType::LONG, false), + 1, + "phys_id", + ), + with_column_mapping( + StructField::nullable( + "items", + ArrayType::new(KernelDataType::Struct(Box::new(item_struct)), true), + ), + 2, + "phys_items", + ), + with_column_mapping( + StructField::nullable("name", KernelDataType::STRING), + 3, + "phys_name", + ), + ])) + } + + /// Deeply nested schema: struct -> array -> struct -> map(value) -> struct. + /// + /// The leaf struct field is intentionally **not** annotated with column mapping metadata, + /// so this schema can be used to test error paths when column mapping is enabled. + pub(crate) fn test_deep_nested_schema_missing_leaf_cm() -> StructType { + let leaf_struct = + StructType::new_unchecked([StructField::new("leaf", KernelDataType::INTEGER, false)]); + let map_type = MapType::new( + KernelDataType::STRING, + KernelDataType::Struct(Box::new(leaf_struct)), + true, + ); + let mid_struct = StructType::new_unchecked([with_column_mapping( + StructField::nullable("mid_field", map_type), + 2, + "phys_mid_field", + )]); + let array_type = ArrayType::new(KernelDataType::Struct(Box::new(mid_struct)), true); + StructType::new_unchecked([with_column_mapping( + StructField::nullable("top", array_type), + 1, + "phys_top", + )]) + } + + /// Build a create-table transaction with the given schema and column mapping mode. + /// Returns the engine and uncommitted transaction. 
+    pub(crate) fn setup_column_mapping_txn(
+        schema: SchemaRef,
+        mode: ColumnMappingMode,
+    ) -> DeltaResult<(Arc<dyn Engine>, Transaction)> {
+        let mode_str = match mode {
+            ColumnMappingMode::Name => "name",
+            ColumnMappingMode::Id => "id",
+            ColumnMappingMode::None => "none",
+        };
+        let store = Arc::new(InMemory::new());
+        let engine: Arc<dyn Engine> = Arc::new(DefaultEngineBuilder::new(store).build());
+
+        let txn = create_table("memory:///test_table", schema, "DefaultEngine")
+            .with_table_properties([("delta.columnMapping.mode", mode_str)])
+            .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?;
+        Ok((engine, txn))
+    }
+
+    /// Validate that a physical schema matches the logical schema's column mapping metadata.
+    /// For Name/Id modes, checks physicalName, columnMapping.id, and parquet.field.id on
+    /// each field. For None mode, only checks field names match.
+    pub(crate) fn validate_physical_schema_column_mapping(
+        logical_schema: &StructType,
+        physical_schema: &StructType,
+        mode: ColumnMappingMode,
+    ) {
+        assert_eq!(
+            physical_schema.fields().count(),
+            logical_schema.fields().count()
+        );
+
+        // Collect expected (physical_name, field_id) from logical schema
+        let expected: Vec<_> = logical_schema
+            .fields()
+            .map(|f| {
+                let physical_name =
+                    match f.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) {
+                        Some(MetadataValue::String(name)) => name.clone(),
+                        _ if mode == ColumnMappingMode::None => f.name().to_string(),
+                        _ => panic!("Logical field '{}' missing physicalName metadata", f.name()),
+                    };
+                let field_id = match f.get_config_value(&ColumnMetadataKey::ColumnMappingId) {
+                    Some(MetadataValue::Number(id)) => *id,
+                    _ if mode == ColumnMappingMode::None => -1,
+                    _ => panic!(
+                        "Logical field '{}' missing columnMapping.id metadata",
+                        f.name()
+                    ),
+                };
+                (physical_name, field_id)
+            })
+            .collect();
+
+        // Validate each physical field against expected values
+        for (physical_field, (expected_name, expected_id)) in
+            physical_schema.fields().zip(expected.iter())
+        {
+            assert_eq!(
+                physical_field.name(),
+                expected_name,
+                "Physical field name mismatch"
+            );
+
+            if mode == ColumnMappingMode::None {
+                continue;
+            }
+
+            assert_eq!(
+                physical_field.get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName),
+                Some(&MetadataValue::String(expected_name.clone())),
+                "columnMapping.physicalName mismatch for '{}'",
+                physical_field.name()
+            );
+
+            assert_eq!(
+                physical_field.get_config_value(&ColumnMetadataKey::ColumnMappingId),
+                Some(&MetadataValue::Number(*expected_id)),
+                "columnMapping.id mismatch for '{}'",
+                physical_field.name()
+            );
+
+            assert_eq!(
+                physical_field.get_config_value(&ColumnMetadataKey::ParquetFieldId),
+                Some(&MetadataValue::Number(*expected_id)),
+                "parquet.field.id mismatch for '{}'",
+                physical_field.name()
+            );
+        }
+    }
+
+    /// Load a test table from tests/data directory.
+    /// Tries compressed (tar.zst) first, falls back to extracted.
+    /// Returns (engine, snapshot, optional tempdir). The TempDir must be kept alive
+    /// for the duration of the test to prevent premature cleanup of extracted files.
+ pub(crate) fn load_test_table( + table_name: &str, + ) -> DeltaResult<(Arc, SnapshotRef, Option)> { + // Try loading compressed table first, fall back to extracted + let (path, tempdir) = match load_test_data("tests/data", table_name) { + Ok(test_dir) => { + let test_path = test_dir.path().join(table_name); + (test_path, Some(test_dir)) + } + Err(_) => { + // Fall back to already-extracted table + let manifest_dir = env!("CARGO_MANIFEST_DIR"); + let mut path = PathBuf::from(manifest_dir); + path.push("tests/data"); + path.push(table_name); + let path = std::fs::canonicalize(path) + .map_err(|e| Error::Generic(format!("Failed to canonicalize path: {e}")))?; + (path, None) + } + }; + + // Create engine and snapshot from the resolved path + let url = Url::from_directory_path(&path) + .map_err(|_| Error::Generic("Failed to create URL from path".to_string()))?; + + let store = Arc::new(LocalFileSystem::new()); + let engine = Arc::new(DefaultEngineBuilder::new(store).build()); + let snapshot = Snapshot::builder_for(url).build(engine.as_ref())?; + Ok((engine, snapshot, tempdir)) + } } #[cfg(test)] @@ -340,4 +899,53 @@ mod tests { "s3://foo/__unitystorage/catalogs/cid/tables/tid/" ); } + + mod on_complete_tests { + use super::*; + use std::sync::atomic::{AtomicBool, AtomicU32, Ordering}; + use std::sync::Arc; + + #[test] + fn test_calls_on_exhaustion() { + let called = Arc::new(AtomicBool::new(false)); + let called_clone = called.clone(); + let mut iter = vec![1, 2].into_iter().on_complete(move || { + called_clone.store(true, Ordering::SeqCst); + }); + assert_eq!(iter.next(), Some(1)); + assert!(!called.load(Ordering::SeqCst)); + assert_eq!(iter.next(), Some(2)); + assert_eq!(iter.next(), None); + assert!(called.load(Ordering::SeqCst)); + } + + #[test] + fn test_does_not_call_on_early_drop() { + let called = Arc::new(AtomicBool::new(false)); + let called_clone = called.clone(); + { + let mut iter = vec![1, 2].into_iter().on_complete(move || { + called_clone.store(true, Ordering::SeqCst); + }); + assert_eq!(iter.next(), Some(1)); + // Drop without exhausting - callback should NOT be called + } + assert!(!called.load(Ordering::SeqCst)); + } + + #[test] + fn test_calls_only_once() { + let count = Arc::new(AtomicU32::new(0)); + let count_clone = count.clone(); + { + let mut iter = vec![1].into_iter().on_complete(move || { + count_clone.fetch_add(1, Ordering::SeqCst); + }); + assert_eq!(iter.next(), Some(1)); + assert_eq!(iter.next(), None); // triggers callback + assert_eq!(iter.next(), None); // should not trigger again + } // drop should not trigger again + assert_eq!(count.load(Ordering::SeqCst), 1); + } + } } diff --git a/kernel/tests/README.md b/kernel/tests/README.md new file mode 100644 index 0000000000..856615d577 --- /dev/null +++ b/kernel/tests/README.md @@ -0,0 +1,257 @@ +# Test Tables Reference + +Test tables organized by feature area. Tables live in two locations: + +- **`data/`** -- Tables used by specific, targeted tests. Either unpacked directories or `.tar.zst` archives that individual test files decompress on the fly. +- **`golden_data/`** -- Tables from the [Delta compatibility suite](https://github.com/delta-io/delta/tree/master/connectors/golden-tables). These are `.tar.zst` archives loaded by the `golden_test!`, `negative_test!`, or `skip_test!` macros in `golden_tables.rs`, which run a standard read-and-compare flow against each table. 
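+
+Archived tables under `data/` are decompressed on the fly by the tests that use them, via the `load_test_data` helper from the workspace's `test_utils` crate. A minimal sketch of that flow, mirroring what `cdf.rs` does (the table name here is just an example):
+
+```rust
+let test_dir = load_test_data("tests/data", "cdf-table").unwrap();
+let table_path = test_dir.path().join("cdf-table");
+let table_url = delta_kernel::try_parse_uri(table_path.to_str().unwrap()).unwrap();
+// Keep `test_dir` alive for the whole test: dropping it deletes the extracted files.
+```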
+ +## Deletion Vectors + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `table-with-dv-small` | data/ | `value: int` | v3/v7 | r:`deletionVectors` w:`deletionVectors` | 10 rows, 2 soft-deleted by DV, 8 visible. Most heavily referenced test table. | `dv.rs::test_table_scan(with_dv)`, `write.rs::test_remove_files_adds_expected_entries`, `write.rs::test_update_deletion_vectors_adds_expected_entries`, `read.rs::with_predicate_and_removes`, `path.rs::test_to_uri/test_child/test_child_escapes`, `snapshot.rs::test_snapshot_read_metadata/test_new_snapshot/test_snapshot_new_from/test_read_table_with_missing_last_checkpoint/test_log_compaction_writer`, `deletion_vector.rs` tests, `transaction/mod.rs::setup_dv_enabled_table/test_add_files_schema/test_new_deletion_vector_path`, `default/parquet.rs` read test, `default/json.rs` read test, `log_compaction/tests.rs::create_mock_snapshot`, `resolve_dvs.rs` tests | +| `table-without-dv-small` | data/ | `value: long` | v1/v2 | | 10 rows, all visible. Companion to table-with-dv-small. | `dv.rs::test_table_scan(without_dv)`, `transaction/mod.rs::setup_non_dv_table/create_existing_table_txn/test_commit_io_error_returns_retryable_transaction`, `sequential_phase.rs::test_sequential_v2_with_commits_only/test_sequential_finish_before_exhaustion_error`, `parallel_phase.rs` tests, `scan/tests.rs::test_scan_metadata_paths/test_scan_metadata/test_scan_metadata_from_same_version` | +| `with-short-dv` | data/ | `id: long, value: string, timestamp: timestamp, rand: double` | v3/v7 | r:`deletionVectors` w:`deletionVectors` | 2 files x 5 rows. First file has inline DV (`storageType="u"`) deleting 3 rows. | `read.rs::short_dv` | +| `dv-partitioned-with-checkpoint` | golden_data/ | `value: int, part: int` partitioned by `part` | v3/v7 | r:`deletionVectors` w:`deletionVectors` | DVs on a partitioned table with a checkpoint | `golden_tables.rs::golden_test!` | +| `dv-with-columnmapping` | golden_data/ | `value: int` | v3/v7 | r:`deletionVectors,columnMapping` w:`deletionVectors,columnMapping`, `columnMapping.mode=name` | DVs combined with column mapping | `golden_tables.rs::golden_test!` | +| `log-replay-dv-key-cases` | golden_data/ | `value: int` | v3/v7 | r:`deletionVectors` w:`deletionVectors` | Edge cases in DV key handling during log replay | `golden_tables.rs::golden_test!` | + +## Change Data Feed (CDF) + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `table-with-cdf` | data/ | `part: int, id: int` | v1/v4 | `enableChangeDataFeed=true` | Fake file paths for CDF validation logic | `table_changes/mod.rs::test_table_changes_errors/test_cdf_start_end_version/test_cdf_no_end_version`, `table_changes/scan.rs::test_table_changes_scan_errors/test_create_cdf_scan_batches_errors` | +| `cdf-table` | data/cdf-table.tar.zst | `id: int, name: string, birthday: date` partitioned by `birthday` | v1/v4 | `enableChangeDataFeed=true` | Base CDF table | `cdf.rs::basic_cdf` | +| `cdf-table-simple` | data/cdf-table-simple.tar.zst | `id: long` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Version range queries and validation | `cdf.rs::simple_cdf_version_ranges/invalid_range_end_before_start/invalid_range_start_after_last_version_of_table` | +| `cdf-table-non-partitioned` | data/cdf-table-non-partitioned.tar.zst | `id: int, name: string, 
birthday: date, long_field: long, boolean_field: boolean, double_field: double, smallint_field: short` | v1/v4 | `enableChangeDataFeed=true` | Non-partitioned CDF table | `cdf.rs::cdf_non_partitioned` | +| `cdf-table-partitioned` | data/cdf-table-partitioned.tar.zst | `id: long, text: string, part: long` partitioned by `part` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Partitioned CDF table | `cdf.rs::partition_table` | +| `cdf-table-delete-unconditional` | data/cdf-table-delete-unconditional.tar.zst | `id: long` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Unconditional DELETE | `cdf.rs::unconditional_delete` | +| `cdf-table-delete-conditional-all-rows` | data/cdf-table-delete-conditional-all-rows.tar.zst | `id: long` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Conditional DELETE removing all rows | `cdf.rs::conditional_delete_all_rows` | +| `cdf-table-delete-conditional-two-rows` | data/cdf-table-delete-conditional-two-rows.tar.zst | `id: long` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Conditional DELETE removing two rows | `cdf.rs::conditional_delete_two_rows` | +| `cdf-table-update-ops` | data/cdf-table-update-ops.tar.zst | `id: long` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | UPDATE operations | `cdf.rs::update_operations` | +| `cdf-table-data-change` | data/cdf-table-data-change.tar.zst | `id: long` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Operations with `dataChange=false` | `cdf.rs::false_data_change_is_ignored` | +| `cdf-table-with-dv` | data/cdf-table-with-dv.tar.zst | `value: int` | v3/v7 | r:`deletionVectors` w:`deletionVectors,changeDataFeed`, `enableChangeDataFeed=true` | CDF with deletion vectors | `cdf.rs::cdf_with_deletion_vector` | +| `cdf-table-with-cdc-and-dvs` | data/cdf-table-with-cdc-and-dvs.tar.zst | `id: int, comment: string` | v3/v7 | r:`deletionVectors` w:`deletionVectors,changeDataFeed`, `enableChangeDataFeed=true` | CDF with both CDC files and DVs | `cdf.rs::cdf_with_cdc_and_dvs` | +| `cdf-table-backtick-column-names` | data/cdf-table-backtick-column-names.tar.zst | `` `id.num`: int, `id.num\`s`: int, struct_col: struct{field: int, `field.one`: int} `` | v1/v7 | w:`changeDataFeed,appendOnly,invariants`, `enableChangeDataFeed=true` | Backtick-escaped column names | `cdf.rs::backtick_column_names` | +| `cdf-column-mapping-name-mode` | data/cdf-column-mapping-name-mode.tar.zst | `id: long, name: string, value: double` | v2/v5 | Column mapping (name) | CDF + column mapping name mode | `cdf.rs::cdf_with_column_mapping_name_mode` | +| `cdf-column-mapping-name-mode-3-7` | data/cdf-column-mapping-name-mode-3-7.tar.zst | `id: long, name: string, value: double` | v3/v7 | r:`columnMapping,deletionVectors` w:`columnMapping,deletionVectors` | CDF + column mapping name mode (v3/v7) | `cdf.rs::cdf_with_column_mapping_name_mode` | +| `cdf-column-mapping-id-mode` | data/cdf-column-mapping-id-mode.tar.zst | `id: long, name: string, value: double` | v2/v5 | Column mapping (id) | CDF + column mapping id mode | `cdf.rs::cdf_with_column_mapping_id_mode` | + +## Column Mapping + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `partition_cm/none` | data/ | `value: int, category: string` partitioned by `category` | v1/v1 | `columnMapping.mode=none` 
| Partitioned write with CM disabled | `write.rs::test_column_mapping_partitioned_write(cm_none)` | +| `partition_cm/id` | data/ | `value: int, category: string` partitioned by `category` | v3/v7 | r:`columnMapping` w:`columnMapping`, `columnMapping.mode=id` | Partitioned write with CM id mode | `write.rs::test_column_mapping_partitioned_write(cm_id)` | +| `partition_cm/name` | data/ | `value: int, category: string` partitioned by `category` | v3/v7 | r:`columnMapping` w:`columnMapping`, `columnMapping.mode=name` | Partitioned write with CM name mode | `write.rs::test_column_mapping_partitioned_write(cm_name)` | +| `table-with-columnmapping-mode-name` | golden_data/ | `ByteType: byte, ShortType: short, IntegerType: int, LongType: long, FloatType: float, DoubleType: double, decimal: decimal(10,2), BooleanType: boolean, StringType: string, BinaryType: binary, DateType: date, TimestampType: timestamp, nested_struct: struct{aa: string, ac: struct{aca: int}}, array_of_prims: array, array_of_arrays: array>, array_of_structs: array, map_of_prims: map, map_of_rows: map, map_of_arrays: map>` | v2/v5 | `columnMapping.mode=name` | Column mapping name mode | `golden_tables.rs::golden_test!` | +| `table-with-columnmapping-mode-id` | golden_data/ | `ByteType: byte, ShortType: short, IntegerType: int, LongType: long, FloatType: float, DoubleType: double, decimal: decimal(10,2), BooleanType: boolean, StringType: string, BinaryType: binary, DateType: date, TimestampType: timestamp, nested_struct: struct{aa: string, ac: struct{aca: int}}, array_of_prims: array, array_of_arrays: array>, array_of_structs: array, map_of_prims: map, map_of_rows: map, map_of_arrays: map>` | v2/v5 | `columnMapping.mode=id` | Column mapping id mode | `golden_tables.rs::golden_test!` | + +## Checkpoints + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `with_checkpoint_no_last_checkpoint` | data/ | `letter: string, int: long, date: date` | v1/v2 | `checkpointInterval=2` | Checkpoint at v2 but missing `_last_checkpoint` hint file | `snapshot.rs::test_read_table_with_checkpoint`, `scan/tests.rs::test_scan_with_checkpoint`, `sequential_phase.rs::test_sequential_checkpoint_no_commits`, `checkpoint_manifest.rs` tests, `sync/parquet.rs` test, `default/parquet.rs` test | +| `external-table-different-nullability` | data/ | `i: int` | v1/v2 | `checkpointInterval=2` | Parquet files have different nullability than Delta schema; includes checkpoint | `write.rs::test_checkpoint_non_kernel_written_table` | +| `checkpoint` | golden_data/ | `intCol: int` | v1/v2 | | Basic checkpoint read | `golden_tables.rs::golden_test!(checkpoint_test)` | +| `corrupted-last-checkpoint-kernel` | golden_data/ | `id: long` | v1/v2 | | Corrupted `_last_checkpoint` file | `golden_tables.rs::golden_test!` | +| `multi-part-checkpoint` | golden_data/ | `id: long` | v1/v2 | `checkpointInterval=1` | Multi-part checkpoint files | `golden_tables.rs::golden_test!` | +| `only-checkpoint-files` | golden_data/ | `id: long` | v1/v2 | `checkpointInterval=1` | Table with only checkpoint files, no JSON commits | `golden_tables.rs::golden_test!` | + +## V2 Checkpoints + +Tests V2 checkpoints across format, sidecar, and `_last_checkpoint` combinations. + +All `data/` V2 checkpoint tables share: schema=`id: long`, protocol v3/v7, r:`v2Checkpoint` w:`v2Checkpoint,appendOnly,invariants`, `checkpointPolicy=v2`. 
+ +| Table | Location | Format | Sidecars | `_last_checkpoint` | Tests | +|-------|----------|--------|----------|--------------------|-------| +| `v2-checkpoints-json-with-sidecars` | data/v2-checkpoints-json-with-sidecars.tar.zst | JSON | Yes | No | `v2_checkpoints.rs::v2_checkpoints_json_with_sidecars`, `checkpoint_manifest.rs` test, `sequential_phase.rs::test_sequential_v2_with_sidecars`, `parallel_phase.rs` tests | +| `v2-checkpoints-json-without-sidecars` | data/v2-checkpoints-json-without-sidecars.tar.zst | JSON | No | No | `v2_checkpoints.rs::v2_checkpoints_json_without_sidecars`, `sequential_phase.rs::test_sequential_checkpoint_without_sidecars` | +| `v2-checkpoints-json-with-last-checkpoint` | data/v2-checkpoints-json-with-last-checkpoint.tar.zst | JSON | Yes | Yes | `v2_checkpoints.rs::v2_checkpoints_json_with_last_checkpoint` | +| `v2-checkpoints-parquet-with-sidecars` | data/v2-checkpoints-parquet-with-sidecars.tar.zst | Parquet | Yes | No | `v2_checkpoints.rs::v2_checkpoints_parquet_with_sidecars`, `checkpoint_manifest.rs` test, `sequential_phase.rs::test_sequential_parquet_checkpoint_with_sidecars`, `parallel_phase.rs` test | +| `v2-checkpoints-parquet-without-sidecars` | data/v2-checkpoints-parquet-without-sidecars.tar.zst | Parquet | No | No | `v2_checkpoints.rs::v2_checkpoints_parquet_without_sidecars` | +| `v2-checkpoints-parquet-with-last-checkpoint` | data/v2-checkpoints-parquet-with-last-checkpoint.tar.zst | Parquet | Yes | Yes | `v2_checkpoints.rs::v2_checkpoints_parquet_with_last_checkpoint` | +| `v2-classic-checkpoint-json` | data/v2-classic-checkpoint-json.tar.zst | JSON | Classic compat | No | `v2_checkpoints.rs::v2_classic_checkpoint_json` | +| `v2-classic-checkpoint-parquet` | data/v2-classic-checkpoint-parquet.tar.zst | Parquet | Classic compat | No | `v2_checkpoints.rs::v2_classic_checkpoint_parquet` | + +The golden V2 checkpoint tables have a different protocol (v1/v2, no table features): + +| Table | Location | Schema | Protocol (R/W) | Features | Tests | +|-------|----------|--------|----------|----------|-------| +| `v2-checkpoint-json` | golden_data/ | `id: long` | v1/v2 | `checkpointInterval=2` | `golden_tables.rs::golden_test!` | +| `v2-checkpoint-parquet` | golden_data/ | `id: long` | v1/v2 | `checkpointInterval=2` | `golden_tables.rs::golden_test!` | + +## Data Skipping & Statistics + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `parsed-stats` | data/ | `id: long, name: string, age: long, salary: long, ts_col: timestamp` | v1/v2 | `checkpointInterval=3` | 100 rows per commit (v0-v5). Checkpoint at v3 with `stats_parsed` columns including `ts_col` for testing timestamp stat truncation adjustment and data skipping predicates. | `scan/tests.rs::test_scan_metadata_with_stats_columns/test_data_skipping_with_parsed_stats/test_scan_metadata_stats_columns_with_predicate/test_prefix_columns_simple/test_build_actions_meta_predicate_with_predicate/test_build_actions_meta_predicate_no_predicate/test_build_actions_meta_predicate_static_skip_all/timestamp_predicate_skipping/unsupported_predicate_skipping/test_skip_stats_disables_data_skipping/test_skip_stats_and_include_stats_columns_errors` | +| `timestamp-truncation-stats` | data/ | `id: long, ts_col: timestamp` | v1/v2 | | Three Spark-written files with sub-millisecond timestamps. File 2 has max truncated from 4.000500s to 4.000s in JSON stats. Verifies max stat pruning with truncation adjustment. 
| `read.rs::timestamp_max_stat_pruning_with_real_table` | +| `parquet_row_group_skipping` | data/ | `bool, chrono{date32,timestamp,timestamp_ntz}, numeric{decimals{...},floats{...},ints{...}}, varlen{binary,utf8}` | v3/v7 | r:`timestampNtz` w:`timestampNtz`, `dataSkippingNumIndexedCols=0` | Deep nested structs, all types. Delta stats disabled; tests parquet-level row group skipping. | `read.rs::predicate_references_invalid_missing_column`, `scan/tests.rs::test_replay_for_scan_metadata/test_data_row_group_skipping/test_missing_column_row_group_skipping`, `set_transaction.rs::test_replay_for_app_ids`, `parquet_row_group_skipping/tests.rs`, `protocol_metadata_replay.rs` | +| `v1-single-part-struct-stats-only` | data/ | `id: long, value: string` | v3/v7 | r:`deletionVectors` w:`deletionVectors,appendOnly,invariants`, `writeStatsAsStruct=true`, `writeStatsAsJson=false` | 5 rows across 5 commits, single-part V1 checkpoint at v5 with `stats_parsed` only (no JSON stats). | `read.rs::stats_parsed_skipping::v1_single_part` | +| `v1-multi-part-struct-stats-only` | data/ | `id: long, value: string` | v3/v7 | r:`deletionVectors` w:`deletionVectors,appendOnly,invariants`, `writeStatsAsStruct=true`, `writeStatsAsJson=false` | 5 rows across 5 commits, 3-part V1 checkpoint at v5 with `stats_parsed` only (no JSON stats). | `read.rs::stats_parsed_skipping::v1_multi_part` | +| `v1-multi-part-partitioned-struct-stats-only` | data/ | `id: long, value: string, part: int` partitioned by `part` | v3/v7 | r:`deletionVectors` w:`deletionVectors,appendOnly,invariants`, `writeStatsAsStruct=true`, `writeStatsAsJson=false` | 5 rows across 5 commits (part=i%3), 3-part V1 checkpoint at v5 with `partitionValues_parsed`. | `read.rs::partition_values_parsed_skipping` | +| `v2-parquet-sidecars-struct-stats-only` | data/ | `id: long, value: string` | v3/v7 | r:`deletionVectors,v2Checkpoint` w:`deletionVectors,v2Checkpoint,appendOnly,invariants`, `writeStatsAsStruct=true`, `writeStatsAsJson=false`, `checkpointPolicy=v2` | 5 rows across 5 commits, UUID-named V2 parquet checkpoint at v5 with sidecars containing `stats_parsed`. | `read.rs::stats_parsed_skipping::v2_parquet_sidecars` | +| `v2-json-sidecars-struct-stats-only` | data/ | `id: long, value: string` | v3/v7 | r:`deletionVectors,v2Checkpoint` w:`deletionVectors,v2Checkpoint,appendOnly,invariants`, `writeStatsAsStruct=true`, `writeStatsAsJson=false`, `checkpointPolicy=v2` | 5 rows across 5 commits, UUID-named V2 JSON checkpoint at v5 with sidecars containing `stats_parsed`. | `read.rs::stats_parsed_skipping::v2_json_sidecars` | +| `v2-classic-parquet-struct-stats-only` | data/ | `id: long, value: string` | v3/v7 | r:`deletionVectors` w:`deletionVectors,appendOnly,invariants`, `writeStatsAsStruct=true`, `writeStatsAsJson=false`, `checkpointPolicy=classic` | 5 rows across 5 commits, classic-named V2 parquet checkpoint at v5 with `stats_parsed`. | `read.rs::stats_parsed_skipping::v2_classic_parquet` | +| `mixed-nulls` | data/ | `id: long, part: long, value: string, n: string` partitioned by `part` | v1/v2 | | part=0: `n` all null. part=1: no nulls. part=2: mixed nulls. 
| `read.rs::mixed_null/mixed_not_null` | +| `stats-writing-all-types` | data/stats-writing-all-types/delta/ | 16 columns: all primitives + array, map, nested struct | v3/v7 | r:`timestampNtz,columnMapping` w:`timestampNtz,columnMapping`, `columnMapping.mode=name` | Verifies stats collection matches Spark output | `default/stats.rs::test_collect_stats_matches_spark` | +| `data-skipping-basic-stats-all-types` | golden_data/ | `as_int: int, as_long: long, as_byte: byte, as_short: short, as_float: float, as_double: double, as_string: string, as_date: date, as_timestamp: timestamp, as_big_decimal: decimal(1,0)` | v1/v2 | | Basic stats for all types | `golden_tables.rs::golden_test!` | +| `data-skipping-basic-stats-all-types-checkpoint` | golden_data/ | `as_int: int, as_long: long, as_byte: byte, as_short: short, as_float: float, as_double: double, as_string: string, as_date: date, as_timestamp: timestamp, as_big_decimal: decimal(1,0)` | v1/v2 | `checkpointInterval=1` | Stats from checkpoint | `golden_tables.rs::golden_test!` | +| `data-skipping-basic-stats-all-types-columnmapping-id` | golden_data/ | `as_int: int, as_long: long, as_byte: byte, as_short: short, as_float: float, as_double: double, as_string: string, as_date: date, as_timestamp: timestamp, as_big_decimal: decimal(1,0)` | v2/v5 | `columnMapping.mode=id` | Stats with CM id mode (not supported) | `golden_tables.rs::skip_test!` | +| `data-skipping-basic-stats-all-types-columnmapping-name` | golden_data/ | `as_int: int, as_long: long, as_byte: byte, as_short: short, as_float: float, as_double: double, as_string: string, as_date: date, as_timestamp: timestamp, as_big_decimal: decimal(1,0)` | v2/v5 | `columnMapping.mode=name` | Stats with CM name mode | `golden_tables.rs::golden_test!` | +| `data-skipping-change-stats-collected-across-versions` | golden_data/ | `col1: int, col2: int` | v1/v2 | | Stats that change across versions | `golden_tables.rs::golden_test!` | +| `data-skipping-partition-and-data-column` | golden_data/ | `part: int, id: int` | v1/v2 | | Skipping with partition + data predicates | `golden_tables.rs::golden_test!` | + +## Type Handling + +### Decimal + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `basic-decimal-table` | data/ | `part: decimal(12,5), col1: decimal(5,2), col2: decimal(10,5), col3: decimal(20,10)` partitioned by `part` | v1/v2 | Decimal partitions with negative, small, and large values | `read.rs::basic_decimal`, `golden_tables.rs::golden_basic_decimal_table` | +| `124-decimal-decode-bug` | golden_data/ | `large_decimal: decimal(10,0)` | v1/v2 | Regression for decimal decode bug #124 | `golden_tables.rs::golden_test!` | +| `125-iterator-bug` | golden_data/ | `col1: int` | v1/v2 | Regression for iterator bug #125 | `golden_tables.rs::golden_test!` | +| `basic-decimal-table-legacy` | golden_data/ | `part: decimal(12,5), col1: decimal(5,2), col2: decimal(10,5), col3: decimal(20,10)` partitioned by `part` | v1/v2 | Decimal types in legacy parquet format | `golden_tables.rs::golden_test!` | +| `decimal-various-scale-precision` | golden_data/ | 29 decimal columns with varying precisions (4,0 through 38,36) | v1/v2 | Various scale/precision combinations | `golden_tables.rs::golden_test!` | +| `parquet-decimal-dictionaries` | golden_data/ | `id: int, col1: decimal(9,0), col2: decimal(12,0), col3: decimal(25,0)` | v1/v2 | Dictionary-encoded decimals | `golden_tables.rs::golden_test!` | +| `parquet-decimal-dictionaries-v1` 
| golden_data/ | `id: int, col1: decimal(9,0), col2: decimal(12,0), col3: decimal(25,0)` | v1/v2 | Dictionary-encoded decimals (parquet v1) | `golden_tables.rs::golden_test!` | +| `parquet-decimal-dictionaries-v2` | golden_data/ | `id: int, col1: decimal(9,0), col2: decimal(12,0), col3: decimal(25,0)` | v1/v2 | Dictionary-encoded decimals (parquet v2) | `golden_tables.rs::golden_test!` | +| `parquet-decimal-type` | golden_data/ | `id: int, col1: decimal(5,1), col2: decimal(10,5), col3: decimal(20,5)` | v1/v2 | Parquet decimal type handling | `golden_tables.rs::golden_test!` | + +### Timestamps + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `data-reader-timestamp_ntz` | data/ | `id: int, tsNtz: timestamp_ntz, tsNtzPartition: timestamp_ntz` partitioned by `tsNtzPartition` | v3/v7, r:`timestampNtz` | Timestamp without timezone as data + partition column | `read.rs::timestamp_ntz`, `golden_tables.rs::golden_data_reader_timestamp_ntz` | +| `timestamp-partitioned-table` | data/timestamp-partitioned-table.tar.zst | `id: long, x: double, s: string, time: timestamp` partitioned by `time` | v3/v7 | r:`deletionVectors` w:`deletionVectors,invariants,appendOnly`, `enableDeletionVectors=true` | Table partitioned by timestamp column | `read.rs::timestamp_partitioned_table` | +| `kernel-timestamp-int96` | golden_data/ | `id: long, tsNtz: timestamp` | v1/v2 | Timestamps stored as INT96 | `golden_tables.rs::golden_test!` | +| `kernel-timestamp-pst` | golden_data/ | `id: long, tsNtz: timestamp` | v1/v2 | Timestamps in PST timezone | `golden_tables.rs::golden_test!` | +| `kernel-timestamp-timestamp_micros` | golden_data/ | `id: long, tsNtz: timestamp` | v1/v2 | TIMESTAMP_MICROS encoding | `golden_tables.rs::golden_test!` | +| `kernel-timestamp-timestamp_millis` | golden_data/ | `id: long, tsNtz: timestamp` | v1/v2 | TIMESTAMP_MILLIS encoding | `golden_tables.rs::golden_test!` | +| `data-reader-timestamp_ntz-id-mode` | golden_data/ | `id: int, tsNtz: timestamp_ntz, tsNtzPartition: timestamp_ntz` partitioned by `tsNtzPartition` | v3/v7 | r:`timestampNtz,columnMapping` w:`timestampNtz,columnMapping`, `columnMapping.mode=id` | Timestamp NTZ with CM id mode | `golden_tables.rs::golden_test!` | +| `data-reader-timestamp_ntz-name-mode` | golden_data/ | `id: int, tsNtz: timestamp_ntz, tsNtzPartition: timestamp_ntz` partitioned by `tsNtzPartition` | v3/v7 | r:`timestampNtz,columnMapping` w:`timestampNtz,columnMapping`, `columnMapping.mode=name` | Timestamp NTZ with CM name mode | `golden_tables.rs::golden_test!` | + +### Type Widening & Variant + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `type-widening` | data/ | 13 columns named by widening path (e.g. 
`byte_long`, `float_double`, `date_timestamp_ntz`) | v1/v2 (at v0) | Commit 0 = narrow types, commit 1 = widened | `read.rs::type_widening_basic/type_widening_decimal` | +| `unshredded-variant` | data/unshredded-variant.tar.zst | `id: long, v: variant, array_of_variants: array, struct_of_variants: struct{v: variant}, map_of_variants: map, array_of_struct_of_variants: array, struct_of_array_of_variants: struct{v: array}` | v3/v7 | r:`variantType-preview` w:`variantType-preview,appendOnly,invariants`, `checkpointInterval=2` | Unshredded variant type column | `read.rs::unshredded_variant_table` | + +## Application Transactions + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `app-txn-checkpoint` | data/ | `id: string, value: int, modified: string` partitioned by `modified` | v1/v2 | | `txn` for `appId="my-app"`. Two partitions. Has checkpoint. | `hdfs.rs::read_table_version_hdfs`, `set_transaction.rs::test_txn` | +| `app-txn-no-checkpoint` | data/ | `id: string, value: int, modified: string` partitioned by `modified` | v1/v2 | | Same as above but no checkpoint | `commit.rs::test_commit_phase_processes_commits`, `checkpoint/mod.rs` (doc example), `set_transaction.rs::test_txn` | +| `app-txn-with-last-updated` | data/ | `a: long, b: long` | v1/v2 | `setTransactionRetentionDuration="interval 1 days"` | `txn` with `lastUpdated` timestamps and retention config | `set_transaction.rs::test_txn_retention_filtering` | + +## CRC (Checksum) Files + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `crc-full` | data/ | `id: long` | v3/v7 | r:`deletionVectors` w:`domainMetadata,deletionVectors,rowTracking` | Full CRC with `allFiles` (10 files), 2 `setTransactions`, `domainMetadata`, `fileSizeHistogram` | `crc/reader.rs::test_read_crc`, `crc/lazy.rs::test_lazy_crc` | +| `crc-malformed` | data/ | N/A | N/A | | CRC file contains only `"malformed"` | `crc/lazy.rs::test_lazy_crc_malformed_file`, `crc/reader.rs` error test | + +## Partitioning & Write Path + +| Table | Location | Schema | Protocol (R/W) | Features | Description | Tests | +|-------|----------|--------|----------|----------|-------------|-------| +| `basic_partitioned` | data/ | `letter: string, number: long, a_float: double` partitioned by `letter` | v1/v2 | | Partitions: a, b, c, e, `__HIVE_DEFAULT_PARTITION__`. Second most referenced table. 
| `read.rs::data/column_ordering/column_ordering_and_projection/predicate_on_number/predicate_on_letter/predicate_on_letter_and_number/predicate_on_number_not/predicate_on_number_with_not_null/predicate_null/and_or_predicates/not_and_or_predicates/invalid_skips_none_predicates/predicate_references_invalid_missing_column`, `benchmarks/runners.rs`, `snapshot.rs::test_new_post_commit_simple`, `log_compaction/tests.rs::create_multi_version_snapshot`, `transaction/mod.rs::test_physical_schema_excludes_partition_columns/test_materialize_partition_columns_in_write_context/test_partition_column_in_eval_output`, `set_transaction.rs::test_txn`, `scan/tests.rs::test_scan_metadata_from_with_update` | +| `partitioned_with_materialize_feature` | data/ | `letter: string, number: long, a_float: double` partitioned by `letter` | v3/v7 | w:`materializePartitionColumns` | Same data as basic_partitioned but partition columns materialized in write output | `transaction/mod.rs::test_materialize_partition_columns_in_write_context/test_physical_schema_includes_partition_columns_when_materialized/test_partition_column_in_eval_output` | + +## Log Replay & State Reconstruction + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `compacted-log-files-table` | data/compacted-log-files-table.tar.zst | `id: int, comment: string` | v3/v7, r:`deletionVectors` w:`deletionVectors,changeDataFeed` | Table with compacted (merged) log files | `read.rs::compacted_log_files_table` | +| `log-replay-latest-metadata-protocol` | golden_data/ | `id: int, val: string` | v1/v2 (at v0, changes later) | Log replay picks latest metadata/protocol | `golden_tables.rs::golden_test!` | +| `log-replay-special-characters` | golden_data/ | `id: int` | v1/v2 | File paths with special characters | `golden_tables.rs::golden_test!` | +| `log-replay-special-characters-a` | golden_data/ | `id: int` | v1/v2 | Special characters variant (a) | `golden_tables.rs::golden_test!` | +| `log-replay-special-characters-b` | golden_data/ | `id: int` | v1/v2 | Special characters variant (b) | `golden_tables.rs::skip_test!` (not yet implemented) | +| `deltalog-getChanges` | golden_data/ | `part: int, id: int` | v1/v2 | Reading changes from delta log | `golden_tables.rs::golden_test!` | +| `deltalog-invalid-protocol-version` | golden_data/ | `id: int` | v99/v7 | Invalid protocol version | `golden_tables.rs::negative_test!` | +| `deltalog-state-reconstruction-from-checkpoint-missing-metadata` | golden_data/ | N/A | v1/v2 | Checkpoint missing metadata action | `golden_tables.rs::negative_test!` | +| `deltalog-state-reconstruction-from-checkpoint-missing-protocol` | golden_data/ | N/A | v1/v2 | Checkpoint missing protocol action | `golden_tables.rs::negative_test!` | +| `deltalog-state-reconstruction-without-metadata` | golden_data/ | N/A | v3/v7 | Log without metadata action | `golden_tables.rs::negative_test!` | +| `deltalog-state-reconstruction-without-protocol` | golden_data/ | `intCol: int` | N/A | Log without protocol action | `golden_tables.rs::negative_test!` | +| `no-delta-log-folder` | golden_data/ | N/A | N/A | Missing `_delta_log` directory | `golden_tables.rs::negative_test!` | +| `versions-not-contiguous` | golden_data/ | N/A | v1/v2 | Non-contiguous version numbers | `golden_tables.rs::negative_test!` | + +## Snapshots & Time Travel + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| 
`snapshot-data0` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot at initial version | `golden_tables.rs::golden_test!` | +| `snapshot-data1` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot after first insert | `golden_tables.rs::golden_test!` | +| `snapshot-data2` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot after second insert | `golden_tables.rs::golden_test!` | +| `snapshot-data2-deleted` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot after delete operation | `golden_tables.rs::golden_test!` | +| `snapshot-data3` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot after third insert | `golden_tables.rs::golden_test!` | +| `snapshot-repartitioned` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot after repartitioning | `golden_tables.rs::golden_test!` | +| `snapshot-vacuumed` | golden_data/ | `col1: int, col2: string` | v1/v2 | Snapshot after vacuum | `golden_tables.rs::golden_test!` | +| `time-travel-start` | golden_data/ | `id: long` | v1/v2 | Time travel to initial version | `golden_tables.rs::golden_test!` | +| `time-travel-start-start20` | golden_data/ | `id: long` | v1/v2 | Time travel across 20 versions | `golden_tables.rs::golden_test!` | +| `time-travel-start-start20-start40` | golden_data/ | `id: long` | v1/v2 | Time travel across 40 versions | `golden_tables.rs::golden_test!` | +| `time-travel-partition-changes-a` | golden_data/ | `id: long, part5: long` partitioned by `part5` | v1/v2 | Time travel with partition changes (a) | `golden_tables.rs::golden_test!` | +| `time-travel-partition-changes-b` | golden_data/ | `id: long, part5: long` partitioned by `part5` | v1/v2 | Time travel with partition changes (b) | `golden_tables.rs::golden_test!` | +| `time-travel-schema-changes-a` | golden_data/ | `id: long` | v1/v2 | Time travel with schema changes (a) | `golden_tables.rs::golden_test!` | +| `time-travel-schema-changes-b` | golden_data/ | `id: long` | v1/v2 | Time travel with schema changes (b) | `golden_tables.rs::golden_test!` | + +## Data Readers (Golden) + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `data-reader-array-complex-objects` | golden_data/ | `i: int, 3d_int_list: array>>, 4d_int_list: array>>>, list_of_maps: array>, list_of_records: array` | v1/v2 | Arrays of complex objects | `golden_tables.rs::golden_test!` | +| `data-reader-array-primitives` | golden_data/ | `as_array_int: array, as_array_long: array, as_array_byte: array, as_array_short: array, as_array_boolean: array, as_array_float: array, as_array_double: array, as_array_string: array, as_array_binary: array, as_array_big_decimal: array` | v1/v2 | Arrays of primitive types | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-America` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 | Date types in America timezone | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-Asia` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 | Date types in Asia timezone | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-Etc` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 | Date types in Etc timezone | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-Iceland` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 | Date types in Iceland timezone | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-Jst` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 
| Date types in JST timezone | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-Pst` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 | Date types in PST timezone | `golden_tables.rs::golden_test!` | +| `data-reader-date-types-utc` | golden_data/ | `timestamp: timestamp, date: date` | v1/v2 | Date types in UTC timezone | `golden_tables.rs::golden_test!` | +| `data-reader-escaped-chars` | golden_data/ | `_1: string, _2: string` partitioned by `_2` | v1/v2 | File paths with escaped characters | `golden_tables.rs::golden_test!` | +| `data-reader-map` | golden_data/ | `i: int, a: map, b: map, c: map, d: map, e: map, f: map>` | v1/v2 | Map type columns | `golden_tables.rs::golden_test!` | +| `data-reader-nested-struct` | golden_data/ | `a: struct{aa: string, ab: string, ac: struct{aca: int, acb: long}}, b: int` | v1/v2 | Nested struct columns | `golden_tables.rs::golden_test!` | +| `data-reader-nullable-field-invalid-schema-key` | golden_data/ | `array_can_contain_null: array` | v1/v2 | Nullable fields with invalid schema keys | `golden_tables.rs::golden_test!` | +| `data-reader-partition-values` | golden_data/ | `as_int: int, as_long: long, as_byte: byte, as_short: short, as_boolean: boolean, as_float: float, as_double: double, as_string: string, as_string_lit_null: string, as_date: date, as_timestamp: timestamp, as_big_decimal: decimal(1,0), as_list_of_records: array, as_nested_struct: struct{aa: string, ab: string, ac: struct{aca: int, acb: long}}, value: string` partitioned by first 12 columns | v1/v2 | Partition values | `golden_tables.rs::skip_test!` (needs timestamp fix) | +| `data-reader-primitives` | golden_data/ | `as_int: int, as_long: long, as_byte: byte, as_short: short, as_boolean: boolean, as_float: float, as_double: double, as_string: string, as_binary: binary, as_big_decimal: decimal(1,0)` | v1/v2 | All primitive types | `golden_tables.rs::golden_test!` | +| `parquet-all-types` | golden_data/ | `ByteType: byte, ShortType: short, IntegerType: int, LongType: long, FloatType: float, DoubleType: double, decimal: decimal(10,2), BooleanType: boolean, StringType: string, BinaryType: binary, DateType: date, TimestampType: timestamp, TimestampNTZType: timestamp_ntz, nested_struct: struct{aa: string, ac: struct{aca: int}}, array_of_prims: array, array_of_arrays: array>, array_of_structs: array, map_of_prims: map, map_of_rows: map, map_of_arrays: map>` | v3/v7, r:`timestampNtz` w:`timestampNtz` | All parquet types | `golden_tables.rs::skip_test!` (nullability disagreement) | +| `parquet-all-types-legacy-format` | golden_data/ | `ByteType: byte, ShortType: short, IntegerType: int, LongType: long, FloatType: float, DoubleType: double, decimal: decimal(10,2), BooleanType: boolean, StringType: string, BinaryType: binary, DateType: date, TimestampType: timestamp, TimestampNTZType: timestamp_ntz, nested_struct: struct{aa: string, ac: struct{aca: int}}, array_of_prims: array, array_of_arrays: array>, array_of_structs: array, map_of_prims: map, map_of_rows: map, map_of_arrays: map>` | v3/v7, r:`timestampNtz` w:`timestampNtz` | Legacy parquet format | `golden_tables.rs::skip_test!` (legacy name issue) | + +## Insert / Update / Delete Operations (Golden) + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `basic-with-inserts-deletes-checkpoint` | golden_data/ | `id: long` | v1/v2 | Inserts and deletes with checkpoint | `golden_tables.rs::golden_test!` | +| `basic-with-inserts-merge` | 
golden_data/ | `id: int, str: string` | v1/v2 | Inserts via MERGE operation | `golden_tables.rs::golden_test!` | +| `basic-with-inserts-overwrite-restore` | golden_data/ | `id: long` | v1/v2 | Inserts with overwrite and RESTORE | `golden_tables.rs::golden_test!` | +| `basic-with-inserts-updates` | golden_data/ | `id: int, str: string` | v1/v2 | Inserts and updates | `golden_tables.rs::golden_test!` | +| `basic-with-vacuum-protocol-check-feature` | golden_data/ | `id: int, str: string` | v1/v2 | Table with vacuum protocol check feature | `golden_tables.rs::golden_test!` | + +## Benchmarks + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `300k-add-files-100-col-partitioned` | data/300k-add-files-100-col-partitioned.tar.zst | `partition_col: string, col_0..col_98: long` (100 data columns) partitioned by `partition_col` | v1/v2 | 300k add actions, 100 partition columns | `kernel/benches/metadata_bench.rs` | + +## Miscellaneous / Edge Cases + +| Table | Location | Schema | Protocol (R/W) | Description | Tests | +|-------|----------|--------|----------|-------------|-------| +| `hive` | golden_data/ | N/A | N/A | Hive-format table | `golden_tables.rs::skip_test!` (not yet implemented) | +| `canonicalized-paths-normal-a` | golden_data/ | `id: int` | v1/v2 | Path canonicalization | `golden_tables.rs::skip_test!` (canonicalization bug) | +| `canonicalized-paths-normal-b` | golden_data/ | `id: int` | v1/v2 | Path canonicalization | `golden_tables.rs::skip_test!` (canonicalization bug) | +| `canonicalized-paths-special-a` | golden_data/ | `id: int` | v1/v2 | Path canonicalization with special chars | `golden_tables.rs::skip_test!` (canonicalization bug) | +| `canonicalized-paths-special-b` | golden_data/ | `id: int` | v1/v2 | Path canonicalization with special chars | `golden_tables.rs::skip_test!` (canonicalization bug) | +| `delete-re-add-same-file-different-transactions` | golden_data/ | `intCol: int` | v1/v2 | Delete and re-add same file | `golden_tables.rs::skip_test!` (not yet implemented) | + +## Unreferenced Tables + +| Table | Location | Schema | Protocol (R/W) | +|-------|----------|--------|----------| +| `data-reader-absolute-paths-escaped-chars` | golden_data/ | N/A (no metadata in commit) | N/A | +| `deltalog-commit-info` | golden_data/ | N/A | v1/v2 | +| `update-deleted-directory` | golden_data/ | N/A | v1/v2 | diff --git a/kernel/tests/cdf.rs b/kernel/tests/cdf.rs index 002edc15ed..8f6d306ac9 100644 --- a/kernel/tests/cdf.rs +++ b/kernel/tests/cdf.rs @@ -1,19 +1,17 @@ use std::error; use delta_kernel::arrow::array::RecordBatch; -use delta_kernel::arrow::compute::filter_record_batch; use delta_kernel::arrow::datatypes::Schema as ArrowSchema; +use delta_kernel::engine::arrow_data::EngineDataArrowExt as _; use itertools::Itertools; use delta_kernel::engine::arrow_conversion::TryFromKernel as _; -use delta_kernel::engine::default::DefaultEngine; use delta_kernel::table_changes::TableChanges; use delta_kernel::{DeltaResult, Error, PredicateRef, Version}; mod common; -use test_utils::DefaultEngineExtension; -use test_utils::{load_test_data, to_arrow}; +use test_utils::load_test_data; fn read_cdf_for_table( test_name: impl AsRef, @@ -24,7 +22,7 @@ fn read_cdf_for_table( let test_dir = load_test_data("tests/data", test_name.as_ref()).unwrap(); let test_path = test_dir.path().join(test_name.as_ref()); let test_path = delta_kernel::try_parse_uri(test_path.to_str().expect("table path to string"))?; - let 
engine = DefaultEngine::new_local(); + let engine = test_utils::create_default_engine(&test_path)?; let table_changes = TableChanges::try_new( test_path, engine.as_ref(), @@ -50,17 +48,11 @@ fn read_cdf_for_table( ArrowSchema::try_from_kernel(scan.logical_schema().as_ref()).unwrap(); let batches: Vec = scan .execute(engine)? - .map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch = to_arrow(data)?; + .map(|data| -> DeltaResult<_> { + let record_batch = data?.try_into_record_batch()?; // Verify that the arrow record batches match the expected schema assert!(record_batch.schema().as_ref() == &scan_schema_as_arrow); - match mask { - Some(mask) => Ok(filter_record_batch(&record_batch, &mask.into())?), - None => Ok(record_batch), - } + Ok(record_batch) }) .try_collect()?; Ok(batches) @@ -411,7 +403,7 @@ fn invalid_range_end_before_start() { #[test] fn invalid_range_start_after_last_version_of_table() { let res = read_cdf_for_table("cdf-table-simple", 3, 4, None); - let expected_msg = "Expected the first commit to have version 3"; + let expected_msg = "Expected the first commit to have version 3, got None"; assert!(matches!(res, Err(Error::Generic(msg)) if msg == expected_msg)); } @@ -553,3 +545,63 @@ fn conditional_delete_two_rows() -> DeltaResult<()> { assert_batches_sorted_eq!(expected, &batches); Ok(()) } + +#[test] +fn cdf_with_column_mapping_name_mode() -> Result<(), Box> { + // NOTE: these tables only have CDF enabled in version 1+, so we start reading from 1. This is + // due to pyspark limitation while writing: we were unable to create a table with column + // mapping + CDF enabled in commit 0, so we created with column mapping and enabled CDF in + // commit 1. + let batches = read_cdf_for_table("cdf-column-mapping-name-mode", 1, None, None)?; + let mut expected = vec![ + "+----+-------+-------+------------------+-----------------+", + "| id | name | value | _change_type | _commit_version |", + "+----+-------+-------+------------------+-----------------+", + "| 1 | Alice | 100.0 | delete | 4 |", + "| 2 | Bob | 200.0 | update_preimage | 2 |", + "| 2 | Bob | 250.0 | update_postimage | 2 |", + "| 4 | David | 400.0 | insert | 3 |", + "+----+-------+-------+------------------+-----------------+", + ]; + sort_lines!(expected); + assert_batches_sorted_eq!(expected, &batches); + + // same as above but instead of protocol 2,5 this is 3,7 with columnMapping+DV features + let batches = read_cdf_for_table("cdf-column-mapping-name-mode-3-7", 1, None, None)?; + let mut expected = vec![ + "+----+-------+-------+------------------+-----------------+", + "| id | name | value | _change_type | _commit_version |", + "+----+-------+-------+------------------+-----------------+", + "| 1 | Alice | 100.0 | delete | 4 |", + "| 2 | Bob | 200.0 | update_preimage | 2 |", + "| 2 | Bob | 250.0 | update_postimage | 2 |", + "| 4 | David | 400.0 | insert | 3 |", + "+----+-------+-------+------------------+-----------------+", + ]; + sort_lines!(expected); + assert_batches_sorted_eq!(expected, &batches); + + Ok(()) +} + +#[test] +fn cdf_with_column_mapping_id_mode() -> Result<(), Box> { + // NOTE: these tables only have CDF enabled in version 1+, so we start reading from 1. This is + // due to pyspark limitation while writing: we were unable to create a table with column + // mapping + CDF enabled in commit 0, so we created with column mapping and enabled CDF in + // commit 1. 
+ let batches = read_cdf_for_table("cdf-column-mapping-id-mode", 1, None, None)?; + let mut expected = vec![ + "+----+-------+-------+------------------+-----------------+", + "| id | name | value | _change_type | _commit_version |", + "+----+-------+-------+------------------+-----------------+", + "| 2 | Frank | 250.0 | update_preimage | 2 |", + "| 2 | Frank | 275.0 | update_postimage | 2 |", + "| 3 | Grace | 350.0 | delete | 4 |", + "| 4 | Henry | 450.0 | insert | 3 |", + "+----+-------+-------+------------------+-----------------+", + ]; + sort_lines!(expected); + assert_batches_sorted_eq!(expected, &batches); + Ok(()) +} diff --git a/kernel/tests/checkpoint_transform.rs b/kernel/tests/checkpoint_transform.rs new file mode 100644 index 0000000000..1be609bb36 --- /dev/null +++ b/kernel/tests/checkpoint_transform.rs @@ -0,0 +1,632 @@ +//! Integration tests for checkpoint stats and partition values across all +//! writeStatsAsJson / writeStatsAsStruct configuration combinations. +//! +//! These tests write real parquet data through the transaction API, create checkpoints, +//! change stats configuration, create new checkpoints, and read all data back to verify +//! the full write → checkpoint → config change → checkpoint → read pipeline. + +use std::collections::HashMap; +use std::sync::Arc; + +use delta_kernel::arrow::array::{ + Array, ArrayRef, AsArray, Int64Array, RecordBatch, StringArray, StructArray, +}; +use delta_kernel::arrow::compute::{concat_batches, sort_to_indices, take}; +#[cfg(any(not(feature = "arrow-56"), feature = "arrow-57"))] +use delta_kernel::arrow::datatypes::TimestampMicrosecondType; +use delta_kernel::arrow::datatypes::{ + DataType as ArrowDataType, Field, Int64Type, Schema as ArrowSchema, +}; +use delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor; +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::expressions::column_expr; +use delta_kernel::object_store::memory::InMemory; +use delta_kernel::object_store::path::Path; +use delta_kernel::object_store::ObjectStore; +use delta_kernel::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use delta_kernel::DeltaResult; +use delta_kernel::Expression; +use delta_kernel::Snapshot; + +use serde_json::json; +use test_utils::{insert_data, read_scan, write_batch_to_table}; +use url::Url; + +/// Creates an in-memory store and the table root URL. +fn new_in_memory_store() -> (Arc, Url) { + (Arc::new(InMemory::new()), Url::parse("memory:///").unwrap()) +} + +/// Writes a JSON commit file to the store. +async fn write_commit(store: &Arc, content: &str, version: u64) -> DeltaResult<()> { + let path = Path::from(format!("_delta_log/{version:020}.json")); + store.put(&path, content.to_string().into()).await?; + Ok(()) +} + +const NON_PARTITIONED_SCHEMA: &str = r#"{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}}]}"#; + +#[cfg(any(not(feature = "arrow-56"), feature = "arrow-57"))] +const PARTITIONED_SCHEMA: &str = r#"{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"created_at","type":"timestamp","nullable":true,"metadata":{}},{"name":"tag","type":"binary","nullable":true,"metadata":{}}]}"#; + +/// Builds a JSON commit string with optional protocol, metadata, and stats config. +/// When `include_protocol` is true, includes the protocol action (for version 0 commits). 
+fn build_commit( + schema_string: &str, + partition_columns: &[&str], + write_stats_as_json: bool, + write_stats_as_struct: bool, + include_protocol: bool, +) -> String { + let metadata = json!({ + "metaData": { + "id": "test-table", + "format": { "provider": "parquet", "options": {} }, + "schemaString": schema_string, + "partitionColumns": partition_columns, + "configuration": { + "delta.checkpoint.writeStatsAsJson": write_stats_as_json.to_string(), + "delta.checkpoint.writeStatsAsStruct": write_stats_as_struct.to_string() + }, + "createdTime": 1587968585495i64 + } + }); + if include_protocol { + let protocol = json!({ + "protocol": { + "minReaderVersion": 3, + "minWriterVersion": 7, + "readerFeatures": [], + "writerFeatures": [] + } + }); + format!("{protocol}\n{metadata}") + } else { + metadata.to_string() + } +} + +/// Tests all 16 combinations of writeStatsAsJson/writeStatsAsStruct settings with real +/// parquet data written through the transaction API on a non-partitioned table. +/// +/// For each combination (json1, struct1, json2, struct2): +/// 1. Creates a table with (json1, struct1) settings +/// 2. Writes real data through transactions (generating actual parquet files with stats) +/// 3. Creates checkpoint 1 with (json1, struct1) settings +/// 4. Writes more data, then changes config to (json2, struct2) +/// 5. Creates checkpoint 2 (reads from checkpoint 1 + new commits, exercises COALESCE) +/// 6. Reads all data back and verifies correctness +#[rstest::rstest] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_stats_config_with_real_data( + #[values(true, false)] json1: bool, + #[values(true, false)] struct1: bool, + #[values(true, false)] json2: bool, + #[values(true, false)] struct2: bool, +) -> Result<(), Box> { + let (store, table_root) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(), + ); + + // Version 0: protocol + metadata with initial stats config + write_commit( + &store, + &build_commit(NON_PARTITIONED_SCHEMA, &[], json1, struct1, true), + 0, + ) + .await?; + + // Version 1: write real data (generates actual parquet files with stats) + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let result = insert_data( + snapshot, + &engine, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef, + Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie"])), + ], + ) + .await?; + assert!(result.is_committed()); + + // Checkpoint 1 with (json1, struct1) settings + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + snapshot.checkpoint(engine.as_ref())?; + + // Version 2: write more data + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let result = insert_data( + snapshot, + &engine, + vec![ + Arc::new(Int64Array::from(vec![4, 5, 6])) as ArrayRef, + Arc::new(StringArray::from(vec!["Diana", "Eve", "Frank"])), + ], + ) + .await?; + assert!(result.is_committed()); + + // Version 3: change stats config + write_commit( + &store, + &build_commit(NON_PARTITIONED_SCHEMA, &[], json2, struct2, false), + 3, + ) + .await?; + + // Checkpoint 2 with (json2, struct2) settings (reads from checkpoint 1 + commits 2-3) + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + snapshot.checkpoint(engine.as_ref())?; + + // Read all data 
back and verify correctness + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + let scan = snapshot.scan_builder().build()?; + let batches = read_scan(&scan, engine.clone())?; + + let schema = batches[0].schema(); + let merged = concat_batches(&schema, &batches)?; + let id_col = merged.column_by_name("id").unwrap(); + let sort_indices = sort_to_indices(id_col, None, None)?; + let sorted_columns: Vec = merged + .columns() + .iter() + .map(|col| take(col.as_ref(), &sort_indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(schema, sorted_columns)?; + + assert_eq!(sorted.num_rows(), 6, "All 6 rows should be readable"); + + let ids: Vec = sorted + .column_by_name("id") + .unwrap() + .as_primitive::() + .values() + .iter() + .copied() + .collect(); + assert_eq!(ids, vec![1, 2, 3, 4, 5, 6]); + + let names: Vec<&str> = (0..6) + .map(|i| { + sorted + .column_by_name("name") + .unwrap() + .as_string::() + .value(i) + }) + .collect(); + assert_eq!( + names, + vec!["Alice", "Bob", "Charlie", "Diana", "Eve", "Frank"] + ); + + Ok(()) +} + +/// Tests all 16 combinations of writeStatsAsJson/writeStatsAsStruct settings with a +/// partitioned table containing real parquet data. +/// +/// When writeStatsAsStruct=true, the checkpoint includes `partitionValues_parsed` which +/// is a native typed struct derived from the string-valued `partitionValues` map using +/// `COALESCE(partitionValues_parsed, MAP_TO_STRUCT(partitionValues, partition_schema))`. +/// +/// Two partition columns exercise different parsing paths: +/// - `created_at` (timestamp): "2024-01-15 10:30:00" → microseconds-since-epoch +/// - `tag` (binary): "hello" → raw bytes +#[rstest::rstest] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +// Arrow 56's JSON reader rejects Binary typed fields. This test exercises checkpoint +// JSON paths that include a binary partition column (`tag`), so we have to disable it. 
+#[cfg(any(not(feature = "arrow-56"), feature = "arrow-57"))] +async fn test_checkpoint_partitioned_with_real_data( + #[values(true, false)] json1: bool, + #[values(true, false)] struct1: bool, + #[values(true, false)] json2: bool, + #[values(true, false)] struct2: bool, +) -> Result<(), Box> { + let (store, table_root) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(), + ); + + // Version 0: protocol + partitioned metadata with initial stats config + write_commit( + &store, + &build_commit( + PARTITIONED_SCHEMA, + &["created_at", "tag"], + json1, + struct1, + true, + ), + 0, + ) + .await?; + + // Version 1: write data for partition created_at=2024-01-15 10:30:00, tag=hello + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("id", ArrowDataType::Int64, true), + Field::new("name", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef, + Arc::new(StringArray::from(vec!["Alice", "Bob"])), + ], + )?; + write_batch_to_table( + &snapshot, + engine.as_ref(), + batch, + HashMap::from([ + ("created_at".to_string(), "2024-01-15 10:30:00".to_string()), + ("tag".to_string(), "hello".to_string()), + ]), + ) + .await?; + + // Checkpoint 1 with (json1, struct1) settings + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + snapshot.checkpoint(engine.as_ref())?; + + // Version 2: write data for partition created_at=2025-03-01 09:15:30.123456, tag=world + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("id", ArrowDataType::Int64, true), + Field::new("name", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(Int64Array::from(vec![3, 4])) as ArrayRef, + Arc::new(StringArray::from(vec!["Charlie", "Diana"])), + ], + )?; + write_batch_to_table( + &snapshot, + engine.as_ref(), + batch, + HashMap::from([ + ( + "created_at".to_string(), + "2025-03-01 09:15:30.123456".to_string(), + ), + ("tag".to_string(), "world".to_string()), + ]), + ) + .await?; + + // Version 3: change stats config + write_commit( + &store, + &build_commit( + PARTITIONED_SCHEMA, + &["created_at", "tag"], + json2, + struct2, + false, + ), + 3, + ) + .await?; + + // Checkpoint 2 with (json2, struct2) settings (reads from checkpoint 1 + commits 2-3) + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + snapshot.checkpoint(engine.as_ref())?; + + // Verify partitionValues_parsed content directly in the checkpoint + if struct2 { + let checkpoint_path = Path::from("_delta_log/00000000000000000003.checkpoint.parquet"); + let checkpoint_bytes = store.get(&checkpoint_path).await?.bytes().await?; + let reader = ParquetRecordBatchReaderBuilder::try_new(checkpoint_bytes)?.build()?; + let ckpt_batches: Vec = reader.map(|b| b.unwrap()).collect(); + let ckpt_schema = ckpt_batches[0].schema(); + let ckpt_merged = concat_batches(&ckpt_schema, &ckpt_batches)?; + + let add_col = ckpt_merged + .column_by_name("add") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Verify partitionValues_parsed has correctly typed partition values + let pv_parsed = add_col + .column_by_name("partitionValues_parsed") + .expect("checkpoint should have 
partitionValues_parsed when writeStatsAsStruct=true") + .as_any() + .downcast_ref::() + .unwrap(); + let add_rows: Vec = (0..ckpt_merged.num_rows()) + .filter(|&i| add_col.is_valid(i)) + .collect(); + assert_eq!(add_rows.len(), 2, "should have 2 add actions"); + + // Verify tag (binary) partition values + let tag_col = pv_parsed + .column_by_name("tag") + .expect("partitionValues_parsed should have tag"); + let mut tag_values: Vec<&[u8]> = add_rows + .iter() + .map(|&i| tag_col.as_binary::().value(i)) + .collect(); + tag_values.sort(); + assert_eq!(tag_values, vec![b"hello", b"world"]); + + // Verify created_at (timestamp) partition values + let created_at_col = pv_parsed + .column_by_name("created_at") + .expect("partitionValues_parsed should have created_at"); + let mut ts_values: Vec = add_rows + .iter() + .map(|&i| { + created_at_col + .as_primitive::() + .value(i) + }) + .collect(); + ts_values.sort(); + // 2024-01-15 10:30:00 UTC and 2025-03-01 09:15:30.123456 UTC in microseconds + assert_eq!(ts_values, vec![1705314600000000, 1740820530123456]); + } + + // Read all data back and verify correctness + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + let scan = snapshot.scan_builder().build()?; + let batches = read_scan(&scan, engine.clone())?; + + // Merge all batches and sort by id to get deterministic ordering + let schema = batches[0].schema(); + let merged = concat_batches(&schema, &batches)?; + let id_col = merged.column_by_name("id").unwrap(); + let sort_indices = sort_to_indices(id_col, None, None)?; + let sorted_columns: Vec = merged + .columns() + .iter() + .map(|col| take(col.as_ref(), &sort_indices, None).unwrap()) + .collect(); + let sorted = RecordBatch::try_new(schema, sorted_columns)?; + + assert_eq!(sorted.num_rows(), 4, "All 4 rows should be readable"); + + // Verify partition values are correctly round-tripped + let ids: Vec = sorted + .column_by_name("id") + .unwrap() + .as_primitive::() + .values() + .iter() + .copied() + .collect(); + assert_eq!(ids, vec![1, 2, 3, 4]); + + let tags = sorted.column_by_name("tag").unwrap(); + let tags: Vec<&[u8]> = (0..4).map(|i| tags.as_binary::().value(i)).collect(); + // Rows 1,2 have tag=hello; rows 3,4 have tag=world + assert_eq!(tags, vec![b"hello", b"hello", b"world", b"world"]); + + Ok(()) +} + +/// Schema with column mapping metadata: logical names differ from physical names. +const COLUMN_MAPPING_SCHEMA: &str = r#"{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{"delta.columnMapping.id":1,"delta.columnMapping.physicalName":"col-id-phys"}},{"name":"name","type":"string","nullable":true,"metadata":{"delta.columnMapping.id":2,"delta.columnMapping.physicalName":"col-name-phys"}},{"name":"category","type":"string","nullable":true,"metadata":{"delta.columnMapping.id":3,"delta.columnMapping.physicalName":"col-category-phys"}}]}"#; + +/// Verifies that `partitionValues_parsed` uses physical column names when column mapping +/// is enabled. The checkpoint should contain `col-category-phys` (physical) not `category` +/// (logical) as the field name inside `partitionValues_parsed`. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_partition_values_parsed_with_column_mapping( +) -> Result<(), Box> { + let (store, table_root) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(), + ); + + // Version 0: protocol + metadata with column mapping and writeStatsAsStruct=true + let protocol = json!({ + "protocol": { + "minReaderVersion": 3, + "minWriterVersion": 7, + "readerFeatures": ["columnMapping"], + "writerFeatures": ["columnMapping"] + } + }); + let metadata = json!({ + "metaData": { + "id": "test-table", + "format": { "provider": "parquet", "options": {} }, + "schemaString": COLUMN_MAPPING_SCHEMA, + "partitionColumns": ["category"], + "configuration": { + "delta.checkpoint.writeStatsAsJson": "true", + "delta.checkpoint.writeStatsAsStruct": "true", + "delta.columnMapping.mode": "name", + "delta.columnMapping.maxColumnId": "3" + }, + "createdTime": 1587968585495i64 + } + }); + write_commit(&store, &format!("{protocol}\n{metadata}"), 0).await?; + + // Version 1: write data for partition category=books + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let batch = RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![ + Field::new("col-id-phys", ArrowDataType::Int64, true), + Field::new("col-name-phys", ArrowDataType::Utf8, true), + ])), + vec![ + Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef, + Arc::new(StringArray::from(vec!["Alice", "Bob"])), + ], + )?; + write_batch_to_table( + &snapshot, + engine.as_ref(), + batch, + HashMap::from([("category".to_string(), "books".to_string())]), + ) + .await?; + + // Create checkpoint with writeStatsAsStruct=true + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + snapshot.checkpoint(engine.as_ref())?; + + // Read checkpoint and verify partitionValues_parsed uses physical name + let checkpoint_path = Path::from("_delta_log/00000000000000000001.checkpoint.parquet"); + let checkpoint_bytes = store.get(&checkpoint_path).await?.bytes().await?; + let reader = ParquetRecordBatchReaderBuilder::try_new(checkpoint_bytes)?.build()?; + let ckpt_batches: Vec = reader.map(|b| b.unwrap()).collect(); + let ckpt_schema = ckpt_batches[0].schema(); + let ckpt_merged = concat_batches(&ckpt_schema, &ckpt_batches)?; + + let add_col = ckpt_merged + .column_by_name("add") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let pv_parsed = add_col + .column_by_name("partitionValues_parsed") + .expect("checkpoint should have partitionValues_parsed") + .as_any() + .downcast_ref::() + .unwrap(); + + // Should use physical name "col-category-phys", NOT logical name "category" + assert!( + pv_parsed.column_by_name("col-category-phys").is_some(), + "partitionValues_parsed should use physical column name" + ); + assert!( + pv_parsed.column_by_name("category").is_none(), + "partitionValues_parsed should not use logical column name" + ); + + // Verify the value is correct + let add_row = (0..ckpt_merged.num_rows()) + .find(|&i| add_col.is_valid(i)) + .expect("should have an add action"); + let category_col = pv_parsed.column_by_name("col-category-phys").unwrap(); + let category_value = category_col.as_string::().value(add_row); + assert_eq!(category_value, "books"); + + // Also verify data round-trips correctly through scan + let snapshot = 
Snapshot::builder_for(table_root).build(engine.as_ref())?; + let scan = snapshot.scan_builder().build()?; + let batches = read_scan(&scan, engine.clone())?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 2); + + Ok(()) +} + +const WIDE_SCHEMA: &str = r#"{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"age","type":"long","nullable":true,"metadata":{}}]}"#; + +/// Regression test for https://github.com/delta-io/delta-kernel-rs/issues/2165 +/// +/// When a schema-evolved table has a checkpoint with stats_parsed covering only the +/// pre-evolution columns, scanning with a predicate on a newly added column should +/// not panic. The missing stats column should be treated as unknown (no data skipping). +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_scan_schema_evolved_table_with_checkpoint_predicate_on_new_column( +) -> Result<(), Box> { + let (store, table_root) = new_in_memory_store(); + let executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(executor) + .build(), + ); + + // Version 0: protocol + metadata with schema [id, name], stats as struct enabled + write_commit( + &store, + &build_commit(NON_PARTITIONED_SCHEMA, &[], true, true, true), + 0, + ) + .await?; + + // Version 1: write data with [id, name] + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let result = insert_data( + snapshot, + &engine, + vec![ + Arc::new(Int64Array::from(vec![1, 2, 3])) as ArrayRef, + Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie"])), + ], + ) + .await?; + assert!(result.is_committed()); + + // Checkpoint at V1: stats_parsed covers only [id, name] + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + snapshot.checkpoint(engine.as_ref())?; + + // Version 2: schema evolution - add `age` column via new metadata action + write_commit( + &store, + &build_commit(WIDE_SCHEMA, &[], true, true, false), + 2, + ) + .await?; + + // Version 3: write data with [id, name, age] + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let result = insert_data( + snapshot, + &engine, + vec![ + Arc::new(Int64Array::from(vec![4, 5, 6])) as ArrayRef, + Arc::new(StringArray::from(vec!["Diana", "Eve", "Frank"])), + Arc::new(Int64Array::from(vec![25, 35, 45])), + ], + ) + .await?; + assert!(result.is_committed()); + + // Scan with predicate on `id` (present in checkpoint stats_parsed) should work + let snapshot = Snapshot::builder_for(table_root.clone()).build(engine.as_ref())?; + let predicate = Arc::new(column_expr!("id").gt(Expression::literal(2i64))); + let scan = snapshot.scan_builder().with_predicate(predicate).build()?; + let batches = read_scan(&scan, engine.clone())?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert!(total_rows > 0, "should return rows matching id > 2"); + + // Scan with predicate on `age` (missing from checkpoint stats_parsed) should NOT panic. + // The kernel should handle the missing stats column gracefully. 
+ let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + let predicate = Arc::new(column_expr!("age").gt(Expression::literal(30i64))); + let scan = snapshot.scan_builder().with_predicate(predicate).build()?; + let batches = read_scan(&scan, engine.clone())?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + // The predicate is used for data skipping only (not row-level filtering). + // All 6 rows should be returned since data skipping cannot skip any files: + // V1 data has no stats for `age`, and V3 data's age range includes values > 30. + assert_eq!( + total_rows, 6, + "all rows returned when data skipping cannot filter" + ); + + Ok(()) +} diff --git a/kernel/tests/clustering_e2e.rs b/kernel/tests/clustering_e2e.rs new file mode 100644 index 0000000000..cecff1f14f --- /dev/null +++ b/kernel/tests/clustering_e2e.rs @@ -0,0 +1,214 @@ +//! End-to-end integration tests for clustered tables. +//! +//! These tests exercise the full lifecycle: create table with clustering columns, +//! write data, commit, checkpoint, and verify the data and clustering metadata +//! are preserved throughout. + +use std::collections::HashMap; +use std::sync::Arc; + +use delta_kernel::arrow::array::{ArrayRef, Int32Array}; +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::expressions::ColumnName; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::transaction::CommitResult; +use rstest::rstest; + +use test_utils::{ + generate_batch, read_add_infos, read_scan, test_table_setup_mt, write_batch_to_table, IntoArray, +}; + +/// Full lifecycle: create a clustered table, write data, verify stats include clustering columns, +/// checkpoint, and verify clustering metadata and data survive. When `use_fresh_snapshot` is true, +/// the write happens via a fresh snapshot (simulating a separate session that did not create the +/// table). +#[rstest] +#[case::post_commit_snapshot(false)] +#[case::fresh_snapshot(true)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_clustered_table_write_and_checkpoint( + #[case] use_fresh_snapshot: bool, +) -> Result<(), Box> { + let (_temp_dir, table_path, engine) = test_table_setup_mt()?; + let schema = Arc::new( + StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + StructField::new("city", DataType::STRING, true), + ]) + .unwrap(), + ); + let expected_clustering = vec![ColumnName::new(["id"]), ColumnName::new(["city"])]; + + // Create table clustered on "id" and "city" + let create_result = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::Clustered { + columns: expected_clustering.clone(), + }) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let snapshot = if use_fresh_snapshot { + // Open a fresh snapshot (as if a different process is writing) + let table_url = delta_kernel::try_parse_uri(&table_path)?; + Snapshot::builder_for(table_url).build(engine.as_ref())? 
+ } else { + match create_result { + CommitResult::CommittedTransaction(committed) => committed + .post_commit_snapshot() + .expect("post-commit snapshot should exist") + .clone(), + other => panic!("Expected CommittedTransaction, got: {other:?}"), + } + }; + + // First write: 3 rows + let batch = generate_batch(vec![ + ("id", vec![1, 2, 3].into_array()), + ("name", vec!["alice", "bob", "charlie"].into_array()), + ("city", vec!["seattle", "portland", "seattle"].into_array()), + ])?; + let snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new()).await?; + assert_eq!(snapshot.version(), 1); + + // Second write: 2 more rows + let batch = generate_batch(vec![ + ("id", vec![4, 5].into_array()), + ("name", vec!["dave", "eve"].into_array()), + ("city", vec!["austin", "portland"].into_array()), + ])?; + let snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new()).await?; + assert_eq!(snapshot.version(), 2); + + // Verify stats include all clustering columns + let add_infos = read_add_infos(&snapshot, engine.as_ref())?; + assert!(!add_infos.is_empty()); + for info in &add_infos { + let stats = info.stats.as_ref().expect("Add action should have stats"); + for col in &expected_clustering { + let col_name = col.to_string(); + assert!( + stats["minValues"].get(&col_name).is_some(), + "Stats should include minValues for clustering column '{col_name}'" + ); + assert!( + stats["maxValues"].get(&col_name).is_some(), + "Stats should include maxValues for clustering column '{col_name}'" + ); + } + } + + // Verify data readable before checkpoint + let scan = snapshot.clone().scan_builder().build()?; + let batches = read_scan(&scan, engine.clone())?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 5); + + // Checkpoint + snapshot.checkpoint(engine.as_ref())?; + + // Load fresh snapshot from checkpoint and verify everything survived + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let fresh = Snapshot::builder_for(table_url).build(engine.as_ref())?; + assert_eq!(fresh.version(), 2); + let scan = fresh.clone().scan_builder().build()?; + let batches = read_scan(&scan, engine.clone())?; + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 5); + + // Verify stats still include clustering columns after checkpoint + let add_infos = read_add_infos(&fresh, engine.as_ref())?; + assert!(!add_infos.is_empty()); + for info in &add_infos { + let stats = info.stats.as_ref().expect("Add action should have stats"); + for col in &expected_clustering { + let col_name = col.to_string(); + assert!( + stats["minValues"].get(&col_name).is_some(), + "Stats should include minValues for clustering column '{col_name}' after checkpoint" + ); + assert!( + stats["maxValues"].get(&col_name).is_some(), + "Stats should include maxValues for clustering column '{col_name}' after checkpoint" + ); + } + } + + Ok(()) +} + +/// Regression test: writing a batch where a clustering column has ALL null values should succeed. +/// +/// `collect_stats` emits null-valued min/max entries for all-null columns, allowing +/// `StatsVerifier` to find the field and confirm `nullCount == numRecords`. The JSON serializer +/// omits null fields on disk, matching Spark's `ignoreNullFields` behavior. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_clustered_table_write_all_null_clustering_column() { + let (_temp_dir, table_path, engine) = test_table_setup_mt().unwrap(); + let schema = Arc::new( + StructType::try_new(vec![ + StructField::new("category", DataType::STRING, false), + StructField::new("region_id", DataType::INTEGER, true), + ]) + .unwrap(), + ); + + // Create table clustered on "category" and "region_id" + let create_result = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::Clustered { + columns: vec![ + ColumnName::new(["category"]), + ColumnName::new(["region_id"]), + ], + }) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())) + .unwrap() + .commit(engine.as_ref()) + .unwrap(); + + let snapshot = create_result + .unwrap_committed() + .post_commit_snapshot() + .expect("post-commit snapshot should exist") + .clone(); + + // Write a batch where region_id is ALL nulls. + // This should succeed -- all-null clustering columns are valid. + let all_null_region: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None])); + let batch = generate_batch(vec![ + ("category", vec!["a", "b", "c"].into_array()), + ("region_id", all_null_region), + ]) + .unwrap(); + + let snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new()) + .await + .unwrap(); + assert_eq!(snapshot.version(), 1); + + // Verify data is readable + let scan = snapshot.clone().scan_builder().build().unwrap(); + let batches = read_scan(&scan, engine.clone()).unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 3); + + // Verify stats: region_id should have nullCount=3. The JSON serializer omits null + // fields (matching Spark's ignoreNullFields), so minValues/maxValues should not + // contain region_id in the serialized JSON stats. + let add_infos = read_add_infos(&snapshot, engine.as_ref()).unwrap(); + assert_eq!(add_infos.len(), 1); + let stats = add_infos[0].stats.as_ref().expect("should have stats"); + assert_eq!(stats["numRecords"], 3); + assert_eq!(stats["nullCount"]["region_id"], 3); + assert!( + stats["minValues"].get("region_id").is_none(), + "JSON minValues should omit region_id when all values are null" + ); + assert!( + stats["maxValues"].get("region_id").is_none(), + "JSON maxValues should omit region_id when all values are null" + ); +} diff --git a/kernel/tests/crc.rs b/kernel/tests/crc.rs new file mode 100644 index 0000000000..d0f1de3e56 --- /dev/null +++ b/kernel/tests/crc.rs @@ -0,0 +1,973 @@ +//! Integration tests for CRC (version checksum) file-based APIs on Snapshot. 
+ +use std::path::PathBuf; +use std::sync::Arc; + +use delta_kernel::arrow::array::{ArrayRef, Int32Array}; +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::crc::{Crc, FileStatsValidity}; +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::object_store::local::LocalFileSystem; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::{ChecksumWriteResult, Snapshot, SnapshotRef}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::{DeltaResult, Engine}; +use rstest::rstest; +use test_utils::{add_commit, insert_data, test_table_setup}; + +// ============================================================================ +// File stats from CRC on disk +// ============================================================================ + +#[tokio::test] +async fn test_get_file_stats_from_crc() -> DeltaResult<()> { + let path = std::fs::canonicalize(PathBuf::from("./tests/data/crc-full/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let store = Arc::new(LocalFileSystem::new()); + let engine = DefaultEngineBuilder::new(store).build(); + + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; + assert_eq!(snapshot.version(), 0); + + let file_stats = snapshot.get_or_load_file_stats(&engine).unwrap(); + assert_eq!(file_stats.num_files, 10); + assert_eq!(file_stats.table_size_bytes, 5259); + assert!(file_stats.file_size_histogram.is_some()); + + Ok(()) +} + +#[tokio::test] +async fn test_get_file_stats_no_crc() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("value", DataType::STRING, true), + ])?); + + let _ = create_table(&table_path, schema, "Test/1.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + assert_eq!(snapshot.version(), 0); + + let file_stats = snapshot.get_or_load_file_stats(engine.as_ref()); + assert_eq!(file_stats, None); + + Ok(()) +} + +#[tokio::test] +async fn test_get_file_stats_crc_not_at_snapshot_version() -> DeltaResult<()> { + use test_utils::copy_directory; + + // ===== GIVEN ===== + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Copy crc-full table (has CRC at version 0) into the temp dir + let source_path = std::fs::canonicalize(PathBuf::from("./tests/data/crc-full/")).unwrap(); + copy_directory(&source_path, _temp_dir.path()).unwrap(); + + // Verify the table starts at version 0 with valid CRC stats + let snapshot = Snapshot::builder_for(table_path.clone()).build(engine.as_ref())?; + assert_eq!(snapshot.version(), 0); + assert!(snapshot.get_or_load_file_stats(engine.as_ref()).is_some()); + + // ===== WHEN ===== + // Empty commit to advance to version 1 (no new CRC file written) + let txn = snapshot.transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?; + let _ = txn.commit(engine.as_ref())?; + + // ===== THEN ===== + // Load a fresh snapshot at version 1 + let snapshot = Snapshot::builder_for(table_path).build(engine.as_ref())?; + assert_eq!(snapshot.version(), 1); + + // No CRC at version 1, so file stats should be None + let file_stats = snapshot.get_or_load_file_stats(engine.as_ref()); + assert_eq!(file_stats, None); + + Ok(()) +} + +// ============================================================================ +// CRC test visibility: get_current_crc_if_loaded_for_testing +// ============================================================================ + +#[tokio::test] +async fn test_get_current_crc_if_loaded_returns_loaded_crc() -> DeltaResult<()> { + // ===== GIVEN ===== + let path = std::fs::canonicalize(PathBuf::from("./tests/data/crc-full/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let store = Arc::new(LocalFileSystem::new()); + let engine = DefaultEngineBuilder::new(store).build(); + + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; + assert_eq!(snapshot.version(), 0); + + // ===== WHEN ===== + let crc = snapshot.get_current_crc_if_loaded_for_testing().unwrap(); + + // ===== THEN ===== + let file_stats = crc.file_stats().unwrap(); + assert_eq!(file_stats.table_size_bytes, 5259); + assert_eq!(file_stats.num_files, 10); + assert_eq!(crc.num_metadata, 1); + assert_eq!(crc.num_protocol, 1); + + // Protocol and metadata should match the snapshot's table configuration + assert_eq!(crc.protocol, *snapshot.table_configuration().protocol()); + assert_eq!(crc.metadata, *snapshot.table_configuration().metadata()); + + // Domain metadata + let dms = crc.domain_metadata.as_ref().unwrap(); + assert_eq!(dms.len(), 3); + assert!(dms.contains_key("delta.clustering")); + assert!(dms.contains_key("delta.rowTracking")); + assert!(dms.contains_key("myApp.metadata")); + + Ok(()) +} + +#[tokio::test] +async fn test_get_current_crc_if_loaded_returns_none_when_no_crc() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + + let _ = create_table(&table_path, schema, "Test/1.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + assert_eq!(snapshot.version(), 0); + + // No CRC file exists, so get_current_crc_if_loaded_for_testing should return None + assert!(snapshot.get_current_crc_if_loaded_for_testing().is_none()); + + Ok(()) +} + +// ============================================================================ +// Post-commit CRC existence: does a CRC exist on the post-commit snapshot? +// ============================================================================ + +fn create_table_and_commit( + table_path: &str, + engine: &dyn delta_kernel::Engine, +) -> DeltaResult { + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + let txn = create_table(table_path, schema, "test_engine") + .with_data_layout(DataLayout::clustered(["id"])) + .build(engine, Box::new(FileSystemCommitter::new()))? + .with_domain_metadata("zip".to_string(), "zap0".to_string()); + + Ok(txn.commit(engine)?.unwrap_committed()) +} + +#[tokio::test] +async fn test_create_table_produces_post_commit_crc() -> DeltaResult<()> { + // ===== GIVEN / WHEN: Create the table ===== + let (_temp_dir, table_path, engine) = test_table_setup()?; + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + + // ===== THEN: should have CRC at v0 ===== + assert_eq!(committed.commit_version(), 0); + let snapshot = committed.post_commit_snapshot().unwrap(); + let crc = snapshot.get_current_crc_if_loaded_for_testing().unwrap(); + + let file_stats = crc.file_stats().unwrap(); + assert_eq!(file_stats.num_files, 0); + assert_eq!(file_stats.table_size_bytes, 0); + assert_eq!(crc.num_metadata, 1); + assert_eq!(crc.num_protocol, 1); + assert_eq!(crc.protocol, *snapshot.table_configuration().protocol()); + assert_eq!(crc.metadata, *snapshot.table_configuration().metadata()); + let dms = crc.domain_metadata.as_ref().unwrap(); + assert_eq!(dms["zip"].configuration(), "zap0"); + + Ok(()) +} + +#[rstest] +#[case::with_in_memory_crc(true)] +#[case::without_crc(false)] +#[tokio::test] +async fn test_post_commit_crc_chains_only_if_read_snapshot_has_crc( + #[case] use_post_commit_snapshot: bool, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let create_committed = create_table_and_commit(&table_path, engine.as_ref())?; + + let read_snapshot = if use_post_commit_snapshot { + // Post-commit snapshot has in-memory CRC from the previous commit. + create_committed.post_commit_snapshot().unwrap().clone() + } else { + // Fresh-from-disk snapshot has no CRC (no .crc file on disk). + Snapshot::builder_for(table_path).build(engine.as_ref())? + }; + assert_eq!( + read_snapshot + .get_current_crc_if_loaded_for_testing() + .is_some(), + use_post_commit_snapshot + ); + + let committed = read_snapshot + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("WRITE".to_string()) + .with_domain_metadata("zip".to_string(), "zap1".to_string()) + .commit(engine.as_ref())? + .unwrap_committed(); + + // The new post-commit snapshot should only have a CRC if the read snapshot had one. 
+ assert_eq!(committed.commit_version(), 1); + assert_eq!( + committed + .post_commit_snapshot() + .unwrap() + .get_current_crc_if_loaded_for_testing() + .is_some(), + use_post_commit_snapshot + ); + + Ok(()) +} + +// ================================================================================ +// Post-commit CRC correctness: are the CRC fields accurate after write and reload? +// ================================================================================ + +/// Writes the in-memory CRC to disk, reloads a fresh snapshot, and asserts that the +/// round-tripped CRC matches the in-memory one. Returns the loaded CRC for further assertions. +fn write_and_verify_crc( + snapshot: &SnapshotRef, + table_path: &str, + engine: &dyn delta_kernel::Engine, +) -> Crc { + let crc_in_memory = snapshot.get_current_crc_if_loaded_for_testing().unwrap(); + snapshot.write_checksum(engine).unwrap(); + + let snapshot_fresh = Snapshot::builder_for(table_path).build(engine).unwrap(); + let crc_from_disk = snapshot_fresh + .get_current_crc_if_loaded_for_testing() + .unwrap(); + assert_eq!(crc_in_memory, crc_from_disk); + crc_from_disk.clone() +} + +#[tokio::test] +async fn test_post_commit_crc_tracks_file_stats_across_inserts() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // ===== GIVEN: Create the table ===== + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot_v0 = committed.post_commit_snapshot().unwrap().clone(); + + // ===== WHEN: Insert values 1..=10 ===== + let col1: ArrayRef = Arc::new(Int32Array::from((1..=10).collect::>())); + let committed = insert_data(snapshot_v0, &engine, vec![col1]) + .await? + .unwrap_committed(); + + // ===== THEN: should have CRC at v1 with right file stats ===== + assert_eq!(committed.commit_version(), 1); + let snapshot_v1 = committed.post_commit_snapshot().unwrap(); + let crc_v1 = write_and_verify_crc(snapshot_v1, &table_path, engine.as_ref()); + let stats_v1 = crc_v1.file_stats().unwrap(); + assert_eq!(stats_v1.num_files, 1); // <--- 1 file added + assert!(stats_v1.table_size_bytes > 0); // <--- size is non-zero + + // ===== WHEN: Insert values 11..=20 ===== + let col2: ArrayRef = Arc::new(Int32Array::from((11..=20).collect::>())); + let committed = insert_data(snapshot_v1.clone(), &engine, vec![col2]) + .await? + .unwrap_committed(); + + // ===== THEN: should have CRC at v2 with right file stats ===== + assert_eq!(committed.commit_version(), 2); + let snapshot_v2 = committed.post_commit_snapshot().unwrap(); + let crc_v2 = write_and_verify_crc(snapshot_v2, &table_path, engine.as_ref()); + let stats_v2 = crc_v2.file_stats().unwrap(); + assert_eq!(stats_v2.num_files, 2); // <--- 2 files added + assert!(stats_v2.table_size_bytes > stats_v1.table_size_bytes); // <--- size is greater than after first insert + + // ===== WHEN: Remove all files ===== + let scan = snapshot_v2.clone().scan_builder().build()?; + let mut txn = snapshot_v2 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("DELETE".to_string()) + .with_data_change(true); + for sm in scan.scan_metadata(engine.as_ref())? 
{ + txn.remove_files(sm?.scan_files); + } + let committed = txn.commit(engine.as_ref())?.unwrap_committed(); + + // ===== THEN: should have CRC at v3 with right file stats ===== + assert_eq!(committed.commit_version(), 3); + let snapshot_v3 = committed.post_commit_snapshot().unwrap(); + let crc_v3 = write_and_verify_crc(snapshot_v3, &table_path, engine.as_ref()); + let stats_v3 = crc_v3.file_stats().unwrap(); + assert_eq!(stats_v3.num_files, 0); // <--- 0 net file in the table + assert_eq!(stats_v3.table_size_bytes, 0); // <--- size is 0 + + Ok(()) +} + +#[tokio::test] +async fn test_post_commit_crc_tracks_domain_metadata_changes() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // ===== WHEN: CREATE TABLE with zip -> zap0 ===== + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot_v0 = committed.post_commit_snapshot().unwrap(); + + // ===== THEN: should have CRC at v0 with zip -> zap0 ===== + let crc_v0 = write_and_verify_crc(snapshot_v0, &table_path, engine.as_ref()); + let dms = crc_v0.domain_metadata.as_ref().unwrap(); + assert_eq!(dms["zip"].configuration(), "zap0"); + + // ===== WHEN: update zip -> zap1, add foo -> bar ===== + let txn = snapshot_v0 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("WRITE".to_string()) + .with_domain_metadata("zip".to_string(), "zap1".to_string()) // <-- set to zap1 + .with_domain_metadata("foo".to_string(), "bar".to_string()); // <-- add foo + let committed = txn.commit(engine.as_ref())?.unwrap_committed(); + + // ===== THEN: should have CRC at v1 with zip -> zap1, foo -> bar ===== + let snapshot_v1 = committed.post_commit_snapshot().unwrap(); + let crc_v1 = write_and_verify_crc(snapshot_v1, &table_path, engine.as_ref()); + let dms = crc_v1.domain_metadata.as_ref().unwrap(); + assert_eq!(dms["zip"].configuration(), "zap1"); // <-- must be zap1 + assert_eq!(dms["foo"].configuration(), "bar"); // <-- must be bar + + // ===== WHEN: remove zip, keep foo ===== + let txn = snapshot_v1 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("WRITE".to_string()) + .with_domain_metadata_removed("zip".to_string()); // <-- remove zip + let committed = txn.commit(engine.as_ref())?.unwrap_committed(); + + // ===== THEN: should have CRC at v2 with zip gone, foo still there ===== + let snapshot_v2 = committed.post_commit_snapshot().unwrap(); + let crc_v2 = write_and_verify_crc(snapshot_v2, &table_path, engine.as_ref()); + let dms = crc_v2.domain_metadata.as_ref().unwrap(); + assert!(!dms.contains_key("zip")); // <-- must be gone + assert_eq!(dms["foo"].configuration(), "bar"); // <-- must still be bar + + Ok(()) +} + +#[tokio::test] +async fn test_post_commit_crc_non_incremental_op_makes_file_stats_indeterminate() -> DeltaResult<()> +{ + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // ===== GIVEN: Create table (v0) and insert data (v1) ===== + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot_v0 = committed.post_commit_snapshot().unwrap().clone(); + + let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let committed = insert_data(snapshot_v0, &engine, vec![col]) + .await? 
+ .unwrap_committed(); + let snapshot_v1 = committed.post_commit_snapshot().unwrap(); + + // ===== WHEN: Commit a non-incremental operation (ANALYZE STATS) ===== + let committed = snapshot_v1 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("ANALYZE STATS".to_string()) + .commit(engine.as_ref())? + .unwrap_committed(); + + // ===== THEN: CRC at v2 has indeterminate file stats ===== + assert_eq!(committed.commit_version(), 2); + let snapshot_v2 = committed.post_commit_snapshot().unwrap(); + let crc_v2 = snapshot_v2.get_current_crc_if_loaded_for_testing().unwrap(); + assert_eq!(crc_v2.file_stats_validity, FileStatsValidity::Indeterminate); + + Ok(()) +} + +// ============================================================================ +// Write checksum to disk +// ============================================================================ + +#[tokio::test] +async fn test_write_checksum_success_simple() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot = committed.post_commit_snapshot().unwrap(); + + let (result, _updated) = snapshot.write_checksum(engine.as_ref())?; + assert_eq!(result, ChecksumWriteResult::Written); + + // Verify the CRC file is readable by loading a fresh snapshot from disk + let fresh_snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert!(fresh_snapshot + .get_current_crc_if_loaded_for_testing() + .is_some()); + + Ok(()) +} + +#[rstest] +#[case::same_snapshot(false)] +#[case::fresh_snapshot(true)] +#[tokio::test] +async fn test_write_checksum_double_write_returns_already_exists( + #[case] reload_snapshot: bool, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot = committed.post_commit_snapshot().unwrap(); + + let (first, updated) = snapshot.write_checksum(engine.as_ref())?; + assert_eq!(first, ChecksumWriteResult::Written); + + let second = if reload_snapshot { + let fresh = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + let (result, _) = fresh.write_checksum(engine.as_ref())?; + result + } else { + let (result, _) = updated.write_checksum(engine.as_ref())?; + result + }; + assert_eq!(second, ChecksumWriteResult::AlreadyExists); + + Ok(()) +} + +#[tokio::test] +async fn test_write_checksum_with_no_in_memory_crc_returns_error() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let _ = create_table_and_commit(&table_path, engine.as_ref())?; + + // Load from disk -- no CRC file on disk, so no in-memory CRC + let snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + + let result = snapshot.write_checksum(engine.as_ref()); + assert!(result.is_err()); + + Ok(()) +} + +#[tokio::test] +async fn test_in_memory_crc_chains_across_multiple_commits_then_writes() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let mut snapshot = committed.post_commit_snapshot().unwrap().clone(); + assert!(snapshot.get_current_crc_if_loaded_for_testing().is_some()); + + // Chain several commits without writing CRC to disk + for i in 0..5 { + let col: ArrayRef = Arc::new(Int32Array::from(vec![i])); + let committed = insert_data(snapshot, &engine, vec![col]) + .await? 
+ .unwrap_committed(); + snapshot = committed.post_commit_snapshot().unwrap().clone(); + assert!( + snapshot.get_current_crc_if_loaded_for_testing().is_some(), + "in-memory CRC lost at commit {}", + committed.commit_version() + ); + } + + // Only now write the CRC -- should have accumulated all 5 inserts + assert_eq!(snapshot.version(), 5); + let crc = write_and_verify_crc(&snapshot, &table_path, engine.as_ref()); + let crc_stats = crc.file_stats().unwrap(); + assert_eq!(crc_stats.num_files, 5); + assert!(crc_stats.table_size_bytes > 0); + + Ok(()) +} + +// When an incremental snapshot update picks up a CRC file from the new log segment, the loaded +// CRC data should be preserved in the resulting snapshot (not discarded by creating a second +// LazyCrc). This verifies that compute_post_commit_crc can find the CRC without additional I/O. +#[tokio::test] +async fn test_incremental_snapshot_preserves_loaded_crc() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create table at v0 and write its CRC to disk + let committed_v0 = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot_v0 = committed_v0.post_commit_snapshot().unwrap(); + snapshot_v0.write_checksum(engine.as_ref())?; + + // Insert data at v1 and write its CRC to disk + let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let committed_v1 = insert_data(snapshot_v0.clone(), &engine, vec![col]) + .await? + .unwrap_committed(); + committed_v1 + .post_commit_snapshot() + .unwrap() + .write_checksum(engine.as_ref())?; + + // Load a fresh snapshot at v0 (from disk, not post-commit) + let fresh_v0 = Snapshot::builder_for(&table_path) + .at_version(0) + .build(engine.as_ref())?; + assert_eq!(fresh_v0.version(), 0); + + // Incrementally update from v0 -> v1 + let incremental_v1 = Snapshot::builder_from(fresh_v0).build(engine.as_ref())?; + assert_eq!(incremental_v1.version(), 1); + + // The CRC at v1 should be loaded from the incremental update (not discarded) + assert_eq!(incremental_v1.crc_version_for_testing(), Some(1)); + assert!( + incremental_v1 + .get_current_crc_if_loaded_for_testing() + .is_some(), + "CRC should be loaded at v1 after incremental snapshot update" + ); + + // Committing from this snapshot should produce a post-commit CRC (proves + // compute_post_commit_crc found the loaded CRC and applied the delta) + let col: ArrayRef = Arc::new(Int32Array::from(vec![4, 5, 6])); + let committed_v2 = insert_data(incremental_v1, &engine, vec![col]) + .await? + .unwrap_committed(); + assert_eq!(committed_v2.commit_version(), 2); + let snapshot_v2 = committed_v2.post_commit_snapshot().unwrap(); + assert!( + snapshot_v2 + .get_current_crc_if_loaded_for_testing() + .is_some(), + "Post-commit CRC should chain from incremental snapshot's CRC" + ); + + Ok(()) +} + +// Incremental update where only the old segment has a CRC file (no new CRC written). +// The old CRC is preserved and the LazyCrc is reused from the old snapshot, but since +// it's at v0 while the snapshot is at v1, it won't be reported as loaded at the +// snapshot's version. 
+#[tokio::test] +async fn test_incremental_snapshot_old_crc_no_new_crc() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create table at v0 and write CRC to disk + let committed_v0 = create_table_and_commit(&table_path, engine.as_ref())?; + committed_v0 + .post_commit_snapshot() + .unwrap() + .write_checksum(engine.as_ref())?; + + // Insert data at v1 -- do NOT write CRC for v1 + let col: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3])); + let committed_v1 = insert_data( + committed_v0.post_commit_snapshot().unwrap().clone(), + &engine, + vec![col], + ) + .await? + .unwrap_committed(); + assert_eq!(committed_v1.commit_version(), 1); + + // Load a fresh snapshot at v0 (this loads 0.crc during P&M reading) + let fresh_v0 = Snapshot::builder_for(&table_path) + .at_version(0) + .build(engine.as_ref())?; + assert!( + fresh_v0.get_current_crc_if_loaded_for_testing().is_some(), + "Fresh v0 snapshot should have CRC loaded from 0.crc" + ); + + // Incrementally update from v0 -> v1. The new listing (starting at v1) doesn't find + // any CRC file, so it falls back to the old segment's 0.crc. Since the old snapshot's + // LazyCrc is at the same version, it is reused (may already be loaded in memory). + let incremental_v1 = Snapshot::builder_from(fresh_v0).build(engine.as_ref())?; + assert_eq!(incremental_v1.version(), 1); + + // The CRC is at v0, not v1, so it won't be reported as loaded at v1 + assert!( + incremental_v1 + .get_current_crc_if_loaded_for_testing() + .is_none(), + "CRC at v0 should not be reported as loaded at v1 (version mismatch)" + ); + + Ok(()) +} + +// CRC should always write domainMetadata as an empty list (not omit the field) when there are +// no domain metadata actions, regardless of whether the feature is supported. +#[rstest] +#[case::dm_feature_supported(true)] +#[case::dm_feature_not_supported(false)] +#[tokio::test] +async fn test_write_checksum_with_no_dms_writes_empty_list( + #[case] dm_supported: bool, +) -> DeltaResult<()> { + use std::collections::HashMap; + + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + + let mut builder = create_table(&table_path, schema, "test_engine"); + if dm_supported { + let properties = HashMap::from([( + "delta.feature.domainMetadata".to_string(), + "supported".to_string(), + )]); + builder = builder.with_table_properties(properties); + } + let committed = builder + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())? + .unwrap_committed(); + + let snapshot = committed.post_commit_snapshot().unwrap(); + assert!(snapshot + .get_all_domain_metadata(engine.as_ref())? + .is_empty()); + let crc = write_and_verify_crc(snapshot, &table_path, engine.as_ref()); + assert_eq!(crc.domain_metadata, Some(Default::default())); + + Ok(()) +} + +// ============================================================================ +// Domain metadata CRC fast path +// ============================================================================ + +/// Engine that panics if any handler is accessed. 
+struct FailingEngine;
+
+impl Engine for FailingEngine {
+    fn evaluation_handler(&self) -> Arc<dyn delta_kernel::EvaluationHandler> {
+        unimplemented!()
+    }
+    fn storage_handler(&self) -> Arc<dyn delta_kernel::StorageHandler> {
+        unimplemented!()
+    }
+    fn json_handler(&self) -> Arc<dyn delta_kernel::JsonHandler> {
+        unimplemented!()
+    }
+    fn parquet_handler(&self) -> Arc<dyn delta_kernel::ParquetHandler> {
+        unimplemented!()
+    }
+}
+
+#[tokio::test]
+async fn test_get_domain_metadata_with_crc_skips_log_replay() -> DeltaResult<()> {
+    let (_temp_dir, table_path, engine) = test_table_setup()?;
+
+    // v0: CREATE TABLE with zip -> zap0 (and clustering DM from create_table_and_commit)
+    let committed = create_table_and_commit(&table_path, engine.as_ref())?;
+    let snapshot_v0 = committed.post_commit_snapshot().unwrap();
+
+    // v1: update zip -> zap1, add foo -> bar
+    let committed = snapshot_v0
+        .clone()
+        .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?
+        .with_operation("WRITE".to_string())
+        .with_domain_metadata("zip".to_string(), "zap1".to_string())
+        .with_domain_metadata("foo".to_string(), "bar".to_string())
+        .commit(engine.as_ref())?
+        .unwrap_committed();
+
+    // Asserts domain metadata on any snapshot, regardless of how it was loaded.
+    let assert_domain_metadata = |snapshot: &Snapshot, engine: &dyn delta_kernel::Engine| {
+        assert_eq!(
+            snapshot.get_domain_metadata("zip", engine).unwrap(),
+            Some("zap1".to_string())
+        );
+        assert_eq!(
+            snapshot.get_domain_metadata("foo", engine).unwrap(),
+            Some("bar".to_string())
+        );
+        assert!(snapshot
+            .get_domain_metadata_internal("delta.clustering", engine)
+            .unwrap()
+            .is_some());
+        assert_eq!(
+            snapshot
+                .get_domain_metadatas_internal(engine, None)
+                .unwrap()
+                .len(),
+            3
+        );
+    };
+
+    // Case 1: Post-commit snapshot with in-memory CRC => DM loaded from CRC (fast path).
+    // Use FailingEngine to prove no log replay occurs.
+    let post_commit_snapshot = committed.post_commit_snapshot().unwrap();
+    assert!(post_commit_snapshot
+        .get_current_crc_if_loaded_for_testing()
+        .is_some());
+    assert_domain_metadata(post_commit_snapshot, &FailingEngine);
+
+    // Case 2: Fresh snapshot loaded from disk, no CRC file => DM loaded via log replay (slow path)
+    let fresh_snapshot_no_crc = Snapshot::builder_for(&table_path).build(engine.as_ref())?;
+    assert!(fresh_snapshot_no_crc
+        .get_current_crc_if_loaded_for_testing()
+        .is_none());
+    assert_domain_metadata(&fresh_snapshot_no_crc, engine.as_ref());
+
+    // Case 3: Write CRC to disk, then reload fresh snapshot => DM loaded from CRC (fast path)
+    // Use FailingEngine to prove no log replay occurs.
+    let _ = post_commit_snapshot.write_checksum(engine.as_ref())?;
+
+    let fresh_snapshot_with_crc = Snapshot::builder_for(&table_path).build(engine.as_ref())?;
+    assert!(fresh_snapshot_with_crc
+        .get_current_crc_if_loaded_for_testing()
+        .is_some());
+    assert_domain_metadata(&fresh_snapshot_with_crc, &FailingEngine);
+
+    Ok(())
+}
+
+// ============================================================================
+// Set transaction CRC tracking
+// ============================================================================
+
+/// Comprehensive test for set transaction CRC tracking: verifies that set transactions are
+/// correctly tracked in the CRC across commits, round-trip through write/reload, and that
+/// the CRC fast path (no log replay) works for set transaction queries.
+#[tokio::test] +async fn test_set_transaction_crc_tracking_and_fast_path() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // -- v0: CREATE TABLE (no set transactions) -- + let committed = create_table_and_commit(&table_path, engine.as_ref())?; + let snapshot_v0 = committed.post_commit_snapshot().unwrap(); + + // Post-commit CRC has empty set_transactions (not null) + let crc_v0 = write_and_verify_crc(snapshot_v0, &table_path, engine.as_ref()); + assert_eq!(crc_v0.set_transactions, Some(Default::default())); + + // Fresh snapshot with CRC on disk serves queries via fast path (no log replay) + let fresh_v0 = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert!(fresh_v0.get_current_crc_if_loaded_for_testing().is_some()); + assert_eq!( + fresh_v0 + .get_app_id_version("my-app", &FailingEngine) + .unwrap(), + None + ); + + // -- v1: commit with my-app=1 -- + let committed = snapshot_v0 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("WRITE".to_string()) + .with_transaction_id("my-app".to_string(), 1) + .commit(engine.as_ref())? + .unwrap_committed(); + let snapshot_v1 = committed.post_commit_snapshot().unwrap(); + + // Post-commit CRC tracks my-app=1, queryable via fast path + assert_eq!( + snapshot_v1 + .get_app_id_version("my-app", &FailingEngine) + .unwrap(), + Some(1) + ); + assert_eq!( + snapshot_v1 + .get_app_id_version("nonexistent", &FailingEngine) + .unwrap(), + None + ); + + // Write CRC to disk, reload, verify round-trip and fast path + let crc_v1 = write_and_verify_crc(snapshot_v1, &table_path, engine.as_ref()); + let txns_v1 = crc_v1.set_transactions.as_ref().unwrap(); + assert_eq!(txns_v1.len(), 1); + assert!(txns_v1.contains_key("my-app")); + + let fresh_v1 = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert!(fresh_v1.get_current_crc_if_loaded_for_testing().is_some()); + assert_eq!( + fresh_v1 + .get_app_id_version("my-app", &FailingEngine) + .unwrap(), + Some(1) + ); + assert_eq!( + fresh_v1 + .get_app_id_version("nonexistent", &FailingEngine) + .unwrap(), + None + ); + + // -- v2: commit with my-app=2 (upsert) + other-app=1 (new) -- + let committed = snapshot_v1 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("WRITE".to_string()) + .with_transaction_id("my-app".to_string(), 2) + .with_transaction_id("other-app".to_string(), 1) + .commit(engine.as_ref())? 
+ .unwrap_committed(); + let snapshot_v2 = committed.post_commit_snapshot().unwrap(); + + // Post-commit CRC tracks updated versions, queryable via fast path + assert_eq!( + snapshot_v2 + .get_app_id_version("my-app", &FailingEngine) + .unwrap(), + Some(2) + ); + assert_eq!( + snapshot_v2 + .get_app_id_version("other-app", &FailingEngine) + .unwrap(), + Some(1) + ); + + // Write CRC to disk, reload, verify round-trip and fast path + let crc_v2 = write_and_verify_crc(snapshot_v2, &table_path, engine.as_ref()); + let txns_v2 = crc_v2.set_transactions.as_ref().unwrap(); + assert_eq!(txns_v2.len(), 2); + + let fresh_v2 = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert!(fresh_v2.get_current_crc_if_loaded_for_testing().is_some()); + assert_eq!( + fresh_v2 + .get_app_id_version("my-app", &FailingEngine) + .unwrap(), + Some(2) + ); + assert_eq!( + fresh_v2 + .get_app_id_version("other-app", &FailingEngine) + .unwrap(), + Some(1) + ); + + Ok(()) +} + +// ============================================================================ +// Set transaction CRC expiration +// ============================================================================ + +/// Tests the CRC fast path for set transaction expiration filtering. Since `lastUpdated` is set +/// to now, "interval 0 seconds" yields `expiration_timestamp = now`, so `last_updated <= now` +/// holds and the txn expires. A large retention or no retention should keep the txn visible. +#[rstest] +#[case::zero_retention_expires(Some("interval 0 seconds"), None)] +#[case::large_retention_not_expired(Some("interval 365 days"), Some(1))] +#[case::no_retention_no_filtering(None, Some(1))] +#[tokio::test] +async fn test_set_txn_expiration_via_crc_fast_path( + #[case] retention: Option<&str>, + #[case] expected: Option<i64>, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + + // v0: create the table with optional retention property + let mut builder = create_table(&table_path, schema, "test_engine"); + if let Some(r) = retention { + builder = builder.with_table_properties([("delta.setTransactionRetentionDuration", r)]); + } + let committed = builder + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())? + .unwrap_committed(); + + // v1: commit a set transaction for "my-app" (lastUpdated = now) + let snapshot_v0 = committed.post_commit_snapshot().unwrap().clone(); + let committed = snapshot_v0 + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_operation("WRITE".to_string()) + .with_transaction_id("my-app".to_string(), 1) + .commit(engine.as_ref())? + .unwrap_committed(); + + // Write CRC at v1 so the fast path is used on reload + let snapshot_v1 = committed.post_commit_snapshot().unwrap(); + snapshot_v1.write_checksum(engine.as_ref())?; + + let snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert_eq!(snapshot.version(), 1); + + // Verify CRC was loaded from disk + assert!(snapshot.get_current_crc_if_loaded_for_testing().is_some()); + + // FailingEngine proves the CRC fast path is used (no log replay) + assert_eq!( + snapshot + .get_app_id_version("my-app", &FailingEngine) + .unwrap(), + expected + ); + + Ok(()) +} + +/// Verifies that a set transaction with null `last_updated` never expires, even with the most +/// aggressive retention ("interval 0 seconds").
+#[tokio::test] +async fn test_set_txn_null_last_updated_never_expires_via_log_replay() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // v0: create table with aggressive retention + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + create_table(&table_path, schema, "test_engine") + .with_table_properties([( + "delta.setTransactionRetentionDuration", + "interval 0 seconds", + )]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())? + .unwrap_committed(); + + // v1: raw commit with txn action that omits lastUpdated + let store = Arc::new(LocalFileSystem::new()); + add_commit( + &table_path, + store.as_ref(), + 1, + r#"{"txn":{"appId":"null-app","version":42}}"#.to_string(), + ) + .await + .unwrap(); + + // Reload fresh snapshot at v1 -- no CRC covers v1, so log replay is used + let fresh = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert_eq!(fresh.version(), 1); + + // Despite aggressive retention, null last_updated means the txn never expires + assert_eq!( + fresh.get_app_id_version("null-app", engine.as_ref())?, + Some(42) + ); + + Ok(()) +} diff --git a/kernel/tests/create_table/clustering.rs b/kernel/tests/create_table/clustering.rs new file mode 100644 index 0000000000..c31b1c9bf8 --- /dev/null +++ b/kernel/tests/create_table/clustering.rs @@ -0,0 +1,221 @@ +//! Clustering integration tests for the CreateTable API. + +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::expressions::ColumnName; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::TableFeature; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::DeltaResult; +use rstest::rstest; +use test_utils::{assert_result_error_with_message, test_table_setup}; + +use super::simple_schema; + +/// Builds a schema that supports clustering at depths 1, 2, and 5: +/// { id: int, name: string, address: { city: string, zip: string }, +/// l1: { l2: { l3: { l4: { value: double } } } } } +fn clustering_test_schema() -> DeltaResult<Arc<StructType>> { + let address = StructType::try_new(vec![ + StructField::new("city", DataType::STRING, true), + StructField::new("zip", DataType::STRING, true), + ])?; + let l4 = StructType::try_new(vec![StructField::new("value", DataType::DOUBLE, true)])?; + let l3 = StructType::try_new(vec![StructField::new( + "l4", + DataType::Struct(Box::new(l4)), + true, + )])?; + let l2 = StructType::try_new(vec![StructField::new( + "l3", + DataType::Struct(Box::new(l3)), + true, + )])?; + let l1 = StructType::try_new(vec![StructField::new( + "l2", + DataType::Struct(Box::new(l2)), + true, + )])?; + Ok(Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + StructField::new("address", DataType::Struct(Box::new(address)), true), + StructField::new("l1", DataType::Struct(Box::new(l1)), true), + ])?)) +} + +#[rstest] +#[case::top_level(vec![vec!["id"]])] +#[case::nested_2(vec![vec!["address", "city"]])] +#[case::mixed(vec![vec!["id"], vec!["name"], vec!["address", "city"], vec!["address", "zip"], vec!["l1", "l2", "l3", "l4", "value"]])] +#[tokio::test] +async fn test_create_clustered_table(#[case] col_paths: Vec<Vec<&str>>) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + 
let schema = clustering_test_schema()?; + let expected_cols: Vec = col_paths + .iter() + .map(|p| ColumnName::new(p.iter().copied())) + .collect(); + + let txn = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::Clustered { + columns: expected_cols.clone(), + }) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?; + + let stats_cols = txn.stats_columns(); + for col in &expected_cols { + assert!( + stats_cols.contains(col), + "Clustering column '{col}' should be in stats columns" + ); + } + + let _ = txn.commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + let clustering_columns = snapshot.get_physical_clustering_columns(engine.as_ref())?; + assert_eq!(clustering_columns, Some(expected_cols)); + + let table_configuration = snapshot.table_configuration(); + assert!( + table_configuration.is_feature_supported(&TableFeature::DomainMetadata), + "Protocol should support domainMetadata feature" + ); + assert!( + table_configuration.is_feature_supported(&TableFeature::ClusteredTable), + "Protocol should support clustering feature" + ); + + Ok(()) +} + +/// Test that combining explicit feature signals with auto-enabled features doesn't create duplicates. +/// +/// This tests the edge case where a user provides `delta.feature.domainMetadata=supported` +/// AND uses `DataLayout::Clustered`. Both would try to add DomainMetadata, but we should +/// only have it once in the feature lists. +#[tokio::test] +async fn test_clustering_with_explicit_feature_signal_no_duplicates() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = simple_schema()?; + + // Combine BOTH: explicit feature signal AND clustering (which auto-adds domainMetadata) + let _ = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.feature.domainMetadata", "supported")]) + .with_data_layout(DataLayout::clustered(["id"])) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + // Read back using kernel APIs and verify no duplicate features + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + let protocol = snapshot.table_configuration().protocol(); + let writer_features = protocol + .writer_features() + .expect("Writer features should exist"); + + // Count occurrences of DomainMetadata - should be exactly 1, not 2 + let domain_metadata_count = writer_features + .iter() + .filter(|f| **f == TableFeature::DomainMetadata) + .count(); + + assert_eq!( + domain_metadata_count, 1, + "domainMetadata should appear exactly once, not {domain_metadata_count} times (duplicate detected!)" + ); + + // Verify clustering columns via snapshot read path + let clustering_columns = snapshot.get_physical_clustering_columns(engine.as_ref())?; + assert_eq!(clustering_columns, Some(vec![ColumnName::new(["id"])])); + + Ok(()) +} + +#[tokio::test] +async fn test_clustering_stats_columns_within_limit() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Build schema with 10 columns (cluster on column 5, within default 32 limit) + let fields: Vec = (0..10) + .map(|i| StructField::new(format!("col{i}"), DataType::INTEGER, true)) + .collect(); + let schema = Arc::new(StructType::try_new(fields)?); + + // Create clustered table on col5 + let txn = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::clustered(["col5"])) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?; + + // Verify stats_columns includes the clustering column + let stats_cols = txn.stats_columns(); + assert!( + stats_cols.iter().any(|c| c.to_string() == "col5"), + "Clustering column col5 should be in stats columns" + ); + + Ok(()) +} + +#[tokio::test] +async fn test_clustering_stats_columns_beyond_limit() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Build schema with 40 columns (cluster on column 35, beyond default 32 limit) + let fields: Vec = (0..40) + .map(|i| StructField::new(format!("col{i}"), DataType::INTEGER, true)) + .collect(); + let schema = Arc::new(StructType::try_new(fields)?); + + // Create clustered table on col35 (position > 32) + let txn = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::clustered(["col35"])) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?; + + // Verify stats_columns includes the clustering column even beyond limit + let stats_cols = txn.stats_columns(); + assert!( + stats_cols.iter().any(|c| c.to_string() == "col35"), + "Clustering column col35 should be in stats columns even beyond DEFAULT_NUM_INDEXED_COLS" + ); + + // Verify we have exactly 33 stats columns: first 32 + col35 + // (col35 is added in Pass 2 of collect_columns) + assert_eq!( + stats_cols.len(), + 33, + "Should have 32 indexed cols + 1 clustering col" + ); + + Ok(()) +} + +#[rstest] +#[case::not_in_schema(vec!["nonexistent"], "not found in schema")] +#[case::nested_not_found(vec!["l1", "l2", "l3", "l4", "missing"], "not found in schema")] +#[case::struct_as_leaf(vec!["address"], "unsupported type")] +#[tokio::test] +async fn test_clustering_column_error( + #[case] col_path: Vec<&str>, + #[case] expected_error: &str, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let schema = clustering_test_schema()?; + + let result = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::Clustered 
{ + columns: vec![ColumnName::new(col_path.iter().copied())], + }) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())); + + assert_result_error_with_message(result, expected_error); + + Ok(()) +} diff --git a/kernel/tests/create_table/column_mapping.rs b/kernel/tests/create_table/column_mapping.rs new file mode 100644 index 0000000000..e44b6eaa64 --- /dev/null +++ b/kernel/tests/create_table/column_mapping.rs @@ -0,0 +1,619 @@ +//! Column Mapping integration tests for the CreateTable API. +//! +//! These tests use kernel's snapshot API to read back the table, which exercises +//! the full column mapping validation path (via TableConfiguration::try_new -> +//! validate_schema_column_mapping). This ensures the written schema is valid and +//! readable by kernel. + +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::schema::{ + ArrayType, ColumnMetadataKey, DataType, MapType, StructField, StructType, +}; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::{ColumnMappingMode, TableFeature}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::DeltaResult; +use test_utils::{create_table_and_load_snapshot, test_table_setup}; + +use super::simple_schema; + +/// Helper to strip column mapping metadata (IDs and physical names) from all StructFields recursively. +pub(super) fn strip_column_mapping_metadata(schema: &StructType) -> StructType { + let cm_id = ColumnMetadataKey::ColumnMappingId.as_ref(); + let cm_name = ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(); + + fn strip_field(field: &StructField, cm_id: &str, cm_name: &str) -> StructField { + let mut metadata = field.metadata().clone(); + metadata.remove(cm_id); + metadata.remove(cm_name); + + let data_type = strip_data_type(field.data_type(), cm_id, cm_name); + StructField::new(field.name(), data_type, field.is_nullable()).with_metadata(metadata) + } + + fn strip_data_type(dt: &DataType, cm_id: &str, cm_name: &str) -> DataType { + match dt { + DataType::Struct(s) => { + let fields: Vec<_> = s.fields().map(|f| strip_field(f, cm_id, cm_name)).collect(); + DataType::Struct(Box::new(StructType::new_unchecked(fields))) + } + DataType::Array(a) => DataType::from(ArrayType::new( + strip_data_type(a.element_type(), cm_id, cm_name), + a.contains_null(), + )), + DataType::Map(m) => DataType::from(MapType::new( + strip_data_type(m.key_type(), cm_id, cm_name), + strip_data_type(m.value_type(), cm_id, cm_name), + m.value_contains_null(), + )), + other => other.clone(), + } + } + + let fields: Vec<_> = schema + .fields() + .map(|f| strip_field(f, cm_id, cm_name)) + .collect(); + StructType::new_unchecked(fields) +} + +/// Assert column mapping configuration on a snapshot. +/// +/// For `Name` / `Id`: feature supported & enabled, mode matches, `maxColumnId` equals +/// the recursive field count. +/// +/// For `None`: mode is `None`, no `maxColumnId`, and no column mapping metadata (IDs or +/// physical names) on any field. Note: whether `ColumnMapping` appears in the protocol +/// depends on whether the feature flag was explicitly set, so that check is left to the +/// caller. 
+pub(super) fn assert_column_mapping_config(snapshot: &Snapshot, expected_mode: ColumnMappingMode) { + let table_config = snapshot.table_configuration(); + + assert_eq!( + table_config.column_mapping_mode(), + expected_mode, + "Column mapping mode mismatch" + ); + + match expected_mode { + ColumnMappingMode::Name | ColumnMappingMode::Id => { + assert!( + table_config.is_feature_supported(&TableFeature::ColumnMapping), + "Protocol should support columnMapping feature" + ); + assert!( + table_config.is_feature_enabled(&TableFeature::ColumnMapping), + "ColumnMapping feature should be enabled" + ); + + let expected_max_id = snapshot.schema().total_struct_fields(); + let max_id_str = expected_max_id.to_string(); + let config = table_config.metadata().configuration(); + assert_eq!( + config + .get("delta.columnMapping.maxColumnId") + .map(|s| s.as_str()), + Some(max_id_str.as_str()), + "maxColumnId should equal the total number of struct fields ({expected_max_id})" + ); + } + ColumnMappingMode::None => { + // No maxColumnId property + let config = table_config.metadata().configuration(); + assert!( + config.get("delta.columnMapping.maxColumnId").is_none(), + "maxColumnId should not be present when column mapping mode is None" + ); + + // No column mapping metadata on any field + for field in snapshot.schema().fields() { + assert!( + field + .get_config_value(&ColumnMetadataKey::ColumnMappingId) + .is_none(), + "Field '{}' should not have a column mapping ID when mode is None", + field.name() + ); + assert!( + field + .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + .is_none(), + "Field '{}' should not have a physical name when mode is None", + field.name() + ); + } + } + } +} + +#[test] +fn test_create_table_with_column_mapping_name_mode() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = simple_schema()?; + + // Create table and load snapshot (this validates column mapping annotations on read) + let snapshot = create_table_and_load_snapshot( + &table_path, + schema, + engine.as_ref(), + &[("delta.columnMapping.mode", "name")], + )?; + + assert_column_mapping_config(&snapshot, ColumnMappingMode::Name); + + // Verify schema preserves field names, types, and nullability + let read_schema = snapshot.schema(); + assert_eq!(read_schema.fields().count(), 2); + + let id_field = read_schema.field("id").expect("id field should exist"); + assert_eq!(id_field.data_type(), &DataType::INTEGER); + assert!(!id_field.is_nullable()); + + let value_field = read_schema + .field("value") + .expect("value field should exist"); + assert_eq!(value_field.data_type(), &DataType::STRING); + assert!(value_field.is_nullable()); + + Ok(()) +} + +#[test] +fn test_create_table_with_column_mapping_id_mode() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::try_new(vec![StructField::new( + "id", + DataType::INTEGER, + false, + )])?); + + // Create table and load snapshot (validates column mapping on read) + let snapshot = create_table_and_load_snapshot( + &table_path, + schema, + engine.as_ref(), + &[("delta.columnMapping.mode", "id")], + )?; + + assert_column_mapping_config(&snapshot, ColumnMappingMode::Id); + + // Verify schema + let read_schema = snapshot.schema(); + assert_eq!(read_schema.fields().count(), 1); + let id_field = read_schema.field("id").expect("id field should exist"); + assert_eq!(id_field.data_type(), &DataType::INTEGER); + assert!(!id_field.is_nullable()); + + Ok(()) +} + 
+#[test] +fn test_column_mapping_mode_none_no_annotations() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = simple_schema()?; + + // Create table WITHOUT column mapping and load snapshot + let snapshot = create_table_and_load_snapshot(&table_path, schema, engine.as_ref(), &[])?; + + // Verify protocol does NOT have columnMapping feature + assert!( + !snapshot + .table_configuration() + .is_feature_supported(&TableFeature::ColumnMapping), + "Protocol should NOT have columnMapping feature when mode is not set" + ); + + // Verify no column mapping config (mode=None, no maxColumnId, no field metadata) + assert_column_mapping_config(&snapshot, ColumnMappingMode::None); + + // Verify schema preserves fields + let read_schema = snapshot.schema(); + assert_eq!(read_schema.fields().count(), 2); + assert!(read_schema.field("id").is_some()); + assert!(read_schema.field("value").is_some()); + + Ok(()) +} + +/// Test: setting `delta.feature.columnMapping=supported` without a mode means the feature +/// is in the protocol but column mapping is not active (mode resolves to `None`). +/// The schema should NOT have column mapping IDs or physical names. +#[test] +fn test_column_mapping_feature_only_without_mode() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = simple_schema()?; + + // Create table with ONLY the feature flag, no delta.columnMapping.mode + let _ = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.feature.columnMapping", "supported")]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + // Feature IS in the protocol (the feature signal put it there) + assert!( + snapshot + .table_configuration() + .is_feature_supported(&TableFeature::ColumnMapping), + "Protocol should list columnMapping as a supported feature" + ); + + // But mode is None, no maxColumnId, no field metadata + assert_column_mapping_config(&snapshot, ColumnMappingMode::None); + + Ok(()) +} + +#[test] +fn test_column_mapping_invalid_mode_rejected() { + let (_temp_dir, table_path, engine) = test_table_setup().unwrap(); + + let schema = Arc::new( + StructType::try_new(vec![StructField::new("id", DataType::INTEGER, false)]).unwrap(), + ); + + // Try to create table with invalid column mapping mode + let result = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.columnMapping.mode", "invalid")]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())); + + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("Invalid column mapping mode")); +} + +/// Test cases for clustering columns with column mapping enabled. 
+/// Each case specifies: (logical_column_names, description) +#[rstest::rstest] +#[case::single_column(&["id"], "single clustering column")] +#[case::multiple_columns(&["id", "value"], "multiple clustering columns")] +#[test] +fn test_create_clustered_table_with_column_mapping( + #[case] clustering_cols: &[&str], + #[case] description: &str, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = simple_schema()?; + + // Create clustered table with column mapping enabled + let _ = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.columnMapping.mode", "name")]) + .with_data_layout(DataLayout::clustered(clustering_cols.iter().copied())) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + // Load snapshot (validates column mapping annotations on read) + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + // Verify column mapping configuration + assert_column_mapping_config(&snapshot, ColumnMappingMode::Name); + + // Verify clustering-specific features + let table_config = snapshot.table_configuration(); + assert!(table_config.is_feature_supported(&TableFeature::ClusteredTable)); + assert!(table_config.is_feature_supported(&TableFeature::DomainMetadata)); + + // Verify clustering domain metadata exists and uses physical column names + let clustering_columns = snapshot.get_physical_clustering_columns(engine.as_ref())?; + let columns = clustering_columns.expect("Clustering columns should be present"); + assert_eq!( + columns.len(), + clustering_cols.len(), + "{}: expected {} clustering columns, got {}", + description, + clustering_cols.len(), + columns.len() + ); + + // With column mapping enabled, clustering domain metadata stores physical names + for (i, col) in columns.iter().enumerate() { + let physical_name: &str = col.path()[0].as_ref(); + let logical_name = clustering_cols[i]; + assert!( + physical_name.starts_with("col-"), + "{description}: clustering column {i} should use physical name '{physical_name}', not logical name '{logical_name}'" + ); + } + + Ok(()) +} + +#[test] +fn test_column_mapping_nested_schema() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create nested schema + let address_type = StructType::try_new(vec![ + StructField::new("street", DataType::STRING, true), + StructField::new("city", DataType::STRING, true), + ])?; + + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("address", DataType::Struct(Box::new(address_type)), true), + ])?); + + // Create table and load snapshot (validates column mapping for nested schema on read) + let snapshot = create_table_and_load_snapshot( + &table_path, + schema, + engine.as_ref(), + &[("delta.columnMapping.mode", "name")], + )?; + + // Verify column mapping config (maxColumnId = 4: id, address, street, city) + assert_column_mapping_config(&snapshot, ColumnMappingMode::Name); + + // Verify schema preserves the full nested structure + let read_schema = snapshot.schema(); + assert_eq!(read_schema.fields().count(), 2); + + // Verify top-level fields + let id_field = read_schema.field("id").expect("id field should exist"); + assert_eq!(id_field.data_type(), &DataType::INTEGER); + assert!(!id_field.is_nullable()); + + let address_field = read_schema + .field("address") + .expect("address field should exist"); + 
assert!(address_field.is_nullable()); + + // Verify nested struct fields are preserved + match address_field.data_type() { + DataType::Struct(nested) => { + assert_eq!(nested.fields().count(), 2); + + let street = nested.field("street").expect("street field should exist"); + assert_eq!(street.data_type(), &DataType::STRING); + assert!(street.is_nullable()); + + let city = nested.field("city").expect("city field should exist"); + assert_eq!(city.data_type(), &DataType::STRING); + assert!(city.is_nullable()); + } + other => panic!("Expected Struct type for address, got {other:?}"), + } + + Ok(()) +} + +/// E2E test: create a table with column mapping on a schema containing map and array types, +/// then read it back via snapshot and verify column mapping metadata survives the roundtrip. +#[test] +fn test_column_mapping_schema_with_maps_and_arrays() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Schema: + // id: int (not null) + // tags: map<string, string> + // scores: array<int> + // metadata: struct< + // labels: map<string, array<int>> + // > + let labels_type = MapType::new( + DataType::STRING, + ArrayType::new(DataType::INTEGER, true), + true, + ); + + let metadata_type = StructType::try_new(vec![StructField::new( + "labels", + DataType::from(labels_type), + true, + )])?; + + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new( + "tags", + DataType::from(MapType::new(DataType::STRING, DataType::STRING, true)), + true, + ), + StructField::new( + "scores", + DataType::from(ArrayType::new(DataType::INTEGER, true)), + true, + ), + StructField::new("metadata", DataType::Struct(Box::new(metadata_type)), true), + ])?); + + // Create table with column mapping and read back the snapshot. + // The snapshot read exercises validate_schema_column_mapping, which verifies + // that all fields (including map key/value, array element, and nested structs) + // have valid column mapping metadata. + let snapshot = create_table_and_load_snapshot( + &table_path, + schema.clone(), + engine.as_ref(), + &[("delta.columnMapping.mode", "name")], + )?; + + // First verify column mapping annotations (IDs, physical names, maxColumnId, feature flags) + assert_column_mapping_config(&snapshot, ColumnMappingMode::Name); + + // Then strip column mapping metadata and verify the schema structure matches the input.
+ let read_schema = strip_column_mapping_metadata(&snapshot.schema()); + assert_eq!(&read_schema, schema.as_ref(), "Schema roundtrip mismatch"); + + Ok(()) +} + +/// Builds a schema that supports clustering at depths 1, 2, and 5: +/// { id: int, name: string, address: { city: string, zip: string }, +/// l1: { l2: { l3: { l4: { value: double } } } } } +fn clustering_cm_test_schema() -> DeltaResult<Arc<StructType>> { + let address = StructType::try_new(vec![ + StructField::new("city", DataType::STRING, true), + StructField::new("zip", DataType::STRING, true), + ])?; + let l4 = StructType::try_new(vec![StructField::new("value", DataType::DOUBLE, true)])?; + let l3 = StructType::try_new(vec![StructField::new( + "l4", + DataType::Struct(Box::new(l4)), + true, + )])?; + let l2 = StructType::try_new(vec![StructField::new( + "l3", + DataType::Struct(Box::new(l3)), + true, + )])?; + let l1 = StructType::try_new(vec![StructField::new( + "l2", + DataType::Struct(Box::new(l2)), + true, + )])?; + Ok(Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + StructField::new("address", DataType::Struct(Box::new(address)), true), + StructField::new("l1", DataType::Struct(Box::new(l1)), true), + ])?)) +} + +#[rstest::rstest] +#[case::top_level_cm_none(vec![vec!["id"]], "none")] +#[case::top_level_cm_name(vec![vec!["id"]], "name")] +#[case::top_level_cm_id(vec![vec!["id"]], "id")] +#[case::nested_2_cm_none(vec![vec!["address", "city"]], "none")] +#[case::nested_2_cm_name(vec![vec!["address", "city"]], "name")] +#[case::nested_2_cm_id(vec![vec!["address", "city"]], "id")] +#[case::mixed_cm_none(vec![vec!["id"], vec!["name"], vec!["address", "city"], vec!["address", "zip"], vec!["l1", "l2", "l3", "l4", "value"]], "none")] +#[case::mixed_cm_name(vec![vec!["id"], vec!["name"], vec!["address", "city"], vec!["address", "zip"], vec!["l1", "l2", "l3", "l4", "value"]], "name")] +#[case::mixed_cm_id(vec![vec!["id"], vec!["name"], vec!["address", "city"], vec!["address", "zip"], vec!["l1", "l2", "l3", "l4", "value"]], "id")] +#[test] +fn test_create_clustered_table_nested_with_column_mapping( + #[case] col_paths: Vec<Vec<&str>>, + #[case] cm_mode: &str, +) -> DeltaResult<()> { + use delta_kernel::expressions::ColumnName; + + let (_temp_dir, table_path, engine) = test_table_setup()?; + let schema = clustering_cm_test_schema()?; + let expected_cols: Vec<ColumnName> = col_paths + .iter() + .map(|p| ColumnName::new(p.iter().copied())) + .collect(); + + let _ = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.columnMapping.mode", cm_mode)]) + .with_data_layout(DataLayout::Clustered { + columns: expected_cols.clone(), + }) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + let table_configuration = snapshot.table_configuration(); + assert!( + table_configuration.is_feature_supported(&TableFeature::DomainMetadata), + "Protocol should support domainMetadata feature" + ); + assert!( + table_configuration.is_feature_supported(&TableFeature::ClusteredTable), + "Protocol should support clustering feature" + ); + + let expected_cm_mode = match cm_mode { + "name" => ColumnMappingMode::Name, + "id" => ColumnMappingMode::Id, + _ => ColumnMappingMode::None, + }; + assert_column_mapping_config(&snapshot, expected_cm_mode); + + let clustering_columns = snapshot.get_physical_clustering_columns(engine.as_ref())?; + let columns = clustering_columns.expect("Clustering columns should be present"); + assert_eq!(columns.len(), expected_cols.len()); + + for (col, expected_path) in columns.iter().zip(col_paths.iter()) { + assert_eq!(col.path().len(), expected_path.len()); + match expected_cm_mode { + ColumnMappingMode::Name | ColumnMappingMode::Id => { + for field_name in col.path() { + assert!( + field_name.starts_with("col-"), + "Clustering path field '{field_name}' should use physical name" + ); + } + } + ColumnMappingMode::None => { + let expected_col = ColumnName::new(expected_path.iter().copied()); + assert_eq!(*col, expected_col); + } + } + } + + Ok(()) +} + +#[rstest::rstest] +#[case::single_column(&["id"])] +#[case::multiple_columns(&["id", "date"])] +fn test_partitioned_table_stores_logical_column_names_with_column_mapping( + #[case] partition_cols: &[&str], +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let schema = super::partition_test_schema()?; + + let _ = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.columnMapping.mode", "name")]) + .with_data_layout(DataLayout::partitioned(partition_cols.iter().copied())) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + assert_column_mapping_config(&snapshot, ColumnMappingMode::Name); + + let log_file_path = format!("{table_path}/_delta_log/00000000000000000000.json"); + let log_contents = std::fs::read_to_string(&log_file_path).expect("Failed to read log file"); + let actions: Vec = log_contents + .lines() + .map(|line| serde_json::from_str(line).expect("Failed to parse JSON")) + .collect(); + + let metadata_action = actions + .iter() + .find(|a| a.get("metaData").is_some()) + .expect("Should have metaData action"); + let metadata = metadata_action.get("metaData").unwrap(); + let stored_partition_columns: Vec = metadata["partitionColumns"] + .as_array() + .expect("partitionColumns should be an array") + .iter() + .map(|v| v.as_str().unwrap().to_string()) + .collect(); + + assert_eq!(stored_partition_columns.len(), partition_cols.len()); + + for (i, stored_name) in stored_partition_columns.iter().enumerate() { + let logical_name = partition_cols[i]; + assert_eq!( + stored_name, logical_name, + "partition column {i} should be logical name '{logical_name}', got '{stored_name}'" + ); + } + + let clustering = snapshot.get_physical_clustering_columns(engine.as_ref())?; + assert!( + clustering.is_none(), + "Partitioned table should not have clustering columns" + ); + + Ok(()) +} diff --git a/kernel/tests/create_table/ctas.rs b/kernel/tests/create_table/ctas.rs new file mode 100644 index 0000000000..588cbbba4d --- /dev/null +++ b/kernel/tests/create_table/ctas.rs @@ -0,0 +1,420 @@ +//! CTAS (Create Table As Select) integration tests. +//! +//! These tests exercise a CTAS-style flow: create a source table with certain +//! features, write seed data, scan it, create a target table with (possibly +//! different) features, write the scanned data, then verify the target. + +use std::collections::HashMap; +use std::sync::Arc; + +use delta_kernel::arrow::array::{Array, Int64Array, StringArray, StructArray}; +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor; +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::expressions::ColumnName; +use delta_kernel::object_store::local::LocalFileSystem; +use delta_kernel::object_store::path::Path; +use delta_kernel::object_store::DynObjectStore; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::{ + get_any_level_column_physical_name, ColumnMappingMode, TableFeature, +}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::transaction::CommitResult; +use delta_kernel::{Engine, FileMeta}; +use url::Url; + +use test_utils::{ + assert_schema_has_field, nested_batches, nested_schema, read_add_infos, test_table_setup, + write_batch_to_table, +}; + +const VERIFIED_PATHS: &[&[&str]] = &[&["row_number"], &["address", "street"]]; + +// --------------------------------------------------------------------------- +// Unified column naming verification +// --------------------------------------------------------------------------- + +/// Validates that column names are consistent (logical or physical) across all +/// table metadata surfaces: schema annotations, stats, clustering domain +/// metadata, and Parquet file footers. 
+async fn verify_column_names_in_metadata( + snapshot: &Snapshot, + engine: &impl Engine, + store: &DynObjectStore, + table_url: &Url, + cm_mode: ColumnMappingMode, + clustered: bool, +) -> Result<(), Box<dyn std::error::Error>> { + super::column_mapping::assert_column_mapping_config(snapshot, cm_mode); + verify_column_names_in_stats(snapshot, engine, cm_mode)?; + if clustered { + verify_column_names_in_clustering_metadata(snapshot, engine, cm_mode)?; + } + verify_column_names_in_parquet_footer(snapshot, engine, store, table_url, cm_mode).await?; + Ok(()) +} + +/// Asserts that minValues keys in add-action stats use the expected column +/// names (physical when column mapping is enabled, logical otherwise). +fn verify_column_names_in_stats( + snapshot: &Snapshot, + engine: &impl Engine, + cm_mode: ColumnMappingMode, +) -> Result<(), Box<dyn std::error::Error>> { + let schema = snapshot.schema(); + let add_actions = read_add_infos(snapshot, engine)?; + let stats = add_actions + .iter() + .filter_map(|a| a.stats.as_ref()) + .find(|s| s.get("minValues").is_some()); + + if let Some(stats) = stats { + let min_values = &stats["minValues"]; + for logical_path in VERIFIED_PATHS { + let col = ColumnName::new(logical_path.iter().copied()); + let expected = + get_any_level_column_physical_name(schema.as_ref(), &col, cm_mode)?.into_inner(); + let mut current = min_values; + for (i, field) in expected.iter().enumerate() { + assert!( + current.get(field).is_some(), + "stats minValues missing key '{field}' for {logical_path:?}" + ); + if i < expected.len() - 1 { + current = &current[field]; + } + } + } + } + + Ok(()) +} + +/// Asserts that column paths stored in clustering domain metadata use the +/// expected names (physical when column mapping is enabled, logical otherwise). +fn verify_column_names_in_clustering_metadata( + snapshot: &Snapshot, + engine: &impl Engine, + cm_mode: ColumnMappingMode, +) -> Result<(), Box<dyn std::error::Error>> { + let schema = snapshot.schema(); + let clustering_columns = snapshot + .get_physical_clustering_columns(engine)? + .expect("Clustering columns should be present"); + + assert_eq!( + clustering_columns.len(), + 1, + "Expected exactly one clustering column" + ); + let stored_path = clustering_columns[0].path(); + let col = ColumnName::new(["row_number"]); + let expected = get_any_level_column_physical_name(schema.as_ref(), &col, cm_mode)?.into_inner(); + + assert_eq!( + stored_path, &expected, + "Clustering column naming mismatch: stored={stored_path:?}, expected={expected:?}" + ); + + if cm_mode != ColumnMappingMode::None { + for field in stored_path { + assert!( + field.as_str().starts_with("col-"), + "Clustering path field '{field}' should be a physical name" + ); + } + } else { + assert_eq!( + stored_path, + &["row_number"], + "Without column mapping, clustering path should use logical name" + ); + } + + Ok(()) +} + +/// Asserts that Parquet file footer field names match the expected column +/// names (physical when column mapping is enabled, logical otherwise). +async fn verify_column_names_in_parquet_footer( + snapshot: &Snapshot, + engine: &impl Engine, + store: &DynObjectStore, + table_url: &Url, + cm_mode: ColumnMappingMode, +) -> Result<(), Box<dyn std::error::Error>> { + let schema = snapshot.schema(); + let add_actions = read_add_infos(snapshot, engine)?; + let first_add = add_actions + .first() + .expect("should have at least one add file"); + + let parquet_url = table_url.join(&first_add.path)?; + let obj_meta = store + .head(&Path::from_url_path(parquet_url.path())?) 
+ .await?; + let file_meta = FileMeta::new(parquet_url, 0, obj_meta.size as u64); + let footer = engine.parquet_handler().read_parquet_footer(&file_meta)?; + + for logical_path in VERIFIED_PATHS { + let col = ColumnName::new(logical_path.iter().copied()); + let expected = + get_any_level_column_physical_name(schema.as_ref(), &col, cm_mode)?.into_inner(); + assert_schema_has_field(&footer.schema, &expected); + } + + Ok(()) +} + +// --------------------------------------------------------------------------- +// Core CTAS test flow +// --------------------------------------------------------------------------- + +/// Returns the table property value for the given column mapping mode, or +/// `None` for `ColumnMappingMode::None` (no property needed). +fn cm_mode_property(mode: ColumnMappingMode) -> Option<&'static str> { + match mode { + ColumnMappingMode::None => None, + ColumnMappingMode::Name => Some("name"), + ColumnMappingMode::Id => Some("id"), + } +} + +/// Core CTAS test logic: +/// 1. Set up engine and source table with the requested features +/// 2. Write seed data to the source table +/// 3. Scan all data from the source table +/// 4. Create target table and write scanned data in a single CTAS transaction +/// 5. Verify target version, feature flags, and column naming consistency +/// 6. Verify data integrity: scan target and check row count, row_number +/// values, and nested address.street values all match the source +async fn run_ctas_test( + src_cm: ColumnMappingMode, + src_clustered: bool, + tgt_cm: ColumnMappingMode, + tgt_clustered: bool, +) -> Result<(), Box<dyn std::error::Error>> { + // 1. Set up engine and source table with the requested features + let schema = nested_schema()?; + + let (_src_tmp, src_table_path, _) = test_table_setup()?; + let src_url = Url::from_directory_path(&src_table_path).unwrap(); + let store: Arc<DynObjectStore> = Arc::new(LocalFileSystem::new()); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + ))) + .build(), + ); + + let mut src_snapshot = { + let mut builder = create_table(&src_table_path, schema.clone(), "ctas-test"); + if let Some(mode_str) = cm_mode_property(src_cm) { + builder = builder.with_table_properties([("delta.columnMapping.mode", mode_str)]); + } + if src_clustered { + builder = builder.with_data_layout(DataLayout::clustered(["row_number"])); + } + let result = builder + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + match result { + CommitResult::CommittedTransaction(c) => c + .post_commit_snapshot() + .expect("should have post_commit_snapshot") + .clone(), + _ => panic!("Source create should succeed"), + } + }; + + // 2. Write seed data to the source table + for batch in nested_batches()? { + src_snapshot = + write_batch_to_table(&src_snapshot, engine.as_ref(), batch, HashMap::new()).await?; + } + + // 3. Scan all data from the source table + let src_snapshot_for_scan = Snapshot::builder_for(src_url.clone()).build(engine.as_ref())?; + let src_scan = src_snapshot_for_scan.scan_builder().build()?; + let src_batches = test_utils::read_scan(&src_scan, engine.clone())?; + let src_arrow_schema = src_batches[0].schema(); + let source_data = + delta_kernel::arrow::compute::concat_batches(&src_arrow_schema, &src_batches)?; + let source_row_count = source_data.num_rows(); + assert_eq!(source_row_count, 6, "Source should have 6 rows"); + + // 4. 
Create target table and write scanned data in a single CTAS transaction + let (_tgt_tmp, tgt_table_path, _) = test_table_setup()?; + let tgt_url = Url::from_directory_path(&tgt_table_path).unwrap(); + + let mut tgt_builder = create_table(&tgt_table_path, schema.clone(), "ctas-test"); + if let Some(mode_str) = cm_mode_property(tgt_cm) { + tgt_builder = tgt_builder.with_table_properties([("delta.columnMapping.mode", mode_str)]); + } + if tgt_clustered { + tgt_builder = tgt_builder.with_data_layout(DataLayout::clustered(["row_number"])); + } + let mut tgt_txn = tgt_builder.build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?; + + let write_context = Arc::new(tgt_txn.get_write_context()); + let add_meta = engine + .write_parquet( + &ArrowEngineData::new(source_data), + write_context.as_ref(), + HashMap::new(), + ) + .await?; + tgt_txn.add_files(add_meta); + + let commit_result = tgt_txn.commit(engine.as_ref())?; + let tgt_snapshot = match commit_result { + CommitResult::CommittedTransaction(c) => c + .post_commit_snapshot() + .expect("should have post_commit_snapshot") + .clone(), + _ => panic!("CTAS commit should succeed"), + }; + + // 5. Verify target version, feature flags, and column naming consistency + assert_eq!(tgt_snapshot.version(), 0, "CTAS should produce version-0"); + + if tgt_clustered { + let tc = tgt_snapshot.table_configuration(); + assert!( + tc.is_feature_supported(&TableFeature::ClusteredTable), + "Clustered table feature should be supported" + ); + assert!( + tc.is_feature_supported(&TableFeature::DomainMetadata), + "Domain metadata feature should be supported for clustered tables" + ); + } + + verify_column_names_in_metadata( + &tgt_snapshot, + engine.as_ref(), + store.as_ref(), + &tgt_url, + tgt_cm, + tgt_clustered, + ) + .await?; + + // 6. 
Verify data integrity: scan target and check row count, row_number + // values, and nested address.street values all match the source + let tgt_snapshot_for_scan = Snapshot::builder_for(tgt_url.clone()).build(engine.as_ref())?; + let tgt_scan = tgt_snapshot_for_scan.scan_builder().build()?; + let tgt_batches = test_utils::read_scan(&tgt_scan, engine.clone())?; + let tgt_arrow_schema = tgt_batches[0].schema(); + let tgt_combined = + delta_kernel::arrow::compute::concat_batches(&tgt_arrow_schema, &tgt_batches)?; + assert_eq!( + tgt_combined.num_rows(), + source_row_count, + "Target row count should match source" + ); + + let row_numbers = tgt_combined + .column_by_name("row_number") + .expect("should have 'row_number'") + .as_any() + .downcast_ref::<Int64Array>() + .expect("row_number should be Int64"); + // Scan order is non-deterministic, so sort before comparing + let vals = { + let mut v: Vec<i64> = (0..row_numbers.len()) + .map(|i| row_numbers.value(i)) + .collect(); + v.sort(); + v + }; + assert_eq!( + vals, + (1..=source_row_count as i64).collect::<Vec<i64>>(), + "row_number values should be 1..={source_row_count}" + ); + + let address = tgt_combined + .column_by_name("address") + .expect("should have 'address'") + .as_any() + .downcast_ref::<StructArray>() + .expect("address should be a struct"); + let streets = address + .column_by_name("street") + .expect("address should have 'street'") + .as_any() + .downcast_ref::<StringArray>() + .expect("street should be String"); + let street_vals = { + let mut v: Vec<&str> = (0..streets.len()).map(|i| streets.value(i)).collect(); + v.sort(); + v + }; + let expected: Vec<String> = (1..=source_row_count).map(|i| format!("st{i}")).collect(); + let expected_refs: Vec<&str> = expected.iter().map(String::as_str).collect(); + assert_eq!( + street_vals, expected_refs, + "address.street values should be st1..st{source_row_count}" + ); + + Ok(()) +} + +// --------------------------------------------------------------------------- +// Test functions +// --------------------------------------------------------------------------- + +/// Verifies CTAS data roundtrip for all 9 source/target column-mapping mode +/// combinations (None/Name/Id x None/Name/Id, no clustering). Ensures column +/// naming is consistent across metadata and Parquet files regardless of mode. +#[rstest::rstest] +#[tokio::test(flavor = "multi_thread")] +async fn test_ctas_column_mapping_combinations( + #[values( + ColumnMappingMode::None, + ColumnMappingMode::Name, + ColumnMappingMode::Id + )] + src_cm: ColumnMappingMode, + #[values( + ColumnMappingMode::None, + ColumnMappingMode::Name, + ColumnMappingMode::Id + )] + tgt_cm: ColumnMappingMode, +) -> Result<(), Box<dyn std::error::Error>> { + run_ctas_test(src_cm, false, tgt_cm, false).await +} + +/// Verifies CTAS data roundtrip for all 27 non-trivial combinations of +/// source/target column-mapping mode (None/Name/Id) and clustering. Skips +/// the 9 cases where neither table is clustered (covered by +/// `test_ctas_column_mapping_combinations`).
+#[rstest::rstest] +#[tokio::test(flavor = "multi_thread")] +async fn test_ctas_clustering_and_column_mapping_combinations( + #[values( + ColumnMappingMode::None, + ColumnMappingMode::Name, + ColumnMappingMode::Id + )] + src_cm: ColumnMappingMode, + #[values(false, true)] src_clustered: bool, + #[values( + ColumnMappingMode::None, + ColumnMappingMode::Name, + ColumnMappingMode::Id + )] + tgt_cm: ColumnMappingMode, + #[values(false, true)] tgt_clustered: bool, +) -> Result<(), Box<dyn std::error::Error>> { + if !src_clustered && !tgt_clustered { + return Ok(()); + } + run_ctas_test(src_cm, src_clustered, tgt_cm, tgt_clustered).await +} diff --git a/kernel/tests/create_table/ict.rs b/kernel/tests/create_table/ict.rs new file mode 100644 index 0000000000..9b859612a9 --- /dev/null +++ b/kernel/tests/create_table/ict.rs @@ -0,0 +1,118 @@ +//! In-Commit Timestamp (ICT) integration tests for the CreateTable API. +//! +//! Tests that creating a table with `delta.enableInCommitTimestamps=true` automatically adds the +//! `inCommitTimestamp` feature to the protocol (writer-only) and that the snapshot exposes a +//! valid `inCommitTimestamp` value. + +use std::time::{SystemTime, UNIX_EPOCH}; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::{ + TableFeature, TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION, +}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::{DeltaResult, Engine}; +use test_utils::test_table_setup; + +/// Asserts the ICT protocol and enablement state of a snapshot, returning the ICT value. +fn assert_ict_state( + snapshot: &Snapshot, + engine: &dyn Engine, + expect_supported: bool, + expect_enabled: bool, + test_start_ms: i64, +) -> DeltaResult<Option<i64>> { + let table_config = snapshot.table_configuration(); + assert_eq!( + table_config.is_feature_supported(&TableFeature::InCommitTimestamp), + expect_supported, + ); + if expect_supported { + let protocol = table_config.protocol(); + assert!( + protocol.min_reader_version() >= TABLE_FEATURES_MIN_READER_VERSION, + "Reader version should be at least {TABLE_FEATURES_MIN_READER_VERSION}" + ); + assert!( + protocol.min_writer_version() >= TABLE_FEATURES_MIN_WRITER_VERSION, + "Writer version should be at least {TABLE_FEATURES_MIN_WRITER_VERSION}" + ); + assert!( + protocol + .writer_features() + .is_some_and(|f| f.contains(&TableFeature::InCommitTimestamp)), + "inCommitTimestamp should be in writer features" + ); + assert!( + !protocol + .reader_features() + .is_some_and(|f| f.contains(&TableFeature::InCommitTimestamp)), + "inCommitTimestamp should NOT be in reader features" + ); + } + + let ict = snapshot.get_in_commit_timestamp(engine)?; + if expect_enabled { + let ts = ict.expect("ICT should be present when enabled"); + assert!( + ts >= test_start_ms, + "inCommitTimestamp {ts} should be >= test start time {test_start_ms}" + ); + } else { + assert!(ict.is_none(), "ICT should be None when not enabled"); + } + Ok(ict) +} + +#[rstest::rstest] +#[case::ict_enabled(&[("delta.enableInCommitTimestamps", "true")], true, true)] +#[case::no_ict(&[], false, false)] +#[case::feature_signal_only(&[("delta.feature.inCommitTimestamp", "supported")], true, false)] +fn test_create_table_ict( + #[case] properties: &[(&str, &str)], + #[case] expect_ict_feature_supported: bool, + #[case] expect_ict_enabled: bool, +) -> DeltaResult<()> { + let test_start_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() as i64; + + let (_temp_dir, 
table_path, engine) = test_table_setup()?; + + let committed = create_table(&table_path, super::simple_schema()?, "Test/1.0") + .with_table_properties(properties.iter().copied()) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())? + .unwrap_committed(); + + // Verify via post-commit snapshot (reads ICT from in-memory CRC delta) + let post_snapshot = committed + .post_commit_snapshot() + .expect("should have snapshot"); + let post_ict = assert_ict_state( + post_snapshot, + engine.as_ref(), + expect_ict_feature_supported, + expect_ict_enabled, + test_start_ms, + )?; + + // Verify via fresh snapshot loaded from disk (reads ICT from commit JSON) + let disk_snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + let disk_ict = assert_ict_state( + &disk_snapshot, + engine.as_ref(), + expect_ict_feature_supported, + expect_ict_enabled, + test_start_ms, + )?; + + assert_eq!( + post_ict, disk_ict, + "post-commit and disk ICT values should match" + ); + + Ok(()) +} diff --git a/kernel/tests/create_table/main.rs b/kernel/tests/create_table/main.rs new file mode 100644 index 0000000000..7748fd60b6 --- /dev/null +++ b/kernel/tests/create_table/main.rs @@ -0,0 +1,481 @@ +//! Integration tests for the CreateTable API + +mod clustering; +mod column_mapping; +mod ctas; +mod ict; +mod partitioned; +mod timestamp_ntz; +mod variant; + +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::{ + TableFeature, TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION, +}; +use delta_kernel::table_properties::TableProperties; +use delta_kernel::transaction::create_table::{create_table, CreateTableTransaction}; +use delta_kernel::DeltaResult; +use rstest::rstest; +use serde_json::Value; +use test_utils::{assert_result_error_with_message, test_table_setup}; + +/// Helper to create a simple two-column schema for tests. +/// Shared with sub-modules. +pub(crate) fn simple_schema() -> DeltaResult<Arc<StructType>> { + Ok(Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("value", DataType::STRING, true), + ])?)) +} + +/// Helper to create a three-column schema for partition tests (id, date, value). +/// Shared with sub-modules. +pub(crate) fn partition_test_schema() -> DeltaResult<Arc<StructType>> { + Ok(Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("date", DataType::DATE, true), + StructField::new("value", DataType::STRING, true), + ])?)) +} + +#[tokio::test] +async fn test_create_simple_table() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create schema for an events table + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("event_id", DataType::LONG, false), + StructField::new("user_id", DataType::LONG, false), + StructField::new("event_type", DataType::STRING, false), + StructField::new("timestamp", DataType::TIMESTAMP, false), + StructField::new("properties", DataType::STRING, true), + ])?); + + // Create table using new API + let _ = create_table(&table_path, schema.clone(), "DeltaKernel-RS/0.17.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + // Verify table was created + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + assert_eq!(snapshot.version(), 0); + assert_eq!(snapshot.schema().fields().len(), 5); + + // Verify protocol versions via snapshot + let protocol = snapshot.table_configuration().protocol(); + assert_eq!( + protocol.min_reader_version(), + TABLE_FEATURES_MIN_READER_VERSION + ); + assert_eq!( + protocol.min_writer_version(), + TABLE_FEATURES_MIN_WRITER_VERSION + ); + // Verify no reader/writer features are set (empty for table features mode) + assert!(protocol.reader_features().is_some_and(|f| f.is_empty())); + assert!(protocol.writer_features().is_some_and(|f| f.is_empty())); + + // Verify no table properties are set via public API + assert_eq!(snapshot.table_properties(), &TableProperties::default()); + + // Verify schema field names + let field_names: Vec<_> = snapshot + .schema() + .fields() + .map(|f| f.name().to_string()) + .collect(); + assert!(field_names.contains(&"event_id".to_string())); + assert!(field_names.contains(&"user_id".to_string())); + assert!(field_names.contains(&"event_type".to_string())); + assert!(field_names.contains(&"timestamp".to_string())); + assert!(field_names.contains(&"properties".to_string())); + + Ok(()) +} + +#[tokio::test] +async fn test_create_table_with_user_domain_metadata() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = simple_schema()?; + + // Create table with domainMetadata feature enabled + let txn = create_table(&table_path, schema, "Test/1.0") + .with_table_properties([("delta.feature.domainMetadata", "supported")]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?; + + // Add user domain metadata during table creation + let domain = "app.settings"; + let config = r#"{"version": 1, "enabled": true}"#; + + let _ = txn + .with_domain_metadata(domain.to_string(), config.to_string()) + .commit(engine.as_ref())?; + + // Load snapshot and verify domain metadata was persisted + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + // Verify domainMetadata feature is enabled in protocol + assert!( + snapshot + .table_configuration() + .is_feature_supported(&TableFeature::DomainMetadata), + "DomainMetadata feature should be enabled" + ); + + // Verify domain metadata string was persisted correctly + let retrieved_config = snapshot.get_domain_metadata(domain, engine.as_ref())?; + assert_eq!( + retrieved_config, + Some(config.to_string()), + "Domain metadata should be persisted and retrievable" + ); + + // Parse and verify the JSON contents + let parsed: Value = serde_json::from_str(retrieved_config.as_ref().unwrap())?; + assert_eq!(parsed["version"], 1); + assert_eq!(parsed["enabled"], true); + + // Verify non-existent domain returns None + let missing = snapshot.get_domain_metadata("nonexistent.domain", engine.as_ref())?; + assert!(missing.is_none(), "Non-existent domain should return None"); + + Ok(()) +} + +#[tokio::test] +async fn test_create_table_already_exists() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create schema for a user profiles table + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("user_id", DataType::LONG, false), + StructField::new("username", DataType::STRING, false), + StructField::new("email", DataType::STRING, false), 
+ StructField::new("created_at", DataType::TIMESTAMP, false), + StructField::new("is_active", DataType::BOOLEAN, false), + ])?); + + // Create table first time + let _ = create_table(&table_path, schema.clone(), "UserManagementService/1.2.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + // Try to create again - should fail at build time (table already exists) + let result = create_table(&table_path, schema.clone(), "UserManagementService/1.2.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())); + + assert_result_error_with_message(result, "already exists"); + + Ok(()) +} + +#[tokio::test] +async fn test_create_table_empty_schema_not_supported() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create empty schema + let schema = Arc::new(StructType::try_new(vec![])?); + + // Try to create table with empty schema - should fail at build time + let result = create_table(&table_path, schema, "InvalidApp/0.1.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())); + + assert_result_error_with_message(result, "cannot be empty"); + + Ok(()) +} + +#[tokio::test] +async fn test_create_table_log_actions() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + // Create schema + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("user_id", DataType::LONG, false), + StructField::new("action", DataType::STRING, false), + ])?); + + let engine_info = "AuditService/2.1.0"; + + // Create table + let _ = create_table(&table_path, schema, engine_info) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + // Read the actual Delta log file + let log_file_path = format!("{table_path}/_delta_log/00000000000000000000.json"); + let log_contents = std::fs::read_to_string(&log_file_path).expect("Failed to read log file"); + + // Parse each line (each line is a separate JSON action) + let actions: Vec = log_contents + .lines() + .map(|line| serde_json::from_str(line).expect("Failed to parse JSON")) + .collect(); + + // Verify we have exactly 3 actions: CommitInfo, Protocol, Metadata + // CommitInfo is first to comply with ICT (In-Commit Timestamps) protocol requirements + assert_eq!( + actions.len(), + 3, + "Expected 3 actions (commitInfo, protocol, metaData), found {}", + actions.len() + ); + + // Verify CommitInfo action (first for ICT compliance) + let commit_info_action = &actions[0]; + assert!( + commit_info_action.get("commitInfo").is_some(), + "First action should be commitInfo" + ); + let commit_info = commit_info_action.get("commitInfo").unwrap(); + assert!( + commit_info.get("timestamp").is_some(), + "CommitInfo should have timestamp" + ); + assert!( + commit_info.get("engineInfo").is_some(), + "CommitInfo should have engineInfo" + ); + assert!( + commit_info.get("operation").is_some(), + "CommitInfo should have operation" + ); + assert_eq!( + commit_info["operation"], "CREATE TABLE", + "Operation should be CREATE TABLE" + ); + + // Verify Protocol action + let protocol_action = &actions[1]; + assert!( + protocol_action.get("protocol").is_some(), + "Second action should be protocol" + ); + let protocol = protocol_action.get("protocol").unwrap(); + assert_eq!( + protocol["minReaderVersion"], + TABLE_FEATURES_MIN_READER_VERSION + ); + assert_eq!( + protocol["minWriterVersion"], + TABLE_FEATURES_MIN_WRITER_VERSION + ); + + // Verify Metadata action + let metadata_action = &actions[2]; + assert!( + 
metadata_action.get("metaData").is_some(), + "Third action should be metaData" + ); + let metadata = metadata_action.get("metaData").unwrap(); + assert!(metadata.get("id").is_some(), "Metadata should have id"); + assert!( + metadata.get("schemaString").is_some(), + "Metadata should have schemaString" + ); + assert!( + metadata.get("createdTime").is_some(), + "Metadata should have createdTime" + ); + + // Additional CommitInfo verification (commit_info was already extracted from actions[0] above) + assert_eq!( + commit_info["engineInfo"], engine_info, + "CommitInfo should contain the engine info we provided" + ); + + assert!( + commit_info.get("txnId").is_some(), + "CommitInfo should have txnId" + ); + + // Verify kernelVersion is present + let kernel_version = commit_info.get("kernelVersion"); + assert!( + kernel_version.is_some(), + "CommitInfo should have kernelVersion" + ); + assert!( + kernel_version.unwrap().as_str().unwrap().starts_with("v"), + "Kernel version should start with 'v'" + ); + + Ok(()) +} + +/// Helper to create a `CreateTableTransaction` for tests. +fn create_test_create_table_txn() -> DeltaResult<( + Arc, + CreateTableTransaction, + tempfile::TempDir, +)> { + let (tempdir, table_path, engine) = test_table_setup()?; + let schema = Arc::new( + StructType::try_new(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("name", DataType::STRING), + ]) + .expect("valid schema"), + ); + let txn = create_table(&table_path, schema, "test_engine") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))?; + Ok((engine, txn, tempdir)) +} + +#[tokio::test] +async fn test_create_table_txn_debug() -> DeltaResult<()> { + let (_engine, txn, _tempdir) = create_test_create_table_txn()?; + let debug_str = format!("{txn:?}"); + assert!( + debug_str.contains("Transaction") && debug_str.contains("create_table"), + "Debug output should contain Transaction info: {debug_str}" + ); + Ok(()) +} + +#[rstest] +// ReaderWriter features (AlwaysIfSupported) +#[case("vacuumProtocolCheck", TableFeature::VacuumProtocolCheck, true, true)] +#[case("v2Checkpoint", TableFeature::V2Checkpoint, true, true)] +// ReaderWriter features (EnabledIf -- feature signal alone does not enable) +#[case("deletionVectors", TableFeature::DeletionVectors, true, false)] +#[case("typeWidening", TableFeature::TypeWidening, true, false)] +// WriterOnly features (EnabledIf -- feature signal alone does not enable) +#[case("appendOnly", TableFeature::AppendOnly, false, false)] +#[case("changeDataFeed", TableFeature::ChangeDataFeed, false, false)] +fn test_create_table_with_feature_signal( + #[case] feature_name: &str, + #[case] feature: TableFeature, + #[case] is_reader_writer: bool, + #[case] enabled_when_supported: bool, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let property_key = format!("delta.feature.{feature_name}"); + let _ = create_table(&table_path, simple_schema()?, "Test/1.0") + .with_table_properties([(property_key.as_str(), "supported")]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + let table_config = snapshot.table_configuration(); + + assert!( + table_config.is_feature_supported(&feature), + "{feature_name} should be supported" + ); + assert_eq!( + table_config.is_feature_enabled(&feature), + enabled_when_supported, + "{feature_name}: is_feature_enabled should be {enabled_when_supported}" + ); + let protocol = table_config.protocol(); + assert!( + protocol + .writer_features() + .is_some_and(|f| f.contains(&feature)), + "{feature_name} should be in writer features" + ); + if is_reader_writer { + assert!( + protocol + .reader_features() + .is_some_and(|f| f.contains(&feature)), + "{feature_name} should be in reader features" + ); + } + + Ok(()) +} + +#[rstest] +fn test_create_table_with_checkpoint_stats_properties( + #[values(true, false)] write_stats_as_json: bool, + #[values(true, false)] write_stats_as_struct: bool, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let json_val = write_stats_as_json.to_string(); + let struct_val = write_stats_as_struct.to_string(); + + let _ = create_table(&table_path, simple_schema()?, "Test/1.0") + .with_table_properties([ + ("delta.checkpoint.writeStatsAsJson", json_val.as_str()), + ("delta.checkpoint.writeStatsAsStruct", struct_val.as_str()), + ]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + let tp = snapshot.table_properties(); + assert_eq!(tp.checkpoint_write_stats_as_json, Some(write_stats_as_json)); + assert_eq!( + tp.checkpoint_write_stats_as_struct, + Some(write_stats_as_struct) + ); + + Ok(()) +} + +#[rstest] +// ReaderWriter features +#[case("delta.enableDeletionVectors", TableFeature::DeletionVectors, true)] +#[case("delta.enableTypeWidening", TableFeature::TypeWidening, true)] +// WriterOnly features +#[case("delta.enableChangeDataFeed", TableFeature::ChangeDataFeed, false)] +#[case("delta.appendOnly", TableFeature::AppendOnly, false)] +fn test_create_table_with_enablement_property( + #[case] property: &str, + #[case] feature: TableFeature, + #[case] is_reader_writer: bool, + #[values(true, false)] expect_enabled: bool, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + let value = expect_enabled.to_string(); + + let _ = create_table(&table_path, simple_schema()?, "Test/1.0") + .with_table_properties([(property, value.as_str())]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + let table_config = snapshot.table_configuration(); + + assert_eq!( + table_config.is_feature_supported(&feature), + expect_enabled, + "{property}={value}: feature supported should be {expect_enabled}" + ); + assert_eq!( + table_config.is_feature_enabled(&feature), + expect_enabled, + "{property}={value}: feature enabled should be {expect_enabled}" + ); + let protocol = table_config.protocol(); + assert_eq!( + protocol + .writer_features() + .is_some_and(|f| f.contains(&feature)), + expect_enabled, + "{property}={value}: in writer features should be {expect_enabled}" + ); + if is_reader_writer { + assert_eq!( + protocol + .reader_features() + .is_some_and(|f| f.contains(&feature)), + expect_enabled, + "{property}={value}: in reader features should be {expect_enabled}" + ); + } + + Ok(()) +} diff --git a/kernel/tests/create_table/partitioned.rs b/kernel/tests/create_table/partitioned.rs new file mode 100644 index 0000000000..c5e2820d3c --- /dev/null +++ b/kernel/tests/create_table/partitioned.rs @@ -0,0 +1,37 @@ +//! Partition integration tests for the CreateTable API. +//! +//! TODO(#2201): Add end-to-end tests for insert + scan + checkpoint on partitioned tables. + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::DeltaResult; +use test_utils::test_table_setup; + +use super::partition_test_schema; + +#[test] +fn test_create_table_partitioned_basic() -> DeltaResult<()> { + let schema = partition_test_schema()?; + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let _ = create_table(&table_path, schema, "Test/1.0") + .with_data_layout(DataLayout::partitioned(["date"])) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let snapshot = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert_eq!(snapshot.version(), 0); + + let partition_cols = snapshot.table_configuration().partition_columns(); + assert_eq!(partition_cols, &["date"]); + + let clustering = snapshot.get_physical_clustering_columns(engine.as_ref())?; + assert!( + clustering.is_none(), + "Partitioned table should not have clustering columns" + ); + + Ok(()) +} diff --git a/kernel/tests/create_table/timestamp_ntz.rs b/kernel/tests/create_table/timestamp_ntz.rs new file mode 100644 index 0000000000..9e97e50ac1 --- /dev/null +++ b/kernel/tests/create_table/timestamp_ntz.rs @@ -0,0 +1,140 @@ +//! TimestampNTZ integration tests for the CreateTable API. +//! +//! Tests that creating a table with TimestampNTZ columns in the schema automatically adds the +//! `timestampNtz` feature to the protocol, and that TimestampNTZ columns interact correctly +//! with other features (column mapping, variant). 
+ +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::{ + ColumnMappingMode, TableFeature, TABLE_FEATURES_MIN_READER_VERSION, + TABLE_FEATURES_MIN_WRITER_VERSION, +}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::DeltaResult; +use test_utils::{ + cm_properties, multiple_ntz_schema, nested_ntz_schema, test_table_setup, top_level_ntz_schema, +}; + +/// Asserts the snapshot's protocol includes timestampNtz with correct reader/writer versions. +fn assert_timestamp_ntz_protocol(snapshot: &Snapshot) { + let table_config = snapshot.table_configuration(); + assert!( + table_config.is_feature_supported(&TableFeature::TimestampWithoutTimezone), + "timestampNtz feature should be supported" + ); + let protocol = table_config.protocol(); + assert!( + protocol.min_reader_version() >= TABLE_FEATURES_MIN_READER_VERSION, + "Reader version should be at least {TABLE_FEATURES_MIN_READER_VERSION}" + ); + assert!( + protocol.min_writer_version() >= TABLE_FEATURES_MIN_WRITER_VERSION, + "Writer version should be at least {TABLE_FEATURES_MIN_WRITER_VERSION}" + ); +} + +/// TimestampNTZ schema auto-enables timestampNtz across schema shapes and column mapping modes. +#[rstest::rstest] +fn test_create_table_with_timestamp_ntz( + #[values(top_level_ntz_schema(), nested_ntz_schema(), multiple_ntz_schema())] schema: Arc< + StructType, + >, + #[values("none", "name", "id")] cm_mode: &str, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let _ = create_table(&table_path, schema.clone(), "Test/1.0") + .with_table_properties(cm_properties(cm_mode)) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + assert_timestamp_ntz_protocol(&snapshot); + + if cm_mode != "none" { + let table_config = snapshot.table_configuration(); + assert!( + table_config.is_feature_supported(&TableFeature::ColumnMapping), + "columnMapping feature should be supported when cm_mode={cm_mode}" + ); + let expected_mode = match cm_mode { + "name" => ColumnMappingMode::Name, + "id" => ColumnMappingMode::Id, + _ => unreachable!(), + }; + assert_eq!(table_config.column_mapping_mode(), expected_mode); + } + + let read_schema = snapshot.schema(); + let stripped = super::column_mapping::strip_column_mapping_metadata(&read_schema); + assert_eq!( + &stripped, + schema.as_ref(), + "Schema should round-trip through create table" + ); + + Ok(()) +} + +/// A schema without TimestampNTZ columns should not add the timestampNtz feature. +#[test] +fn test_create_table_no_timestamp_ntz_no_feature() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ])?); + + let _ = create_table(&table_path, schema, "Test/1.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? 
+ .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + let table_config = snapshot.table_configuration(); + assert!( + !table_config.is_feature_supported(&TableFeature::TimestampWithoutTimezone), + "timestampNtz feature should NOT be in protocol for non-NTZ schema" + ); + + Ok(()) +} + +/// A schema with both TimestampNTZ and Variant columns enables both features. +#[test] +fn test_create_table_timestamp_ntz_and_variant() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("ts", DataType::TIMESTAMP_NTZ, true), + StructField::new("v", DataType::unshredded_variant(), true), + ])); + + let _ = create_table(&table_path, schema, "Test/1.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + let table_config = snapshot.table_configuration(); + assert!( + table_config.is_feature_supported(&TableFeature::TimestampWithoutTimezone), + "timestampNtz feature should be supported" + ); + assert!( + table_config.is_feature_supported(&TableFeature::VariantType), + "variantType feature should be supported" + ); + + Ok(()) +} diff --git a/kernel/tests/create_table/variant.rs b/kernel/tests/create_table/variant.rs new file mode 100644 index 0000000000..f6bf9e408f --- /dev/null +++ b/kernel/tests/create_table/variant.rs @@ -0,0 +1,130 @@ +//! Variant type integration tests for the CreateTable API. +//! +//! Tests that creating a table with Variant columns in the schema automatically adds the +//! `variantType` feature to the protocol, and that Variant columns interact correctly +//! with other features (column mapping, clustering rejection). + +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::Snapshot; +use delta_kernel::table_features::{ + ColumnMappingMode, TableFeature, TABLE_FEATURES_MIN_READER_VERSION, + TABLE_FEATURES_MIN_WRITER_VERSION, +}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::data_layout::DataLayout; +use delta_kernel::DeltaResult; +use test_utils::{ + assert_result_error_with_message, cm_properties, multiple_variant_schema, + nested_variant_schema, test_table_setup, top_level_variant_schema, +}; + +/// Asserts the snapshot's protocol includes variantType with correct reader/writer versions. +fn assert_variant_protocol(snapshot: &Snapshot) { + let table_config = snapshot.table_configuration(); + assert!( + table_config.is_feature_supported(&TableFeature::VariantType), + "variantType feature should be supported" + ); + let protocol = table_config.protocol(); + assert!( + protocol.min_reader_version() >= TABLE_FEATURES_MIN_READER_VERSION, + "Reader version should be at least {TABLE_FEATURES_MIN_READER_VERSION}" + ); + assert!( + protocol.min_writer_version() >= TABLE_FEATURES_MIN_WRITER_VERSION, + "Writer version should be at least {TABLE_FEATURES_MIN_WRITER_VERSION}" + ); +} + +/// Variant schema auto-enables variantType across schema shapes and column mapping modes. 
+#[rstest::rstest] +fn test_create_table_with_variant( + #[values( + top_level_variant_schema(), + nested_variant_schema(), + multiple_variant_schema() + )] + schema: Arc<StructType>, + #[values("none", "name", "id")] cm_mode: &str, +) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let _ = create_table(&table_path, schema.clone(), "Test/1.0") + .with_table_properties(cm_properties(cm_mode)) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + assert_variant_protocol(&snapshot); + + if cm_mode != "none" { + let table_config = snapshot.table_configuration(); + assert!( + table_config.is_feature_supported(&TableFeature::ColumnMapping), + "columnMapping feature should be supported when cm_mode={cm_mode}" + ); + let expected_mode = match cm_mode { + "name" => ColumnMappingMode::Name, + "id" => ColumnMappingMode::Id, + _ => unreachable!(), + }; + assert_eq!(table_config.column_mapping_mode(), expected_mode); + } + + // Verify the schema round-trips correctly (strip CM metadata before comparing, + // since the read-back schema has physical names/IDs that the original doesn't). + let read_schema = snapshot.schema(); + let stripped = super::column_mapping::strip_column_mapping_metadata(&read_schema); + assert_eq!( + &stripped, + schema.as_ref(), + "Schema should round-trip through create table" + ); + + Ok(()) +} + +/// A schema without variant columns should not add the variantType feature. +#[test] +fn test_create_table_no_variant_no_feature() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let schema = Arc::new(StructType::try_new(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("name", DataType::STRING, true), + ])?); + + let _ = create_table(&table_path, schema, "Test/1.0") + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let table_url = delta_kernel::try_parse_uri(&table_path)?; + let snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?; + + let table_config = snapshot.table_configuration(); + assert!( + !table_config.is_feature_supported(&TableFeature::VariantType), + "variantType feature should NOT be in protocol for non-variant schema" + ); + + Ok(()) +} + +/// Clustering on a variant column is rejected per the Delta spec. 
+#[test] +fn test_create_table_variant_clustering_rejected() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup()?; + + let result = create_table(&table_path, top_level_variant_schema(), "Test/1.0") + .with_data_layout(DataLayout::clustered(["col"])) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())); + + assert_result_error_with_message(result, "unsupported type"); + + Ok(()) +} diff --git a/kernel/tests/data/300k-add-files-100-col-partitioned.tar.zst b/kernel/tests/data/300k-add-files-100-col-partitioned.tar.zst index a9f553b1ec..7bb551bc6e 100644 Binary files a/kernel/tests/data/300k-add-files-100-col-partitioned.tar.zst and b/kernel/tests/data/300k-add-files-100-col-partitioned.tar.zst differ diff --git a/kernel/tests/data/cdf-column-mapping-id-mode.tar.zst b/kernel/tests/data/cdf-column-mapping-id-mode.tar.zst new file mode 100644 index 0000000000..86cd855787 Binary files /dev/null and b/kernel/tests/data/cdf-column-mapping-id-mode.tar.zst differ diff --git a/kernel/tests/data/cdf-column-mapping-name-mode-3-7.tar.zst b/kernel/tests/data/cdf-column-mapping-name-mode-3-7.tar.zst new file mode 100644 index 0000000000..ce7bd9968a Binary files /dev/null and b/kernel/tests/data/cdf-column-mapping-name-mode-3-7.tar.zst differ diff --git a/kernel/tests/data/cdf-column-mapping-name-mode.tar.zst b/kernel/tests/data/cdf-column-mapping-name-mode.tar.zst new file mode 100644 index 0000000000..49eed50efe Binary files /dev/null and b/kernel/tests/data/cdf-column-mapping-name-mode.tar.zst differ diff --git a/kernel/tests/data/cdf-table-backtick-column-names.tar.zst b/kernel/tests/data/cdf-table-backtick-column-names.tar.zst index a72a65511d..322ed5552b 100644 Binary files a/kernel/tests/data/cdf-table-backtick-column-names.tar.zst and b/kernel/tests/data/cdf-table-backtick-column-names.tar.zst differ diff --git a/kernel/tests/data/cdf-table-data-change.tar.zst b/kernel/tests/data/cdf-table-data-change.tar.zst index 974f7587f0..77b5d6af46 100644 Binary files a/kernel/tests/data/cdf-table-data-change.tar.zst and b/kernel/tests/data/cdf-table-data-change.tar.zst differ diff --git a/kernel/tests/data/cdf-table-delete-conditional-all-rows.tar.zst b/kernel/tests/data/cdf-table-delete-conditional-all-rows.tar.zst index fe38e64b18..3f05bd315a 100644 Binary files a/kernel/tests/data/cdf-table-delete-conditional-all-rows.tar.zst and b/kernel/tests/data/cdf-table-delete-conditional-all-rows.tar.zst differ diff --git a/kernel/tests/data/cdf-table-delete-conditional-two-rows.tar.zst b/kernel/tests/data/cdf-table-delete-conditional-two-rows.tar.zst index e103719914..68c637e947 100644 Binary files a/kernel/tests/data/cdf-table-delete-conditional-two-rows.tar.zst and b/kernel/tests/data/cdf-table-delete-conditional-two-rows.tar.zst differ diff --git a/kernel/tests/data/cdf-table-delete-unconditional.tar.zst b/kernel/tests/data/cdf-table-delete-unconditional.tar.zst index 4e57df8be0..cd5fb3e9db 100644 Binary files a/kernel/tests/data/cdf-table-delete-unconditional.tar.zst and b/kernel/tests/data/cdf-table-delete-unconditional.tar.zst differ diff --git a/kernel/tests/data/cdf-table-non-partitioned.tar.zst b/kernel/tests/data/cdf-table-non-partitioned.tar.zst index f97e1ea8ac..8f69794aa8 100644 Binary files a/kernel/tests/data/cdf-table-non-partitioned.tar.zst and b/kernel/tests/data/cdf-table-non-partitioned.tar.zst differ diff --git a/kernel/tests/data/cdf-table-partitioned.tar.zst b/kernel/tests/data/cdf-table-partitioned.tar.zst index 9d47d85a69..ed3207bc33 100644 Binary 
files a/kernel/tests/data/cdf-table-partitioned.tar.zst and b/kernel/tests/data/cdf-table-partitioned.tar.zst differ diff --git a/kernel/tests/data/cdf-table-simple.tar.zst b/kernel/tests/data/cdf-table-simple.tar.zst index 0051f05f4f..4918900a8a 100644 Binary files a/kernel/tests/data/cdf-table-simple.tar.zst and b/kernel/tests/data/cdf-table-simple.tar.zst differ diff --git a/kernel/tests/data/cdf-table-update-ops.tar.zst b/kernel/tests/data/cdf-table-update-ops.tar.zst index 33134b22b1..3b72966216 100644 Binary files a/kernel/tests/data/cdf-table-update-ops.tar.zst and b/kernel/tests/data/cdf-table-update-ops.tar.zst differ diff --git a/kernel/tests/data/cdf-table-with-cdc-and-dvs.tar.zst b/kernel/tests/data/cdf-table-with-cdc-and-dvs.tar.zst index 110aa0e0e5..5e95ea8c55 100644 Binary files a/kernel/tests/data/cdf-table-with-cdc-and-dvs.tar.zst and b/kernel/tests/data/cdf-table-with-cdc-and-dvs.tar.zst differ diff --git a/kernel/tests/data/cdf-table-with-dv.tar.zst b/kernel/tests/data/cdf-table-with-dv.tar.zst index d3aa6574de..480e784882 100644 Binary files a/kernel/tests/data/cdf-table-with-dv.tar.zst and b/kernel/tests/data/cdf-table-with-dv.tar.zst differ diff --git a/kernel/tests/data/cdf-table.tar.zst b/kernel/tests/data/cdf-table.tar.zst index 9bdb37d44b..178cda8c24 100644 Binary files a/kernel/tests/data/cdf-table.tar.zst and b/kernel/tests/data/cdf-table.tar.zst differ diff --git a/kernel/tests/data/crc-full/_delta_log/00000000000000000000.crc b/kernel/tests/data/crc-full/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..16b4964a9d --- /dev/null +++ b/kernel/tests/data/crc-full/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"29ebf587-9705-4bb4-ac40-2f02324065c8","tableSizeBytes":5259,"numFiles":10,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"inCommitTimestampOpt":1694758257000,"setTransactions":[{"appId":"spark-app-1","version":42,"lastUpdated":1694758250000},{"appId":"streaming-job-abc","version":100,"lastUpdated":1694758255000}],"domainMetadata":[{"domain":"delta.clustering","configuration":"{\"clusteringColumns\":[[\"col1\"],[\"user\",\"address\",\"city\"]]}","removed":false},{"domain":"delta.rowTracking","configuration":"{\"rowIdHighWaterMark\":9}","removed":false},{"domain":"myApp.metadata","configuration":"{\"key\":\"value\"}","removed":false}],"metadata":{"id":"6ca3020b-3cd9-4048-82e3-1417a0abb98f","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.enableRowTracking":"true","delta.checkpoint.writeStatsAsJson":"false","delta.rowTracking.materializedRowCommitVersionColumnName":"_row-commit-version-col-2f60dcc1-9e36-4424-95e7-799b707e4ddb","delta.rowTracking.materializedRowIdColumnName":"_row-id-col-4cbc7924-f662-4db1-aa59-22c23f59eb5d"},"createdTime":1694758256009},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["domainMetadata","clustering","deletionVectors","rowTracking"]},"fileSizeHistogram":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,14
6800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[5259,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[10,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"part-00223-24d8cffb-245d-4027-87d6-940fcf593a60.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4},\"maxValues\":{\"id\":4},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000004","MIN_INSERTION_TIME":"1694758257000004","MAX_INSERTION_TIME":"1694758257000004","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":4,"defaultRowCommitVersion":0},{"path":"part-00268-365db28b-f856-49e6-a25f-b0211cf95d20.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5},\"maxValues\":{\"id\":5},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000005","MIN_INSERTION_TIME":"1694758257000005","MAX_INSERTION_TIME":"1694758257000005","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":5,"defaultRowCommitVersion":0},{"path":"part-00089-b466c656-9b4a-41d6-ab41-f02007d1658c.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1},\"maxValues\":{\"id\":1},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000001","MIN_INSERTION_TIME":"1694758257000001","MAX_INSERTION_TIME":"1694758257000001","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":1,"defaultRowCommitVersion":0},{"path":"part-00044-22c23f7f-2411-4d88-b78c-cebe430cdd47.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":0},\"maxValues\":{\"id\":0},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000000","MIN_INSERTION_TIME":"1694758257000000","MAX_INSERTION_TIME":"1694758257000000","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":0,"defaultRowCommitVersion":0},{"path":"part-00447-1755ad02-9b47-4287-8333-92cb01a5124b.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":9},\"maxValues\":{\"id\":9},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000009","MIN_INSERTION_TIME":"1694758257000009","MAX_INSERTION_TIME":"1694758257000009","OPTIMIZE_TARGET_SIZE":"268435456
"},"baseRowId":9,"defaultRowCommitVersion":0},{"path":"part-00134-34f9b771-c60a-4bd4-bdc0-cd25fcc951c6.c000.snappy.parquet","partitionValues":{},"size":525,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2},\"maxValues\":{\"id\":2},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000002","MIN_INSERTION_TIME":"1694758257000002","MAX_INSERTION_TIME":"1694758257000002","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":2,"defaultRowCommitVersion":0},{"path":"part-00358-5937ec73-64a5-44dd-a793-922e30c1b9df.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":7},\"maxValues\":{\"id\":7},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000007","MIN_INSERTION_TIME":"1694758257000007","MAX_INSERTION_TIME":"1694758257000007","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":7,"defaultRowCommitVersion":0},{"path":"part-00313-c528546e-c8ab-425d-b49a-5afe731aaac8.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":6},\"maxValues\":{\"id\":6},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000006","MIN_INSERTION_TIME":"1694758257000006","MAX_INSERTION_TIME":"1694758257000006","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":6,"defaultRowCommitVersion":0},{"path":"part-00179-76f56874-b389-409b-8a2d-18462928840e.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3},\"maxValues\":{\"id\":3},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000003","MIN_INSERTION_TIME":"1694758257000003","MAX_INSERTION_TIME":"1694758257000003","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":3,"defaultRowCommitVersion":0},{"path":"part-00403-6af19469-0fc5-4809-b02a-ddebda3966e8.c000.snappy.parquet","partitionValues":{},"size":526,"modificationTime":1694758257000,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":8},\"maxValues\":{\"id\":8},\"nullCount\":{\"id\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1694758257000008","MIN_INSERTION_TIME":"1694758257000008","MAX_INSERTION_TIME":"1694758257000008","OPTIMIZE_TARGET_SIZE":"268435456"},"baseRowId":8,"defaultRowCommitVersion":0}]} diff --git a/kernel/tests/data/crc-full/_delta_log/00000000000000000000.json b/kernel/tests/data/crc-full/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..0f8cda9461 --- /dev/null +++ b/kernel/tests/data/crc-full/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1694758257000,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"inCommitTimestamp":1694758257000,"engineInfo":"Databricks-Runtime/","txnId":"29ebf587-9705-4bb4-ac40-2f02324065c8"}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["domainMetadata","clustering","deletionVectors","rowTracking"]}} 
+{"metaData":{"id":"6ca3020b-3cd9-4048-82e3-1417a0abb98f","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.enableRowTracking":"true","delta.checkpoint.writeStatsAsJson":"false","delta.rowTracking.materializedRowCommitVersionColumnName":"_row-commit-version-col-2f60dcc1-9e36-4424-95e7-799b707e4ddb","delta.rowTracking.materializedRowIdColumnName":"_row-id-col-4cbc7924-f662-4db1-aa59-22c23f59eb5d"},"createdTime":1694758256009}} diff --git a/kernel/tests/data/crc-malformed/_delta_log/00000000000000000000.crc b/kernel/tests/data/crc-malformed/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..eca77f3543 --- /dev/null +++ b/kernel/tests/data/crc-malformed/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +malformed \ No newline at end of file diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000000.json b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..ff3e8a8709 --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1767019603960,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.checkpointInterval\":\"2\",\"delta.writePartitionColumnsToParquet\":\"true\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"ca257341-0877-4bb4-bf7a-a67daefc7e02"}} +{"metaData":{"id":"7da8fa1d-08be-4954-ad6b-5f647d1302e1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"i\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"2","delta.writePartitionColumnsToParquet":"true"},"createdTime":1767019603861}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000001.json b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..b71ca34e35 --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1767019610022,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"3","numOutputBytes":"451"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"634908bb-e4a0-4de6-9bb0-d75d4d11d703"}} 
+{"add":{"path":"part-00000-821daf99-ba26-4b10-b43e-f46e1055226e.c000.snappy.parquet","partitionValues":{},"size":451,"modificationTime":1767019609941,"dataChange":true,"stats":"{\"numRecords\":3,\"minValues\":{\"i\":1},\"maxValues\":{\"i\":3},\"nullCount\":{\"i\":0}}","tags":{"INSERTION_TIME":"1767019609941000","MIN_INSERTION_TIME":"1767019609941000","MAX_INSERTION_TIME":"1767019609941000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000002.checkpoint.parquet b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000002.checkpoint.parquet new file mode 100644 index 0000000000..ff21a45957 Binary files /dev/null and b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000002.checkpoint.parquet differ diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000002.json b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..2f5343f685 --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1767019615255,"operation":"ADD COLUMNS","operationParameters":{"columns":"[{\"column\":{\"name\":\"j\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}}]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/","txnId":"0ff03221-6134-4724-80bf-b5d6dde2b824"}} +{"metaData":{"id":"7da8fa1d-08be-4954-ad6b-5f647d1302e1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"i\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"j\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"2","delta.writePartitionColumnsToParquet":"true"},"createdTime":1767019603861}} diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000003.json b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..e320fcf50b --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1767019637425,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"2","numOutputBytes":"610"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"4824ac5f-0ceb-4d02-b857-9e3f3dfd48c0"}} +{"add":{"path":"part-00000-e49f0b9c-2e97-4649-bc6f-0d30a2bde389.c000.snappy.parquet","partitionValues":{},"size":610,"modificationTime":1767019637405,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"i\":4,\"j\":40},\"maxValues\":{\"i\":5,\"j\":50},\"nullCount\":{\"i\":0,\"j\":0}}","tags":{"INSERTION_TIME":"1767019637405000","MIN_INSERTION_TIME":"1767019637405000","MAX_INSERTION_TIME":"1767019637405000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000004.json b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000004.json new file mode 100644 index 
0000000000..418cdaaa55 --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1767019644847,"operation":"ADD COLUMNS","operationParameters":{"columns":"[{\"column\":{\"name\":\"k\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}}]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/","txnId":"e2febcb8-238a-42f5-923d-3b189cb6f264"}} +{"metaData":{"id":"7da8fa1d-08be-4954-ad6b-5f647d1302e1","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"i\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"j\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"k\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"2","delta.writePartitionColumnsToParquet":"true"},"createdTime":1767019603861}} diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000005.json b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..400acb3e4f --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1767019651268,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"2","numOutputBytes":"772"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"2a9a73b1-cb54-4f68-a33d-8dcfb40e79c6"}} +{"add":{"path":"part-00000-cc5488a7-9a18-48bd-8721-a331aa430c54.c000.snappy.parquet","partitionValues":{},"size":772,"modificationTime":1767019651252,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"i\":6,\"j\":60,\"k\":600},\"maxValues\":{\"i\":7,\"j\":70,\"k\":700},\"nullCount\":{\"i\":0,\"j\":0,\"k\":0}}","tags":{"INSERTION_TIME":"1767019651252000","MIN_INSERTION_TIME":"1767019651252000","MAX_INSERTION_TIME":"1767019651252000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/external-table-different-nullability/_delta_log/_last_checkpoint b/kernel/tests/data/external-table-different-nullability/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..9ab71a852e --- /dev/null +++ b/kernel/tests/data/external-table-different-nullability/_delta_log/_last_checkpoint @@ -0,0 +1 @@ 
+{"version":2,"size":3,"sizeInBytes":19831,"numOfAddFiles":1,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}},{"name":"stats","type":"string","nullable":true,"metadata":{}},{"name":"stats_parsed","type":{"type":"struct","fields":[{"name":"numRecords","type":"long","nullable":true,"metadata":{}},{"name":"minValues","type":{"type":"struct","fields":[{"name":"i","type":"integer","nullable":true,"metadata":{}},{"name":"j","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"maxValues","type":{"type":"struct","fields":[{"name":"i","type":"integer","nullable":true,"metadata":{}},{"name":"j","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"nullCount","type":{"type":"struct","fields":[{"name":"i","type":"long","nullable":true,"metadata":{}},{"name":"j","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"meta
data":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"checkpointFiles":[{"path":"00000000000000000002.checkpoint.parquet","length":19831,"isDir":false,"modificationTime":1767020727284}],"checksum":"6cd376e671dd9cd6faf49822401ad0c5"} diff --git a/kernel/tests/data/external-table-different-nullability/part-00000-821daf99-ba26-4b10-b43e-f46e1055226e.c000.snappy.parquet b/kernel/tests/data/external-table-different-nullability/part-00000-821daf99-ba26-4b10-b43e-f46e1055226e.c000.snappy.parquet new file mode 100644 index 0000000000..e31ce60a19 Binary files /dev/null and b/kernel/tests/data/external-table-different-nullability/part-00000-821daf99-ba26-4b10-b43e-f46e1055226e.c000.snappy.parquet differ diff --git a/kernel/tests/data/external-table-different-nullability/part-00000-cc5488a7-9a18-48bd-8721-a331aa430c54.c000.snappy.parquet b/kernel/tests/data/external-table-different-nullability/part-00000-cc5488a7-9a18-48bd-8721-a331aa430c54.c000.snappy.parquet new file mode 100644 index 0000000000..d842061b11 Binary files /dev/null and b/kernel/tests/data/external-table-different-nullability/part-00000-cc5488a7-9a18-48bd-8721-a331aa430c54.c000.snappy.parquet differ diff --git a/kernel/tests/data/external-table-different-nullability/part-00000-e49f0b9c-2e97-4649-bc6f-0d30a2bde389.c000.snappy.parquet b/kernel/tests/data/external-table-different-nullability/part-00000-e49f0b9c-2e97-4649-bc6f-0d30a2bde389.c000.snappy.parquet new file mode 100644 index 0000000000..5de52a8a1e Binary files /dev/null and b/kernel/tests/data/external-table-different-nullability/part-00000-e49f0b9c-2e97-4649-bc6f-0d30a2bde389.c000.snappy.parquet differ 
diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000000.json b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..b640c269d2 --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"07b3a271-f0d6-4c3b-a300-3170847ceafe","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"salary\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ts_col\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"3"},"createdTime":1765928274753}} +{"add":{"path":"part-00000-06d85a38-b141-479b-a315-4157335e9a11-c000.snappy.parquet","partitionValues":{},"size":3103,"modificationTime":1765928276060,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"id\":1,\"name\":\"name_1\",\"age\":20,\"salary\":50100,\"ts_col\":\"1970-01-01T00:00:01.000Z\"},\"maxValues\":{\"id\":100,\"name\":\"name_99\",\"age\":69,\"salary\":60000,\"ts_col\":\"1970-01-01T00:00:02.000Z\"},\"nullCount\":{\"id\":0,\"name\":0,\"age\":0,\"salary\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1765928276979,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"3103"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.1.0","txnId":"e3e4034b-7a93-4d2b-9de4-3fda01735de4"}} diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000001.json b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..602885ea22 --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-065eae2b-b4ea-4708-bb30-0888f35cabdd-c000.snappy.parquet","partitionValues":{},"size":2826,"modificationTime":1765928279084,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"id\":101,\"name\":\"name_101\",\"age\":20,\"salary\":60100,\"ts_col\":\"1970-01-01T00:00:03.000Z\"},\"maxValues\":{\"id\":200,\"name\":\"name_200\",\"age\":69,\"salary\":70000,\"ts_col\":\"1970-01-01T00:00:04.000Z\"},\"nullCount\":{\"id\":0,\"name\":0,\"age\":0,\"salary\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1765928279091,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"2826"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.1.0","txnId":"6b5f28e4-a9cb-4c53-8b29-99e13d48a280"}} diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000002.json b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..25620d532c --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ 
+{"add":{"path":"part-00000-a4c1def5-742e-4248-8c58-fc9f4018e43d-c000.snappy.parquet","partitionValues":{},"size":2828,"modificationTime":1765928279752,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"id\":201,\"name\":\"name_201\",\"age\":20,\"salary\":70100,\"ts_col\":\"1970-01-01T00:00:05.000Z\"},\"maxValues\":{\"id\":300,\"name\":\"name_300\",\"age\":69,\"salary\":80000,\"ts_col\":\"1970-01-01T00:00:06.000Z\"},\"nullCount\":{\"id\":0,\"name\":0,\"age\":0,\"salary\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1765928279759,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"2828"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.1.0","txnId":"23695a3f-264a-427d-a3f1-8e0eda4016f5"}} diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000003.checkpoint.parquet b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000003.checkpoint.parquet new file mode 100644 index 0000000000..bde063d257 Binary files /dev/null and b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000003.checkpoint.parquet differ diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000003.json b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..7b284aaf83 --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-2d9663e0-37c0-425e-98df-2e7141f9b5fb-c000.snappy.parquet","partitionValues":{},"size":2827,"modificationTime":1765928280364,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"id\":301,\"name\":\"name_301\",\"age\":20,\"salary\":80100,\"ts_col\":\"1970-01-01T00:00:07.000Z\"},\"maxValues\":{\"id\":400,\"name\":\"name_400\",\"age\":69,\"salary\":90000,\"ts_col\":\"1970-01-01T00:00:08.000Z\"},\"nullCount\":{\"id\":0,\"name\":0,\"age\":0,\"salary\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1765928280369,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":2,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"2827"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.1.0","txnId":"88155a70-c671-4065-9a07-6e1eb3bbad68"}} diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000004.json b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..8073aaecf6 --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-40525115-50e1-4475-aae1-c8edc59274e6-c000.snappy.parquet","partitionValues":{},"size":2824,"modificationTime":1765928281356,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"id\":401,\"name\":\"name_401\",\"age\":20,\"salary\":90100,\"ts_col\":\"1970-01-01T00:00:09.000Z\"},\"maxValues\":{\"id\":500,\"name\":\"name_500\",\"age\":69,\"salary\":100000,\"ts_col\":\"1970-01-01T00:00:10.000Z\"},\"nullCount\":{\"id\":0,\"name\":0,\"age\":0,\"salary\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1765928281364,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":3,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"2824"},"engineInfo":"Apache-Spark/3.2.4 
Delta-Lake/2.1.0","txnId":"53ed470b-13c7-437f-829d-9a6704a285a0"}} diff --git a/kernel/tests/data/parsed-stats/_delta_log/00000000000000000005.json b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..5a7fc3cd51 --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-c0cbdedc-d11b-4e4c-b6f2-5f40c55ef515-c000.snappy.parquet","partitionValues":{},"size":2828,"modificationTime":1765928282040,"dataChange":true,"stats":"{\"numRecords\":100,\"minValues\":{\"id\":501,\"name\":\"name_501\",\"age\":20,\"salary\":100100,\"ts_col\":\"1970-01-01T00:00:11.000Z\"},\"maxValues\":{\"id\":600,\"name\":\"name_600\",\"age\":69,\"salary\":110000,\"ts_col\":\"1970-01-01T00:00:12.000Z\"},\"nullCount\":{\"id\":0,\"name\":0,\"age\":0,\"salary\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1765928282047,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":4,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"100","numOutputBytes":"2828"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.1.0","txnId":"e0681515-134d-427b-b265-da87d06ec8c3"}} diff --git a/kernel/tests/data/parsed-stats/_delta_log/_last_checkpoint b/kernel/tests/data/parsed-stats/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..96b858e325 --- /dev/null +++ b/kernel/tests/data/parsed-stats/_delta_log/_last_checkpoint @@ -0,0 +1 @@ +{"version":3,"size":6,"sizeInBytes":22972,"numOfAddFiles":4} diff --git a/kernel/tests/data/parsed-stats/part-00000-065eae2b-b4ea-4708-bb30-0888f35cabdd-c000.snappy.parquet b/kernel/tests/data/parsed-stats/part-00000-065eae2b-b4ea-4708-bb30-0888f35cabdd-c000.snappy.parquet new file mode 100644 index 0000000000..b47546b961 Binary files /dev/null and b/kernel/tests/data/parsed-stats/part-00000-065eae2b-b4ea-4708-bb30-0888f35cabdd-c000.snappy.parquet differ diff --git a/kernel/tests/data/parsed-stats/part-00000-06d85a38-b141-479b-a315-4157335e9a11-c000.snappy.parquet b/kernel/tests/data/parsed-stats/part-00000-06d85a38-b141-479b-a315-4157335e9a11-c000.snappy.parquet new file mode 100644 index 0000000000..cc425f7519 Binary files /dev/null and b/kernel/tests/data/parsed-stats/part-00000-06d85a38-b141-479b-a315-4157335e9a11-c000.snappy.parquet differ diff --git a/kernel/tests/data/parsed-stats/part-00000-2d9663e0-37c0-425e-98df-2e7141f9b5fb-c000.snappy.parquet b/kernel/tests/data/parsed-stats/part-00000-2d9663e0-37c0-425e-98df-2e7141f9b5fb-c000.snappy.parquet new file mode 100644 index 0000000000..e25b8a16c1 Binary files /dev/null and b/kernel/tests/data/parsed-stats/part-00000-2d9663e0-37c0-425e-98df-2e7141f9b5fb-c000.snappy.parquet differ diff --git a/kernel/tests/data/parsed-stats/part-00000-40525115-50e1-4475-aae1-c8edc59274e6-c000.snappy.parquet b/kernel/tests/data/parsed-stats/part-00000-40525115-50e1-4475-aae1-c8edc59274e6-c000.snappy.parquet new file mode 100644 index 0000000000..d7ac169ee0 Binary files /dev/null and b/kernel/tests/data/parsed-stats/part-00000-40525115-50e1-4475-aae1-c8edc59274e6-c000.snappy.parquet differ diff --git a/kernel/tests/data/parsed-stats/part-00000-a4c1def5-742e-4248-8c58-fc9f4018e43d-c000.snappy.parquet b/kernel/tests/data/parsed-stats/part-00000-a4c1def5-742e-4248-8c58-fc9f4018e43d-c000.snappy.parquet new file mode 100644 index 0000000000..5199afdc9f Binary files /dev/null and 
b/kernel/tests/data/parsed-stats/part-00000-a4c1def5-742e-4248-8c58-fc9f4018e43d-c000.snappy.parquet differ diff --git a/kernel/tests/data/parsed-stats/part-00000-c0cbdedc-d11b-4e4c-b6f2-5f40c55ef515-c000.snappy.parquet b/kernel/tests/data/parsed-stats/part-00000-c0cbdedc-d11b-4e4c-b6f2-5f40c55ef515-c000.snappy.parquet new file mode 100644 index 0000000000..4cdf6bfc7e Binary files /dev/null and b/kernel/tests/data/parsed-stats/part-00000-c0cbdedc-d11b-4e4c-b6f2-5f40c55ef515-c000.snappy.parquet differ diff --git a/kernel/tests/data/partition_cm/id/_delta_log/00000000000000000000.json b/kernel/tests/data/partition_cm/id/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..4c5007cf92 --- /dev/null +++ b/kernel/tests/data/partition_cm/id/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["columnMapping"],"writerFeatures":["columnMapping"]}} +{"metaData":{"id":"test-partitioned-cm-id","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-e0571b78-2d49-4109-9add-f2c54d6ea29e\"}},{\"name\":\"category\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-6dc68f07-711d-4f00-8bd6-1f5bc698e8ad\"}}]}","partitionColumns":["category"],"configuration":{"delta.columnMapping.mode":"id","delta.columnMapping.maxColumnId":"2"},"createdTime":1677811175819}} + diff --git a/kernel/tests/data/partition_cm/name/_delta_log/00000000000000000000.json b/kernel/tests/data/partition_cm/name/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..fa7696e5d4 --- /dev/null +++ b/kernel/tests/data/partition_cm/name/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["columnMapping"],"writerFeatures":["columnMapping"]}} +{"metaData":{"id":"test-partitioned-cm-name","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-e0571b78-2d49-4109-9add-f2c54d6ea29e\"}},{\"name\":\"category\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-6dc68f07-711d-4f00-8bd6-1f5bc698e8ad\"}}]}","partitionColumns":["category"],"configuration":{"delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"2"},"createdTime":1677811175819}} + diff --git a/kernel/tests/data/partition_cm/none/_delta_log/00000000000000000000.json b/kernel/tests/data/partition_cm/none/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..e826e2608a --- /dev/null +++ b/kernel/tests/data/partition_cm/none/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":1}} +{"metaData":{"id":"test-partitioned-cm-none","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"category\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["category"],"configuration":{"delta.columnMapping.mode":"none"},"createdTime":1677811175819}} + diff --git 
a/kernel/tests/data/partitioned_with_materialize_feature/_delta_log/00000000000000000000.json b/kernel/tests/data/partitioned_with_materialize_feature/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..f38e273a05 --- /dev/null +++ b/kernel/tests/data/partitioned_with_materialize_feature/_delta_log/00000000000000000000.json @@ -0,0 +1,6 @@ +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":[],"writerFeatures":["materializePartitionColumns"]}} +{"metaData":{"id":"ced0baf6-aa13-4871-af26-91e6e2787052","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"letter\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"number\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a_float\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["letter"],"configuration":{},"createdTime":1674611426764}} +{"add":{"path":"letter=a/part-00000-a08d296a-d2c5-4a99-bea9-afcea42ba2e9.c000.snappy.parquet","partitionValues":{"letter":"a"},"size":751,"modificationTime":1674611427093,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"number\":1,\"a_float\":1.1},\"maxValues\":{\"number\":1,\"a_float\":1.1},\"nullCount\":{\"number\":0,\"a_float\":0}}"}} +{"add":{"path":"letter=b/part-00000-41954fb0-ef91-47e5-bd41-b75169c41c17.c000.snappy.parquet","partitionValues":{"letter":"b"},"size":751,"modificationTime":1674611427109,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"number\":2,\"a_float\":2.2},\"maxValues\":{\"number\":2,\"a_float\":2.2},\"nullCount\":{\"number\":0,\"a_float\":0}}"}} +{"add":{"path":"letter=c/part-00000-27a17b8f-be68-485c-9c49-70c742be30c0.c000.snappy.parquet","partitionValues":{"letter":"c"},"size":751,"modificationTime":1674611427117,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"number\":3,\"a_float\":3.3},\"maxValues\":{\"number\":3,\"a_float\":3.3},\"nullCount\":{\"number\":0,\"a_float\":0}}"}} +{"commitInfo":{"timestamp":1674611427131,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[\"letter\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"2253"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"6814b5cf-8e72-4add-83a3-3f79dda94dd6"}} diff --git a/kernel/tests/data/partitioned_with_materialize_feature/_delta_log/00000000000000000001.json b/kernel/tests/data/partitioned_with_materialize_feature/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..6203fcf74b --- /dev/null +++ b/kernel/tests/data/partitioned_with_materialize_feature/_delta_log/00000000000000000001.json @@ -0,0 +1,4 @@ +{"add":{"path":"letter=__HIVE_DEFAULT_PARTITION__/part-00000-8eb7f29a-e6a1-436e-a638-bbf0a7953f09.c000.snappy.parquet","partitionValues":{"letter":null},"size":751,"modificationTime":1674611429929,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"number\":6,\"a_float\":6.6},\"maxValues\":{\"number\":6,\"a_float\":6.6},\"nullCount\":{\"number\":0,\"a_float\":0}}"}} +{"add":{"path":"letter=a/part-00000-0dbe0cc5-e3bf-4fb0-b36a-b5fdd67fe843.c000.snappy.parquet","partitionValues":{"letter":"a"},"size":751,"modificationTime":1674611429937,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"number\":4,\"a_float\":4.4},\"maxValues\":{\"number\":4,\"a_float\":4.4},\"nullCount\":{\"number\":0,\"a_float\":0}}"}} 
+{"add":{"path":"letter=e/part-00000-847cf2d1-1247-4aa0-89ef-2f90c68ea51e.c000.snappy.parquet","partitionValues":{"letter":"e"},"size":750,"modificationTime":1674611429949,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"number\":5,\"a_float\":5.5},\"maxValues\":{\"number\":5,\"a_float\":5.5},\"nullCount\":{\"number\":0,\"a_float\":0}}"}} +{"commitInfo":{"timestamp":1674611429957,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"letter\"]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"2252"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"c6d7e8e6-bc66-46fb-b44a-58e0d47414f5"}} diff --git a/kernel/tests/data/partitioned_with_materialize_feature/letter=__HIVE_DEFAULT_PARTITION__/part-00000-8eb7f29a-e6a1-436e-a638-bbf0a7953f09.c000.snappy.parquet b/kernel/tests/data/partitioned_with_materialize_feature/letter=__HIVE_DEFAULT_PARTITION__/part-00000-8eb7f29a-e6a1-436e-a638-bbf0a7953f09.c000.snappy.parquet new file mode 100644 index 0000000000..d038761ab4 Binary files /dev/null and b/kernel/tests/data/partitioned_with_materialize_feature/letter=__HIVE_DEFAULT_PARTITION__/part-00000-8eb7f29a-e6a1-436e-a638-bbf0a7953f09.c000.snappy.parquet differ diff --git a/kernel/tests/data/partitioned_with_materialize_feature/letter=a/part-00000-0dbe0cc5-e3bf-4fb0-b36a-b5fdd67fe843.c000.snappy.parquet b/kernel/tests/data/partitioned_with_materialize_feature/letter=a/part-00000-0dbe0cc5-e3bf-4fb0-b36a-b5fdd67fe843.c000.snappy.parquet new file mode 100644 index 0000000000..d8ca80642c Binary files /dev/null and b/kernel/tests/data/partitioned_with_materialize_feature/letter=a/part-00000-0dbe0cc5-e3bf-4fb0-b36a-b5fdd67fe843.c000.snappy.parquet differ diff --git a/kernel/tests/data/partitioned_with_materialize_feature/letter=a/part-00000-a08d296a-d2c5-4a99-bea9-afcea42ba2e9.c000.snappy.parquet b/kernel/tests/data/partitioned_with_materialize_feature/letter=a/part-00000-a08d296a-d2c5-4a99-bea9-afcea42ba2e9.c000.snappy.parquet new file mode 100644 index 0000000000..d2fcbc5fb6 Binary files /dev/null and b/kernel/tests/data/partitioned_with_materialize_feature/letter=a/part-00000-a08d296a-d2c5-4a99-bea9-afcea42ba2e9.c000.snappy.parquet differ diff --git a/kernel/tests/data/partitioned_with_materialize_feature/letter=b/part-00000-41954fb0-ef91-47e5-bd41-b75169c41c17.c000.snappy.parquet b/kernel/tests/data/partitioned_with_materialize_feature/letter=b/part-00000-41954fb0-ef91-47e5-bd41-b75169c41c17.c000.snappy.parquet new file mode 100644 index 0000000000..b3a38186a9 Binary files /dev/null and b/kernel/tests/data/partitioned_with_materialize_feature/letter=b/part-00000-41954fb0-ef91-47e5-bd41-b75169c41c17.c000.snappy.parquet differ diff --git a/kernel/tests/data/partitioned_with_materialize_feature/letter=c/part-00000-27a17b8f-be68-485c-9c49-70c742be30c0.c000.snappy.parquet b/kernel/tests/data/partitioned_with_materialize_feature/letter=c/part-00000-27a17b8f-be68-485c-9c49-70c742be30c0.c000.snappy.parquet new file mode 100644 index 0000000000..7831b3e6fb Binary files /dev/null and b/kernel/tests/data/partitioned_with_materialize_feature/letter=c/part-00000-27a17b8f-be68-485c-9c49-70c742be30c0.c000.snappy.parquet differ diff --git a/kernel/tests/data/partitioned_with_materialize_feature/letter=e/part-00000-847cf2d1-1247-4aa0-89ef-2f90c68ea51e.c000.snappy.parquet 
b/kernel/tests/data/partitioned_with_materialize_feature/letter=e/part-00000-847cf2d1-1247-4aa0-89ef-2f90c68ea51e.c000.snappy.parquet new file mode 100644 index 0000000000..e6de68f0f1 Binary files /dev/null and b/kernel/tests/data/partitioned_with_materialize_feature/letter=e/part-00000-847cf2d1-1247-4aa0-89ef-2f90c68ea51e.c000.snappy.parquet differ diff --git a/kernel/tests/data/stats-writing-all-types/README.md b/kernel/tests/data/stats-writing-all-types/README.md new file mode 100644 index 0000000000..126a5adc5f --- /dev/null +++ b/kernel/tests/data/stats-writing-all-types/README.md @@ -0,0 +1,38 @@ +# stats-writing-all-types + +Golden table for validating that kernel's `collect_stats()` produces file statistics +matching Spark's output. + +## Table details + +- **Engine**: Apache Spark 3.5.8 / Delta Lake 3.3.2 +- **Generated by**: PySpark script +- **Reader/writer version**: reader v3, writer v7 +- **Table features**: `columnMapping` (name mode), `timestampNtz` + +## Schema + +| Column | Type | +|--------------------|---------------------------------------------| +| byte_col | byte | +| short_col | short | +| int_col | integer | +| long_col | long | +| float_col | float | +| double_col | double | +| date_col | date | +| timestamp_col | timestamp | +| timestamp_ntz_col | timestamp_ntz | +| string_col | string | +| decimal_col | decimal(10,2) | +| boolean_col | boolean | +| binary_col | binary | +| array_col | array\<integer\> | +| map_col | map\<string, integer\> | +| nested_struct | struct\<inner_int: integer, inner_string: string, inner_double: double\> | + +## Contents + +- **Commit 0**: Table creation (empty, 0 rows) +- **Commit 1**: Append of 4 rows with 1 null per column, covering all stat-eligible types + plus complex types (array, map) that have only nullCount stats diff --git a/kernel/tests/data/stats-writing-all-types/delta/_delta_log/00000000000000000000.json b/kernel/tests/data/stats-writing-all-types/delta/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..02dddcd8cf --- /dev/null +++ b/kernel/tests/data/stats-writing-all-types/delta/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1770328317561,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"0","numOutputBytes":"5772"},"engineInfo":"Apache-Spark/3.5.8 Delta-Lake/3.3.2","txnId":"65a2b5c0-dda8-4ab0-9d23-8775e36da1a0"}}
+{"metaData":{"id":"d135b592-f742-44c3-b5f4-a99ddc46ea18","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"byte_col\",\"type\":\"byte\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":1,\"delta.columnMapping.physicalName\":\"col-042fadad-0f76-466d-a94d-e1159c2a9ed6\"}},{\"name\":\"short_col\",\"type\":\"short\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":2,\"delta.columnMapping.physicalName\":\"col-6dc68f07-711d-4f00-8bd6-1f5bc698e8ad\"}},{\"name\":\"int_col\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":3,\"delta.columnMapping.physicalName\":\"col-e0571b78-2d49-4109-9add-f2c54d6ea29e\"}},{\"name\":\"long_col\",\"type\":\"long\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":4,\"delta.columnMapping.physicalName\":\"col-6c805ddb-539c-4ed3-bada-ac7a2082c889\"}},{\"name\":\"float_col\",\"type\":\"float\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":5,\"delta.columnMapping.physicalName\":\"col-ab1a4b4f-8850-48e3-a2ab-4eb52aa5bc57\"}},{\"name\":\"double_col\",\"type\":\"double\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":6,\"delta.columnMapping.physicalName\":\"col-fd0d362b-c828-4ab2-aea0-d7fb194a87fa\"}},{\"name\":\"date_col\",\"type\":\"date\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":7,\"delta.columnMapping.physicalName\":\"col-2e98df1d-e620-4a91-8ef8-6b196ce79105\"}},{\"name\":\"timestamp_col\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":8,\"delta.columnMapping.physicalName\":\"col-caa487cc-e27c-4d92-85ae-1c497cc9e92c\"}},{\"name\":\"timestamp_ntz_col\",\"type\":\"timestamp_ntz\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":9,\"delta.columnMapping.physicalName\":\"col-5a3cb861-db96-4e6d-a3b9-91aab0a9f0ab\"}},{\"name\":\"string_col\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":10,\"delta.columnMapping.physicalName\":\"col-5006b1fd-f61e-40da-a5d4-3957248a4269\"}},{\"name\":\"decimal_col\",\"type\":\"decimal(10,2)\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":11,\"delta.columnMapping.physicalName\":\"col-970dbfdb-8cdd-4adf-a952-2fbefc038496\"}},{\"name\":\"boolean_col\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":12,\"delta.columnMapping.physicalName\":\"col-f7007db6-65d0-4e2b-8552-d8a37bc06d2c\"}},{\"name\":\"binary_col\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":13,\"delta.columnMapping.physicalName\":\"col-cbe103e8-96dc-4fd7-821e-602cf8902e6b\"}},{\"name\":\"array_col\",\"type\":{\"type\":\"array\",\"elementType\":\"integer\",\"containsNull\":true},\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":14,\"delta.columnMapping.physicalName\":\"col-57618ab3-6362-439f-b211-f1c164a8988a\"}},{\"name\":\"map_col\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"integer\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":15,\"delta.columnMapping.physicalName\":\"col-64713545-4f48-430e-bf3f-14203591996c\"}},{\"name\":\"nested_struct\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"inner_int\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":17,\"delta.columnMapping.physicalName\":\"col-7f2f94cf-7082-430c-bba7-852bc6c5215e\"}},{\"name\":\"inner_string\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":18,\"delta.columnMapping.phy
sicalName\":\"col-26fcfd6b-04c7-4772-8bdf-04ac9425f06e\"}},{\"name\":\"inner_double\",\"type\":\"double\",\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":19,\"delta.columnMapping.physicalName\":\"col-92dcf16d-d249-48a9-afb8-93deeaf7ce23\"}}]},\"nullable\":true,\"metadata\":{\"delta.columnMapping.id\":16,\"delta.columnMapping.physicalName\":\"col-481c7590-d3b8-4e9c-b40e-7b7128a972f4\"}}]}","partitionColumns":[],"configuration":{"delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"19"},"createdTime":1770328314367}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["timestampNtz","columnMapping"],"writerFeatures":["timestampNtz","columnMapping"]}} diff --git a/kernel/tests/data/stats-writing-all-types/delta/_delta_log/00000000000000000001.json b/kernel/tests/data/stats-writing-all-types/delta/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..fec711e17b --- /dev/null +++ b/kernel/tests/data/stats-writing-all-types/delta/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1770328318954,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"4","numOutputBytes":"9751"},"engineInfo":"Apache-Spark/3.5.8 Delta-Lake/3.3.2","txnId":"5ab591dc-07a4-4be3-8d54-54164d3b884d"}} +{"add":{"path":"part-00000-374afe34-4ff6-46e8-a860-1948aadc744f-c000.snappy.parquet","partitionValues":{},"size":9751,"modificationTime":1770328318928,"dataChange":true,"stats":"{\"numRecords\":4,\"minValues\":{\"col-042fadad-0f76-466d-a94d-e1159c2a9ed6\":1,\"col-6dc68f07-711d-4f00-8bd6-1f5bc698e8ad\":100,\"col-e0571b78-2d49-4109-9add-f2c54d6ea29e\":1000,\"col-6c805ddb-539c-4ed3-bada-ac7a2082c889\":10000,\"col-ab1a4b4f-8850-48e3-a2ab-4eb52aa5bc57\":1.5,\"col-fd0d362b-c828-4ab2-aea0-d7fb194a87fa\":2.5,\"col-2e98df1d-e620-4a91-8ef8-6b196ce79105\":\"2023-01-15\",\"col-caa487cc-e27c-4d92-85ae-1c497cc9e92c\":\"2023-06-15T12:30:00.000Z\",\"col-5a3cb861-db96-4e6d-a3b9-91aab0a9f0ab\":\"2023-06-15T12:30:00.000\",\"col-5006b1fd-f61e-40da-a5d4-3957248a4269\":\"alpha\",\"col-970dbfdb-8cdd-4adf-a952-2fbefc038496\":123.45,\"col-481c7590-d3b8-4e9c-b40e-7b7128a972f4\":{\"col-7f2f94cf-7082-430c-bba7-852bc6c5215e\":10,\"col-26fcfd6b-04c7-4772-8bdf-04ac9425f06e\":\"nested_a\",\"col-92dcf16d-d249-48a9-afb8-93deeaf7ce23\":1.1}},\"maxValues\":{\"col-042fadad-0f76-466d-a94d-e1159c2a9ed6\":5,\"col-6dc68f07-711d-4f00-8bd6-1f5bc698e8ad\":500,\"col-e0571b78-2d49-4109-9add-f2c54d6ea29e\":5000,\"col-6c805ddb-539c-4ed3-bada-ac7a2082c889\":50000,\"col-ab1a4b4f-8850-48e3-a2ab-4eb52aa5bc57\":3.14,\"col-fd0d362b-c828-4ab2-aea0-d7fb194a87fa\":6.28,\"col-2e98df1d-e620-4a91-8ef8-6b196ce79105\":\"2024-06-15\",\"col-caa487cc-e27c-4d92-85ae-1c497cc9e92c\":\"2024-12-25T00:00:00.000Z\",\"col-5a3cb861-db96-4e6d-a3b9-91aab0a9f0ab\":\"2024-12-25T00:00:00.000\",\"col-5006b1fd-f61e-40da-a5d4-3957248a4269\":\"gamma\",\"col-970dbfdb-8cdd-4adf-a952-2fbefc038496\":999.99,\"col-481c7590-d3b8-4e9c-b40e-7b7128a972f4\":{\"col-7f2f94cf-7082-430c-bba7-852bc6c5215e\":50,\"col-26fcfd6b-04c7-4772-8bdf-04ac9425f06e\":\"nested_c\",\"col-92dcf16d-d249-48a9-afb8-93deeaf7ce23\":5.5}},\"nullCount\":{\"col-042fadad-0f76-466d-a94d-e1159c2a9ed6\":1,\"col-6dc68f07-711d-4f00-8bd6-1f5bc698e8ad\":1,\"col-e0571b78-2d49-4109-9add-f2c54d6ea29e\":1,\"col-6c805ddb-539c-4ed3-bada-ac7a2082c889\":1,\"col-ab1a4b4f-8850-48e3-a2ab-4eb52aa5bc57\":1,\"col-fd0d362b-c828-4a
b2-aea0-d7fb194a87fa\":1,\"col-2e98df1d-e620-4a91-8ef8-6b196ce79105\":1,\"col-caa487cc-e27c-4d92-85ae-1c497cc9e92c\":1,\"col-5a3cb861-db96-4e6d-a3b9-91aab0a9f0ab\":1,\"col-5006b1fd-f61e-40da-a5d4-3957248a4269\":1,\"col-970dbfdb-8cdd-4adf-a952-2fbefc038496\":1,\"col-f7007db6-65d0-4e2b-8552-d8a37bc06d2c\":1,\"col-cbe103e8-96dc-4fd7-821e-602cf8902e6b\":1,\"col-57618ab3-6362-439f-b211-f1c164a8988a\":1,\"col-64713545-4f48-430e-bf3f-14203591996c\":1,\"col-481c7590-d3b8-4e9c-b40e-7b7128a972f4\":{\"col-7f2f94cf-7082-430c-bba7-852bc6c5215e\":1,\"col-26fcfd6b-04c7-4772-8bdf-04ac9425f06e\":2,\"col-92dcf16d-d249-48a9-afb8-93deeaf7ce23\":1}}}"}} diff --git a/kernel/tests/data/stats-writing-all-types/delta/part-00000-374afe34-4ff6-46e8-a860-1948aadc744f-c000.snappy.parquet b/kernel/tests/data/stats-writing-all-types/delta/part-00000-374afe34-4ff6-46e8-a860-1948aadc744f-c000.snappy.parquet new file mode 100644 index 0000000000..d8c24a5847 Binary files /dev/null and b/kernel/tests/data/stats-writing-all-types/delta/part-00000-374afe34-4ff6-46e8-a860-1948aadc744f-c000.snappy.parquet differ diff --git a/kernel/tests/data/stats-writing-all-types/delta/part-00000-4b805e72-4c65-4854-a4a2-a1fc79176a7f-c000.snappy.parquet b/kernel/tests/data/stats-writing-all-types/delta/part-00000-4b805e72-4c65-4854-a4a2-a1fc79176a7f-c000.snappy.parquet new file mode 100644 index 0000000000..a262fe9be9 Binary files /dev/null and b/kernel/tests/data/stats-writing-all-types/delta/part-00000-4b805e72-4c65-4854-a4a2-a1fc79176a7f-c000.snappy.parquet differ diff --git a/kernel/tests/data/table-with-cdf/_delta_log/00000000000000000000.json b/kernel/tests/data/table-with-cdf/_delta_log/00000000000000000000.json index 09bf8dbb27..0728498906 100644 --- a/kernel/tests/data/table-with-cdf/_delta_log/00000000000000000000.json +++ b/kernel/tests/data/table-with-cdf/_delta_log/00000000000000000000.json @@ -1,4 +1,4 @@ {"commitInfo":{"timestamp":1704392842074,"operation":"Manual Update","operationParameters":{},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.1.0-SNAPSHOT","txnId":"95ec924a-6859-4433-8008-6d6b4a0e3ba5"}} -{"protocol":{"minReaderVersion":3,"minWriterVersion":7, "readerFeatures":[], "writerFeatures":[]}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":4}} {"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableChangeDataFeed": "true"}}} {"add":{"path":"fake/path/1","partitionValues":{},"size":1,"modificationTime":1,"dataChange":true}} diff --git a/kernel/tests/data/timestamp-partitioned-table.tar.zst b/kernel/tests/data/timestamp-partitioned-table.tar.zst index 834b06e180..33086e9cb8 100644 Binary files a/kernel/tests/data/timestamp-partitioned-table.tar.zst and b/kernel/tests/data/timestamp-partitioned-table.tar.zst differ diff --git a/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000000.json b/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..0ffa5fc26d --- /dev/null +++ b/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 
+{"metaData":{"id":"95c116f6-c389-410c-bf6d-269fa789ec46","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"ts_col\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"100"},"createdTime":1774497261439}} +{"add":{"path":"part-00000-a234b260-4c3a-46cc-a9f2-67b5dc46e82b-c000.snappy.parquet","partitionValues":{},"size":725,"modificationTime":1774497263192,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":1,\"ts_col\":\"1970-01-01T00:00:01.000Z\"},\"maxValues\":{\"id\":2,\"ts_col\":\"1970-01-01T00:00:02.000Z\"},\"nullCount\":{\"id\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1774497264199,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"2","numOutputBytes":"725"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.0.2","txnId":"a39707c6-e539-441b-8ae8-5016810f5010"}} diff --git a/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000001.json b/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..8c00dccefe --- /dev/null +++ b/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-8be8dbf9-6bbb-481c-9ff4-31146eb021e8-c000.snappy.parquet","partitionValues":{},"size":725,"modificationTime":1774497267308,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":3,\"ts_col\":\"1970-01-01T00:00:03.000Z\"},\"maxValues\":{\"id\":4,\"ts_col\":\"1970-01-01T00:00:04.000Z\"},\"nullCount\":{\"id\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1774497267317,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"2","numOutputBytes":"725"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.0.2","txnId":"2e5fdfd7-f775-4c8e-b8a8-add9b64d01f7"}} diff --git a/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000002.json b/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..d980895f98 --- /dev/null +++ b/kernel/tests/data/timestamp-truncation-stats/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-f05ea4e8-94b0-4215-802d-d5050c89d584-c000.snappy.parquet","partitionValues":{},"size":725,"modificationTime":1774497269072,"dataChange":true,"stats":"{\"numRecords\":2,\"minValues\":{\"id\":5,\"ts_col\":\"1970-01-01T00:00:07.000Z\"},\"maxValues\":{\"id\":6,\"ts_col\":\"1970-01-01T00:00:08.000Z\"},\"nullCount\":{\"id\":0,\"ts_col\":0}}"}} +{"commitInfo":{"timestamp":1774497269078,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"2","numOutputBytes":"725"},"engineInfo":"Apache-Spark/3.2.4 Delta-Lake/2.0.2","txnId":"9866f6f1-12d3-4de3-82c8-3f6532b7de4f"}} diff --git a/kernel/tests/data/timestamp-truncation-stats/part-00000-8be8dbf9-6bbb-481c-9ff4-31146eb021e8-c000.snappy.parquet b/kernel/tests/data/timestamp-truncation-stats/part-00000-8be8dbf9-6bbb-481c-9ff4-31146eb021e8-c000.snappy.parquet new file mode 
100644 index 0000000000..25c95ef74c Binary files /dev/null and b/kernel/tests/data/timestamp-truncation-stats/part-00000-8be8dbf9-6bbb-481c-9ff4-31146eb021e8-c000.snappy.parquet differ diff --git a/kernel/tests/data/timestamp-truncation-stats/part-00000-a234b260-4c3a-46cc-a9f2-67b5dc46e82b-c000.snappy.parquet b/kernel/tests/data/timestamp-truncation-stats/part-00000-a234b260-4c3a-46cc-a9f2-67b5dc46e82b-c000.snappy.parquet new file mode 100644 index 0000000000..ddf330df1d Binary files /dev/null and b/kernel/tests/data/timestamp-truncation-stats/part-00000-a234b260-4c3a-46cc-a9f2-67b5dc46e82b-c000.snappy.parquet differ diff --git a/kernel/tests/data/timestamp-truncation-stats/part-00000-f05ea4e8-94b0-4215-802d-d5050c89d584-c000.snappy.parquet b/kernel/tests/data/timestamp-truncation-stats/part-00000-f05ea4e8-94b0-4215-802d-d5050c89d584-c000.snappy.parquet new file mode 100644 index 0000000000..2fce1f6007 Binary files /dev/null and b/kernel/tests/data/timestamp-truncation-stats/part-00000-f05ea4e8-94b0-4215-802d-d5050c89d584-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000000.crc.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000000.crc.crc new file mode 100644 index 0000000000..cefde913ff Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000000.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000000.json.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000..213193f6c9 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000000.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000001.crc.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000001.crc.crc new file mode 100644 index 0000000000..4c579b15a7 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000001.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000001.json.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000001.json.crc new file mode 100644 index 0000000000..318cf0aa26 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000001.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000002.crc.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000002.crc.crc new file mode 100644 index 0000000000..b45996f754 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000002.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000002.json.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000002.json.crc new file mode 100644 index 0000000000..e686df603f Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000002.json.crc differ diff --git 
a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000003.crc.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000003.crc.crc new file mode 100644 index 0000000000..1eadd7aebc Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000003.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000003.json.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000003.json.crc new file mode 100644 index 0000000000..6a039f5615 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000003.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000004.crc.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000004.crc.crc new file mode 100644 index 0000000000..bf404d7de9 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000004.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000004.json.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000004.json.crc new file mode 100644 index 0000000000..329ec52c07 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000004.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000001.0000000003.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000001.0000000003.parquet.crc new file mode 100644 index 0000000000..e0f48813c0 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000001.0000000003.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000002.0000000003.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000002.0000000003.parquet.crc new file mode 100644 index 0000000000..69c03ece05 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000002.0000000003.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000003.0000000003.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000003.0000000003.parquet.crc new file mode 100644 index 0000000000..cf5cef05c2 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000003.0000000003.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.crc.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.crc.crc new file mode 100644 index 0000000000..8dfb00a8e9 Binary files /dev/null and 
b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.json.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.json.crc new file mode 100644 index 0000000000..cc2cf61b53 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/.00000000000000000005.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/._last_checkpoint.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/._last_checkpoint.crc new file mode 100644 index 0000000000..a0c1389482 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/._last_checkpoint.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000000.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..b90fe9f475 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"16de0a5e-f2ce-4e96-8ad2-eb3a4a232324","tableSizeBytes":0,"numFiles":0,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[0,0,0,0,0,0,0,0,0,0]},"allFiles":[]} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000000.json b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..98b3b69604 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1774306407573,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[\"part\"]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.checkpoint.writeStatsAsStruct\":\"true\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"10\",\"delta.enableDeletionVectors\":\"true\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"16de0a5e-f2ce-4e96-8ad2-eb3a4a232324"}} +{"metaData":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]}} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000001.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..db07b623a9 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ 
+{"txnId":"c55316e0-b463-49e0-b814-4e390332655d","tableSizeBytes":986,"numFiles":1,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[986,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[1,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"part=1/test%25file%25prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306408636,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306408636000","MIN_INSERTION_TIME":"1774306408636000","MAX_INSERTION_TIME":"1774306408636000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000001.json b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..b034b2f071 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ 
+{"commitInfo":{"timestamp":1774306408772,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"986"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"c55316e0-b463-49e0-b814-4e390332655d"}} +{"add":{"path":"part=1/test%25file%25prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306408636,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306408636000","MIN_INSERTION_TIME":"1774306408636000","MAX_INSERTION_TIME":"1774306408636000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000002.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..6dd901ca5b --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ +{"txnId":"98e9f38c-758c-425f-9455-14255391ed04","tableSizeBytes":1972,"numFiles":2,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1972,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[2,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"part=2/test%25file%25prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306409768,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306409768000","MIN_INSERTION_TIME":"1774306409768000","MAX_INSERTION_TIME":"1774306409768000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=1/test%25file%25prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306408636,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306408636000","MIN_INSERTION_TIME":"1774306408636000","MAX_INSERTION_TIME":"1774306408636000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000002.json b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..87051861e2 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306409894,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"986"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"98e9f38c-758c-425f-9455-14255391ed04"}} +{"add":{"path":"part=2/test%25file%25prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306409768,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306409768000","MIN_INSERTION_TIME":"1774306409768000","MAX_INSERTION_TIME":"1774306409768000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000003.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..f8be8748c5 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ 
+{"txnId":"00106389-a5e9-43b2-8175-f8fce2a9755c","tableSizeBytes":2957,"numFiles":3,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[2957,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[3,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"part=0/test%25file%25prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet","partitionValues":{"part":"0"},"size":985,"modificationTime":1774306410880,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306410880000","MIN_INSERTION_TIME":"1774306410880000","MAX_INSERTION_TIME":"1774306410880000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=1/test%25file%25prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306408636,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306408636000","MIN_INSERTION_TIME":"1774306408636000","MAX_INSERTION_TIME":"1774306408636000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=2/test%25file%
25prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306409768,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306409768000","MIN_INSERTION_TIME":"1774306409768000","MAX_INSERTION_TIME":"1774306409768000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000003.json b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..89b828542a --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306411023,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"985"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"00106389-a5e9-43b2-8175-f8fce2a9755c"}} +{"add":{"path":"part=0/test%25file%25prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet","partitionValues":{"part":"0"},"size":985,"modificationTime":1774306410880,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306410880000","MIN_INSERTION_TIME":"1774306410880000","MAX_INSERTION_TIME":"1774306410880000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000004.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000004.crc new file mode 100644 index 0000000000..58b9846451 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000004.crc @@ -0,0 +1 @@ 
+{"txnId":"4a3b18dd-2601-4c94-bd4e-98fc73495586","tableSizeBytes":3943,"numFiles":4,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3943,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[4,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"part=1/test%25file%25prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306411956,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306411956000","MIN_INSERTION_TIME":"1774306411956000","MAX_INSERTION_TIME":"1774306411956000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=0/test%25file%25prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet","partitionValues":{"part":"0"},"size":985,"modificationTime":1774306410880,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306410880000","MIN_INSERTION_TIME":"1774306410880000","MAX_INSERTION_TIME":"1774306410880000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=2/test%25file%
25prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306409768,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306409768000","MIN_INSERTION_TIME":"1774306409768000","MAX_INSERTION_TIME":"1774306409768000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=1/test%25file%25prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306408636,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306408636000","MIN_INSERTION_TIME":"1774306408636000","MAX_INSERTION_TIME":"1774306408636000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000004.json b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..fd29f6f348 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306412085,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"986"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"4a3b18dd-2601-4c94-bd4e-98fc73495586"}} +{"add":{"path":"part=1/test%25file%25prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306411956,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306411956000","MIN_INSERTION_TIME":"1774306411956000","MAX_INSERTION_TIME":"1774306411956000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet new file mode 100644 index 0000000000..8307d145f0 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000002.0000000003.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000002.0000000003.parquet new file mode 100644 index 0000000000..8c83a24598 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000002.0000000003.parquet differ diff --git 
a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet new file mode 100644 index 0000000000..8385c9b02a Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.crc new file mode 100644 index 0000000000..9eb8d5de94 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.crc @@ -0,0 +1 @@ +{"txnId":"d7bea328-c983-476b-b099-e17d3c1f2fc6","tableSizeBytes":4929,"numFiles":5,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"4a880820-e52f-437c-be8a-0551b00a16ee","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"part\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["part"],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306407453},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[4929,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[5,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"part=1/test%25file%25prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306411956,"dataChange":false,"stats":"{\"numRecords\"
:1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306411956000","MIN_INSERTION_TIME":"1774306411956000","MAX_INSERTION_TIME":"1774306411956000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=0/test%25file%25prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet","partitionValues":{"part":"0"},"size":985,"modificationTime":1774306410880,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306410880000","MIN_INSERTION_TIME":"1774306410880000","MAX_INSERTION_TIME":"1774306410880000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=1/test%25file%25prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet","partitionValues":{"part":"1"},"size":986,"modificationTime":1774306408636,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306408636000","MIN_INSERTION_TIME":"1774306408636000","MAX_INSERTION_TIME":"1774306408636000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=2/test%25file%25prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306409768,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306409768000","MIN_INSERTION_TIME":"1774306409768000","MAX_INSERTION_TIME":"1774306409768000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"part=2/test%25file%25prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306413016,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306413016000","MIN_INSERTION_TIME":"1774306413016000","MAX_INSERTION_TIME":"1774306413016000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.json b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..f02e870b8a --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306413146,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"986"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"d7bea328-c983-476b-b099-e17d3c1f2fc6"}} 
+{"add":{"path":"part=2/test%25file%25prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet","partitionValues":{"part":"2"},"size":986,"modificationTime":1774306413016,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306413016000","MIN_INSERTION_TIME":"1774306413016000","MAX_INSERTION_TIME":"1774306413016000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/_last_checkpoint b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..28c4f37780 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/_delta_log/_last_checkpoint @@ -0,0 +1 @@ +{"version":5,"size":7,"parts":3,"sizeInBytes":58606,"numOfAddFiles":5,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues_parsed","type":{"type":"struct","fields":[{"name":"part","type":"integer","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"stats_parsed","type":{"type":"struct","fields":[{"name":"numRecords","type":"long","nullable":true,"metadata":{}},{"name":"minValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"maxValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"nullCount","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"tightBounds","type
":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"checksum":"4f28f78d7d36c441541a7d760a1d853c"} diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=0/.test%file%prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=0/.test%file%prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet.crc new file mode 100644 index 0000000000..48ac303d76 Binary files /dev/null and 
b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=0/.test%file%prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=0/test%file%prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=0/test%file%prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet new file mode 100644 index 0000000000..c657bb11f0 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=0/test%file%prefix-part-00000-dea8716a-0901-4b81-a9c9-fa6d76ea5ea9.c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/.test%file%prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/.test%file%prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet.crc new file mode 100644 index 0000000000..51ecfe0daa Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/.test%file%prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/.test%file%prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/.test%file%prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet.crc new file mode 100644 index 0000000000..32805edc53 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/.test%file%prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/test%file%prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/test%file%prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet new file mode 100644 index 0000000000..9dbcc4d0c5 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/test%file%prefix-part-00000-a7a1cef9-8cb8-4adb-b7ba-adc2d3c33f39.c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/test%file%prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/test%file%prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet new file mode 100644 index 0000000000..072ed609f0 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=1/test%file%prefix-part-00000-ad2e4232-aae1-41ea-b46c-569d88253339.c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/.test%file%prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/.test%file%prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet.crc new file mode 100644 index 0000000000..ee945fa527 Binary files /dev/null and 
b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/.test%file%prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/.test%file%prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/.test%file%prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet.crc new file mode 100644 index 0000000000..681e4c1106 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/.test%file%prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/test%file%prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/test%file%prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet new file mode 100644 index 0000000000..a10f42d61e Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/test%file%prefix-part-00000-03c40ea1-51b8-4636-9900-fb0885b47b87.c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/test%file%prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/test%file%prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet new file mode 100644 index 0000000000..93f8e18a4a Binary files /dev/null and b/kernel/tests/data/v1-multi-part-partitioned-struct-stats-only/part=2/test%file%prefix-part-00000-b101d239-17b8-4e26-9b34-5a43db693182.c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet.crc new file mode 100644 index 0000000000..561aed6528 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet.crc new file mode 100644 index 0000000000..d3733258b8 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet.crc new file mode 100644 index 0000000000..045a2dd4da Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet.crc differ diff --git 
a/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet.crc new file mode 100644 index 0000000000..798f2039d9 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet.crc new file mode 100644 index 0000000000..e5247f2373 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/.test%file%prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000000.crc.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000000.crc.crc new file mode 100644 index 0000000000..d5bffa5f88 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000000.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000000.json.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000..1372cecb26 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000000.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000001.crc.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000001.crc.crc new file mode 100644 index 0000000000..1dacc17a20 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000001.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000001.json.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000001.json.crc new file mode 100644 index 0000000000..546507c441 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000001.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000002.crc.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000002.crc.crc new file mode 100644 index 0000000000..dd8eafd28e Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000002.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000002.json.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000002.json.crc new file mode 100644 index 0000000000..e52117daa6 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000002.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000003.crc.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000003.crc.crc new file mode 100644 index 0000000000..90e39ba969 Binary files /dev/null 
and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000003.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000003.json.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000003.json.crc new file mode 100644 index 0000000000..0c4083e121 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000003.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000004.crc.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000004.crc.crc new file mode 100644 index 0000000000..71b8edf204 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000004.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000004.json.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000004.json.crc new file mode 100644 index 0000000000..a98d46fb6b Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000004.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000001.0000000003.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000001.0000000003.parquet.crc new file mode 100644 index 0000000000..d6464fc417 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000001.0000000003.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000002.0000000003.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000002.0000000003.parquet.crc new file mode 100644 index 0000000000..9090eecb2a Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000002.0000000003.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000003.0000000003.parquet.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000003.0000000003.parquet.crc new file mode 100644 index 0000000000..a66073d640 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.checkpoint.0000000003.0000000003.parquet.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.crc.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.crc.crc new file mode 100644 index 0000000000..6acac728ce Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.crc.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.json.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.json.crc new file mode 100644 index 0000000000..22eff0cb48 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/.00000000000000000005.json.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/._last_checkpoint.crc 
b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/._last_checkpoint.crc new file mode 100644 index 0000000000..4227fc1c5b Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/._last_checkpoint.crc differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000000.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..44ea44ce1f --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"cd694959-725d-44cc-b36e-fdda171ae1ad","tableSizeBytes":0,"numFiles":0,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[0,0,0,0,0,0,0,0,0,0]},"allFiles":[]} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000000.json b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..cd9461a304 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1774306390194,"operation":"CREATE 
TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.checkpoint.writeStatsAsStruct\":\"true\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"10\",\"delta.enableDeletionVectors\":\"true\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"cd694959-725d-44cc-b36e-fdda171ae1ad"}} +{"metaData":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]}} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000001.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..c4e41802e9 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ +{"txnId":"27faf0dc-2bbc-474c-b6a5-974606d30811","tableSizeBytes":761,"numFiles":1,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[761,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[1,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306398915,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306398915000","MIN_INSERTION_TIME":"1774306398915000","MAX_INSERTION_TIME":"1774306398915000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000001.json b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..911a5d76e2 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306399541,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"27faf0dc-2bbc-474c-b6a5-974606d30811"}} +{"add":{"path":"test%25file%25prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306398915,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306398915000","MIN_INSERTION_TIME":"1774306398915000","MAX_INSERTION_TIME":"1774306398915000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000002.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..6847ff21dd --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ 
+{"txnId":"6fe44e3e-6c3b-4e59-8dea-5f24d12bf46f","tableSizeBytes":1522,"numFiles":2,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1522,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[2,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306401007,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306401007000","MIN_INSERTION_TIME":"1774306401007000","MAX_INSERTION_TIME":"1774306401007000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306398915,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306398915000","MIN_INSERTION_TIME":"1774306398915000","MAX_INSERTION_TIME":"1774306398915000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000002.json 
b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..5aceff3f54 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306401184,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"6fe44e3e-6c3b-4e59-8dea-5f24d12bf46f"}} +{"add":{"path":"test%25file%25prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306401007,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306401007000","MIN_INSERTION_TIME":"1774306401007000","MAX_INSERTION_TIME":"1774306401007000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000003.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..5d7944d64f --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ +{"txnId":"ede546c5-e345-461e-ad4e-9d0428f23df1","tableSizeBytes":2282,"numFiles":3,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[2282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[3,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306398915,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306398915000","MIN_INSERTION_TIME":"1774306398915000","MAX_INSERTION_TIME":"1774306398915000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306401007,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306401007000","MIN_INSERTION_TIME":"1774306401007000","MAX_INSERTION_TIME":"1774306401007000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774306402499,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306402499000","MIN_INSERTION_TIME":"1774306402499000","MAX_INSERTION_TIME":"1774306402499000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000003.json b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..96a77babd1 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306402691,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"760"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"ede546c5-e345-461e-ad4e-9d0428f23df1"}} +{"add":{"path":"test%25file%25prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774306402499,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306402499000","MIN_INSERTION_TIME":"1774306402499000","MAX_INSERTION_TIME":"1774306402499000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000004.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000004.crc new file mode 100644 index 0000000000..2836d3ba84 --- 
/dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000004.crc @@ -0,0 +1 @@ +{"txnId":"12423c00-bb1e-493f-b23a-618d1efaaf2e","tableSizeBytes":3043,"numFiles":4,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3043,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[4,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306401007,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306401007000","MIN_INSERTION_TIME":"1774306401007000","MAX_INSERTION_TIME":"1774306401007000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306398915,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306398915000","MIN_INSERTION_TIME":"1774306398915000","MAX_INSERTION_TIME":"1774306398915000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25pref
ix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306403960,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306403960000","MIN_INSERTION_TIME":"1774306403960000","MAX_INSERTION_TIME":"1774306403960000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774306402499,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306402499000","MIN_INSERTION_TIME":"1774306402499000","MAX_INSERTION_TIME":"1774306402499000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000004.json b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..e18a6aae9e --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306404163,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"12423c00-bb1e-493f-b23a-618d1efaaf2e"}} +{"add":{"path":"test%25file%25prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306403960,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306403960000","MIN_INSERTION_TIME":"1774306403960000","MAX_INSERTION_TIME":"1774306403960000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet new file mode 100644 index 0000000000..aea9e46f4a Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000001.0000000003.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000002.0000000003.parquet b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000002.0000000003.parquet new file mode 100644 index 0000000000..0f5095f324 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000002.0000000003.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet 
b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet new file mode 100644 index 0000000000..152486d07d Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.0000000003.0000000003.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.crc b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.crc new file mode 100644 index 0000000000..3b6f6c25de --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.crc @@ -0,0 +1 @@ +{"txnId":"77b1e654-b6b8-45b0-8df7-4db0e1aff223","tableSizeBytes":3804,"numFiles":5,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"7fb4ebca-e84f-4dcb-9ec9-ab4032c10032","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774306387714},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3804,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[5,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306398915,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306398915000","MIN_INSERTION_TIME":"1774306398915000","MAX_INSERTION_TIME":"1774306398915000","OPTIMIZE_TARG
ET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306401007,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306401007000","MIN_INSERTION_TIME":"1774306401007000","MAX_INSERTION_TIME":"1774306401007000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306403960,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306403960000","MIN_INSERTION_TIME":"1774306403960000","MAX_INSERTION_TIME":"1774306403960000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306405288,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306405288000","MIN_INSERTION_TIME":"1774306405288000","MAX_INSERTION_TIME":"1774306405288000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774306402499,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306402499000","MIN_INSERTION_TIME":"1774306402499000","MAX_INSERTION_TIME":"1774306402499000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.json b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..7d54ec8b10 --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774306405430,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"77b1e654-b6b8-45b0-8df7-4db0e1aff223"}} +{"add":{"path":"test%25file%25prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774306405288,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774306405288000","MIN_INSERTION_TIME":"1774306405288000","MAX_INSERTION_TIME":"1774306405288000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/_last_checkpoint 
b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..cc2eabf8ed --- /dev/null +++ b/kernel/tests/data/v1-multi-part-struct-stats-only/_delta_log/_last_checkpoint @@ -0,0 +1 @@ +{"version":5,"size":7,"parts":3,"sizeInBytes":46005,"numOfAddFiles":5,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}},{"name":"stats_parsed","type":{"type":"struct","fields":[{"name":"numRecords","type":"long","nullable":true,"metadata":{}},{"name":"minValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"maxValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"nullCount","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"tightBounds","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadat
a":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"checksum":"fd3a25e1faa0025de64503c5038a18c6"} diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet new file mode 100644 index 0000000000..7f098ecab2 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-2699f745-4b33-4eb9-b3cf-04f6af08307f-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet new file mode 100644 index 0000000000..55de20cfae Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-323f4e76-58ff-48ce-bf0d-14d179e9bf0c-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet 
b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet new file mode 100644 index 0000000000..9d1435acf4 Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-743ccd8e-15b0-49f2-b0e2-aa0efbf148ae-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet new file mode 100644 index 0000000000..60eb11bf0d Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-f98612d6-6213-41f1-a006-a11beb0bb544-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet new file mode 100644 index 0000000000..d39e06afab Binary files /dev/null and b/kernel/tests/data/v1-multi-part-struct-stats-only/test%file%prefix-part-00000-ff529603-203f-4a68-9ab1-d495e5c1c409-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000000.crc b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..b2e2a1425f --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"c9205b56-4d26-4f2b-ad0a-fe0ccf1778ec","tableSizeBytes":0,"numFiles":0,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736
,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[0,0,0,0,0,0,0,0,0,0]},"allFiles":[]} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000000.json b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..39fe291082 --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1774386279464,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.checkpoint.writeStatsAsStruct\":\"true\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"10\",\"delta.enableDeletionVectors\":\"true\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"c9205b56-4d26-4f2b-ad0a-fe0ccf1778ec"}} +{"metaData":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]}} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000001.crc b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..fb754878ec --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ 
+{"txnId":"811e51aa-84dc-4dc9-8033-52f571aea1a8","tableSizeBytes":761,"numFiles":1,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[761,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[1,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386288245,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386288245000","MIN_INSERTION_TIME":"1774386288245000","MAX_INSERTION_TIME":"1774386288245000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000001.json b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..0564672530 --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ 
+{"commitInfo":{"timestamp":1774386288935,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"811e51aa-84dc-4dc9-8033-52f571aea1a8"}} +{"add":{"path":"test%25file%25prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386288245,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386288245000","MIN_INSERTION_TIME":"1774386288245000","MAX_INSERTION_TIME":"1774386288245000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000002.crc b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..ed8e4bc8e8 --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ +{"txnId":"a4f2cfb3-0ccb-42a8-becd-13ab502f10cb","tableSizeBytes":1522,"numFiles":2,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1522,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[2,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386290289,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386290289000","MIN_INSERTION_TIME":"1774386290289000","MAX_INSERTION_TIME":"1774386290289000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386288245,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386288245000","MIN_INSERTION_TIME":"1774386288245000","MAX_INSERTION_TIME":"1774386288245000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000002.json b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..6a590666cc --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774386290471,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"a4f2cfb3-0ccb-42a8-becd-13ab502f10cb"}} +{"add":{"path":"test%25file%25prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386290289,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386290289000","MIN_INSERTION_TIME":"1774386290289000","MAX_INSERTION_TIME":"1774386290289000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000003.crc b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..9fc76c9865 --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ 
+{"txnId":"e73d383b-18ba-4a2e-ab90-f3ce723a8f13","tableSizeBytes":2282,"numFiles":3,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[2282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[3,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774386291681,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386291681000","MIN_INSERTION_TIME":"1774386291681000","MAX_INSERTION_TIME":"1774386291681000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386290289,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386290289000","MIN_INSERTION_TIME":"1774386290289000","MAX_INSERTION_TIME":"1774386290289000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet","partitionValues":{},"size":761,"modificatio
nTime":1774386288245,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386288245000","MIN_INSERTION_TIME":"1774386288245000","MAX_INSERTION_TIME":"1774386288245000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000003.json b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..ef3d7cd653 --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774386291840,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"760"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"e73d383b-18ba-4a2e-ab90-f3ce723a8f13"}} +{"add":{"path":"test%25file%25prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774386291681,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386291681000","MIN_INSERTION_TIME":"1774386291681000","MAX_INSERTION_TIME":"1774386291681000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000004.crc b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000004.crc new file mode 100644 index 0000000000..9eac110a6c --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000004.crc @@ -0,0 +1 @@ 
+{"txnId":"fbcfa79d-2daa-49db-be93-9eeda474dfe5","tableSizeBytes":3043,"numFiles":4,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3043,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[4,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-1b98908c-7430-4b7c-af27-ac1452047ed1-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386292913,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386292913000","MIN_INSERTION_TIME":"1774386292913000","MAX_INSERTION_TIME":"1774386292913000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774386291681,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386291681000","MIN_INSERTION_TIME":"1774386291681000","MAX_INSERTION_TIME":"1774386291681000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet","partitionValues":{},"size":761,"modificatio
nTime":1774386290289,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386290289000","MIN_INSERTION_TIME":"1774386290289000","MAX_INSERTION_TIME":"1774386290289000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386288245,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386288245000","MIN_INSERTION_TIME":"1774386288245000","MAX_INSERTION_TIME":"1774386288245000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000004.json b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..f3585bf53b --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774386293054,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"fbcfa79d-2daa-49db-be93-9eeda474dfe5"}} +{"add":{"path":"test%25file%25prefix-part-00000-1b98908c-7430-4b7c-af27-ac1452047ed1-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386292913,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386292913000","MIN_INSERTION_TIME":"1774386292913000","MAX_INSERTION_TIME":"1774386292913000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.parquet b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.parquet new file mode 100644 index 0000000000..a0acb448b6 Binary files /dev/null and b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.checkpoint.parquet differ diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.crc b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.crc new file mode 100644 index 0000000000..2e7523842a --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.crc @@ -0,0 +1 @@ 
+{"txnId":"9cd89dcd-2e5d-4200-ae26-df7d8d5a4bf1","tableSizeBytes":3804,"numFiles":5,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"3afc4c9a-c57b-4c20-8a53-f00ec003c9b4","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpoint.writeStatsAsStruct":"true","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10","delta.enableDeletionVectors":"true"},"createdTime":1774386277024},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3804,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[5,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-1b98908c-7430-4b7c-af27-ac1452047ed1-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386292913,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386292913000","MIN_INSERTION_TIME":"1774386292913000","MAX_INSERTION_TIME":"1774386292913000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-f993c7d8-518a-4056-a77c-7f0d06a6e07e-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386294073,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386294073000","MIN_INSERTION_TIME":"1774386294073000","MAX_INSERTION_TIME":"1774386294073000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet","partitionValues":{},"size":760,"modificatio
nTime":1774386291681,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386291681000","MIN_INSERTION_TIME":"1774386291681000","MAX_INSERTION_TIME":"1774386291681000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386290289,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386290289000","MIN_INSERTION_TIME":"1774386290289000","MAX_INSERTION_TIME":"1774386290289000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386288245,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386288245000","MIN_INSERTION_TIME":"1774386288245000","MAX_INSERTION_TIME":"1774386288245000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.json b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..5ced5e56fd --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774386294218,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"9cd89dcd-2e5d-4200-ae26-df7d8d5a4bf1"}} +{"add":{"path":"test%25file%25prefix-part-00000-f993c7d8-518a-4056-a77c-7f0d06a6e07e-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774386294073,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774386294073000","MIN_INSERTION_TIME":"1774386294073000","MAX_INSERTION_TIME":"1774386294073000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/_last_checkpoint b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..18694d44ff --- /dev/null +++ b/kernel/tests/data/v1-single-part-struct-stats-only/_delta_log/_last_checkpoint @@ -0,0 +1 @@ 
+{"version":5,"size":7,"sizeInBytes":20415,"numOfAddFiles":5,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}},{"name":"stats_parsed","type":{"type":"struct","fields":[{"name":"numRecords","type":"long","nullable":true,"metadata":{}},{"name":"minValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"maxValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"nullCount","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"tightBounds","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullab
le":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"checkpointFiles":[{"path":"00000000000000000005.checkpoint.parquet","length":20415,"isDir":false,"modificationTime":1774386295689}],"checksum":"b03649834c60f41ff3ac3438cb49e29e"} diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet new file mode 100644 index 0000000000..e23848797a Binary files /dev/null and b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-09ac7369-06f6-48dd-99ac-5a97ff6b8cb8-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-1b98908c-7430-4b7c-af27-ac1452047ed1-c000.snappy.parquet b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-1b98908c-7430-4b7c-af27-ac1452047ed1-c000.snappy.parquet new file mode 100644 index 0000000000..bfc9dcfa44 Binary files /dev/null and b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-1b98908c-7430-4b7c-af27-ac1452047ed1-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet new file mode 100644 index 0000000000..9713b2d963 Binary files /dev/null and 
b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-54d78e6d-3c0d-402d-97cf-b6b46ab54533-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet new file mode 100644 index 0000000000..b6aabb8552 Binary files /dev/null and b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-ca7dba04-ea61-4e38-882e-ff6ee82f0d46-c000.snappy.parquet differ diff --git a/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-f993c7d8-518a-4056-a77c-7f0d06a6e07e-c000.snappy.parquet b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-f993c7d8-518a-4056-a77c-7f0d06a6e07e-c000.snappy.parquet new file mode 100644 index 0000000000..1f8980ad29 Binary files /dev/null and b/kernel/tests/data/v1-single-part-struct-stats-only/test%file%prefix-part-00000-f993c7d8-518a-4056-a77c-7f0d06a6e07e-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-checkpoints-json-with-last-checkpoint.tar.zst b/kernel/tests/data/v2-checkpoints-json-with-last-checkpoint.tar.zst index dbb8aa627d..91169f3c10 100644 Binary files a/kernel/tests/data/v2-checkpoints-json-with-last-checkpoint.tar.zst and b/kernel/tests/data/v2-checkpoints-json-with-last-checkpoint.tar.zst differ diff --git a/kernel/tests/data/v2-checkpoints-json-with-sidecars.tar.zst b/kernel/tests/data/v2-checkpoints-json-with-sidecars.tar.zst index a311946380..3347f4d279 100644 Binary files a/kernel/tests/data/v2-checkpoints-json-with-sidecars.tar.zst and b/kernel/tests/data/v2-checkpoints-json-with-sidecars.tar.zst differ diff --git a/kernel/tests/data/v2-checkpoints-json-without-sidecars.tar.zst b/kernel/tests/data/v2-checkpoints-json-without-sidecars.tar.zst index aaba3d3f88..edd5d51a1c 100644 Binary files a/kernel/tests/data/v2-checkpoints-json-without-sidecars.tar.zst and b/kernel/tests/data/v2-checkpoints-json-without-sidecars.tar.zst differ diff --git a/kernel/tests/data/v2-checkpoints-parquet-with-last-checkpoint.tar.zst b/kernel/tests/data/v2-checkpoints-parquet-with-last-checkpoint.tar.zst index 4f6833a5a9..9076d98966 100644 Binary files a/kernel/tests/data/v2-checkpoints-parquet-with-last-checkpoint.tar.zst and b/kernel/tests/data/v2-checkpoints-parquet-with-last-checkpoint.tar.zst differ diff --git a/kernel/tests/data/v2-checkpoints-parquet-with-sidecars.tar.zst b/kernel/tests/data/v2-checkpoints-parquet-with-sidecars.tar.zst index 0f2a289b1c..41873d7216 100644 Binary files a/kernel/tests/data/v2-checkpoints-parquet-with-sidecars.tar.zst and b/kernel/tests/data/v2-checkpoints-parquet-with-sidecars.tar.zst differ diff --git a/kernel/tests/data/v2-checkpoints-parquet-without-sidecars.tar.zst b/kernel/tests/data/v2-checkpoints-parquet-without-sidecars.tar.zst index b90fc04b49..d821083c9e 100644 Binary files a/kernel/tests/data/v2-checkpoints-parquet-without-sidecars.tar.zst and b/kernel/tests/data/v2-checkpoints-parquet-without-sidecars.tar.zst differ diff --git a/kernel/tests/data/v2-classic-checkpoint-json.tar.zst b/kernel/tests/data/v2-classic-checkpoint-json.tar.zst index c695339cd9..f9c603879f 100644 Binary files a/kernel/tests/data/v2-classic-checkpoint-json.tar.zst and b/kernel/tests/data/v2-classic-checkpoint-json.tar.zst differ diff --git a/kernel/tests/data/v2-classic-checkpoint-parquet.tar.zst 
b/kernel/tests/data/v2-classic-checkpoint-parquet.tar.zst index 87bca6f59c..63317c7159 100644 Binary files a/kernel/tests/data/v2-classic-checkpoint-parquet.tar.zst and b/kernel/tests/data/v2-classic-checkpoint-parquet.tar.zst differ diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000000.crc b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..f75eb23364 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"97c7c862-efa8-499f-ad8b-a2cc7de3b384","tableSizeBytes":0,"numFiles":0,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[0,0,0,0,0,0,0,0,0,0]},"allFiles":[]} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000000.json b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..e0428732ee --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1774385170444,"operation":"CREATE 
TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.enableDeletionVectors\":\"true\",\"delta.checkpoint.writeStatsAsStruct\":\"true\",\"delta.checkpointPolicy\":\"classic\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"10\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"97c7c862-efa8-499f-ad8b-a2cc7de3b384"}} +{"metaData":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]}} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000001.crc b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..68dc8f5e73 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ +{"txnId":"0eb5d6cd-b9d6-44d6-b958-b559d825d310","tableSizeBytes":761,"numFiles":1,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":
[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[761,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[1,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385171192,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385171192000","MIN_INSERTION_TIME":"1774385171192000","MAX_INSERTION_TIME":"1774385171192000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000001.json b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..36488f6b0e --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385171301,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"0eb5d6cd-b9d6-44d6-b958-b559d825d310"}} +{"add":{"path":"test%25file%25prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385171192,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385171192000","MIN_INSERTION_TIME":"1774385171192000","MAX_INSERTION_TIME":"1774385171192000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000002.crc b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..4a929422b3 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ 
+{"txnId":"68c75a15-27e0-4e2f-904e-41fb4cc8607e","tableSizeBytes":1522,"numFiles":2,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1522,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[2,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385171192,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385171192000","MIN_INSERTION_TIME":"1774385171192000","MAX_INSERTION_TIME":"1774385171192000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385172140,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385172140000","MIN_INSERTION_TIME":"1774385172140000","MAX_INSERTION_TIME":"1774385172140000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000002.json 
b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..852f871b7b --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385172250,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"68c75a15-27e0-4e2f-904e-41fb4cc8607e"}} +{"add":{"path":"test%25file%25prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385172140,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385172140000","MIN_INSERTION_TIME":"1774385172140000","MAX_INSERTION_TIME":"1774385172140000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000003.crc b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..3df2b4f89d --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ +{"txnId":"fe119ce8-b73f-43d7-9739-aa410867a8e3","tableSizeBytes":2282,"numFiles":3,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[2282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[3,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385173080,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385173080000","MIN_INSERTION_TIME":"1774385173080000","MAX_INSERTION_TIME":"1774385173080000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385171192,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385171192000","MIN_INSERTION_TIME":"1774385171192000","MAX_INSERTION_TIME":"1774385171192000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385172140,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385172140000","MIN_INSERTION_TIME":"1774385172140000","MAX_INSERTION_TIME":"1774385172140000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000003.json b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..386c947989 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385173188,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"760"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"fe119ce8-b73f-43d7-9739-aa410867a8e3"}} +{"add":{"path":"test%25file%25prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385173080,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385173080000","MIN_INSERTION_TIME":"1774385173080000","MAX_INSERTION_TIME":"1774385173080000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000004.crc 
b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000004.crc new file mode 100644 index 0000000000..981d7df1f6 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000004.crc @@ -0,0 +1 @@ +{"txnId":"2b6eec86-6211-42fc-8574-7e3687c1c84c","tableSizeBytes":3043,"numFiles":4,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3043,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[4,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-1f06e739-2467-49d8-bec1-dffa9c6ca027-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385174004,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385174004000","MIN_INSERTION_TIME":"1774385174004000","MAX_INSERTION_TIME":"1774385174004000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385173080,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","
tags":{"INSERTION_TIME":"1774385173080000","MIN_INSERTION_TIME":"1774385173080000","MAX_INSERTION_TIME":"1774385173080000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385171192,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385171192000","MIN_INSERTION_TIME":"1774385171192000","MAX_INSERTION_TIME":"1774385171192000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385172140,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385172140000","MIN_INSERTION_TIME":"1774385172140000","MAX_INSERTION_TIME":"1774385172140000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000004.json b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..3cb9a69add --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385174111,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"2b6eec86-6211-42fc-8574-7e3687c1c84c"}} +{"add":{"path":"test%25file%25prefix-part-00000-1f06e739-2467-49d8-bec1-dffa9c6ca027-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385174004,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385174004000","MIN_INSERTION_TIME":"1774385174004000","MAX_INSERTION_TIME":"1774385174004000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.checkpoint.parquet b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.checkpoint.parquet new file mode 100644 index 0000000000..7d3be34c06 Binary files /dev/null and b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.checkpoint.parquet differ diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.crc b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.crc new file mode 100644 index 0000000000..57202f88ca --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.crc @@ -0,0 +1 @@ 
+{"txnId":"4e570e60-4733-42cf-82ed-285f8fd4f44d","tableSizeBytes":3804,"numFiles":5,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"23b172f9-95c3-43d0-8c0c-423bb644526a","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"classic","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385170349},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3804,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[5,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-1f06e739-2467-49d8-bec1-dffa9c6ca027-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385174004,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385174004000","MIN_INSERTION_TIME":"1774385174004000","MAX_INSERTION_TIME":"1774385174004000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385173080,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385173080000","MIN_INSERTION_TIME":"1774385173080000","MAX_INSERTION_TIME":"1774385173080000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet","partitio
nValues":{},"size":761,"modificationTime":1774385171192,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385171192000","MIN_INSERTION_TIME":"1774385171192000","MAX_INSERTION_TIME":"1774385171192000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-f91cd7b7-b47f-4f76-b052-971c4a313235-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385174928,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385174928000","MIN_INSERTION_TIME":"1774385174928000","MAX_INSERTION_TIME":"1774385174928000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385172140,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385172140000","MIN_INSERTION_TIME":"1774385172140000","MAX_INSERTION_TIME":"1774385172140000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.json b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..b1da73d912 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385175036,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"4e570e60-4733-42cf-82ed-285f8fd4f44d"}} +{"add":{"path":"test%25file%25prefix-part-00000-f91cd7b7-b47f-4f76-b052-971c4a313235-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385174928,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385174928000","MIN_INSERTION_TIME":"1774385174928000","MAX_INSERTION_TIME":"1774385174928000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/_last_checkpoint b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..7bc3ed76f2 --- /dev/null +++ b/kernel/tests/data/v2-classic-parquet-struct-stats-only/_delta_log/_last_checkpoint @@ -0,0 +1 @@ 
+{"version":5,"size":7,"sizeInBytes":20434,"numOfAddFiles":5,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}},{"name":"stats_parsed","type":{"type":"struct","fields":[{"name":"numRecords","type":"long","nullable":true,"metadata":{}},{"name":"minValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"maxValues","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"nullCount","type":{"type":"struct","fields":[{"name":"id","type":"long","nullable":true,"metadata":{}},{"name":"value","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"tightBounds","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullab
le":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"checkpointFiles":[{"path":"00000000000000000005.checkpoint.parquet","length":20434,"isDir":false,"modificationTime":1774385175900}],"checksum":"c4d2f11ca6ccd2a86dfc0c2ea42087a7"} diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-1f06e739-2467-49d8-bec1-dffa9c6ca027-c000.snappy.parquet b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-1f06e739-2467-49d8-bec1-dffa9c6ca027-c000.snappy.parquet new file mode 100644 index 0000000000..bfc9dcfa44 Binary files /dev/null and b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-1f06e739-2467-49d8-bec1-dffa9c6ca027-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet new file mode 100644 index 0000000000..e23848797a Binary files /dev/null and b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-981d116a-6a3c-4c09-b604-3695d0db5dc8-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet new file mode 100644 index 0000000000..9713b2d963 Binary 
files /dev/null and b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-bb4858c3-3c9a-4325-ab3f-a0e07e6a94d4-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet new file mode 100644 index 0000000000..b6aabb8552 Binary files /dev/null and b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-e2cba303-0dc3-4091-87cb-ce11d0447bda-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-f91cd7b7-b47f-4f76-b052-971c4a313235-c000.snappy.parquet b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-f91cd7b7-b47f-4f76-b052-971c4a313235-c000.snappy.parquet new file mode 100644 index 0000000000..1f8980ad29 Binary files /dev/null and b/kernel/tests/data/v2-classic-parquet-struct-stats-only/test%file%prefix-part-00000-f91cd7b7-b47f-4f76-b052-971c4a313235-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000000.crc b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..4996568a8f --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ +{"txnId":"d2627053-64d8-40cc-ade5-bc8eae64b0f6","tableSizeBytes":0,"numFiles":0,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[0,0,0,0,0,0,0,0,0,0]},"allFiles":[]} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000000.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..689bbb5494 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1774385164257,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.enableDeletionVectors\":\"true\",\"delta.checkpoint.writeStatsAsStruct\":\"true\",\"delta.checkpointPolicy\":\"v2\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"10\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"d2627053-64d8-40cc-ade5-bc8eae64b0f6"}} +{"metaData":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000001.crc b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..999f0687e9 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ 
+{"txnId":"f184b35c-f967-4c4f-91f9-b78e70bbdd20","tableSizeBytes":761,"numFiles":1,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[761,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[1,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385165060,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385165060000","MIN_INSERTION_TIME":"1774385165060000","MAX_INSERTION_TIME":"1774385165060000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000001.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..0219121b96 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ 
+{"commitInfo":{"timestamp":1774385165182,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"f184b35c-f967-4c4f-91f9-b78e70bbdd20"}} +{"add":{"path":"test%25file%25prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385165060,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385165060000","MIN_INSERTION_TIME":"1774385165060000","MAX_INSERTION_TIME":"1774385165060000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000002.crc b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..78e9478dd3 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ +{"txnId":"f91439b4-5a79-4fc9-8550-e03818be0ef7","tableSizeBytes":1522,"numFiles":2,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1522,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[2,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385166068,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385166068000","MIN_INSERTION_TIME":"1774385166068000","MAX_INSERTION_TIME":"1774385166068000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385165060,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385165060000","MIN_INSERTION_TIME":"1774385165060000","MAX_INSERTION_TIME":"1774385165060000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000002.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..17c0541a81 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385166186,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"f91439b4-5a79-4fc9-8550-e03818be0ef7"}} +{"add":{"path":"test%25file%25prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385166068,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385166068000","MIN_INSERTION_TIME":"1774385166068000","MAX_INSERTION_TIME":"1774385166068000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000003.crc b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..982f3e145d --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ 
+{"txnId":"e6971bdb-020d-4964-8d37-eff3c426b4bf","tableSizeBytes":2282,"numFiles":3,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[2282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[3,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385166068,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385166068000","MIN_INSERTION_TIME":"1774385166068000","MAX_INSERTION_TIME":"1774385166068000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385165060,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385165060000","MIN_INSERTION_TIME":"1774385165060000","MAX_INSERTION_TIME":"1774385165060000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.
snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385167032,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385167032000","MIN_INSERTION_TIME":"1774385167032000","MAX_INSERTION_TIME":"1774385167032000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000003.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..3c86b8921a --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385167154,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"760"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"e6971bdb-020d-4964-8d37-eff3c426b4bf"}} +{"add":{"path":"test%25file%25prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385167032,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385167032000","MIN_INSERTION_TIME":"1774385167032000","MAX_INSERTION_TIME":"1774385167032000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000004.crc b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000004.crc new file mode 100644 index 0000000000..ac5da9aa8f --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000004.crc @@ -0,0 +1 @@ 
+{"txnId":"82b91af6-eee8-4837-ab86-6b5becab5e6f","tableSizeBytes":3043,"numFiles":4,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3043,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[4,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385166068,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385166068000","MIN_INSERTION_TIME":"1774385166068000","MAX_INSERTION_TIME":"1774385166068000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-c89fe644-eade-4db6-b00f-6c5e1f8343ea-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385168156,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385168156000","MIN_INSERTION_TIME":"1774385168156000","MAX_INSERTION_TIME":"1774385168156000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.
snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385165060,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385165060000","MIN_INSERTION_TIME":"1774385165060000","MAX_INSERTION_TIME":"1774385165060000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385167032,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385167032000","MIN_INSERTION_TIME":"1774385167032000","MAX_INSERTION_TIME":"1774385167032000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000004.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..803f4232cf --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385168305,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"82b91af6-eee8-4837-ab86-6b5becab5e6f"}} +{"add":{"path":"test%25file%25prefix-part-00000-c89fe644-eade-4db6-b00f-6c5e1f8343ea-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385168156,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385168156000","MIN_INSERTION_TIME":"1774385168156000","MAX_INSERTION_TIME":"1774385168156000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.checkpoint.99cabe18-f541-4a52-b8fb-3f488d113032.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.checkpoint.99cabe18-f541-4a52-b8fb-3f488d113032.json new file mode 100644 index 0000000000..6b3fdc11dd --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.checkpoint.99cabe18-f541-4a52-b8fb-3f488d113032.json @@ -0,0 +1,4 @@ 
+{"checkpointMetadata":{"version":5,"tags":{"sidecarNumActions":"5","sidecarSizeInBytes":"12599","numOfAddFiles":"5","sidecarFileSchema":"{\"type\":\"struct\",\"fields\":[{\"name\":\"add\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionValues\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"modificationTime\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataChange\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tags\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVector\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"storageType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"pathOrInlineDv\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"offset\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sizeInBytes\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cardinality\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxRowIndex\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"baseRowId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"defaultRowCommitVersion\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clusteringProvider\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"stats_parsed\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"numRecords\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"minValues\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"maxValues\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"nullCount\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"tightBounds\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"remove\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionTimestamp\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataChange\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"extendedFileMetadata\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionValues\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVector\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"storageType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\
"pathOrInlineDv\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"offset\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sizeInBytes\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cardinality\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxRowIndex\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"baseRowId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"defaultRowCommitVersion\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}"}}} +{"sidecar":{"path":"00000000000000000005.checkpoint.0000000001.0000000001.aec23d5c-e86d-4012-adcc-d4f08ad67230.parquet","sizeInBytes":12599,"modificationTime":1774385170196}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]}} +{"metaData":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.crc b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.crc new file mode 100644 index 0000000000..c8d7cd40f5 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.crc @@ -0,0 +1 @@ 
+{"txnId":"0be3928d-1ddf-4a2b-aca5-d0ddc96fcf42","tableSizeBytes":3804,"numFiles":5,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3804,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[5,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385166068,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385166068000","MIN_INSERTION_TIME":"1774385166068000","MAX_INSERTION_TIME":"1774385166068000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-c89fe644-eade-4db6-b00f-6c5e1f8343ea-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385168156,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385168156000","MIN_INSERTION_TIME":"1774385168156000","MAX_INSERTION_TIME":"1774385168156000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.
snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385165060,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385165060000","MIN_INSERTION_TIME":"1774385165060000","MAX_INSERTION_TIME":"1774385165060000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-e4fcdaa7-dff0-4c20-b263-fc29f1cba12c-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385169176,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385169176000","MIN_INSERTION_TIME":"1774385169176000","MAX_INSERTION_TIME":"1774385169176000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385167032,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385167032000","MIN_INSERTION_TIME":"1774385167032000","MAX_INSERTION_TIME":"1774385167032000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.json b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..ce26721926 --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385169289,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"0be3928d-1ddf-4a2b-aca5-d0ddc96fcf42"}} +{"add":{"path":"test%25file%25prefix-part-00000-e4fcdaa7-dff0-4c20-b263-fc29f1cba12c-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385169176,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385169176000","MIN_INSERTION_TIME":"1774385169176000","MAX_INSERTION_TIME":"1774385169176000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/_last_checkpoint b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..000935edfb --- /dev/null +++ b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/_last_checkpoint @@ -0,0 +1 @@ 
+{"version":5,"size":9,"sizeInBytes":17710,"numOfAddFiles":5,"v2Checkpoint":{"path":"00000000000000000005.checkpoint.99cabe18-f541-4a52-b8fb-3f488d113032.json","sizeInBytes":5111,"modificationTime":1774385170232,"nonFileActions":[{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]}},{"metaData":{"id":"aab52b7c-387a-4e1b-b804-4140a460bfd6","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385164156}},{"checkpointMetadata":{"version":5,"tags":{"sidecarNumActions":"5","sidecarSizeInBytes":"12599","numOfAddFiles":"5","sidecarFileSchema":"{\"type\":\"struct\",\"fields\":[{\"name\":\"add\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionValues\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"modificationTime\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataChange\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tags\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVector\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"storageType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"pathOrInlineDv\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"offset\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sizeInBytes\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cardinality\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxRowIndex\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"baseRowId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"defaultRowCommitVersion\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clusteringProvider\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"stats_parsed\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"numRecords\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"minValues\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"maxValues\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"nullCount\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":tr
ue,\"metadata\":{}},{\"name\":\"tightBounds\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"remove\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionTimestamp\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataChange\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"extendedFileMetadata\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionValues\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVector\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"storageType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"pathOrInlineDv\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"offset\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sizeInBytes\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cardinality\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxRowIndex\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"baseRowId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"defaultRowCommitVersion\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}"}}}],"sidecarFiles":[{"path":"00000000000000000005.checkpoint.0000000001.0000000001.aec23d5c-e86d-4012-adcc-d4f08ad67230.parquet","sizeInBytes":12599,"modificationTime":1774385170196}]},"checksum":"eb36d5d7ee6a2d503cbfd76b2f116b2e"} diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/_sidecars/00000000000000000005.checkpoint.0000000001.0000000001.aec23d5c-e86d-4012-adcc-d4f08ad67230.parquet b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/_sidecars/00000000000000000005.checkpoint.0000000001.0000000001.aec23d5c-e86d-4012-adcc-d4f08ad67230.parquet new file mode 100644 index 0000000000..8507e8d4a7 Binary files /dev/null and b/kernel/tests/data/v2-json-sidecars-struct-stats-only/_delta_log/_sidecars/00000000000000000005.checkpoint.0000000001.0000000001.aec23d5c-e86d-4012-adcc-d4f08ad67230.parquet differ diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet new file mode 100644 index 0000000000..b6aabb8552 Binary files /dev/null and b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-4c7c7cd8-3a62-4e66-b3b9-e89f44612e30-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.snappy.parquet b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.snappy.parquet new file mode 100644 index 0000000000..9713b2d963 Binary files /dev/null and b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-68f487f2-d7f8-48d8-8b34-4a567dc046f4-c000.snappy.parquet differ diff --git 
a/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet new file mode 100644 index 0000000000..e23848797a Binary files /dev/null and b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-b93d3d1f-a7ec-42f2-ade5-27837538abcd-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-c89fe644-eade-4db6-b00f-6c5e1f8343ea-c000.snappy.parquet b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-c89fe644-eade-4db6-b00f-6c5e1f8343ea-c000.snappy.parquet new file mode 100644 index 0000000000..bfc9dcfa44 Binary files /dev/null and b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-c89fe644-eade-4db6-b00f-6c5e1f8343ea-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-e4fcdaa7-dff0-4c20-b263-fc29f1cba12c-c000.snappy.parquet b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-e4fcdaa7-dff0-4c20-b263-fc29f1cba12c-c000.snappy.parquet new file mode 100644 index 0000000000..1f8980ad29 Binary files /dev/null and b/kernel/tests/data/v2-json-sidecars-struct-stats-only/test%file%prefix-part-00000-e4fcdaa7-dff0-4c20-b263-fc29f1cba12c-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000000.crc b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000000.crc new file mode 100644 index 0000000000..ad431e43f1 --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000000.crc @@ -0,0 +1 @@ 
+{"txnId":"d029b8be-4b6c-40be-8c98-051174082a0f","tableSizeBytes":0,"numFiles":0,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[0,0,0,0,0,0,0,0,0,0]},"allFiles":[]} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000000.json b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..d48e73bd6e --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1774385157417,"operation":"CREATE TABLE","operationParameters":{"partitionBy":"[]","clusterBy":"[]","description":null,"isManaged":"false","properties":"{\"delta.enableDeletionVectors\":\"true\",\"delta.checkpoint.writeStatsAsStruct\":\"true\",\"delta.checkpointPolicy\":\"v2\",\"delta.checkpoint.writeStatsAsJson\":\"false\",\"delta.checkpointInterval\":\"10\"}","statsOnLoad":false},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"tags":{"restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"d029b8be-4b6c-40be-8c98-051174082a0f"}} 
+{"metaData":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]}} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000001.crc b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000001.crc new file mode 100644 index 0000000000..52bd9eeeb8 --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000001.crc @@ -0,0 +1 @@ +{"txnId":"508beb32-0b1f-4e43-aea3-95dd221a0ece","tableSizeBytes":761,"numFiles":1,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[761,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[1,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%
25file%25prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385158244,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385158244000","MIN_INSERTION_TIME":"1774385158244000","MAX_INSERTION_TIME":"1774385158244000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000001.json b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..3ac90fab5f --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385158387,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"508beb32-0b1f-4e43-aea3-95dd221a0ece"}} +{"add":{"path":"test%25file%25prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385158244,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385158244000","MIN_INSERTION_TIME":"1774385158244000","MAX_INSERTION_TIME":"1774385158244000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000002.crc b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000002.crc new file mode 100644 index 0000000000..442510ddbb --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000002.crc @@ -0,0 +1 @@ 
+{"txnId":"4d25d6cc-be1d-47bf-b39c-035cdbbf3cec","tableSizeBytes":1522,"numFiles":2,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[1522,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[2,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385158244,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385158244000","MIN_INSERTION_TIME":"1774385158244000","MAX_INSERTION_TIME":"1774385158244000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385159272,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385159272000","MIN_INSERTION_TIME":"1774385159272000","MAX_INSERTION_TIME":"1774385159272000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git 
a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000002.json b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..325e641bae --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385159401,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":1,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"4d25d6cc-be1d-47bf-b39c-035cdbbf3cec"}} +{"add":{"path":"test%25file%25prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385159272,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385159272000","MIN_INSERTION_TIME":"1774385159272000","MAX_INSERTION_TIME":"1774385159272000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000003.crc b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000003.crc new file mode 100644 index 0000000000..5fa5dbc61e --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000003.crc @@ -0,0 +1 @@ +{"txnId":"767c1612-d46b-4d50-b16d-7aa2d0a55e8b","tableSizeBytes":2282,"numFiles":3,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,42949672
96,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[2282,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[3,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385158244,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385158244000","MIN_INSERTION_TIME":"1774385158244000","MAX_INSERTION_TIME":"1774385158244000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385160264,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385160264000","MIN_INSERTION_TIME":"1774385160264000","MAX_INSERTION_TIME":"1774385160264000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385159272,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385159272000","MIN_INSERTION_TIME":"1774385159272000","MAX_INSERTION_TIME":"1774385159272000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000003.json b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..03557b82ac --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385160385,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":2,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"760"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"767c1612-d46b-4d50-b16d-7aa2d0a55e8b"}} +{"add":{"path":"test%25file%25prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385160264,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385160264000","MIN_INSERTION_TIME":"1774385160264000","MAX_INSERTION_TIME":"1774385160264000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git 
a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000004.crc b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000004.crc new file mode 100644 index 0000000000..8d072b8526 --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000004.crc @@ -0,0 +1 @@ +{"txnId":"23458c90-e99f-4f42-bcaa-3a00fab3a35d","tableSizeBytes":3043,"numFiles":4,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3043,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[4,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-ea8ba2ac-45e6-497c-99e5-ad5b5e78920e-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385161268,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385161268000","MIN_INSERTION_TIME":"1774385161268000","MAX_INSERTION_TIME":"1774385161268000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385158244,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":
\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385158244000","MIN_INSERTION_TIME":"1774385158244000","MAX_INSERTION_TIME":"1774385158244000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385160264,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385160264000","MIN_INSERTION_TIME":"1774385160264000","MAX_INSERTION_TIME":"1774385160264000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385159272,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385159272000","MIN_INSERTION_TIME":"1774385159272000","MAX_INSERTION_TIME":"1774385159272000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000004.json b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..3e4a42d40a --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385161395,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":3,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"23458c90-e99f-4f42-bcaa-3a00fab3a35d"}} +{"add":{"path":"test%25file%25prefix-part-00000-ea8ba2ac-45e6-497c-99e5-ad5b5e78920e-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385161268,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385161268000","MIN_INSERTION_TIME":"1774385161268000","MAX_INSERTION_TIME":"1774385161268000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.checkpoint.f376503f-80c5-44c4-a353-a741181e8197.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.checkpoint.f376503f-80c5-44c4-a353-a741181e8197.parquet new file mode 100644 index 0000000000..ccef7737ad Binary files /dev/null and b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.checkpoint.f376503f-80c5-44c4-a353-a741181e8197.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.crc b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.crc new file mode 100644 index 0000000000..e6df7f936b --- /dev/null +++ 
b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.crc @@ -0,0 +1 @@ +{"txnId":"b61e8d13-f21c-44a5-b3e2-73e5b2cf4067","tableSizeBytes":3804,"numFiles":5,"numDeletedRecordsOpt":0,"numDeletionVectorsOpt":0,"numMetadata":1,"numProtocol":1,"setTransactions":[],"domainMetadata":[],"metadata":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314},"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[3804,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"deletedRecordCountsHistogramOpt":{"deletedRecordCounts":[5,0,0,0,0,0,0,0,0,0]},"allFiles":[{"path":"test%25file%25prefix-part-00000-ea8ba2ac-45e6-497c-99e5-ad5b5e78920e-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385161268,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"value\":\"value_4\"},\"maxValues\":{\"id\":4,\"value\":\"value_4\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385161268000","MIN_INSERTION_TIME":"1774385161268000","MAX_INSERTION_TIME":"1774385161268000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385158244,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"value\":\"value_1\"},\"maxValues\":{\"id\":1,\"value\":\"value_1\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385158244000","MIN_INSERTION_TIME":"1774385158244000","MAX_INSERTION_TIME":"1774385158244000","OPTIMIZE_TA
RGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-8a3d46e0-9431-4cbd-b97d-6b62a2be1dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385162256,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385162256000","MIN_INSERTION_TIME":"1774385162256000","MAX_INSERTION_TIME":"1774385162256000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet","partitionValues":{},"size":760,"modificationTime":1774385160264,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"value\":\"value_3\"},\"maxValues\":{\"id\":3,\"value\":\"value_3\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385160264000","MIN_INSERTION_TIME":"1774385160264000","MAX_INSERTION_TIME":"1774385160264000","OPTIMIZE_TARGET_SIZE":"268435456"}},{"path":"test%25file%25prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385159272,"dataChange":false,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"value\":\"value_2\"},\"maxValues\":{\"id\":2,\"value\":\"value_2\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385159272000","MIN_INSERTION_TIME":"1774385159272000","MAX_INSERTION_TIME":"1774385159272000","OPTIMIZE_TARGET_SIZE":"268435456"}}]} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.json b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..abc47ddb65 --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1774385162370,"operation":"WRITE","operationParameters":{"mode":"Append","statsOnLoad":false,"partitionBy":"[]"},"readVersion":4,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"761"},"tags":{"noRowsCopied":"true","restoresDeletedRows":"false"},"engineInfo":"Databricks-Runtime/","txnId":"b61e8d13-f21c-44a5-b3e2-73e5b2cf4067"}} +{"add":{"path":"test%25file%25prefix-part-00000-8a3d46e0-9431-4cbd-b97d-6b62a2be1dc8-c000.snappy.parquet","partitionValues":{},"size":761,"modificationTime":1774385162256,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"value\":\"value_5\"},\"maxValues\":{\"id\":5,\"value\":\"value_5\"},\"nullCount\":{\"id\":0,\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1774385162256000","MIN_INSERTION_TIME":"1774385162256000","MAX_INSERTION_TIME":"1774385162256000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/_last_checkpoint b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/_last_checkpoint new file mode 100644 index 0000000000..03ff778cfb --- /dev/null +++ b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/_last_checkpoint @@ -0,0 +1 @@ 
+{"version":5,"size":9,"sizeInBytes":37325,"numOfAddFiles":5,"checkpointSchema":{"type":"struct","fields":[{"name":"txn","type":{"type":"struct","fields":[{"name":"appId","type":"string","nullable":true,"metadata":{}},{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"lastUpdated","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"add","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"stats","type":"string","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"clusteringProvider","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"remove","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"deletionTimestamp","type":"long","nullable":true,"metadata":{}},{"name":"dataChange","type":"boolean","nullable":true,"metadata":{}},{"name":"extendedFileMetadata","type":"boolean","nullable":true,"metadata":{}},{"name":"partitionValues","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"size","type":"long","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"deletionVector","type":{"type":"struct","fields":[{"name":"storageType","type":"string","nullable":true,"metadata":{}},{"name":"pathOrInlineDv","type":"string","nullable":true,"metadata":{}},{"name":"offset","type":"integer","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"integer","nullable":true,"metadata":{}},{"name":"cardinality","type":"long","nullable":true,"metadata":{}},{"name":"maxRowIndex","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"baseRowId","type":"long","nullable":true,"metadata":{}},{"name":"defaultRowCommitVersion","type":"long","nullable":true,"metadata":{}},{"name":"stats","type":"string","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"metaData","type":{"type":"struct","fields":[{"name":"id","type":"string","nullable":true,"metadata":{}},{"name":"name","type":"string","nullable":true,"metadata":{}},{"name":"description","type":"string","nullable":true,"metadata":{}},{"name":"format","type":{"type":"struct","fields":[{"name":"provider","type":"string","nullable":true,"metadata
":{}},{"name":"options","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"schemaString","type":"string","nullable":true,"metadata":{}},{"name":"partitionColumns","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"configuration","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}},{"name":"createdTime","type":"long","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"protocol","type":{"type":"struct","fields":[{"name":"minReaderVersion","type":"integer","nullable":true,"metadata":{}},{"name":"minWriterVersion","type":"integer","nullable":true,"metadata":{}},{"name":"readerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}},{"name":"writerFeatures","type":{"type":"array","elementType":"string","containsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"domainMetadata","type":{"type":"struct","fields":[{"name":"domain","type":"string","nullable":true,"metadata":{}},{"name":"configuration","type":"string","nullable":true,"metadata":{}},{"name":"removed","type":"boolean","nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"checkpointMetadata","type":{"type":"struct","fields":[{"name":"version","type":"long","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}},{"name":"sidecar","type":{"type":"struct","fields":[{"name":"path","type":"string","nullable":true,"metadata":{}},{"name":"sizeInBytes","type":"long","nullable":true,"metadata":{}},{"name":"modificationTime","type":"long","nullable":true,"metadata":{}},{"name":"tags","type":{"type":"map","keyType":"string","valueType":"string","valueContainsNull":true},"nullable":true,"metadata":{}}]},"nullable":true,"metadata":{}}]},"v2Checkpoint":{"path":"00000000000000000005.checkpoint.f376503f-80c5-44c4-a353-a741181e8197.parquet","sizeInBytes":24737,"modificationTime":1774385163992,"nonFileActions":[{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors","v2Checkpoint"],"writerFeatures":["deletionVectors","v2Checkpoint","appendOnly","invariants"]}},{"metaData":{"id":"69f7cd6b-63b4-4e31-85b1-450ac0a9f70e","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.checkpoint.writeStatsAsStruct":"true","delta.checkpointPolicy":"v2","delta.checkpoint.writeStatsAsJson":"false","delta.checkpointInterval":"10"},"createdTime":1774385157314}},{"checkpointMetadata":{"version":5,"tags":{"sidecarNumActions":"5","sidecarSizeInBytes":"12588","numOfAddFiles":"5","sidecarFileSchema":"{\"type\":\"struct\",\"fields\":[{\"name\":\"add\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionValues\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":true,\"metadata\
":{}},{\"name\":\"modificationTime\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataChange\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"tags\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVector\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"storageType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"pathOrInlineDv\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"offset\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sizeInBytes\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cardinality\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxRowIndex\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"baseRowId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"defaultRowCommitVersion\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"clusteringProvider\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"stats_parsed\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"numRecords\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"minValues\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"maxValues\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"nullCount\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"value\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"tightBounds\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"remove\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"path\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionTimestamp\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"dataChange\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"extendedFileMetadata\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionValues\",\"type\":{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVector\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"storageType\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"pathOrInlineDv\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"offset\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sizeInBytes\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cardinality\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxRowIndex\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"baseRowId\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"defaultRowCommitVersion
\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}}]}"}}}],"sidecarFiles":[{"path":"00000000000000000005.checkpoint.0000000001.0000000001.014cb627-30e0-46dd-a539-09d3247e9b7d.parquet","sizeInBytes":12588,"modificationTime":1774385163552}]},"checksum":"616f829d2bc1e5b6569609a6155cbf04"} diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/_sidecars/00000000000000000005.checkpoint.0000000001.0000000001.014cb627-30e0-46dd-a539-09d3247e9b7d.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/_sidecars/00000000000000000005.checkpoint.0000000001.0000000001.014cb627-30e0-46dd-a539-09d3247e9b7d.parquet new file mode 100644 index 0000000000..2630b09108 Binary files /dev/null and b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/_delta_log/_sidecars/00000000000000000005.checkpoint.0000000001.0000000001.014cb627-30e0-46dd-a539-09d3247e9b7d.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet new file mode 100644 index 0000000000..b6aabb8552 Binary files /dev/null and b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-2ce1aba3-6914-48d4-b2b3-9cc25d61b211-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet new file mode 100644 index 0000000000..9713b2d963 Binary files /dev/null and b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-7268d56a-7420-4691-8062-5b0c3dbdd803-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-8a3d46e0-9431-4cbd-b97d-6b62a2be1dc8-c000.snappy.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-8a3d46e0-9431-4cbd-b97d-6b62a2be1dc8-c000.snappy.parquet new file mode 100644 index 0000000000..1f8980ad29 Binary files /dev/null and b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-8a3d46e0-9431-4cbd-b97d-6b62a2be1dc8-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet new file mode 100644 index 0000000000..e23848797a Binary files /dev/null and b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-ba9dae5e-b947-4940-961f-6cf6a3c2fbfe-c000.snappy.parquet differ diff --git a/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-ea8ba2ac-45e6-497c-99e5-ad5b5e78920e-c000.snappy.parquet b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-ea8ba2ac-45e6-497c-99e5-ad5b5e78920e-c000.snappy.parquet new file mode 100644 index 0000000000..bfc9dcfa44 Binary files /dev/null and 
b/kernel/tests/data/v2-parquet-sidecars-struct-stats-only/test%file%prefix-part-00000-ea8ba2ac-45e6-497c-99e5-ad5b5e78920e-c000.snappy.parquet differ diff --git a/kernel/tests/dv.rs b/kernel/tests/dv.rs index a8def7b29e..35a3980f1d 100644 --- a/kernel/tests/dv.rs +++ b/kernel/tests/dv.rs @@ -1,57 +1,386 @@ //! Read a small table with/without deletion vectors. //! Must run at the root of the crate +use std::collections::HashMap; use std::ops::Add; use std::path::PathBuf; +use std::sync::Arc; -use delta_kernel::engine::default::DefaultEngine; -use delta_kernel::scan::ScanResult; -use delta_kernel::{DeltaResult, Snapshot}; -use test_utils::DefaultEngineExtension; +use delta_kernel::actions::deletion_vector_writer::{ + KernelDeletionVector, StreamingDeletionVectorWriter, +}; +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::engine_data::FilteredEngineData; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::transaction::CommitResult; +use delta_kernel::{DeltaResult, EngineData, Snapshot}; +use tempfile::tempdir; +use test_utils::{ + create_add_files_metadata, create_table, engine_store_setup, generate_batch, into_record_batch, + record_batch_to_bytes, IntoArray, +}; use itertools::Itertools; -use test_log::test; + +/// Helper to write a parquet file with the given data to the table. +/// Returns the file path (relative to table root) that was written. +async fn write_parquet_file( + store: &Arc, + table_url: &url::Url, + file_suffix: &str, + data: &delta_kernel::arrow::record_batch::RecordBatch, +) -> Result<(String, usize), Box> { + use delta_kernel::object_store::path::Path as ObjectStorePath; + + let parquet_data = record_batch_to_bytes(data); + let parquet_data_len = parquet_data.len(); + let data_file_path = format!("data_file_{file_suffix}.parquet"); + + // Construct the full object store path for the parquet file + let data_url = table_url.join(&data_file_path)?; + let data_object_path = ObjectStorePath::from_url_path(data_url.path())?; + store.put(&data_object_path, parquet_data.into()).await?; + + Ok((data_file_path, parquet_data_len)) +} fn count_total_scan_rows( - scan_result_iter: impl Iterator>, + scan_result_iter: impl Iterator>>, ) -> DeltaResult { scan_result_iter - .map(|scan_result| { - let scan_result = scan_result?; - // NOTE: The mask only suppresses rows for which it is both present and false. 
- let mask = scan_result.raw_mask(); - let deleted_rows = mask.into_iter().flatten().filter(|&&m| !m).count(); - let data = scan_result.raw_data?; - Ok(data.len() - deleted_rows) - }) + .map(|result| Ok(result?.len())) .fold_ok(0, Add::add) } -#[test] -fn dv_table() -> Result<(), Box> { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/"))?; +#[test_log::test(rstest::rstest)] +#[case::with_dv("./tests/data/table-with-dv-small/", 8)] +#[case::without_dv("./tests/data/table-without-dv-small/", 10)] +fn test_table_scan( + #[case] table_path: &str, + #[case] expected_rows: usize, +) -> Result<(), Box> { + let path = std::fs::canonicalize(PathBuf::from(table_path))?; let url = url::Url::from_directory_path(path).unwrap(); - let engine = DefaultEngine::new_local(); + let engine = test_utils::create_default_engine(&url)?; let snapshot = Snapshot::builder_for(url).build(engine.as_ref())?; let scan = snapshot.scan_builder().build()?; let stream = scan.execute(engine)?; let total_rows = count_total_scan_rows(stream)?; - assert_eq!(total_rows, 8); + assert_eq!(total_rows, expected_rows); Ok(()) } -#[test] -fn non_dv_table() -> Result<(), Box> { - let path = std::fs::canonicalize(PathBuf::from("./tests/data/table-without-dv-small/"))?; - let url = url::Url::from_directory_path(path).unwrap(); - let engine = DefaultEngine::new_local(); +/// Helper to extract scan files from a snapshot +fn get_scan_files( + snapshot: Arc, + engine: &dyn delta_kernel::Engine, +) -> DeltaResult> { + let scan = snapshot.scan_builder().build()?; + let all_scan_metadata: Vec<_> = scan.scan_metadata(engine)?.collect::, _>>()?; + + Ok(all_scan_metadata + .into_iter() + .map(|sm| sm.scan_files) + .collect()) +} + +/// Helper to get a write context for creating deletion vector paths. +fn get_write_context( + table_url: &url::Url, + engine: &dyn delta_kernel::Engine, +) -> Result> { + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine)?; + let txn = snapshot.transaction(Box::new(FileSystemCommitter::new()), engine)?; + Ok(txn.get_write_context()) +} + +/// Helper to write a deletion vector to object store and return its descriptor. +async fn write_deletion_vector_to_store( + store: &Arc, + write_context: &delta_kernel::transaction::WriteContext, + dv: KernelDeletionVector, + prefix: &str, +) -> Result< + delta_kernel::actions::deletion_vector::DeletionVectorDescriptor, + Box, +> { + use delta_kernel::object_store::path::Path as ObjectStorePath; + + let dv_path = write_context.new_deletion_vector_path(String::from(prefix)); + let dv_absolute_path = dv_path.absolute_path()?; + let dv_object_path = ObjectStorePath::parse(dv_absolute_path.path())?; + + let mut dv_buffer = Vec::new(); + let mut dv_writer = StreamingDeletionVectorWriter::new(&mut dv_buffer); + let dv_write_result = dv_writer.write_deletion_vector(dv)?; + dv_writer.finalize()?; + + store.put(&dv_object_path, dv_buffer.into()).await?; + + Ok(dv_write_result.to_descriptor(&dv_path)) +} + +/// Helper to create a transaction for deletion vector updates. +fn create_dv_update_transaction( + table_url: &url::Url, + engine: &dyn delta_kernel::Engine, +) -> Result> { + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine)?; + Ok(snapshot + .transaction(Box::new(FileSystemCommitter::new()), engine)? + .with_engine_info("test engine") + .with_operation("DELETE".to_string())) +} + +/// Helper to verify that scan results match expected ids and values (after sorting). 
+/// Extracts int32 id column and string value column from batches and compares with expected. +fn verify_sorted_scan_results( + batches: Vec, + expected_ids: Vec, + expected_values: &[&str], +) -> Result<(), Box> { + use delta_kernel::arrow::array::{Array, Int32Array, StringArray}; + + // Extract actual ids and values from batches + let mut actual_ids = Vec::new(); + let mut actual_values = Vec::new(); + + for batch in batches { + let id_col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let val_col = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + actual_ids.push(id_col.value(i)); + actual_values.push(val_col.value(i).to_string()); + } + } + + // Sort and compare ids + actual_ids.sort(); + assert_eq!( + actual_ids, expected_ids, + "IDs should match expected non-deleted rows" + ); + + // Sort and compare values + actual_values.sort(); + let mut expected_values_sorted = expected_values + .iter() + .map(|s| s.to_string()) + .collect::>(); + expected_values_sorted.sort(); + assert_eq!( + actual_values, expected_values_sorted, + "Values should match expected non-deleted rows" + ); + + Ok(()) +} + +/// End-to-end test that: +/// 1. Creates a table with deletion vector support +/// 2. Writes a parquet file with actual data rows +/// 3. Creates deletion vectors marking specific rows as deleted +/// 4. Writes the deletion vectors to a file using StreamingDeletionVectorWriter +/// 5. Commits the deletion vectors in a transaction +/// 6. Verifies that scanning only returns non-deleted rows +#[tokio::test] +async fn test_write_deletion_vectors_end_to_end() -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + // Create a table schema with id and value columns + let schema = Arc::new(StructType::try_new(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])?); + + // Setup table with deletion vector support + let temp_dir = tempdir()?; + let base_url = url::Url::from_directory_path(temp_dir.path()).unwrap(); + let (store, engine, table_url) = engine_store_setup("test_table", Some(&base_url)); + let engine = Arc::new(engine); + + // Create table with DV support (protocol 3/7 with deletionVectors feature) + create_table( + store.clone(), + table_url.clone(), + schema.clone(), + &[], + true, // use_37_protocol + vec!["deletionVectors"], + vec!["deletionVectors"], + ) + .await?; + + // Step 1: Create and write two parquet files + let data_batch_1 = generate_batch(vec![ + ("id", vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array()), + ( + "value", + vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"].into_array(), + ), + ])?; + + let data_batch_2 = generate_batch(vec![ + ( + "id", + vec![10, 11, 12, 13, 14, 15, 16, 17, 18, 19].into_array(), + ), + ( + "value", + vec!["k", "l", "m", "n", "o", "p", "q", "r", "s", "t"].into_array(), + ), + ])?; + + let (data_file_path_1, parquet_data_len_1) = + write_parquet_file(&store, &table_url, "1", &data_batch_1).await?; + let (data_file_path_2, parquet_data_len_2) = + write_parquet_file(&store, &table_url, "2", &data_batch_2).await?; + + // Step 2: Add both files to the table via a transaction + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? 
+ .with_engine_info("test engine") + .with_operation("WRITE".to_string()); + + // Create add file metadata for both files + let add_files_schema = txn.add_files_schema(); + let add_metadata = create_add_files_metadata( + add_files_schema, + vec![ + (&data_file_path_1, parquet_data_len_1 as i64, 1000000, 10), + (&data_file_path_2, parquet_data_len_2 as i64, 1000000, 10), + ], + )?; + + txn.add_files(add_metadata); + let commit_result = txn.commit(engine.as_ref())?; + assert!(matches!( + commit_result, + CommitResult::CommittedTransaction(_) + )); + + // Step 3: Verify we can read all 20 rows before deletion + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; - let snapshot = Snapshot::builder_for(url).build(engine.as_ref())?; let scan = snapshot.scan_builder().build()?; + let stream = scan.execute(engine.clone())?; + let total_rows_before = count_total_scan_rows(stream)?; + assert_eq!(total_rows_before, 20, "Should have 20 rows before deletion"); + + // Step 4: First deletion - Apply DV only to the first file (delete rows 2, 5, and 7) + // Define deletion indexes in one place to avoid duplication + const FILE1_FIRST_DELETE_INDEXES: [u64; 3] = [2, 5, 7]; + const FILE1_SECOND_DELETE_INDEX: u64 = 1; + const FILE2_DELETE_INDEXES: [u64; 2] = [2, 5]; + + let mut dv_file1_first = KernelDeletionVector::new(); + dv_file1_first.add_deleted_row_indexes(FILE1_FIRST_DELETE_INDEXES); + + // Step 5: Get write context and write the first deletion vector to a file + let write_context = get_write_context(&table_url, engine.as_ref())?; + let dv_descriptor_1 = + write_deletion_vector_to_store(&store, &write_context, dv_file1_first, "").await?; + + // Step 6: Update deletion vectors for first file only + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = create_dv_update_transaction(&table_url, engine.as_ref())?; + let scan_files = get_scan_files(snapshot.clone(), engine.as_ref())?; + + let mut dv_map = HashMap::new(); + dv_map.insert(data_file_path_1.clone(), dv_descriptor_1); + + txn.update_deletion_vectors(dv_map, scan_files.into_iter().map(Ok))?; + let commit_result = txn.commit(engine.as_ref())?; + assert!(matches!( + commit_result, + CommitResult::CommittedTransaction(_) + )); + + // Step 9: Verify first deletion - should have 17 rows (7 from file 1 + 10 from file 2) + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let scan = snapshot.scan_builder().build()?; + let stream = scan.execute(engine.clone())?; + + let total_rows_after_first_delete = count_total_scan_rows(stream)?; + assert_eq!( + total_rows_after_first_delete, 17, + "Should have 17 rows after deleting 3 rows from first file" + ); + + // Step 10: Second deletion - Delete row 1 from file 1 and rows 12, 15 from file 2 + let mut dv_file1_second = KernelDeletionVector::new(); + dv_file1_second.add_deleted_row_indexes(FILE1_FIRST_DELETE_INDEXES); // Previous deletions + dv_file1_second.add_deleted_row_indexes([FILE1_SECOND_DELETE_INDEX]); // Additional deletion + + let mut dv_file2 = KernelDeletionVector::new(); + dv_file2.add_deleted_row_indexes(FILE2_DELETE_INDEXES); // Delete rows at indices 2 and 5 (ids 12, 15) + + // Write deletion vectors for both files + let dv_descriptor_1_second = + write_deletion_vector_to_store(&store, &write_context, dv_file1_second, "").await?; + let dv_descriptor_2 = + write_deletion_vector_to_store(&store, &write_context, dv_file2, "").await?; + + // Step 11: Update deletion vectors for both files + let 
snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = create_dv_update_transaction(&table_url, engine.as_ref())?; + + let mut dv_map1 = HashMap::new(); + dv_map1.insert(data_file_path_1.clone(), dv_descriptor_1_second); + let mut dv_map2 = HashMap::new(); + dv_map2.insert(data_file_path_2.clone(), dv_descriptor_2); + + // Test multiple calls + txn.update_deletion_vectors( + dv_map1, + get_scan_files(snapshot.clone(), engine.as_ref())? + .into_iter() + .map(Ok), + )?; + txn.update_deletion_vectors( + dv_map2, + get_scan_files(snapshot.clone(), engine.as_ref())? + .into_iter() + .map(Ok), + )?; + let commit_result = txn.commit(engine.as_ref())?; + assert!(matches!( + commit_result, + CommitResult::CommittedTransaction(_) + )); + + // Step 12: Verify final deletion - should have 14 rows (6 from file 1 + 8 from file 2) + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let scan = snapshot.scan_builder().build()?; + let stream = scan.execute(engine.clone())?; + + // Collect all rows to verify content + let batches: Vec<_> = stream + .map(|result| result.map(into_record_batch)) + .collect::, _>>()?; + + // Verify the correct rows remain + // File 1: all except 1, 2, 5, 7 => 0, 3, 4, 6, 8, 9 + // File 2: all except 12, 15 (indices 2, 5) => 10, 11, 13, 14, 16, 17, 18, 19 + let expected_ids = vec![0, 3, 4, 6, 8, 9, 10, 11, 13, 14, 16, 17, 18, 19]; + let expected_values = [ + "a", "d", "e", "g", "i", "j", "k", "l", "n", "o", "q", "r", "s", "t", + ]; + + // Verify the correct rows remain using helper + verify_sorted_scan_results(batches, expected_ids, &expected_values)?; - let stream = scan.execute(engine)?; - let total_rows = count_total_scan_rows(stream)?; - assert_eq!(total_rows, 10); Ok(()) } diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index 758902e7fc..bdb0f90ccb 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -10,25 +10,25 @@ use delta_kernel::arrow::array::{Array, AsArray, StructArray}; use delta_kernel::arrow::compute::{concat_batches, take}; use delta_kernel::arrow::compute::{lexsort_to_indices, SortColumn}; use delta_kernel::arrow::datatypes::{DataType, FieldRef, Schema}; -use delta_kernel::arrow::{compute::filter_record_batch, record_batch::RecordBatch}; -use delta_kernel::parquet::arrow::async_reader::{ - ParquetObjectReader, ParquetRecordBatchStreamBuilder, -}; - +use delta_kernel::arrow::record_batch::RecordBatch; use delta_kernel::engine::arrow_conversion::TryFromKernel as _; +use delta_kernel::engine::arrow_data::EngineDataArrowExt; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::object_store::{local::LocalFileSystem, ObjectStore}; +use delta_kernel::parquet::arrow::async_reader::{ + ParquetObjectReader, ParquetRecordBatchStreamBuilder, +}; use delta_kernel::{DeltaResult, Snapshot}; use futures::{stream::TryStreamExt, StreamExt}; use itertools::Itertools; -use object_store::{local::LocalFileSystem, ObjectStore}; use paste::paste; use url::Url; mod common; -use test_utils::{load_test_data, to_arrow}; +use test_utils::load_test_data; // NB adapted from DAT: read all parquet files in the directory and concatenate them async fn read_expected(path: &Path) -> DeltaResult { @@ -172,17 +172,7 @@ async fn latest_snapshot_test( let scan = snapshot.scan_builder().build()?; let scan_res = scan.execute(Arc::new(engine))?; let batches: Vec = scan_res - 
.map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch = to_arrow(data)?; - if let Some(mask) = mask { - Ok(filter_record_batch(&record_batch, &mask.into())?) - } else { - Ok(record_batch) - } - }) + .map(EngineDataArrowExt::try_into_record_batch) .try_collect()?; let expected = read_expected(&expected_path.expect("expect an expected dir")).await?; @@ -212,12 +202,8 @@ fn setup_golden_table( let table_path = test_path.join("delta"); let url = delta_kernel::try_parse_uri(table_path.to_str().expect("table path to string")) .expect("table from uri"); - let engine = DefaultEngine::try_new( - &url, - std::iter::empty::<(&str, &str)>(), - Arc::new(TokioBackgroundExecutor::new()), - ) - .unwrap(); + let engine = Arc::try_unwrap(test_utils::create_default_engine(&url).unwrap()) + .expect("Arc should have single reference"); let expected_path = test_path.join("expected"); let expected_path = expected_path.exists().then_some(expected_path); (engine, url, expected_path, test_dir) diff --git a/kernel/tests/hdfs.rs b/kernel/tests/hdfs.rs index da39c9c775..4f991b5ad8 100644 --- a/kernel/tests/hdfs.rs +++ b/kernel/tests/hdfs.rs @@ -7,15 +7,12 @@ // cargo test --features integration-test --test hdfs #![cfg(all(feature = "integration-test", not(target_os = "windows")))] -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; use delta_kernel::Snapshot; use hdfs_native::{Client, WriteOptions}; use hdfs_native_object_store::minidfs::MiniDfs; use std::collections::HashSet; use std::fs; use std::path::Path; -use std::sync::Arc; extern crate walkdir; use walkdir::WalkDir; @@ -52,6 +49,7 @@ async fn write_local_path_to_hdfs( } #[tokio::test] +#[ignore = "Skipping HDFS integration test"] async fn read_table_version_hdfs() -> Result<(), Box> { let minidfs = MiniDfs::with_features(&HashSet::new()); let hdfs_client = Client::default(); @@ -67,13 +65,9 @@ async fn read_table_version_hdfs() -> Result<(), Box> { let url_str = format!("{}/my-delta-table", minidfs.url); let url = url::Url::parse(&url_str).unwrap(); - let engine = DefaultEngine::try_new( - &url, - std::iter::empty::<(&str, &str)>(), - Arc::new(TokioBackgroundExecutor::new()), - )?; + let engine = test_utils::create_default_engine(&url)?; - let snapshot = Snapshot::builder_for(url).build(&engine)?; + let snapshot = Snapshot::builder_for(url).build(engine.as_ref())?; assert_eq!(snapshot.version(), 1); Ok(()) diff --git a/kernel/tests/log_compaction.rs b/kernel/tests/log_compaction.rs index e90607fda7..2a2eaf1cd5 100644 --- a/kernel/tests/log_compaction.rs +++ b/kernel/tests/log_compaction.rs @@ -2,19 +2,19 @@ use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; use delta_kernel::engine::to_json_bytes; +use delta_kernel::object_store::path::Path; +use delta_kernel::object_store::ObjectStore; use delta_kernel::schema::{DataType, StructField, StructType}; use delta_kernel::Snapshot; use test_utils::{create_table, engine_store_setup}; -use object_store::path::Path; -use object_store::ObjectStore; use url::Url; -/// Convert a URL to an object_store::Path +/// Convert a URL to a `delta_kernel::object_store::Path` fn url_to_object_store_path(url: &Url) -> Result> { let path_segments = url .path_segments() - .ok_or_else(|| format!("URL has no path segments: {}", url))?; + .ok_or_else(|| format!("URL has no path segments: {url}"))?; let path_string = 
path_segments.skip(1).collect::>().join("/"); @@ -64,10 +64,9 @@ async fn action_reconciliation_round_trip() -> Result<(), Box Result<(), Box Result<(), Box> .as_i64() .ok_or_else(|| { format!( - "deletionTimestamp should be present in recent remove action: {}", - recent_remove_line + "deletionTimestamp should be present in recent remove action: {recent_remove_line}" ) })?; assert_eq!(actual_deletion_timestamp, recent_timestamp); @@ -402,8 +397,9 @@ async fn expired_tombstone_exclusion() -> Result<(), Box> .count(); assert!( total_actions >= 4, - "Should have at least 4 actions: protocol, metadata, 1 add, 1 remove (recent). Found {}", - total_actions + "Should have at least 4 actions: protocol, metadata, 1 add, 1 remove (recent). Found {total_actions}" ); Ok(()) } + +// TODO: Add e2e test that log compaction contains domain metadtaas (and not tombstoned ones) diff --git a/kernel/tests/log_tail.rs b/kernel/tests/log_tail.rs index c4f602fd43..773fce5792 100644 --- a/kernel/tests/log_tail.rs +++ b/kernel/tests/log_tail.rs @@ -1,11 +1,11 @@ use std::sync::Arc; -use object_store::memory::InMemory; -use object_store::path::Path; use url::Url; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::default::{DefaultEngine, DefaultEngineBuilder}; +use delta_kernel::object_store::memory::InMemory; +use delta_kernel::object_store::path::Path; use delta_kernel::{FileMeta, LogPath, Snapshot}; use test_utils::{ @@ -34,16 +34,14 @@ fn setup_test() -> ( ) { let storage = Arc::new(InMemory::new()); let table_root = Url::parse("memory:///").unwrap(); - let engine = Arc::new(DefaultEngine::new( - storage.clone(), - Arc::new(TokioBackgroundExecutor::new()), - )); + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); (storage, engine, table_root) } #[tokio::test] async fn basic_snapshot_with_log_tail_staged_commits() -> Result<(), Box> { - let (storage, engine, table_root) = setup_test(); + let (storage, engine, table_url) = setup_test(); + let table_root = table_url.as_str(); // with staged commits: // _delta_log/0.json (PM in here) @@ -51,103 +49,121 @@ async fn basic_snapshot_with_log_tail_staged_commits() -> Result<(), Box Result<(), Box Result<(), Box> { - let (storage, engine, table_root) = setup_test(); + let (storage, engine, table_url) = setup_test(); + let table_root = table_url.as_str(); // with normal commits: // _delta_log/0.json // _delta_log/1.json // _delta_log/2.json let actions = vec![TestAction::Metadata]; - add_commit(storage.as_ref(), 0, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 0, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_1.parquet".to_string())]; - add_commit(storage.as_ref(), 1, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 1, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_2.parquet".to_string())]; - add_commit(storage.as_ref(), 2, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 2, actions_to_string(actions)).await?; // Create log_tail for commits 1, 2 let log_tail = vec![ - create_log_path(&table_root, delta_path_for_version(1, "json")), - create_log_path(&table_root, delta_path_for_version(2, "json")), + create_log_path(&table_url, delta_path_for_version(1, "json")), + create_log_path(&table_url, delta_path_for_version(2, "json")), ]; - let snapshot = Snapshot::builder_for(table_root.clone()) + let 
snapshot = Snapshot::builder_for(table_root) .with_log_tail(log_tail) .build(engine.as_ref())?; @@ -184,23 +201,24 @@ async fn basic_snapshot_with_log_tail() -> Result<(), Box #[tokio::test] async fn log_tail_behind_filesystem() -> Result<(), Box> { - let (storage, engine, table_root) = setup_test(); + let (storage, engine, table_url) = setup_test(); + let table_root = table_url.as_str(); // Create commits 0, 1, 2 in storage let actions = vec![TestAction::Metadata]; - add_commit(storage.as_ref(), 0, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 0, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_1.parquet".to_string())]; - add_commit(storage.as_ref(), 1, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 1, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_2.parquet".to_string())]; - add_commit(storage.as_ref(), 2, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 2, actions_to_string(actions)).await?; // log_tail BEHIND file system => must respect log_tail let log_tail = vec![ - create_log_path(&table_root, delta_path_for_version(0, "json")), - create_log_path(&table_root, delta_path_for_version(1, "json")), + create_log_path(&table_url, delta_path_for_version(0, "json")), + create_log_path(&table_url, delta_path_for_version(1, "json")), ]; - let snapshot = Snapshot::builder_for(table_root.clone()) + let snapshot = Snapshot::builder_for(table_root) .with_log_tail(log_tail) .build(engine.as_ref())?; @@ -215,33 +233,36 @@ async fn log_tail_behind_filesystem() -> Result<(), Box> #[tokio::test] async fn incremental_snapshot_with_log_tail() -> Result<(), Box> { - let (storage, engine, table_root) = setup_test(); + let (storage, engine, table_url) = setup_test(); + let table_root = table_url.as_str(); // commits 0, 1, 2 in storage let actions = vec![TestAction::Metadata]; - add_commit(storage.as_ref(), 0, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 0, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_1.parquet".to_string())]; - add_commit(storage.as_ref(), 1, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 1, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_2.parquet".to_string())]; - add_commit(storage.as_ref(), 2, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 2, actions_to_string(actions)).await?; // initial snapshot at version 1 - let initial_snapshot = Snapshot::builder_for(table_root.clone()) + let initial_snapshot = Snapshot::builder_for(table_root) .at_version(1) .build(engine.as_ref())?; assert_eq!(initial_snapshot.version(), 1); // add commit 3, 4 let actions = vec![TestAction::Add("file_3.parquet".to_string())]; - let path3 = add_staged_commit(storage.as_ref(), 3, actions_to_string(actions)).await?; + let path3 = + add_staged_commit(table_root, storage.as_ref(), 3, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_4.parquet".to_string())]; - let path4 = add_staged_commit(storage.as_ref(), 4, actions_to_string(actions)).await?; + let path4 = + add_staged_commit(table_root, storage.as_ref(), 4, actions_to_string(actions)).await?; // log_tail with commits 2, 3, 4 let log_tail = vec![ - create_log_path(&table_root, delta_path_for_version(2, "json")), - create_log_path(&table_root, path3), - create_log_path(&table_root, path4), + create_log_path(&table_url, 
delta_path_for_version(2, "json")), + create_log_path(&table_url, path3), + create_log_path(&table_url, path4), ]; // Build incremental snapshot with log_tail @@ -257,30 +278,31 @@ async fn incremental_snapshot_with_log_tail() -> Result<(), Box Result<(), Box> { - let (storage, engine, table_root) = setup_test(); + let (storage, engine, table_url) = setup_test(); + let table_root = table_url.as_str(); // commits 0, 1, 2, 3, 4 in storage let actions = vec![TestAction::Metadata]; - add_commit(storage.as_ref(), 0, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 0, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_1.parquet".to_string())]; - add_commit(storage.as_ref(), 1, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 1, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_2.parquet".to_string())]; - add_commit(storage.as_ref(), 2, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 2, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_3.parquet".to_string())]; - add_commit(storage.as_ref(), 3, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 3, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_4.parquet".to_string())]; - add_commit(storage.as_ref(), 4, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 4, actions_to_string(actions)).await?; // log tail goes up to version 4 let log_tail = vec![ - create_log_path(&table_root, delta_path_for_version(1, "json")), - create_log_path(&table_root, delta_path_for_version(2, "json")), - create_log_path(&table_root, delta_path_for_version(3, "json")), - create_log_path(&table_root, delta_path_for_version(4, "json")), + create_log_path(&table_url, delta_path_for_version(1, "json")), + create_log_path(&table_url, delta_path_for_version(2, "json")), + create_log_path(&table_url, delta_path_for_version(3, "json")), + create_log_path(&table_url, delta_path_for_version(4, "json")), ]; // user asks for version 3 (or catalog says latest is 3) - let snapshot = Snapshot::builder_for(table_root.clone()) + let snapshot = Snapshot::builder_for(table_root) .at_version(3) .with_log_tail(log_tail) .build(engine.as_ref())?; @@ -292,30 +314,31 @@ async fn log_tail_exceeds_requested_version() -> Result<(), Box Result<(), Box> { - let (storage, engine, table_root) = setup_test(); + let (storage, engine, table_url) = setup_test(); + let table_root = table_url.as_str(); // create commits 0, 1, 2, 3, 4 in storage let actions = vec![TestAction::Metadata]; - add_commit(storage.as_ref(), 0, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 0, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_1.parquet".to_string())]; - add_commit(storage.as_ref(), 1, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 1, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_2.parquet".to_string())]; - add_commit(storage.as_ref(), 2, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 2, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_3.parquet".to_string())]; - add_commit(storage.as_ref(), 3, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 3, actions_to_string(actions)).await?; let actions = vec![TestAction::Add("file_4.parquet".to_string())]; - 
add_commit(storage.as_ref(), 4, actions_to_string(actions)).await?; + add_commit(table_root, storage.as_ref(), 4, actions_to_string(actions)).await?; // Log tail only goes up to version 3 let log_tail = vec![ - create_log_path(&table_root, delta_path_for_version(1, "json")), - create_log_path(&table_root, delta_path_for_version(2, "json")), - create_log_path(&table_root, delta_path_for_version(3, "json")), + create_log_path(&table_url, delta_path_for_version(1, "json")), + create_log_path(&table_url, delta_path_for_version(2, "json")), + create_log_path(&table_url, delta_path_for_version(3, "json")), ]; // User asks for version 4, but log tail only has up to version 3 // This should fail with an error - let result = Snapshot::builder_for(table_root.clone()) + let result = Snapshot::builder_for(table_root) .at_version(4) .with_log_tail(log_tail) .build(engine.as_ref()); diff --git a/kernel/tests/maintenance_ops.rs b/kernel/tests/maintenance_ops.rs new file mode 100644 index 0000000000..200965228b --- /dev/null +++ b/kernel/tests/maintenance_ops.rs @@ -0,0 +1,116 @@ +//! Integration tests for table maintenance operations (checkpoint, checksum). + +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::snapshot::{CheckpointWriteResult, ChecksumWriteResult}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::DeltaResult; +use delta_kernel::Snapshot; +use rstest::rstest; +use test_utils::test_table_setup_mt; + +#[rstest] +#[case::v1_checkpoint(false)] +#[case::v2_checkpoint(true)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_and_checksum_return_updated_snapshots( + #[case] v2_checkpoint: bool, +) -> DeltaResult<()> { + // ===== GIVEN ===== + let (_temp_dir, table_path, engine) = test_table_setup_mt()?; + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + let mut builder = create_table(&table_path, schema, "test_engine"); + if v2_checkpoint { + builder = builder.with_table_properties([("delta.feature.v2Checkpoint", "supported")]); + } + let committed = builder + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())? 
+ .unwrap_committed(); + let snapshot = committed.post_commit_snapshot().unwrap(); + + // Precondition: no checkpoint, no CRC + let seg = snapshot.log_segment(); + assert!(seg.listed.checkpoint_parts.is_empty()); + assert!(seg.checkpoint_version.is_none()); + assert!(seg.listed.latest_crc_file.is_none()); + + // ===== WHEN: we checkpoint ===== + let (_, snapshot_w_ckpt) = snapshot.checkpoint(engine.as_ref())?; + let seg = snapshot_w_ckpt.log_segment(); + + // ===== THEN ===== + // Checkpoint version and parts are set + assert_eq!(seg.checkpoint_version, Some(snapshot.version())); + assert_eq!(seg.listed.checkpoint_parts.len(), 1); + assert_eq!(seg.listed.checkpoint_parts[0].version, snapshot.version()); + + // Commits and compactions subsumed by the checkpoint are cleared + assert!(seg.listed.ascending_commit_files.is_empty()); + assert!(seg.listed.ascending_compaction_files.is_empty()); + + // ===== WHEN: we write checksum ===== + let (crc_result, snapshot_w_both) = snapshot_w_ckpt.write_checksum(engine.as_ref())?; + let seg = snapshot_w_both.log_segment(); + + // ===== THEN ===== + // CRC file is recorded at the correct version + assert_eq!(crc_result, ChecksumWriteResult::Written); + let crc_file = seg + .listed + .latest_crc_file + .as_ref() + .expect("snapshot should have latest_crc_file set"); + assert_eq!(crc_file.version, snapshot.version()); + + // The checkpoint is still present after the CRC write + assert_eq!(seg.checkpoint_version, Some(snapshot.version())); + + Ok(()) +} + +#[rstest] +#[case::v1_checkpoint(false)] +#[case::v2_checkpoint(true)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_already_exists(#[case] v2_checkpoint: bool) -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup_mt()?; + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "id", + DataType::INTEGER, + )])?); + let mut builder = create_table(&table_path, schema, "test_engine"); + if v2_checkpoint { + builder = builder.with_table_properties([("delta.feature.v2Checkpoint", "supported")]); + } + let committed = builder + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())? 
+ .unwrap_committed(); + let snapshot = committed.post_commit_snapshot().unwrap(); + + // First checkpoint writes successfully + let (result, snapshot_w_ckpt) = snapshot.checkpoint(engine.as_ref())?; + assert_eq!(result, CheckpointWriteResult::Written); + + // Calling checkpoint again on the returned snapshot detects the existing checkpoint + let (result, unchanged) = snapshot_w_ckpt.checkpoint(engine.as_ref())?; + assert_eq!(result, CheckpointWriteResult::AlreadyExists); + assert_eq!(unchanged.version(), snapshot_w_ckpt.version()); + + // A fresh snapshot loaded from storage also returns AlreadyExists + let fresh = Snapshot::builder_for(&table_path).build(engine.as_ref())?; + assert_eq!( + fresh.log_segment().checkpoint_version, + Some(snapshot.version()) + ); + let (result, _) = fresh.checkpoint(engine.as_ref())?; + assert_eq!(result, CheckpointWriteResult::AlreadyExists); + + Ok(()) +} diff --git a/kernel/tests/read.rs b/kernel/tests/read.rs index 0a8c474a3a..86f99e3c62 100644 --- a/kernel/tests/read.rs +++ b/kernel/tests/read.rs @@ -1,30 +1,32 @@ -use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; use delta_kernel::actions::deletion_vector::split_vector; -use delta_kernel::arrow::array::{AsArray as _, BooleanArray}; +use delta_kernel::arrow::array::{AsArray as _, RecordBatch, TimestampMicrosecondArray}; use delta_kernel::arrow::compute::{concat_batches, filter_record_batch}; -use delta_kernel::arrow::datatypes::{Int64Type, Schema as ArrowSchema}; -use delta_kernel::arrow::record_batch::RecordBatch; +use delta_kernel::arrow::datatypes::{ + Field as ArrowField, Int64Type, Schema as ArrowSchema, TimeUnit, +}; use delta_kernel::engine::arrow_conversion::TryFromKernel as _; -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::arrow_data::EngineDataArrowExt as _; +use delta_kernel::engine::default::DefaultEngineBuilder; use delta_kernel::expressions::{ - column_expr, column_pred, Expression as Expr, ExpressionRef, Predicate as Pred, + column_expr, column_pred, Expression as Expr, ExpressionRef, Predicate as Pred, Scalar, }; +use delta_kernel::log_segment::LogSegment; +use delta_kernel::object_store::{memory::InMemory, path::Path, ObjectStore}; use delta_kernel::parquet::file::properties::{EnabledStatistics, WriterProperties}; -use delta_kernel::scan::state::{transform_to_logical, DvInfo, Stats}; -use delta_kernel::scan::{Scan, ScanResult}; +use delta_kernel::path::ParsedLogPath; +use delta_kernel::scan::state::{transform_to_logical, ScanFile}; +use delta_kernel::scan::Scan; use delta_kernel::schema::{DataType, MetadataColumnSpec, Schema, StructField, StructType}; use delta_kernel::{Engine, FileMeta, Snapshot}; use itertools::Itertools; -use object_store::{memory::InMemory, path::Path, ObjectStore}; use test_utils::{ actions_to_string, add_commit, generate_batch, generate_simple_batch, into_record_batch, - load_test_data, read_scan, record_batch_to_bytes, record_batch_to_bytes_with_props, to_arrow, - IntoArray, TestAction, METADATA, + load_test_data, read_scan, record_batch_to_bytes, record_batch_to_bytes_with_props, IntoArray, + TestAction, METADATA, }; use url::Url; @@ -34,34 +36,35 @@ const PARQUET_FILE1: &str = "part-00000-a72b1fb3-f2df-41fe-a8f0-e65b746382dd-c00 const PARQUET_FILE2: &str = "part-00001-c506e79a-0bf8-4e2b-a42b-9731b2e490ae-c000.snappy.parquet"; const PARQUET_FILE3: &str = "part-00002-c506e79a-0bf8-4e2b-a42b-9731b2e490ff-c000.snappy.parquet"; -/// 
Helper function to extract filtered data from a scan result, respecting row masks -fn extract_record_batch( - scan_result: ScanResult, -) -> Result> { - let mask = scan_result.full_mask(); - let record_batch = into_record_batch(scan_result.raw_data?); - - if let Some(mask) = mask { - Ok(filter_record_batch( - &record_batch, - &BooleanArray::from(mask), - )?) - } else { - Ok(record_batch) - } +/// Convert all top-level fields in a RecordBatch to nullable, matching Delta table schema +/// conventions where the table metadata declares columns as nullable. +fn make_top_level_fields_nullable(batch: &RecordBatch) -> RecordBatch { + let schema = Arc::new(ArrowSchema::new( + batch + .schema() + .fields() + .iter() + .map(|f| ArrowField::new(f.name(), f.data_type().clone(), true)) + .collect::>(), + )); + RecordBatch::try_new(schema, batch.columns().to_vec()).unwrap() } #[tokio::test] async fn single_commit_two_add_files() -> Result<(), Box> { let batch = generate_simple_batch()?; let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + let parquet_bytes = record_batch_to_bytes(&batch); + let file_size = parquet_bytes.len() as u64; add_commit( + table_root, storage.as_ref(), 0, actions_to_string(vec![ TestAction::Metadata, - TestAction::Add(PARQUET_FILE1.to_string()), - TestAction::Add(PARQUET_FILE2.to_string()), + TestAction::AddWithSize(PARQUET_FILE1.to_string(), file_size), + TestAction::AddWithSize(PARQUET_FILE2.to_string(), file_size), ]), ) .await?; @@ -78,24 +81,20 @@ async fn single_commit_two_add_files() -> Result<(), Box> ) .await?; - let location = Url::parse("memory:///")?; - let engine = Arc::new(DefaultEngine::new( - storage.clone(), - Arc::new(TokioBackgroundExecutor::new()), - )); + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); - let expected_data = vec![batch.clone(), batch]; + let expected = make_top_level_fields_nullable(&batch); + let expected_data = vec![expected.clone(), expected]; - let snapshot = Snapshot::builder_for(location).build(engine.as_ref())?; + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; let scan = snapshot.scan_builder().build()?; let mut files = 0; let stream = scan.execute(engine)?.zip(expected_data); for (data, expected) in stream { - let raw_data = data?.raw_data?; files += 1; - assert_eq!(into_record_batch(raw_data), expected); + assert_eq!(into_record_batch(data?), expected); } assert_eq!(2, files, "Expected to have scanned two files"); Ok(()) @@ -105,19 +104,27 @@ async fn single_commit_two_add_files() -> Result<(), Box> async fn two_commits() -> Result<(), Box> { let batch = generate_simple_batch()?; let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + let parquet_bytes = record_batch_to_bytes(&batch); + let file_size = parquet_bytes.len() as u64; add_commit( + table_root, storage.as_ref(), 0, actions_to_string(vec![ TestAction::Metadata, - TestAction::Add(PARQUET_FILE1.to_string()), + TestAction::AddWithSize(PARQUET_FILE1.to_string(), file_size), ]), ) .await?; add_commit( + table_root, storage.as_ref(), 1, - actions_to_string(vec![TestAction::Add(PARQUET_FILE2.to_string())]), + actions_to_string(vec![TestAction::AddWithSize( + PARQUET_FILE2.to_string(), + file_size, + )]), ) .await?; storage @@ -133,21 +140,20 @@ async fn two_commits() -> Result<(), Box> { ) .await?; - let location = Url::parse("memory:///").unwrap(); - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = 
DefaultEngineBuilder::new(storage.clone()).build(); - let expected_data = vec![batch.clone(), batch]; + let expected = make_top_level_fields_nullable(&batch); + let expected_data = vec![expected.clone(), expected]; - let snapshot = Snapshot::builder_for(location).build(&engine)?; + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; let scan = snapshot.scan_builder().build()?; let mut files = 0; let stream = scan.execute(Arc::new(engine))?.zip(expected_data); for (data, expected) in stream { - let raw_data = data?.raw_data?; files += 1; - assert_eq!(into_record_batch(raw_data), expected); + assert_eq!(into_record_batch(data?), expected); } assert_eq!(2, files, "Expected to have scanned two files"); @@ -158,25 +164,37 @@ async fn two_commits() -> Result<(), Box> { async fn remove_action() -> Result<(), Box> { let batch = generate_simple_batch()?; let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + let parquet_bytes = record_batch_to_bytes(&batch); + let file_size = parquet_bytes.len() as u64; add_commit( + table_root, storage.as_ref(), 0, actions_to_string(vec![ TestAction::Metadata, - TestAction::Add(PARQUET_FILE1.to_string()), + TestAction::AddWithSize(PARQUET_FILE1.to_string(), file_size), ]), ) .await?; add_commit( + table_root, storage.as_ref(), 1, - actions_to_string(vec![TestAction::Add(PARQUET_FILE2.to_string())]), + actions_to_string(vec![TestAction::AddWithSize( + PARQUET_FILE2.to_string(), + file_size, + )]), ) .await?; add_commit( + table_root, storage.as_ref(), 2, - actions_to_string(vec![TestAction::Remove(PARQUET_FILE2.to_string())]), + actions_to_string(vec![TestAction::RemoveWithSize( + PARQUET_FILE2.to_string(), + file_size, + )]), ) .await?; storage @@ -186,21 +204,20 @@ async fn remove_action() -> Result<(), Box> { ) .await?; - let location = Url::parse("memory:///").unwrap(); - let engine = DefaultEngine::new(storage.clone(), Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(storage.clone()).build(); - let expected_data = vec![batch]; + let expected = make_top_level_fields_nullable(&batch); + let expected_data = vec![expected]; - let snapshot = Snapshot::builder_for(location).build(&engine)?; + let snapshot = Snapshot::builder_for(table_root).build(&engine)?; let scan = snapshot.scan_builder().build()?; let stream = scan.execute(Arc::new(engine))?.zip(expected_data); let mut files = 0; for (data, expected) in stream { - let raw_data = data?.raw_data?; files += 1; - assert_eq!(into_record_batch(raw_data), expected); + assert_eq!(into_record_batch(data?), expected); } assert_eq!(1, files, "Expected to have scanned one file"); Ok(()) @@ -215,31 +232,41 @@ async fn stats() -> Result<(), Box> { TestAction::Add(path) => format!(r#"{{"{action}":{{"path":"{path}","partitionValues":{{}},"size":262,"modificationTime":1587968586000,"dataChange":true, "stats":"{{\"numRecords\":2,\"nullCount\":{{\"id\":0}},\"minValues\":{{\"id\": 5}},\"maxValues\":{{\"id\":7}}}}"}}}}"#, action = "add", path = path), TestAction::Remove(path) => format!(r#"{{"{action}":{{"path":"{path}","partitionValues":{{}},"size":262,"modificationTime":1587968586000,"dataChange":true}}}}"#, action = "remove", path = path), TestAction::Metadata => METADATA.into(), + TestAction::AddWithSize(path, size) => format!(r#"{{"{action}":{{"path":"{path}","partitionValues":{{}},"size":{size},"modificationTime":1587968586000,"dataChange":true, "stats":"{{\"numRecords\":2,\"nullCount\":{{\"id\":0}},\"minValues\":{{\"id\": 
5}},\"maxValues\":{{\"id\":7}}}}"}}}}"#, action = "add", path = path), + TestAction::RemoveWithSize(path, size) => format!(r#"{{"{action}":{{"path":"{path}","partitionValues":{{}},"size":{size},"modificationTime":1587968586000,"dataChange":true}}}}"#, action = "remove", path = path), }) .fold(String::new(), |a, b| a + &b + "\n") } - let batch1 = generate_simple_batch()?; - let batch2 = generate_batch(vec![ + let batch1 = make_top_level_fields_nullable(&generate_simple_batch()?); + let batch2 = make_top_level_fields_nullable(&generate_batch(vec![ ("id", vec![5, 7].into_array()), ("val", vec!["e", "g"].into_array()), - ])?; + ])?); + let file_size1 = record_batch_to_bytes(&batch1).len() as u64; + let file_size2 = record_batch_to_bytes(&batch2).len() as u64; let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; // valid commit with min/max (0, 2) add_commit( + table_root, storage.as_ref(), 0, actions_to_string(vec![ TestAction::Metadata, - TestAction::Add(PARQUET_FILE1.to_string()), + TestAction::AddWithSize(PARQUET_FILE1.to_string(), file_size1), ]), ) .await?; // storage.add_commit(1, &format!("{}\n", r#"{{"add":{{"path":"doesnotexist","partitionValues":{{}},"size":262,"modificationTime":1587968586000,"dataChange":true, "stats":"{{\"numRecords\":2,\"nullCount\":{{\"id\":0}},\"minValues\":{{\"id\": 0}},\"maxValues\":{{\"id\":2}}}}"}}}}"#)); add_commit( + table_root, storage.as_ref(), 1, - generate_commit2(vec![TestAction::Add(PARQUET_FILE2.to_string())]), + generate_commit2(vec![TestAction::AddWithSize( + PARQUET_FILE2.to_string(), + file_size2, + )]), ) .await?; @@ -257,12 +284,8 @@ async fn stats() -> Result<(), Box> { ) .await?; - let location = Url::parse("memory:///").unwrap(); - let engine = Arc::new(DefaultEngine::new( - storage.clone(), - Arc::new(TokioBackgroundExecutor::new()), - )); - let snapshot = Snapshot::builder_for(location).build(engine.as_ref())?; + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; // The first file has id between 1 and 3; the second has id between 5 and 7. For each operator, // we validate the boundary values where we expect the set of matched files to change. 
@@ -315,9 +338,8 @@ async fn stats() -> Result<(), Box> { let stream = scan.execute(engine.clone())?.zip(expected_batches); for (batch, expected) in stream { - let raw_data = batch?.raw_data?; files_scanned += 1; - assert_eq!(into_record_batch(raw_data), expected.clone()); + assert_eq!(into_record_batch(batch?), expected.clone()); } assert_eq!(expected_files, files_scanned, "{predicate:?}"); } @@ -343,28 +365,8 @@ fn read_with_execute( Ok(()) } -struct ScanFile { - path: String, - size: i64, - dv_info: DvInfo, - transform: Option, -} - -fn scan_metadata_callback( - batches: &mut Vec, - path: &str, - size: i64, - _stats: Option, - dv_info: DvInfo, - transform: Option, - _: HashMap, -) { - batches.push(ScanFile { - path: path.to_string(), - size, - dv_info, - transform, - }); +fn scan_metadata_callback(batches: &mut Vec, scan_file: ScanFile) { + batches.push(scan_file); } fn read_with_scan_metadata( @@ -416,7 +418,7 @@ fn read_with_scan_metadata( scan_file.transform.clone(), ) .unwrap(); - let record_batch = to_arrow(logical).unwrap(); + let record_batch = logical.try_into_record_batch()?; let rest = split_vector(selection_vector.as_mut(), len, Some(true)); let batch = if let Some(mask) = selection_vector.clone() { // apply the selection vector @@ -447,11 +449,7 @@ fn read_table_data( let path = std::fs::canonicalize(PathBuf::from(path))?; let predicate = predicate.map(Arc::new); let url = url::Url::from_directory_path(path).unwrap(); - let engine = Arc::new(DefaultEngine::try_new( - &url, - std::iter::empty::<(&str, &str)>(), - Arc::new(TokioBackgroundExecutor::new()), - )?); + let engine = test_utils::create_default_engine(&url)?; let snapshot = Snapshot::builder_for(url.clone()).build(engine.as_ref())?; @@ -594,51 +592,48 @@ fn table_for_letters(letters: &[char]) -> Vec { res } -#[test] -fn predicate_on_number() -> Result<(), Box> { - let cases = vec![ - ( - column_expr!("number").lt(Expr::literal(4i64)), - table_for_numbers(vec![1, 2, 3]), - ), - ( - column_expr!("number").le(Expr::literal(4i64)), - table_for_numbers(vec![1, 2, 3, 4]), - ), - ( - column_expr!("number").gt(Expr::literal(4i64)), - table_for_numbers(vec![5, 6]), - ), - ( - column_expr!("number").ge(Expr::literal(4i64)), - table_for_numbers(vec![4, 5, 6]), - ), - ( - column_expr!("number").eq(Expr::literal(4i64)), - table_for_numbers(vec![4]), - ), - ( - column_expr!("number").ne(Expr::literal(4i64)), - table_for_numbers(vec![1, 2, 3, 5, 6]), - ), - ]; - - for (pred, expected) in cases.into_iter() { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["a_float", "number"]), - Some(pred), - expected, - )?; - } +#[rstest::rstest] +#[case::less_than( + column_expr!("number").lt(Expr::literal(4i64)), + table_for_numbers(vec![1, 2, 3]) +)] +#[case::less_than_or_equal( + column_expr!("number").le(Expr::literal(4i64)), + table_for_numbers(vec![1, 2, 3, 4]) +)] +#[case::greater_than( + column_expr!("number").gt(Expr::literal(4i64)), + table_for_numbers(vec![5, 6]) +)] +#[case::greater_than_or_equal( + column_expr!("number").ge(Expr::literal(4i64)), + table_for_numbers(vec![4, 5, 6]) +)] +#[case::equal( + column_expr!("number").eq(Expr::literal(4i64)), + table_for_numbers(vec![4]) +)] +#[case::not_equal( + column_expr!("number").ne(Expr::literal(4i64)), + table_for_numbers(vec![1, 2, 3, 5, 6]) +)] +fn predicate_on_number( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + read_table_data( + "./tests/data/basic_partitioned", + Some(&["a_float", "number"]), + Some(pred), + expected, + )?; Ok(()) } 
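// Illustrative sketch (not part of this diff): the predicate tests in this file are
// rewritten from a `cases` vector plus a loop into rstest parameterized cases, where
// each `#[case::name(...)]` expands into its own named test so a failure points at the
// exact predicate rather than an index. A minimal self-contained example of the same
// pattern, with hypothetical names:
use rstest::rstest;

#[rstest]
#[case::small(2, 4)]
#[case::large(10, 20)]
fn doubles(#[case] input: i64, #[case] expected: i64) {
    assert_eq!(input * 2, expected);
}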
-#[test] -fn predicate_on_letter() -> Result<(), Box> { - // Test basic column pruning. Note that the actual predicate machinery is already well-tested, - // so we're just testing wiring here. - let null_row_table: Vec = vec![ +#[rstest::rstest] +#[case::is_null( + column_expr!("letter").is_null(), + vec![ "+--------+--------+", "| letter | number |", "+--------+--------+", @@ -647,61 +642,64 @@ fn predicate_on_letter() -> Result<(), Box> { ] .into_iter() .map(String::from) - .collect(); - - let cases = vec![ - (column_expr!("letter").is_null(), null_row_table), - ( - column_expr!("letter").is_not_null(), - table_for_letters(&['a', 'b', 'c', 'e']), - ), - ( - column_expr!("letter").lt(Expr::literal("c")), - table_for_letters(&['a', 'b']), - ), - ( - column_expr!("letter").le(Expr::literal("c")), - table_for_letters(&['a', 'b', 'c']), - ), - ( - column_expr!("letter").gt(Expr::literal("c")), - table_for_letters(&['e']), - ), - ( - column_expr!("letter").ge(Expr::literal("c")), - table_for_letters(&['c', 'e']), - ), - ( - column_expr!("letter").eq(Expr::literal("c")), - table_for_letters(&['c']), - ), - ( - column_expr!("letter").ne(Expr::literal("c")), - table_for_letters(&['a', 'b', 'e']), - ), - ]; - - for (pred, expected) in cases { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["letter", "number"]), - Some(pred), - expected, - )?; - } + .collect() +)] +#[case::is_not_null( + column_expr!("letter").is_not_null(), + table_for_letters(&['a', 'b', 'c', 'e']) +)] +#[case::less_than( + column_expr!("letter").lt(Expr::literal("c")), + table_for_letters(&['a', 'b']) +)] +#[case::less_than_or_equal( + column_expr!("letter").le(Expr::literal("c")), + table_for_letters(&['a', 'b', 'c']) +)] +#[case::greater_than( + column_expr!("letter").gt(Expr::literal("c")), + table_for_letters(&['e']) +)] +#[case::greater_than_or_equal( + column_expr!("letter").ge(Expr::literal("c")), + table_for_letters(&['c', 'e']) +)] +#[case::equal( + column_expr!("letter").eq(Expr::literal("c")), + table_for_letters(&['c']) +)] +#[case::not_equal( + column_expr!("letter").ne(Expr::literal("c")), + table_for_letters(&['a', 'b', 'e']) +)] +fn predicate_on_letter( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + // Test basic column pruning. Note that the actual predicate machinery is already well-tested, + // so we're just testing wiring here. + read_table_data( + "./tests/data/basic_partitioned", + Some(&["letter", "number"]), + Some(pred), + expected, + )?; Ok(()) } -#[test] -fn predicate_on_letter_and_number() -> Result<(), Box> { - // Partition skipping and file skipping are currently implemented separately. Mixing them in an - // AND clause will evaulate each separately, but mixing them in an OR clause disables both. - let full_table: Vec = vec![ +#[rstest::rstest] +#[case::or_with_pruning( + Pred::or( + column_expr!("letter").gt(Expr::literal("a")), + column_expr!("number").gt(Expr::literal(3i64)), + ), + // Unified data skipping evaluates partition + data predicates in a single pass. 
+ // File a/1 (letter='a', max(number)=1): OR('a'>'a', 1>3) = FALSE -> pruned + vec![ "+--------+--------+", "| letter | number |", "+--------+--------+", "| | 6 |", - "| a | 1 |", "| a | 4 |", "| b | 2 |", "| c | 3 |", @@ -710,84 +708,365 @@ fn predicate_on_letter_and_number() -> Result<(), Box> { ] .into_iter() .map(String::from) - .collect(); - - let cases = vec![ - ( - Pred::or( - // No pruning power - column_expr!("letter").gt(Expr::literal("a")), - column_expr!("number").gt(Expr::literal(3i64)), - ), - full_table, - ), - ( - Pred::and( - column_expr!("letter").gt(Expr::literal("a")), // numbers 2, 3, 5 - column_expr!("number").gt(Expr::literal(3i64)), // letters a, e - ), - table_for_letters(&['e']), - ), - ( - Pred::and( - column_expr!("letter").gt(Expr::literal("a")), // numbers 2, 3, 5 - Pred::or( - // No pruning power - column_expr!("letter").eq(Expr::literal("c")), - column_expr!("number").eq(Expr::literal(3i64)), - ), - ), - table_for_letters(&['b', 'c', 'e']), + .collect() +)] +#[case::and_with_pruning( + Pred::and( + column_expr!("letter").gt(Expr::literal("a")), // numbers 2, 3, 5 + column_expr!("number").gt(Expr::literal(3i64)), // letters a, e + ), + table_for_letters(&['e']) +)] +#[case::and_with_nested_or( + Pred::and( + column_expr!("letter").gt(Expr::literal("a")), // numbers 2, 3, 5 + Pred::or( + column_expr!("letter").eq(Expr::literal("c")), + column_expr!("number").eq(Expr::literal(3i64)), ), - ]; + ), + // Unified data skipping evaluates the full expression: + // b/2: AND(TRUE, OR(FALSE, FALSE)) = FALSE -> pruned + // c/3: AND(TRUE, OR(TRUE, TRUE)) = TRUE -> kept + // e/5: AND(TRUE, OR(FALSE, FALSE)) = FALSE -> pruned + table_for_letters(&['c']) +)] +fn predicate_on_letter_and_number( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + // Unified data skipping evaluates partition + data predicates together in a single + // columnar pass, enabling pruning for mixed predicates including OR expressions. + read_table_data( + "./tests/data/basic_partitioned", + Some(&["letter", "number"]), + Some(pred), + expected, + )?; + Ok(()) +} - for (pred, expected) in cases { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["letter", "number"]), - Some(pred), - expected, - )?; - } +/// Test partition pruning on a table with a checkpoint containing `partitionValues_parsed`. +/// This exercises the checkpoint code path where typed partition values are read directly +/// from the parquet column rather than parsed from the string map via `MapToStruct`. 
+/// +/// Table: app-txn-checkpoint (checkpoint at v1, partition column: `modified` (string)) +/// - 2 files with modified=2021-02-01 (value 4-11, 8 rows each) +/// - 2 files with modified=2021-02-02 (value 1-3, 3 rows each) +#[rstest::rstest] +#[case::partition_only_prunes_one_partition( + // Partition-only predicate: modified = '2021-02-02' should prune 2021-02-01 files + column_expr!("modified").eq(Expr::literal("2021-02-02")), + vec![ + "+----+------------+-------+", + "| id | modified | value |", + "+----+------------+-------+", + "| A | 2021-02-02 | 1 |", + "| A | 2021-02-02 | 1 |", + "| A | 2021-02-02 | 3 |", + "| A | 2021-02-02 | 3 |", + "| B | 2021-02-02 | 2 |", + "| B | 2021-02-02 | 2 |", + "+----+------------+-------+", + ] + .into_iter() + .map(String::from) + .collect() +)] +#[case::partition_prunes_other_partition( + // modified = '2021-02-01' should prune 2021-02-02 files, keeping all 2021-02-01 rows + column_expr!("modified").eq(Expr::literal("2021-02-01")), + vec![ + "+----+------------+-------+", + "| id | modified | value |", + "+----+------------+-------+", + "| A | 2021-02-01 | 10 |", + "| A | 2021-02-01 | 10 |", + "| A | 2021-02-01 | 11 |", + "| A | 2021-02-01 | 11 |", + "| A | 2021-02-01 | 5 |", + "| A | 2021-02-01 | 5 |", + "| A | 2021-02-01 | 6 |", + "| A | 2021-02-01 | 6 |", + "| A | 2021-02-01 | 7 |", + "| A | 2021-02-01 | 7 |", + "| B | 2021-02-01 | 4 |", + "| B | 2021-02-01 | 4 |", + "| B | 2021-02-01 | 8 |", + "| B | 2021-02-01 | 8 |", + "| B | 2021-02-01 | 9 |", + "| B | 2021-02-01 | 9 |", + "+----+------------+-------+", + ] + .into_iter() + .map(String::from) + .collect() +)] +fn partition_pruning_with_checkpoint_parsed_values( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + read_table_data( + "./tests/data/app-txn-checkpoint", + Some(&["id", "modified", "value"]), + Some(pred), + expected, + )?; Ok(()) } -#[test] -fn predicate_on_number_not() -> Result<(), Box> { - let cases = vec![ - ( - Pred::not(column_expr!("number").lt(Expr::literal(4i64))), - table_for_numbers(vec![4, 5, 6]), - ), - ( - Pred::not(column_expr!("number").le(Expr::literal(4i64))), - table_for_numbers(vec![5, 6]), - ), - ( - Pred::not(column_expr!("number").gt(Expr::literal(4i64))), - table_for_numbers(vec![1, 2, 3, 4]), - ), - ( - Pred::not(column_expr!("number").ge(Expr::literal(4i64))), - table_for_numbers(vec![1, 2, 3]), +/// Test mixed predicates (partition + data stats) on a checkpoint with both `partitionValues_parsed` +/// and `stats_parsed`. This exercises the unified columnar data skipping pass that evaluates +/// both partition values and data column statistics together. +/// +/// Table: app-txn-checkpoint (checkpoint at v1, partition column: `modified` (string)) +/// - 2 files: modified=2021-02-02 -- 3 rows each, value in [1, 3] +/// - 2 files: modified=2021-02-01 -- 8 rows each, value in [4, 11] +#[rstest::rstest] +#[case::and_keeps_partition_matched_files( + // Data skipping keeps 2021-02-01 files (partition matches, max(value)=11 > 9) and + // prunes 2021-02-02 files (partition mismatch). All rows from kept files are returned + // since kernel does not apply row-level predicate filtering. 
+ Pred::and( + column_expr!("modified").eq(Expr::literal("2021-02-01")), + column_expr!("value").gt(Expr::literal(9i32)), + ), + vec![ + "+----+------------+-------+", + "| id | modified | value |", + "+----+------------+-------+", + "| A | 2021-02-01 | 10 |", + "| A | 2021-02-01 | 10 |", + "| A | 2021-02-01 | 11 |", + "| A | 2021-02-01 | 11 |", + "| A | 2021-02-01 | 5 |", + "| A | 2021-02-01 | 5 |", + "| A | 2021-02-01 | 6 |", + "| A | 2021-02-01 | 6 |", + "| A | 2021-02-01 | 7 |", + "| A | 2021-02-01 | 7 |", + "| B | 2021-02-01 | 4 |", + "| B | 2021-02-01 | 4 |", + "| B | 2021-02-01 | 8 |", + "| B | 2021-02-01 | 8 |", + "| B | 2021-02-01 | 9 |", + "| B | 2021-02-01 | 9 |", + "+----+------------+-------+", + ] + .into_iter() + .map(String::from) + .collect() +)] +#[case::and_prunes_all_files( + // 2021-02-02: partition matches but data stats fail (max value=3, NOT > 3). + // 2021-02-01: partition mismatch. All 4 files pruned. + Pred::and( + column_expr!("modified").eq(Expr::literal("2021-02-02")), + column_expr!("value").gt(Expr::literal(3i32)), + ), + vec![] +)] +#[case::or_prunes_by_both_partition_and_stats( + // 2021-02-01 pruned: partition mismatch AND max(value)=11 NOT > 11. + // 2021-02-02 kept by partition match. Only 2021-02-02 rows returned. + Pred::or( + column_expr!("modified").eq(Expr::literal("2021-02-02")), + column_expr!("value").gt(Expr::literal(11i32)), + ), + vec![ + "+----+------------+-------+", + "| id | modified | value |", + "+----+------------+-------+", + "| A | 2021-02-02 | 1 |", + "| A | 2021-02-02 | 1 |", + "| A | 2021-02-02 | 3 |", + "| A | 2021-02-02 | 3 |", + "| B | 2021-02-02 | 2 |", + "| B | 2021-02-02 | 2 |", + "+----+------------+-------+", + ] + .into_iter() + .map(String::from) + .collect() +)] +#[case::or_keeps_all_files( + // 2021-02-02 kept by partition match, 2021-02-01 kept by data stats (max=11 > 9). + // All rows from all 4 files are returned. + Pred::or( + column_expr!("modified").eq(Expr::literal("2021-02-02")), + column_expr!("value").gt(Expr::literal(9i32)), + ), + vec![ + "+----+------------+-------+", + "| id | modified | value |", + "+----+------------+-------+", + "| A | 2021-02-01 | 10 |", + "| A | 2021-02-01 | 10 |", + "| A | 2021-02-01 | 11 |", + "| A | 2021-02-01 | 11 |", + "| A | 2021-02-01 | 5 |", + "| A | 2021-02-01 | 5 |", + "| A | 2021-02-01 | 6 |", + "| A | 2021-02-01 | 6 |", + "| A | 2021-02-01 | 7 |", + "| A | 2021-02-01 | 7 |", + "| A | 2021-02-02 | 1 |", + "| A | 2021-02-02 | 1 |", + "| A | 2021-02-02 | 3 |", + "| A | 2021-02-02 | 3 |", + "| B | 2021-02-01 | 4 |", + "| B | 2021-02-01 | 4 |", + "| B | 2021-02-01 | 8 |", + "| B | 2021-02-01 | 8 |", + "| B | 2021-02-01 | 9 |", + "| B | 2021-02-01 | 9 |", + "| B | 2021-02-02 | 2 |", + "| B | 2021-02-02 | 2 |", + "+----+------------+-------+", + ] + .into_iter() + .map(String::from) + .collect() +)] +fn mixed_predicate_with_checkpoint_parsed_columns( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + // Exercises the unified data skipping path that reads both `partitionValues_parsed` and + // `stats_parsed` from the checkpoint parquet file in a single columnar pass. + read_table_data( + "./tests/data/app-txn-checkpoint", + Some(&["id", "modified", "value"]), + Some(pred), + expected, + )?; + Ok(()) +} + +/// Test partition pruning on a table with column mapping (name mode). The logical partition +/// column "category" has physical name "phys_category". 
With column mapping, `partitionValues` +/// in the log uses physical column names, and the partition schema + predicate must also use +/// physical names for `MapToStruct` extraction and data skipping to work correctly. +#[rstest::rstest] +#[case::partition_only( + // Partition-only predicate: category = 'A' prunes the category=B file + Arc::new(Pred::eq(column_expr!("category"), Expr::literal("A"))), + 1 +)] +#[case::mixed_partition_and_data( + // Mixed predicate: category = 'A' OR val > 'z'. Category=A kept by partition match. + // Category=B: partition mismatch, but max(val)='z' NOT > 'z', so data skipping prunes it. + Arc::new(Pred::or( + Pred::eq(column_expr!("category"), Expr::literal("A")), + Pred::gt(column_expr!("val"), Expr::literal("z")), + )), + 1 +)] +#[tokio::test] +async fn partition_pruning_with_column_mapping( + #[case] predicate: Arc, + #[case] expected_files: usize, +) -> Result<(), Box> { + let batch = generate_batch(vec![("phys_val", vec!["x", "y", "z"].into_array())])?; + + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + + // Column mapping name mode: logical "category" -> physical "phys_category", + // logical "val" -> physical "phys_val" + let schema_str = r#"{"type":"struct","fields":[{"name":"category","type":"string","nullable":true,"metadata":{"delta.columnMapping.id":1,"delta.columnMapping.physicalName":"phys_category"}},{"name":"val","type":"string","nullable":true,"metadata":{"delta.columnMapping.id":2,"delta.columnMapping.physicalName":"phys_val"}}]}"#; + + let actions = [ + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["columnMapping"],"writerFeatures":["columnMapping"]}}"#.to_string(), + r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","isBlindAppend":true}}"#.to_string(), + format!( + r#"{{"metaData":{{"id":"test-cm","format":{{"provider":"parquet","options":{{}}}},"schemaString":"{schema}","partitionColumns":["category"],"configuration":{{"delta.columnMapping.mode":"name","delta.columnMapping.maxColumnId":"2"}},"createdTime":1587968585495}}}}"#, + schema = schema_str.replace('"', r#"\""#), ), - ( - Pred::not(column_expr!("number").eq(Expr::literal(4i64))), - table_for_numbers(vec![1, 2, 3, 5, 6]), + // partitionValues uses physical column name when column mapping is enabled + format!( + r#"{{"add":{{"path":"phys_category=A/{PARQUET_FILE1}","partitionValues":{{"phys_category":"A"}},"size":0,"modificationTime":1587968586000,"dataChange":true,"stats":"{{\"numRecords\":3,\"nullCount\":{{\"phys_val\":0}},\"minValues\":{{\"phys_val\":\"x\"}},\"maxValues\":{{\"phys_val\":\"z\"}}}}" }}}}"# ), - ( - Pred::not(column_expr!("number").ne(Expr::literal(4i64))), - table_for_numbers(vec![4]), + format!( + r#"{{"add":{{"path":"phys_category=B/{PARQUET_FILE2}","partitionValues":{{"phys_category":"B"}},"size":0,"modificationTime":1587968586000,"dataChange":true,"stats":"{{\"numRecords\":3,\"nullCount\":{{\"phys_val\":0}},\"minValues\":{{\"phys_val\":\"x\"}},\"maxValues\":{{\"phys_val\":\"z\"}}}}" }}}}"# ), ]; - for (pred, expected) in cases.into_iter() { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["a_float", "number"]), - Some(pred), - expected, - )?; + + add_commit(table_root, storage.as_ref(), 0, actions.iter().join("\n")).await?; + storage + .put( + &Path::from("phys_category=A").child(PARQUET_FILE1), + record_batch_to_bytes(&batch).into(), + ) + .await?; + storage + .put( + &Path::from("phys_category=B").child(PARQUET_FILE2), + record_batch_to_bytes(&batch).into(), + ) + 
.await?; + + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + + // Predicates use logical column names -- kernel must map to physical names + let scan = snapshot.scan_builder().with_predicate(predicate).build()?; + + let stream = scan.execute(engine)?; + let mut files_scanned = 0; + for engine_data in stream { + let result_batch = into_record_batch(engine_data?); + // The "category" partition column should be filled with "A" + let category_idx = result_batch.schema().index_of("category")?; + let category_col = result_batch.column(category_idx).as_string::(); + for i in 0..result_batch.num_rows() { + assert_eq!(category_col.value(i), "A"); + } + files_scanned += 1; } + assert_eq!( + expected_files, files_scanned, + "Expected partition pruning to return {expected_files} file(s)" + ); + + Ok(()) +} + +#[rstest::rstest] +#[case::not_less_than( + Pred::not(column_expr!("number").lt(Expr::literal(4i64))), + table_for_numbers(vec![4, 5, 6]) +)] +#[case::not_less_than_or_equal( + Pred::not(column_expr!("number").le(Expr::literal(4i64))), + table_for_numbers(vec![5, 6]) +)] +#[case::not_greater_than( + Pred::not(column_expr!("number").gt(Expr::literal(4i64))), + table_for_numbers(vec![1, 2, 3, 4]) +)] +#[case::not_greater_than_or_equal( + Pred::not(column_expr!("number").ge(Expr::literal(4i64))), + table_for_numbers(vec![1, 2, 3]) +)] +#[case::not_equal( + Pred::not(column_expr!("number").eq(Expr::literal(4i64))), + table_for_numbers(vec![1, 2, 3, 5, 6]) +)] +#[case::not_not_equal( + Pred::not(column_expr!("number").ne(Expr::literal(4i64))), + table_for_numbers(vec![4]) +)] +fn predicate_on_number_not( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + read_table_data( + "./tests/data/basic_partitioned", + Some(&["a_float", "number"]), + Some(pred), + expected, + )?; Ok(()) } @@ -879,142 +1158,138 @@ fn mixed_not_null() -> Result<(), Box> { Ok(()) } -#[test] -fn and_or_predicates() -> Result<(), Box> { - let cases = vec![ - ( - Pred::and( - column_expr!("number").gt(Expr::literal(4i64)), - column_expr!("a_float").gt(Expr::literal(5.5)), - ), - table_for_numbers(vec![6]), - ), - ( - Pred::and( - column_expr!("number").gt(Expr::literal(4i64)), - Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), - ), - table_for_numbers(vec![5]), - ), - ( - Pred::or( - column_expr!("number").gt(Expr::literal(4i64)), - column_expr!("a_float").gt(Expr::literal(5.5)), - ), - table_for_numbers(vec![5, 6]), - ), - ( - Pred::or( - column_expr!("number").gt(Expr::literal(4i64)), - Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), - ), - table_for_numbers(vec![1, 2, 3, 4, 5, 6]), - ), - ]; - for (pred, expected) in cases.into_iter() { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["a_float", "number"]), - Some(pred), - expected, - )?; - } +#[rstest::rstest] +#[case::and_both_conditions( + Pred::and( + column_expr!("number").gt(Expr::literal(4i64)), + column_expr!("a_float").gt(Expr::literal(5.5)), + ), + table_for_numbers(vec![6]) +)] +#[case::and_with_negation( + Pred::and( + column_expr!("number").gt(Expr::literal(4i64)), + Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), + ), + table_for_numbers(vec![5]) +)] +#[case::or_either_condition( + Pred::or( + column_expr!("number").gt(Expr::literal(4i64)), + column_expr!("a_float").gt(Expr::literal(5.5)), + ), + table_for_numbers(vec![5, 6]) +)] +#[case::or_with_negation( + Pred::or( + 
column_expr!("number").gt(Expr::literal(4i64)), + Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), + ), + table_for_numbers(vec![1, 2, 3, 4, 5, 6]) +)] +fn and_or_predicates( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + read_table_data( + "./tests/data/basic_partitioned", + Some(&["a_float", "number"]), + Some(pred), + expected, + )?; Ok(()) } -#[test] -fn not_and_or_predicates() -> Result<(), Box> { - let cases = vec![ - ( - Pred::not(Pred::and( - column_expr!("number").gt(Expr::literal(4i64)), - column_expr!("a_float").gt(Expr::literal(5.5)), - )), - table_for_numbers(vec![1, 2, 3, 4, 5]), - ), - ( - Pred::not(Pred::and( - column_expr!("number").gt(Expr::literal(4i64)), - Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), - )), - table_for_numbers(vec![1, 2, 3, 4, 6]), - ), - ( - Pred::not(Pred::or( - column_expr!("number").gt(Expr::literal(4i64)), - column_expr!("a_float").gt(Expr::literal(5.5)), - )), - table_for_numbers(vec![1, 2, 3, 4]), - ), - ( - Pred::not(Pred::or( - column_expr!("number").gt(Expr::literal(4i64)), - Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), - )), - vec![], - ), - ]; - for (pred, expected) in cases.into_iter() { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["a_float", "number"]), - Some(pred), - expected, - )?; - } +#[rstest::rstest] +#[case::not_and_both_conditions( + Pred::not(Pred::and( + column_expr!("number").gt(Expr::literal(4i64)), + column_expr!("a_float").gt(Expr::literal(5.5)), + )), + table_for_numbers(vec![1, 2, 3, 4, 5]) +)] +#[case::not_and_with_negation( + Pred::not(Pred::and( + column_expr!("number").gt(Expr::literal(4i64)), + Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), + )), + table_for_numbers(vec![1, 2, 3, 4, 6]) +)] +#[case::not_or_either_condition( + Pred::not(Pred::or( + column_expr!("number").gt(Expr::literal(4i64)), + column_expr!("a_float").gt(Expr::literal(5.5)), + )), + table_for_numbers(vec![1, 2, 3, 4]) +)] +#[case::not_or_with_negation( + Pred::not(Pred::or( + column_expr!("number").gt(Expr::literal(4i64)), + Pred::not(column_expr!("a_float").gt(Expr::literal(5.5))), + )), + vec![] +)] +fn not_and_or_predicates( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + read_table_data( + "./tests/data/basic_partitioned", + Some(&["a_float", "number"]), + Some(pred), + expected, + )?; Ok(()) } -#[test] -fn invalid_skips_none_predicates() -> Result<(), Box> { - let empty_struct = Expr::struct_from(Vec::::new()); - let cases = vec![ - (Pred::literal(false), table_for_numbers(vec![])), - ( - Pred::and(column_pred!("number"), Pred::literal(false)), - table_for_numbers(vec![]), - ), - ( - Pred::literal(true), - table_for_numbers(vec![1, 2, 3, 4, 5, 6]), - ), - ( - Pred::from_expr(Expr::literal(3i64)), - table_for_numbers(vec![1, 2, 3, 4, 5, 6]), - ), - ( - column_expr!("number").distinct(Expr::literal(3i64)), - table_for_numbers(vec![1, 2, 4, 5, 6]), - ), - ( - column_expr!("number").distinct(Expr::null_literal(DataType::LONG)), - table_for_numbers(vec![1, 2, 3, 4, 5, 6]), - ), - ( - Pred::not(column_expr!("number").distinct(Expr::literal(3i64))), - table_for_numbers(vec![3]), - ), - ( - Pred::not(column_expr!("number").distinct(Expr::null_literal(DataType::LONG))), - table_for_numbers(vec![]), - ), - ( - column_expr!("number").gt(empty_struct.clone()), - table_for_numbers(vec![1, 2, 3, 4, 5, 6]), - ), - ( - Pred::not(column_expr!("number").gt(empty_struct.clone())), - table_for_numbers(vec![1, 2, 3, 4, 5, 6]), - ), - ]; - for 
(pred, expected) in cases.into_iter() { - read_table_data( - "./tests/data/basic_partitioned", - Some(&["a_float", "number"]), - Some(pred), - expected, - )?; - } +#[rstest::rstest] +#[case::literal_false(Pred::literal(false), table_for_numbers(vec![]))] +#[case::and_with_literal_false( + Pred::and(column_pred!("number"), Pred::literal(false)), + table_for_numbers(vec![]) +)] +#[case::literal_true( + Pred::literal(true), + table_for_numbers(vec![1, 2, 3, 4, 5, 6]) +)] +#[case::from_literal_expr( + Pred::from_expr(Expr::literal(3i64)), + table_for_numbers(vec![1, 2, 3, 4, 5, 6]) +)] +#[case::distinct_value( + column_expr!("number").distinct(Expr::literal(3i64)), + table_for_numbers(vec![1, 2, 4, 5, 6]) +)] +#[case::distinct_null( + column_expr!("number").distinct(Expr::null_literal(DataType::LONG)), + table_for_numbers(vec![1, 2, 3, 4, 5, 6]) +)] +#[case::not_distinct_value( + Pred::not(column_expr!("number").distinct(Expr::literal(3i64))), + table_for_numbers(vec![3]) +)] +#[case::not_distinct_null( + Pred::not(column_expr!("number").distinct(Expr::null_literal(DataType::LONG))), + table_for_numbers(vec![]) +)] +#[case::gt_empty_struct( + column_expr!("number").gt(Expr::struct_from(Vec::::new())), + table_for_numbers(vec![1, 2, 3, 4, 5, 6]) +)] +#[case::not_gt_empty_struct( + Pred::not(column_expr!("number").gt(Expr::struct_from(Vec::::new()))), + table_for_numbers(vec![1, 2, 3, 4, 5, 6]) +)] +fn invalid_skips_none_predicates( + #[case] pred: Pred, + #[case] expected: Vec, +) -> Result<(), Box> { + read_table_data( + "./tests/data/basic_partitioned", + Some(&["a_float", "number"]), + Some(pred), + expected, + )?; Ok(()) } @@ -1049,6 +1324,7 @@ async fn predicate_on_non_nullable_partition_column() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { + use delta_kernel::arrow::array::{Array, StringArray}; + + // Set up an in-memory table with multiple data files + let batch1 = generate_batch(vec![ + ("id", vec![1i32, 2, 3].into_array()), + ("value", vec!["a", "b", "c"].into_array()), + ])?; + let batch2 = generate_batch(vec![ + ("id", vec![10i32, 20].into_array()), + ("value", vec!["x", "y"].into_array()), + ])?; + + let file_size1 = record_batch_to_bytes(&batch1).len() as u64; + let file_size2 = record_batch_to_bytes(&batch2).len() as u64; + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + add_commit( + table_root, + storage.as_ref(), + 0, + actions_to_string(vec![ + TestAction::Metadata, + TestAction::AddWithSize(PARQUET_FILE1.to_string(), file_size1), + TestAction::AddWithSize(PARQUET_FILE2.to_string(), file_size2), + ]), + ) + .await?; + + for (parquet_file, batch) in [(PARQUET_FILE1, &batch1), (PARQUET_FILE2, &batch2)] { + storage + .put( + &Path::from(parquet_file), + record_batch_to_bytes(batch).into(), + ) + .await?; + } + + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); + + // Create a schema that includes the file path metadata column + let schema = Arc::new(StructType::try_new([ + StructField::nullable("id", DataType::INTEGER), + StructField::create_metadata_column("_file", MetadataColumnSpec::FilePath), + StructField::nullable("value", DataType::STRING), + ])?); + + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + let scan = snapshot.scan_builder().with_schema(schema).build()?; + + let mut file_count = 0; + let expected_files = [PARQUET_FILE1, 
PARQUET_FILE2]; + let expected_row_counts = [3, 2]; + let stream = scan.execute(engine.clone())?; + + for data in stream { + let batch = into_record_batch(data?); + + // Verify the schema structure + assert_eq!(batch.num_columns(), 3, "Expected 3 columns in the batch"); + assert_eq!( + batch.schema().field(0).name(), + "id", + "First column should be 'id'" + ); + assert_eq!( + batch.schema().field(1).name(), + "_file", + "Second column should be '_file'" + ); + assert_eq!( + batch.schema().field(2).name(), + "value", + "Third column should be 'value'" + ); + + // Verify the file path column contains the expected file name + let file_path_array = batch.column(1); + let expected_file_name = expected_files[file_count]; + let expected_path = format!("{table_root}{expected_file_name}"); + + // The file path array should be a plain StringArray with the path repeated for each row. + let string_array = file_path_array + .as_any() + .downcast_ref::() + .expect("File path column should be a StringArray"); + + assert_eq!( + string_array.len(), + expected_row_counts[file_count], + "File {} should have {} rows", + expected_file_name, + expected_row_counts[file_count] + ); + assert!( + string_array + .iter() + .all(|v| v == Some(expected_path.as_str())), + "All rows should contain file path '{expected_path}'" + ); + + file_count += 1; + } + + assert_eq!(file_count, 2, "Expected to scan 2 files"); + Ok(()) +} + #[tokio::test] async fn test_unsupported_metadata_columns() -> Result<(), Box> { // Prepare an in-memory table with some data let batch = generate_simple_batch()?; let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; add_commit( + table_root, storage.as_ref(), 0, actions_to_string(vec![ @@ -1492,11 +1871,7 @@ async fn test_unsupported_metadata_columns() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { + let batch = generate_simple_batch()?; + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + add_commit( + table_root, + storage.as_ref(), + 0, + actions_to_string(vec![ + TestAction::Metadata, + TestAction::Add(PARQUET_FILE1.to_string()), + TestAction::Add(PARQUET_FILE2.to_string()), + ]), + ) + .await?; + storage + .put( + &Path::from(PARQUET_FILE1), + record_batch_to_bytes(&batch).into(), + ) + .await?; + storage + .put( + &Path::from(PARQUET_FILE2), + record_batch_to_bytes(&batch).into(), + ) + .await?; + + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); + + let invalid_files = [ + "_delta_log/0.zip", + "_delta_log/_copy_into_log/0.zip", + "_delta_log/_ignore_me/00000000000000000000.json", + "_delta_log/_and_me/00000000000000000000.checkpoint.parquet", + "_delta_log/02184.json", + "_delta_log/0x000000000000000000.checkpoint.parquet", + "00000000000000000000.json", + "_delta_log/_staged_commits/_staged_commits/00000000000000000000.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json", + "_delta_log/my_random_dir/_staged_commits/00000000000000000000.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json", + "_delta_log/my_random_dir/_delta_log/_staged_commits/00000000000000000000.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json", + "_delta_log/_delta_log/00000000000000000000.json", + "_delta_log/_delta_log/00000000000000000000.checkpoint.parquet", + "_delta_log/something/_delta_log/00000000000000000000.crc", + "_delta_log/something/_delta_log/00000000000000000000.json", + "_delta_log/something/_delta_log/00000000000000000000.checkpoint.parquet", + ]; + + fn get_file_path_for_test(path: &ParsedLogPath) -> &str { + 
&path.location.location.as_str()[10..] + } + + fn ensure_segment_does_not_contain(invalid_files: &[&str], segment: &LogSegment) { + assert!( + !segment.listed.ascending_commit_files.iter().any(|p| { + let test_path = get_file_path_for_test(p); + invalid_files.contains(&test_path) + }), + "ascending_commit_files contained invalid file" + ); + assert!( + !segment.listed.ascending_compaction_files.iter().any(|p| { + let test_path = get_file_path_for_test(p); + invalid_files.contains(&test_path) + }), + "ascending_compaction_files contained invalid file" + ); + assert!( + !segment.listed.checkpoint_parts.iter().any(|p| { + let test_path = get_file_path_for_test(p); + invalid_files.contains(&test_path) + }), + "checkpoint_parts contained invalid file" + ); + if let Some(ref crc) = segment.listed.latest_crc_file { + assert!( + !invalid_files.contains(&get_file_path_for_test(crc)), + "Latest crc contained invalid file" + ); + } + if let Some(ref latest_commit) = segment.listed.latest_commit_file { + assert!( + !invalid_files.contains(&get_file_path_for_test(latest_commit)), + "Latest commit contained invalid file" + ); + } + } + + for invalid_file in invalid_files.iter() { + let invalid_path = Path::from(*invalid_file); + storage.put(&invalid_path, vec![1u8].into()).await?; + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + ensure_segment_does_not_contain(&invalid_files, snapshot.log_segment()); + storage.delete(&invalid_path).await?; + } + + // final test with _all_ the files we should ignore + for invalid_file in invalid_files.iter() { + let invalid_path = Path::from(*invalid_file); + storage.put(&invalid_path, vec![1u8].into()).await?; + } + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + ensure_segment_does_not_contain(&invalid_files, snapshot.log_segment()); + + Ok(()) +} + +// Verifies data skipping works via stats_parsed across all checkpoint types. +// All tables have writeStatsAsStruct=true, writeStatsAsJson=false (struct stats only), +// schema (id: long, value: string), 5 rows (id 1-5), checkpoint at v5. +// Predicate id > 3 skips files where max(id) <= 3, returning only rows 4 and 5. +#[rstest::rstest] +#[test] +fn checkpoint_stats_skipping( + #[values( + "v1-single-part", + "v1-multi-part", + "v2-parquet-sidecars", + "v2-json-sidecars", + "v2-classic-parquet" + )] + checkpoint_type: &str, +) -> Result<(), Box> { + let table_path = format!("./tests/data/{checkpoint_type}-struct-stats-only/"); + let expected = vec![ + "+----+---------+", + "| id | value |", + "+----+---------+", + "| 4 | value_4 |", + "| 5 | value_5 |", + "+----+---------+", + ]; + let predicate = column_expr!("id").gt(Expr::literal(3i64)); + read_table_data_str(&table_path, None, Some(predicate), expected)?; + Ok(()) +} + +// Multi-part V1 checkpoint with partitionValues_parsed on a partitioned table. +// Schema: (id: long, value: string, part: int) partitioned by part. +// Each commit inserts one row with part = i % 3 (parts 0, 1, 2). +#[test] +fn partition_values_parsed_skipping() -> Result<(), Box> { + // Predicate part = 0 should skip partitions 1 and 2, returning rows with part=0. 
+ // i % 3 == 0: i=3 -> (3, "value_3", 0) + let expected = vec![ + "+----+---------+------+", + "| id | value | part |", + "+----+---------+------+", + "| 3 | value_3 | 0 |", + "+----+---------+------+", + ]; + let predicate = column_expr!("part").eq(Expr::literal(0i32)); + read_table_data_str( + "./tests/data/v1-multi-part-partitioned-struct-stats-only/", + None, + Some(predicate), + expected, + )?; + Ok(()) +} + +// In-memory test with crafted truncated JSON stats. Three files: +// file 1: ts_col [1s, 2s] -- max at ms boundary +// file 2: ts_col [3s, 4.000500s] -- JSON max truncated to 4.000s +// file 3: ts_col [7s, 8s] -- max at ms boundary +// +// Predicate `ts_col > 4_000_400us`: +// file 1: max=2s << adjusted predicate (3_999_401) -> pruned (skipping works) +// file 2: truncated max=4s > adjusted predicate (3_999_401) -> kept (truncation safe) +// file 3: max=8s >> adjusted predicate -> kept +#[tokio::test] +async fn timestamp_max_stat_truncation_does_not_over_prune( +) -> Result<(), Box> { + let ts_metadata = "{\ + \"id\":\"test-ts-table\",\ + \"format\":{\"provider\":\"parquet\",\"options\":{}},\ + \"schemaString\":\"{\\\"type\\\":\\\"struct\\\",\\\"fields\\\":[\ + {\\\"name\\\":\\\"ts_col\\\",\\\"type\\\":\\\"timestamp\\\",\\\"nullable\\\":true,\\\"metadata\\\":{}}\ + ]}\",\ + \"partitionColumns\":[],\ + \"configuration\":{},\ + \"createdTime\":1700000000000\ + }"; + + let ts_schema = Arc::new(ArrowSchema::new(vec![ArrowField::new( + "ts_col", + delta_kernel::arrow::datatypes::DataType::Timestamp( + TimeUnit::Microsecond, + Some("UTC".into()), + ), + true, + )])); + + // file1: max at ms boundary, will be pruned + let file1_stats = r#"{"numRecords":2,"nullCount":{"ts_col":0},"minValues":{"ts_col":"1970-01-01T00:00:01.000Z"},"maxValues":{"ts_col":"1970-01-01T00:00:02.000Z"}}"#; + // file2: max truncated from 4.000500s to 4.000s -- truncation adjustment must keep this + let file2_stats = r#"{"numRecords":2,"nullCount":{"ts_col":0},"minValues":{"ts_col":"1970-01-01T00:00:03.000Z"},"maxValues":{"ts_col":"1970-01-01T00:00:04.000Z"}}"#; + // file3: max clearly above predicate + let file3_stats = r#"{"numRecords":2,"nullCount":{"ts_col":0},"minValues":{"ts_col":"1970-01-01T00:00:07.000Z"},"maxValues":{"ts_col":"1970-01-01T00:00:08.000Z"}}"#; + + let batch1 = RecordBatch::try_new( + ts_schema.clone(), + vec![Arc::new( + TimestampMicrosecondArray::from(vec![1_000_000i64, 2_000_000]).with_timezone("UTC"), + )], + )?; + let batch2 = RecordBatch::try_new( + ts_schema.clone(), + vec![Arc::new( + TimestampMicrosecondArray::from(vec![3_000_000i64, 4_000_500]).with_timezone("UTC"), + )], + )?; + let batch3 = RecordBatch::try_new( + ts_schema, + vec![Arc::new( + TimestampMicrosecondArray::from(vec![7_000_000i64, 8_000_000]).with_timezone("UTC"), + )], + )?; + + let file1_bytes = record_batch_to_bytes(&batch1); + let file2_bytes = record_batch_to_bytes(&batch2); + let file3_bytes = record_batch_to_bytes(&batch3); + + let storage = Arc::new(InMemory::new()); + let table_root = "memory:///"; + + let make_add = |name: &str, size: usize, stats: &str| -> String { + format!( + "{{\"add\":{{\"path\":\"{name}\",\"partitionValues\":{{}},\"size\":{size},\ + \"modificationTime\":1700000000000,\"dataChange\":true,\ + \"stats\":\"{stats_escaped}\"}}}}", + stats_escaped = stats.replace('"', "\\\""), + ) + }; + + let commit0 = format!( + "{{\"protocol\":{{\"minReaderVersion\":1,\"minWriterVersion\":2}}}}\n\ + {{\"metaData\":{ts_metadata}}}\n\ + {}", + make_add("file1.parquet", file1_bytes.len(), file1_stats) + ); 
+ add_commit(table_root, storage.as_ref(), 0, commit0).await?; + add_commit( + table_root, + storage.as_ref(), + 1, + make_add("file2.parquet", file2_bytes.len(), file2_stats), + ) + .await?; + add_commit( + table_root, + storage.as_ref(), + 2, + make_add("file3.parquet", file3_bytes.len(), file3_stats), + ) + .await?; + + storage + .put(&Path::from("file1.parquet"), file1_bytes.into()) + .await?; + storage + .put(&Path::from("file2.parquet"), file2_bytes.into()) + .await?; + storage + .put(&Path::from("file3.parquet"), file3_bytes.into()) + .await?; + + let engine = Arc::new(DefaultEngineBuilder::new(storage.clone()).build()); + let snapshot = Snapshot::builder_for(table_root).build(engine.as_ref())?; + + let row_count = |predicate_us: i64| -> Result> { + let predicate = Arc::new(Pred::gt( + column_expr!("ts_col"), + Expr::literal(Scalar::Timestamp(predicate_us)), + )); + let scan = snapshot + .clone() + .scan_builder() + .with_predicate(predicate) + .build()?; + let batches = read_scan(&scan, engine.clone())?; + Ok(batches.iter().map(|b| b.num_rows()).sum()) + }; + + // Mid-ms value (4.000400s): adjusted to 3_999_401 + // file1 max=2s < 3_999_401 -> pruned; file2+3 kept (4 rows) + assert_eq!(row_count(4_000_400)?, 4, "mid-ms: file2+file3 kept"); + + // Exact ms boundary (4.000000s = truncated max of file2): adjusted to 3_999_001 + // file1 max=2s < 3_999_001 -> pruned; file2 max=4s > 3_999_001 -> kept (4 rows) + assert_eq!( + row_count(4_000_000)?, + 4, + "exact ms boundary: file2+file3 kept" + ); + + // 1us above ms boundary (4.000001s): adjusted to 3_999_002 + // file1 pruned; file2 max=4s > 3_999_002 -> kept (4 rows) + assert_eq!(row_count(4_000_001)?, 4, "1us above ms: file2+file3 kept"); + + // 998us above ms boundary (4.000998s): adjusted to 3_999_999 + // file2 max=4s > 3_999_999 -> kept (just not prunable) + assert_eq!( + row_count(4_000_998)?, + 4, + "just not prunable: file2+file3 kept" + ); + + // 999us above ms boundary (4.000999s): adjusted to 4_000_000 + // file2 max=4s == 4_000_000 -> NOT strictly greater -> pruned (just prunable) + assert_eq!(row_count(4_000_999)?, 2, "just prunable: only file3 kept"); + + // Next ms boundary (4.001000s): adjusted to 4_000_001 + // file2 max=4s < 4_000_001 -> pruned (2 rows) + assert_eq!( + row_count(4_001_000)?, + 2, + "next ms boundary: only file3 kept" + ); + + Ok(()) +} + +// End-to-end tests using a Spark-written Delta table with real truncated JSON stats. +// Table has three files: +// file 1: id=[1,2], ts_col=[1s, 2s] -- max at ms boundary +// file 2: id=[3,4], ts_col=[3s, 4.000500s] -- max truncated to 4.000s in JSON stats +// file 3: id=[5,6], ts_col=[7s, 8s] -- max at ms boundary +// +// Predicate value 4.000400s sits between the truncated max (4.000s) and actual max +// (4.000500s) of file 2, exercising the truncation adjustment. 
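// A minimal sketch of the pruning rule that the expected values above and below imply; the
// helper is illustrative only and is not an API used by these tests. Assuming JSON stats
// truncate timestamps to millisecond precision, the true max can exceed the recorded max by
// up to 999us, so for `ts_col > p` a file may only be pruned when its recorded max is at or
// below `p - 999` (e.g. the adjusted threshold 3_999_401 for p = 4_000_400).
fn keep_file_for_gt(recorded_max_us: i64, predicate_us: i64) -> bool {
    // file 2: recorded max 4_000_000 (truncated from 4.000500s) > 3_999_401 -> kept
    // file 1: recorded max 2_000_000 <= 3_999_401 -> pruned
    recorded_max_us > predicate_us - 999
}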
+ +// GT: file1 pruned (max=2s < adjusted 3.999401s), file2+3 kept +#[test] +fn timestamp_truncation_real_table_gt() -> Result<(), Box> { + read_table_data_str( + "./tests/data/timestamp-truncation-stats", + None, + Some(Pred::gt( + column_expr!("ts_col"), + Expr::literal(Scalar::Timestamp(4_000_400)), + )), + vec![ + "+----+-----------------------------+", + "| id | ts_col |", + "+----+-----------------------------+", + "| 3 | 1970-01-01T00:00:03Z |", + "| 4 | 1970-01-01T00:00:04.000500Z |", + "| 5 | 1970-01-01T00:00:07Z |", + "| 6 | 1970-01-01T00:00:08Z |", + "+----+-----------------------------+", + ], + ) +} + +// GE: file1 pruned (max=2s < adjusted 3.999401s), file2+3 kept +#[test] +fn timestamp_truncation_real_table_ge() -> Result<(), Box> { + read_table_data_str( + "./tests/data/timestamp-truncation-stats", + None, + Some(Pred::ge( + column_expr!("ts_col"), + Expr::literal(Scalar::Timestamp(4_000_400)), + )), + vec![ + "+----+-----------------------------+", + "| id | ts_col |", + "+----+-----------------------------+", + "| 3 | 1970-01-01T00:00:03Z |", + "| 4 | 1970-01-01T00:00:04.000500Z |", + "| 5 | 1970-01-01T00:00:07Z |", + "| 6 | 1970-01-01T00:00:08Z |", + "+----+-----------------------------+", + ], + ) +} + +// LT: file3 pruned (min=7s > 4.000400s), file1+2 kept +#[test] +fn timestamp_truncation_real_table_lt() -> Result<(), Box> { + read_table_data_str( + "./tests/data/timestamp-truncation-stats", + None, + Some(Pred::lt( + column_expr!("ts_col"), + Expr::literal(Scalar::Timestamp(4_000_400)), + )), + vec![ + "+----+-----------------------------+", + "| id | ts_col |", + "+----+-----------------------------+", + "| 1 | 1970-01-01T00:00:01Z |", + "| 2 | 1970-01-01T00:00:02Z |", + "| 3 | 1970-01-01T00:00:03Z |", + "| 4 | 1970-01-01T00:00:04.000500Z |", + "+----+-----------------------------+", + ], + ) +} + +// LE: file3 pruned (min=7s > 4.000400s), file1+2 kept +#[test] +fn timestamp_truncation_real_table_le() -> Result<(), Box> { + read_table_data_str( + "./tests/data/timestamp-truncation-stats", + None, + Some(Pred::le( + column_expr!("ts_col"), + Expr::literal(Scalar::Timestamp(4_000_400)), + )), + vec![ + "+----+-----------------------------+", + "| id | ts_col |", + "+----+-----------------------------+", + "| 1 | 1970-01-01T00:00:01Z |", + "| 2 | 1970-01-01T00:00:02Z |", + "| 3 | 1970-01-01T00:00:03Z |", + "| 4 | 1970-01-01T00:00:04.000500Z |", + "+----+-----------------------------+", + ], + ) +} + +// EQ: file1 pruned (max=2s < adjusted 3.999401s), file3 pruned (min=7s > 4.000400s). +// Only file2 kept (ids 3,4). 
+#[test] +fn timestamp_truncation_real_table_eq() -> Result<(), Box> { + read_table_data_str( + "./tests/data/timestamp-truncation-stats", + None, + Some(Pred::eq( + column_expr!("ts_col"), + Expr::literal(Scalar::Timestamp(4_000_400)), + )), + vec![ + "+----+-----------------------------+", + "| id | ts_col |", + "+----+-----------------------------+", + "| 3 | 1970-01-01T00:00:03Z |", + "| 4 | 1970-01-01T00:00:04.000500Z |", + "+----+-----------------------------+", + ], + ) +} diff --git a/kernel/tests/row_tracking.rs b/kernel/tests/row_tracking.rs index b3b67d7773..c5d58f291c 100644 --- a/kernel/tests/row_tracking.rs +++ b/kernel/tests/row_tracking.rs @@ -2,7 +2,6 @@ use std::collections::HashMap; use std::sync::Arc; use itertools::Itertools; -use object_store::{path::Path, ObjectStore}; use serde_json::{Deserializer, Value}; use tempfile::{tempdir, TempDir}; use url::Url; @@ -15,6 +14,7 @@ use delta_kernel::engine::arrow_conversion::TryIntoArrow; use delta_kernel::engine::arrow_data::ArrowEngineData; use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::object_store::{path::Path, DynObjectStore, ObjectStore as _}; use delta_kernel::schema::{DataType, SchemaRef, StructField, StructType}; use delta_kernel::transaction::CommitResult; use delta_kernel::{DeltaResult, Error, Snapshot}; @@ -29,7 +29,7 @@ async fn create_row_tracking_table( ) -> DeltaResult<( Url, Arc>, - Arc, + Arc, )> { let tmp_test_dir_url = Url::from_directory_path(tmp_dir.path()) .map_err(|_| Error::generic("Failed to convert directory path to URL"))?; @@ -59,7 +59,9 @@ async fn write_data_to_table( ) -> DeltaResult { let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; let committer = Box::new(FileSystemCommitter::new()); - let mut txn = snapshot.transaction(committer)?.with_data_change(true); + let mut txn = snapshot + .transaction(committer, engine.as_ref())? + .with_data_change(true); // Write data out by spawning async tasks to simulate executors let write_context = Arc::new(txn.get_write_context()); @@ -116,13 +118,13 @@ where /// Helper function to verify row tracking-related information in a commit. async fn verify_row_tracking_in_commit( - store: &Arc, + store: &Arc, table_url: &Url, commit_version: u64, expected_base_row_ids: Vec, expected_row_id_high_water_mark: i64, ) -> DeltaResult<()> { - let commit_url = table_url.join(&format!("_delta_log/{:020}.json", commit_version))?; + let commit_url = table_url.join(&format!("_delta_log/{commit_version:020}.json"))?; let commit = store.get(&Path::from_url_path(commit_url.path())?).await?; let parsed_actions: Vec<_> = Deserializer::from_slice(&commit.bytes().await?) @@ -598,7 +600,7 @@ async fn test_row_tracking_without_adds() -> DeltaResult<()> { create_row_tracking_table(&tmp_test_dir, "test_consecutive_commits", schema.clone()) .await?; let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; - let txn = snapshot.transaction(Box::new(FileSystemCommitter::new()))?; + let txn = snapshot.transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?; // Commit without adding any add files assert!(txn.commit(engine.as_ref())?.is_committed()); @@ -642,11 +644,11 @@ async fn test_row_tracking_parallel_transactions_conflict() -> DeltaResult<()> { // Create two transactions from the same snapshot (simulating parallel transactions) let mut txn1 = snapshot1 - .transaction(Box::new(FileSystemCommitter::new()))? 
+ .transaction(Box::new(FileSystemCommitter::new()), engine1.as_ref())? .with_engine_info("transaction 1") .with_data_change(true); let mut txn2 = snapshot2 - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), engine2.as_ref())? .with_engine_info("transaction 2") .with_data_change(true); diff --git a/kernel/tests/v2_checkpoints.rs b/kernel/tests/v2_checkpoints.rs index a3ae99b23b..e768ad5c28 100644 --- a/kernel/tests/v2_checkpoints.rs +++ b/kernel/tests/v2_checkpoints.rs @@ -1,22 +1,24 @@ -use delta_kernel::arrow::array::RecordBatch; -use delta_kernel::engine::default::DefaultEngine; +use std::sync::Arc; +use delta_kernel::arrow::array::{Int32Array, RecordBatch}; +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::schema::{DataType, StructField, StructType}; +use delta_kernel::transaction::create_table::create_table; +use delta_kernel::transaction::CommitResult; use delta_kernel::{DeltaResult, Snapshot}; mod common; -use test_utils::{load_test_data, DefaultEngineExtension}; - use itertools::Itertools; -use test_utils::read_scan; +use test_utils::{insert_data, load_test_data, read_scan, test_table_setup_mt}; fn read_v2_checkpoint_table(test_name: impl AsRef) -> DeltaResult> { let test_dir = load_test_data("tests/data", test_name.as_ref()).unwrap(); let test_path = test_dir.path().join(test_name.as_ref()); - let engine = DefaultEngine::new_local(); let url = delta_kernel::try_parse_uri(test_path.to_str().expect("table path to string")).unwrap(); + let engine = test_utils::create_default_engine(&url)?; let snapshot = Snapshot::builder_for(url).build(engine.as_ref()).unwrap(); let scan = snapshot.scan_builder().build()?; let batches = read_scan(&scan, engine)?; @@ -223,3 +225,79 @@ fn v2_checkpoints_parquet_with_last_checkpoint() -> DeltaResult<()> { get_simple_id_table(), ) } + +/// Tests that writing a V2 checkpoint to parquet succeeds. +/// +/// V2 checkpoints include a checkpointMetadata batch in addition to the regular action +/// batches. All batches in a parquet file must share the same schema. This test verifies +/// that `snapshot.checkpoint()` can write a V2 checkpoint without schema mismatch errors. +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_v2_checkpoint_parquet_write() -> DeltaResult<()> { + let (_temp_dir, table_path, engine) = test_table_setup_mt()?; + let table_url = delta_kernel::try_parse_uri(&table_path)?; + + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "value", + DataType::INTEGER, + )])?); + let _ = create_table(&table_path, schema.clone(), "Test/1.0") + .with_table_properties([("delta.feature.v2Checkpoint", "supported")]) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + // Commit an add action via the transaction API so the checkpoint has action batches + let snapshot0 = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let result = insert_data( + snapshot0, + &engine, + vec![Arc::new(Int32Array::from(vec![1]))], + ) + .await?; + + let CommitResult::CommittedTransaction(committed) = result else { + panic!("Expected CommittedTransaction"); + }; + + let snapshot = committed + .post_commit_snapshot() + .expect("expected post-commit snapshot"); + + // This writes to parquet — will fail if the checkpointMetadata batch has a different + // schema than the action batches. 
+ snapshot.checkpoint(engine.as_ref())?; + + // Verify the checkpoint was written and is used by a fresh snapshot + let snapshot2 = Snapshot::builder_for(table_url).build(engine.as_ref())?; + assert_eq!(snapshot2.version(), 1); + let log_segment = snapshot2.log_segment(); + assert!( + !log_segment.listed.checkpoint_parts.is_empty(), + "expected snapshot to use the written checkpoint, but checkpoint_parts is empty" + ); + assert_eq!( + log_segment.checkpoint_version, + Some(1), + "expected checkpoint at version 1" + ); + assert!( + log_segment.listed.ascending_commit_files.is_empty(), + "expected no commit files after checkpoint, but found: {:?}", + log_segment.listed.ascending_commit_files + ); + + // Verify reading data from the checkpointed snapshot returns the expected rows + let scan = snapshot2.scan_builder().build()?; + let batches = read_scan(&scan, engine.clone() as Arc)?; + assert_batches_sorted_eq!( + vec![ + "+-------+", + "| value |", + "+-------+", + "| 1 |", + "+-------+", + ], + &batches + ); + + Ok(()) +} diff --git a/kernel/tests/write.rs b/kernel/tests/write.rs index cc534b36d1..1a797f0ed6 100644 --- a/kernel/tests/write.rs +++ b/kernel/tests/write.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use delta_kernel::committer::FileSystemCommitter; use delta_kernel::Error as KernelError; @@ -7,7 +8,8 @@ use delta_kernel::{DeltaResult, Engine, Snapshot, Version}; use url::Url; use uuid::Uuid; -use delta_kernel::arrow::array::{ArrayRef, BinaryArray, StructArray}; +use delta_kernel::actions::deletion_vector::{DeletionVectorDescriptor, DeletionVectorStorageType}; +use delta_kernel::arrow::array::{Array, ArrayRef, BinaryArray, Int64Array, StructArray}; use delta_kernel::arrow::array::{Int32Array, StringArray, TimestampMicrosecondArray}; use delta_kernel::arrow::buffer::NullBuffer; use delta_kernel::arrow::datatypes::{DataType as ArrowDataType, Field}; @@ -16,31 +18,73 @@ use delta_kernel::arrow::record_batch::RecordBatch; use delta_kernel::engine::arrow_conversion::{TryFromKernel, TryIntoArrow as _}; use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; +use delta_kernel::engine::default::executor::tokio::{ + TokioBackgroundExecutor, TokioMultiThreadExecutor, +}; use delta_kernel::engine::default::parquet::DefaultParquetHandler; use delta_kernel::engine::default::DefaultEngine; - +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::engine_data::FilteredEngineData; +use delta_kernel::object_store::local::LocalFileSystem; +use delta_kernel::object_store::path::Path; +use delta_kernel::object_store::{DynObjectStore, ObjectStore as _}; +use delta_kernel::transaction::create_table::create_table as create_table_txn; use delta_kernel::transaction::CommitResult; use tempfile::TempDir; use test_utils::set_json_value; use itertools::Itertools; -use object_store::path::Path; -use object_store::ObjectStore; use serde_json::json; use serde_json::Deserializer; use tempfile::tempdir; -use delta_kernel::schema::{DataType, SchemaRef, StructField, StructType}; +use delta_kernel::expressions::ColumnName; +use delta_kernel::parquet::file::reader::{FileReader, SerializedFileReader}; +use delta_kernel::schema::{ + ColumnMetadataKey, DataType, MetadataValue, SchemaRef, StructField, StructType, +}; +use delta_kernel::table_features::{get_any_level_column_physical_name, ColumnMappingMode}; +use delta_kernel::FileMeta; +use 
test_utils::create_default_engine_mt_executor; use test_utils::{ - assert_result_error_with_message, create_table, engine_store_setup, setup_test_tables, - test_read, + assert_partition_values, assert_result_error_with_message, assert_schema_has_field, + copy_directory, create_add_files_metadata, create_default_engine, create_table, + create_table_and_load_snapshot, engine_store_setup, nested_batches, nested_schema, + read_actions_from_commit, read_add_infos, remove_all_and_get_remove_actions, resolve_field, + setup_test_tables, test_read, test_table_setup, write_batch_to_table, }; mod common; +/// Returns the native parquet `field_id` for a field at the given physical path in a parquet file, +/// or `None` if the field has no `field_id` set. +/// +/// Panics if the file cannot be read or the physical path doesn't exist in the parquet schema. +fn get_parquet_field_id(parquet_file: &std::path::Path, physical_path: &[String]) -> Option { + let file = std::fs::File::open(parquet_file).unwrap(); + let reader = SerializedFileReader::new(file).unwrap(); + let root = reader + .metadata() + .file_metadata() + .schema_descr() + .root_schema() + .clone(); + + let mut current = &root; + for name in physical_path { + current = current + .get_fields() + .iter() + .find(|f| f.name() == name) + .unwrap_or_else(|| panic!("parquet schema missing field '{name}'")); + } + + let info = current.get_basic_info(); + info.has_id().then(|| info.id()) +} + fn validate_txn_id(commit_info: &serde_json::Value) { let txn_id = commit_info["txnId"] .as_str() @@ -48,18 +92,206 @@ fn validate_txn_id(commit_info: &serde_json::Value) { Uuid::parse_str(txn_id).expect("txnId should be valid UUID format"); } +fn validate_timestamp(commit_info: &serde_json::Value) { + let timestamp = commit_info["timestamp"] + .as_i64() + .expect("timestamp should be present in commitInfo"); + let current_ts: i64 = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis() + .try_into() + .unwrap(); + let two_days_ms = Duration::from_secs(2 * 24 * 60 * 60).as_millis() as i64; + assert!( + (timestamp <= current_ts && timestamp > current_ts - two_days_ms), + "commit timestamp should be at most 2 days behind current system time: got {timestamp}, now {current_ts}" + ); +} + const ZERO_UUID: &str = "00000000-0000-0000-0000-000000000000"; +/// Creates a table with deletion vector support and writes the specified files +async fn create_dv_table_with_files( + table_name: &str, + schema: Arc, + file_paths: &[&str], +) -> Result< + ( + Arc, + Arc, + Url, + Vec, + ), + Box, +> { + let (store, engine, table_url) = engine_store_setup(table_name, None); + let engine = Arc::new(engine); + + // Create table with DV support (protocol 3/7 with deletionVectors feature) + create_table( + store.clone(), + table_url.clone(), + schema.clone(), + &[], + true, // use_37_protocol + vec!["deletionVectors"], + vec!["deletionVectors"], + ) + .await?; + + // Write files + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? 
+ .with_engine_info("test engine") + .with_operation("WRITE".to_string()) + .with_data_change(true); + + let add_files_schema = txn.add_files_schema(); + + // Build metadata for all files at once + let files: Vec<(&str, i64, i64, i64)> = file_paths + .iter() + .enumerate() + .map(|(i, &path)| { + ( + path, + 1024 + i as i64 * 100, // size + 1000000 + i as i64, // mod_time + 3, // num_records + ) + }) + .collect(); + let metadata = create_add_files_metadata(add_files_schema, files)?; + txn.add_files(metadata); + + let _ = txn.commit(engine.as_ref())?; + + let paths: Vec = file_paths.iter().map(|&s| s.to_string()).collect(); + Ok((store, engine, table_url, paths)) +} + +/// Extracts scan files from a snapshot for use in deletion vector updates +fn get_scan_files( + snapshot: Arc, + engine: &dyn delta_kernel::Engine, +) -> DeltaResult> { + let scan = snapshot.scan_builder().build()?; + let all_scan_metadata: Vec<_> = scan.scan_metadata(engine)?.collect::, _>>()?; + + Ok(all_scan_metadata + .into_iter() + .map(|sm| sm.scan_files) + .collect()) +} + +fn get_simple_int_schema() -> Arc { + Arc::new(StructType::try_new(vec![StructField::nullable("number", DataType::INTEGER)]).unwrap()) +} + +/// Write a metadata-update commit that sets a table property on the existing table. +/// Returns a fresh snapshot reflecting the new commit. +/// Used in tests as a hack to set table properties when create table doesn't support the property. +fn set_table_properties( + table_path: &str, + table_url: &Url, + engine: &dyn Engine, + current_version: Version, + properties: &[(&str, &str)], +) -> Result, Box> { + let v0_path = std::path::Path::new(table_path).join("_delta_log/00000000000000000000.json"); + let mut meta: serde_json::Value = std::fs::read_to_string(&v0_path)? + .lines() + .find_map(|line| { + serde_json::from_str::(line) + .ok() + .filter(|v| v.get("metaData").is_some()) + }) + .expect("version 0 should contain a metaData action"); + + for &(key, value) in properties { + meta["metaData"]["configuration"][key] = json!(value); + } + + let new_commit = std::path::Path::new(table_path) + .join(format!("_delta_log/{:020}.json", current_version + 1)); + std::fs::write(&new_commit, serde_json::to_string(&meta)?)?; + Ok(Snapshot::builder_for(table_url.clone()).build(engine)?) +} + +/// Assert that the snapshot's column mapping mode matches the given `cm_mode` string, +/// and return the resolved mode. +fn assert_column_mapping_mode(snapshot: &Snapshot, cm_mode: &str) -> ColumnMappingMode { + let expected = match cm_mode { + "none" => ColumnMappingMode::None, + "name" => ColumnMappingMode::Name, + "id" => ColumnMappingMode::Id, + _ => panic!("unexpected cm_mode: {cm_mode}"), + }; + let actual = snapshot + .table_properties() + .column_mapping_mode + .expect("column mapping mode should be set"); + assert_eq!(actual, expected); + actual +} + +/// Resolve a nested column inside a [`StructArray`] by walking the given field-name path, +/// and downcast the leaf to the requested array type. 
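/// For illustration, a hypothetical usage sketch (the variable and field names here are
/// assumptions, not taken from a specific test below):
/// `let offset: &Int32Array = resolve_struct_field(&add_struct, &["deletionVector".to_string(), "offset".to_string()]);`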
+fn resolve_struct_field<'a, T: 'static>(root: &'a StructArray, path: &[String]) -> &'a T { + assert!(!path.is_empty(), "path must be non-empty"); + let mut current: &StructArray = root; + for (i, name) in path.iter().enumerate() { + let col = current + .column_by_name(name) + .unwrap_or_else(|| panic!("missing field: {name}")); + if i == path.len() - 1 { + return col + .as_any() + .downcast_ref::() + .expect("leaf array type mismatch"); + } + current = col + .as_any() + .downcast_ref::() + .unwrap_or_else(|| panic!("expected StructArray at field: {name}")); + } + unreachable!() +} + +/// Navigate into a nested JSON value by following a sequence of object keys. +/// E.g. `resolve_json_path(stats, &["address", "street"])` returns `stats["address"]["street"]`. +fn resolve_json_path<'a>(root: &'a serde_json::Value, path: &[String]) -> &'a serde_json::Value { + path.iter().fold(root, |v, key| &v[key]) +} + +/// Assert that `stats["minValues"]` and `stats["maxValues"]` at the given physical path equal the +/// expected values. +fn assert_min_max_stats( + stats: &serde_json::Value, + physical_path: &[String], + expected_min: impl Into, + expected_max: impl Into, +) { + assert_eq!( + *resolve_json_path(&stats["minValues"], physical_path), + expected_min.into() + ); + assert_eq!( + *resolve_json_path(&stats["maxValues"], physical_path), + expected_max.into() + ); +} + #[tokio::test] async fn test_commit_info() -> Result<(), Box> { // setup tracing let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); for (table_url, engine, store, table_name) in setup_test_tables(schema, &[], None, "test_table").await? @@ -68,7 +300,7 @@ async fn test_commit_info() -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let committer = Box::new(FileSystemCommitter::new()); let txn = snapshot - .transaction(committer)? + .transaction(committer, &engine)? .with_engine_info("default engine"); // commit! @@ -127,7 +359,7 @@ fn check_action_timestamps<'a>( // list all the files at `path` and check that all parquet files have the same size, and return // that size -async fn get_and_check_all_parquet_sizes(store: Arc, path: &str) -> u64 { +async fn get_and_check_all_parquet_sizes(store: Arc, path: &str) -> u64 { use futures::stream::StreamExt; let files: Vec<_> = store.list(Some(&Path::from(path))).collect().await; let parquet_files = files @@ -153,7 +385,9 @@ async fn write_data_and_check_result_and_stats( ) -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; let committer = Box::new(FileSystemCommitter::new()); - let mut txn = snapshot.transaction(committer)?.with_data_change(true); + let mut txn = snapshot + .transaction(committer, engine.as_ref())? 
+ .with_data_change(true); // create two new arrow record batches to append let append_data = [[1, 2, 3], [4, 5, 6]].map(|data| -> DeltaResult<_> { @@ -210,17 +444,14 @@ async fn test_commit_info_action() -> Result<(), Box> { // setup tracing let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); for (table_url, engine, store, table_name) in setup_test_tables(schema.clone(), &[], None, "test_table").await? { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_engine_info("default engine"); let _ = txn.commit(&engine)?; @@ -258,15 +489,96 @@ async fn test_commit_info_action() -> Result<(), Box> { Ok(()) } +/// Verifies that when `engine_commit_info` is provided (the `Some` branch of `build_commit_info`): +/// - The written JSON is correctly wrapped in a top-level `"commitInfo"` key. +/// - Engine-only fields (not in `CommitInfo::to_schema()`) pass through to the log unchanged. +/// - Fields that overlap with kernel-managed CommitInfo fields are overridden by kernel values, +/// - All kernel-managed fields (`timestamp`, `kernelVersion`, `txnId`, `operationParameters`) +/// are present with correct values. +#[tokio::test] +async fn test_commit_info_with_engine_commit_info() -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + let schema = get_simple_int_schema(); + + for (table_url, engine, store, table_name) in + setup_test_tables(schema, &[], None, "test_table").await? + { + // Build engine_commit_info with: + // - "myApp" : engine-only field, must pass through unchanged. + // - "myVersion": engine-only field, must pass through unchanged. + // - "operation": overlapping with CommitInfo; kernel must override with "WRITE". + let arrow_schema = Arc::new(delta_kernel::arrow::datatypes::Schema::new(vec![ + Field::new("myApp", ArrowDataType::Utf8, false), + Field::new("myVersion", ArrowDataType::Utf8, false), + Field::new("operation", ArrowDataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + arrow_schema, + vec![ + Arc::new(StringArray::from(vec!["spark"])) as ArrayRef, + Arc::new(StringArray::from(vec!["3.5.0"])) as ArrayRef, + Arc::new(StringArray::from(vec!["STALE_OP"])) as ArrayRef, + ], + )?; + let engine_schema = Arc::new(StructType::new_unchecked(vec![ + StructField::not_null("myApp", DataType::STRING), + StructField::not_null("myVersion", DataType::STRING), + StructField::nullable("operation", DataType::STRING), + ])); + + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + let txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine)? + .with_operation("WRITE".to_string()) + .with_commit_info(Box::new(ArrowEngineData::new(batch)), engine_schema); + + let _ = txn.commit(&engine)?; + + let commit = store + .get(&Path::from(format!( + "/{table_name}/_delta_log/00000000000000000001.json" + ))) + .await?; + + let mut parsed_commits: Vec<_> = Deserializer::from_slice(&commit.bytes().await?) + .into_iter::() + .try_collect()?; + + validate_txn_id(&parsed_commits[0]["commitInfo"]); + validate_timestamp(&parsed_commits[0]["commitInfo"]); + + // Zero out non-deterministic fields for stable comparison. 
+ set_json_value(&mut parsed_commits[0], "commitInfo.timestamp", json!(0))?; + set_json_value(&mut parsed_commits[0], "commitInfo.txnId", json!(ZERO_UUID))?; + + // Null-valued CommitInfo fields (inCommitTimestamp, isBlindAppend, engineInfo) are + // omitted from the JSON — consistent with how the Delta log serializes optional fields. + let expected_commits = vec![json!({ + "commitInfo": { + // Engine-only fields pass through unchanged. + "myApp": "spark", + "myVersion": "3.5.0", + // Kernel overrides the engine's stale "STALE_OP" with the real operation. + "operation": "WRITE", + // Remaining kernel-managed non-null fields are appended. + "operationParameters": {}, + "kernelVersion": format!("v{}", env!("CARGO_PKG_VERSION")), + "txnId": ZERO_UUID, + "timestamp": 0, + } + })]; + + assert_eq!(parsed_commits, expected_commits); + } + Ok(()) +} + #[tokio::test] async fn test_append() -> Result<(), Box> { // setup tracing let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); for (table_url, engine, store, table_name) in setup_test_tables(schema.clone(), &[], None, "test_table").await? @@ -321,7 +633,7 @@ async fn test_append() -> Result<(), Box> { "size": size, "modificationTime": 0, "dataChange": true, - "stats": "{\"numRecords\":3}" + "stats": "{\"numRecords\":3,\"nullCount\":{\"number\":0},\"minValues\":{\"number\":1},\"maxValues\":{\"number\":3},\"tightBounds\":true}" } }), json!({ @@ -331,7 +643,7 @@ async fn test_append() -> Result<(), Box> { "size": size, "modificationTime": 0, "dataChange": true, - "stats": "{\"numRecords\":3}" + "stats": "{\"numRecords\":3,\"nullCount\":{\"number\":0},\"minValues\":{\"number\":4},\"maxValues\":{\"number\":6},\"tightBounds\":true}" } }), ]; @@ -355,17 +667,14 @@ async fn test_no_add_actions() -> Result<(), Box> { // setup tracing let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); for (table_url, engine, store, table_name) in setup_test_tables(schema.clone(), &[], None, "test_table").await? { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_engine_info("default engine"); // Commit without adding any add files @@ -393,10 +702,7 @@ async fn test_append_twice() -> Result<(), Box> { // setup tracing let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); for (table_url, engine, _, _) in setup_test_tables(schema.clone(), &[], None, "test_table").await? @@ -444,7 +750,7 @@ async fn test_append_partitioned() -> Result<(), Box> { { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let mut txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? 
.with_engine_info("default engine") .with_data_change(false); @@ -535,7 +841,7 @@ async fn test_append_partitioned() -> Result<(), Box> { "size": size, "modificationTime": 0, "dataChange": false, - "stats": "{\"numRecords\":3}" + "stats": "{\"numRecords\":3,\"nullCount\":{\"number\":0},\"minValues\":{\"number\":1},\"maxValues\":{\"number\":3},\"tightBounds\":true}" } }), json!({ @@ -547,7 +853,7 @@ async fn test_append_partitioned() -> Result<(), Box> { "size": size, "modificationTime": 0, "dataChange": false, - "stats": "{\"numRecords\":3}" + "stats": "{\"numRecords\":3,\"nullCount\":{\"number\":0},\"minValues\":{\"number\":4},\"maxValues\":{\"number\":6},\"tightBounds\":true}" } }), ]; @@ -589,7 +895,7 @@ async fn test_append_invalid_schema() -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_engine_info("default engine"); // create two new arrow record batches to append @@ -621,9 +927,12 @@ async fn test_append_invalid_schema() -> Result<(), Box> let mut add_files_metadata = futures::future::join_all(tasks).await.into_iter().flatten(); assert!(add_files_metadata.all(|res| match res { - Err(KernelError::Arrow(ArrowError::SchemaError(_))) => true, + Err(KernelError::Arrow(ArrowError::InvalidArgumentError(_))) => true, Err(KernelError::Backtraced { source, .. }) - if matches!(&*source, KernelError::Arrow(ArrowError::SchemaError(_))) => + if matches!( + &*source, + KernelError::Arrow(ArrowError::InvalidArgumentError(_)) + ) => true, _ => false, })); @@ -637,10 +946,7 @@ async fn test_write_txn_actions() -> Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); for (table_url, engine, store, table_name) in setup_test_tables(schema, &[], None, "test_table").await? @@ -649,7 +955,7 @@ async fn test_write_txn_actions() -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; assert!(matches!( snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_transaction_id("app_id1".to_string(), 0) .with_transaction_id("app_id1".to_string(), 1) .commit(&engine), @@ -658,7 +964,7 @@ async fn test_write_txn_actions() -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? 
.with_engine_info("default engine") .with_transaction_id("app_id1".to_string(), 1) .with_transaction_id("app_id2".to_string(), 2); @@ -669,18 +975,9 @@ async fn test_write_txn_actions() -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()) .at_version(1) .build(&engine)?; - assert_eq!( - snapshot.clone().get_app_id_version("app_id1", &engine)?, - Some(1) - ); - assert_eq!( - snapshot.clone().get_app_id_version("app_id2", &engine)?, - Some(2) - ); - assert_eq!( - snapshot.clone().get_app_id_version("app_id3", &engine)?, - None - ); + assert_eq!(snapshot.get_app_id_version("app_id1", &engine)?, Some(1)); + assert_eq!(snapshot.get_app_id_version("app_id2", &engine)?, Some(2)); + assert_eq!(snapshot.get_app_id_version("app_id3", &engine)?, None); let commit1 = store .get(&Path::from(format!( @@ -792,7 +1089,7 @@ async fn test_append_timestamp_ntz() -> Result<(), Box> { let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let mut txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_engine_info("default engine"); // Create Arrow data with TIMESTAMP_NTZ values including edge cases @@ -879,38 +1176,20 @@ async fn test_append_variant() -> Result<(), Box> { // create a table with VARIANT column let table_schema = Arc::new(StructType::try_new(vec![ - StructField::nullable("v", DataType::unshredded_variant()) - .with_metadata([("delta.columnMapping.physicalName", "col1")]) - .add_metadata([("delta.columnMapping.id", 1)]), - StructField::nullable("i", DataType::INTEGER) - .with_metadata([("delta.columnMapping.physicalName", "col2")]) - .add_metadata([("delta.columnMapping.id", 2)]), + StructField::nullable("v", DataType::unshredded_variant()), + StructField::nullable("i", DataType::INTEGER), StructField::nullable( "nested", // We flip the value and metadata fields in the actual parquet file for the test StructType::try_new(vec![StructField::nullable( "nested_v", unshredded_variant_schema_flipped(), - ) - .with_metadata([("delta.columnMapping.physicalName", "col21")]) - .add_metadata([("delta.columnMapping.id", 3)])])?, - ) - .with_metadata([("delta.columnMapping.physicalName", "col3")]) - .add_metadata([("delta.columnMapping.id", 4)]), - ])?); - - let write_schema = Arc::new(StructType::try_new(vec![ - StructField::nullable("col1", DataType::unshredded_variant()), - StructField::nullable("col2", DataType::INTEGER), - StructField::nullable( - "col3", - StructType::try_new(vec![StructField::nullable( - "col21", - unshredded_variant_schema_flipped(), )])?, ), ])?); + let write_schema = table_schema.clone(); + let tmp_test_dir = tempdir()?; let tmp_test_dir_url = Url::from_directory_path(tmp_test_dir.path()).unwrap(); @@ -920,23 +1199,20 @@ async fn test_append_variant() -> Result<(), Box> { // We can add shredding features as well as we are allowed to write unshredded variants // into shredded tables and shredded reads are explicitly blocked in the default // engine's parquet reader. - // TODO: (#1124) we don't actually support column mapping writes yet, but have some - // tests that do column mapping on writes. For now omit the writer feature to let tests - // run, but after actual support this should be enabled. 
let table_url = create_table( store.clone(), table_location, table_schema.clone(), &[], true, - vec!["variantType", "variantShredding-preview", "columnMapping"], + vec!["variantType", "variantShredding-preview"], vec!["variantType", "variantShredding-preview"], ) .await?; let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let mut txn = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_data_change(true); // First value corresponds to the variant value "1". Third value corresponds to the variant @@ -1015,7 +1291,7 @@ async fn test_append_variant() -> Result<(), Box> { Arc::new(Int32Array::from(i_values.clone())), // nested struct Arc::new(StructArray::try_new( - vec![Field::new("col21", variant_arrow_type_flipped(), true)].into(), + vec![Field::new("nested_v", variant_arrow_type_flipped(), true)].into(), vec![variant_nested_v_array.clone()], None, )?), @@ -1036,6 +1312,7 @@ async fn test_append_variant() -> Result<(), Box> { write_context.target_dir(), Box::new(ArrowEngineData::new(data.clone())), HashMap::new(), + Some(write_context.stats_columns()), ) .await?; @@ -1147,7 +1424,7 @@ async fn test_shredded_variant_read_rejection() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let table_name = "test_domain_metadata_basic"; @@ -1267,7 +1542,7 @@ async fn test_set_domain_metadata_basic() -> Result<(), Box Result<(), Box assert_eq!(config, config1), d if d == domain2 => assert_eq!(config, config2), - _ => panic!("Unexpected domain: {}", domain), + _ => panic!("Unexpected domain: {domain}"), } } @@ -1325,10 +1600,7 @@ async fn test_set_domain_metadata_basic() -> Result<(), Box Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let table_name = "test_domain_metadata_errors"; let (store, engine, table_location) = engine_store_setup(table_name, None); @@ -1348,7 +1620,7 @@ async fn test_set_domain_metadata_errors() -> Result<(), Box Result<(), Box Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let table_name = "test_domain_metadata_unsupported"; @@ -1400,7 +1669,7 @@ async fn test_set_domain_metadata_unsupported_writer_feature( let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; let res = snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? 
.with_domain_metadata("app.config".to_string(), "test_config".to_string()) .commit(&engine); @@ -1409,15 +1678,45 @@ async fn test_set_domain_metadata_unsupported_writer_feature( Ok(()) } +#[tokio::test] +async fn test_remove_domain_metadata_unsupported_writer_feature( +) -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + let schema = get_simple_int_schema(); + + let table_name = "test_remove_domain_metadata_unsupported"; + + // Create table WITHOUT domain metadata writer feature support + let (store, engine, table_location) = engine_store_setup(table_name, None); + let table_url = create_table( + store.clone(), + table_location, + schema.clone(), + &[], + true, + vec![], + vec![], + ) + .await?; + + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + let res = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine)? + .with_domain_metadata_removed("app.config".to_string()) + .commit(&engine); + + assert_result_error_with_message(res, "Domain metadata operations require writer version 7 and the 'domainMetadata' writer feature"); + + Ok(()) +} + #[tokio::test] async fn test_remove_domain_metadata_non_existent_domain() -> Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let table_name = "test_domain_metadata_unsupported"; @@ -1434,7 +1733,7 @@ async fn test_remove_domain_metadata_non_existent_domain() -> Result<(), Box Result<(), Box Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let table_name = "test_domain_metadata_unsupported"; @@ -1495,7 +1791,7 @@ async fn test_domain_metadata_set_remove_conflicts() -> Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let table_name = "test_domain_metadata_unsupported"; @@ -1574,14 +1867,14 @@ async fn test_domain_metadata_set_then_remove() -> Result<(), Box Result<(), Box, + store: Arc, table_url: &Url, version: u64, ) -> Result> { - let commit_path = table_url.join(&format!("_delta_log/{:020}.json", version))?; + let commit_path = table_url.join(&format!("_delta_log/{version:020}.json"))?; let commit = store.get(&Path::from_url_path(commit_path.path())?).await?; let commit_content = String::from_utf8(commit.bytes().await?.to_vec())?; @@ -1634,8 +1927,7 @@ async fn get_ict_at_version( .collect(); assert!( !lines.is_empty(), - "Commit log at version {} should not be empty", - version + "Commit log at version {version} should not be empty" ); // First line should contain commitInfo with inCommitTimestamp @@ -1681,10 +1973,7 @@ async fn test_ict_commit_e2e() -> Result<(), Box> { let _ = tracing_subscriber::fmt::try_init(); // create a simple table: one int column named 'number' with ICT enabled - let schema = Arc::new(StructType::try_new(vec![StructField::nullable( - "number", - DataType::INTEGER, - )])?); + let schema = get_simple_int_schema(); let tmp_dir = TempDir::new()?; let tmp_test_dir_url = Url::from_file_path(&tmp_dir).unwrap(); @@ -1713,7 +2002,7 @@ async fn test_ict_commit_e2e() -> Result<(), Box> { ); let mut txn = 
snapshot - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_engine_info("ict test"); // Add some data @@ -1745,8 +2034,7 @@ async fn test_ict_commit_e2e() -> Result<(), Box> { assert!( first_ict > 1612345678, - "First commit ICT ({}) should be greater than enablement timestamp (1612345678)", - first_ict + "First commit ICT ({first_ict}) should be greater than enablement timestamp (1612345678)" ); // Second commit @@ -1758,7 +2046,7 @@ async fn test_ict_commit_e2e() -> Result<(), Box> { ); let mut txn2 = snapshot2 - .transaction(Box::new(FileSystemCommitter::new()))? + .transaction(Box::new(FileSystemCommitter::new()), &engine)? .with_engine_info("ict test 2"); // Add more data @@ -1791,10 +2079,1875 @@ async fn test_ict_commit_e2e() -> Result<(), Box> { // Verify monotonic property: second_ict > first_ict assert!( second_ict > first_ict, - "Second ICT ({}) should be greater than first ICT ({})", - second_ict, - first_ict + "Second ICT ({second_ict}) should be greater than first ICT ({first_ict})" ); Ok(()) } + +#[tokio::test] +async fn test_remove_files_adds_expected_entries() -> Result<(), Box> { + // This test verifies that Remove actions generated from scan metadata contain all expected fields + // from the Remove struct (defined in kernel/src/actions/mod.rs). + // + // This test uses the table-with-dv-small dataset which contains files with tags and deletion vectors. + // + // Not populated in the dataset are (covered by row_tracking tests): + // baseRowId (optional i64) + // defaultRowCommitVersion (optional i64) + use std::path::PathBuf; + + let _ = tracing_subscriber::fmt::try_init(); + + let tmp_dir = tempdir()?; + let tmp_table_path = tmp_dir.path().join("table-with-dv-small"); + let source_path = std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/"))?; + copy_directory(&source_path, &tmp_table_path)?; + + let table_url = url::Url::from_directory_path(&tmp_table_path).unwrap(); + let engine = create_default_engine(&table_url)?; + + let snapshot = Snapshot::builder_for(table_url.clone()) + .at_version(1) + .build(engine.as_ref())?; + + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? 
+ .with_engine_info("test engine") + .with_data_change(true); + + let scan = snapshot.scan_builder().build()?; + let scan_metadata = scan.scan_metadata(engine.as_ref())?.next().unwrap()?; + + let (data, selection_vector) = scan_metadata.scan_files.into_parts(); + let remove_metadata = FilteredEngineData::try_new(data, selection_vector)?; + + txn.remove_files(remove_metadata); + + let result = txn.commit(engine.as_ref())?; + + match result { + CommitResult::CommittedTransaction(committed) => { + let commit_version = committed.commit_version(); + + // Read the commit log directly to verify remove actions + let commit_path = tmp_table_path.join(format!("_delta_log/{commit_version:020}.json")); + let commit_content = std::fs::read_to_string(commit_path)?; + + let parsed_commits: Vec<_> = Deserializer::from_str(&commit_content) + .into_iter::() + .try_collect()?; + + // Verify we have at least commitInfo and remove actions + assert!( + parsed_commits.len() >= 2, + "Expected at least 2 actions (commitInfo + remove)" + ); + + // Extract the commitInfo timestamp to validate against deletionTimestamp + let commit_info_action = parsed_commits + .iter() + .find(|action| action.get("commitInfo").is_some()) + .expect("Missing commitInfo action"); + let commit_info = &commit_info_action["commitInfo"]; + let commit_timestamp = commit_info["timestamp"] + .as_i64() + .expect("Missing timestamp in commitInfo"); + + // Verify remove actions + let remove_actions: Vec<_> = parsed_commits + .iter() + .filter(|action| action.get("remove").is_some()) + .collect(); + + assert!( + !remove_actions.is_empty(), + "Expected at least one remove action" + ); + + assert_eq!(remove_actions.len(), 1); + let remove_action = remove_actions[0]; + let remove = &remove_action["remove"]; + + // path (required) + assert!(remove.get("path").is_some(), "Missing path field"); + let path = remove["path"].as_str().expect("path should be a string"); + assert_eq!( + path, + "part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet" + ); + + // dataChange (required) + assert_eq!(remove["dataChange"].as_bool(), Some(true)); + + // deletionTimestamp (optional) - should match commit timestamp + let deletion_timestamp = remove["deletionTimestamp"] + .as_i64() + .expect("Missing deletionTimestamp"); + assert_eq!( + deletion_timestamp, commit_timestamp, + "deletionTimestamp should match commit timestamp" + ); + + // extendedFileMetadata (optional) + assert_eq!(remove["extendedFileMetadata"].as_bool(), Some(true)); + + // partitionValues (optional) + let partition_vals = remove["partitionValues"] + .as_object() + .expect("Missing partitionValues"); + assert_eq!(partition_vals.len(), 0); + + // size (optional) + let size = remove["size"].as_i64().expect("Missing size"); + assert_eq!(size, 635); + + // stats (optional) + let stats = remove["stats"].as_str().expect("Missing stats"); + let stats_json: serde_json::Value = serde_json::from_str(stats)?; + assert_eq!(stats_json["numRecords"], 10); + + // tags (optional) + let tags = remove["tags"].as_object().expect("Missing tags"); + assert_eq!( + tags.get("INSERTION_TIME").and_then(|v| v.as_str()), + Some("1677811178336000") + ); + assert_eq!( + tags.get("MIN_INSERTION_TIME").and_then(|v| v.as_str()), + Some("1677811178336000") + ); + assert_eq!( + tags.get("MAX_INSERTION_TIME").and_then(|v| v.as_str()), + Some("1677811178336000") + ); + assert_eq!( + tags.get("OPTIMIZE_TARGET_SIZE").and_then(|v| v.as_str()), + Some("268435456") + ); + + // deletionVector (optional) + let dv = 
remove["deletionVector"] + .as_object() + .expect("Missing deletionVector"); + assert_eq!(dv.get("storageType").and_then(|v| v.as_str()), Some("u")); + assert_eq!( + dv.get("pathOrInlineDv").and_then(|v| v.as_str()), + Some("vBn[lx{q8@P<9BNH/isA") + ); + assert_eq!(dv.get("offset").and_then(|v| v.as_i64()), Some(1)); + assert_eq!(dv.get("sizeInBytes").and_then(|v| v.as_i64()), Some(36)); + assert_eq!(dv.get("cardinality").and_then(|v| v.as_i64()), Some(2)); + + // Row tracking fields should be absent as the feature is was not enabled on writing + // row_tracking tests cover having these populated. + assert!(remove.get("baseRowId").is_none()); + assert!(remove.get("defaultRowCommitVersion").is_none()); + } + _ => panic!("Transaction should be committed"), + } + + Ok(()) +} + +#[tokio::test] +async fn test_update_deletion_vectors_adds_expected_entries( +) -> Result<(), Box> { + // This test verifies that deletion vector updates write proper Remove and Add actions + // to the transaction log. + // + // NOTE: Additional unit tests for update_deletion_vectors exist in kernel/src/transaction/mod.rs + // + // The test validates: + // 1. Transaction setup for DV updates + // 2. Scanning and extracting scan files with DV data + // 3. Creating new DV descriptors for the files + // 4. Calling update_deletion_vectors to update the DVs + // 5. Committing and verifying the generated actions + // + // Expected commit log structure: + // - commitInfo: Contains metadata about the transaction + // - remove: Contains OLD deletion vector data and original file metadata + // - add: Contains NEW deletion vector data and updated file metadata + // + // The test ensures: + // - Remove action has the OLD DV descriptor with all 5 fields + // - Add action has the NEW DV descriptor with all 5 fields + // - All file metadata is preserved (size, stats, tags, partitionValues) + // - dataChange is properly set to true + // - deletionTimestamp matches commit timestamp + use std::path::PathBuf; + + let _ = tracing_subscriber::fmt::try_init(); + + let tmp_dir = tempdir()?; + let tmp_table_path = tmp_dir.path().join("table-with-dv-small"); + let source_path = std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/"))?; + copy_directory(&source_path, &tmp_table_path)?; + + let table_url = url::Url::from_directory_path(&tmp_table_path).unwrap(); + let engine = create_default_engine(&table_url)?; + + let snapshot = Snapshot::builder_for(table_url.clone()) + .at_version(1) + .build(engine.as_ref())?; + + // Create transaction with DV update mode enabled + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_engine_info("test engine") + .with_operation("UPDATE".to_string()) + .with_data_change(true); + + // Build scan and collect all scan metadata + let scan = snapshot.clone().scan_builder().build()?; + let all_scan_metadata: Vec<_> = scan + .scan_metadata(engine.as_ref())? 
+ .collect::, _>>()?; + + // Extract scan files for DV update + let scan_files: Vec<_> = all_scan_metadata + .into_iter() + .map(|sm| sm.scan_files) + .collect(); + + // Create new DV descriptors for the files + let file_path = "part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet"; + let mut dv_map = HashMap::new(); + + // Create a NEW deletion vector descriptor (different from the original) + let new_dv = DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::PersistedRelative, + path_or_inline_dv: "cd^-aqEH.-t@S}K{vb[*k^".to_string(), + offset: Some(10), + size_in_bytes: 40, + cardinality: 3, + }; + dv_map.insert(file_path.to_string(), new_dv); + + // Call update_deletion_vectors to exercise the API + txn.update_deletion_vectors(dv_map, scan_files.into_iter().map(Ok))?; + + // Commit the transaction + let result = txn.commit(engine.as_ref())?; + + match result { + CommitResult::CommittedTransaction(committed) => { + let commit_version = committed.commit_version(); + + // Read the original version 1 log to get original file metadata + let original_log_path = tmp_table_path.join("_delta_log/00000000000000000001.json"); + let original_log_content = std::fs::read_to_string(original_log_path)?; + let original_commits: Vec<_> = Deserializer::from_str(&original_log_content) + .into_iter::() + .try_collect()?; + + let file_path = "part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet"; + + // Extract original file metadata from version 1 + let original_add = original_commits + .iter() + .find(|action| { + action + .get("add") + .and_then(|add| add.get("path").and_then(|p| p.as_str())) + == Some(file_path) + }) + .expect("Missing original add action in version 1") + .get("add") + .expect("Should have add field"); + + let original_size = original_add["size"] + .as_i64() + .expect("Original add action should have size"); + let original_partition_values = original_add["partitionValues"] + .as_object() + .expect("Original add action should have partitionValues"); + let original_tags = original_add.get("tags"); + let original_stats = original_add.get("stats"); + + // Read the commit log directly + let commit_path = tmp_table_path.join(format!("_delta_log/{commit_version:020}.json")); + let commit_content = std::fs::read_to_string(commit_path)?; + + let parsed_commits: Vec<_> = Deserializer::from_str(&commit_content) + .into_iter::() + .try_collect()?; + + // Should have commitInfo, remove, and add actions + assert!( + parsed_commits.len() >= 3, + "Expected at least 3 actions (commitInfo + remove + add), got {}", + parsed_commits.len() + ); + + // Extract commitInfo timestamp + let commit_info_action = parsed_commits + .iter() + .find(|action| action.get("commitInfo").is_some()) + .expect("Missing commitInfo action"); + let commit_info = &commit_info_action["commitInfo"]; + let commit_timestamp = commit_info["timestamp"] + .as_i64() + .expect("Missing timestamp in commitInfo"); + + // Verify remove action contains OLD DV information + let remove_actions: Vec<_> = parsed_commits + .iter() + .filter(|action| action.get("remove").is_some()) + .collect(); + + assert_eq!( + remove_actions.len(), + 1, + "Expected exactly one remove action" + ); + + let remove_action = remove_actions[0]; + let remove = &remove_action["remove"]; + + assert_eq!( + remove["path"].as_str(), + Some(file_path), + "Remove path should match" + ); + assert_eq!(remove["dataChange"].as_bool(), Some(true)); + assert_eq!( + remove["deletionTimestamp"].as_i64(), + Some(commit_timestamp), + 
"deletionTimestamp should match commit timestamp" + ); + + // Verify OLD deletion vector in remove action + let old_dv = remove["deletionVector"] + .as_object() + .expect("Remove action should have deletionVector"); + assert_eq!( + old_dv.get("storageType").and_then(|v| v.as_str()), + Some("u"), + "Old DV storage type should be 'u'" + ); + assert_eq!( + old_dv.get("pathOrInlineDv").and_then(|v| v.as_str()), + Some("vBn[lx{q8@P<9BNH/isA"), + "Old DV path should match original" + ); + assert_eq!( + old_dv.get("offset").and_then(|v| v.as_i64()), + Some(1), + "Old DV offset should be 1" + ); + assert_eq!( + old_dv.get("sizeInBytes").and_then(|v| v.as_i64()), + Some(36), + "Old DV size should be 36" + ); + assert_eq!( + old_dv.get("cardinality").and_then(|v| v.as_i64()), + Some(2), + "Old DV cardinality should be 2" + ); + + // Verify file metadata is preserved in remove action + let remove_size = remove["size"] + .as_i64() + .expect("Remove action should have size"); + let remove_partition_values = remove["partitionValues"] + .as_object() + .expect("Remove action should have partitionValues"); + let remove_tags = remove.get("tags"); + let remove_stats = remove.get("stats"); + + // Verify add action contains NEW DV information + let add_actions: Vec<_> = parsed_commits + .iter() + .filter(|action| action.get("add").is_some()) + .collect(); + + assert_eq!(add_actions.len(), 1, "Expected exactly one add action"); + + let add_action = add_actions[0]; + let add = &add_action["add"]; + + assert_eq!( + add["path"].as_str(), + Some(file_path), + "Add path should match" + ); + assert_eq!(add["dataChange"].as_bool(), Some(true)); + + // Verify NEW deletion vector in add action + let new_dv = add["deletionVector"] + .as_object() + .expect("Add action should have deletionVector"); + assert_eq!( + new_dv.get("storageType").and_then(|v| v.as_str()), + Some("u"), + "New DV storage type should be 'u'" + ); + assert_eq!( + new_dv.get("pathOrInlineDv").and_then(|v| v.as_str()), + Some("cd^-aqEH.-t@S}K{vb[*k^"), + "New DV path should match updated value" + ); + assert_eq!( + new_dv.get("offset").and_then(|v| v.as_i64()), + Some(10), + "New DV offset should be 10" + ); + assert_eq!( + new_dv.get("sizeInBytes").and_then(|v| v.as_i64()), + Some(40), + "New DV size should be 40" + ); + assert_eq!( + new_dv.get("cardinality").and_then(|v| v.as_i64()), + Some(3), + "New DV cardinality should be 3" + ); + + // Verify file metadata is preserved in add action + let add_size = add["size"].as_i64().expect("Add action should have size"); + let add_partition_values = add["partitionValues"] + .as_object() + .expect("Add action should have partitionValues"); + let add_tags = add.get("tags"); + let add_stats = add.get("stats"); + + // Ensure metadata is consistent between remove and add actions + assert_eq!( + remove_size, add_size, + "File size should be preserved between remove and add" + ); + assert_eq!( + remove_partition_values, add_partition_values, + "Partition values should be preserved between remove and add" + ); + assert_eq!( + remove_tags, add_tags, + "Tags should be preserved between remove and add" + ); + assert_eq!( + remove_stats, add_stats, + "Stats should be preserved between remove and add" + ); + + // Ensure metadata matches the original file metadata from version 1 + assert_eq!( + remove_size, original_size, + "Remove action size should match original file size" + ); + assert_eq!( + add_size, original_size, + "Add action size should match original file size" + ); + assert_eq!( + remove_partition_values, 
original_partition_values, + "Remove action partition values should match original" + ); + assert_eq!( + add_partition_values, original_partition_values, + "Add action partition values should match original" + ); + assert_eq!( + remove_tags, original_tags, + "Remove action tags should match original" + ); + assert_eq!( + add_tags, original_tags, + "Add action tags should match original" + ); + assert_eq!( + remove_stats, original_stats, + "Remove action stats should match original" + ); + assert_eq!( + add_stats, original_stats, + "Add action stats should match original" + ); + } + _ => panic!("Transaction should be committed"), + } + + Ok(()) +} + +#[tokio::test] +async fn test_update_deletion_vectors_multiple_files() -> Result<(), Box> { + // This test verifies that update_deletion_vectors can update multiple files + // in a single call, creating proper Remove and Add actions for each file. + let _ = tracing_subscriber::fmt::try_init(); + + let schema = Arc::new(StructType::try_new(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("value", DataType::STRING), + ])?); + + // Setup: Create table with 3 files + let file_names = &["file0.parquet", "file1.parquet", "file2.parquet"]; + let (store, engine, table_url, file_paths) = + create_dv_table_with_files("test_table", schema, file_names).await?; + + // Create DV update transaction + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_engine_info("test engine") + .with_operation("UPDATE".to_string()) + .with_data_change(true); + + let mut scan_files = get_scan_files(snapshot.clone(), engine.as_ref())?; + + // Update deletion vectors for all 3 files in a single call + let mut dv_map = HashMap::new(); + for (idx, file_path) in file_paths.iter().enumerate() { + let descriptor = DeletionVectorDescriptor { + storage_type: DeletionVectorStorageType::PersistedRelative, + path_or_inline_dv: format!("dv_file_{idx}.bin"), + offset: Some(idx as i32 * 10), + size_in_bytes: 40 + idx as i32, + cardinality: idx as i64 + 1, + }; + dv_map.insert(file_path.to_string(), descriptor); + } + + txn.update_deletion_vectors(dv_map, scan_files.drain(..).map(Ok))?; + + // Commit the transaction + let result = txn.commit(engine.as_ref())?; + + match result { + CommitResult::CommittedTransaction(committed) => { + let commit_version = committed.commit_version(); + + // Read the commit log directly from object store + let final_commit_path = + table_url.join(&format!("_delta_log/{commit_version:020}.json"))?; + let commit_content = store + .get(&Path::from_url_path(final_commit_path.path())?) + .await? 
+ .bytes() + .await?; + + let parsed_commits: Vec<_> = Deserializer::from_slice(&commit_content) + .into_iter::() + .try_collect()?; + + // Extract all remove and add actions + let remove_actions: Vec<_> = parsed_commits + .iter() + .filter(|action| action.get("remove").is_some()) + .collect(); + + let add_actions: Vec<_> = parsed_commits + .iter() + .filter(|action| action.get("add").is_some()) + .collect(); + + // Should have 3 remove and 3 add actions + assert_eq!( + remove_actions.len(), + 3, + "Expected 3 remove actions for 3 files" + ); + assert_eq!(add_actions.len(), 3, "Expected 3 add actions for 3 files"); + + // Verify each file has a DV in both remove and add + for (idx, file_path) in file_paths.iter().enumerate() { + // Find the remove action for this file + let remove_action = remove_actions + .iter() + .find(|action| action["remove"]["path"].as_str() == Some(file_path.as_str())) + .unwrap_or_else(|| panic!("Should find remove action for {file_path}")); + + // Find the add action for this file + let add_action = add_actions + .iter() + .find(|action| action["add"]["path"].as_str() == Some(file_path.as_str())) + .unwrap_or_else(|| panic!("Should find add action for {file_path}")); + + // Verify remove action does NOT have a DV (since these were newly written files) + assert!( + remove_action["remove"]["deletionVector"].is_null(), + "Remove action for newly written file should not have a DV" + ); + + // Verify add action has the NEW DV + let add_dv = add_action["add"]["deletionVector"] + .as_object() + .expect("Add action should have deletionVector"); + + let expected_path = format!("dv_file_{idx}.bin"); + assert_eq!( + add_dv.get("pathOrInlineDv").and_then(|v| v.as_str()), + Some(expected_path.as_str()), + "DV path should match for file {file_path}" + ); + assert_eq!( + add_dv.get("offset").and_then(|v| v.as_i64()), + Some(idx as i64 * 10), + "DV offset should match for file {file_path}" + ); + assert_eq!( + add_dv.get("sizeInBytes").and_then(|v| v.as_i64()), + Some(40 + idx as i64), + "DV size should match for file {file_path}" + ); + assert_eq!( + add_dv.get("cardinality").and_then(|v| v.as_i64()), + Some(idx as i64 + 1), + "DV cardinality should match for file {file_path}" + ); + } + } + _ => panic!("Transaction should be committed"), + } + + Ok(()) +} + +#[tokio::test] +async fn test_remove_files_verify_files_excluded_from_scan( +) -> Result<(), Box> { + // Adds and then removes files and then verifies they don't appear in the scan. + + // setup tracing + let _ = tracing_subscriber::fmt::try_init(); + + // create a simple table: one int column named 'number' + let schema = get_simple_int_schema(); + + for (table_url, engine, _store, _table_name) in + setup_test_tables(schema.clone(), &[], None, "test_table").await? 
+ { + // First, add some files to the table + let engine = Arc::new(engine); + write_data_and_check_result_and_stats(table_url.clone(), schema.clone(), engine.clone(), 1) + .await?; + + // Get initial file count + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let scan = snapshot.clone().scan_builder().build()?; + let scan_metadata = scan.scan_metadata(engine.as_ref())?.next().unwrap()?; + let (_, selection_vector) = scan_metadata.scan_files.into_parts(); + let initial_file_count = selection_vector.iter().filter(|&x| *x).count(); + + assert!(initial_file_count > 0); + + // Now create a transaction to remove files + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?; + + // Create a new scan to get file metadata for removal + let scan2 = snapshot.scan_builder().build()?; + let scan_metadata2 = scan2.scan_metadata(engine.as_ref())?.next().unwrap()?; + + // Create FilteredEngineData for removal (select all rows for removal) + let file_remove_count = (scan_metadata2.scan_files.data().len() + - scan_metadata2.scan_files.selection_vector().len()) + + scan_metadata2 + .scan_files + .selection_vector() + .iter() + .filter(|&x| *x) + .count(); + assert!(file_remove_count > 0); + + // Add remove files to transaction + txn.remove_files(scan_metadata2.scan_files); + + // Commit the transaction + let result = txn.commit(engine.as_ref()); + + match result? { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 2); + + let new_snapshot = Snapshot::builder_for(table_url.clone()) + .at_version(2) + .build(engine.as_ref())?; + + let new_scan = new_snapshot.scan_builder().build()?; + let mut new_file_count = 0; + for new_metadata in new_scan.scan_metadata(engine.as_ref())? { + new_file_count += new_metadata?.scan_files.data().len(); + } + + // All files were removed, so new_file_count should be zero + assert_eq!(new_file_count, 0); + } + _ => panic!("Transaction did not succeed."), + } + } + Ok(()) +} + +#[tokio::test] +async fn test_remove_files_with_modified_selection_vector() -> Result<(), Box<dyn std::error::Error>> +{ + // This test verifies that we can selectively remove files by: + // 1. Calling remove_files multiple times with different subsets + // 2. Modifying the selection vector to choose which files to remove + + let _ = tracing_subscriber::fmt::try_init(); + + let schema = get_simple_int_schema(); + + for (table_url, engine, _store, _table_name) in + setup_test_tables(schema.clone(), &[], None, "test_table").await? + { + let engine = Arc::new(engine); + + // Write data multiple times to create multiple files + for i in 1..=5 { + write_data_and_check_result_and_stats( + table_url.clone(), + schema.clone(), + engine.clone(), + i, + ) + .await?; + } + + // Get initial file count + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let scan = snapshot.clone().scan_builder().build()?; + + let mut initial_file_count = 0; + for metadata in scan.scan_metadata(engine.as_ref())? { + let metadata = metadata?; + initial_file_count += metadata + .scan_files + .selection_vector() + .iter() + .filter(|&x| *x) + .count(); + } + + assert!( + initial_file_count >= 3, + "Need at least 3 files for this test, got {initial_file_count}" + ); + + // Create a transaction to remove files in two batches + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())?
+ .with_engine_info("selective remove test") + .with_operation("DELETE".to_string()) + .with_data_change(true); + + // First batch: Remove only the first file + let scan2 = snapshot.clone().scan_builder().build()?; + let scan_metadata2 = scan2.scan_metadata(engine.as_ref())?.next().unwrap()?; + let (data, mut selection_vector) = scan_metadata2.scan_files.into_parts(); + + // Select only the first file for removal + let mut first_batch_removed = 0; + for selected in selection_vector.iter_mut() { + if *selected && first_batch_removed < 1 { + // Keep selected for removal + first_batch_removed += 1; + } else { + // Don't remove + *selected = false; + } + } + + assert_eq!( + first_batch_removed, 1, + "Should remove exactly 1 file in first batch" + ); + txn.remove_files(FilteredEngineData::try_new(data, selection_vector)?); + + // Second batch: Remove only the last file + let scan3 = snapshot.clone().scan_builder().build()?; + let scan_metadata3 = scan3.scan_metadata(engine.as_ref())?.next().unwrap()?; + let (data2, mut selection_vector2) = scan_metadata3.scan_files.into_parts(); + + // Find the last selected file and keep only that one selected + let mut last_selected_idx = None; + for (i, &selected) in selection_vector2.iter().enumerate() { + if selected { + last_selected_idx = Some(i); + } + } + + // Deselect all except the last one + for (i, selected) in selection_vector2.iter_mut().enumerate() { + if Some(i) != last_selected_idx { + *selected = false; + } + } + + let second_batch_removed = selection_vector2.iter().filter(|&x| *x).count(); + assert_eq!( + second_batch_removed, 1, + "Should remove exactly 1 file in second batch" + ); + txn.remove_files(FilteredEngineData::try_new(data2, selection_vector2)?); + + // Commit the transaction + let result = txn.commit(engine.as_ref())?; + + match result { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 6); + + // Verify that exactly 2 files were removed (1 from each batch) + let new_snapshot = Snapshot::builder_for(table_url.clone()) + .at_version(6) + .build(engine.as_ref())?; + + let new_scan = new_snapshot.scan_builder().build()?; + let mut new_file_count = 0; + for new_metadata in new_scan.scan_metadata(engine.as_ref())? 
{ + let metadata = new_metadata?; + new_file_count += metadata + .scan_files + .selection_vector() + .iter() + .filter(|&x| *x) + .count(); + } + + // Verify we removed exactly 2 files (1 + 1) + let total_removed = first_batch_removed + second_batch_removed; + assert_eq!(total_removed, 2); + assert_eq!(new_file_count, initial_file_count - total_removed); + assert!(new_file_count > 0, "At least one file should remain"); + } + _ => panic!("Transaction did not succeed"), + } + } + Ok(()) +} + +// Helper function to create a table with CDF enabled +async fn create_cdf_table( + table_name: &str, + schema: SchemaRef, +) -> Result<(Url, Arc>, TempDir), Box> +{ + let tmp_dir = tempdir()?; + let tmp_test_dir_url = Url::from_directory_path(tmp_dir.path()).unwrap(); + + let (store, engine, table_location) = engine_store_setup(table_name, Some(&tmp_test_dir_url)); + + let table_url = create_table( + store.clone(), + table_location, + schema.clone(), + &[], + true, // use protocol 3.7 + vec![], + vec!["changeDataFeed"], + ) + .await?; + + Ok((table_url, Arc::new(engine), tmp_dir)) +} + +// Helper function to write data to a table +async fn write_data_to_table( + table_url: &Url, + engine: &Arc>, + schema: SchemaRef, + values: Vec, +) -> Result> { + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_engine_info("test"); + + add_files_to_transaction(&mut txn, engine, schema, values).await?; + + let result = txn.commit(engine.as_ref())?; + match result { + CommitResult::CommittedTransaction(committed) => Ok(committed.commit_version()), + _ => panic!("Transaction should be committed"), + } +} + +// Helper function to add files to an existing transaction +async fn add_files_to_transaction( + txn: &mut delta_kernel::transaction::Transaction, + engine: &Arc>, + schema: SchemaRef, + values: Vec, +) -> Result<(), Box> { + let data = RecordBatch::try_new( + Arc::new(schema.as_ref().try_into_arrow()?), + vec![Arc::new(Int32Array::from(values))], + )?; + + let write_context = Arc::new(txn.get_write_context()); + let add_files_metadata = engine + .write_parquet( + &ArrowEngineData::new(data), + write_context.as_ref(), + HashMap::new(), + ) + .await?; + txn.add_files(add_files_metadata); + Ok(()) +} + +#[tokio::test] +async fn test_cdf_write_all_adds_succeeds() -> Result<(), Box> { + // This test verifies that add-only transactions work with CDF enabled + let _ = tracing_subscriber::fmt::try_init(); + + let schema = get_simple_int_schema(); + + let (table_url, engine, _tmp_dir) = + create_cdf_table("test_cdf_all_adds", schema.clone()).await?; + + // Add files - this should succeed + let version = write_data_to_table(&table_url, &engine, schema, vec![1, 2, 3]).await?; + assert_eq!(version, 1); + + Ok(()) +} + +#[tokio::test] +async fn test_cdf_write_all_removes_succeeds() -> Result<(), Box> { + // This test verifies that remove-only transactions work with CDF enabled + let _ = tracing_subscriber::fmt::try_init(); + + let schema = get_simple_int_schema(); + + let (table_url, engine, _tmp_dir) = + create_cdf_table("test_cdf_all_removes", schema.clone()).await?; + + // First, add some data + write_data_to_table(&table_url, &engine, schema, vec![1, 2, 3]).await?; + + // Now remove the files + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? 
+ .with_engine_info("cdf remove test") + .with_data_change(true); + + let scan = snapshot.scan_builder().build()?; + let scan_metadata = scan.scan_metadata(engine.as_ref())?.next().unwrap()?; + let (data, selection_vector) = scan_metadata.scan_files.into_parts(); + txn.remove_files(FilteredEngineData::try_new(data, selection_vector)?); + + // This should succeed - remove-only transactions are allowed with CDF + let result = txn.commit(engine.as_ref())?; + match result { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 2); + } + _ => panic!("Transaction should be committed"), + } + + Ok(()) +} + +#[tokio::test] +async fn test_cdf_write_mixed_no_data_change_succeeds() -> Result<(), Box> { + // This test verifies that mixed add+remove transactions work when dataChange=false. + // It's allowed because the transaction does not contain any logical data changes. + // This can happen when a table is being optimized/compacted. + let _ = tracing_subscriber::fmt::try_init(); + + let schema = get_simple_int_schema(); + + let (table_url, engine, _tmp_dir) = + create_cdf_table("test_cdf_mixed_no_data_change", schema.clone()).await?; + + // First, add some data + write_data_to_table(&table_url, &engine, schema.clone(), vec![1, 2, 3]).await?; + + // Now create a transaction with both add AND remove files, but dataChange=false + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_engine_info("cdf mixed test") + .with_data_change(false); // dataChange=false is key here + + // Add new files + add_files_to_transaction(&mut txn, &engine, schema, vec![4, 5, 6]).await?; + + // Also remove existing files + let scan = snapshot.scan_builder().build()?; + let scan_metadata = scan.scan_metadata(engine.as_ref())?.next().unwrap()?; + let (data, selection_vector) = scan_metadata.scan_files.into_parts(); + txn.remove_files(FilteredEngineData::try_new(data, selection_vector)?); + + // This should succeed - mixed operations are allowed when dataChange=false + let result = txn.commit(engine.as_ref())?; + match result { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 2); + } + _ => panic!("Transaction should be committed"), + } + + Ok(()) +} + +#[tokio::test] +async fn test_cdf_write_mixed_with_data_change_fails() -> Result<(), Box> { + // This test verifies that mixed add+remove transactions fail with helpful error when dataChange=true + let _ = tracing_subscriber::fmt::try_init(); + + let schema = get_simple_int_schema(); + + let (table_url, engine, _tmp_dir) = + create_cdf_table("test_cdf_mixed_with_data_change", schema.clone()).await?; + + // First, add some data + write_data_to_table(&table_url, &engine, schema.clone(), vec![1, 2, 3]).await?; + + // Now create a transaction with both add AND remove files with dataChange=true + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? 
+ .with_engine_info("cdf mixed fail test") + .with_data_change(true); // dataChange=true - this should fail + + // Add new files + add_files_to_transaction(&mut txn, &engine, schema, vec![4, 5, 6]).await?; + + // Also remove existing files + let scan = snapshot.scan_builder().build()?; + let scan_metadata = scan.scan_metadata(engine.as_ref())?.next().unwrap()?; + let (data, selection_vector) = scan_metadata.scan_files.into_parts(); + txn.remove_files(FilteredEngineData::try_new(data, selection_vector)?); + + // This should fail with our new error message + assert_result_error_with_message( + txn.commit(engine.as_ref()), + "Cannot add and remove data in the same transaction when Change Data Feed is enabled (delta.enableChangeDataFeed = true). \ + This would require writing CDC files for DML operations, which is not yet supported. \ + Consider using separate transactions: one to add files, another to remove files." + ); + + Ok(()) +} + +#[tokio::test] +async fn test_post_commit_snapshot_create_then_insert() -> DeltaResult<()> { + let _ = tracing_subscriber::fmt::try_init(); + + let temp_dir = tempdir().unwrap(); + let table_url = Url::from_directory_path(temp_dir.path()).unwrap(); + let engine = create_default_engine(&table_url)?; + let schema = get_simple_int_schema(); + + // Create table and verify post_commit_snapshot + let create_result = create_table_txn(table_url.as_str(), schema, env!("CARGO_PKG_VERSION")) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let mut current_snapshot = match create_result { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 0); + let post_snapshot = committed + .post_commit_snapshot() + .expect("should have post_commit_snapshot"); + assert_eq!(post_snapshot.version(), 0); + post_snapshot.clone() + } + _ => panic!("Create should succeed"), + }; + + // Do 10 inserts and verify post_commit_snapshot for each + for i in 1..11 { + let base_version = current_snapshot.version(); + + let txn = current_snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? + .with_engine_info("test"); + + match txn.commit(engine.as_ref())? { + CommitResult::CommittedTransaction(committed) => { + let post_snapshot = committed + .post_commit_snapshot() + .expect("should have post_commit_snapshot"); + + assert_eq!(post_snapshot.version(), base_version + 1); + assert_eq!(post_snapshot.version(), committed.commit_version()); + assert_eq!(post_snapshot.schema(), current_snapshot.schema()); + assert_eq!(post_snapshot.table_root(), current_snapshot.table_root()); + + current_snapshot = post_snapshot.clone(); + } + _ => panic!("Commit {i} should succeed"), + } + } + + Ok(()) +} + +#[tokio::test] +async fn test_write_parquet_succeed_with_logical_partition_names( +) -> Result<(), Box> { + let schema = Arc::new(StructType::try_new(vec![ + StructField::nullable("id", DataType::INTEGER), + StructField::nullable("letter", DataType::STRING), + ])?); + + for (table_url, engine, _store, _table_name) in setup_test_tables( + schema.clone(), + &["letter"], + None, + "test_partition_translate", + ) + .await? 
+ { + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + + // Create data with only the non-partition column + let data_schema = Arc::new( + StructType::try_new(vec![StructField::nullable("id", DataType::INTEGER)]).unwrap(), + ); + let batch = RecordBatch::try_new( + Arc::new(data_schema.as_ref().try_into_arrow()?), + vec![Arc::new(Int32Array::from(vec![1, 2]))], + )?; + + // Pass partition values with logical name — should succeed + let result = write_batch_to_table( + &snapshot, + &engine, + batch, + HashMap::from([("letter".to_string(), "a".to_string())]), + ) + .await; + assert!( + result.is_ok(), + "write_parquet should succeed with valid logical partition name" + ); + } + Ok(()) +} + +#[tokio::test] +async fn test_write_parquet_rejects_unknown_partition_column( +) -> Result<(), Box> { + let schema = get_simple_int_schema(); + + for (table_url, engine, _store, _table_name) in + setup_test_tables(schema.clone(), &[], None, "test_partition_reject").await? + { + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + + let batch = RecordBatch::try_new( + Arc::new(schema.as_ref().try_into_arrow()?), + vec![Arc::new(Int32Array::from(vec![1, 2]))], + )?; + + let result = write_batch_to_table( + &snapshot, + &engine, + batch, + HashMap::from([("nonexistent".to_string(), "val".to_string())]), + ) + .await; + let err = result.expect_err("write_parquet should fail with unknown partition column"); + let err_msg = err.to_string(); + assert!( + err_msg.contains("Partition column 'nonexistent' not found in table schema"), + "Error should mention the unknown column name, got: {err_msg}" + ); + } + Ok(()) +} + +/// 1. Creates a table with the given column mapping mode +/// 2. Writes two batches of data +/// 3. Checkpoints and verifies add.stats uses physical column names in the checkpoint +/// 4. Reads a parquet footer to verify physical names/IDs +/// 5. Reads data back to verify correctness +/// 6. Removes files and verifies remove.stats matches the original add.stats +#[rstest::rstest] +#[case::cm_none(ColumnMappingMode::None)] +#[case::cm_id(ColumnMappingMode::Id)] +#[case::cm_name(ColumnMappingMode::Name)] +#[tokio::test(flavor = "multi_thread")] +async fn test_column_mapping_write( + #[case] cm_mode: ColumnMappingMode, +) -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + let schema = nested_schema()?; + + let (_tmp_dir, table_path, _) = test_table_setup()?; + let table_url = Url::from_directory_path(&table_path).unwrap(); + let store: Arc = Arc::new(LocalFileSystem::new()); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + ))) + .build(), + ); + + // Step 1: Create table + let mode_str = match cm_mode { + ColumnMappingMode::None => "none", + ColumnMappingMode::Id => "id", + ColumnMappingMode::Name => "name", + }; + let mut latest_snapshot = create_table_and_load_snapshot( + &table_path, + schema.clone(), + engine.as_ref(), + &[("delta.columnMapping.mode", mode_str)], + )?; + + // Get physical field paths for stats verification (top-level and nested) + let cm = latest_snapshot + .table_properties() + .column_mapping_mode + .unwrap_or(ColumnMappingMode::None); + let row_number_physical = get_any_level_column_physical_name( + latest_snapshot.schema().as_ref(), + &ColumnName::new(["row_number"]), + cm, + )? 
+ .into_inner(); + let street_physical = get_any_level_column_physical_name( + latest_snapshot.schema().as_ref(), + &ColumnName::new(["address", "street"]), + cm, + )? + .into_inner(); + + // Step 2: Write two batches + for data in nested_batches()? { + latest_snapshot = + write_batch_to_table(&latest_snapshot, engine.as_ref(), data, HashMap::new()).await?; + } + + // Enable writeStatsAsStruct so the checkpoint contains native stats_parsed. + // CREATE TABLE doesn't allow this property yet, so we write a metadata-update commit directly. + latest_snapshot = set_table_properties( + &table_path, + &table_url, + engine.as_ref(), + latest_snapshot.version(), + &[("delta.checkpoint.writeStatsAsStruct", "true")], + )?; + + // Step 3: Checkpoint and verify add.stats uses correct column names + let snapshot_for_checkpoint = latest_snapshot.clone(); + snapshot_for_checkpoint.checkpoint(engine.as_ref())?; + let ckpt_snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let add_actions = read_add_infos(&ckpt_snapshot, engine.as_ref())?; + let mut all_stats: Vec<_> = add_actions + .iter() + .filter_map(|a| a.stats.as_ref()) + .filter(|s| s.get("minValues").is_some()) + .collect(); + assert_eq!(all_stats.len(), 2, "should have stats for 2 files"); + all_stats.sort_by_key(|s| s["minValues"][&row_number_physical[0]].as_i64().unwrap()); + + // Batch 1: row_number 1..3, address.street "st1".."st3" + assert_min_max_stats(all_stats[0], &row_number_physical, 1, 3); + assert_min_max_stats(all_stats[0], &street_physical, "st1", "st3"); + + // Batch 2: row_number 4..6, address.street "st4".."st6" + assert_min_max_stats(all_stats[1], &row_number_physical, 4, 6); + assert_min_max_stats(all_stats[1], &street_physical, "st4", "st6"); + + // Step 3b: Verify stats_parsed in scan metadata uses correct physical column names + { + let scan = ckpt_snapshot + .scan_builder() + .include_all_stats_columns() + .build()?; + let scan_metadata_results: Vec<_> = scan + .scan_metadata(engine.as_ref())? 
+ .collect::, _>>()?; + + let mut stats_rows: Vec<(i64, i64, String, String)> = Vec::new(); + for sm in scan_metadata_results { + let (data, sel) = sm.scan_files.into_parts(); + let batch: RecordBatch = ArrowEngineData::try_from_engine_data(data)?.into(); + + let batch_struct = StructArray::from(batch.clone()); + let stats_parsed: &StructArray = + resolve_struct_field(&batch_struct, &["stats_parsed".into()]); + + let min_path = |field: &[String]| -> Vec { + [&["stats_parsed".into(), "minValues".into()], field].concat() + }; + let max_path = |field: &[String]| -> Vec { + [&["stats_parsed".into(), "maxValues".into()], field].concat() + }; + let min_row_num: &Int64Array = + resolve_struct_field(&batch_struct, &min_path(&row_number_physical)); + let max_row_num: &Int64Array = + resolve_struct_field(&batch_struct, &max_path(&row_number_physical)); + let min_st: &StringArray = + resolve_struct_field(&batch_struct, &min_path(&street_physical)); + let max_st: &StringArray = + resolve_struct_field(&batch_struct, &max_path(&street_physical)); + + for (i, &selected) in sel.iter().enumerate().take(batch.num_rows()) { + if selected && !stats_parsed.is_null(i) { + stats_rows.push(( + min_row_num.value(i), + max_row_num.value(i), + min_st.value(i).to_string(), + max_st.value(i).to_string(), + )); + } + } + } + + stats_rows.sort_by_key(|r| r.0); + assert_eq!(stats_rows.len(), 2, "should have stats_parsed for 2 files"); + assert_eq!(stats_rows[0], (1, 3, "st1".to_string(), "st3".to_string())); + assert_eq!(stats_rows[1], (4, 6, "st4".to_string(), "st6".to_string())); + } + + // Step 4: Read parquet footer to verify physical names and native field_id + { + let parquet_path = &add_actions + .first() + .expect("should have at least one add file") + .path; + let parquet_url = table_url.join(parquet_path)?; + let local_path = parquet_url.to_file_path().unwrap(); + + let obj_meta = store + .head(&Path::from_url_path(parquet_url.path())?) + .await?; + let file_meta = FileMeta::new( + parquet_url, + 0, /* last_modified */ + obj_meta.size as u64, + ); + let footer = engine.parquet_handler().read_parquet_footer(&file_meta)?; + let footer_schema = footer.schema; + + let logical_schema = latest_snapshot.schema(); + for logical_path in [&["row_number"][..], &["address", "street"]] { + let col = ColumnName::new(logical_path.iter().copied()); + let physical = + get_any_level_column_physical_name(logical_schema.as_ref(), &col, cm)?.into_inner(); + assert_schema_has_field(&footer_schema, &physical); + + let field_id = get_parquet_field_id(&local_path, &physical); + let logical_field = resolve_field(logical_schema.as_ref(), logical_path).unwrap(); + match cm_mode { + ColumnMappingMode::Id | ColumnMappingMode::Name => { + let expected_id = + match logical_field.get_config_value(&ColumnMetadataKey::ColumnMappingId) { + Some(MetadataValue::Number(n)) => *n as i32, + other => panic!("expected ColumnMappingId number, got {other:?}"), + }; + assert_eq!( + field_id, + Some(expected_id), + "parquet field_id mismatch for {logical_path:?}" + ); + } + ColumnMappingMode::None => { + assert_eq!( + field_id, None, + "parquet field_id should not be set in None column mapping mode" + ); + } + } + } + } + + // Step 5: Read data back to verify correctness + { + let post_ckpt_snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let scan = post_ckpt_snapshot.scan_builder().build()?; + let batches: Vec = scan + .execute(engine.clone())? 
+ .map(|r| { + let data = r.unwrap(); + let arrow = ArrowEngineData::try_from_engine_data(data).unwrap(); + arrow.record_batch().clone() + }) + .collect(); + + let result_schema = batches[0].schema(); + let combined = delta_kernel::arrow::compute::concat_batches(&result_schema, &batches)?; + assert_eq!( + combined.num_rows(), + 6, + "Should have 6 rows from two written batches" + ); + + // Verify logical column names and data values + let combined_struct = StructArray::from(combined); + + // Top-level: row_number should contain [1..=6] + let row_numbers: &Int64Array = + resolve_struct_field(&combined_struct, &["row_number".into()]); + let mut vals: Vec = (0..row_numbers.len()) + .map(|i| row_numbers.value(i)) + .collect(); + vals.sort(); + assert_eq!(vals, vec![1, 2, 3, 4, 5, 6]); + + // Nested: address.street should contain ["st1"..="st6"] + let streets: &StringArray = + resolve_struct_field(&combined_struct, &["address".into(), "street".into()]); + let mut street_vals: Vec<&str> = (0..streets.len()).map(|i| streets.value(i)).collect(); + street_vals.sort(); + assert_eq!(street_vals, vec!["st1", "st2", "st3", "st4", "st5", "st6"]); + } + + // Step 6: Remove files and verify remove.stats matches original add.stats + { + let original_add_stats: Vec = + add_actions.iter().filter_map(|a| a.stats.clone()).collect(); + assert!( + !original_add_stats.is_empty(), + "should have at least one add with stats" + ); + + let remove_actions = + remove_all_and_get_remove_actions(&latest_snapshot, &table_url, engine.as_ref())?; + assert!( + !remove_actions.is_empty(), + "Expected at least one remove action" + ); + + let remove_stats: Vec = remove_actions + .iter() + .filter_map(|r| { + r["stats"] + .as_str() + .map(|s| serde_json::from_str(s).unwrap()) + }) + .collect(); + assert_eq!( + remove_stats, original_add_stats, + "remove.stats should match original add.stats" + ); + } + + Ok(()) +} + +/// Verifies that partitioned writes use physical column names in add.partitionValues. +#[rstest::rstest] +#[case::cm_none("./tests/data/partition_cm/none")] +#[case::cm_id("./tests/data/partition_cm/id")] +#[case::cm_name("./tests/data/partition_cm/name")] +#[tokio::test(flavor = "multi_thread")] +async fn test_column_mapping_partitioned_write( + #[case] table_dir: &str, +) -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + // Copy test data to a temp dir so we can write to it + let tmp_dir = tempdir()?; + copy_directory(std::path::Path::new(table_dir), tmp_dir.path())?; + let table_url = Url::from_directory_path(tmp_dir.path()).unwrap(); + let store: Arc = Arc::new(LocalFileSystem::new()); + let engine = Arc::new( + DefaultEngineBuilder::new(store.clone()) + .with_task_executor(Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + ))) + .build(), + ); + + let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let cm = snapshot + .table_properties() + .column_mapping_mode + .unwrap_or(ColumnMappingMode::None); + let physical_name = get_any_level_column_physical_name( + snapshot.schema().as_ref(), + &ColumnName::new(["category"]), + cm, + )? 
+ .into_inner() + .remove(0); + + // Verify physical name for column mapping mode + if table_dir.ends_with("none") { + assert_eq!(physical_name, "category"); + } else { + assert_ne!( + physical_name, "category", + "physical name should differ from logical name under column mapping" + ); + } + + // Write data with partition value + let data_schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "value", + DataType::INTEGER, + )])?); + let batch = RecordBatch::try_new( + Arc::new(data_schema.as_ref().try_into_arrow()?), + vec![Arc::new(Int32Array::from(vec![1, 2]))], + )?; + let partition_values = HashMap::from([("category".to_string(), "A".to_string())]); + write_batch_to_table(&snapshot, engine.as_ref(), batch, partition_values).await?; + + // Read commit log and verify add.partitionValues key uses physical name + let add_actions = read_actions_from_commit(&table_url, 1, "add")?; + assert!(!add_actions.is_empty(), "no add action found in commit log"); + for add in &add_actions { + assert_partition_values(add, &physical_name, "A"); + } + + // Remove the written file and verify remove action preserves physical names + let post_write_snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; + let remove_actions = + remove_all_and_get_remove_actions(&post_write_snapshot, &table_url, engine.as_ref())?; + assert!( + !remove_actions.is_empty(), + "no remove action found in commit log" + ); + for remove in &remove_actions { + assert_partition_values(remove, &physical_name, "A"); + } + + Ok(()) +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn test_checkpoint_non_kernel_written_table() { + // Table written by a non-kernel-integrated connector: 7 rows with columns (i, j, k), where + // parquet field nullabilities differ from the Delta schema. DefaultEngine reads it, coerces + // nullabilities to match the Delta schema, creates a checkpoint, and verifies the data is + // unchanged. 
+ let source_path = std::path::Path::new("./tests/data/external-table-different-nullability"); + let temp_dir = tempfile::tempdir().unwrap(); + let table_path = temp_dir.path().join("test-checkpoint-table"); + test_utils::copy_directory(source_path, &table_path).unwrap(); + + let url = Url::from_directory_path(&table_path).unwrap(); + let store: Arc = Arc::new(LocalFileSystem::new()); + let executor = Arc::new( + delta_kernel::engine::default::executor::tokio::TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + ), + ); + let engine: Arc> = Arc::new( + delta_kernel::engine::default::DefaultEngineBuilder::new(store) + .with_task_executor(executor) + .build(), + ); + + // Read data before checkpoint + let snapshot = Snapshot::builder_for(url.clone()) + .build(engine.as_ref()) + .unwrap(); + let scan_before = Arc::clone(&snapshot).scan_builder().build().unwrap(); + let batches_before = test_utils::read_scan(&scan_before, engine.clone()).unwrap(); + + // Create checkpoint via snapshot.checkpoint() + snapshot.checkpoint(engine.as_ref()).unwrap(); + + // Read data after checkpoint + let snapshot_after = Snapshot::builder_for(url.clone()) + .build(engine.as_ref()) + .unwrap(); + let scan_after = snapshot_after.scan_builder().build().unwrap(); + let batches_after = test_utils::read_scan(&scan_after, engine.clone()).unwrap(); + + // Verify data unchanged + let formatted_before = + delta_kernel::arrow::util::pretty::pretty_format_batches(&batches_before) + .unwrap() + .to_string(); + let formatted_after = delta_kernel::arrow::util::pretty::pretty_format_batches(&batches_after) + .unwrap() + .to_string(); + assert_eq!( + formatted_before, formatted_after, + "Row data changed after checkpoint creation!" + ); + + // Verify checkpoint file exists + let delta_log_path = table_path.join("_delta_log"); + let has_checkpoint = std::fs::read_dir(&delta_log_path) + .unwrap() + .filter_map(|e| e.ok()) + .any(|e| { + e.file_name() + .to_str() + .is_some_and(|n| n.contains(".checkpoint.parquet")) + }); + assert!(has_checkpoint, "Expected at least one checkpoint file"); +} + +struct ClusteredTableSetup { + _tmp_dir: TempDir, + table_path: String, + table_url: Url, + engine: Arc>, + snapshot: Arc, +} + +/// Creates a clustered table with column mapping and sets table properties. +fn setup_clustered_table( + cm_mode: &str, + schema: Arc, + clustering_cols: Vec, + table_properties: &[(&str, &str)], +) -> Result> { + use delta_kernel::transaction::data_layout::DataLayout; + + let (_tmp_dir, table_path, _) = test_table_setup()?; + let table_url = Url::from_directory_path(&table_path).unwrap(); + let engine = create_default_engine_mt_executor(&table_url)?; + + let _ = create_table_txn(table_url.as_str(), schema, "Test/1.0") + .with_table_properties([("delta.columnMapping.mode", cm_mode)]) + .with_data_layout(DataLayout::Clustered { + columns: clustering_cols, + }) + .build(engine.as_ref(), Box::new(FileSystemCommitter::new()))? + .commit(engine.as_ref())?; + + let snapshot = set_table_properties( + &table_path, + &table_url, + engine.as_ref(), + 0, + table_properties, + )?; + + Ok(ClusteredTableSetup { + _tmp_dir, + table_path, + table_url, + engine, + snapshot, + }) +} + +/// E2E test: create a clustered table with column mapping, write data, and verify that +/// add.stats in the commit log contains min/max statistics for the clustering columns +/// (including a nested column). 
+#[rstest::rstest] +#[case::cm_none("none")] +#[case::cm_name("name")] +#[case::cm_id("id")] +#[tokio::test(flavor = "multi_thread")] +async fn test_clustered_table_write_has_stats( + #[case] cm_mode: &str, +) -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + let clustering_cols = vec![ + ColumnName::new(["row_number"]), + ColumnName::new(["address", "street"]), + ]; + let setup = setup_clustered_table( + cm_mode, + nested_schema()?, + clustering_cols.clone(), + &[("delta.dataSkippingNumIndexedCols", "0")], + )?; + let engine = &setup.engine; + let mut snapshot = setup.snapshot; + for batch in nested_batches()? { + snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new()).await?; + } + + let cm = assert_column_mapping_mode(&snapshot, cm_mode); + let physical_paths: Vec> = clustering_cols + .iter() + .map(|c| { + get_any_level_column_physical_name(snapshot.schema().as_ref(), c, cm) + .unwrap() + .into_inner() + }) + .collect(); + if cm != ColumnMappingMode::None { + let logical_paths: Vec> = vec![vec!["row_number"], vec!["address", "street"]]; + for (phys, logical) in physical_paths.iter().zip(&logical_paths) { + assert_ne!( + phys.iter().map(String::as_str).collect_vec(), + *logical, + "physical path should differ from logical when cm={cm:?}" + ); + } + } + + // Resolve a non-clustering column to verify it's excluded from stats + let non_clustering_physical = get_any_level_column_physical_name( + snapshot.schema().as_ref(), + &ColumnName::new(["name"]), + cm, + )? + .into_inner(); + + // Verify stats for each write commit (v2 and v3, since v1 is the property update). + // Batch 1 (v2): row_number 1..3, address.street "st1".."st3" + // Batch 2 (v3): row_number 4..6, address.street "st4".."st6" + let expected: [(i64, i64, &str, &str); 2] = [(1, 3, "st1", "st3"), (4, 6, "st4", "st6")]; + for (version, (min_rn, max_rn, min_st, max_st)) in expected.iter().enumerate() { + let version = (version + 2) as u64; + let add_actions = read_actions_from_commit(&setup.table_url, version, "add")?; + assert!( + !add_actions.is_empty(), + "v{version}: should have add actions" + ); + + for add in &add_actions { + let stats: serde_json::Value = serde_json::from_str( + add.get("stats") + .and_then(|s| s.as_str()) + .expect("add action should have stats"), + )?; + // Clustering columns should have stats despite numIndexedCols=0 + assert_min_max_stats(&stats, &physical_paths[0], *min_rn, *max_rn); + assert_min_max_stats(&stats, &physical_paths[1], *min_st, *max_st); + + // Non-clustering column "name" should NOT have stats + let non_cluster_min = resolve_json_path(&stats["minValues"], &non_clustering_physical); + assert!( + non_cluster_min.is_null(), + "v{version}: non-clustering column 'name' should not have stats" + ); + } + } + + Ok(()) +} + +/// E2E test: create a clustered table with column mapping, enable writeStatsAsStruct, +/// write data, checkpoint, and verify stats_parsed. 
+#[rstest::rstest] +#[case::cm_none("none")] +#[case::cm_name("name")] +#[case::cm_id("id")] +#[tokio::test(flavor = "multi_thread")] +async fn test_clustered_table_write_has_stats_parsed( + #[case] cm_mode: &str, +) -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + let clustering_cols = vec![ + ColumnName::new(["row_number"]), + ColumnName::new(["address", "street"]), + ]; + let setup = setup_clustered_table( + cm_mode, + nested_schema()?, + clustering_cols.clone(), + &[ + ("delta.checkpoint.writeStatsAsStruct", "true"), + ("delta.dataSkippingNumIndexedCols", "0"), + ], + )?; + let engine = &setup.engine; + let mut snapshot = setup.snapshot; + for batch in nested_batches()? { + snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new()).await?; + } + + let cm = assert_column_mapping_mode(&snapshot, cm_mode); + let physical_paths: Vec> = clustering_cols + .iter() + .map(|c| { + get_any_level_column_physical_name(snapshot.schema().as_ref(), c, cm) + .unwrap() + .into_inner() + }) + .collect(); + if cm != ColumnMappingMode::None { + let logical_paths: Vec> = vec![vec!["row_number"], vec!["address", "street"]]; + for (phys, logical) in physical_paths.iter().zip(&logical_paths) { + assert_ne!( + phys.iter().map(String::as_str).collect_vec(), + *logical, + "physical path should differ from logical when cm={cm:?}" + ); + } + } + let non_clustering_physical = get_any_level_column_physical_name( + snapshot.schema().as_ref(), + &ColumnName::new(["name"]), + cm, + )? + .into_inner(); + + snapshot.checkpoint(engine.as_ref())?; + + // Read checkpoint parquet directly to verify stats_parsed contains only clustering columns. + // ScanBuilder::include_all_stats_columns() doesn't support stats_parsed when + // dataSkippingNumIndexedCols=0. Read directly from the checkpoint parquet file instead. + use delta_kernel::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + let delta_log = std::path::Path::new(&setup.table_path).join("_delta_log"); + let ckpt_path = std::fs::read_dir(&delta_log)? 
+ .filter_map(|e| e.ok()) + .find(|e| { + e.file_name() + .to_str() + .is_some_and(|n| n.contains(".checkpoint.parquet")) + }) + .expect("checkpoint parquet should exist") + .path(); + let file = std::fs::File::open(&ckpt_path)?; + let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.build()?; + + let min_path = |field: &[String]| -> Vec<String> { [&["minValues".into()], field].concat() }; + let max_path = |field: &[String]| -> Vec<String> { [&["maxValues".into()], field].concat() }; + + let mut stats_rows: Vec<(i64, i64, String, String)> = Vec::new(); + for batch in reader { + let batch = batch?; + let batch_struct = StructArray::from(batch); + let add: &StructArray = resolve_struct_field(&batch_struct, &["add".into()]); + let stats_parsed: &StructArray = resolve_struct_field(add, &["stats_parsed".into()]); + + // Non-clustering column should not appear in stats_parsed + let min_values: &StructArray = resolve_struct_field(stats_parsed, &["minValues".into()]); + assert!( + min_values + .column_by_name(&non_clustering_physical[0]) + .is_none(), + "non-clustering column '{}' should not have stats_parsed", + non_clustering_physical[0] + ); + + let min_row_num: &Int64Array = + resolve_struct_field(stats_parsed, &min_path(&physical_paths[0])); + let max_row_num: &Int64Array = + resolve_struct_field(stats_parsed, &max_path(&physical_paths[0])); + let min_st: &StringArray = + resolve_struct_field(stats_parsed, &min_path(&physical_paths[1])); + let max_st: &StringArray = + resolve_struct_field(stats_parsed, &max_path(&physical_paths[1])); + + for i in 0..stats_parsed.len() { + if !stats_parsed.is_null(i) { + stats_rows.push(( + min_row_num.value(i), + max_row_num.value(i), + min_st.value(i).to_string(), + max_st.value(i).to_string(), + )); + } + } + } + + stats_rows.sort_by_key(|r| r.0); + assert_eq!(stats_rows.len(), 2, "should have stats_parsed for 2 files"); + assert_eq!(stats_rows[0], (1, 3, "st1".to_string(), "st3".to_string())); + assert_eq!(stats_rows[1], (4, 6, "st4".to_string(), "st6".to_string())); + + Ok(()) +} diff --git a/kernel/tests/write_row_tracking.rs b/kernel/tests/write_row_tracking.rs new file mode 100644 index 0000000000..5a4a0015ae --- /dev/null +++ b/kernel/tests/write_row_tracking.rs @@ -0,0 +1,193 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::Snapshot; +use url::Url; + +use delta_kernel::arrow::array::Int32Array; +use delta_kernel::arrow::record_batch::RecordBatch; + +use delta_kernel::engine::arrow_conversion::TryIntoArrow as _; +use delta_kernel::engine::arrow_data::ArrowEngineData; +use delta_kernel::engine_data::FilteredEngineData; +use delta_kernel::object_store::path::Path; +use delta_kernel::object_store::ObjectStore; +use delta_kernel::transaction::CommitResult; + +use itertools::Itertools; +use serde_json::Deserializer; +use tempfile::tempdir; + +use delta_kernel::schema::{DataType, StructField, StructType}; + +use test_utils::{create_table, engine_store_setup}; + +/// Test that verifies baseRowId and defaultRowCommitVersion are correctly populated +/// when row tracking is enabled on the table and a remove action is generated for +/// a file that had row tracking enabled. +/// +/// This test creates a table with row tracking enabled, writes data to it, and then +/// removes the data. It then verifies the remove action's row ID fields. Propagating these +/// values is required by the delta protocol [1].
+/// +/// This complements the existing test `test_remove_files_adds_expected_entries` which +/// verifies that baseRowId and defaultRowCommitVersion are absent when row tracking is NOT enabled. +/// +/// [1]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-row-tracking +#[tokio::test] +async fn test_row_tracking_fields_in_add_and_remove_actions( +) -> Result<(), Box> { + let _ = tracing_subscriber::fmt::try_init(); + + let schema = Arc::new(StructType::try_new(vec![StructField::nullable( + "number", + DataType::INTEGER, + )])?); + + let tmp_dir = tempdir()?; + let tmp_test_dir_url = Url::from_directory_path(tmp_dir.path()).unwrap(); + + let (store, engine, table_location) = + engine_store_setup("test_row_tracking", Some(&tmp_test_dir_url)); + + let table_url = create_table( + store.clone(), + table_location, + schema.clone(), + &[], + true, + vec![], + vec!["rowTracking", "domainMetadata"], + ) + .await?; + + // ===== FIRST COMMIT: Add files with row tracking ===== + let snapshot = Snapshot::builder_for(table_url.clone()).build(&engine)?; + let mut txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), &engine)? + .with_engine_info("row tracking test") + .with_data_change(true); + + let data = RecordBatch::try_new( + Arc::new(schema.as_ref().try_into_arrow()?), + vec![Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]))], + )?; + + let engine_arc = Arc::new(engine); + let write_context = Arc::new(txn.get_write_context()); + let add_files_metadata = engine_arc + .write_parquet( + &ArrowEngineData::new(data), + write_context.as_ref(), + HashMap::new(), + ) + .await?; + + txn.add_files(add_files_metadata); + + let result = txn.commit(engine_arc.as_ref())?; + match result { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 1); + } + _ => panic!("First commit should be committed"), + } + + // ===== VERIFY: Check add action contains row tracking fields ===== + let commit1_url = tmp_test_dir_url + .join("test_row_tracking/_delta_log/00000000000000000001.json") + .unwrap(); + let commit1 = store + .get(&Path::from_url_path(commit1_url.path()).unwrap()) + .await?; + + let parsed_commits: Vec<_> = Deserializer::from_slice(&commit1.bytes().await?) + .into_iter::() + .try_collect()?; + + // Find the add action + let add_actions: Vec<_> = parsed_commits + .iter() + .filter(|action| action.get("add").is_some()) + .collect(); + + assert_eq!(add_actions.len(), 1, "Expected exactly one add action"); + + let add = &add_actions[0]["add"]; + + // Verify baseRowId is present and has expected value + assert!( + add.get("baseRowId").is_some(), + "baseRowId MUST be present when row tracking is enabled" + ); + let base_row_id = add["baseRowId"] + .as_i64() + .expect("baseRowId should be an i64"); + // For the first file in a table with row tracking, baseRowId should start at 0 + // (high water mark starts at -1, so first baseRowId is -1 + 1 = 0) + assert_eq!(base_row_id, 0, "First file should have baseRowId 0"); + + let default_row_commit_version = add["defaultRowCommitVersion"] + .as_i64() + .expect("Missing defaultRowCommitVersion"); + assert_eq!(default_row_commit_version, 1); + + // ===== SECOND COMMIT: Remove the file ===== + let snapshot2 = Snapshot::builder_for(table_url.clone()).build(engine_arc.as_ref())?; + let mut txn2 = snapshot2 + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine_arc.as_ref())? 
+ .with_engine_info("row tracking remove test") + .with_data_change(true); + + let scan = snapshot2.scan_builder().build()?; + let scan_metadata = scan.scan_metadata(engine_arc.as_ref())?.next().unwrap()?; + + let (data, selection_vector) = scan_metadata.scan_files.into_parts(); + let remove_metadata = FilteredEngineData::try_new(data, selection_vector)?; + + txn2.remove_files(remove_metadata); + + let result2 = txn2.commit(engine_arc.as_ref())?; + match result2 { + CommitResult::CommittedTransaction(committed) => { + assert_eq!(committed.commit_version(), 2); + } + _ => panic!("Second commit should be committed"), + } + + // ===== VERIFY: Check remove action contains row tracking fields ===== + let commit2_url = tmp_test_dir_url + .join("test_row_tracking/_delta_log/00000000000000000002.json") + .unwrap(); + let commit2 = store + .get(&Path::from_url_path(commit2_url.path()).unwrap()) + .await?; + + let parsed_commits2: Vec<_> = Deserializer::from_slice(&commit2.bytes().await?) + .into_iter::() + .try_collect()?; + + let remove_actions: Vec<_> = parsed_commits2 + .iter() + .filter(|action| action.get("remove").is_some()) + .collect(); + + assert_eq!(remove_actions.len(), 1); + + let remove = &remove_actions[0]["remove"]; + + let remove_base_row_id = remove["baseRowId"].as_i64().expect("Missing baseRowId"); + assert_eq!(remove_base_row_id, base_row_id); + + let remove_default_row_commit_version = remove["defaultRowCommitVersion"] + .as_i64() + .expect("Missing defaultRowCommitVersion"); + assert_eq!( + remove_default_row_commit_version, + default_row_commit_version + ); + + Ok(()) +} diff --git a/mem-test/Cargo.toml b/mem-test/Cargo.toml index 81e3d2724e..b136822821 100644 --- a/mem-test/Cargo.toml +++ b/mem-test/Cargo.toml @@ -13,11 +13,15 @@ version.workspace = true [package.metadata.release] release = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] + [dependencies] -arrow = "56" -delta_kernel = { path = "../kernel", features = ["arrow", "default-engine-rustls"] } +delta_kernel = { path = "../kernel", features = ["default-engine-rustls"] } dhat = "0.3" -object_store = "0.12.3" rayon = "1.10" serde_json = "1" tempfile = "3" diff --git a/mem-test/tests/dhat_large_table_data.rs b/mem-test/tests/dhat_large_table_data.rs index eabfa09ec5..90579d2d16 100644 --- a/mem-test/tests/dhat_large_table_data.rs +++ b/mem-test/tests/dhat_large_table_data.rs @@ -9,15 +9,13 @@ use std::sync::Arc; use delta_kernel::arrow::array::{ArrayRef, Int64Array, StringArray}; use delta_kernel::arrow::record_batch::RecordBatch; -use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::arrow_data::EngineDataArrowExt as _; +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::object_store::local::LocalFileSystem; use delta_kernel::parquet::arrow::ArrowWriter; use delta_kernel::parquet::file::properties::WriterProperties; use delta_kernel::Snapshot; -use arrow::compute::filter_record_batch; -use object_store::local::LocalFileSystem; use serde_json::json; use tempfile::tempdir; use url::Url; @@ -45,16 +43,20 @@ fn write_large_parquet_to(path: &Path) -> Result<(), Box> // read to show file sizes let metadata = std::fs::metadata(&path)?; let file_size = metadata.len(); + #[cfg(all(feature = "arrow-56", not(feature = "arrow-57")))] let total_row_group_size: i64 = 
parquet_metadata .row_groups .iter() .map(|rg| rg.total_byte_size) .sum(); - println!("File size (compressed file size): {} bytes", file_size); - println!( - "Total size (uncompressed file size): {} bytes", - total_row_group_size - ); + #[cfg(any(not(feature = "arrow-56"), feature = "arrow-57"))] + let total_row_group_size: i64 = parquet_metadata + .row_groups() + .iter() + .map(|rg| rg.total_byte_size()) + .sum(); + println!("File size (compressed file size): {file_size} bytes"); + println!("Total size (uncompressed file size): {total_row_group_size} bytes"); Ok(()) } @@ -95,7 +97,7 @@ fn create_commit(path: &Path) -> Result<(), Box> { ]; for action in actions { - writeln!(file, "{}", action)?; + writeln!(file, "{action}")?; } Ok(()) @@ -111,7 +113,7 @@ fn test_dhat_large_table_data() -> Result<(), Box> { // Step 1: Write the large parquet file write_large_parquet_to(table_path)?; let stats = dhat::HeapStats::get(); - println!("Heap stats after writing parquet:\n{:?}", stats); + println!("Heap stats after writing parquet:\n{stats:?}"); // Step 2: Create the Delta log create_commit(table_path)?; @@ -119,17 +121,14 @@ fn test_dhat_large_table_data() -> Result<(), Box> { // Step 3: Create engine and snapshot let store = Arc::new(LocalFileSystem::new()); let url = Url::from_directory_path(table_path).unwrap(); - let engine = Arc::new(DefaultEngine::new( - store, - Arc::new(TokioBackgroundExecutor::new()), - )); + let engine = Arc::new(DefaultEngineBuilder::new(store).build()); let snapshot = Snapshot::builder_for(url) .build(engine.as_ref()) .expect("Failed to create snapshot"); let stats = dhat::HeapStats::get(); - println!("Heap stats after creating snapshot:\n{:?}", stats); + println!("Heap stats after creating snapshot:\n{stats:?}"); // Step 4: Build and execute scan let scan = snapshot @@ -138,31 +137,18 @@ fn test_dhat_large_table_data() -> Result<(), Box> { .expect("Failed to build scan"); let stats = dhat::HeapStats::get(); - println!("Heap stats after building scan:\n{:?}", stats); + println!("Heap stats after building scan:\n{stats:?}"); // Step 5: Execute the scan and read data let mut row_count = 0; - for scan_result in scan.execute(engine)? { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch: RecordBatch = data - .into_any() - .downcast::() - .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? - .into(); - - let batch = if let Some(mask) = mask { - filter_record_batch(&record_batch, &mask.into())? - } else { - record_batch - }; + for data in scan.execute(engine)? 
{ + let batch = data?.try_into_record_batch()?; row_count += batch.num_rows(); } let stats = dhat::HeapStats::get(); - println!("Heap stats after scan execution:\n{:?}", stats); - println!("Total rows read: {}", row_count); + println!("Heap stats after scan execution:\n{stats:?}"); + println!("Total rows read: {row_count}"); Ok(()) } diff --git a/mem-test/tests/dhat_large_table_log.rs b/mem-test/tests/dhat_large_table_log.rs index a1186c8bff..b6a515e401 100644 --- a/mem-test/tests/dhat_large_table_log.rs +++ b/mem-test/tests/dhat_large_table_log.rs @@ -7,10 +7,9 @@ use std::io::Write; use std::path::Path; use std::sync::Arc; -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::default::DefaultEngineBuilder; +use delta_kernel::object_store::local::LocalFileSystem; use delta_kernel::Snapshot; -use object_store::local::LocalFileSystem; use serde_json::json; use tempfile::tempdir; use tracing::info; @@ -31,7 +30,7 @@ fn generate_delta_log(path: &Path) -> Result<(), Box> { let mut current_file_id = 0u64; for commit_id in 0..NUM_COMMITS { - let commit_filename = format!("{:020}.json", commit_id); + let commit_filename = format!("{commit_id:020}.json"); let commit_path = delta_log_path.join(&commit_filename); let mut file = File::create(&commit_path)?; @@ -85,7 +84,7 @@ fn generate_delta_log(path: &Path) -> Result<(), Box> { // Write actions to file for action in actions { - writeln!(file, "{}", action)?; + writeln!(file, "{action}")?; } } @@ -111,14 +110,14 @@ fn test_dhat_large_table_log() -> Result<(), Box> { let _profiler = dhat::Profiler::builder().testing().build(); let store = Arc::new(LocalFileSystem::new()); let url = Url::from_directory_path(table_path).unwrap(); - let engine = DefaultEngine::new(store, Arc::new(TokioBackgroundExecutor::new())); + let engine = DefaultEngineBuilder::new(store).build(); let snapshot = Snapshot::builder_for(url) .build(&engine) .expect("Failed to get latest snapshot"); let stats = dhat::HeapStats::get(); - println!("Heap stats after PM replay:\n{:?}", stats); + println!("Heap stats after PM replay:\n{stats:?}"); let scan = snapshot .scan_builder() @@ -132,7 +131,7 @@ fn test_dhat_large_table_log() -> Result<(), Box> { } let stats = dhat::HeapStats::get(); - println!("Heap stats after Scan replay:\n{:?}", stats); + println!("Heap stats after Scan replay:\n{stats:?}"); Ok(()) } diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index a515dc721e..2d5de39be2 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -12,9 +12,17 @@ version.workspace = true [package.metadata.release] release = false +[features] +default = ["arrow"] +arrow = ["arrow-57"] +arrow-57 = ["delta_kernel/arrow-57"] +arrow-56 = ["delta_kernel/arrow-56"] + [dependencies] -delta_kernel = { path = "../kernel", features = [ "default-engine-rustls", "arrow" ] } -object_store = "0.12.3" +# NOTE: We MUST build with SOME arrow version (or compilation would fail). So we pull in `arrow-56`, +# which ensures `cargo build` succeeds. But we ALSO default to `arrow`, which activates `arrow-57` +# and overrides `arrow-56`. 
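+# Illustrative note (added for clarity, not part of the original change): with this layout the
+# default build resolves to arrow-57, while pinning the older arrow from this crate presumably
+# looks roughly like:
+#   cargo test --no-default-features --features arrow-56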
+delta_kernel = { path = "../kernel", features = [ "default-engine-rustls", "arrow-56", "internal-api", "prettyprint" ] } itertools = "0.14.0" serde_json = "1.0.142" tar = "0.4" @@ -22,3 +30,10 @@ tempfile = "3" url = "2.5.4" uuid = { version = "1", features = ["v4"] } zstd = "0.13" +tokio = "1.49.0" +# only for structured logging +tracing = { version = "0.1", features = ["log"] } +tracing-subscriber = { version = "0.3", default-features = false, features = [ + "env-filter", + "fmt", +] } diff --git a/test-utils/src/counting_reporter.rs b/test-utils/src/counting_reporter.rs new file mode 100644 index 0000000000..3d5c8e5aee --- /dev/null +++ b/test-utils/src/counting_reporter.rs @@ -0,0 +1,329 @@ +//! A [`MetricsReporter`] implementation that accumulates operation counts via atomic counters. +//! +//! Useful in tests to assert exact IO costs and in benchmarks to print per-call IO profiles. +//! Attach it to a `DefaultEngine` via `DefaultEngineBuilder::with_metrics_reporter`, then +//! inspect the counters or call [`CountingReporter::print_summary`]. + +use std::sync::atomic::{AtomicU64, Ordering}; + +use delta_kernel::metrics::{MetricEvent, MetricsReporter}; + +/// Accumulates storage and operation metrics via the [`MetricsReporter`] interface. +/// +/// All counters use [`Ordering::Relaxed`] -- sufficient here since there are no +/// ordering dependencies between counters. +/// +/// # Note: update [`reset`] and the `MetricsReporter` impl when adding fields. +/// +/// [`reset`]: Self::reset +#[derive(Debug, Default)] +pub struct CountingReporter { + // Storage-layer IO counters (StorageHandler::list_from / read_files / copy_atomic) + /// Number of `list_from` calls (one per [`MetricEvent::StorageListCompleted`]). + pub list_calls: AtomicU64, + /// Total files returned across all list calls. + pub list_files_seen: AtomicU64, + /// Number of `StorageHandler::read_files` calls (one per [`MetricEvent::StorageReadCompleted`]). + pub storage_read_calls: AtomicU64, + /// Total individual files read via `StorageHandler::read_files`. + pub storage_read_files: AtomicU64, + /// Total bytes consumed via `StorageHandler::read_files`. + pub storage_bytes_read: AtomicU64, + /// Number of `copy_atomic` calls (one per [`MetricEvent::StorageCopyCompleted`]). + pub copy_calls: AtomicU64, + + // JSON handler IO counters (DefaultJsonHandler::read_json_files) + /// Number of `read_json_files` calls (one per [`MetricEvent::JsonReadCompleted`]). + pub json_read_calls: AtomicU64, + /// Total JSON files requested across all `read_json_files` calls. + pub json_files_read: AtomicU64, + /// Total on-disk bytes of JSON files requested. + pub json_bytes_read: AtomicU64, + + // Parquet handler IO counters (DefaultParquetHandler::read_parquet_files) + /// Number of `read_parquet_files` calls (one per [`MetricEvent::ParquetReadCompleted`]). + pub parquet_read_calls: AtomicU64, + /// Total Parquet files requested across all `read_parquet_files` calls. + pub parquet_files_read: AtomicU64, + /// Total on-disk bytes of Parquet files requested. + pub parquet_bytes_read: AtomicU64, + + // Operation-level counters + /// Number of completed snapshot constructions. + pub snapshot_completions: AtomicU64, + /// Number of full (non-incremental) log segment loads. Each fresh snapshot construction + /// from a table root contributes one load; incremental snapshot updates do not. + pub log_segment_loads: AtomicU64, + /// Total commit (JSON delta) files in the commit tail across all log segment loads. 
+ /// These are the commits between the last checkpoint and the snapshot version — not + /// all historical commits in the table. Commits older than the selected checkpoint + /// are not included. + pub commit_files: AtomicU64, + /// Total checkpoint part files read across all log segment loads. + /// For a single-part checkpoint this is 1; for a multi-part checkpoint it equals the + /// number of parts that make up the selected checkpoint. + pub checkpoint_files: AtomicU64, + /// Total log compaction files in the commit tail across all log segment loads. + pub compaction_files: AtomicU64, + // TODO: add `crc_files` counter for version checksum (.crc) files read. + // Tracking CRC reads is critical for understanding snapshot load costs on tables + // that use CRC files heavily. Requires a new MetricEvent variant and emitting it + // from the CRC read path. See https://github.com/delta-io/delta-kernel-rs/issues/2257 +} + +impl CountingReporter { + /// Create a new reporter with all counters at zero. + pub fn new() -> Self { + Self::default() + } + + /// Reset all counters to zero. + /// + /// Useful before a single profiling iteration to get per-call counts. + pub fn reset(&self) { + self.list_calls.store(0, Ordering::Relaxed); + self.list_files_seen.store(0, Ordering::Relaxed); + self.storage_read_calls.store(0, Ordering::Relaxed); + self.storage_read_files.store(0, Ordering::Relaxed); + self.storage_bytes_read.store(0, Ordering::Relaxed); + self.copy_calls.store(0, Ordering::Relaxed); + self.json_read_calls.store(0, Ordering::Relaxed); + self.json_files_read.store(0, Ordering::Relaxed); + self.json_bytes_read.store(0, Ordering::Relaxed); + self.parquet_read_calls.store(0, Ordering::Relaxed); + self.parquet_files_read.store(0, Ordering::Relaxed); + self.parquet_bytes_read.store(0, Ordering::Relaxed); + self.snapshot_completions.store(0, Ordering::Relaxed); + self.log_segment_loads.store(0, Ordering::Relaxed); + self.commit_files.store(0, Ordering::Relaxed); + self.checkpoint_files.store(0, Ordering::Relaxed); + self.compaction_files.store(0, Ordering::Relaxed); + } + + /// Print a human-readable IO and operation summary. + /// + /// Intended to be called after [`reset`][Self::reset] and one operation so values + /// reflect a single call's cost. Output is visible with `cargo test -- --nocapture` + /// or `cargo nextest run -- --no-capture`. 
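+    ///
+    /// # Example
+    ///
+    /// Illustrative sketch (not part of the original change): it assumes the reporter was
+    /// attached to the engine via `DefaultEngineBuilder::with_metrics_reporter` as described
+    /// in the module docs, and that `engine` and `table_url` are already set up.
+    ///
+    /// ```ignore
+    /// let reporter = Arc::new(CountingReporter::new());
+    /// // ... engine built with `.with_metrics_reporter(reporter.clone())` ...
+    /// reporter.reset();
+    /// let _snapshot = Snapshot::builder_for(table_url).build(engine.as_ref())?;
+    /// reporter.print_summary("snapshot build");
+    /// assert_eq!(reporter.snapshot_completions.load(Ordering::Relaxed), 1);
+    /// ```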
+ pub fn print_summary(&self, label: &str) { + let list_calls = self.list_calls.load(Ordering::Relaxed); + let list_files = self.list_files_seen.load(Ordering::Relaxed); + let storage_reads = self.storage_read_calls.load(Ordering::Relaxed); + let storage_files = self.storage_read_files.load(Ordering::Relaxed); + let storage_kib = self.storage_bytes_read.load(Ordering::Relaxed) / 1024; + let copy_calls = self.copy_calls.load(Ordering::Relaxed); + let json_calls = self.json_read_calls.load(Ordering::Relaxed); + let json_files = self.json_files_read.load(Ordering::Relaxed); + let json_kib = self.json_bytes_read.load(Ordering::Relaxed) / 1024; + let parquet_calls = self.parquet_read_calls.load(Ordering::Relaxed); + let parquet_files = self.parquet_files_read.load(Ordering::Relaxed); + let parquet_kib = self.parquet_bytes_read.load(Ordering::Relaxed) / 1024; + let log_loads = self.log_segment_loads.load(Ordering::Relaxed); + let commits = self.commit_files.load(Ordering::Relaxed); + let checkpoints = self.checkpoint_files.load(Ordering::Relaxed); + let compactions = self.compaction_files.load(Ordering::Relaxed); + + println!(" [io] {label}"); + println!(" storage : {list_calls} list ({list_files} files seen) {storage_reads} raw read ({storage_files} files, {storage_kib} KiB) {copy_calls} copy"); + println!(" json : {json_calls} call(s) {json_files} files {json_kib} KiB"); + println!(" parquet : {parquet_calls} call(s) {parquet_files} files {parquet_kib} KiB"); + println!(" log : {log_loads} segment load(s) -- {commits} commits {checkpoints} checkpoints {compactions} compactions"); + } +} + +impl MetricsReporter for CountingReporter { + fn report(&self, event: MetricEvent) { + match event { + MetricEvent::StorageListCompleted { num_files, .. } => { + self.list_calls.fetch_add(1, Ordering::Relaxed); + self.list_files_seen.fetch_add(num_files, Ordering::Relaxed); + } + MetricEvent::StorageReadCompleted { + num_files, + bytes_read, + .. + } => { + self.storage_read_calls.fetch_add(1, Ordering::Relaxed); + self.storage_read_files + .fetch_add(num_files, Ordering::Relaxed); + self.storage_bytes_read + .fetch_add(bytes_read, Ordering::Relaxed); + } + MetricEvent::StorageCopyCompleted { .. } => { + self.copy_calls.fetch_add(1, Ordering::Relaxed); + } + MetricEvent::JsonReadCompleted { + num_files, + bytes_read, + } => { + self.json_read_calls.fetch_add(1, Ordering::Relaxed); + self.json_files_read.fetch_add(num_files, Ordering::Relaxed); + self.json_bytes_read + .fetch_add(bytes_read, Ordering::Relaxed); + } + MetricEvent::ParquetReadCompleted { + num_files, + bytes_read, + } => { + self.parquet_read_calls.fetch_add(1, Ordering::Relaxed); + self.parquet_files_read + .fetch_add(num_files, Ordering::Relaxed); + self.parquet_bytes_read + .fetch_add(bytes_read, Ordering::Relaxed); + } + MetricEvent::SnapshotCompleted { .. } => { + self.snapshot_completions.fetch_add(1, Ordering::Relaxed); + } + MetricEvent::LogSegmentLoaded { + num_commit_files, + num_checkpoint_files, + num_compaction_files, + .. + } => { + self.log_segment_loads.fetch_add(1, Ordering::Relaxed); + self.commit_files + .fetch_add(num_commit_files, Ordering::Relaxed); + self.checkpoint_files + .fetch_add(num_checkpoint_files, Ordering::Relaxed); + self.compaction_files + .fetch_add(num_compaction_files, Ordering::Relaxed); + } + // Intentionally not tracked -- add counters if needed. + MetricEvent::ProtocolMetadataLoaded { .. } + | MetricEvent::SnapshotFailed { .. } + | MetricEvent::ScanMetadataCompleted { .. 
} => {} + } + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::time::Duration; + + use delta_kernel::metrics::MetricId; + + use super::*; + + fn dur() -> Duration { + Duration::from_millis(1) + } + + #[test] + fn report_storage_list_completed_increments_list_counters() { + let reporter = CountingReporter::new(); + reporter.report(MetricEvent::StorageListCompleted { + duration: dur(), + num_files: 10, + }); + reporter.report(MetricEvent::StorageListCompleted { + duration: dur(), + num_files: 5, + }); + assert_eq!(reporter.list_calls.load(Ordering::Relaxed), 2); + assert_eq!(reporter.list_files_seen.load(Ordering::Relaxed), 15); + } + + #[test] + fn report_storage_read_completed_increments_read_counters() { + let reporter = CountingReporter::new(); + reporter.report(MetricEvent::StorageReadCompleted { + duration: dur(), + num_files: 3, + bytes_read: 1024, + }); + assert_eq!(reporter.storage_read_calls.load(Ordering::Relaxed), 1); + assert_eq!(reporter.storage_read_files.load(Ordering::Relaxed), 3); + assert_eq!(reporter.storage_bytes_read.load(Ordering::Relaxed), 1024); + } + + #[test] + fn report_storage_copy_completed_increments_copy_counter() { + let reporter = CountingReporter::new(); + reporter.report(MetricEvent::StorageCopyCompleted { duration: dur() }); + assert_eq!(reporter.copy_calls.load(Ordering::Relaxed), 1); + } + + #[test] + fn report_snapshot_completed_increments_snapshot_counter() { + let reporter = CountingReporter::new(); + reporter.report(MetricEvent::SnapshotCompleted { + operation_id: MetricId::new(), + version: 0, + total_duration: dur(), + }); + assert_eq!(reporter.snapshot_completions.load(Ordering::Relaxed), 1); + } + + #[test] + fn report_log_segment_loaded_increments_log_replay_counters() { + let reporter = CountingReporter::new(); + reporter.report(MetricEvent::LogSegmentLoaded { + operation_id: MetricId::new(), + duration: dur(), + num_commit_files: 7, + num_checkpoint_files: 2, + num_compaction_files: 1, + }); + assert_eq!(reporter.log_segment_loads.load(Ordering::Relaxed), 1); + assert_eq!(reporter.commit_files.load(Ordering::Relaxed), 7); + assert_eq!(reporter.checkpoint_files.load(Ordering::Relaxed), 2); + assert_eq!(reporter.compaction_files.load(Ordering::Relaxed), 1); + } + + #[test] + fn report_untracked_events_does_not_panic() { + let reporter = CountingReporter::new(); + reporter.report(MetricEvent::ProtocolMetadataLoaded { + operation_id: MetricId::new(), + duration: dur(), + }); + reporter.report(MetricEvent::SnapshotFailed { + operation_id: MetricId::new(), + duration: dur(), + }); + assert_eq!(reporter.snapshot_completions.load(Ordering::Relaxed), 0); + } + + #[test] + fn reset_zeros_all_counters() { + let reporter = Arc::new(CountingReporter::new()); + reporter.report(MetricEvent::StorageListCompleted { + duration: dur(), + num_files: 10, + }); + reporter.report(MetricEvent::StorageReadCompleted { + duration: dur(), + num_files: 3, + bytes_read: 1024, + }); + reporter.report(MetricEvent::StorageCopyCompleted { duration: dur() }); + reporter.report(MetricEvent::LogSegmentLoaded { + operation_id: MetricId::new(), + duration: dur(), + num_commit_files: 7, + num_checkpoint_files: 2, + num_compaction_files: 1, + }); + + reporter.reset(); + + assert_eq!(reporter.list_calls.load(Ordering::Relaxed), 0); + assert_eq!(reporter.list_files_seen.load(Ordering::Relaxed), 0); + assert_eq!(reporter.storage_read_calls.load(Ordering::Relaxed), 0); + assert_eq!(reporter.storage_read_files.load(Ordering::Relaxed), 0); + 
assert_eq!(reporter.storage_bytes_read.load(Ordering::Relaxed), 0); + assert_eq!(reporter.copy_calls.load(Ordering::Relaxed), 0); + assert_eq!(reporter.json_read_calls.load(Ordering::Relaxed), 0); + assert_eq!(reporter.json_files_read.load(Ordering::Relaxed), 0); + assert_eq!(reporter.json_bytes_read.load(Ordering::Relaxed), 0); + assert_eq!(reporter.parquet_read_calls.load(Ordering::Relaxed), 0); + assert_eq!(reporter.parquet_files_read.load(Ordering::Relaxed), 0); + assert_eq!(reporter.parquet_bytes_read.load(Ordering::Relaxed), 0); + assert_eq!(reporter.snapshot_completions.load(Ordering::Relaxed), 0); + assert_eq!(reporter.log_segment_loads.load(Ordering::Relaxed), 0); + assert_eq!(reporter.commit_files.load(Ordering::Relaxed), 0); + assert_eq!(reporter.checkpoint_files.load(Ordering::Relaxed), 0); + assert_eq!(reporter.compaction_files.load(Ordering::Relaxed), 0); + } +} diff --git a/test-utils/src/lib.rs b/test-utils/src/lib.rs index d23de54e4f..cb3c5a8e42 100644 --- a/test-utils/src/lib.rs +++ b/test-utils/src/lib.rs @@ -1,29 +1,44 @@ //! A number of utilities useful for testing that we want to use in multiple crates +pub mod counting_reporter; +pub use counting_reporter::CountingReporter; + +use std::collections::HashMap; use std::sync::Arc; +use delta_kernel::actions::get_log_add_schema; use delta_kernel::arrow::array::{ - ArrayRef, BooleanArray, Int32Array, Int64Array, RecordBatch, StringArray, + Array, ArrayRef, BooleanArray, Float64Array, Int32Array, Int64Array, MapArray, RecordBatch, + StringArray, StructArray, }; - -use delta_kernel::arrow::compute::filter_record_batch; +use delta_kernel::arrow::buffer::OffsetBuffer; +use delta_kernel::arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use delta_kernel::arrow::error::ArrowError; use delta_kernel::arrow::util::pretty::pretty_format_batches; -use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; +use delta_kernel::committer::FileSystemCommitter; +use delta_kernel::engine::arrow_conversion::TryFromKernel; +use delta_kernel::engine::arrow_data::{ArrowEngineData, EngineDataArrowExt}; +use delta_kernel::engine::default::executor::tokio::{ + TokioBackgroundExecutor, TokioMultiThreadExecutor, +}; use delta_kernel::engine::default::executor::TaskExecutor; -use delta_kernel::engine::default::DefaultEngine; +use delta_kernel::engine::default::storage::store_from_url; +use delta_kernel::engine::default::{DefaultEngine, DefaultEngineBuilder}; +use delta_kernel::object_store::local::LocalFileSystem; +use delta_kernel::object_store::memory::InMemory; +use delta_kernel::object_store::{path::Path, DynObjectStore}; use delta_kernel::parquet::arrow::arrow_writer::ArrowWriter; use delta_kernel::parquet::file::properties::WriterProperties; use delta_kernel::scan::Scan; -use delta_kernel::schema::SchemaRef; -use delta_kernel::{DeltaResult, Engine, EngineData, Snapshot}; +use delta_kernel::schema::{DataType, SchemaRef, StructField, StructType}; +use delta_kernel::transaction::CommitResult; +use delta_kernel::{try_parse_uri, DeltaResult, Engine, EngineData, Snapshot}; use itertools::Itertools; -use object_store::local::LocalFileSystem; -use object_store::memory::InMemory; -use object_store::{path::Path, ObjectStore}; -use serde_json::{json, to_vec}; +use serde_json::{json, to_vec, Deserializer}; +use std::sync::Mutex; +use tracing::subscriber::DefaultGuard; +use tracing_subscriber::layer::SubscriberExt; use url::Url; /// unpack the test data from 
{test_parent_dir}/{test_name}.tar.zst into a temp dir, and return the @@ -40,6 +55,44 @@ pub fn load_test_data( Ok(temp_dir) } +/// Recursively copies a directory and all its contents from source to destination. +/// +/// This function is used to create isolated copies of test tables, enabling parallel +/// test execution without interference. Each test gets its own copy of the table data, +/// preventing race conditions and cross-test pollution. +/// +/// # Arguments +/// +/// * `source` - Path to the source directory to copy from +/// * `dest` - Path to the destination directory (will be created if it doesn't exist) +/// +/// # Note +/// +/// This function copies ALL files and subdirectories, including any test artifacts +/// that may have been created in the source directory. Ensure the source directory +/// contains only the intended baseline data. +pub fn copy_directory( + source: &std::path::Path, + dest: &std::path::Path, +) -> Result<(), Box> { + std::fs::create_dir_all(dest)?; + + for entry in std::fs::read_dir(source)? { + let entry = entry?; + let path = entry.path(); + let file_name = entry.file_name(); + let dest_path = dest.join(&file_name); + + if path.is_dir() { + copy_directory(&path, &dest_path)?; + } else { + std::fs::copy(&path, &dest_path)?; + } + } + + Ok(()) +} + /// A common useful initial metadata and protocol. Also includes a single commitInfo pub const METADATA: &str = r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isBlindAppend":true}} {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} @@ -50,10 +103,30 @@ pub const METADATA_WITH_PARTITION_COLS: &str = r#"{"commitInfo":{"timestamp":158 {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} {"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"val\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["val"],"configuration":{},"createdTime":1587968585495}}"#; +/// Like [`METADATA`] but with non-empty table properties including `delta.appendOnly` and `custom.key`. +pub const METADATA_WITH_TABLE_PROPERTIES: &str = r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isBlindAppend":true}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"5fba94ed-9794-4965-ba6e-6ee3c0d22af9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"val\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.appendOnly":"true","custom.key":"custom_value"},"createdTime":1587968585495}}"#; + +/// Like [`METADATA`] but with table-features protocol (v3/v7) including columnMapping (reader) +/// and columnMapping + rowTracking (writer). Metadata includes a table name and column mapping +/// configuration. 
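+///
+/// # Example
+///
+/// Illustrative sketch (added for clarity): these metadata constants are typically combined
+/// with [`actions_to_string_with_metadata`] and [`add_commit`] to mock a table; the store and
+/// table root below are assumptions for the example.
+///
+/// ```ignore
+/// let data = actions_to_string_with_metadata(
+///     vec![TestAction::Metadata, TestAction::Add("part-00000.parquet".into())],
+///     METADATA_WITH_FEATURES,
+/// );
+/// add_commit("memory:///my_table/", store.as_ref(), 0, data).await?;
+/// ```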
+pub const METADATA_WITH_FEATURES: &str = concat!( + r#"{"commitInfo":{"timestamp":1587968586154,"operation":"WRITE","operationParameters":{},"isBlindAppend":true}}"#, + "\n", + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["columnMapping"],"writerFeatures":["columnMapping","domainMetadata","rowTracking"]}}"#, + "\n", + r#"{"metaData":{"id":"deadbeef-1234-5678-abcd-000000000000","name":"test_table","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[]}","partitionColumns":[],"configuration":{"delta.columnMapping.mode":"name","delta.rowTracking.enabled":"true","delta.rowTracking.materializedRowIdColumnName":"_row_id","delta.rowTracking.materializedRowCommitVersionColumnName":"_row_commit_version"},"createdTime":1234567890000}}"#, +); + pub enum TestAction { Add(String), Remove(String), Metadata, + // TODO: This is a temporary fix to make the test compatible with the file size requirement. + // In the future, we can create an AddCommit/RemoveCommit struct type with DefaultAddCommit/DefaultRemoveCommit value to store the commit info in the enum for Add/Remove. + AddWithSize(String, u64), + RemoveWithSize(String, u64), } // TODO: We need a better way to mock tables :) @@ -68,13 +141,15 @@ pub fn actions_to_string_partitioned(actions: Vec) -> String { actions_to_string_with_metadata(actions, METADATA_WITH_PARTITION_COLS) } -fn actions_to_string_with_metadata(actions: Vec, metadata: &str) -> String { +pub fn actions_to_string_with_metadata(actions: Vec, metadata: &str) -> String { actions .into_iter() .map(|test_action| match test_action { TestAction::Add(path) => format!(r#"{{"add":{{"path":"{path}","partitionValues":{{}},"size":262,"modificationTime":1587968586000,"dataChange":true, "stats":"{{\"numRecords\":2,\"nullCount\":{{\"id\":0}},\"minValues\":{{\"id\": 1}},\"maxValues\":{{\"id\":3}}}}"}}}}"#), TestAction::Remove(path) => format!(r#"{{"remove":{{"path":"{path}","partitionValues":{{}},"size":262,"modificationTime":1587968586000,"dataChange":true}}}}"#), TestAction::Metadata => metadata.into(), + TestAction::AddWithSize(path, file_size) => format!(r#"{{"add":{{"path":"{path}","partitionValues":{{}},"size":{file_size},"modificationTime":1587968586000,"dataChange":true, "stats":"{{\"numRecords\":2,\"nullCount\":{{\"id\":0}},\"minValues\":{{\"id\": 1}},\"maxValues\":{{\"id\":3}}}}"}}}}"#), + TestAction::RemoveWithSize(path, file_size) => format!(r#"{{"remove":{{"path":"{path}","partitionValues":{{}},"size":{file_size},"modificationTime":1587968586000,"dataChange":true}}}}"#), }) .join("\n") } @@ -165,25 +240,59 @@ pub fn compacted_log_path_for_versions(start_version: u64, end_version: u64, suf Path::from(path.as_str()) } -/// put a commit file into the specified object store. +// Resolve a table from a root and relative path +fn resolve_table_path(table_root: impl AsRef, relative: &Path) -> DeltaResult { + let url = try_parse_uri(table_root)?; + Ok(Path::from_url_path(url.join(relative.as_ref())?.path())?) +} + +/// Write a Delta commit JSON file at the given version into `store`. +/// +/// The commit is written to `_delta_log/{version:020}.json` under `table_root`. The caller is +/// responsible for ensuring that `data` contains valid Delta actions (e.g. built via +/// [`actions_to_string`]) and that no commit already exists at `version`. +/// +/// # Parameters +/// - `table_root` - Root URL of the Delta table (e.g. `"memory:///"` or `"file:///tmp/table"`). +/// - `store` - Object store that backs the table. 
+/// - `version` - Commit version number (determines the log file name). +/// - `data` - JSON-serialized Delta actions to write as the commit body. pub async fn add_commit( - store: &dyn ObjectStore, + table_root: impl AsRef, + store: &DynObjectStore, version: u64, data: String, ) -> Result<(), Box> { - let path = delta_path_for_version(version, "json"); - store.put(&path, data.into()).await?; + let relative_path = delta_path_for_version(version, "json"); + let table_path = resolve_table_path(table_root, &relative_path)?; + store.put(&table_path, data.into()).await?; Ok(()) } +/// Write a staged (uncommitted) Delta commit JSON file at the given version into `store`. +/// +/// The file is written to `_delta_log/_staged_commits/{version}.{uuid}.json` under +/// `table_root`. Multiple staged commits may exist for the same version (each gets a unique +/// UUID). The caller is responsible for ensuring that `data` contains valid Delta actions. +/// +/// Returns the object-store [`Path`] of the written file so callers can reference it in a +/// log tail or assertions. +/// +/// # Parameters +/// - `table_root` - Root URL of the Delta table (e.g. `"memory:///"` or `"file:///tmp/table"`). +/// - `store` - Object store that backs the table. +/// - `version` - Target commit version number (determines the staged file name prefix). +/// - `data` - JSON-serialized Delta actions to write as the staged commit body. pub async fn add_staged_commit( - store: &dyn ObjectStore, + table_root: impl AsRef, + store: &DynObjectStore, version: u64, data: String, ) -> Result> { - let path = staged_commit_path_for_version(version); - store.put(&path, data.into()).await?; - Ok(path) + let relative_path = staged_commit_path_for_version(version); + let table_path = resolve_table_path(table_root, &relative_path)?; + store.put(&table_path, data.into()).await?; + Ok(table_path) } /// Try to convert an `EngineData` into a `RecordBatch`. Panics if not using `ArrowEngineData` from @@ -194,27 +303,81 @@ pub fn into_record_batch(engine_data: Box) -> RecordBatch { .into() } -/// Simple extension trait with helpful methods (just constuctor for now) for creating/using -/// DefaultEngine in our tests. +/// Helper to create a DefaultEngine with the default executor for tests. /// -/// Note: we implment this extension trait here so that we can import this trait (from test-utils -/// crate) and get to use all these test-only helper methods from places where we don't have access -pub trait DefaultEngineExtension { - type Executor: TaskExecutor; +/// Uses `TokioBackgroundExecutor` as the default executor. +pub fn create_default_engine( + table_root: &url::Url, +) -> DeltaResult>> { + let store = store_from_url(table_root)?; + Ok(Arc::new(DefaultEngineBuilder::new(store).build())) +} - fn new_local() -> Arc>; +/// Helper to create a DefaultEngine with the default executor for tests. +/// +/// Uses `TokioBackgroundExecutor` as the default executor. +pub fn create_default_engine_mt_executor( + table_root: &url::Url, +) -> DeltaResult>> { + let store = store_from_url(table_root)?; + let task_executor = Arc::new(TokioMultiThreadExecutor::new( + tokio::runtime::Handle::current(), + )); + Ok(Arc::new( + DefaultEngineBuilder::new(store) + .with_task_executor(task_executor) + .build(), + )) } -impl DefaultEngineExtension for DefaultEngine { - type Executor = TokioBackgroundExecutor; +/// Test setup helper that creates a temporary directory and a `DefaultEngine` backed by +/// [`TokioBackgroundExecutor`]. 
+/// +/// Returns `(temp_dir, table_path, engine)` for use in integration tests. +/// The `temp_dir` must be kept alive for the duration of the test to prevent cleanup. +/// +/// # Example +/// +/// ```ignore +/// let (_temp_dir, table_path, engine) = test_table_setup()?; +/// ``` +pub fn test_table_setup() -> DeltaResult<( + tempfile::TempDir, + String, + Arc>, +)> { + let temp_dir = tempfile::tempdir().map_err(|e| delta_kernel::Error::generic(e.to_string()))?; + let table_path = temp_dir + .path() + .to_str() + .ok_or_else(|| delta_kernel::Error::generic("Invalid path"))? + .to_string(); + let table_url = url::Url::from_directory_path(&table_path) + .map_err(|_| delta_kernel::Error::generic("Invalid URL"))?; + let engine = create_default_engine(&table_url)?; + Ok((temp_dir, table_path, engine)) +} - fn new_local() -> Arc> { - let object_store = Arc::new(LocalFileSystem::new()); - Arc::new(DefaultEngine::new( - object_store, - TokioBackgroundExecutor::new().into(), - )) - } +/// Test setup helper that creates a temporary directory and a `DefaultEngine` backed by +/// [`TokioMultiThreadExecutor`]. +/// +/// Returns `(temp_dir, table_path, engine)` for use in integration tests. +/// The `temp_dir` must be kept alive for the duration of the test to prevent cleanup. +pub fn test_table_setup_mt() -> DeltaResult<( + tempfile::TempDir, + String, + Arc>, +)> { + let temp_dir = tempfile::tempdir().map_err(|e| delta_kernel::Error::generic(e.to_string()))?; + let table_path = temp_dir + .path() + .to_str() + .ok_or_else(|| delta_kernel::Error::generic("Invalid path"))? + .to_string(); + let table_url = url::Url::from_directory_path(&table_path) + .map_err(|_| delta_kernel::Error::generic("Invalid URL"))?; + let engine = create_default_engine_mt_executor(&table_url)?; + Ok((temp_dir, table_path, engine)) } // setup default engine with in-memory (local_directory=None) or local fs (local_directory=Some(Url)) @@ -222,11 +385,11 @@ pub fn engine_store_setup( table_name: &str, local_directory: Option<&Url>, ) -> ( - Arc, + Arc, DefaultEngine, Url, ) { - let (storage, url): (Arc, Url) = match local_directory { + let (storage, url): (Arc, Url) = match local_directory { None => ( Arc::new(InMemory::new()), Url::parse(format!("memory:///{table_name}/").as_str()).expect("valid url"), @@ -236,8 +399,7 @@ pub fn engine_store_setup( Url::parse(format!("{dir}{table_name}/").as_str()).expect("valid url"), ), }; - let executor = Arc::new(TokioBackgroundExecutor::new()); - let engine = DefaultEngine::new(Arc::clone(&storage), executor); + let engine = DefaultEngineBuilder::new(Arc::clone(&storage)).build(); (storage, engine, url) } @@ -246,7 +408,7 @@ pub fn engine_store_setup( // this will just create an empty table with the given schema. (just protocol + metadata actions) #[allow(clippy::too_many_arguments)] pub async fn create_table( - store: Arc, + store: Arc, table_path: Url, schema: SchemaRef, partition_columns: &[&str], @@ -302,6 +464,12 @@ pub async fn create_table( json!("1612345678"), ); } + if writer_features.contains(&"changeDataFeed") { + config.insert("delta.enableChangeDataFeed".to_string(), json!("true")); + } + if reader_features.contains(&"catalogManaged") { + config.insert("io.unitycatalog.tableId".to_string(), json!(table_id)); + } config }; @@ -364,9 +532,9 @@ pub async fn create_table( Ok(table_path) } -/// Creates two empty test tables, one with 37 protocol and one with 11 protocol. -/// the tables will be named {table_base_name}_11 and table_base_name}_37. 
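+///
+/// Illustrative example (added for clarity) -- this is exactly how the `top_level_ntz_schema`
+/// helper below is built:
+///
+/// ```ignore
+/// let schema = schema_with_type(DataType::TIMESTAMP_NTZ);
+/// assert!(schema.field("col").is_some());
+/// ```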
The local_directory param -/// can be set to write out the tables to the local filesystem, passing in None will create in-memory tables +/// Creates two empty test tables, one with 37 protocol and one with 11 protocol. the tables will +/// be named {table_base_name}_11 and {table_base_name}_37. The local_directory param can be set to +/// write out the tables to the local filesystem, passing in None will create in-memory tables pub async fn setup_test_tables( schema: SchemaRef, partition_columns: &[&str], @@ -376,7 +544,7 @@ pub async fn setup_test_tables( Vec<( Url, DefaultEngine, - Arc, + Arc, &'static str, )>, Box, @@ -421,28 +589,10 @@ pub async fn setup_test_tables( ]) } -pub fn to_arrow(data: Box) -> DeltaResult { - Ok(data - .into_any() - .downcast::() - .map_err(|_| delta_kernel::Error::EngineDataType("ArrowEngineData".to_string()))? - .into()) -} - pub fn read_scan(scan: &Scan, engine: Arc) -> DeltaResult> { let scan_results = scan.execute(engine)?; scan_results - .map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch = to_arrow(data)?; - if let Some(mask) = mask { - Ok(filter_record_batch(&record_batch, &mask.into())?) - } else { - Ok(record_batch) - } - }) + .map(EngineDataArrowExt::try_into_record_batch) .try_collect() } @@ -467,6 +617,40 @@ pub fn test_read( Ok(()) } +/// Insert column arrays into an existing table in a single commit. +/// +/// Takes a snapshot and column arrays, constructs a [`RecordBatch`] from the snapshot schema, +/// opens a transaction, writes the batch as a parquet file, and commits. +/// Useful for quickly seeding test tables without writing the transaction boilerplate each time. +/// +/// # Example +/// +/// ```ignore +/// let snapshot = Snapshot::builder_for(table_url.clone()).build(engine.as_ref())?; +/// insert_data(snapshot, &engine, vec![Arc::new(Int32Array::from(vec![1]))]).await?; +/// ``` +pub async fn insert_data( + snapshot: Arc, + engine: &Arc>, + columns: Vec, +) -> DeltaResult { + let arrow_schema = TryFromKernel::try_from_kernel(snapshot.schema().as_ref())?; + let batch = RecordBatch::try_new(Arc::new(arrow_schema), columns) + .map_err(|e| delta_kernel::Error::generic(e.to_string()))?; + let mut txn = snapshot + .transaction(Box::new(FileSystemCommitter::new()), engine.as_ref())? 
+ .with_operation("WRITE".to_string()) + .with_data_change(true); + + let write_context = txn.get_write_context(); + let add_files_metadata = engine + .write_parquet(&ArrowEngineData::new(batch), &write_context, HashMap::new()) + .await?; + txn.add_files(add_files_metadata); + + txn.commit(engine.as_ref()) +} + // Helper function to set json values in a serde_json Values pub fn set_json_value( value: &mut serde_json::Value, @@ -482,6 +666,194 @@ pub fn set_json_value( Ok(()) } +/// Returns a nested schema with 6 top-level fields including a nested struct: +/// `[row_number: long, name: string, score: double, address: {street: string, city: string}, tag: string, value: int]` +pub fn nested_schema() -> Result> { + Ok(Arc::new(StructType::try_new(vec![ + StructField::not_null("row_number", DataType::LONG), + StructField::nullable("name", DataType::STRING), + StructField::nullable("score", DataType::DOUBLE), + StructField::nullable( + "address", + StructType::try_new(vec![ + StructField::not_null("street", DataType::STRING), + StructField::nullable("city", DataType::STRING), + ])?, + ), + StructField::nullable("tag", DataType::STRING), + StructField::nullable("value", DataType::INTEGER), + ])?)) +} + +/// Returns two [`RecordBatch`]es with hardcoded test data matching [`nested_schema`]. +/// +/// Batch 1: rows 1..3, names alice/bob/charlie, streets st1..st3 +/// Batch 2: rows 4..6, names dave/eve/frank, streets st4..st6 +pub fn nested_batches() -> Result, Box> { + let schema = nested_schema()?; + let arrow_schema: ArrowSchema = TryFromKernel::try_from_kernel(schema.as_ref())?; + let address_fields = match arrow_schema.field_with_name("address").unwrap().data_type() { + ArrowDataType::Struct(fields) => fields.clone(), + _ => panic!("expected struct"), + }; + + let build = |ids: Vec, + names: Vec<&str>, + scores: Vec, + streets: Vec<&str>, + cities: Vec>, + tags: Vec>, + values: Vec>| + -> Result> { + let address_array = StructArray::new( + address_fields.clone(), + vec![ + Arc::new(StringArray::from(streets)) as ArrayRef, + Arc::new(StringArray::from(cities)) as ArrayRef, + ], + None, + ); + Ok(RecordBatch::try_new( + Arc::new(arrow_schema.clone()), + vec![ + Arc::new(Int64Array::from(ids)) as ArrayRef, + Arc::new(StringArray::from(names)) as ArrayRef, + Arc::new(Float64Array::from(scores)) as ArrayRef, + Arc::new(address_array) as ArrayRef, + Arc::new(StringArray::from(tags)) as ArrayRef, + Arc::new(Int32Array::from(values)) as ArrayRef, + ], + )?) + }; + + Ok(vec![ + build( + vec![1, 2, 3], + vec!["alice", "bob", "charlie"], + vec![1.0, 2.0, 3.0], + vec!["st1", "st2", "st3"], + vec![Some("c1"), None, Some("c3")], + vec![Some("t1"), Some("t2"), None], + vec![Some(10), Some(20), None], + )?, + build( + vec![4, 5, 6], + vec!["dave", "eve", "frank"], + vec![4.0, 5.0, 6.0], + vec!["st4", "st5", "st6"], + vec![Some("c4"), Some("c5"), Some("c6")], + vec![None, Some("t5"), Some("t6")], + vec![Some(40), None, Some(60)], + )?, + ]) +} + +// --------------------------------------------------------------------------- +// Schema helpers for feature auto-enablement tests (TimestampNTZ, Variant) +// --------------------------------------------------------------------------- + +/// Schema with one column of the given type: `(id INT, col )`. 
+pub fn schema_with_type(dtype: DataType) -> SchemaRef { + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("col", dtype, true), + ])) +} + +/// Schema with the given type nested inside a struct: +/// `(id INT, nested STRUCT>)`. +pub fn nested_schema_with_type(dtype: DataType) -> SchemaRef { + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new( + "nested", + DataType::Struct(Box::new(StructType::new_unchecked(vec![StructField::new( + "inner", dtype, true, + )]))), + true, + ), + ])) +} + +/// Schema with two columns of the given type: `(id INT, col1 , col2 )`. +pub fn multi_schema_with_type(dtype: DataType) -> SchemaRef { + Arc::new(StructType::new_unchecked(vec![ + StructField::new("id", DataType::INTEGER, false), + StructField::new("col1", dtype.clone(), true), + StructField::new("col2", dtype, true), + ])) +} + +pub fn top_level_ntz_schema() -> SchemaRef { + schema_with_type(DataType::TIMESTAMP_NTZ) +} + +pub fn nested_ntz_schema() -> SchemaRef { + nested_schema_with_type(DataType::TIMESTAMP_NTZ) +} + +pub fn multiple_ntz_schema() -> SchemaRef { + multi_schema_with_type(DataType::TIMESTAMP_NTZ) +} + +pub fn top_level_variant_schema() -> SchemaRef { + schema_with_type(DataType::unshredded_variant()) +} + +pub fn nested_variant_schema() -> SchemaRef { + nested_schema_with_type(DataType::unshredded_variant()) +} + +pub fn multiple_variant_schema() -> SchemaRef { + multi_schema_with_type(DataType::unshredded_variant()) +} + +/// Returns column mapping table properties for the given mode, or empty for `"none"`. +pub fn cm_properties(mode: &str) -> Vec<(&str, &str)> { + if mode == "none" { + vec![] + } else { + vec![("delta.columnMapping.mode", mode)] + } +} + +/// Resolves a nested field in a [`StructType`] schema by path. Returns an error if any +/// segment is missing or a non-terminal segment is not a struct type. +pub fn resolve_field<'a>( + schema: &'a delta_kernel::schema::StructType, + path: &[impl AsRef], +) -> Result<&'a delta_kernel::schema::StructField, String> { + let path_str: Vec<&str> = path.iter().map(|s| s.as_ref()).collect(); + let display = path_str.join("."); + let (last, rest) = path.split_last().ok_or_else(|| "empty path".to_string())?; + let mut current = schema; + for name in rest { + let field = current + .field(name.as_ref()) + .ok_or_else(|| format!("schema missing field '{display}'"))?; + current = match field.data_type() { + delta_kernel::schema::DataType::Struct(s) => s, + _ => return Err(format!("expected struct at '{display}'")), + }; + } + current + .field(last.as_ref()) + .ok_or_else(|| format!("schema missing field '{display}'")) +} + +/// Asserts that a field exists at the given path in a [`StructType`] schema, +/// traversing into nested structs as needed. +/// +/// # Example +/// +/// ```ignore +/// // Given schema: { address: { street: string, city: string } } +/// assert_schema_has_field(&schema, &["address".into(), "street".into()]); +/// ``` +pub fn assert_schema_has_field(schema: &delta_kernel::schema::StructType, path: &[String]) { + resolve_field(schema, path).unwrap(); +} + pub fn assert_result_error_with_message(res: Result, message: &str) { match res { Ok(_) => panic!("Expected error, but got Ok result"), @@ -494,3 +866,317 @@ pub fn assert_result_error_with_message(res: Result, messa } } } + +/// Creates add file metadata for one or more files without partition values. 
+/// Each tuple contains: (file_path, file_size, mod_time, num_records) +pub fn create_add_files_metadata( + add_files_schema: &SchemaRef, + files: Vec<(&str, i64, i64, i64)>, +) -> Result, Box> { + let num_files = files.len(); + + // Build arrays for each file + let path_array = StringArray::from(files.iter().map(|(p, _, _, _)| *p).collect::>()); + let size_array = Int64Array::from(files.iter().map(|(_, s, _, _)| *s).collect::>()); + let mod_time_array = Int64Array::from(files.iter().map(|(_, _, m, _)| *m).collect::>()); + let num_records_array = + Int64Array::from(files.iter().map(|(_, _, _, n)| *n).collect::>()); + + // Create empty map for partitionValues (repeated for each file) + let entries_field = Arc::new(Field::new( + "key_value", + ArrowDataType::Struct( + vec![ + Arc::new(Field::new("key", ArrowDataType::Utf8, false)), + Arc::new(Field::new("value", ArrowDataType::Utf8, true)), + ] + .into(), + ), + false, + )); + let empty_keys = StringArray::from(Vec::<&str>::new()); + let empty_values = StringArray::from(Vec::>::new()); + let empty_entries = StructArray::from(vec![ + ( + Arc::new(Field::new("key", ArrowDataType::Utf8, false)), + Arc::new(empty_keys) as ArrayRef, + ), + ( + Arc::new(Field::new("value", ArrowDataType::Utf8, true)), + Arc::new(empty_values) as ArrayRef, + ), + ]); + let offsets = OffsetBuffer::from_lengths(vec![0; num_files]); + let partition_values_array = Arc::new(MapArray::new( + entries_field, + offsets, + empty_entries, + None, + false, + )); + + // Build stats struct with all fields: numRecords, nullCount, minValues, maxValues, tightBounds + // nullCount, minValues, maxValues are empty structs (structure depends on data schema) + let empty_struct_fields: delta_kernel::arrow::datatypes::Fields = + Vec::>::new().into(); + let empty_struct = StructArray::new_empty_fields(num_files, None); + let tight_bounds_array = BooleanArray::from(vec![true; num_files]); + + let stats_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("numRecords", ArrowDataType::Int64, true)), + Arc::new(num_records_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "nullCount", + ArrowDataType::Struct(empty_struct_fields.clone()), + true, + )), + Arc::new(empty_struct.clone()) as ArrayRef, + ), + ( + Arc::new(Field::new( + "minValues", + ArrowDataType::Struct(empty_struct_fields.clone()), + true, + )), + Arc::new(empty_struct.clone()) as ArrayRef, + ), + ( + Arc::new(Field::new( + "maxValues", + ArrowDataType::Struct(empty_struct_fields), + true, + )), + Arc::new(empty_struct) as ArrayRef, + ), + ( + Arc::new(Field::new("tightBounds", ArrowDataType::Boolean, true)), + Arc::new(tight_bounds_array) as ArrayRef, + ), + ]); + + let batch = RecordBatch::try_new( + Arc::new(TryFromKernel::try_from_kernel(add_files_schema.as_ref())?), + vec![ + Arc::new(path_array) as ArrayRef, + partition_values_array as ArrayRef, + Arc::new(size_array) as ArrayRef, + Arc::new(mod_time_array) as ArrayRef, + Arc::new(stats_struct) as ArrayRef, + ], + )?; + + Ok(Box::new(ArrowEngineData::new(batch))) +} + +/// Writes a [`RecordBatch`] to a table, commits the transaction, and returns the post-commit +/// snapshot. +pub async fn write_batch_to_table( + snapshot: &Arc, + engine: &DefaultEngine, + data: RecordBatch, + partition_values: std::collections::HashMap, +) -> Result, Box> { + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine)? 
+ .with_engine_info("DefaultEngine") + .with_data_change(true); + let write_context = txn.get_write_context(); + let add_meta = engine + .write_parquet( + &ArrowEngineData::new(data), + &write_context, + partition_values, + ) + .await?; + txn.add_files(add_meta); + match txn.commit(engine)? { + delta_kernel::transaction::CommitResult::CommittedTransaction(c) => Ok(c + .post_commit_snapshot() + .expect("Failed to get post_commit_snapshot") + .clone()), + _ => panic!("Write commit should succeed"), + } +} + +/// An add info extracted from the log segment. +pub struct AddInfo { + pub path: String, + pub stats: Option, +} + +/// Reads all [`AddInfo`]s from a snapshot's log segment. +/// +/// # Example (conceptual) +/// +/// Given a delta log entry like: +/// ```json +/// {"add": {"path": "part-00000.parquet", "stats": "{\"numRecords\":10}"}} +/// ``` +/// This function would return: +/// ```text +/// vec![AddInfo { path: "part-00000.parquet", stats: Some({"numRecords": 10}) }] +/// ``` +pub fn read_add_infos( + snapshot: &Snapshot, + engine: &impl Engine, +) -> Result, Box> { + let schema = get_log_add_schema().clone(); + let batches = snapshot.log_segment().read_actions(engine, schema)?; + let mut actions = Vec::new(); + for batch_result in batches { + let actions_batch = batch_result?; + let engine_data = ArrowEngineData::try_from_engine_data(actions_batch.actions)?; + let record_batch = engine_data.record_batch(); + let add_struct = match record_batch.schema().index_of("add").ok().and_then(|idx| { + record_batch + .column(idx) + .as_any() + .downcast_ref::() + }) { + Some(s) => s, + None => continue, + }; + let path_arr = add_struct + .column_by_name("path") + .and_then(|c| c.as_any().downcast_ref::()); + let stats_arr = add_struct + .column_by_name("stats") + .and_then(|c| c.as_any().downcast_ref::()); + let len = add_struct.len(); + for i in 0..len { + if let Some(path) = path_arr.and_then(|a| (!a.is_null(i)).then(|| a.value(i))) { + let stats = stats_arr + .and_then(|a| (!a.is_null(i)).then(|| a.value(i))) + .map(serde_json::from_str) + .transpose()?; + actions.push(AddInfo { + path: path.to_string(), + stats, + }); + } + } + } + Ok(actions) +} + +/// Helper to create a table with the given properties, then load and return its snapshot. +pub fn create_table_and_load_snapshot( + table_path: &str, + schema: SchemaRef, + engine: &dyn Engine, + properties: &[(&str, &str)], +) -> DeltaResult> { + use delta_kernel::committer::FileSystemCommitter; + use delta_kernel::transaction::create_table::create_table; + + let _ = create_table(table_path, schema, "Test/1.0") + .with_table_properties(properties.to_vec()) + .build(engine, Box::new(FileSystemCommitter::new()))? 
+ .commit(engine)?; + + let table_url = delta_kernel::try_parse_uri(table_path)?; + Snapshot::builder_for(table_url).build(engine) +} + +// Writer that captures log output into a shared buffer for test assertions +pub struct LogWriter(pub Arc>>); + +impl std::io::Write for LogWriter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.0.lock().unwrap().write(buf) + } + fn flush(&mut self) -> std::io::Result<()> { + self.0.lock().unwrap().flush() + } +} + +// Test helper that sets up tracing to capture log output +// The guard keeps the tracing subscriber active for the lifetime of the struct +pub struct LoggingTest { + logs: Arc>>, + _guard: DefaultGuard, +} + +impl Default for LoggingTest { + fn default() -> Self { + Self::new() + } +} + +impl LoggingTest { + pub fn new() -> Self { + let logs = Arc::new(Mutex::new(Vec::new())); + let logs_clone = logs.clone(); + let _guard = tracing::subscriber::set_default( + tracing_subscriber::registry().with( + tracing_subscriber::fmt::layer() + .with_writer(move || LogWriter(logs_clone.clone())) + .with_ansi(false), + ), + ); + Self { logs, _guard } + } + + pub fn logs(&self) -> String { + String::from_utf8(self.logs.lock().unwrap().clone()).unwrap() + } +} + +/// Reads a commit log file and returns all actions of the given type (e.g. "add" or "remove"). +pub fn read_actions_from_commit( + table_url: &Url, + version: u64, + action_type: &str, +) -> Result, Box> { + let table_path = table_url.to_file_path().expect("should be a file URL"); + let commit_path = table_path.join(format!("_delta_log/{version:020}.json")); + let content = std::fs::read_to_string(commit_path)?; + let parsed: Vec = Deserializer::from_str(&content) + .into_iter::() + .try_collect()?; + Ok(parsed + .into_iter() + .filter_map(|v| v.get(action_type).cloned()) + .collect()) +} + +/// Removes all scan files from the snapshot, commits the transaction, and returns +/// the parsed remove actions from the resulting commit log. +pub fn remove_all_and_get_remove_actions( + snapshot: &Arc, + table_url: &Url, + engine: &impl Engine, +) -> Result, Box> { + let scan = snapshot.clone().scan_builder().build()?; + let all_scan_metadata: Vec<_> = scan.scan_metadata(engine)?.collect::, _>>()?; + + let mut txn = snapshot + .clone() + .transaction(Box::new(FileSystemCommitter::new()), engine)? + .with_engine_info("DefaultEngine") + .with_data_change(true); + for sm in all_scan_metadata { + txn.remove_files(sm.scan_files); + } + let committed = match txn.commit(engine)? { + CommitResult::CommittedTransaction(c) => c, + _ => panic!("Transaction should be committed"), + }; + read_actions_from_commit(table_url, committed.commit_version(), "remove") +} + +/// Asserts that `action["partitionValues"]` contains the given key with the expected value. 
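+///
+/// # Example
+///
+/// Illustrative sketch (not part of the original change); the partition column name and value
+/// are made up, and `table_url` is assumed to come from an earlier write in the test.
+///
+/// ```ignore
+/// let adds = read_actions_from_commit(&table_url, 1, "add")?;
+/// assert_partition_values(&adds[0], "date", "2024-01-01");
+/// ```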
+pub fn assert_partition_values(action: &serde_json::Value, key: &str, expected_value: &str) { + let pv = action["partitionValues"] + .as_object() + .expect("action should have partitionValues"); + assert!( + pv.contains_key(key), + "partitionValues should contain key '{key}', got: {pv:?}" + ); + assert_eq!(pv[key], expected_value); +} diff --git a/uc-catalog/Cargo.toml b/uc-catalog/Cargo.toml deleted file mode 100644 index 5ca92362d8..0000000000 --- a/uc-catalog/Cargo.toml +++ /dev/null @@ -1,28 +0,0 @@ -[package] -name = "uc-catalog" -edition.workspace = true -homepage.workspace = true -keywords.workspace = true -license.workspace = true -repository.workspace = true -readme.workspace = true -rust-version.workspace = true -version.workspace = true - -# for cargo-release -[package.metadata.release] -release = false - -[dependencies] -delta_kernel = { path = "../kernel", features = ["catalog-managed"] } -uc-client = { path = "../uc-client" } -itertools = "0.14" -object_store = "0.12.3" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -tokio = { version = "1", features = ["full"] } -tracing = "0.1" -url = "2" - -[dev-dependencies] -delta_kernel = { path = "../kernel", features = ["arrow-56", "default-engine-rustls", "catalog-managed"] } diff --git a/uc-catalog/src/lib.rs b/uc-catalog/src/lib.rs deleted file mode 100644 index f41cd9293a..0000000000 --- a/uc-catalog/src/lib.rs +++ /dev/null @@ -1,195 +0,0 @@ -//! UCCatalog implements a high-level interface for interacting with Delta Tables in Unity Catalog. - -use std::sync::Arc; - -use delta_kernel::{Engine, LogPath, Snapshot, Version}; - -use uc_client::prelude::*; - -use itertools::Itertools; -use tracing::info; -use url::Url; - -/// The [UCCatalog] provides a high-level interface to interact with Delta Tables stored in Unity -/// Catalog. For now this is a lightweight wrapper around a [UCClient]. -pub struct UCCatalog<'a> { - client: &'a UCClient, -} - -impl<'a> UCCatalog<'a> { - /// Create a new [UCCatalog] instance with the provided [UCClient]. - pub fn new(client: &'a UCClient) -> Self { - UCCatalog { client } - } - - /// Load the latest snapshot of a Delta Table identified by `table_id` and `table_uri` in Unity - /// Catalog. Generally, a separate `get_table` call can be used to resolve the table id/uri from - /// the table name. - pub async fn load_snapshot( - &self, - table_id: &str, - table_uri: &str, - engine: &dyn Engine, - ) -> Result, Box> { - self.load_snapshot_inner(table_id, table_uri, None, engine) - .await - } - - /// Load a snapshot of a Delta Table identified by `table_id` and `table_uri` for a specific - /// version. Generally, a separate `get_table` call can be used to resolve the table id/uri from - /// the table name. - pub async fn load_snapshot_at( - &self, - table_id: &str, - table_uri: &str, - version: Version, - engine: &dyn Engine, - ) -> Result, Box> { - self.load_snapshot_inner(table_id, table_uri, Some(version), engine) - .await - } - - pub(crate) async fn load_snapshot_inner( - &self, - table_id: &str, - table_uri: &str, - version: Option, - engine: &dyn Engine, - ) -> Result, Box> { - let table_uri = table_uri.to_string(); - let req = CommitsRequest { - table_id: table_id.to_string(), - table_uri: table_uri.clone(), - start_version: Some(0), - end_version: version.and_then(|v| v.try_into().ok()), - }; - // TODO: does it paginate? 
- let commits = self.client.get_commits(req).await?; - - // if commits are present, we ensure they are sorted+contiguous - if let Some(commits) = &commits.commits { - if !commits.windows(2).all(|w| w[1].version == w[0].version + 1) { - return Err("Received non-contiguous commit versions".into()); - } - } - - // we always get back the latest version from commits response, and pass that in to - // kernel's Snapshot builder. basically, load_table for the latest version always looks - // like a time travel query since we know the latest version ahead of time. - // - // note there is a weird edge case: if the table was just created it will return - // latest_table_version = -1, but the 0.json will exist in the _delta_log. - let version: Version = match version { - Some(v) => v, - None => match commits.latest_table_version { - -1 => 0, - i => i.try_into()?, - }, - }; - - // consume uc-client's Commit and hand back a delta_kernel LogPath - let table_url = Url::parse(&table_uri)?; - let commits: Vec<_> = commits - .commits - .unwrap_or_default() - .into_iter() - .map(|c| -> Result> { - LogPath::staged_commit( - table_url.clone(), - &c.file_name, - c.file_modification_timestamp, - c.file_size.try_into()?, - ) - .map_err(|e| e.into()) - }) - .try_collect()?; - - info!("commits for kernel: {:?}\n", commits); - - Snapshot::builder_for(Url::parse(&(table_uri + "/"))?) - .at_version(version) - .with_log_tail(commits) - .build(engine) - .map_err(|e| e.into()) - } -} - -#[cfg(test)] -mod tests { - use std::env; - - use delta_kernel::engine::default::executor::tokio::TokioBackgroundExecutor; - use delta_kernel::engine::default::DefaultEngine; - - use super::*; - - // We could just re-export UCClient's get_table to not require consumers to directly import - // uc_client themselves. - async fn get_table( - client: &UCClient, - table_name: &str, - ) -> Result<(String, String), Box> { - let res = client.get_table(table_name).await?; - let table_id = res.table_id; - let table_uri = res.storage_location; - - info!( - "[GET TABLE] got table_id: {}, table_uri: {}\n", - table_id, table_uri - ); - - Ok((table_id, table_uri)) - } - - // ignored test which you can run manually to play around with reading a UC table. run with: - // `ENDPOINT=".." TABLENAME=".." TOKEN=".." 
cargo t read_uc_table --nocapture -- --ignored` - #[ignore] - #[tokio::test] - async fn read_uc_table() -> Result<(), Box> { - let endpoint = env::var("ENDPOINT").expect("ENDPOINT environment variable not set"); - let token = env::var("TOKEN").expect("TOKEN environment variable not set"); - let table_name = env::var("TABLENAME").expect("TABLENAME environment variable not set"); - - // build UC client, get table info and credentials - let client = UCClient::builder(endpoint, &token).build()?; - let (table_id, table_uri) = get_table(&client, &table_name).await?; - let creds = client - .get_credentials(&table_id, Operation::Read) - .await - .map_err(|e| format!("Failed to get credentials: {}", e))?; - - // build catalog - let catalog = UCCatalog::new(&client); - - // TODO: support non-AWS - let creds = creds - .aws_temp_credentials - .ok_or("No AWS temporary credentials found")?; - - let options = [ - ("region", "us-west-2"), - ("access_key_id", &creds.access_key_id), - ("secret_access_key", &creds.secret_access_key), - ("session_token", &creds.session_token), - ]; - - let table_url = Url::parse(&table_uri)?; - let (store, path) = object_store::parse_url_opts(&table_url, options)?; - let store: Arc<_> = store.into(); - - info!("created object store: {:?}\npath: {:?}\n", store, path); - - let engine = DefaultEngine::new(store, Arc::new(TokioBackgroundExecutor::new())); - - // read table - let snapshot = catalog - .load_snapshot(&table_id, &table_uri, &engine) - .await?; - // or time travel - // let snapshot = catalog.load_snapshot_at(&table, 2).await?; - - println!("🎉 loaded snapshot: {snapshot:?}"); - - Ok(()) - } -} diff --git a/uc-client/src/client.rs b/uc-client/src/client.rs deleted file mode 100644 index 071264882d..0000000000 --- a/uc-client/src/client.rs +++ /dev/null @@ -1,223 +0,0 @@ -use std::future::Future; -use std::time::Duration; - -use reqwest::{header, Client, Response, StatusCode}; -use tracing::{instrument, warn}; -use url::Url; - -use crate::config::{ClientConfig, ClientConfigBuilder}; -use crate::error::{Error, Result}; -use crate::models::commits::{CommitRequest, CommitsRequest, CommitsResponse}; -use crate::models::credentials::{CredentialsRequest, Operation, TemporaryTableCredentials}; -use crate::models::tables::TablesResponse; - -/// An HTTP client for interacting with the Unity Catalog API. -#[derive(Debug, Clone)] -pub struct UCClient { - client: Client, - config: ClientConfig, - base_url: Url, -} - -impl UCClient { - /// Create a new client from [ClientConfig]. - pub fn new(config: ClientConfig) -> Result { - // default headers with authorization and content type - let mut headers = header::HeaderMap::new(); - headers.insert( - header::AUTHORIZATION, - header::HeaderValue::from_str(&format!("Bearer {}", config.token))?, - ); - headers.insert( - header::CONTENT_TYPE, - header::HeaderValue::from_static("application/json"), - ); - - let client = Client::builder() - .default_headers(headers) - .timeout(config.timeout) - .connect_timeout(config.connect_timeout) - .build()?; - - Ok(Self { - client, - base_url: config.workspace_url.clone(), - config, - }) - } - - /// Create a new [UCClientBuilder] to configure and build a [UCClient]. - pub fn builder(workspace: impl Into, token: impl Into) -> UCClientBuilder { - UCClientBuilder::new(workspace, token) - } - - /// Get the latest commits for the table. 
- #[instrument(skip(self))] - pub async fn get_commits(&self, request: CommitsRequest) -> Result { - let url = self.base_url.join("delta/preview/commits")?; - let response = self - .execute_with_retry(|| { - self.client - .request(reqwest::Method::GET, url.clone()) - .json(&request) - .send() - }) - .await?; - - self.handle_response(response).await - } - - /// Commit a new version to the table. - #[instrument(skip(self))] - pub async fn commit(&self, request: CommitRequest) -> Result<()> { - let url = self.base_url.join("delta/preview/commits")?; - let response = self - .execute_with_retry(|| { - self.client - .request(reqwest::Method::POST, url.clone()) - .json(&request) - .send() - }) - .await?; - - self.handle_response(response).await - } - - /// Resolve the table by name. - #[instrument(skip(self))] - pub async fn get_table(&self, table_name: &str) -> Result { - let url = self.base_url.join(&format!("tables/{}", table_name))?; - - let response = self - .execute_with_retry(|| self.client.get(url.clone()).send()) - .await?; - - match response.status() { - StatusCode::NOT_FOUND => Err(Error::TableNotFound(table_name.to_string())), - _ => self.handle_response(response).await, - } - } - - /// Get temporary cloud storage credentials for accessing a table. - #[instrument(skip(self))] - pub async fn get_credentials( - &self, - table_id: &str, - operation: Operation, - ) -> Result { - let url = self.base_url.join("temporary-table-credentials")?; - - let request_body = CredentialsRequest::new(table_id, operation); - let response = self - .execute_with_retry(|| self.client.post(url.clone()).json(&request_body).send()) - .await?; - - self.handle_response(response).await - } - - async fn execute_with_retry(&self, f: F) -> Result - where - F: Fn() -> Fut, - Fut: Future>, - { - for retry in 0..=self.config.max_retries { - match f().await { - Ok(response) if !response.status().is_server_error() => return Ok(response), - Ok(response) if retry < self.config.max_retries => { - warn!( - "Server error {}, retrying (attempt {}/{})", - response.status(), - retry + 1, - self.config.max_retries - ); - } - Ok(response) => { - return Err(Error::ApiError { - status: response.status().as_u16(), - message: "Server error".to_string(), - }) - } - Err(e) if retry < self.config.max_retries => { - warn!( - "Request failed, retrying (attempt {}/{}): {}", - retry + 1, - self.config.max_retries, - e - ); - } - Err(e) => return Err(Error::from(e)), - } - - tokio::time::sleep(self.config.retry_base_delay * (retry + 1)).await; - } - - // this is actually unreachable since we return in the loop for Ok/Err after all retries - Err(Error::MaxRetriesExceeded) - } - - async fn handle_response(&self, response: Response) -> Result - where - T: serde::de::DeserializeOwned, - { - let status = response.status(); - - if status.is_success() { - response.json::().await.map_err(Error::from) - } else { - let error_body = response - .text() - .await - .unwrap_or_else(|_| "Unknown error".to_string()); - - match status { - StatusCode::UNAUTHORIZED => Err(Error::AuthenticationFailed), - StatusCode::NOT_FOUND => Err(Error::ApiError { - status: status.as_u16(), - message: format!("Resource not found: {}", error_body), - }), - _ => Err(Error::ApiError { - status: status.as_u16(), - message: error_body, - }), - } - } - } -} - -/// A builder for configuring and creating a [UCClient]. 
-pub struct UCClientBuilder { - config_builder: ClientConfigBuilder, -} - -impl UCClientBuilder { - pub fn new(workspace: impl Into, token: impl Into) -> Self { - Self { - config_builder: ClientConfig::build(workspace, token), - } - } - - pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.config_builder = self.config_builder.with_timeout(timeout); - self - } - - pub fn with_connect_timeout(mut self, timeout: Duration) -> Self { - self.config_builder = self.config_builder.with_connect_timeout(timeout); - self - } - - pub fn with_max_retries(mut self, retries: u32) -> Self { - self.config_builder = self.config_builder.with_max_retries(retries); - self - } - - pub fn with_retry_delays(mut self, base: Duration, max: Duration) -> Self { - self.config_builder = self.config_builder.with_retry_delays(base, max); - self - } - - pub fn build(self) -> Result { - let config = self.config_builder.build()?; - UCClient::new(config) - } -} diff --git a/uc-client/src/error.rs b/uc-client/src/error.rs deleted file mode 100644 index 09e7627733..0000000000 --- a/uc-client/src/error.rs +++ /dev/null @@ -1,36 +0,0 @@ -use thiserror::Error; - -#[derive(Error, Debug)] -pub enum Error { - #[error("HTTP request failed: {0}")] - Http(#[from] reqwest::Error), - - #[error("Invalid header value: {0}")] - InvalidHeaderValue(#[from] reqwest::header::InvalidHeaderValue), - - #[error("URL parse error: {0}")] - UrlParse(#[from] url::ParseError), - - #[error("Serialization error: {0}")] - Serialization(#[from] serde_json::Error), - - #[error("API error (status {status}): {message}")] - ApiError { status: u16, message: String }, - - #[error("Table not found: {0}")] - TableNotFound(String), - - #[error("Invalid configuration: {0}")] - InvalidConfiguration(String), - - #[error("Authentication failed")] - AuthenticationFailed, - - #[error("Operation not supported: {0}")] - UnsupportedOperation(String), - - #[error("Max retries exceeded")] - MaxRetriesExceeded, -} - -pub type Result = std::result::Result; diff --git a/uc-client/src/lib.rs b/uc-client/src/lib.rs deleted file mode 100644 index c24b9c4a73..0000000000 --- a/uc-client/src/lib.rs +++ /dev/null @@ -1,42 +0,0 @@ -//! Unity Catalog Client for Rust -//! -//! This crate provides a Rust client for interacting with Unity Catalog APIs. -//! -//! # Example -//! -//! ```no_run -//! use uc_client::{UCClient, models::CommitsRequest}; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let client = UCClient::builder("uc.awesome.org", "your-token") -//! .build()?; -//! -//! let request = CommitsRequest::new("table-id", "table-uri"); -//! let commits = client.get_commits(request).await?; -//! -//! Ok(()) -//! } -//! 
``` - -pub mod client; -pub mod config; -pub mod error; -pub mod models; - -#[cfg(test)] -mod tests; - -pub use client::{UCClient, UCClientBuilder}; -pub use config::{ClientConfig, ClientConfigBuilder}; -pub use error::{Error, Result}; - -#[doc(hidden)] -pub mod prelude { - pub use crate::client::UCClient; - pub use crate::models::{ - commits::{Commit, CommitsRequest, CommitsResponse}, - credentials::{Operation, TemporaryTableCredentials}, - tables::TablesResponse, - }; -} diff --git a/uc-client/src/models/mod.rs b/uc-client/src/models/mod.rs deleted file mode 100644 index c6048a7d72..0000000000 --- a/uc-client/src/models/mod.rs +++ /dev/null @@ -1,7 +0,0 @@ -pub mod commits; -pub mod credentials; -pub mod tables; - -pub use commits::{Commit, CommitRequest, CommitsRequest, CommitsResponse}; -pub use credentials::{AwsTempCredentials, TemporaryTableCredentials}; -pub use tables::TablesResponse; diff --git a/uc-client/Cargo.toml b/unity-catalog-delta-client-api/Cargo.toml similarity index 69% rename from uc-client/Cargo.toml rename to unity-catalog-delta-client-api/Cargo.toml index 05e641935d..5f3b8cf49b 100644 --- a/uc-client/Cargo.toml +++ b/unity-catalog-delta-client-api/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "uc-client" +name = "unity-catalog-delta-client-api" edition.workspace = true homepage.workspace = true keywords.workspace = true @@ -14,15 +14,14 @@ version.workspace = true release = false [dependencies] -reqwest = { version = "0.12", features = ["json"] } +chrono = { version = "0.4", features = ["serde"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" -tokio = { version = "1", features = ["full"] } thiserror = "2.0" -tracing = "0.1" -url = "2.5" -chrono = { version = "0.4", features = ["serde"] } + +[features] +default = [] +test-utils = [] [dev-dependencies] -tracing-subscriber = { version = "0.3", features = ["env-filter"] } -clap = { version = "4.5", features = ["derive", "env"] } +tokio = { version = "1", features = ["full"] } diff --git a/unity-catalog-delta-client-api/src/clients/in_memory.rs b/unity-catalog-delta-client-api/src/clients/in_memory.rs new file mode 100644 index 0000000000..058f30e46b --- /dev/null +++ b/unity-catalog-delta-client-api/src/clients/in_memory.rs @@ -0,0 +1,293 @@ +//! In-memory implementation of [`CommitClient`] and [`GetCommitsClient`] for testing. + +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::sync::RwLock; + +use crate::error::{Error, Result}; +use crate::models::{Commit, CommitRequest, CommitsRequest, CommitsResponse}; + +use super::{CommitClient, GetCommitsClient}; + +// ============================================================================ +// TableData +// ============================================================================ + +/// In-memory representation of a UC-managed Delta table's commit state. +pub struct TableData { + /// The highest version that has been ratified (committed) to this table. + pub max_ratified_version: i64, + /// Commits that have been registered with UC but not yet published. + pub catalog_commits: Vec, +} + +impl TableData { + pub const MAX_UNPUBLISHED_COMMITS: usize = 20; + + /// Creates a new `TableData` representing a UC Delta table that has just been created. + /// The table starts with no commits and version 0. + fn new_post_table_create() -> Self { + Self { + max_ratified_version: 0, + catalog_commits: vec![], + } + } + + /// Returns commits within the requested version range. 
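+    ///
+    /// Both bounds are inclusive; `start_version` defaults to `0` and `end_version` to
+    /// `i64::MAX` when unset, so a request with no bounds returns every commit still
+    /// tracked by the catalog.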
+ fn get_commits(&self, request: CommitsRequest) -> Result { + let start = request.start_version.unwrap_or(0); + let end = request.end_version.unwrap_or(i64::MAX); + + Ok(CommitsResponse { + commits: Some( + self.catalog_commits + .iter() + .filter(|commit| start <= commit.version && commit.version <= end) + .cloned() + .collect(), + ), + latest_table_version: self.max_ratified_version, + }) + } + + /// Registers a new commit. Returns an error if the version is not the expected next version + /// or if the number of unpublished commits exceeds the maximum. + fn commit(&mut self, request: CommitRequest) -> Result<()> { + let Some(commit) = request.commit_info else { + return Err(Error::UnsupportedOperation( + "commit_info is required".to_string(), + )); + }; + + let expected_version = self.max_ratified_version + 1; + + if commit.version != expected_version { + return Err(Error::UnsupportedOperation(format!( + "Expected commit version {} but got {}", + expected_version, commit.version + ))); + } + + if self.catalog_commits.len() >= Self::MAX_UNPUBLISHED_COMMITS { + return Err(Error::MaxUnpublishedCommitsExceeded( + Self::MAX_UNPUBLISHED_COMMITS as u16, + )); + } + + if let Some(v) = request.latest_backfilled_version { + self.cleanup_published_commits(v); + } + + self.catalog_commits.push(commit); + self.max_ratified_version = expected_version; + + Ok(()) + } + + /// Removes commits that have been published (backfilled) to the Delta log. + fn cleanup_published_commits(&mut self, max_published_version: i64) { + self.catalog_commits + .retain(|commit| max_published_version < commit.version); + } +} + +// ============================================================================ +// InMemoryCommitsClient +// ============================================================================ + +/// An in-memory implementation of [`CommitClient`] and [`GetCommitsClient`] for testing. +pub struct InMemoryCommitsClient { + // table id -> table data + tables: RwLock>, +} + +impl InMemoryCommitsClient { + pub fn new() -> Self { + Self { + tables: RwLock::new(HashMap::new()), + } + } + + pub fn create_table(&self, table_id: impl Into) -> Result<()> { + let mut tables = self.tables.write().unwrap(); + match tables.entry(table_id.into()) { + Entry::Vacant(e) => { + e.insert(TableData::new_post_table_create()); + Ok(()) + } + Entry::Occupied(e) => Err(Error::UnsupportedOperation(format!( + "Table {} already exists", + e.key() + ))), + } + } + + /// Inserts a table with pre-existing state. Useful for testing. 
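+    ///
+    /// A sketch of seeding state directly (the table id and version are hypothetical):
+    ///
+    /// ```ignore
+    /// let client = InMemoryCommitsClient::new();
+    /// client.insert_table(
+    ///     "my-table-id",
+    ///     TableData { max_ratified_version: 3, catalog_commits: vec![] },
+    /// );
+    /// ```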
+ pub fn insert_table(&self, table_id: impl Into, table_data: TableData) { + self.tables + .write() + .unwrap() + .insert(table_id.into(), table_data); + } +} + +impl Default for InMemoryCommitsClient { + fn default() -> Self { + Self::new() + } +} + +impl GetCommitsClient for InMemoryCommitsClient { + async fn get_commits(&self, request: CommitsRequest) -> Result { + let tables = self.tables.read().unwrap(); + let table = tables + .get(&request.table_id) + .ok_or_else(|| Error::TableNotFound(request.table_id.clone()))?; + table.get_commits(request) + } +} + +impl CommitClient for InMemoryCommitsClient { + async fn commit(&self, request: CommitRequest) -> Result<()> { + let mut tables = self.tables.write().unwrap(); + let table = tables + .get_mut(&request.table_id) + .ok_or_else(|| Error::TableNotFound(request.table_id.clone()))?; + table.commit(request) + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + const TABLE_ID: &str = "test-table-id"; + const TABLE_URI: &str = "s3://bucket/table"; + + fn make_commit(version: i64) -> Commit { + Commit::new( + version, + version * 1000, + format!("{version:020}.json"), + 100, + version * 1000, + ) + } + + fn commit_request(version: i64, latest_backfilled_version: Option) -> CommitRequest { + CommitRequest::new( + TABLE_ID, + TABLE_URI, + make_commit(version), + latest_backfilled_version, + ) + } + + fn get_commits_request() -> CommitsRequest { + CommitsRequest::new(TABLE_ID, TABLE_URI) + } + + fn extract_commit_versions(commits: &[Commit]) -> Vec { + commits.iter().map(|c| c.version).collect() + } + + #[tokio::test] + async fn test_commit_and_get_commits() { + let client = InMemoryCommitsClient::new(); + + // Create table + client.create_table(TABLE_ID).unwrap(); + + // Insert 10 commits (versions 1-10) + for v in 1..=10 { + client.commit(commit_request(v, None)).await.unwrap(); + } + + // Get commits (versions 3-8) + let commits_request = get_commits_request() + .with_start_version(3) + .with_end_version(8); + let response = client.get_commits(commits_request).await.unwrap(); + let commits = response.commits.unwrap(); + assert_eq!(commits.len(), 6); + assert_eq!(extract_commit_versions(&commits), vec![3, 4, 5, 6, 7, 8]); + assert_eq!(response.latest_table_version, 10); + + // Insert commit 11 with latest_backfilled_version = 5 + // This should cleanup commits 1-5 (retain versions >= 6) + client.commit(commit_request(11, Some(5))).await.unwrap(); + + // Get commits again - should return versions 6-11 + let response = client.get_commits(get_commits_request()).await.unwrap(); + let commits = response.commits.unwrap(); + assert_eq!(extract_commit_versions(&commits), vec![6, 7, 8, 9, 10, 11]); + assert_eq!(response.latest_table_version, 11); + } + + #[test] + fn test_create_table_duplicate_throws() { + let client = InMemoryCommitsClient::new(); + client.create_table(TABLE_ID).unwrap(); + assert!(matches!( + client.create_table(TABLE_ID), + Err(Error::UnsupportedOperation(_)) + )); + } + + #[tokio::test] + async fn test_get_commits_table_not_found() { + assert!(matches!( + InMemoryCommitsClient::new() + .get_commits(get_commits_request()) + .await, + Err(Error::TableNotFound(_)) + )); + } + + #[tokio::test] + async fn test_commit_table_not_found() { + assert!(matches!( + InMemoryCommitsClient::new() + .commit(commit_request(1, None)) + .await, + Err(Error::TableNotFound(_)) + 
)); + } + + #[tokio::test] + async fn test_commit_wrong_version() { + let client = InMemoryCommitsClient::new(); + client.create_table(TABLE_ID).unwrap(); + assert!(matches!( + client.commit(commit_request(5, None)).await, + Err(Error::UnsupportedOperation(_)) + )); + } + + #[tokio::test] + async fn test_get_commits_empty_table() { + let client = InMemoryCommitsClient::new(); + client.create_table(TABLE_ID).unwrap(); + let response = client.get_commits(get_commits_request()).await.unwrap(); + assert!(response.commits.unwrap().is_empty()); + assert_eq!(response.latest_table_version, 0); + } + + #[tokio::test] + async fn test_commit_max_unpublished_commits_exceeded() { + let client = InMemoryCommitsClient::new(); + client.create_table(TABLE_ID).unwrap(); + for v in 1..=TableData::MAX_UNPUBLISHED_COMMITS as i64 { + client.commit(commit_request(v, None)).await.unwrap(); + } + let next_version = TableData::MAX_UNPUBLISHED_COMMITS as i64 + 1; + assert!(matches!( + client.commit(commit_request(next_version, None)).await, + Err(Error::MaxUnpublishedCommitsExceeded(_)) + )); + } +} diff --git a/unity-catalog-delta-client-api/src/clients/mod.rs b/unity-catalog-delta-client-api/src/clients/mod.rs new file mode 100644 index 0000000000..4f6143ed11 --- /dev/null +++ b/unity-catalog-delta-client-api/src/clients/mod.rs @@ -0,0 +1,31 @@ +use crate::error::Result; +use crate::models::{CommitRequest, CommitsRequest, CommitsResponse}; + +#[cfg(any(test, feature = "test-utils"))] +mod in_memory; + +#[cfg(any(test, feature = "test-utils"))] +pub use in_memory::{InMemoryCommitsClient, TableData}; + +/// Trait for committing new versions to a UC-managed Delta table. +/// +/// Implementations of this trait are responsible for performing any necessary +/// retries on transient failures. This trait is designed to be injected into a +/// `UCCommitter` (in `delta-kernel-unity-catalog`), which itself does not +/// perform any retries and relies on the underlying client implementation to +/// handle retry logic. +#[allow(async_fn_in_trait)] +pub trait CommitClient: Send + Sync { + /// Commit a new version to the table. + async fn commit(&self, request: CommitRequest) -> Result<()>; +} + +/// Trait for retrieving commits from a UC-managed Delta table. +/// +/// Implementations of this trait are responsible for performing any necessary +/// retries on transient failures. +#[allow(async_fn_in_trait)] +pub trait GetCommitsClient: Send + Sync { + /// Get the latest commits for the table. 
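+    ///
+    /// A sketch of a caller that is generic over any implementation (the helper name is
+    /// illustrative, not part of this crate):
+    ///
+    /// ```ignore
+    /// async fn latest_version<C: GetCommitsClient>(
+    ///     client: &C,
+    ///     table_id: &str,
+    ///     table_uri: &str,
+    /// ) -> Result<i64> {
+    ///     let response = client.get_commits(CommitsRequest::new(table_id, table_uri)).await?;
+    ///     Ok(response.latest_table_version)
+    /// }
+    /// ```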
+ async fn get_commits(&self, request: CommitsRequest) -> Result; +} diff --git a/uc-client/src/models/credentials.rs b/unity-catalog-delta-client-api/src/credentials.rs similarity index 82% rename from uc-client/src/models/credentials.rs rename to unity-catalog-delta-client-api/src/credentials.rs index 288d6a683f..6f96ef046f 100644 --- a/uc-client/src/models/credentials.rs +++ b/unity-catalog-delta-client-api/src/credentials.rs @@ -50,18 +50,3 @@ impl std::fmt::Display for Operation { } } } - -#[derive(Debug, Clone, Serialize)] -pub struct CredentialsRequest { - pub table_id: String, - pub operation: Operation, -} - -impl CredentialsRequest { - pub fn new(table_id: impl Into, operation: Operation) -> Self { - Self { - table_id: table_id.into(), - operation, - } - } -} diff --git a/unity-catalog-delta-client-api/src/error.rs b/unity-catalog-delta-client-api/src/error.rs new file mode 100644 index 0000000000..6c9140db0a --- /dev/null +++ b/unity-catalog-delta-client-api/src/error.rs @@ -0,0 +1,34 @@ +use thiserror::Error; + +/// Errors for Unity Catalog client API traits. +/// +/// This error type contains no HTTP-specific variants, allowing any backend +/// (REST, gRPC, in-memory, etc.) to implement the traits without pulling in +/// implementation-specific dependencies. +#[derive(Error, Debug)] +pub enum Error { + /// The requested table was not found. + #[error("Table not found: {0}")] + TableNotFound(String), + + /// The number of unpublished commits has exceeded the maximum allowed. + #[error("Max unpublished commits exceeded (max: {0})")] + MaxUnpublishedCommitsExceeded(u16), + + /// The requested operation is not supported. + #[error("Operation not supported: {0}")] + UnsupportedOperation(String), + + /// Authentication with Unity Catalog failed. + #[error("Authentication failed")] + AuthenticationFailed, + + /// A generic error with a descriptive message. + #[error("{0}")] + Generic(String), +} + +/// A type alias for [`Result`] using [`enum@Error`]. +/// +/// [`Result`]: std::result::Result +pub type Result = std::result::Result; diff --git a/unity-catalog-delta-client-api/src/lib.rs b/unity-catalog-delta-client-api/src/lib.rs new file mode 100644 index 0000000000..520e35efc8 --- /dev/null +++ b/unity-catalog-delta-client-api/src/lib.rs @@ -0,0 +1,33 @@ +//! Unity Catalog client API traits and models. +//! +//! This crate defines transport-agnostic traits for interacting with Unity +//! Catalog. Concrete implementations (e.g., REST over HTTP) live in separate +//! crates that depend on this one. +//! +//! # Traits +//! +//! - [`CommitClient`] -- commit a new version to a UC-managed Delta table +//! - [`GetCommitsClient`] -- retrieve commits from a UC-managed Delta table +//! +//! # Testing +//! +//! Enable the `test-utils` feature for an in-memory implementation of both +//! traits suitable for unit tests: +//! +//! ```toml +//! [dev-dependencies] +//! unity-catalog-delta-client-api = { version = "...", features = ["test-utils"] } +//! 
``` + +pub mod clients; +pub mod credentials; +pub mod error; +pub mod models; + +pub use clients::{CommitClient, GetCommitsClient}; +pub use credentials::{AwsTempCredentials, Operation, TemporaryTableCredentials}; +pub use error::{Error, Result}; +pub use models::{Commit, CommitRequest, CommitsRequest, CommitsResponse}; + +#[cfg(any(test, feature = "test-utils"))] +pub use clients::{InMemoryCommitsClient, TableData}; diff --git a/uc-client/src/models/commits.rs b/unity-catalog-delta-client-api/src/models.rs similarity index 95% rename from uc-client/src/models/commits.rs rename to unity-catalog-delta-client-api/src/models.rs index f90d6e81f1..4c1a2b541f 100644 --- a/uc-client/src/models/commits.rs +++ b/unity-catalog-delta-client-api/src/models.rs @@ -107,10 +107,5 @@ impl CommitRequest { } } - pub fn with_latest_backfilled_version(mut self, version: i64) -> Self { - self.latest_backfilled_version = Some(version); - self - } - // TODO: expose metadata/protocol (with_metadata, with_protocol) } diff --git a/unity-catalog-delta-rest-client/Cargo.toml b/unity-catalog-delta-rest-client/Cargo.toml new file mode 100644 index 0000000000..0a6ab988f2 --- /dev/null +++ b/unity-catalog-delta-rest-client/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "unity-catalog-delta-rest-client" +edition.workspace = true +homepage.workspace = true +keywords.workspace = true +license.workspace = true +repository.workspace = true +readme.workspace = true +rust-version.workspace = true +version.workspace = true + +# for cargo-release +[package.metadata.release] +release = false + +[dependencies] +unity-catalog-delta-client-api = { path = "../unity-catalog-delta-client-api" } +# Hardcodes rustls (always used alongside default-engine-rustls in practice). +reqwest = { version = "0.13", default-features = false, features = ["charset", "http2", "json", "rustls", "system-proxy"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +tokio = { version = "1", features = ["full"] } +thiserror = "2.0" +tracing = "0.1" +url = "2.5" +chrono = { version = "0.4", features = ["serde"] } + +[features] +default = [] +test-utils = ["unity-catalog-delta-client-api/test-utils"] + +[dev-dependencies] +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +clap = { version = "4.5", features = ["derive", "env"] } diff --git a/uc-client/README.md b/unity-catalog-delta-rest-client/README.md similarity index 92% rename from uc-client/README.md rename to unity-catalog-delta-rest-client/README.md index 09d6026b7d..a49aa76d35 100644 --- a/uc-client/README.md +++ b/unity-catalog-delta-rest-client/README.md @@ -1,11 +1,11 @@ -# uc-client +# unity-catalog-delta-rest-client An experimental/under-construction rust client for Unity Catalog. This crate is not intended for production use. ## Example CLI This crate provides a command-line interface (CLI) to interact with Unity Catalog APIs, see -`uc-client/examples/uc-cli.rs`. +`unity-catalog-delta-rest-client/examples/uc-cli.rs`. 
```bash # set environment variables for UC url/token @@ -34,4 +34,4 @@ cargo run --example uc-cli -- credentials \ # also can enable verbose logging cargo run --example uc-cli -- --verbose table catalog.schema.table -``` \ No newline at end of file +``` diff --git a/uc-client/examples/uc-cli.rs b/unity-catalog-delta-rest-client/examples/uc-cli.rs similarity index 80% rename from uc-client/examples/uc-cli.rs rename to unity-catalog-delta-rest-client/examples/uc-cli.rs index 91baf7ebe7..2d3c192da0 100644 --- a/uc-client/examples/uc-cli.rs +++ b/unity-catalog-delta-rest-client/examples/uc-cli.rs @@ -1,13 +1,11 @@ use clap::{Parser, Subcommand}; use std::time::Duration; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; -use uc_client::{ - models::{commits::Commit, credentials::Operation, CommitsRequest}, - UCClient, -}; +use unity_catalog_delta_client_api::{Commit, CommitsRequest, GetCommitsClient, Operation}; +use unity_catalog_delta_rest_client::{UCClient, UCCommitsRestClient}; #[derive(Parser)] -#[command(name = "uc-client")] +#[command(name = "uc-cli")] #[command(about = "Unity Catalog CLI client", long_about = None)] struct Cli { /// Unity Catalog URL @@ -68,8 +66,7 @@ fn parse_operation(s: &str) -> Result { "WRITE" => Ok(Operation::Write), "READ_WRITE" | "READWRITE" => Ok(Operation::ReadWrite), _ => Err(format!( - "Invalid operation '{}'. Must be READ, WRITE, or READ_WRITE", - s + "Invalid operation '{s}'. Must be READ, WRITE, or READ_WRITE" )), } } @@ -85,24 +82,30 @@ async fn main() -> Result<(), Box> { .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(filter_level))) .init(); - // Create client - let client = UCClient::builder(&cli.workspace_url, &cli.token) - .with_timeout(Duration::from_secs(60)) - .with_max_retries(3) - .build()?; + // Create shared config + let config = + unity_catalog_delta_rest_client::ClientConfig::build(&cli.workspace_url, &cli.token) + .with_timeout(Duration::from_secs(60)) + .with_max_retries(3) + .build()?; + + // Create shared HTTP client and UC clients + let http_client = unity_catalog_delta_rest_client::http::build_http_client(&config)?; + let uc_client = UCClient::with_http_client(http_client.clone(), config.clone()); + let uc_commits_client = UCCommitsRestClient::with_http_client(http_client, config); // Execute command match cli.command { Commands::Table { name } => { - println!("Fetching table metadata for: {}", name); + println!("Fetching table metadata for: {name}"); - match client.get_table(&name).await { + match uc_client.get_table(&name).await { Ok(table) => { println!("\n✓ Table metadata retrieved successfully\n"); - println!("{}", table); + println!("{table}"); } Err(e) => { - eprintln!("✗ Failed to get table: {}", e); + eprintln!("✗ Failed to get table: {e}"); std::process::exit(1); } } @@ -113,10 +116,10 @@ async fn main() -> Result<(), Box> { start_version, end_version, } => { - println!("Resolving table: {}", name); + println!("Resolving table: {name}"); // First, get the table metadata to obtain table_id and storage_location - let table = match client.get_table(&name).await { + let table = match uc_client.get_table(&name).await { Ok(table) => { println!( "✓ Table resolved: {} (ID: {})", @@ -126,7 +129,7 @@ async fn main() -> Result<(), Box> { table } Err(e) => { - eprintln!("✗ Failed to resolve table: {}", e); + eprintln!("✗ Failed to resolve table: {e}"); std::process::exit(1); } }; @@ -143,7 +146,7 @@ async fn main() -> Result<(), Box> { request = request.with_end_version(end); } - match 
client.get_commits(request).await { + match uc_commits_client.get_commits(request).await { Ok(response) => { println!("\n✓ Commits retrieved successfully\n"); println!("Table: {}", table.full_name()); @@ -157,7 +160,7 @@ async fn main() -> Result<(), Box> { } } Err(e) => { - eprintln!("✗ Failed to get commits: {}", e); + eprintln!("✗ Failed to get commits: {e}"); std::process::exit(1); } } @@ -167,12 +170,9 @@ async fn main() -> Result<(), Box> { table_id, operation, } => { - println!( - "Getting {} credentials for table_id: {}", - operation, table_id - ); + println!("Getting {operation} credentials for table_id: {table_id}"); - match client.get_credentials(&table_id, operation).await { + match uc_client.get_credentials(&table_id, operation).await { Ok(creds) => { println!("\n✓ Credentials retrieved successfully\n"); println!("URL: {}", creds.url); @@ -180,12 +180,12 @@ async fn main() -> Result<(), Box> { .expiration_as_datetime() .map(|dt| dt.format("%Y-%m-%d %H:%M:%S UTC").to_string()) .unwrap_or_else(|| format!("Invalid timestamp: {}", creds.expiration_time)); - println!("Expires at: {}", expires_str); + println!("Expires at: {expires_str}"); if let Some(time_left) = creds.time_until_expiry() { let hours = time_left.num_hours(); let minutes = time_left.num_minutes() % 60; - println!("Time until expiry: {} hours {} minutes", hours, minutes); + println!("Time until expiry: {hours} hours {minutes} minutes"); } else { println!("Time until expiry: Unable to calculate"); } @@ -208,7 +208,7 @@ async fn main() -> Result<(), Box> { } } Err(e) => { - eprintln!("✗ Failed to get credentials: {}", e); + eprintln!("✗ Failed to get credentials: {e}"); std::process::exit(1); } } diff --git a/unity-catalog-delta-rest-client/src/clients/commits.rs b/unity-catalog-delta-rest-client/src/clients/commits.rs new file mode 100644 index 0000000000..64c694ec36 --- /dev/null +++ b/unity-catalog-delta-rest-client/src/clients/commits.rs @@ -0,0 +1,84 @@ +use serde::Deserialize; +use tracing::instrument; +use url::Url; + +use crate::config::ClientConfig; +use crate::http::{build_http_client, execute_with_retry, handle_response}; +use unity_catalog_delta_client_api::{ + CommitClient, CommitRequest, CommitsRequest, CommitsResponse, GetCommitsClient, +}; + +/// Placeholder for deserializing empty JSON responses from void-returning endpoints. +#[derive(Deserialize)] +struct EmptyResponse {} + +/// REST implementation of [CommitClient] and [GetCommitsClient]. +#[derive(Debug, Clone)] +pub struct UCCommitsRestClient { + http_client: reqwest::Client, + config: ClientConfig, + base_url: Url, +} + +impl UCCommitsRestClient { + /// Create from config. + pub fn new(config: ClientConfig) -> crate::error::Result { + Ok(Self { + http_client: build_http_client(&config)?, + base_url: config.workspace_url.clone(), + config, + }) + } + + /// Create from existing reqwest Client. 
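+    ///
+    /// This is how the `uc-cli` example shares one `reqwest::Client` between this client
+    /// and `UCClient`; a sketch using this crate's `http::build_http_client` helper
+    /// (endpoint and token are hypothetical):
+    ///
+    /// ```ignore
+    /// let config = ClientConfig::build("https://uc.example.org", "token").build()?;
+    /// let http = build_http_client(&config)?;
+    /// let commits_client = UCCommitsRestClient::with_http_client(http.clone(), config.clone());
+    /// ```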
+ pub fn with_http_client(http_client: reqwest::Client, config: ClientConfig) -> Self { + Self { + base_url: config.workspace_url.clone(), + http_client, + config, + } + } +} + +impl GetCommitsClient for UCCommitsRestClient { + #[instrument(skip(self))] + async fn get_commits( + &self, + request: CommitsRequest, + ) -> unity_catalog_delta_client_api::Result { + let result: crate::error::Result = async { + let url = self.base_url.join("delta/preview/commits")?; + let response = execute_with_retry(&self.config, || { + self.http_client + .request(reqwest::Method::GET, url.clone()) + .json(&request) + .send() + }) + .await?; + handle_response(response).await + } + .await; + result.map_err(Into::into) + } +} + +impl CommitClient for UCCommitsRestClient { + #[instrument(skip(self))] + async fn commit(&self, request: CommitRequest) -> unity_catalog_delta_client_api::Result<()> { + let result: crate::error::Result<()> = async { + let url = self.base_url.join("delta/preview/commits")?; + let response = execute_with_retry(&self.config, || { + self.http_client + .request(reqwest::Method::POST, url.clone()) + .json(&request) + .send() + }) + .await?; + + let _: EmptyResponse = handle_response(response).await?; + Ok(()) + } + .await; + result.map_err(Into::into) + } +} diff --git a/unity-catalog-delta-rest-client/src/clients/mod.rs b/unity-catalog-delta-rest-client/src/clients/mod.rs new file mode 100644 index 0000000000..b4a7d1e596 --- /dev/null +++ b/unity-catalog-delta-rest-client/src/clients/mod.rs @@ -0,0 +1,5 @@ +mod commits; +mod uc_client; + +pub use commits::UCCommitsRestClient; +pub use uc_client::UCClient; diff --git a/unity-catalog-delta-rest-client/src/clients/uc_client.rs b/unity-catalog-delta-rest-client/src/clients/uc_client.rs new file mode 100644 index 0000000000..a459a7c2fc --- /dev/null +++ b/unity-catalog-delta-rest-client/src/clients/uc_client.rs @@ -0,0 +1,80 @@ +// TODO(https://github.com/delta-io/delta-kernel-rs/issues/2251): Replace UCClient with +// trait-based clients (GetTableClient, GetCredentialsClient) once those traits are added +// to unity-catalog-delta-client-api. +use reqwest::StatusCode; +use tracing::instrument; +use url::Url; + +use unity_catalog_delta_client_api::{Operation, TemporaryTableCredentials}; + +use crate::config::ClientConfig; +use crate::error::Result; +use crate::http::{build_http_client, execute_with_retry, handle_response}; +use crate::models::credentials::CredentialsRequest; +use crate::models::tables::TablesResponse; + +/// An HTTP client for interacting with the Unity Catalog API. +#[derive(Debug, Clone)] +pub struct UCClient { + http_client: reqwest::Client, + config: ClientConfig, + base_url: Url, +} + +impl UCClient { + /// Create a new client from [ClientConfig]. + pub fn new(config: ClientConfig) -> Result { + Ok(Self { + http_client: build_http_client(&config)?, + base_url: config.workspace_url.clone(), + config, + }) + } + + /// Create from existing reqwest Client. + pub fn with_http_client(http_client: reqwest::Client, config: ClientConfig) -> Self { + Self { + base_url: config.workspace_url.clone(), + http_client, + config, + } + } + + /// Resolve the table by name. 
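+    ///
+    /// A sketch of the typical flow (resolve the table, then fetch credentials for it;
+    /// the table name is hypothetical):
+    ///
+    /// ```ignore
+    /// let table = client.get_table("catalog.schema.table").await?;
+    /// let creds = client.get_credentials(&table.table_id, Operation::Read).await?;
+    /// ```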
+ #[instrument(skip(self))] + pub async fn get_table(&self, table_name: &str) -> Result { + let url = self.base_url.join(&format!("tables/{table_name}"))?; + + let response = + execute_with_retry(&self.config, || self.http_client.get(url.clone()).send()).await?; + + match response.status() { + StatusCode::NOT_FOUND => Err(unity_catalog_delta_client_api::Error::TableNotFound( + table_name.to_string(), + ) + .into()), + _ => handle_response(response).await, + } + } + + /// Get temporary cloud storage credentials for accessing a table. + #[instrument(skip(self))] + pub async fn get_credentials( + &self, + table_id: &str, + operation: Operation, + ) -> Result { + let url = self.base_url.join("temporary-table-credentials")?; + + let request_body = CredentialsRequest::new(table_id, operation); + let response = execute_with_retry(&self.config, || { + self.http_client + .post(url.clone()) + .json(&request_body) + .send() + }) + .await?; + + handle_response(response).await + } +} diff --git a/uc-client/src/config.rs b/unity-catalog-delta-rest-client/src/config.rs similarity index 100% rename from uc-client/src/config.rs rename to unity-catalog-delta-rest-client/src/config.rs diff --git a/unity-catalog-delta-rest-client/src/error.rs b/unity-catalog-delta-rest-client/src/error.rs new file mode 100644 index 0000000000..3416673489 --- /dev/null +++ b/unity-catalog-delta-rest-client/src/error.rs @@ -0,0 +1,99 @@ +use thiserror::Error; + +/// REST-specific errors for the Unity Catalog client. +/// +/// API-level errors (table not found, auth failures, etc.) are wrapped in the +/// [`Api`](Error::Api) variant, keeping them in sync with `unity_catalog_delta_client_api::Error` +/// without duplicating variants. +#[derive(Error, Debug)] +pub enum Error { + /// A transport-agnostic API error. + #[error(transparent)] + Api(#[from] unity_catalog_delta_client_api::Error), + + /// An HTTP request failed. + #[error("HTTP request failed: {0}")] + Http(#[from] reqwest::Error), + + /// An invalid HTTP header value was provided. + #[error("Invalid header value: {0}")] + InvalidHeaderValue(#[from] reqwest::header::InvalidHeaderValue), + + /// A URL could not be parsed. + #[error("URL parse error: {0}")] + UrlParse(#[from] url::ParseError), + + /// JSON serialization or deserialization failed. + #[error("Serialization error: {0}")] + Serialization(#[from] serde_json::Error), + + /// The server returned a non-success HTTP status code. + #[error("HTTP error (status {status}): {message}")] + HttpStatusError { status: u16, message: String }, + + /// The client configuration is invalid. + #[error("Invalid configuration: {0}")] + InvalidConfiguration(String), + + /// All retry attempts have been exhausted. + #[error("Max retries exceeded")] + MaxRetriesExceeded, +} + +/// A type alias for [`Result`] using [`enum@Error`]. 
+/// +/// [`Result`]: std::result::Result +pub type Result = std::result::Result; + +impl From for unity_catalog_delta_client_api::Error { + fn from(e: Error) -> Self { + match e { + Error::Api(api_err) => api_err, + e => unity_catalog_delta_client_api::Error::Generic(e.to_string()), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_error_unwraps_api_variant() { + let api_err = unity_catalog_delta_client_api::Error::TableNotFound("t1".into()); + let rest_err = Error::Api(api_err); + assert!(matches!( + unity_catalog_delta_client_api::Error::from(rest_err), + unity_catalog_delta_client_api::Error::TableNotFound(msg) if msg == "t1" + )); + + let rest_err = Error::Api(unity_catalog_delta_client_api::Error::AuthenticationFailed); + assert!(matches!( + unity_catalog_delta_client_api::Error::from(rest_err), + unity_catalog_delta_client_api::Error::AuthenticationFailed + )); + + let rest_err = + Error::Api(unity_catalog_delta_client_api::Error::MaxUnpublishedCommitsExceeded(5)); + assert!(matches!( + unity_catalog_delta_client_api::Error::from(rest_err), + unity_catalog_delta_client_api::Error::MaxUnpublishedCommitsExceeded(5) + )); + } + + #[test] + fn from_error_maps_rest_only_variants_to_generic() { + let api_err = unity_catalog_delta_client_api::Error::from(Error::MaxRetriesExceeded); + assert!( + matches!(api_err, unity_catalog_delta_client_api::Error::Generic(ref msg) if msg == "Max retries exceeded"), + "unexpected: {api_err:?}" + ); + + let api_err = + unity_catalog_delta_client_api::Error::from(Error::InvalidConfiguration("bad".into())); + assert!( + matches!(api_err, unity_catalog_delta_client_api::Error::Generic(ref msg) if msg == "Invalid configuration: bad"), + "unexpected: {api_err:?}" + ); + } +} diff --git a/unity-catalog-delta-rest-client/src/http.rs b/unity-catalog-delta-rest-client/src/http.rs new file mode 100644 index 0000000000..e1a303c108 --- /dev/null +++ b/unity-catalog-delta-rest-client/src/http.rs @@ -0,0 +1,102 @@ +use std::future::Future; + +use reqwest::{header, Client, Response, StatusCode}; +use tracing::warn; + +use crate::config::ClientConfig; +use crate::error::{Error, Result}; + +/// Build a configured HTTP client from the given config. +pub fn build_http_client(config: &ClientConfig) -> Result { + let headers = header::HeaderMap::from_iter([ + ( + header::AUTHORIZATION, + header::HeaderValue::from_str(&format!("Bearer {}", config.token))?, + ), + ( + header::CONTENT_TYPE, + header::HeaderValue::from_static("application/json"), + ), + ]); + + let client = Client::builder() + .default_headers(headers) + .timeout(config.timeout) + .connect_timeout(config.connect_timeout) + .build()?; + + Ok(client) +} + +/// Execute a request with retry logic for server errors and request failures. +/// Retries up to `max_retries` times with linear backoff: delay = `retry_base_delay * attempt`. 
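+///
+/// A usage sketch mirroring how the REST clients call it:
+///
+/// ```ignore
+/// let response = execute_with_retry(&config, || client.get(url.clone()).send()).await?;
+/// ```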
+pub async fn execute_with_retry(config: &ClientConfig, f: F) -> Result +where + F: Fn() -> Fut, + Fut: Future>, +{ + for retry in 0..=config.max_retries { + match f().await { + Ok(response) if !response.status().is_server_error() => return Ok(response), + Ok(response) if retry < config.max_retries => { + warn!( + "Server error {}, retrying (attempt {}/{})", + response.status(), + retry + 1, + config.max_retries + ); + } + Ok(response) => { + return Err(Error::HttpStatusError { + status: response.status().as_u16(), + message: "Server error".to_string(), + }) + } + Err(e) if retry < config.max_retries => { + warn!( + "Request failed, retrying (attempt {}/{}): {}", + retry + 1, + config.max_retries, + e + ); + } + Err(e) => return Err(Error::from(e)), + } + + tokio::time::sleep(config.retry_base_delay * (retry + 1)).await; + } + + // this is actually unreachable since we return in the loop for Ok/Err after all retries + Err(Error::MaxRetriesExceeded) +} + +/// Handle HTTP response and deserialize. +pub async fn handle_response(response: Response) -> Result +where + T: serde::de::DeserializeOwned, +{ + let status = response.status(); + + if status.is_success() { + response.json::().await.map_err(Error::from) + } else { + let error_body = response + .text() + .await + .unwrap_or_else(|_| "Unknown error".to_string()); + + match status { + StatusCode::UNAUTHORIZED => { + Err(unity_catalog_delta_client_api::Error::AuthenticationFailed.into()) + } + StatusCode::NOT_FOUND => Err(Error::HttpStatusError { + status: status.as_u16(), + message: format!("Resource not found: {error_body}"), + }), + _ => Err(Error::HttpStatusError { + status: status.as_u16(), + message: error_body, + }), + } + } +} diff --git a/unity-catalog-delta-rest-client/src/lib.rs b/unity-catalog-delta-rest-client/src/lib.rs new file mode 100644 index 0000000000..89edb169e6 --- /dev/null +++ b/unity-catalog-delta-rest-client/src/lib.rs @@ -0,0 +1,35 @@ +//! REST client implementation for Unity Catalog Delta APIs. +//! +//! This crate provides HTTP-based implementations of the traits defined in +//! [`unity_catalog_delta_client_api`]. +//! +//! # Example +//! +//! ```no_run +//! use unity_catalog_delta_client_api::{CommitsRequest, GetCommitsClient}; +//! use unity_catalog_delta_rest_client::{ClientConfig, UCCommitsRestClient}; +//! +//! #[tokio::main] +//! async fn main() -> Result<(), Box> { +//! let config = ClientConfig::build("uc.awesome.org", "your-token").build()?; +//! let client = UCCommitsRestClient::new(config)?; +//! +//! let request = CommitsRequest::new("table-id", "table-uri"); +//! let commits = client.get_commits(request).await?; +//! +//! Ok(()) +//! } +//! ``` + +pub mod clients; +pub mod config; +pub mod error; +pub mod http; +pub mod models; + +#[cfg(test)] +mod tests; + +pub use clients::{UCClient, UCCommitsRestClient}; +pub use config::{ClientConfig, ClientConfigBuilder}; +pub use error::{Error, Result}; diff --git a/unity-catalog-delta-rest-client/src/models/credentials.rs b/unity-catalog-delta-rest-client/src/models/credentials.rs new file mode 100644 index 0000000000..d472085722 --- /dev/null +++ b/unity-catalog-delta-rest-client/src/models/credentials.rs @@ -0,0 +1,19 @@ +use serde::Serialize; +use unity_catalog_delta_client_api::Operation; + +/// The HTTP request body for the temporary credentials endpoint. 
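+///
+/// A construction sketch (the table id is hypothetical):
+///
+/// ```ignore
+/// let body = CredentialsRequest::new("some-table-id", Operation::Read);
+/// ```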
+#[derive(Debug, Clone, Serialize)] +pub struct CredentialsRequest { + pub table_id: String, + pub operation: Operation, +} + +impl CredentialsRequest { + /// Create a new credentials request for the given table and operation. + pub fn new(table_id: impl Into, operation: Operation) -> Self { + Self { + table_id: table_id.into(), + operation, + } + } +} diff --git a/unity-catalog-delta-rest-client/src/models/mod.rs b/unity-catalog-delta-rest-client/src/models/mod.rs new file mode 100644 index 0000000000..5483ca45db --- /dev/null +++ b/unity-catalog-delta-rest-client/src/models/mod.rs @@ -0,0 +1,4 @@ +pub mod credentials; +pub mod tables; + +pub use tables::TablesResponse; diff --git a/uc-client/src/models/tables.rs b/unity-catalog-delta-rest-client/src/models/tables.rs similarity index 97% rename from uc-client/src/models/tables.rs rename to unity-catalog-delta-rest-client/src/models/tables.rs index 619df9a1b0..fc8c722c3d 100644 --- a/uc-client/src/models/tables.rs +++ b/unity-catalog-delta-rest-client/src/models/tables.rs @@ -57,7 +57,7 @@ impl Display for TablesResponse { writeln!(f)?; writeln!(f, "Properties:")?; for (key, value) in &self.properties { - writeln!(f, " {}: {}", key, value)?; + writeln!(f, " {key}: {value}")?; } } diff --git a/uc-client/src/tests.rs b/unity-catalog-delta-rest-client/src/tests.rs similarity index 94% rename from uc-client/src/tests.rs rename to unity-catalog-delta-rest-client/src/tests.rs index 2bb7bed44a..72ac5bf726 100644 --- a/uc-client/src/tests.rs +++ b/unity-catalog-delta-rest-client/src/tests.rs @@ -1,5 +1,4 @@ -use crate::models::commits::CommitsRequest; -use crate::models::credentials::Operation; +use unity_catalog_delta_client_api::{CommitsRequest, Operation}; #[test] fn test_commits_request_builder() {