From 4149580d57bc8f037643f2949c924da3ca05d486 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 12 Jan 2026 01:30:57 +0530 Subject: [PATCH 01/85] Add WAL for direct deployment state recovery Signed-off-by: Varun Deep Saini --- .../wal/corrupted-wal-entry/databricks.yml | 25 ++ .../wal/corrupted-wal-entry/out.test.toml | 5 + .../deploy/wal/corrupted-wal-entry/output.txt | 56 +++ .../deploy/wal/corrupted-wal-entry/script | 35 ++ .../wal/corrupted-wal-entry/sort_warnings.py | 87 ++++ .../deploy/wal/corrupted-wal-entry/test.py | 1 + .../deploy/wal/corrupted-wal-entry/test.toml | 13 + .../wal/crash-after-create/databricks.yml | 15 + .../wal/crash-after-create/out.test.toml | 5 + .../deploy/wal/crash-after-create/output.txt | 38 ++ .../deploy/wal/crash-after-create/script | 24 + .../deploy/wal/crash-after-create/test.py | 1 + .../deploy/wal/crash-after-create/test.toml | 10 + .../deploy/wal/empty-wal/databricks.yml | 15 + .../bundle/deploy/wal/empty-wal/out.test.toml | 5 + .../bundle/deploy/wal/empty-wal/output.txt | 37 ++ acceptance/bundle/deploy/wal/empty-wal/script | 21 + .../bundle/deploy/wal/empty-wal/test.py | 1 + .../bundle/deploy/wal/empty-wal/test.toml | 13 + .../wal/future-serial-wal/databricks.yml | 15 + .../wal/future-serial-wal/out.test.toml | 5 + .../deploy/wal/future-serial-wal/output.txt | 29 ++ .../deploy/wal/future-serial-wal/script | 28 ++ .../deploy/wal/future-serial-wal/test.py | 1 + .../deploy/wal/future-serial-wal/test.toml | 4 + .../wal/lineage-mismatch/databricks.yml | 15 + .../deploy/wal/lineage-mismatch/out.test.toml | 5 + .../deploy/wal/lineage-mismatch/output.txt | 29 ++ .../bundle/deploy/wal/lineage-mismatch/script | 28 ++ .../deploy/wal/lineage-mismatch/test.py | 1 + .../deploy/wal/lineage-mismatch/test.toml | 4 + .../wal/multiple-crashes/databricks.yml | 15 + .../deploy/wal/multiple-crashes/out.test.toml | 5 + .../deploy/wal/multiple-crashes/output.txt | 64 +++ .../bundle/deploy/wal/multiple-crashes/script | 32 ++ 
.../deploy/wal/multiple-crashes/test.py | 1 + .../deploy/wal/multiple-crashes/test.toml | 10 + .../deploy/wal/normal-deploy/databricks.yml | 15 + .../deploy/wal/normal-deploy/out.test.toml | 5 + .../deploy/wal/normal-deploy/output.txt | 32 ++ .../bundle/deploy/wal/normal-deploy/script | 12 + .../bundle/deploy/wal/normal-deploy/test.py | 1 + .../bundle/deploy/wal/normal-deploy/test.toml | 9 + .../deploy/wal/stale-wal/databricks.yml | 15 + .../bundle/deploy/wal/stale-wal/out.test.toml | 5 + .../bundle/deploy/wal/stale-wal/output.txt | 38 ++ acceptance/bundle/deploy/wal/stale-wal/script | 40 ++ .../bundle/deploy/wal/stale-wal/test.py | 1 + .../bundle/deploy/wal/stale-wal/test.toml | 9 + .../wal/summary-after-crash/databricks.yml | 15 + .../wal/summary-after-crash/out.test.toml | 5 + .../deploy/wal/summary-after-crash/output.txt | 25 ++ .../deploy/wal/summary-after-crash/script | 11 + .../deploy/wal/summary-after-crash/test.py | 1 + .../deploy/wal/summary-after-crash/test.toml | 2 + acceptance/bundle/deploy/wal/test.toml | 43 ++ .../deploy/wal/wal-with-delete/databricks.yml | 15 + .../deploy/wal/wal-with-delete/out.test.toml | 5 + .../deploy/wal/wal-with-delete/output.txt | 21 + .../bundle/deploy/wal/wal-with-delete/script | 48 ++ .../bundle/deploy/wal/wal-with-delete/test.py | 1 + .../deploy/wal/wal-with-delete/test.toml | 5 + bundle/direct/bind.go | 6 +- bundle/direct/bundle_apply.go | 7 +- bundle/direct/bundle_plan.go | 2 +- bundle/direct/dstate/state.go | 121 ++++- bundle/direct/dstate/wal.go | 218 +++++++++ bundle/direct/dstate/wal_test.go | 419 ++++++++++++++++++ cmd/bundle/utils/process.go | 2 +- wal.txt | 205 +++++++++ 70 files changed, 2038 insertions(+), 19 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt create mode 100644 
acceptance/bundle/deploy/wal/corrupted-wal-entry/script create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/output.txt create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/script create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.py create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.toml create mode 100644 acceptance/bundle/deploy/wal/empty-wal/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/empty-wal/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/empty-wal/output.txt create mode 100644 acceptance/bundle/deploy/wal/empty-wal/script create mode 100644 acceptance/bundle/deploy/wal/empty-wal/test.py create mode 100644 acceptance/bundle/deploy/wal/empty-wal/test.toml create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/output.txt create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/script create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/test.py create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/test.toml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/output.txt create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/script create mode 100644 
acceptance/bundle/deploy/wal/lineage-mismatch/test.py create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/test.toml create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/output.txt create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/script create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.py create mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.toml create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/output.txt create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/script create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.py create mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.toml create mode 100644 acceptance/bundle/deploy/wal/stale-wal/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/stale-wal/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/stale-wal/output.txt create mode 100644 acceptance/bundle/deploy/wal/stale-wal/script create mode 100644 acceptance/bundle/deploy/wal/stale-wal/test.py create mode 100644 acceptance/bundle/deploy/wal/stale-wal/test.toml create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/output.txt create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/script create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.py create mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.toml create mode 100644 acceptance/bundle/deploy/wal/test.toml create mode 100644 
acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/output.txt create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/script create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/test.py create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/test.toml create mode 100644 bundle/direct/dstate/wal.go create mode 100644 bundle/direct/dstate/wal_test.go create mode 100644 wal.txt diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml new file mode 100644 index 0000000000..cc9024fada --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml @@ -0,0 +1,25 @@ +bundle: + name: wal-corrupted-test + +resources: + jobs: + valid_job: + name: "valid-job" + tasks: + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + another_valid: + name: "another-valid" + tasks: + - task_key: "task-b" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt new file mode 100644 index 0000000000..1192629332 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -0,0 +1,56 @@ +=== Creating state 
file with serial 5 === +=== Creating WAL with corrupted entry === +=== WAL content === +{"lineage":"test-lineage-123","serial": [SERIAL]} +{"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} +not valid json - this line should be skipped +{"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} +=== Deploy (should recover valid entries, skip corrupted) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.another_valid.tasks[0].new_cluster + in databricks.yml:23:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Warning: Single node cluster is not correctly configured + at resources.jobs.valid_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== Final state (should have recovered entries) === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.another_valid", + "resources.jobs.valid_job" + ] +} +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script new file mode 100644 index 0000000000..d73595a6f4 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -0,0 +1,35 @@ +echo "=== Creating state file with serial 5 ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 5, + "state": {} +} +EOF + +echo "=== Creating WAL with corrupted entry ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-123","serial":6} +{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} +not valid json - this line should be skipped +{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should recover valid entries, skip corrupted) ===" +trace $CLI bundle deploy 2>&1 | python3 sort_warnings.py + +echo "=== Final state (should have recovered entries) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' + +echo "=== WAL after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py b/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py new file mode 100644 index 0000000000..06a6a0e59c --- /dev/null +++ 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""Sort warning blocks in CLI output to make test output deterministic. + +Warning blocks look like: +Warning: Single node cluster is not correctly configured + at resources.jobs.XXX.tasks[0].new_cluster + in databricks.yml:NN:NN + +num_workers should be 0 only for single-node clusters... + spark_conf: + ... + custom_tags: + ... + +This script groups consecutive warning blocks, sorts them by job name, and outputs. +""" + +import re +import sys + + +def main(): + content = sys.stdin.read() + lines = content.split("\n") + + result = [] + i = 0 + + while i < len(lines): + line = lines[i] + + # Check if this is the start of a warning block + if line.startswith("Warning:"): + # Collect all consecutive warning blocks + warnings = [] + while i < len(lines) and ( + lines[i].startswith("Warning:") + or ( + warnings + and not lines[i].startswith("Uploading") + and not lines[i].startswith("Deploying") + and not lines[i].startswith(">>>") + and not lines[i].startswith("===") + ) + ): + # Collect one complete warning block + block = [] + if lines[i].startswith("Warning:"): + block.append(lines[i]) + i += 1 + # Collect until next Warning or end marker + while i < len(lines): + if lines[i].startswith("Warning:"): + break + if lines[i].startswith("Uploading") or lines[i].startswith("Deploying"): + break + if lines[i].startswith(">>>") or lines[i].startswith("==="): + break + block.append(lines[i]) + i += 1 + warnings.append(block) + else: + i += 1 + + # Sort warnings by the job name in "at resources.jobs.XXX" + def get_sort_key(block): + for line in block: + match = re.search(r"at resources\.jobs\.(\w+)", line) + if match: + return match.group(1) + return "" + + warnings.sort(key=get_sort_key) + + # Output sorted warnings + for block in warnings: + for line in block: + result.append(line) + else: + result.append(line) + i += 1 + + print("\n".join(result), end="") + + +if 
__name__ == "__main__": + main() diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml new file mode 100644 index 0000000000..5bbe82835c --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -0,0 +1,13 @@ +# WAL with corrupted entry - valid entries should be recovered, corrupted skipped. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get?job_id=1111" +Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get?job_id=2222" +Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' diff --git a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml new file mode 100644 index 0000000000..ebee1d9699 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-crash-test + +resources: + jobs: + job_a: + name: "test-job-a" + tasks: + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git 
a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt new file mode 100644 index 0000000000..9c33326382 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -0,0 +1,38 @@ +=== Creating state directory === +=== Creating WAL file (simulating crash after job create) === +=== WAL content before deploy === +{"lineage":"test-lineage-123","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"name":"test-job-a"}}} +=== Deploy (should recover from WAL) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.job_a.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== State file after recovery === +{ + "lineage": "test-lineage-123", + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.job_a" + ] +} +=== WAL file after successful deploy === +WAL file deleted (expected) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script new file mode 100644 index 0000000000..c583a5eead --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -0,0 +1,24 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating WAL file (simulating crash after job create) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-123","serial":1} +{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"name":"test-job-a"}}} +EOF + +echo "=== WAL content before deploy ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should recover from WAL) ===" +trace $CLI bundle deploy + +echo "=== State file after recovery ===" +cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' + +echo "=== WAL file after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected)" +else + echo "WAL file deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.py b/acceptance/bundle/deploy/wal/crash-after-create/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml new file mode 100644 index 0000000000..9e20bac15d --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -0,0 +1,10 @@ +# WAL recovery after simulated crash. 
Job was created but state wasn't finalized. +# Deploy should recover job from WAL and update it. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job-a"}}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/databricks.yml b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml new file mode 100644 index 0000000000..147a1e1482 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-empty-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/empty-wal/out.test.toml b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt new file mode 100644 index 0000000000..91a31fe322 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -0,0 +1,37 @@ +=== Creating state directory === +=== Creating empty WAL file === +=== Empty WAL file exists === +[FILE_INFO] .databricks/bundle/default/resources.json.wal +=== Deploy (should handle empty WAL gracefully) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. 
To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +=== Checking WAL file after deploy === +Empty WAL deleted (expected) +=== State file content === +{ + "lineage": "[UUID]", + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script new file mode 100644 index 0000000000..f693753ac7 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -0,0 +1,21 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating empty WAL file ===" +touch .databricks/bundle/default/resources.json.wal + +echo "=== Empty WAL file exists ===" +ls -la .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should handle empty WAL gracefully) ===" +trace $CLI bundle deploy + +echo "=== Checking WAL file after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected)" +else + echo "Empty WAL deleted (expected)" +fi + +echo "=== State file content ===" +cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.py b/acceptance/bundle/deploy/wal/empty-wal/test.py new file mode 100644 index 0000000000..11b15b1a45 --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/test.py @@ -0,0 +1 @@ +print("hello") diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.toml b/acceptance/bundle/deploy/wal/empty-wal/test.toml new file mode 100644 index 
0000000000..b97264c2be --- /dev/null +++ b/acceptance/bundle/deploy/wal/empty-wal/test.toml @@ -0,0 +1,13 @@ +# Empty WAL file should be deleted and deploy should proceed normally. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + +[[Repls]] # Masks `ls -la` metadata; ls pads columns (e.g. day of month 1-9), so allow runs of whitespace between fields. +Old = '-rw[^ ]+\s+\d+\s+[^ ]+\s+[^ ]+\s+\d+\s+[A-Z][a-z]+\s+\d+\s+\d+:\d+' +New = '[FILE_INFO]' diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml new file mode 100644 index 0000000000..67079aaef8 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-future-serial-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt new file mode 100644 index 0000000000..ffb03147dc --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -0,0 +1,29 @@ +=== Creating state file (serial=2) === +=== Creating WAL with future serial (serial=5, expected=3) === +=== WAL content === +{"lineage":"test-lineage-123","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== Deploy (should fail with corruption error) === + 
+>>> errcode [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-future-serial-test/default/files... +Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted + + +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/script b/acceptance/bundle/deploy/wal/future-serial-wal/script new file mode 100644 index 0000000000..7b1784b0c6 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/script @@ -0,0 +1,28 @@ +echo "=== Creating state file (serial=2) ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating WAL with future serial (serial=5, expected=3) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-123","serial":5} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should fail with corruption error) ===" +trace errcode $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/test.py b/acceptance/bundle/deploy/wal/future-serial-wal/test.py new file mode 100644 
index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/test.toml new file mode 100644 index 0000000000..424fe2f127 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/test.toml @@ -0,0 +1,4 @@ +# WAL with serial ahead of state - indicates corruption, should error. +# State has serial=2, WAL has serial=5 (expected would be 3). + +# No extra server stubs needed - deploy should fail while reading state, before any resource API calls. diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml new file mode 100644 index 0000000000..014ec7f886 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-lineage-mismatch-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt new file mode 100644 index 0000000000..2419e7a612 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -0,0 +1,29 @@ +=== Creating state file with lineage-A === +=== Creating WAL with lineage-B (mismatch) === +=== WAL content === +{"lineage":"wal-lineage-bbb","serial": [SERIAL]} 
+{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== Deploy (should fail with lineage mismatch error) === + +>>> errcode [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-lineage-mismatch-test/default/files... +Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) + + +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/script b/acceptance/bundle/deploy/wal/lineage-mismatch/script new file mode 100644 index 0000000000..b241246e6c --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/script @@ -0,0 +1,28 @@ +echo "=== Creating state file with lineage-A ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "state-lineage-aaa", + "serial": 1, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating WAL with lineage-B (mismatch) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"wal-lineage-bbb","serial":2} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should fail with lineage mismatch error) ===" +trace errcode $CLI bundle 
deploy diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.py b/acceptance/bundle/deploy/wal/lineage-mismatch/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml new file mode 100644 index 0000000000..509cc82f09 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml @@ -0,0 +1,4 @@ +# WAL with different lineage than state - should error. +# State has lineage "state-lineage-aaa", WAL has lineage "wal-lineage-bbb". + +# No server stubs needed - deploy should fail before any API calls. diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml new file mode 100644 index 0000000000..b4162d8fdf --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-multi-crash-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt new file mode 100644 index 0000000000..3e0426a628 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -0,0 +1,64 @@ +=== Creating state directory === +=== Creating WAL 
file (simulating crash after job create) === +=== WAL content === +{"lineage":"test-lineage-456","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== First deploy attempt (will crash during update) === + +>>> errcode [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Deploying resources... +[PROCESS_KILLED] + +Exit code: [KILLED] +=== WAL after first crash === +{"lineage":"test-lineage-456","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +=== Second deploy attempt (should succeed) === + +>>> [CLI] bundle deploy --force-lock +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== Final state === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/script b/acceptance/bundle/deploy/wal/multiple-crashes/script new file mode 100644 index 0000000000..795e4261e1 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/script @@ -0,0 +1,32 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating WAL file (simulating crash after job create) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-456","serial":1} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== First deploy attempt (will crash during update) ===" +trace errcode $CLI bundle deploy + +echo "=== WAL after first crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + cat .databricks/bundle/default/resources.json.wal +fi + +echo "=== Second deploy attempt (should succeed) ===" +trace $CLI bundle deploy --force-lock + +echo "=== Final state ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' + +echo "=== WAL after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.py b/acceptance/bundle/deploy/wal/multiple-crashes/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml new file mode 100644 index 0000000000..2e9973c846 --- /dev/null +++ 
b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -0,0 +1,10 @@ +# Multiple crashes during recovery - WAL should persist until successful finalize. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +KillCaller = 1 +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml new file mode 100644 index 0000000000..413705d40c --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt new file mode 100644 index 0000000000..50c1430641 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/output.txt @@ -0,0 +1,32 @@ + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. 
To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +=== Checking WAL file after deploy === +WAL file deleted after successful deploy (expected) +=== State file content === +{ + "lineage": "[UUID]", + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/normal-deploy/script b/acceptance/bundle/deploy/wal/normal-deploy/script new file mode 100644 index 0000000000..5acc4d9b58 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/script @@ -0,0 +1,12 @@ +trace $CLI bundle deploy + +echo "=== Checking WAL file after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected - should be deleted after Finalize)" + cat .databricks/bundle/default/resources.json.wal +else + echo "WAL file deleted after successful deploy (expected)" +fi + +echo "=== State file content ===" +cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/normal-deploy/test.py b/acceptance/bundle/deploy/wal/normal-deploy/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/normal-deploy/test.toml b/acceptance/bundle/deploy/wal/normal-deploy/test.toml new file mode 100644 index 0000000000..1299046974 --- /dev/null +++ b/acceptance/bundle/deploy/wal/normal-deploy/test.toml @@ -0,0 +1,9 @@ +# WAL is created during deploy, used for state tracking, and 
deleted after Finalize. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/stale-wal/databricks.yml b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml new file mode 100644 index 0000000000..6b24f6fd26 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-stale-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/stale-wal/out.test.toml b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt new file mode 100644 index 0000000000..3722788e52 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -0,0 +1,38 @@ +=== Creating state directory === +=== Creating state file (serial=2) === +=== Creating stale WAL with old serial (serial=1) === +=== WAL content before deploy === +{"lineage":"stale-test-lineage","serial": [SERIAL]} +{"k":"resources.jobs.stale_job","v":{"__id__": "[ID]","state":{"name":"stale-job"}}} +=== Deploy (should ignore stale WAL) === + +>>> [CLI] bundle deploy +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. 
To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-stale-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +=== Checking WAL file after deploy === +Stale WAL deleted (expected) +=== State file should NOT contain stale_job === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.test_job" + ] +} diff --git a/acceptance/bundle/deploy/wal/stale-wal/script b/acceptance/bundle/deploy/wal/stale-wal/script new file mode 100644 index 0000000000..d814639a00 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/script @@ -0,0 +1,40 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating state file (serial=2) ===" +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "stale-test-lineage", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating stale WAL with old serial (serial=1) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"stale-test-lineage","serial":1} +{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} +EOF + +echo "=== WAL content before deploy ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (should ignore stale WAL) ===" +trace $CLI bundle deploy + +echo "=== Checking WAL file after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL file exists (unexpected)" +else + echo "Stale WAL deleted (expected)" +fi + +echo "=== State file should NOT contain stale_job ===" +cat 
.databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/stale-wal/test.py b/acceptance/bundle/deploy/wal/stale-wal/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/stale-wal/test.toml b/acceptance/bundle/deploy/wal/stale-wal/test.toml new file mode 100644 index 0000000000..934683ba6d --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/test.toml @@ -0,0 +1,9 @@ +# Deploy with a stale WAL (old serial) - WAL should be deleted and ignored. + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml new file mode 100644 index 0000000000..063faa8e54 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-summary-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt new file mode 100644 index 0000000000..2e6abf645a --- /dev/null 
+++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -0,0 +1,25 @@ +=== Creating state directory === +=== Creating WAL file (simulating crash after job create) === +=== Bundle summary (should show job from WAL with id) === + +>>> [CLI] bundle summary -o json +Warning: Single node cluster is not correctly configured + at resources.jobs.test_job.tasks[0].new_cluster + in databricks.yml:13:13 + +num_workers should be 0 only for single-node clusters. To create a +valid single node cluster please ensure that the following properties +are correctly set in the cluster specification: + + spark_conf: + spark.databricks.cluster.profile: singleNode + spark.master: local[*] + + custom_tags: + ResourceClass: SingleNode + + +{ + "job_id": "[ID]", + "modified_status": null +} diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/script b/acceptance/bundle/deploy/wal/summary-after-crash/script new file mode 100644 index 0000000000..d2017c6590 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/script @@ -0,0 +1,11 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating WAL file (simulating crash after job create) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"summary-test-lineage","serial":1} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} +EOF + +echo "=== Bundle summary (should show job from WAL with id) ===" +trace $CLI bundle summary -o json | jq '{job_id: .resources.jobs.test_job.id, modified_status: .resources.jobs.test_job.modified_status}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.py b/acceptance/bundle/deploy/wal/summary-after-crash/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml 
b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml new file mode 100644 index 0000000000..3363a1c516 --- /dev/null +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml @@ -0,0 +1,2 @@ +# Bundle summary should show resources recovered from WAL. +# No server stubs needed - we just run bundle summary which reads state. diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml new file mode 100644 index 0000000000..7fd1daf93b --- /dev/null +++ b/acceptance/bundle/deploy/wal/test.toml @@ -0,0 +1,43 @@ +# WAL (Write-Ahead Log) tests verify crash recovery during bundle deployment. +# These tests simulate process crashes using KillCaller and verify state recovery. +# Only runs with direct engine since WAL is a direct-engine feature. + +Local = true +Env.DATABRICKS_CLI_TEST_PID = "1" + +[EnvMatrix] +DATABRICKS_BUNDLE_ENGINE = ["direct"] + +[[Repls]] +Old = 'script: line \d+:\s+\d+ Killed(: 9)?\s+"\$@"' +New = '[PROCESS_KILLED]' + +[[Repls]] +Old = '(\n>>> errcode [^\n]+\n)\nExit code:' +New = """${1}[PROCESS_KILLED] + +Exit code:""" + +[[Repls]] +Old = 'Exit code: (137|1)' +New = 'Exit code: [KILLED]' + +[[Repls]] +Old = "\r" +New = '' + +[[Repls]] +Old = '"lineage":\s*"[0-9a-f-]+"' +New = '"lineage": "[UUID]"' + +[[Repls]] +Old = '"serial":\s*\d+' +New = '"serial": [SERIAL]' + +[[Repls]] +Old = '"__id__":\s*"\d+"' +New = '"__id__": "[ID]"' + +[[Repls]] +Old = '"job_id":\s*"\d+"' +New = '"job_id": "[ID]"' diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml new file mode 100644 index 0000000000..457a2d3e96 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml @@ -0,0 +1,15 @@ +bundle: + name: wal-delete-test + +resources: + jobs: + test_job: + name: "test-job" + tasks: + - task_key: "test-task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: 
i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt new file mode 100644 index 0000000000..8f52732d3e --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -0,0 +1,21 @@ +=== Creating state directory === +=== Creating state file (job exists) === +=== Creating WAL with delete entry (simulating crash during delete) === +=== WAL content === +{"lineage":"delete-test-lineage","serial": [SERIAL]} +{"k":"resources.jobs.test_job","v":null} +=== Updating config to remove job === +=== Deploy (should recover delete from WAL) === + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+=== Final state (should have no jobs) === +{ + "serial": [SERIAL], + "state_keys": [] +} +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/script b/acceptance/bundle/deploy/wal/wal-with-delete/script new file mode 100644 index 0000000000..f840355267 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/script @@ -0,0 +1,48 @@ +echo "=== Creating state directory ===" +mkdir -p .databricks/bundle/default + +echo "=== Creating state file (job exists) ===" +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "delete-test-lineage", + "serial": 1, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} +EOF + +echo "=== Creating WAL with delete entry (simulating crash during delete) ===" +cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"delete-test-lineage","serial":2} +{"k":"resources.jobs.test_job","v":null} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Updating config to remove job ===" +cat > databricks.yml << 'EOF' +bundle: + name: wal-delete-test + +resources: {} +EOF + +echo "=== Deploy (should recover delete from WAL) ===" +trace $CLI bundle deploy + +echo "=== Final state (should have no jobs) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' + +echo "=== WAL after successful deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected)" +fi diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/test.py b/acceptance/bundle/deploy/wal/wal-with-delete/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/test.py @@ -0,0 +1 @@ +print("test") diff --git 
a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml new file mode 100644 index 0000000000..27045f8885 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml @@ -0,0 +1,5 @@ +# WAL recovery after crash during delete operation. +# Delete was recorded in WAL but not finalized. Deploy should complete the delete. + +# No server stubs needed - the delete was already done (recorded in WAL) +# and the job no longer needs API calls diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index ed5cbbc07b..08d849d14c 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -62,7 +62,7 @@ type BindResult struct { func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root, statePath, resourceKey, resourceID string) (*BindResult, error) { // Check if the resource is already managed (bound to a different ID) var checkStateDB dstate.DeploymentState - if err := checkStateDB.Open(statePath); err == nil { + if err := checkStateDB.Open(ctx, statePath); err == nil { if existingID := checkStateDB.GetResourceID(resourceKey); existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -82,7 +82,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Open temp state - err := b.StateDB.Open(tmpStatePath) + err := b.StateDB.Open(ctx, tmpStatePath) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -188,7 +188,7 @@ func (result *BindResult) Cancel() { // Unbind removes a resource from direct engine state without deleting // the workspace resource. Also removes associated permissions/grants entries. 
func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey string) error { - err := b.StateDB.Open(statePath) + err := b.StateDB.Open(ctx, statePath) if err != nil { return err } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index a7f3ee65fc..aec6e7cc52 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -21,7 +21,12 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } if len(plan.Plan) == 0 { - // Avoid creating state file if nothing to deploy + // Still need to finalize if WAL recovery happened to commit the recovered state + if b.StateDB.RecoveredFromWAL() { + if err := b.StateDB.Finalize(); err != nil { + logdiag.LogError(ctx, err) + } + } return } diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index f6bcea316c..1fb70123b9 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -40,7 +40,7 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { // ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. // This should be called early in the deployment process, before any file operations. // If the plan has no lineage (first deployment), validation is skipped. 
-func ValidatePlanAgainstState(stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { +func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { // If plan has no lineage, this is a first deployment before any state exists // No validation needed if plan.Lineage == "" { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 3f6bcce2fc..9113021c8f 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -14,15 +14,18 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/log" "github.com/google/uuid" ) const currentStateVersion = 2 type DeploymentState struct { - Path string - Data Database - mu sync.Mutex + Path string + Data Database + mu sync.Mutex + wal *WAL + recoveredFromWAL bool } type Database struct { @@ -63,12 +66,22 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d return err } - db.Data.State[key] = ResourceEntry{ + entry := ResourceEntry{ ID: newID, State: json.RawMessage(jsonMessage), DependsOn: dependsOn, } + // Write to WAL before updating memory + if err := db.ensureWALOpen(); err != nil { + return fmt.Errorf("failed to open WAL: %w", err) + } + if err := db.wal.writeEntry(key, &entry); err != nil { + return fmt.Errorf("failed to write WAL entry: %w", err) + } + + db.Data.State[key] = entry + return nil } @@ -81,11 +94,50 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } + // Write to WAL before updating memory (nil entry means delete) + if err := db.ensureWALOpen(); err != nil { + return fmt.Errorf("failed to open WAL: %w", err) + } + if err := db.wal.writeEntry(key, nil); err != nil { + return fmt.Errorf("failed to write WAL entry: %w", err) + } + delete(db.Data.State, key) return nil } +// ensureWALOpen opens the WAL file and writes the 
header if not already done. +// Must be called while holding db.mu. +func (db *DeploymentState) ensureWALOpen() error { + if db.wal != nil { + return nil + } + + wal, err := openWAL(db.Path) + if err != nil { + return err + } + + // Generate lineage if this is a fresh deployment + lineage := db.Data.Lineage + if lineage == "" { + lineage = uuid.New().String() + db.Data.Lineage = lineage + } + + // WAL serial is the NEXT serial (current + 1) + walSerial := db.Data.Serial + 1 + + if err := wal.writeHeader(lineage, walSerial); err != nil { + wal.close() + return err + } + + db.wal = wal + return nil +} + func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { db.AssertOpened() db.mu.Lock() @@ -110,7 +162,7 @@ func (db *DeploymentState) GetResourceID(key string) string { return entry.ID } -func (db *DeploymentState) Open(path string) error { +func (db *DeploymentState) Open(ctx context.Context, path string) error { db.mu.Lock() defer db.mu.Unlock() @@ -124,21 +176,39 @@ func (db *DeploymentState) Open(path string) error { // Create new database with serial=0, will be incremented to 1 in Finalize() db.Data = NewDatabase("", 0) db.Path = path - return nil + + // Write state file immediately to ensure it exists before any WAL operations. + // This guarantees we have a base state file for recovery validation. 
+ if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return fmt.Errorf("failed to create state directory: %w", err) + } + if err := db.unlockedSave(); err != nil { + return err + } + } else { + return err } - return err + } else { + err = json.Unmarshal(data, &db.Data) + if err != nil { + return err + } + db.Path = path } - err = json.Unmarshal(data, &db.Data) + // Attempt WAL recovery + recovered, err := recoverFromWAL(path, &db.Data) if err != nil { - return err + return fmt.Errorf("WAL recovery failed: %w", err) + } + if recovered { + log.Infof(ctx, "Recovered deployment state from WAL") + db.recoveredFromWAL = true } if err := migrateState(&db.Data); err != nil { return fmt.Errorf("migrating state %s: %w", path, err) } - - db.Path = path return nil } @@ -146,14 +216,33 @@ func (db *DeploymentState) Finalize() error { db.mu.Lock() defer db.mu.Unlock() - // Generate lineage on first save + // Generate lineage on first save (if WAL wasn't opened) if db.Data.Lineage == "" { db.Data.Lineage = uuid.New().String() } db.Data.Serial++ - return db.unlockedSave() + err := db.unlockedSave() + if err != nil { + return err + } + + // Truncate WAL after successful state file write + if db.wal != nil { + if err := db.wal.truncate(); err != nil { + return fmt.Errorf("failed to truncate WAL: %w", err) + } + db.wal = nil + } else { + // No WAL was opened, but we should still clean up any stale WAL file + wp := walPath(db.Path) + if err := os.Remove(wp); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("failed to remove stale WAL file: %w", err) + } + } + + return nil } func (db *DeploymentState) AssertOpened() { @@ -162,6 +251,12 @@ func (db *DeploymentState) AssertOpened() { } } +// RecoveredFromWAL returns true if state was recovered from WAL during Open(). +// This is used to determine if Finalize() should be called even with an empty plan. 
+func (db *DeploymentState) RecoveredFromWAL() bool { + return db.recoveredFromWAL +} + func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { result := make(resourcestate.ExportedResourcesMap) for key, entry := range db.Data.State { diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go new file mode 100644 index 0000000000..700bfa24e2 --- /dev/null +++ b/bundle/direct/dstate/wal.go @@ -0,0 +1,218 @@ +package dstate + +import ( + "bufio" + "encoding/json" + "errors" + "fmt" + "os" +) + +// WALHeader is the first entry in the WAL file, containing metadata for validation. +type WALHeader struct { + Lineage string `json:"lineage"` + Serial int `json:"serial"` +} + +// WALEntry represents a single state mutation in the WAL. +// For set operations, V is populated. For delete operations, V is nil. +type WALEntry struct { + K string `json:"k"` + V *ResourceEntry `json:"v,omitempty"` +} + +// WAL manages the Write-Ahead Log for deployment state recovery. +type WAL struct { + path string + file *os.File +} + +// walPath returns the WAL file path for a given state file path. +func walPath(statePath string) string { + return statePath + ".wal" +} + +// openWAL opens or creates a WAL file for writing. +func openWAL(statePath string) (*WAL, error) { + wp := walPath(statePath) + f, err := os.OpenFile(wp, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o600) + if err != nil { + return nil, fmt.Errorf("failed to open WAL file %q: %w", wp, err) + } + return &WAL{path: wp, file: f}, nil +} + +// writeHeader writes the WAL header (lineage and serial) as the first entry. +func (w *WAL) writeHeader(lineage string, serial int) error { + header := WALHeader{ + Lineage: lineage, + Serial: serial, + } + return w.writeJSON(header) +} + +// writeEntry appends a state mutation entry to the WAL. 
+func (w *WAL) writeEntry(key string, entry *ResourceEntry) error {
+	// A nil entry is marshaled without the "v" field (omitempty on WALEntry.V);
+	// recoverFromWAL replays such a line as a delete of key.
+	walEntry := WALEntry{
+		K: key,
+		V: entry,
+	}
+	return w.writeJSON(walEntry)
+}
+
+// writeJSON marshals and writes a JSON object as a single line, then syncs to disk.
+//
+// The sync runs on every call, so each header or entry is flushed before the
+// caller continues. Marshal, write and sync failures are wrapped with distinct
+// messages and returned; nothing is retried.
+func (w *WAL) writeJSON(v any) error {
+	data, err := json.Marshal(v)
+	if err != nil {
+		return fmt.Errorf("failed to marshal WAL entry: %w", err)
+	}
+	data = append(data, '\n') // newline-terminated: readWAL parses the file line by line
+
+	_, err = w.file.Write(data)
+	if err != nil {
+		return fmt.Errorf("failed to write WAL entry: %w", err)
+	}
+
+	err = w.file.Sync()
+	if err != nil {
+		return fmt.Errorf("failed to sync WAL file: %w", err)
+	}
+
+	return nil
+}
+
+// close closes the WAL file handle.
+// It returns nil when no file is open. w.file is not reset to nil here, so the
+// handle stays set (but closed) afterwards.
+func (w *WAL) close() error {
+	if w.file != nil {
+		return w.file.Close()
+	}
+	return nil
+}
+
+// truncate deletes the WAL file after successful finalization.
+// Despite the name the file is removed, not shortened. An open handle is closed
+// first (its Close error is ignored) and cleared; a file that is already gone
+// is not treated as an error.
+func (w *WAL) truncate() error {
+	if w.file != nil {
+		w.file.Close()
+		w.file = nil
+	}
+	err := os.Remove(w.path)
+	if err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("failed to remove WAL file %q: %w", w.path, err)
+	}
+	return nil
+}
+
+// readWAL reads and parses an existing WAL file for recovery.
+// Returns the header and entries, or an error if the WAL is invalid.
+//
+// The first non-empty line must be a valid JSON header. Later lines that are
+// not valid JSON, or that carry an empty key, are skipped so that a partially
+// written trailing line does not prevent recovery of the earlier entries.
+// The error from opening the file is returned unwrapped.
+func readWAL(statePath string) (*WALHeader, []WALEntry, error) {
+	// Upper bound for a single WAL line. Entries embed a full snapshot of the
+	// resource state and can be far larger than bufio.Scanner's default limit
+	// (bufio.MaxScanTokenSize, 64 KiB). With the default limit an oversized
+	// entry makes Scan stop with bufio.ErrTooLong, the whole read fails, and
+	// the caller then discards the WAL, losing the recovery data.
+	const maxWALLineSize = 64 * 1024 * 1024
+
+	wp := walPath(statePath)
+	f, err := os.Open(wp)
+	if err != nil {
+		return nil, nil, err
+	}
+	defer f.Close()
+
+	scanner := bufio.NewScanner(f)
+	// Start with the default buffer size and let the scanner grow it on demand
+	// up to maxWALLineSize; lines beyond that still fail with bufio.ErrTooLong.
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxWALLineSize)
+	var header *WALHeader
+	var entries []WALEntry
+	lineNum := 0
+
+	for scanner.Scan() {
+		lineNum++
+		line := scanner.Bytes()
+		if len(line) == 0 {
+			continue
+		}
+
+		if header == nil {
+			// First line must be the header
+			var h WALHeader
+			if err := json.Unmarshal(line, &h); err != nil {
+				return nil, nil, fmt.Errorf("WAL line %d: failed to parse header: %w", lineNum, err)
+			}
+			header = &h
+		} else {
+			// Subsequent lines are entries
+			var e WALEntry
+			if err := json.Unmarshal(line, &e); err != nil {
+				// Skip corrupted lines silently - this is expected for partial writes
+				continue
+			}
+			if e.K == "" {
+				// Skip entries with empty keys
+				continue
+			}
+			entries = append(entries, e)
+		}
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, nil, fmt.Errorf("failed to read WAL file: %w", err)
+	}
+
+	if header == nil {
+		return nil, nil, errors.New("WAL file is empty or missing header")
+	}
+
+	return header, entries, nil
+}
+
+// recoverFromWAL attempts to recover state from an existing WAL file.
+// It validates the WAL against the current state and replays valid entries.
+// Returns true if recovery was performed, false if no recovery needed.
+//
+// Outcomes, in the order they are checked:
+//   - no WAL file: (false, nil)
+//   - readWAL fails (unreadable file, bad or missing header): the WAL is
+//     removed and (false, nil) is returned
+//   - header serial < db.Serial+1: the WAL is removed as stale, (false, nil)
+//   - header serial > db.Serial+1: error, the WAL is left in place
+//   - both lineages are set and differ: error, the WAL is left in place
+//   - otherwise entries are replayed into db.State in file order and
+//     (true, nil) is returned, also when the WAL holds a header but no entries
+//
+// db is modified in place: Lineage may be adopted from the header and State is
+// allocated if it was nil.
+func recoverFromWAL(statePath string, db *Database) (bool, error) {
+	wp := walPath(statePath)
+
+	// Check if WAL exists
+	// NOTE(review): only "not exist" short-circuits here; any other Stat error
+	// falls through to readWAL below.
+	if _, err := os.Stat(wp); os.IsNotExist(err) {
+		return false, nil
+	}
+
+	header, entries, err := readWAL(statePath)
+	if err != nil {
+		// If we can't read the WAL at all, delete it and proceed
+		// NOTE(review): the read error and the os.Remove result are both dropped,
+		// so the caller cannot tell that a WAL was discarded.
+		os.Remove(wp)
+		return false, nil
+	}
+
+	// Validate WAL serial against state serial
+	expectedSerial := db.Serial + 1
+	if header.Serial < expectedSerial {
+		// Stale WAL - delete and proceed without recovery
+		os.Remove(wp)
+		return false, nil
+	}
+
+	if header.Serial > expectedSerial {
+		// WAL is ahead of state - this indicates corruption
+		return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial)
+	}
+
+	// Validate lineage if both exist
+	if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage {
+		return false, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage)
+	}
+
+	// Adopt lineage from WAL if state doesn't have one
+	if db.Lineage == "" && header.Lineage != "" {
+		db.Lineage = header.Lineage
+	}
+
+	// Initialize state map if needed
+	if db.State == nil {
+		db.State = make(map[string]ResourceEntry)
+	}
+
+	// Replay entries
+	// Later lines win: a set followed by a delete of the same key leaves the key absent.
+	for _, entry := range entries {
+		if entry.V != nil {
+			// Set operation
+			db.State[entry.K] = *entry.V
+		} else {
+			// Delete operation
+			delete(db.State, entry.K)
+		}
+	}
+
+	return true, nil
+}
diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go
new file mode 100644
index 0000000000..e475a92e9d
--- /dev/null
+++ b/bundle/direct/dstate/wal_test.go
@@ -0,0 +1,419 @@
+package dstate
+
+import (
+	"context"
+	"encoding/json"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/databricks/cli/bundle/deployplan"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestWALPath(t *testing.T) {
+	assert.Equal(t,
"/path/to/state.json.wal", walPath("/path/to/state.json")) +} + +func TestWALWriteAndRead(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Open WAL for writing + wal, err := openWAL(statePath) + require.NoError(t, err) + + // Write header + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + // Write entries + entry1 := &ResourceEntry{ + ID: "12345", + State: json.RawMessage(`{"name":"job1"}`), + } + err = wal.writeEntry("resources.jobs.job1", entry1) + require.NoError(t, err) + + entry2 := &ResourceEntry{ + ID: "67890", + State: json.RawMessage(`{"name":"job2"}`), + } + err = wal.writeEntry("resources.jobs.job2", entry2) + require.NoError(t, err) + + // Write a delete entry (nil value) + err = wal.writeEntry("resources.jobs.old_job", nil) + require.NoError(t, err) + + err = wal.close() + require.NoError(t, err) + + // Read WAL back + header, entries, err := readWAL(statePath) + require.NoError(t, err) + + assert.Equal(t, "test-lineage", header.Lineage) + assert.Equal(t, 1, header.Serial) + + require.Len(t, entries, 3) + + assert.Equal(t, "resources.jobs.job1", entries[0].K) + require.NotNil(t, entries[0].V) + assert.Equal(t, "12345", entries[0].V.ID) + + assert.Equal(t, "resources.jobs.job2", entries[1].K) + require.NotNil(t, entries[1].V) + assert.Equal(t, "67890", entries[1].V.ID) + + assert.Equal(t, "resources.jobs.old_job", entries[2].K) + assert.Nil(t, entries[2].V) +} + +func TestWALTruncate(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Create WAL file + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + // Verify file exists + _, err = os.Stat(walFilePath) + require.NoError(t, err) + + // Truncate + err = wal.truncate() + require.NoError(t, err) + + // Verify file is removed + _, err = os.Stat(walFilePath) + assert.True(t, 
os.IsNotExist(err)) +} + +func TestRecoverFromWAL_NoWAL(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + db := NewDatabase("", 0) + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.False(t, recovered) +} + +func TestRecoverFromWAL_ValidWAL(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Create WAL with serial = 1 (expecting state serial 0 + 1) + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + + entry := &ResourceEntry{ + ID: "12345", + State: json.RawMessage(`{"name":"job1"}`), + } + err = wal.writeEntry("resources.jobs.job1", entry) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Create database with serial 0 + db := NewDatabase("", 0) + + // Recover + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + + // Verify state was recovered + assert.Equal(t, "test-lineage", db.Lineage) + require.Contains(t, db.State, "resources.jobs.job1") + assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) +} + +func TestRecoverFromWAL_StaleWAL(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Create WAL with serial = 1 + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Create database with serial 2 (WAL is stale) + db := NewDatabase("test-lineage", 2) + + // Recover - should skip and delete WAL + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.False(t, recovered) + + // WAL should be deleted + _, err = os.Stat(walFilePath) + assert.True(t, os.IsNotExist(err)) +} + +func TestRecoverFromWAL_FutureWAL(t *testing.T) { + dir := t.TempDir() + 
statePath := filepath.Join(dir, "resources.json")
+
+	// Create WAL with serial = 5
+	wal, err := openWAL(statePath)
+	require.NoError(t, err)
+	err = wal.writeHeader("test-lineage", 5)
+	require.NoError(t, err)
+	err = wal.close()
+	require.NoError(t, err)
+
+	// Create database with serial 0 (WAL is from future - corrupted state)
+	db := NewDatabase("test-lineage", 0)
+
+	// Recover - should fail
+	_, err = recoverFromWAL(statePath, &db)
+	require.Error(t, err) // require, not assert: err.Error() below would panic on a nil error
+	assert.Contains(t, err.Error(), "WAL serial (5) is ahead of expected (1)")
+}
+
+// TestRecoverFromWAL_LineageMismatch checks that a WAL whose header lineage
+// differs from a non-empty state lineage is rejected with an error.
+func TestRecoverFromWAL_LineageMismatch(t *testing.T) {
+	dir := t.TempDir()
+	statePath := filepath.Join(dir, "resources.json")
+
+	// Create WAL with lineage A
+	wal, err := openWAL(statePath)
+	require.NoError(t, err)
+	err = wal.writeHeader("lineage-A", 1)
+	require.NoError(t, err)
+	err = wal.close()
+	require.NoError(t, err)
+
+	// Create database with lineage B
+	db := NewDatabase("lineage-B", 0)
+
+	// Recover - should fail
+	_, err = recoverFromWAL(statePath, &db)
+	require.Error(t, err) // require, not assert: err.Error() below would panic on a nil error
+	assert.Contains(t, err.Error(), "lineage")
+}
+
+func TestRecoverFromWAL_DeleteOperation(t *testing.T) {
+	dir := t.TempDir()
+	statePath := filepath.Join(dir, "resources.json")
+
+	// Create WAL with delete operation
+	wal, err := openWAL(statePath)
+	require.NoError(t, err)
+	err = wal.writeHeader("test-lineage", 1)
+	require.NoError(t, err)
+
+	// Add an entry
+	entry := &ResourceEntry{
+		ID:    "12345",
+		State: json.RawMessage(`{"name":"job1"}`),
+	}
+	err = wal.writeEntry("resources.jobs.job1", entry)
+	require.NoError(t, err)
+
+	// Delete the entry
+	err = wal.writeEntry("resources.jobs.job1", nil)
+	require.NoError(t, err)
+
+	err = wal.close()
+	require.NoError(t, err)
+
+	// Create database
+	db := NewDatabase("", 0)
+
+	// Recover
+	recovered, err := recoverFromWAL(statePath, &db)
+	require.NoError(t, err)
+	assert.True(t, recovered)
+
+	// Entry should NOT be present (deleted)
+	assert.NotContains(t, db.State,
"resources.jobs.job1") +} + +func TestDeploymentState_WALIntegration(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Create deployment state + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + // Save some state + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) + require.NoError(t, err) + + // WAL should exist + _, err = os.Stat(walFilePath) + require.NoError(t, err) + + // Read WAL to verify content + header, entries, err := readWAL(statePath) + require.NoError(t, err) + assert.Equal(t, 1, header.Serial) // serial + 1 + require.Len(t, entries, 1) + assert.Equal(t, "resources.jobs.job1", entries[0].K) + assert.Equal(t, "12345", entries[0].V.ID) + + // Finalize + err = db.Finalize() + require.NoError(t, err) + + // WAL should be deleted + _, err = os.Stat(walFilePath) + assert.True(t, os.IsNotExist(err)) + + // State file should exist with correct serial + data, err := os.ReadFile(statePath) + require.NoError(t, err) + var savedDB Database + err = json.Unmarshal(data, &savedDB) + require.NoError(t, err) + assert.Equal(t, 1, savedDB.Serial) + assert.Contains(t, savedDB.State, "resources.jobs.job1") +} + +func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + // Create initial state file + initialDB := NewDatabase("test-lineage", 5) + initialDB.State["resources.jobs.existing"] = ResourceEntry{ + ID: "existing-id", + State: json.RawMessage(`{"name":"existing"}`), + } + data, err := json.Marshal(initialDB) + require.NoError(t, err) + err = os.WriteFile(statePath, data, 0o600) + require.NoError(t, err) + + // Create WAL with serial 6 (5 + 1) + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 6) + require.NoError(t, err) + 
entry := &ResourceEntry{ + ID: "new-id", + State: json.RawMessage(`{"name":"new"}`), + } + err = wal.writeEntry("resources.jobs.new", entry) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + // Open should recover from WAL + var db DeploymentState + err = db.Open(ctx, statePath) + require.NoError(t, err) + + // Both existing and new resources should be present + assert.Contains(t, db.Data.State, "resources.jobs.existing") + assert.Contains(t, db.Data.State, "resources.jobs.new") + assert.Equal(t, "new-id", db.Data.State["resources.jobs.new"].ID) +} + +func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + // Add a resource + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) + require.NoError(t, err) + + // Delete the resource + err = db.DeleteState("resources.jobs.job1") + require.NoError(t, err) + + // Read WAL to verify delete entry + _, entries, err := readWAL(statePath) + require.NoError(t, err) + + require.Len(t, entries, 2) + assert.Equal(t, "resources.jobs.job1", entries[1].K) + assert.Nil(t, entries[1].V) // nil means delete + + // Finalize + err = db.Finalize() + require.NoError(t, err) + + // State file should NOT contain the deleted resource + data, err := os.ReadFile(statePath) + require.NoError(t, err) + var savedDB Database + err = json.Unmarshal(data, &savedDB) + require.NoError(t, err) + assert.NotContains(t, savedDB.State, "resources.jobs.job1") +} + +func TestDeploymentState_WALWithDependsOn(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + dependsOn := []deployplan.DependsOnEntry{ + {Node: "resources.clusters.cluster1", Label: 
"${resources.clusters.cluster1.id}"}, + } + + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) + require.NoError(t, err) + + // Read WAL + _, entries, err := readWAL(statePath) + require.NoError(t, err) + + require.Len(t, entries, 1) + require.NotNil(t, entries[0].V) + require.Len(t, entries[0].V.DependsOn, 1) + assert.Equal(t, "resources.clusters.cluster1", entries[0].V.DependsOn[0].Node) +} + +func TestRecoverFromWAL_CorruptedLine(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + // Manually write WAL with corrupted line + content := `{"lineage":"test","serial":1} +{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} +not valid json +{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} +` + err := os.WriteFile(walFilePath, []byte(content), 0o600) + require.NoError(t, err) + + db := NewDatabase("", 0) + recovered, err := recoverFromWAL(statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + + // Should have recovered job1 and job2, skipping corrupted line + assert.Contains(t, db.State, "resources.jobs.job1") + assert.Contains(t, db.State, "resources.jobs.job2") +} + diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index a6f48d99fa..75081de56e 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -236,7 +236,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.ValidatePlanAgainstState(&b.DeploymentBundle.StateDB, plan) + err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) return b, stateDesc, root.ErrAlreadyPrinted diff --git a/wal.txt b/wal.txt new file mode 100644 index 0000000000..d365ed56d8 --- /dev/null +++ b/wal.txt @@ -0,0 
+1,205 @@ +Design Document: Write-Ahead Log (WAL) for Bundle Deployment State Recovery +1. Problem Statement +When databricks bundle deploy is interrupted, resources created before the interruption become orphaned. The CLI only writes the state file at the end of deployment via Finalize(). Any resources created mid-deployment are lost from tracking. + +Current behavior: +Deploy starts → Create Job A → Create Job B → [CRASH] → State file empty → Jobs A, B orphaned + +Impact: Orphaned resources exist in Databricks but are unknown to future deployments. Users accumulate duplicate resources, leading to confusion and unexpected costs. + +Scope: Direct deployment engine only. Terraform has its own state management. +2. Solution Overview +Implement a Write-Ahead Log (WAL) that records each state mutation to disk immediately after the corresponding API call succeeds. +On recovery, replay the WAL to restore partial deployment state. + +Proposed behavior: +Deploy starts → Create Job A → [WAL: A] → Create Job B → [WAL: A,B] → [CRASH] +Next deploy → Load state → Replay WAL → State has A,B → No duplicates +3. Detailed Design +3.1 File Structure +The WAL is stored locally alongside the existing state file. + +File Path +Description +~/.databricks/bundle/// +Root directory for the bundle's state data. +~/.databricks/bundle///resources.json +The committed state file (existing). +~/.databricks/bundle///resources.json.wal +The Write-Ahead Log file (new). + +3.2 WAL Entry Format +Each entry is a JSON object written as a single line (NDJSON format). The entry embeds the existing ResourceEntry structure for consistency with the state file. + +Field +Type +Description +Lineage (First Entry Only) +String +UUID matching the state file's lineage (for validation). +Serial (First Entry Only) +Integer +Deployment serial number (for validation). +k (2nd Entry Onwards) +String +Resource key (e.g., resources.jobs.my_job). +v (2nd Entry Onwards) +ResourceEntry +The state entry. 
Omitted for delete operations. + + +ResourceEntry structure (existing, reused): + +Field +Type +Description +__id__ +String +The unique ID assigned by the Databricks API. +state +Object +Full snapshot of the resource configuration. + + +Example WAL: +{"lineage":"abc-123","serial":1} +{"k":"resources.jobs.my_job","v":{"__id__":"1234567","state":{...}}} +{"k":"resources.jobs.old_job"} // no v means delete op +3.3 WAL Lifecycle +Phase +Action +Open +Create or open resources.json.wal. +Write +Append entry after each successful API call. +Truncate +Delete resources.json.wal after successful Finalize(). + + +Durability: Each entry must be flushed to disk (fsync) immediately after the successful API response before proceeding. +Known Limitation: There is a small window (~microseconds) between API success and WAL write where a crash would orphan the resource. This is unavoidable and acceptable. +3.4 Recovery Mechanism +Recovery occurs at the start of deployment if the WAL file exists. + +Check: If resources.json.wal exists, initiate recovery. +Load Base State: +If resources.json exists: load it (provides lineage and serial). We are making sure it exists by writing immediately once we open/create it in the Open() method. +Otherwise: create fresh state with new lineage. +Read WAL: Parse all entries from resources.json.wal (already chronologically ordered). +Validate Entries: +WAL serial == state serial + 1: Valid — replay entries. +WAL serial < state serial + 1: Stale WAL — delete WAL file, proceed without recovery. +WAL serial > state serial + 1: Corrupted state — return error. +Replay: For each valid entry: +set: Add or overwrite the resource in memory. +delete: Remove the resource from memory. +Proceed: Use the resulting state as the starting point for deployment. +Finalize: On success, write resources.json and delete resources.json.wal. +3.5 Integration Points +Action +Location +Detail +Recovery Check +Open() in dstate/state.go +Check for the WAL file and replay before proceeding.
+Write WAL Entry +SaveState() / DeleteState() +Append entry before updating memory. +Truncation +Finalize() +Delete WAL after successful state file write. + +3.6 Error Handling +Scenario +Behavior +WAL write fails +Return error, abort deployment. +Corrupted WAL line +Log warning, skip line, continue replay. +Lineage mismatch +Return error, abort deployment. +Stale serial +Delete WAL, proceed without recovery. + +5. Testing Plan +Use acceptance tests. Add support for crashing the caller process from the test server. +Key test cases: +Tests which compile and run the real binary against testserver. + +Normal deploy — WAL created, used, deleted. +Crash after 1 resource — recovery works. +Fresh deploy with existing WAL — lineage adopted. +Stale WAL (old serial) — entries skipped. +Corrupted WAL line — skipped, rest recovered. +Bundle summary works after interrupted deploy and sees ids stored in WAL. +7. Open Questions +# +Question +Proposed Answer +1 +Should WAL be pushed to remote? +Never + +5. Test Plan + +We should use acceptance tests which compile and run the real binary against testserver. + +5.1 Unit Tests - WAL File Operations +| Test ID | Description | Expected Behavior | +|---------|-------------|-------------------| +| U01 | WAL path generation | walPath("resources.json") returns "resources.json.wal" | +| U02 | Write and read WAL | Header + entries written and read back correctly | +| U03 | Truncate WAL | File deleted from disk | +| U04 | Truncate non-existent WAL | No error returned | +| U05 | Read empty WAL | Returns error "WAL file is empty or missing header" | + +5.2 Unit Tests - WAL Recovery Logic +| Test ID | Description | Expected Behavior | +|---------|-------------|-------------------| +| R01 | No WAL exists | recoverFromWAL returns (false, nil) | +| R02 | Valid WAL (serial = state+1) | Entries replayed, returns (true, nil) | +| R03 | Stale WAL (serial < state+1) | WAL deleted, returns (false, nil) | +| R04 | Future WAL (serial > state+1) | Returns error about corruption | +| R05 | Lineage
mismatch | Returns error about lineage mismatch | +| R06 | Lineage adopted from WAL | If state has no lineage, WAL lineage is used | +| R07 | Delete operation replay | Entry removed from state map | +| R08 | Corrupted entry line | Skipped, other entries recovered | + +5.3 Unit Tests - Integration with DeploymentState +| Test ID | Description | Expected Behavior | +|---------|-------------|-------------------| +| I01 | SaveState/DeleteState/Finalize flow | WAL created on first SaveState, entries written, truncated on Finalize, serial incremented | +| I02 | Finalize cleans stale WAL | If WAL file exists but wasn't opened this session, delete it | +| I03 | Open with existing WAL | Recovery performed before return | +| I04 | SaveState with DependsOn | DependsOn preserved in WAL entry | + +5.4 Acceptance Tests +| Test ID | Description | Steps | Expected Behavior | +|---------|-------------|-------|-------------------| +| A01 | Normal deploy | Deploy bundle with 2 resources | WAL created during deploy, deleted after Finalize | +| A02 | Crash recovery | 1. Deploy, crash after resource A created 2. Redeploy | Resource A recovered from WAL, resource B created, no duplicates | +| A03 | Bundle summary after crash | 1. Deploy, crash mid-deploy 2. 
Run bundle summary | Shows resources from WAL with correct IDs | + +5.5 Tests Implemented in wal_test.go +- TestWALPath (U01) +- TestWALWriteAndRead (U02) +- TestWALTruncate (U03, U04) +- TestRecoverFromWAL_NoWAL (R01) +- TestRecoverFromWAL_ValidWAL (R02) +- TestRecoverFromWAL_StaleWAL (R03) +- TestRecoverFromWAL_FutureWAL (R04) +- TestRecoverFromWAL_LineageMismatch (R05) +- TestRecoverFromWAL_DeleteOperation (R07) +- TestRecoverFromWAL_CorruptedLine (R08) +- TestDeploymentState_WALIntegration (I01) +- TestDeploymentState_WALRecoveryOnOpen (I03) +- TestDeploymentState_DeleteStateWritesWAL (I01) +- TestDeploymentState_WALWithDependsOn (I04) + +5.6 Tests Still Needed +| Test ID | Description | Priority | +|---------|-------------|----------| +| R06 | TestRecoverFromWAL_LineageAdoption (fresh state adopts WAL lineage) | High | +| I02 | TestDeploymentState_FinalizeCleansStaleWAL | Medium | +| U05 | TestReadEmptyWAL | Low | +| A01-A03 | Acceptance tests (require crash simulation infrastructure) | High | From e7da9d9bc46359bb79b96701939c52c33dd76f2e Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 12 Jan 2026 21:32:47 +0530 Subject: [PATCH 02/85] Updated tests and enhanced kill caller with an offset Signed-off-by: Varun Deep Saini --- .../deploy/wal/chain-10-jobs/databricks.yml | 117 ++++++++++ .../deploy/wal/chain-10-jobs/out.test.toml | 5 + .../deploy/wal/chain-10-jobs/output.txt | 73 +++++++ .../bundle/deploy/wal/chain-10-jobs/script | 22 ++ .../bundle/deploy/wal/chain-10-jobs/test.py | 1 + .../bundle/deploy/wal/chain-10-jobs/test.toml | 17 ++ .../deploy/wal/corrupted-wal-entry/output.txt | 38 +--- .../deploy/wal/corrupted-wal-entry/script | 10 +- .../wal/corrupted-wal-entry/sort_warnings.py | 87 -------- .../deploy/wal/corrupted-wal-entry/test.toml | 3 +- .../wal/corrupted-wal-middle/databricks.yml | 25 +++ .../wal/corrupted-wal-middle/out.test.toml | 5 + .../wal/corrupted-wal-middle/output.txt | 25 +++ .../deploy/wal/corrupted-wal-middle/script | 37 ++++ 
.../deploy/wal/corrupted-wal-middle/test.py | 1 + .../deploy/wal/corrupted-wal-middle/test.toml | 13 ++ .../wal/crash-after-create/databricks.yml | 12 + .../deploy/wal/crash-after-create/output.txt | 42 ++-- .../deploy/wal/crash-after-create/script | 26 ++- .../deploy/wal/crash-after-create/test.toml | 13 +- .../bundle/deploy/wal/empty-wal/output.txt | 17 +- .../deploy/wal/future-serial-wal/output.txt | 16 -- .../deploy/wal/lineage-mismatch/output.txt | 16 -- .../wal/multiple-crashes/databricks.yml | 18 +- .../deploy/wal/multiple-crashes/output.txt | 57 ++--- .../bundle/deploy/wal/multiple-crashes/script | 25 +-- .../deploy/wal/multiple-crashes/test.toml | 11 +- .../deploy/wal/normal-deploy/output.txt | 16 -- .../bundle/deploy/wal/stale-wal/output.txt | 16 -- .../wal/summary-after-crash/databricks.yml | 18 +- .../deploy/wal/summary-after-crash/output.txt | 44 ++-- .../deploy/wal/summary-after-crash/script | 26 ++- .../deploy/wal/summary-after-crash/test.toml | 16 +- acceptance/bundle/deploy/wal/test.toml | 5 + .../deploy/wal/wal-with-delete/test.toml | 4 +- acceptance/internal/config.go | 6 + acceptance/internal/prepare_server.go | 17 +- .../selftest/kill_caller/offset/out.test.toml | 5 + .../selftest/kill_caller/offset/output.txt | 33 +++ acceptance/selftest/kill_caller/offset/script | 17 ++ .../selftest/kill_caller/offset/test.toml | 11 + bundle/direct/dstate/state.go | 9 +- bundle/direct/dstate/wal.go | 108 ++++----- bundle/direct/dstate/wal_test.go | 206 ++++++++++++------ wal.txt | 205 ----------------- 45 files changed, 811 insertions(+), 683 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/output.txt create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/script create mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/test.py create mode 100644 
acceptance/bundle/deploy/wal/chain-10-jobs/test.toml delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/script create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml create mode 100644 acceptance/selftest/kill_caller/offset/out.test.toml create mode 100644 acceptance/selftest/kill_caller/offset/output.txt create mode 100644 acceptance/selftest/kill_caller/offset/script create mode 100644 acceptance/selftest/kill_caller/offset/test.toml delete mode 100644 wal.txt diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml new file mode 100644 index 0000000000..2652cdbed6 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml @@ -0,0 +1,117 @@ +bundle: + name: wal-chain-test + +resources: + jobs: + # Linear chain: job_01 -> job_02 -> ... 
-> job_10 + # Execution order: job_01 first, job_10 last + job_01: + name: "job-01" + description: "first in chain" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_02: + name: "job-02" + description: "depends on ${resources.jobs.job_01.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_03: + name: "job-03" + description: "depends on ${resources.jobs.job_02.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_04: + name: "job-04" + description: "depends on ${resources.jobs.job_03.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_05: + name: "job-05" + description: "depends on ${resources.jobs.job_04.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_06: + name: "job-06" + description: "depends on ${resources.jobs.job_05.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_07: + name: "job-07" + description: "depends on ${resources.jobs.job_06.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_08: + name: "job-08" + description: "depends on ${resources.jobs.job_07.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + 
node_type_id: i3.xlarge + num_workers: 0 + job_09: + name: "job-09" + description: "depends on ${resources.jobs.job_08.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_10: + name: "job-10" + description: "depends on ${resources.jobs.job_09.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt new file mode 100644 index 0000000000..4c4d781c80 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -0,0 +1,73 @@ +=== First deploy (crashes on job_10) === + +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... 
+[PROCESS_KILLED] + +Exit code: [KILLED] + +=== WAL content after crash === +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}} +{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}} +{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}} +{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}} +{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}} +{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}} +{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}} +{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}} +{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}} + +=== Number of jobs saved in WAL === +9 + +=== Bundle summary (reads from WAL) === +Name: wal-chain-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default +Resources: + Jobs: + job_01: + Name: job-01 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_02: + Name: job-02 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_03: + Name: job-03 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_04: + Name: job-04 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_05: + Name: job-05 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_06: + Name: job-06 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_07: + Name: job-07 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_08: + Name: job-08 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_09: + Name: job-09 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_10: + Name: job-10 + URL: (not deployed) + +=== Second deploy (recovery) === + +>>> [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+ +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/script b/acceptance/bundle/deploy/wal/chain-10-jobs/script new file mode 100644 index 0000000000..6cf2dd32f0 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/script @@ -0,0 +1,22 @@ +echo "=== First deploy (crashes on job_10) ===" +trace errcode $CLI bundle deploy + +echo "" +echo "=== WAL content after crash ===" +cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file" + +echo "" +echo "=== Number of jobs saved in WAL ===" +grep -c '"k":"resources.jobs' .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "0" + +echo "" +echo "=== Bundle summary (reads from WAL) ===" +$CLI bundle summary + +echo "" +echo "=== Second deploy (recovery) ===" +trace $CLI bundle deploy --force-lock + +echo "" +echo "=== WAL after successful deploy ===" +cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)" diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.py b/acceptance/bundle/deploy/wal/chain-10-jobs/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml new file mode 100644 index 0000000000..c4308521be --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml @@ -0,0 +1,17 @@ +# Linear chain: job_01 -> job_02 -> ... 
-> job_10 +# Let first 9 jobs/create succeed, then kill on the 10th + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +KillCallerOffset = 9 +KillCaller = 1 +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "POST /api/2.2/jobs/reset" +Response.Body = '{}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index 1192629332..f5e7f346d8 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,45 +1,13 @@ === Creating state file with serial 5 === -=== Creating WAL with corrupted entry === +=== Creating WAL with corrupted LAST entry === === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} -not valid json - this line should be skipped {"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} -=== Deploy (should recover valid entries, skip corrupted) === +not valid json - corrupted last line (partial write from crash) +=== Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.another_valid.tasks[0].new_cluster - in databricks.yml:23:13 - -num_workers should be 0 only for single-node clusters. 
To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - -Warning: Single node cluster is not correctly configured - at resources.jobs.valid_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... Deploying resources... Updating deployment state... diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index d73595a6f4..fc36ed754f 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -10,19 +10,21 @@ cat > .databricks/bundle/default/resources.json << 'EOF' } EOF -echo "=== Creating WAL with corrupted entry ===" +echo "=== Creating WAL with corrupted LAST entry ===" +# Corrupted last line is expected (partial write from crash) and should be skipped. +# Valid entries before it should be recovered. 
cat > .databricks/bundle/default/resources.json.wal << 'EOF' {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} -not valid json - this line should be skipped {"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} +not valid json - corrupted last line (partial write from crash) EOF echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal -echo "=== Deploy (should recover valid entries, skip corrupted) ===" -trace $CLI bundle deploy 2>&1 | python3 sort_warnings.py +echo "=== Deploy (should recover valid entries, skip corrupted last line) ===" +trace $CLI bundle deploy 2>&1 echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py b/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py deleted file mode 100644 index 06a6a0e59c..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/sort_warnings.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 -"""Sort warning blocks in CLI output to make test output deterministic. - -Warning blocks look like: -Warning: Single node cluster is not correctly configured - at resources.jobs.XXX.tasks[0].new_cluster - in databricks.yml:NN:NN - -num_workers should be 0 only for single-node clusters... - spark_conf: - ... - custom_tags: - ... - -This script groups consecutive warning blocks, sorts them by job name, and outputs. 
-""" - -import re -import sys - - -def main(): - content = sys.stdin.read() - lines = content.split("\n") - - result = [] - i = 0 - - while i < len(lines): - line = lines[i] - - # Check if this is the start of a warning block - if line.startswith("Warning:"): - # Collect all consecutive warning blocks - warnings = [] - while i < len(lines) and ( - lines[i].startswith("Warning:") - or ( - warnings - and not lines[i].startswith("Uploading") - and not lines[i].startswith("Deploying") - and not lines[i].startswith(">>>") - and not lines[i].startswith("===") - ) - ): - # Collect one complete warning block - block = [] - if lines[i].startswith("Warning:"): - block.append(lines[i]) - i += 1 - # Collect until next Warning or end marker - while i < len(lines): - if lines[i].startswith("Warning:"): - break - if lines[i].startswith("Uploading") or lines[i].startswith("Deploying"): - break - if lines[i].startswith(">>>") or lines[i].startswith("==="): - break - block.append(lines[i]) - i += 1 - warnings.append(block) - else: - i += 1 - - # Sort warnings by the job name in "at resources.jobs.XXX" - def get_sort_key(block): - for line in block: - match = re.search(r"at resources\.jobs\.(\w+)", line) - if match: - return match.group(1) - return "" - - warnings.sort(key=get_sort_key) - - # Output sorted warnings - for block in warnings: - for line in block: - result.append(line) - else: - result.append(line) - i += 1 - - print("\n".join(result), end="") - - -if __name__ == "__main__": - main() diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 5bbe82835c..9c9ab5a30b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -1,4 +1,4 @@ -# WAL with corrupted entry - valid entries should be recovered, corrupted skipped. 
+# WAL with corrupted LAST entry - valid entries should be recovered, corrupted last line skipped. [[Server]] Pattern = "POST /api/2.2/jobs/reset" @@ -11,3 +11,4 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' + diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml new file mode 100644 index 0000000000..aef2c714ec --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml @@ -0,0 +1,25 @@ +bundle: + name: wal-corrupted-middle-test + +resources: + jobs: + job_one: + name: "job-one" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_two: + name: "job-two" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml new file mode 100644 index 0000000000..54146af564 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt new file mode 100644 index 0000000000..4396aade67 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -0,0 +1,25 @@ +=== Creating state file with serial 5 === +=== Creating WAL with corrupted MIDDLE entry === +=== WAL content === +{"lineage":"test-lineage-456","serial": [SERIAL]} +{"k":"resources.jobs.job_one","v":{"__id__": 
"[ID]","state":{"name":"job-one"}}} +not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.job_two","v":{"__id__": "[ID]","state":{"name":"job-two"}}} +=== Deploy (WAL should be deleted due to middle corruption) === + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... +Warn: Failed to read WAL file, deleting and proceeding: WAL line 3: corrupted entry in middle of WAL: invalid character 'o' in literal null (expecting 'u') +Deploying resources... +Updating deployment state... +Deployment complete! +=== Final state (fresh deploy, not recovered from WAL) === +{ + "serial": [SERIAL], + "state_keys": [ + "resources.jobs.job_one", + "resources.jobs.job_two" + ] +} +=== WAL after deploy === +WAL deleted (expected - due to middle corruption) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script new file mode 100644 index 0000000000..46dc1922d1 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script @@ -0,0 +1,37 @@ +echo "=== Creating state file with serial 5 ===" +mkdir -p .databricks/bundle/default +cat > .databricks/bundle/default/resources.json << 'EOF' +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-456", + "serial": 5, + "state": {} +} +EOF + +echo "=== Creating WAL with corrupted MIDDLE entry ===" +# Corruption in the middle is NOT expected (only last line can be partial write). +# This should cause WAL to be deleted entirely, no recovery. 
+cat > .databricks/bundle/default/resources.json.wal << 'EOF' +{"lineage":"test-lineage-456","serial":6} +{"k":"resources.jobs.job_one","v":{"__id__":"1111","state":{"name":"job-one"}}} +not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.job_two","v":{"__id__":"2222","state":{"name":"job-two"}}} +EOF + +echo "=== WAL content ===" +cat .databricks/bundle/default/resources.json.wal + +echo "=== Deploy (WAL should be deleted due to middle corruption) ===" +trace $CLI bundle deploy 2>&1 + +echo "=== Final state (fresh deploy, not recovered from WAL) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' + +echo "=== WAL after deploy ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (unexpected)" +else + echo "WAL deleted (expected - due to middle corruption)" +fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py new file mode 100644 index 0000000000..1ff8e07c70 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py @@ -0,0 +1 @@ +print("test") diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml new file mode 100644 index 0000000000..8aa40be8d7 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml @@ -0,0 +1,13 @@ +# WAL with corrupted MIDDLE entry - WAL should be deleted, no recovery. +# Corruption in the middle is unexpected (not a partial write from crash). +# The entire WAL is discarded and a fresh deploy happens. 
+ +# Since WAL is discarded, jobs will be created fresh (not recovered) +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 9999}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get?job_id=9999" +Response.Body = '{"job_id": 9999, "settings": {"name": "fresh-job"}}' + diff --git a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml index ebee1d9699..31480454c5 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml +++ b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml @@ -5,6 +5,7 @@ resources: jobs: job_a: name: "test-job-a" + description: "first job" tasks: - task_key: "task-a" spark_python_task: @@ -13,3 +14,14 @@ resources: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge num_workers: 0 + job_b: + name: "test-job-b" + description: "depends on ${resources.jobs.job_a.id}" + tasks: + - task_key: "task-b" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 9c33326382..9ab9f4cf9c 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -1,37 +1,33 @@ -=== Creating state directory === -=== Creating WAL file (simulating crash after job create) === -=== WAL content before deploy === -{"lineage":"test-lineage-123","serial": [SERIAL]} -{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"name":"test-job-a"}}} -=== Deploy (should recover from WAL) === +=== First deploy (crashes after job_a create, before job_b) === ->>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.job_a.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node 
clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... +Deploying resources... +[PROCESS_KILLED] - custom_tags: - ResourceClass: SingleNode - +Exit code: [KILLED] +=== WAL should exist after crash === +WAL exists (expected) +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +=== State file after crash (should be empty) === +{ + "serial": [SERIAL], + "state_keys": [] +} +=== Second deploy (should recover from WAL and complete) === +>>> [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... Deploying resources... Updating deployment state... Deployment complete! 
=== State file after recovery === { - "lineage": "test-lineage-123", "serial": [SERIAL], "state_keys": [ - "resources.jobs.job_a" + "resources.jobs.job_a", + "resources.jobs.job_b" ] } === WAL file after successful deploy === diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index c583a5eead..d09f6ab06e 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,20 +1,22 @@ -echo "=== Creating state directory ===" -mkdir -p .databricks/bundle/default +echo "=== First deploy (crashes after job_a create, before job_b) ===" +trace errcode $CLI bundle deploy -echo "=== Creating WAL file (simulating crash after job create) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-123","serial":1} -{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"name":"test-job-a"}}} -EOF +echo "=== WAL should exist after crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (expected)" + cat .databricks/bundle/default/resources.json.wal +else + echo "WAL missing (unexpected)" +fi -echo "=== WAL content before deploy ===" -cat .databricks/bundle/default/resources.json.wal +echo "=== State file after crash (should be empty) ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' -echo "=== Deploy (should recover from WAL) ===" -trace $CLI bundle deploy +echo "=== Second deploy (should recover from WAL and complete) ===" +trace $CLI bundle deploy --force-lock echo "=== State file after recovery ===" -cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' echo "=== WAL file after successful deploy ===" if [ -f 
".databricks/bundle/default/resources.json.wal" ]; then diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index 9e20bac15d..5023224e57 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -1,5 +1,10 @@ -# WAL recovery after simulated crash. Job was created but state wasn't finalized. -# Deploy should recover job from WAL and update it. +# WAL recovery after real crash. First deploy creates job_a then crashes. +# Second deploy recovers from WAL and completes successfully. +# job_b depends on job_a, so jobs/get is called after job_a's SaveState. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' [[Server]] Pattern = "POST /api/2.2/jobs/reset" @@ -7,4 +12,6 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job-a"}}' +KillCaller = 1 +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 91a31fe322..21b6851080 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -5,23 +5,8 @@ === Deploy (should handle empty WAL gracefully) === >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. 
To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... +Warn: Failed to read WAL file, deleting and proceeding: WAL file is empty Deploying resources... Updating deployment state... Deployment complete! diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index ffb03147dc..b0e5bda558 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -6,22 +6,6 @@ === Deploy (should fail with corruption error) === >>> errcode [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-future-serial-test/default/files... 
Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 2419e7a612..7f6c3a89bd 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -6,22 +6,6 @@ === Deploy (should fail with lineage mismatch error) === >>> errcode [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-lineage-mismatch-test/default/files... 
Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml index b4162d8fdf..3dc96ed856 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml @@ -3,10 +3,22 @@ bundle: resources: jobs: - test_job: - name: "test-job" + job_a: + name: "test-job-a" + description: "first job" tasks: - - task_key: "test-task" + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_b: + name: "test-job-b" + description: "depends on ${resources.jobs.job_a.id}" + tasks: + - task_key: "task-b" spark_python_task: python_file: ./test.py new_cluster: diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt index 3e0426a628..33dd984b74 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -1,54 +1,28 @@ -=== Creating state directory === -=== Creating WAL file (simulating crash after job create) === -=== WAL content === -{"lineage":"test-lineage-456","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} -=== First deploy attempt (will crash during update) === +=== First deploy (crashes after job_a create) === >>> errcode [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. 
To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] === WAL after first crash === -{"lineage":"test-lineage-456","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} -=== Second deploy attempt (should succeed) === - ->>> [CLI] bundle deploy --force-lock -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 +WAL exists +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +=== Second deploy (crashes during job_a update) === -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] +>>> errcode [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Deploying resources... 
+[PROCESS_KILLED] - custom_tags: - ResourceClass: SingleNode - +Exit code: [KILLED] +=== WAL after second crash === +WAL still exists +=== Third deploy (should succeed) === +>>> [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... Deploying resources... Updating deployment state... @@ -57,7 +31,8 @@ Deployment complete! { "serial": [SERIAL], "state_keys": [ - "resources.jobs.test_job" + "resources.jobs.job_a", + "resources.jobs.job_b" ] } === WAL after successful deploy === diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/script b/acceptance/bundle/deploy/wal/multiple-crashes/script index 795e4261e1..0adcd2a980 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/script +++ b/acceptance/bundle/deploy/wal/multiple-crashes/script @@ -1,24 +1,21 @@ -echo "=== Creating state directory ===" -mkdir -p .databricks/bundle/default - -echo "=== Creating WAL file (simulating crash after job create) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-456","serial":1} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF - -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal - -echo "=== First deploy attempt (will crash during update) ===" +echo "=== First deploy (crashes after job_a create) ===" trace errcode $CLI bundle deploy echo "=== WAL after first crash ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists" cat .databricks/bundle/default/resources.json.wal fi -echo "=== Second deploy attempt (should succeed) ===" +echo "=== Second deploy (crashes during job_a update) ===" +trace errcode $CLI bundle deploy --force-lock + +echo "=== WAL after second crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL still exists" +fi + +echo "=== Third deploy (should succeed) ===" trace $CLI bundle deploy --force-lock echo 
"=== Final state ===" diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index 2e9973c846..c5981d6720 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -1,4 +1,11 @@ -# Multiple crashes during recovery - WAL should persist until successful finalize. +# Multiple real crashes during deployment - WAL should persist until successful finalize. +# First deploy: crashes after job_a create (kill on jobs/get) +# Second deploy: crashes during job_a update (kill on jobs/reset) +# Third deploy: succeeds (both counters exhausted) + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' [[Server]] Pattern = "POST /api/2.2/jobs/reset" @@ -7,4 +14,6 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" +KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt index 50c1430641..ccb189ff09 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/output.txt +++ b/acceptance/bundle/deploy/wal/normal-deploy/output.txt @@ -1,21 +1,5 @@ >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-test/default/files... Deploying resources... Updating deployment state... 
diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt index 3722788e52..682534de7c 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -7,22 +7,6 @@ === Deploy (should ignore stale WAL) === >>> [CLI] bundle deploy -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] - - custom_tags: - ResourceClass: SingleNode - - Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-stale-test/default/files... Deploying resources... Updating deployment state... diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml index 063faa8e54..86376fd7ba 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml @@ -3,10 +3,22 @@ bundle: resources: jobs: - test_job: - name: "test-job" + job_a: + name: "job-a" + description: "first job" tasks: - - task_key: "test-task" + - task_key: "task-a" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_b: + name: "job-b" + description: "depends on ${resources.jobs.job_a.id}" + tasks: + - task_key: "task-b" spark_python_task: python_file: ./test.py new_cluster: diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt index 2e6abf645a..9a2644a60b 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ 
b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -1,25 +1,29 @@ -=== Creating state directory === -=== Creating WAL file (simulating crash after job create) === -=== Bundle summary (should show job from WAL with id) === +=== Deploy (job_a created and saved, then crash on jobs/get) === ->>> [CLI] bundle summary -o json -Warning: Single node cluster is not correctly configured - at resources.jobs.test_job.tasks[0].new_cluster - in databricks.yml:13:13 - -num_workers should be 0 only for single-node clusters. To create a -valid single node cluster please ensure that the following properties -are correctly set in the cluster specification: - - spark_conf: - spark.databricks.cluster.profile: singleNode - spark.master: local[*] +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files... +Deploying resources... +[PROCESS_KILLED] - custom_tags: - ResourceClass: SingleNode - +Exit code: [KILLED] +=== State directory contents after crash === +deployment.json +resources.json +resources.json.wal +sync-snapshots +=== WAL should exist after crash === +WAL exists (expected) +{"lineage":"[UUID]","serial": [SERIAL]} +{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files/test.py"},"task_key":"task-a"}]}}} +=== State file after crash === +{ + "serial": [SERIAL], + "state_keys": [] +} +=== Bundle summary (should show job_a from WAL) === +>>> [CLI] bundle summary -o json { - "job_id": "[ID]", - "modified_status": null + 
"job_a_id": "1001", + "job_b_id": null } diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/script b/acceptance/bundle/deploy/wal/summary-after-crash/script index d2017c6590..3b007062c6 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/script +++ b/acceptance/bundle/deploy/wal/summary-after-crash/script @@ -1,11 +1,19 @@ -echo "=== Creating state directory ===" -mkdir -p .databricks/bundle/default +echo "=== Deploy (job_a created and saved, then crash on jobs/get) ===" +trace errcode $CLI bundle deploy -echo "=== Creating WAL file (simulating crash after job create) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"summary-test-lineage","serial":1} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF +echo "=== State directory contents after crash ===" +ls .databricks/bundle/default/ -echo "=== Bundle summary (should show job from WAL with id) ===" -trace $CLI bundle summary -o json | jq '{job_id: .resources.jobs.test_job.id, modified_status: .resources.jobs.test_job.modified_status}' +echo "=== WAL should exist after crash ===" +if [ -f ".databricks/bundle/default/resources.json.wal" ]; then + echo "WAL exists (expected)" + cat .databricks/bundle/default/resources.json.wal +else + echo "WAL missing (unexpected)" +fi + +echo "=== State file after crash ===" +cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' + +echo "=== Bundle summary (should show job_a from WAL) ===" +trace $CLI bundle summary -o json | jq '{job_a_id: .resources.jobs.job_a.id, job_b_id: .resources.jobs.job_b.id}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml index 3363a1c516..961030e981 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml @@ -1,2 +1,14 @@ -# Bundle summary should show 
resources recovered from WAL. -# No server stubs needed - we just run bundle summary which reads state. +# Bundle summary should show resources recovered from WAL after a real crash. +# job_b depends on job_a, so after job_a is created and SaveState is called, +# refreshRemoteState calls jobs/get to fetch job_a's state for job_b's reference. +# We kill on jobs/get - AFTER job_a's SaveState, so WAL contains job_a. + +[[Server]] +Pattern = "POST /api/2.2/jobs/create" +Response.Body = '{"job_id": 1001}' + +[[Server]] +Pattern = "GET /api/2.2/jobs/get" +KillCaller = 1 +Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 7fd1daf93b..1632ddb195 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -41,3 +41,8 @@ New = '"__id__": "[ID]"' [[Repls]] Old = '"job_id":\s*"\d+"' New = '"job_id": "[ID]"' + +# Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) +[[Repls]] +Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' +New = '' diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml index 27045f8885..4f81ae4695 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml +++ b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml @@ -1,5 +1,7 @@ -# WAL recovery after crash during delete operation. +# WAL recovery after crash during delete operation (simulated). # Delete was recorded in WAL but not finalized. Deploy should complete the delete. +# Note: Real crash testing for delete is not possible because there's no API call +# after DeleteState (unlike create which has refreshRemoteState after SaveState). 
# No server stubs needed - the delete was already done (recorded in WAL) # and the job no longer needs API calls diff --git a/acceptance/internal/config.go b/acceptance/internal/config.go index 06ac61c39b..dc63911173 100644 --- a/acceptance/internal/config.go +++ b/acceptance/internal/config.go @@ -159,6 +159,12 @@ type ServerStub struct { // Useful for testing crash recovery scenarios where first deploy crashes but retry succeeds. // Requires DATABRICKS_CLI_TEST_PID=1 to be set in the test environment. KillCaller int + + // Number of requests to let pass before starting to kill. + // Combined with KillCaller, this creates a window: requests 1 to Offset succeed, + // requests Offset+1 to Offset+KillCaller are killed, rest succeed. + // Example: KillCallerOffset=9, KillCaller=1 means let 9 requests pass, kill the 10th. + KillCallerOffset int } // FindConfigs finds all the config relevant for this test, diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index 8f18d1c61b..dfa89ef748 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -183,8 +183,9 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } - // Track remaining kill counts per pattern (for KillCaller > 0) + // Track remaining kill counts and offset counts per pattern (for KillCaller > 0) killCounters := make(map[string]int) + offsetCounters := make(map[string]int) killCountersMu := &sync.Mutex{} for ind := range stubs { @@ -195,9 +196,10 @@ func startLocalServer(t *testing.T, items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) - // Initialize kill counter for this pattern + // Initialize kill counter and offset counter for this pattern if stub.KillCaller > 0 { killCounters[stub.Pattern] = stub.KillCaller + offsetCounters[stub.Pattern] = stub.KillCallerOffset } s.Handle(items[0], items[1], func(req testserver.Request) any { @@ -218,7 +220,7 @@ func startLocalServer(t *testing.T, 
} } - if shouldKillCaller(stub, killCounters, killCountersMu) { + if shouldKillCaller(stub, offsetCounters, killCounters, killCountersMu) { killCaller(t, stub.Pattern, req.Headers) } @@ -232,12 +234,19 @@ func startLocalServer(t *testing.T, return s.URL } -func shouldKillCaller(stub ServerStub, killCounters map[string]int, mu *sync.Mutex) bool { +func shouldKillCaller(stub ServerStub, offsetCounters, killCounters map[string]int, mu *sync.Mutex) bool { if stub.KillCaller <= 0 { return false } mu.Lock() defer mu.Unlock() + + // Still in offset period? Let this request pass. + if offsetCounters[stub.Pattern] > 0 { + offsetCounters[stub.Pattern]-- + return false + } + if killCounters[stub.Pattern] <= 0 { return false } diff --git a/acceptance/selftest/kill_caller/offset/out.test.toml b/acceptance/selftest/kill_caller/offset/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt new file mode 100644 index 0000000000..03407dd0d8 --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -0,0 +1,33 @@ + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 1 done - success (offset) + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 2 done - success (offset) + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 3 done - killed + +>>> errcode [CLI] current-user me +[PROCESS_KILLED] + +Exit code: [KILLED] +Attempt 4 done - killed + +>>> [CLI] current-user me +{ + "id":"123", + "userName":"test@example.com" +} +Attempt 5 done - success (past kill window) diff --git a/acceptance/selftest/kill_caller/offset/script 
b/acceptance/selftest/kill_caller/offset/script new file mode 100644 index 0000000000..3411e87480 --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/script @@ -0,0 +1,17 @@ +# First 2 attempts should succeed (offset period) +trace $CLI current-user me +echo "Attempt 1 done - success (offset)" + +trace $CLI current-user me +echo "Attempt 2 done - success (offset)" + +# Attempts 3-4 should be killed +trace errcode $CLI current-user me +echo "Attempt 3 done - killed" + +trace errcode $CLI current-user me +echo "Attempt 4 done - killed" + +# Attempt 5 should succeed again +trace $CLI current-user me +echo "Attempt 5 done - success (past kill window)" diff --git a/acceptance/selftest/kill_caller/offset/test.toml b/acceptance/selftest/kill_caller/offset/test.toml new file mode 100644 index 0000000000..5eab09dbfa --- /dev/null +++ b/acceptance/selftest/kill_caller/offset/test.toml @@ -0,0 +1,11 @@ +# Let first 2 requests pass, kill next 2, then allow rest +[[Server]] +Pattern = "GET /api/2.0/preview/scim/v2/Me" +KillCallerOffset = 2 +KillCaller = 2 +Response.Body = ''' +{ + "id": "123", + "userName": "test@example.com" +} +''' diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 9113021c8f..1f8a705e05 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -61,7 +61,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d db.Data.State = make(map[string]ResourceEntry) } - jsonMessage, err := json.MarshalIndent(state, " ", " ") + jsonMessage, err := json.MarshalIndent(state, "", " ") if err != nil { return err } @@ -72,7 +72,6 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d DependsOn: dependsOn, } - // Write to WAL before updating memory if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } @@ -94,7 +93,6 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } - // Write to WAL before 
updating memory (nil entry means delete) if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } @@ -119,7 +117,6 @@ func (db *DeploymentState) ensureWALOpen() error { return err } - // Generate lineage if this is a fresh deployment lineage := db.Data.Lineage if lineage == "" { lineage = uuid.New().String() @@ -196,8 +193,7 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { db.Path = path } - // Attempt WAL recovery - recovered, err := recoverFromWAL(path, &db.Data) + recovered, err := recoverFromWAL(ctx, path, &db.Data) if err != nil { return fmt.Errorf("WAL recovery failed: %w", err) } @@ -228,7 +224,6 @@ func (db *DeploymentState) Finalize() error { return err } - // Truncate WAL after successful state file write if db.wal != nil { if err := db.wal.truncate(); err != nil { return fmt.Errorf("failed to truncate WAL: %w", err) diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 700bfa24e2..37dd1bffa2 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -2,37 +2,34 @@ package dstate import ( "bufio" + "context" "encoding/json" "errors" "fmt" "os" + + "github.com/databricks/cli/libs/log" ) -// WALHeader is the first entry in the WAL file, containing metadata for validation. type WALHeader struct { Lineage string `json:"lineage"` Serial int `json:"serial"` } -// WALEntry represents a single state mutation in the WAL. -// For set operations, V is populated. For delete operations, V is nil. type WALEntry struct { K string `json:"k"` - V *ResourceEntry `json:"v,omitempty"` + V *ResourceEntry `json:"v,omitempty"` // nil means delete } -// WAL manages the Write-Ahead Log for deployment state recovery. type WAL struct { path string file *os.File } -// walPath returns the WAL file path for a given state file path. func walPath(statePath string) string { return statePath + ".wal" } -// openWAL opens or creates a WAL file for writing. 
func openWAL(statePath string) (*WAL, error) { wp := walPath(statePath) f, err := os.OpenFile(wp, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o600) @@ -42,7 +39,6 @@ func openWAL(statePath string) (*WAL, error) { return &WAL{path: wp, file: f}, nil } -// writeHeader writes the WAL header (lineage and serial) as the first entry. func (w *WAL) writeHeader(lineage string, serial int) error { header := WALHeader{ Lineage: lineage, @@ -51,7 +47,6 @@ func (w *WAL) writeHeader(lineage string, serial int) error { return w.writeJSON(header) } -// writeEntry appends a state mutation entry to the WAL. func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { walEntry := WALEntry{ K: key, @@ -60,7 +55,6 @@ func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { return w.writeJSON(walEntry) } -// writeJSON marshals and writes a JSON object as a single line, then syncs to disk. func (w *WAL) writeJSON(v any) error { data, err := json.Marshal(v) if err != nil { @@ -73,15 +67,9 @@ func (w *WAL) writeJSON(v any) error { return fmt.Errorf("failed to write WAL entry: %w", err) } - err = w.file.Sync() - if err != nil { - return fmt.Errorf("failed to sync WAL file: %w", err) - } - return nil } -// close closes the WAL file handle. func (w *WAL) close() error { if w.file != nil { return w.file.Close() @@ -89,7 +77,6 @@ func (w *WAL) close() error { return nil } -// truncate deletes the WAL file after successful finalization. func (w *WAL) truncate() error { if w.file != nil { w.file.Close() @@ -102,9 +89,7 @@ func (w *WAL) truncate() error { return nil } -// readWAL reads and parses an existing WAL file for recovery. -// Returns the header and entries, or an error if the WAL is invalid. 
-func readWAL(statePath string) (*WALHeader, []WALEntry, error) { +func readWAL(ctx context.Context, statePath string) (*WALHeader, []WALEntry, error) { wp := walPath(statePath) f, err := os.Open(wp) if err != nil { @@ -113,103 +98,98 @@ func readWAL(statePath string) (*WALHeader, []WALEntry, error) { defer f.Close() scanner := bufio.NewScanner(f) - var header *WALHeader - var entries []WALEntry - lineNum := 0 - + var lines [][]byte for scanner.Scan() { - lineNum++ line := scanner.Bytes() if len(line) == 0 { continue } + lineCopy := make([]byte, len(line)) + copy(lineCopy, line) + lines = append(lines, lineCopy) + } + if err := scanner.Err(); err != nil { + return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) + } - if header == nil { - // First line must be the header - var h WALHeader - if err := json.Unmarshal(line, &h); err != nil { - return nil, nil, fmt.Errorf("WAL line %d: failed to parse header: %w", lineNum, err) - } - header = &h - } else { - // Subsequent lines are entries - var e WALEntry - if err := json.Unmarshal(line, &e); err != nil { - // Skip corrupted lines silently - this is expected for partial writes + if len(lines) == 0 { + return nil, nil, errors.New("WAL file is empty") + } + + var header WALHeader + if err := json.Unmarshal(lines[0], &header); err != nil { + return nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) + } + + var entries []WALEntry + for i := 1; i < len(lines); i++ { + lineNum := i + 1 + isLastLine := i == len(lines)-1 + + var e WALEntry + if err := json.Unmarshal(lines[i], &e); err != nil { + if isLastLine { + log.Debugf(ctx, "WAL line %d: skipping corrupted last entry: %v", lineNum, err) continue } - if e.K == "" { - // Skip entries with empty keys + return nil, nil, fmt.Errorf("WAL line %d: corrupted entry in middle of WAL: %w", lineNum, err) + } + + if e.K == "" { + if isLastLine { + log.Debugf(ctx, "WAL line %d: skipping last entry with empty key", lineNum) continue } - entries = append(entries, e) + 
return nil, nil, fmt.Errorf("WAL line %d: entry with empty key in middle of WAL", lineNum) } - } - - if err := scanner.Err(); err != nil { - return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) - } - if header == nil { - return nil, nil, errors.New("WAL file is empty or missing header") + entries = append(entries, e) } - return header, entries, nil + return &header, entries, nil } -// recoverFromWAL attempts to recover state from an existing WAL file. -// It validates the WAL against the current state and replays valid entries. -// Returns true if recovery was performed, false if no recovery needed. -func recoverFromWAL(statePath string, db *Database) (bool, error) { +func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, error) { wp := walPath(statePath) - // Check if WAL exists if _, err := os.Stat(wp); os.IsNotExist(err) { return false, nil } - header, entries, err := readWAL(statePath) + header, entries, err := readWAL(ctx, statePath) if err != nil { - // If we can't read the WAL at all, delete it and proceed + log.Warnf(ctx, "Failed to read WAL file, deleting and proceeding: %v", err) os.Remove(wp) return false, nil } - // Validate WAL serial against state serial expectedSerial := db.Serial + 1 if header.Serial < expectedSerial { - // Stale WAL - delete and proceed without recovery + log.Debugf(ctx, "Deleting stale WAL (serial %d < expected %d)", header.Serial, expectedSerial) os.Remove(wp) return false, nil } if header.Serial > expectedSerial { - // WAL is ahead of state - this indicates corruption return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) } - // Validate lineage if both exist if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { return false, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) } - // Adopt lineage from WAL if state doesn't have one if db.Lineage == "" && 
header.Lineage != "" { db.Lineage = header.Lineage } - // Initialize state map if needed if db.State == nil { db.State = make(map[string]ResourceEntry) } - // Replay entries for _, entry := range entries { if entry.V != nil { - // Set operation db.State[entry.K] = *entry.V } else { - // Delete operation delete(db.State, entry.K) } } diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index e475a92e9d..9c2250c830 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -20,15 +20,12 @@ func TestWALWriteAndRead(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Open WAL for writing wal, err := openWAL(statePath) require.NoError(t, err) - // Write header err = wal.writeHeader("test-lineage", 1) require.NoError(t, err) - // Write entries entry1 := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), @@ -43,15 +40,14 @@ func TestWALWriteAndRead(t *testing.T) { err = wal.writeEntry("resources.jobs.job2", entry2) require.NoError(t, err) - // Write a delete entry (nil value) err = wal.writeEntry("resources.jobs.old_job", nil) require.NoError(t, err) err = wal.close() require.NoError(t, err) - // Read WAL back - header, entries, err := readWAL(statePath) + ctx := context.Background() + header, entries, err := readWAL(ctx, statePath) require.NoError(t, err) assert.Equal(t, "test-lineage", header.Lineage) @@ -76,40 +72,37 @@ func TestWALTruncate(t *testing.T) { statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Create WAL file wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) require.NoError(t, err) - // Verify file exists _, err = os.Stat(walFilePath) require.NoError(t, err) - // Truncate err = wal.truncate() require.NoError(t, err) - // Verify file is removed _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) } func TestRecoverFromWAL_NoWAL(t *testing.T) { + ctx 
:= context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") db := NewDatabase("", 0) - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.False(t, recovered) } func TestRecoverFromWAL_ValidWAL(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with serial = 1 (expecting state serial 0 + 1) wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) @@ -124,26 +117,23 @@ func TestRecoverFromWAL_ValidWAL(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with serial 0 db := NewDatabase("", 0) - // Recover - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.True(t, recovered) - // Verify state was recovered assert.Equal(t, "test-lineage", db.Lineage) require.Contains(t, db.State, "resources.jobs.job1") assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) } func TestRecoverFromWAL_StaleWAL(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Create WAL with serial = 1 wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) @@ -151,24 +141,21 @@ func TestRecoverFromWAL_StaleWAL(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with serial 2 (WAL is stale) - db := NewDatabase("test-lineage", 2) + db := NewDatabase("test-lineage", 2) // serial 2 makes WAL stale - // Recover - should skip and delete WAL - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.False(t, recovered) - // WAL should be deleted _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) } func 
TestRecoverFromWAL_FutureWAL(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with serial = 5 wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 5) @@ -176,20 +163,18 @@ func TestRecoverFromWAL_FutureWAL(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with serial 0 (WAL is from future - corrupted state) db := NewDatabase("test-lineage", 0) - // Recover - should fail - _, err = recoverFromWAL(statePath, &db) + _, err = recoverFromWAL(ctx, statePath, &db) assert.Error(t, err) - assert.Contains(t, err.Error(), "WAL serial (5) is ahead of expected (1)") + assert.Contains(t, err.Error(), "ahead of expected") } func TestRecoverFromWAL_LineageMismatch(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with lineage A wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("lineage-A", 1) @@ -197,26 +182,23 @@ func TestRecoverFromWAL_LineageMismatch(t *testing.T) { err = wal.close() require.NoError(t, err) - // Create database with lineage B db := NewDatabase("lineage-B", 0) - // Recover - should fail - _, err = recoverFromWAL(statePath, &db) + _, err = recoverFromWAL(ctx, statePath, &db) assert.Error(t, err) assert.Contains(t, err.Error(), "lineage") } func TestRecoverFromWAL_DeleteOperation(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create WAL with delete operation wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 1) require.NoError(t, err) - // Add an entry entry := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), @@ -224,22 +206,18 @@ func TestRecoverFromWAL_DeleteOperation(t *testing.T) { err = wal.writeEntry("resources.jobs.job1", entry) require.NoError(t, err) - // Delete the 
entry err = wal.writeEntry("resources.jobs.job1", nil) require.NoError(t, err) err = wal.close() require.NoError(t, err) - // Create database db := NewDatabase("", 0) - // Recover - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.True(t, recovered) - // Entry should NOT be present (deleted) assert.NotContains(t, db.State, "resources.jobs.job1") } @@ -249,36 +227,29 @@ func TestDeploymentState_WALIntegration(t *testing.T) { statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Create deployment state var db DeploymentState err := db.Open(ctx, statePath) require.NoError(t, err) - // Save some state err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) require.NoError(t, err) - // WAL should exist _, err = os.Stat(walFilePath) require.NoError(t, err) - // Read WAL to verify content - header, entries, err := readWAL(statePath) + header, entries, err := readWAL(ctx, statePath) require.NoError(t, err) - assert.Equal(t, 1, header.Serial) // serial + 1 + assert.Equal(t, 1, header.Serial) require.Len(t, entries, 1) assert.Equal(t, "resources.jobs.job1", entries[0].K) assert.Equal(t, "12345", entries[0].V.ID) - // Finalize err = db.Finalize() require.NoError(t, err) - // WAL should be deleted _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) - // State file should exist with correct serial data, err := os.ReadFile(statePath) require.NoError(t, err) var savedDB Database @@ -293,7 +264,6 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") - // Create initial state file initialDB := NewDatabase("test-lineage", 5) initialDB.State["resources.jobs.existing"] = ResourceEntry{ ID: "existing-id", @@ -304,7 +274,6 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { err = os.WriteFile(statePath, data, 0o600) require.NoError(t, 
err) - // Create WAL with serial 6 (5 + 1) wal, err := openWAL(statePath) require.NoError(t, err) err = wal.writeHeader("test-lineage", 6) @@ -318,12 +287,10 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { err = wal.close() require.NoError(t, err) - // Open should recover from WAL var db DeploymentState err = db.Open(ctx, statePath) require.NoError(t, err) - // Both existing and new resources should be present assert.Contains(t, db.Data.State, "resources.jobs.existing") assert.Contains(t, db.Data.State, "resources.jobs.new") assert.Equal(t, "new-id", db.Data.State["resources.jobs.new"].ID) @@ -338,27 +305,22 @@ func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { err := db.Open(ctx, statePath) require.NoError(t, err) - // Add a resource err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) require.NoError(t, err) - // Delete the resource err = db.DeleteState("resources.jobs.job1") require.NoError(t, err) - // Read WAL to verify delete entry - _, entries, err := readWAL(statePath) + _, entries, err := readWAL(ctx, statePath) require.NoError(t, err) require.Len(t, entries, 2) assert.Equal(t, "resources.jobs.job1", entries[1].K) - assert.Nil(t, entries[1].V) // nil means delete + assert.Nil(t, entries[1].V) - // Finalize err = db.Finalize() require.NoError(t, err) - // State file should NOT contain the deleted resource data, err := os.ReadFile(statePath) require.NoError(t, err) var savedDB Database @@ -383,8 +345,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) require.NoError(t, err) - // Read WAL - _, entries, err := readWAL(statePath) + _, entries, err := readWAL(ctx, statePath) require.NoError(t, err) require.Len(t, entries, 1) @@ -393,12 +354,12 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { assert.Equal(t, "resources.clusters.cluster1", entries[0].V.DependsOn[0].Node) } -func 
TestRecoverFromWAL_CorruptedLine(t *testing.T) { +func TestRecoverFromWAL_CorruptedMiddleLine(t *testing.T) { + ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) - // Manually write WAL with corrupted line content := `{"lineage":"test","serial":1} {"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} not valid json @@ -408,12 +369,129 @@ not valid json require.NoError(t, err) db := NewDatabase("", 0) - recovered, err := recoverFromWAL(statePath, &db) + recovered, err := recoverFromWAL(ctx, statePath, &db) + require.NoError(t, err) + assert.False(t, recovered) + assert.Empty(t, db.State) + + _, err = os.Stat(walFilePath) + assert.True(t, os.IsNotExist(err)) +} + +func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + content := `{"lineage":"test","serial":1} +{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} +{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} +not valid json +` + err := os.WriteFile(walFilePath, []byte(content), 0o600) + require.NoError(t, err) + + db := NewDatabase("", 0) + recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) assert.True(t, recovered) - // Should have recovered job1 and job2, skipping corrupted line assert.Contains(t, db.State, "resources.jobs.job1") assert.Contains(t, db.State, "resources.jobs.job2") + assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) + assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) +} + +func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + initialDB := NewDatabase("test-lineage", 0) + data, err := json.Marshal(initialDB) + require.NoError(t, err) + err = os.WriteFile(statePath, data, 0o600) + 
require.NoError(t, err) + + wal, err := openWAL(statePath) + require.NoError(t, err) + err = wal.writeHeader("test-lineage", 1) + require.NoError(t, err) + err = wal.writeEntry("resources.jobs.job1", &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}) + require.NoError(t, err) + err = wal.close() + require.NoError(t, err) + + var db DeploymentState + err = db.Open(ctx, statePath) + require.NoError(t, err) + + assert.True(t, db.RecoveredFromWAL()) +} + +func TestRecoverFromWAL_LineageAdoption(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + content := `{"lineage":"adopted-lineage","serial":1} +{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} +` + err := os.WriteFile(walFilePath, []byte(content), 0o600) + require.NoError(t, err) + + db := NewDatabase("", 0) // empty lineage + recovered, err := recoverFromWAL(ctx, statePath, &db) + require.NoError(t, err) + assert.True(t, recovered) + assert.Equal(t, "adopted-lineage", db.Lineage) +} + +func TestReadWAL_EmptyFile(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + err := os.WriteFile(walFilePath, []byte(""), 0o600) + require.NoError(t, err) + + _, _, err = readWAL(ctx, statePath) + assert.Error(t, err) + assert.Contains(t, err.Error(), "empty") } +func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + err = db.SaveState("resources.jobs.job1", "111", map[string]string{"v": "1"}, nil) + require.NoError(t, err) + + err = db.DeleteState("resources.jobs.job1") + require.NoError(t, err) + + err = db.SaveState("resources.jobs.job1", "222", map[string]string{"v": "2"}, nil) + require.NoError(t, 
err) + + _, entries, err := readWAL(ctx, statePath) + require.NoError(t, err) + require.Len(t, entries, 3) + assert.Equal(t, "111", entries[0].V.ID) + assert.Nil(t, entries[1].V) + assert.Equal(t, "222", entries[2].V.ID) + + err = db.Finalize() + require.NoError(t, err) + + entry, ok := db.GetResourceEntry("resources.jobs.job1") + require.True(t, ok) + assert.Equal(t, "222", entry.ID) +} diff --git a/wal.txt b/wal.txt deleted file mode 100644 index d365ed56d8..0000000000 --- a/wal.txt +++ /dev/null @@ -1,205 +0,0 @@ -Design Document: Write-Ahead Log (WAL) for Bundle Deployment State Recovery -1. Problem Statement -When databricks bundle deploy is interrupted, resources created before the interruption become orphaned. The CLI only writes the state file at the end of deployment via Finalize(). Any resources created mid-deployment are lost from tracking. - -Current behavior: -Deploy starts → Create Job A → Create Job B → [CRASH] → State file empty → Jobs A, B orphaned - -Impact: Orphaned resources exist in Databricks but are unknown to future deployments. Users accumulate duplicate resources, leading to confusion and unexpected costs. - -Scope: Direct deployment engine only. Terraform has its own state management. -2. Solution Overview -Implement a Write-Ahead Log (WAL) that records each state mutation to disk immediately after the corresponding API call succeeds. -On recovery, replay the WAL to restore partial deployment state. - -Proposed behavior: -Deploy starts → Create Job A → [WAL: A] → Create Job B → [WAL: A,B] → [CRASH] -Next deploy → Load state → Replay WAL → State has A,B → No duplicates -3. Detailed Design -3.1 File Structure -The WAL is stored locally alongside the existing state file. - -File Path -Description -~/.databricks/bundle/// -Root directory for the bundle's state data. -~/.databricks/bundle///resources.json -The committed state file (existing). -~/.databricks/bundle///resources.json.wal -The Write-Ahead Log file (new). 
- -3.2 WAL Entry Format -Each entry is a JSON object written as a single line (NDJSON format). The entry embeds the existing ResourceEntry structure for consistency with the state file. - -Field -Type -Description -Lineage (First Entry Only) -String -UUID matching the state file's lineage (for validation). -Serial (First Entry Only) -Integer -Deployment serial number (for validation). -k (2nd Entry Onwards) -String -Resource key (e.g., resources.jobs.my_job). -v (2nd Entry Onwards) -ResourceEntry -The state entry. Omitted for delete operations. - - -ResourceEntry structure (existing, reused): - -Field -Type -Description -__id__ -String -The unique ID assigned by the Databricks API. -state -Object -Full snapshot of the resource configuration. - - -Example WAL: -{"lineage":"abc-123"} -{"k":"resources.jobs.my_job","v":{"__id__":"1234567","state":{...}}} -{"k":"resources.jobs.old_job"} // no v means delete op -3.3 WAL Lifecycle -Phase -Action -Open -Create or open resources.json.wal. -Write -Append entry after each successful API call. -Truncate -Delete resources.json.wal after successful Finalize(). - - -Durability: Each entry must be flushed to disk (fsync) immediately after the successful API response before proceeding. -Known Limitation: There is a small window (~microseconds) between API success and WAL write where a crash would orphan the resource. This is unavoidable is acceptable. -3.4 Recovery Mechanism -Recovery occurs at the start of deployment if the WAL file exists. - -Check: If resources.json.wal exists, initiate recovery. -Load Base State: -If resources.json exists: load it (provides lineage and serial). We are making sure it exists by writing immediately once we open/create it in the Open() method -Otherwise: create fresh state with new lineage. -Read WAL: Parse all entries from resources.json.wal (already chronologically ordered). -Validate Entries: -WAL serial == state serial + 1: Valid — replay entries. 
-WAL serial < state serial + 1: Stale WAL — delete WAL file, proceed without recovery. -WAL serial > state serial + 1: Corrupted state — return error. -Replay: For each valid entry: -set: Add or overwrite the resource in memory. -delete: Remove the resource from memory. -Proceed: Use the resulting state as the starting point for deployment. -Finalize: On success, write resources.json and delete resources.json.wal. -3.5 Integration Points -Action -Location -Detail -Recovery Check -Open() in dstate/state.go -Check for the WAL file and replay before proceeding. -Write WAL Entry -SaveState() / DeleteState() -Append entry before updating memory. -Truncation -Finalize() -Delete WAL after successful state file write. - -3.6 Error Handling -Scenario -Behavior -WAL write fails -Return error, abort deployment. -Corrupted WAL line -Log warning, skip line, continue replay. -Lineage mismatch -Return error, abort deployment. -Stale serial -Delete WAL - -5. Testing Plan -Use acceptance tests. Add support for the crash caller process from the test server. -Key test cases: -Tests which compile and run real binary against testserver. - -Normal deploy — WAL created, used, deleted. -Crash after 1 resource — recovery works. -Fresh deploy with existing WAL — lineage adopted. -Stale WAL (old serial) — entries skipped. -Corrupted WAL line — skipped, rest recovered. -Bundle summary works after interrupted deploy and sees ids stored in WAL -7. Open Questions -# -Question -Proposed Answer -1 -Should WAL be pushed to remote? -Never - -5. 
Test Plan - -We should use acceptance tests which compile and run real binary against testerver - -5.1 Unit Tests - WAL File Operations -| Test ID | Description | Expected Behavior | -|---------|-------------|-------------------| -| U01 | WAL path generation | walPath("resources.json") returns "resources.json.wal" | -| U02 | Write and read WAL | Header + entries written and read back correctly | -| U03 | Truncate WAL | File deleted from disk | -| U04 | Truncate non-existent WAL | No error returned | -| U05 | Read empty WAL | Returns error "WAL file is empty or missing header" | - -5.2 Unit Tests - WAL Recovery Logic -| Test ID | Description | Expected Behavior | -|---------|-------------|-------------------| -| R01 | No WAL exists | recoverFromWAL returns (false, nil) | -| R02 | Valid WAL (serial = state+1) | Entries replayed, returns (true, nil) | -| R03 | Stale WAL (serial < state+1) | WAL deleted, returns (false, nil) | -| R04 | Future WAL (serial > state+1) | Returns error about corruption | -| R05 | Lineage mismatch | Returns error about lineage mismatch | -| R06 | Lineage adopted from WAL | If state has no lineage, WAL lineage is used | -| R07 | Delete operation replay | Entry removed from state map | -| R08 | Corrupted entry line | Skipped, other entries recovered | - -5.3 Unit Tests - Integration with DeploymentState -| Test ID | Description | Expected Behavior | -|---------|-------------|-------------------| -| I01 | SaveState/DeleteState/Finalize flow | WAL created on first SaveState, entries written, truncated on Finalize, serial incremented | -| I02 | Finalize cleans stale WAL | If WAL file exists but wasn't opened this session, delete it | -| I03 | Open with existing WAL | Recovery performed before return | -| I04 | SaveState with DependsOn | DependsOn preserved in WAL entry | - -5.4 Acceptance Tests -| Test ID | Description | Steps | Expected Behavior | -|---------|-------------|-------|-------------------| -| A01 | Normal deploy | Deploy bundle with 
2 resources | WAL created during deploy, deleted after Finalize | -| A02 | Crash recovery | 1. Deploy, crash after resource A created 2. Redeploy | Resource A recovered from WAL, resource B created, no duplicates | -| A03 | Bundle summary after crash | 1. Deploy, crash mid-deploy 2. Run bundle summary | Shows resources from WAL with correct IDs | - -5.5 Tests Implemented in wal_test.go -- TestWALPath (U01) -- TestWALWriteAndRead (U02) -- TestWALTruncate (U03, U04) -- TestRecoverFromWAL_NoWAL (R01) -- TestRecoverFromWAL_ValidWAL (R02) -- TestRecoverFromWAL_StaleWAL (R03) -- TestRecoverFromWAL_FutureWAL (R04) -- TestRecoverFromWAL_LineageMismatch (R05) -- TestRecoverFromWAL_DeleteOperation (R07) -- TestRecoverFromWAL_CorruptedLine (R08) -- TestDeploymentState_WALIntegration (I01) -- TestDeploymentState_WALRecoveryOnOpen (I03) -- TestDeploymentState_DeleteStateWritesWAL (I01) -- TestDeploymentState_WALWithDependsOn (I04) - -5.6 Tests Still Needed -| Test ID | Description | Priority | -|---------|-------------|----------| -| R06 | TestRecoverFromWAL_LineageAdoption (fresh state adopts WAL lineage) | High | -| I02 | TestDeploymentState_FinalizeCleansStaleWAL | Medium | -| U05 | TestReadEmptyWAL | Low | -| A01-A03 | Acceptance tests (require crash simulation infrastructure) | High | From 8fcd7eb6afd35e91f0f347b3380178e41c544547 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Sat, 24 Jan 2026 00:51:38 +0530 Subject: [PATCH 03/85] Updated existing tests Signed-off-by: Varun Deep Saini --- .../out.deploy.direct.txt | 7 ++++++ .../out.deploy.terraform.txt | 6 +++++ .../output.txt | 6 ----- .../script | 2 +- .../test.toml | 1 + .../build_and_files_whl/out.deploy.direct.txt | 8 +++++++ .../out.deploy.terraform.txt | 7 ++++++ .../artifacts/build_and_files_whl/output.txt | 7 ------ .../artifacts/build_and_files_whl/script | 2 +- .../artifacts/build_and_files_whl/test.toml | 1 + .../shell/bash/out.deploy.direct.txt | 7 ++++++ .../shell/bash/out.deploy.terraform.txt | 6 
+++++ .../bundle/artifacts/shell/bash/output.txt | 5 ---- acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 ++++++ .../shell/basic/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/basic/output.txt | 5 ---- .../bundle/artifacts/shell/basic/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 ++++++ .../shell/default/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/default/output.txt | 5 ---- .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 ++++++ .../shell/sh/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/sh/output.txt | 5 ---- acceptance/bundle/artifacts/shell/sh/script | 2 +- .../deploy/empty-bundle/out.deploy.direct.txt | 6 +++++ .../empty-bundle/out.deploy.terraform.txt | 5 ++++ .../bundle/deploy/empty-bundle/output.txt | 5 ---- acceptance/bundle/deploy/empty-bundle/script | 2 +- .../bundle/deploy/wal/chain-10-jobs/test.toml | 1 - .../deploy/wal/corrupted-wal-entry/test.toml | 1 - .../deploy/wal/corrupted-wal-middle/test.toml | 1 - .../deploy/wal/crash-after-create/test.toml | 1 - .../deploy/wal/multiple-crashes/test.toml | 1 - .../deploy/wal/summary-after-crash/test.toml | 1 - .../bundle/scripts/out.deploy.direct.txt | 24 +++++++++++++++++++ .../bundle/scripts/out.deploy.terraform.txt | 23 ++++++++++++++++++ acceptance/bundle/scripts/output.txt | 23 ------------------ .../out.deploy.direct.txt | 18 ++++++++++++++ .../out.deploy.terraform.txt | 17 +++++++++++++ .../scripts/restricted-execution/output.txt | 17 ------------- .../scripts/restricted-execution/script | 2 +- acceptance/bundle/scripts/script | 2 +- .../out.deploy-one.direct.txt | 6 +++++ .../out.deploy-one.terraform.txt | 5 ++++ .../out.deploy-two.direct.txt | 6 +++++ .../out.deploy-two.terraform.txt | 5 ++++ .../deploy-artifact-path-type/output.txt | 10 -------- .../deploy-artifact-path-type/script | 4 ++-- .../out.deploy.direct.txt | 6 +++++ 
.../out.deploy.terraform.txt | 5 ++++ .../deploy-config-file-count/output.txt | 5 ---- .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-mode/out.deploy-dev.direct.txt | 6 +++++ .../deploy-mode/out.deploy-dev.terraform.txt | 5 ++++ .../deploy-mode/out.deploy-prod.direct.txt | 12 ++++++++++ .../deploy-mode/out.deploy-prod.terraform.txt | 11 +++++++++ .../bundle/telemetry/deploy-mode/output.txt | 16 ------------- .../bundle/telemetry/deploy-mode/script | 4 ++-- .../deploy-target-count/out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../telemetry/deploy-target-count/output.txt | 5 ---- .../telemetry/deploy-target-count/script | 2 +- .../out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../deploy-variable-count/output.txt | 5 ---- .../telemetry/deploy-variable-count/script | 2 +- .../out.deploy-one.direct.txt | 8 +++++++ .../out.deploy-one.terraform.txt | 7 ++++++ .../out.deploy-two.direct.txt | 8 +++++++ .../out.deploy-two.terraform.txt | 7 ++++++ .../telemetry/deploy-whl-artifacts/output.txt | 14 ----------- .../telemetry/deploy-whl-artifacts/script | 4 ++-- .../sync_patterns/out.deploy.direct.txt | 6 +++++ .../sync_patterns/out.deploy.terraform.txt | 5 ++++ .../bundle/validate/sync_patterns/output.txt | 5 ---- .../bundle/validate/sync_patterns/script | 2 +- acceptance/cache/simple/out.deploy.direct.txt | 6 +++++ .../cache/simple/out.deploy.terraform.txt | 5 ++++ acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 3 +++ bundle/direct/dstate/state.go | 15 ++++++++++++ bundle/direct/dstate/wal_test.go | 1 + 84 files changed, 355 insertions(+), 164 deletions(-) create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt create mode 100644 
acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt create mode 100644 
acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt create mode 100644 acceptance/cache/simple/out.deploy.direct.txt create mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt new file mode 100644 index 0000000000..f75a5428b1 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt new file mode 100644 index 0000000000..8ec9c52db6 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6d24880e6c..6c8bd962a5 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,10 +1,4 @@ ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index 883601185c..fba3a77700 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index a0a680e9d1..b6c55dac31 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,4 +1,5 @@ RecordRequests = true +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt new file mode 100644 index 0000000000..4039d5917e --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt @@ -0,0 +1,8 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt new file mode 100644 index 0000000000..9894e5b89f --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt @@ -0,0 +1,7 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index b618de6b89..d44a21b582 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,10 +7,3 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 2d7d63f7fe..9aa0d870e7 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . 
trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index a030353d57..a93d901b68 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1 +1,2 @@ RecordRequests = false +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt new file mode 100644 index 0000000000..f311959abd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt new file mode 100644 index 0000000000..fa5d7b76bc --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index fa5d7b76bc..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt new file mode 100644 index 0000000000..3a4ff9138b --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt new file mode 100644 index 0000000000..b5e01c79e6 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index b5e01c79e6..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt new file mode 100644 index 0000000000..f311959abd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt new file mode 100644 index 0000000000..fa5d7b76bc --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index fa5d7b76bc..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt new file mode 100644 index 0000000000..98820986f5 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt new file mode 100644 index 0000000000..5117e6e9fc --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 5117e6e9fc..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt new file mode 100644 index 0000000000..81dddfcb9f --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt new file mode 100644 index 0000000000..494f76c84f --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 919accb661..8498653a6e 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! 
- >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index 775ccd0def..b74818f1b1 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml index c4308521be..36076f3df5 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml @@ -14,4 +14,3 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 9c9ab5a30b..6245c19840 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -11,4 +11,3 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' - diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml index 8aa40be8d7..ec6fa7b3f4 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml @@ -10,4 +10,3 @@ Response.Body = '{"job_id": 9999}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=9999" Response.Body = '{"job_id": 9999, "settings": {"name": 
"fresh-job"}}' - diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index 5023224e57..eebad72de5 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -14,4 +14,3 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index c5981d6720..474177b804 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -16,4 +16,3 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml index 961030e981..f14cbbfcbc 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml @@ -11,4 +11,3 @@ Response.Body = '{"job_id": 1001}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt b/acceptance/bundle/scripts/out.deploy.direct.txt new file mode 100644 index 0000000000..037f609f94 --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.direct.txt @@ -0,0 +1,24 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! 
+from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt new file mode 100644 index 0000000000..a3d9ba342c --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.terraform.txt @@ -0,0 +1,23 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index 68afb2fecc..a39a0b0aa9 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,26 +25,3 @@ Name: scripts Found 1 error Exit code: 1 - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt new file mode 100644 index 0000000000..d8fed9e4e6 --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt @@ -0,0 +1,18 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt new file mode 100644 index 0000000000..efcf1281cb --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt @@ -0,0 +1,17 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index f377edba7c..2186ac68f0 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,22 +1,5 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! 
-Executing 'postdeploy' script -postdeploy value_from_env - === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 7a3dcb068b..2e31cce2ee 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index de07d277ea..3acb85f9cd 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy +trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt new file mode 100644 index 0000000000..0e133547de --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... 
+Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt new file mode 100644 index 0000000000..65960fa86d --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt new file mode 100644 index 0000000000..120e590201 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt new file mode 100644 index 0000000000..fabdebb399 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index a03920c3fd..69c6730b46 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,14 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index d1a63928a6..4f3bd7c3cf 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt new file mode 100644 index 0000000000..1b73d1b916 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... 
+Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt new file mode 100644 index 0000000000..5c6aad5b37 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 909e8d6c70..1637965310 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index c495bdcb07..7fbdd0e677 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt new file mode 100644 index 0000000000..e86795abf5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt new file mode 100644 index 0000000000..ee47fabbb6 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt new file mode 100644 index 0000000000..5957e33b91 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt @@ -0,0 +1,12 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt new file mode 100644 index 0000000000..ac2e13efb9 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt @@ -0,0 +1,11 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 99e7fbb699..89be65f195 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,20 +1,4 @@ ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... 
-Deployment complete! - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index f7257769ac..0a9d57a1a4 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev +trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t prod +trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt new file mode 100644 index 0000000000..0e133547de --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt new file mode 100644 index 0000000000..65960fa86d --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 31581169f2..9c59c43023 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 3022a2b5e4..6e9d2f7378 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt new file mode 100644 index 0000000000..1b73d1b916 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt new file mode 100644 index 0000000000..5c6aad5b37 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index be4840e69e..e8580d71b3 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index dad762899a..caaf8c1f39 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt new file mode 100644 index 0000000000..f8db617c00 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt new file mode 100644 index 0000000000..048d0f07b5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt new file mode 100644 index 0000000000..b786de11fe --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt new file mode 100644 index 0000000000..651d315f77 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index a9b8ce4ae6..ed89628d98 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,18 +1,4 @@ ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 078fa94cdd..5bc513afb8 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt new file mode 100644 index 0000000000..1b73d1b916 --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt new file mode 100644 index 0000000000..5c6aad5b37 --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index b35859d86a..0c061fbe31 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,8 +20,3 @@ Validation OK! "." ] } - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index d2aae85444..485556d28a 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt new file mode 100644 index 0000000000..945da6d144 --- /dev/null +++ b/acceptance/cache/simple/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt new file mode 100644 index 0000000000..41cfbc2a2d --- /dev/null +++ b/acceptance/cache/simple/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index a2907174bf..524c077f46 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood +trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index 08cabc87be..75759db680 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,6 +3,9 @@ Local = true RecordRequests = true +# Enable engine-specific output files +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" + # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 1f8a705e05..9de2133633 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -240,6 +240,21 @@ func (db *DeploymentState) Finalize() error { return nil } +// Close closes the WAL file handle without finalizing or truncating. +// Use this in tests or when you need to abort without saving state. 
+func (db *DeploymentState) Close() error { + db.mu.Lock() + defer db.mu.Unlock() + + if db.wal != nil { + if err := db.wal.close(); err != nil { + return err + } + db.wal = nil + } + return nil +} + func (db *DeploymentState) AssertOpened() { if db.Path == "" { panic("internal error: DeploymentState must be opened first") diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index 9c2250c830..fb4cab1a19 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -337,6 +337,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { var db DeploymentState err := db.Open(ctx, statePath) require.NoError(t, err) + t.Cleanup(func() { db.Close() }) dependsOn := []deployplan.DependsOnEntry{ {Node: "resources.clusters.cluster1", Label: "${resources.clusters.cluster1.id}"}, From 51f19743b386700a046b9ba8f2471e8586914e1e Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 2 Feb 2026 20:00:58 +0530 Subject: [PATCH 04/85] test fixes Signed-off-by: Varun Deep Saini --- acceptance/bundle/artifacts/shell/bash/output.txt | 1 - acceptance/bundle/artifacts/shell/basic/output.txt | 1 - acceptance/bundle/artifacts/shell/default/output.txt | 1 - acceptance/bundle/artifacts/shell/sh/output.txt | 1 - acceptance/bundle/deploy/wal/empty-wal/test.toml | 2 +- 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index 8b13789179..e69de29bb2 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index 8b13789179..e69de29bb2 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/artifacts/shell/default/output.txt 
b/acceptance/bundle/artifacts/shell/default/output.txt index 8b13789179..e69de29bb2 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 8b13789179..e69de29bb2 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1 +0,0 @@ - diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.toml b/acceptance/bundle/deploy/wal/empty-wal/test.toml index b97264c2be..2624bdcd68 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/test.toml +++ b/acceptance/bundle/deploy/wal/empty-wal/test.toml @@ -9,5 +9,5 @@ Pattern = "GET /api/2.2/jobs/get" Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' [[Repls]] -Old = '-rw[^ ]+ \d+ [^ ]+ [^ ]+ \d+ [A-Z][a-z]+ \d+ \d+:\d+' +Old = '-rw[^\s]+\s+\d+\s+[^\s]+\s+[^\s]+\s+\d+\s+[A-Z][a-z]+\s+\d+\s+\d+:\d+' New = '[FILE_INFO]' From 36ff7c41900b4c29a099450f3a5071949ea27f3a Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Sat, 7 Feb 2026 20:08:19 +0530 Subject: [PATCH 05/85] Fixes Signed-off-by: Varun Deep Saini --- .../artifacts/build_and_files_whl/test.toml | 2 - .../deploy/wal/chain-10-jobs/output.txt | 2 + .../deploy/wal/corrupted-wal-entry/output.txt | 8 +- .../deploy/wal/corrupted-wal-entry/script | 11 +- .../wal/corrupted-wal-middle/output.txt | 15 +- .../deploy/wal/corrupted-wal-middle/script | 18 +- .../deploy/wal/corrupted-wal-middle/test.toml | 6 +- .../deploy/wal/crash-after-create/output.txt | 2 + .../bundle/deploy/wal/empty-wal/output.txt | 4 +- acceptance/bundle/deploy/wal/empty-wal/script | 7 + .../bundle/deploy/wal/empty-wal/test.toml | 2 +- .../deploy/wal/multiple-crashes/output.txt | 3 +- .../deploy/wal/summary-after-crash/output.txt | 2 + .../deploy/wal/wal-with-delete/output.txt | 2 + bundle/direct/dstate/state.go | 55 ++- 
bundle/direct/dstate/wal.go | 324 +++++++++++++----- bundle/direct/dstate/wal_test.go | 120 +++++-- 17 files changed, 424 insertions(+), 159 deletions(-) diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index a93d901b68..e69de29bb2 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1,2 +0,0 @@ -RecordRequests = false -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt index 4c4d781c80..d391548fa8 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -23,6 +23,8 @@ Exit code: [KILLED] 9 === Bundle summary (reads from WAL) === +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 9 entries from WAL file. Name: wal-chain-test Target: default Workspace: diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index f5e7f346d8..f7ebf7bfd2 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -4,11 +4,15 @@ {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} -not valid json - corrupted last line (partial write from crash) +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- === Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... 
+Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted +Recovered 2 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! @@ -20,5 +24,7 @@ Deployment complete! "resources.jobs.valid_job" ] } +=== Corrupted WAL entries file === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- === WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index fc36ed754f..dde17995da 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -11,13 +11,13 @@ cat > .databricks/bundle/default/resources.json << 'EOF' EOF echo "=== Creating WAL with corrupted LAST entry ===" -# Corrupted last line is expected (partial write from crash) and should be skipped. +# Corrupted last line is expected (truncated JSON from crash) and should be skipped. # Valid entries before it should be recovered. 
cat > .databricks/bundle/default/resources.json.wal << 'EOF' {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} -not valid json - corrupted last line (partial write from crash) +{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- EOF echo "=== WAL content ===" @@ -29,6 +29,13 @@ trace $CLI bundle deploy 2>&1 echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' +echo "=== Corrupted WAL entries file ===" +if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then + cat .databricks/bundle/default/resources.json.wal.corrupted +else + echo "Missing corrupted WAL entries file (unexpected)" +fi + echo "=== WAL after successful deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then echo "WAL exists (unexpected)" diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt index 4396aade67..bf9236c1f9 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -3,17 +3,20 @@ === WAL content === {"lineage":"test-lineage-456","serial": [SERIAL]} {"k":"resources.jobs.job_one","v":{"__id__": "[ID]","state":{"name":"job-one"}}} -not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- {"k":"resources.jobs.job_two","v":{"__id__": "[ID]","state":{"name":"job-two"}}} -=== Deploy (WAL should be deleted due to middle corruption) === +=== Deploy (should recover valid entries and skip corrupted line) === >>> [CLI] bundle deploy Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... -Warn: Failed to read WAL file, deleting and proceeding: WAL line 3: corrupted entry in middle of WAL: invalid character 'o' in literal null (expecting 'u') +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted +Recovered 2 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! -=== Final state (fresh deploy, not recovered from WAL) === +=== Final state (should have recovered entries) === { "serial": [SERIAL], "state_keys": [ @@ -21,5 +24,7 @@ Deployment complete! "resources.jobs.job_two" ] } +=== Corrupted WAL entries file === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- === WAL after deploy === -WAL deleted (expected - due to middle corruption) +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script index 46dc1922d1..6307d7fbf7 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script @@ -11,27 +11,33 @@ cat > .databricks/bundle/default/resources.json << 'EOF' EOF echo "=== Creating WAL with corrupted MIDDLE entry ===" -# Corruption in the middle is NOT expected (only last line can be partial write). -# This should cause WAL to be deleted entirely, no recovery. +# Corrupted middle line is expected (truncated JSON from crash) and should be skipped. 
cat > .databricks/bundle/default/resources.json.wal << 'EOF' {"lineage":"test-lineage-456","serial":6} {"k":"resources.jobs.job_one","v":{"__id__":"1111","state":{"name":"job-one"}}} -not valid json - CORRUPTED MIDDLE LINE +{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- {"k":"resources.jobs.job_two","v":{"__id__":"2222","state":{"name":"job-two"}}} EOF echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal -echo "=== Deploy (WAL should be deleted due to middle corruption) ===" +echo "=== Deploy (should recover valid entries and skip corrupted line) ===" trace $CLI bundle deploy 2>&1 -echo "=== Final state (fresh deploy, not recovered from WAL) ===" +echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' +echo "=== Corrupted WAL entries file ===" +if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then + cat .databricks/bundle/default/resources.json.wal.corrupted +else + echo "Missing corrupted WAL entries file (unexpected)" +fi + echo "=== WAL after deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then echo "WAL exists (unexpected)" else - echo "WAL deleted (expected - due to middle corruption)" + echo "WAL deleted (expected)" fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml index ec6fa7b3f4..d5f0b1bbb6 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml @@ -1,8 +1,6 @@ -# WAL with corrupted MIDDLE entry - WAL should be deleted, no recovery. -# Corruption in the middle is unexpected (not a partial write from crash). -# The entire WAL is discarded and a fresh deploy happens. +# WAL with corrupted MIDDLE entry - valid entries are recovered and corrupted entries are skipped. 
-# Since WAL is discarded, jobs will be created fresh (not recovered) +# Since valid entries are recovered, jobs will be updated (not created fresh). [[Server]] Pattern = "POST /api/2.2/jobs/create" Response.Body = '{"job_id": 9999}' diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 9ab9f4cf9c..e32c251ae4 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -19,6 +19,8 @@ WAL exists (expected) >>> [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 21b6851080..e8e1553df7 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -6,12 +6,14 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... -Warn: Failed to read WAL file, deleting and proceeding: WAL file is empty +Warn: Failed to read WAL file, moved it to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted and proceeding: WAL file is empty Deploying resources... Updating deployment state... Deployment complete! 
=== Checking WAL file after deploy === Empty WAL deleted (expected) +=== Corrupted WAL file === +[FILE_INFO] .databricks/bundle/default/resources.json.wal.corrupted === State file content === { "lineage": "[UUID]", diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script index f693753ac7..2c66d213aa 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/script +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -17,5 +17,12 @@ else echo "Empty WAL deleted (expected)" fi +echo "=== Corrupted WAL file ===" +if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then + ls -la .databricks/bundle/default/resources.json.wal.corrupted +else + echo "Corrupted WAL file missing (unexpected)" +fi + echo "=== State file content ===" cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/test.toml b/acceptance/bundle/deploy/wal/empty-wal/test.toml index 2624bdcd68..ad64cd6e74 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/test.toml +++ b/acceptance/bundle/deploy/wal/empty-wal/test.toml @@ -1,4 +1,4 @@ -# Empty WAL file should be deleted and deploy should proceed normally. +# Empty WAL file should be moved to .wal.corrupted and deploy should proceed normally. [[Server]] Pattern = "POST /api/2.2/jobs/create" diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt index 33dd984b74..e31643106b 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -14,12 +14,13 @@ WAL exists >>> errcode [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. 
Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] === WAL after second crash === -WAL still exists === Third deploy (should succeed) === >>> [CLI] bundle deploy --force-lock diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt index 9a2644a60b..3f5747ab21 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -23,6 +23,8 @@ WAL exists (expected) === Bundle summary (should show job_a from WAL) === >>> [CLI] bundle summary -o json +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. { "job_a_id": "1001", "job_b_id": null diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index 8f52732d3e..f686ac4836 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -9,6 +9,8 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default/files... +Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal +Recovered 1 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! 
diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 9de2133633..a54da010f1 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -14,7 +14,6 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/internal/build" - "github.com/databricks/cli/libs/log" "github.com/google/uuid" ) @@ -75,7 +74,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } - if err := db.wal.writeEntry(key, &entry); err != nil { + if err := db.wal.writeJSON(WALEntry{K: key, V: &entry}); err != nil { return fmt.Errorf("failed to write WAL entry: %w", err) } @@ -96,7 +95,7 @@ func (db *DeploymentState) DeleteState(key string) error { if err := db.ensureWALOpen(); err != nil { return fmt.Errorf("failed to open WAL: %w", err) } - if err := db.wal.writeEntry(key, nil); err != nil { + if err := db.wal.writeJSON(WALEntry{K: key}); err != nil { return fmt.Errorf("failed to write WAL entry: %w", err) } @@ -126,7 +125,7 @@ func (db *DeploymentState) ensureWALOpen() error { // WAL serial is the NEXT serial (current + 1) walSerial := db.Data.Serial + 1 - if err := wal.writeHeader(lineage, walSerial); err != nil { + if err := wal.writeJSON(WALHeader{Lineage: lineage, Serial: walSerial}); err != nil { wal.close() return err } @@ -198,7 +197,12 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { return fmt.Errorf("WAL recovery failed: %w", err) } if recovered { - log.Infof(ctx, "Recovered deployment state from WAL") + if err := db.unlockedSave(); err != nil { + return err + } + if err := cleanupWAL(path); err != nil { + return err + } db.recoveredFromWAL = true } @@ -212,28 +216,43 @@ func (db *DeploymentState) Finalize() error { db.mu.Lock() defer db.mu.Unlock() - // Generate lineage on first save (if WAL wasn't 
opened) + hadOpenWAL := db.wal != nil + if hadOpenWAL { + if err := db.wal.close(); err != nil { + return err + } + db.wal = nil + + replayResult, err := replayWAL(db.Path, &db.Data) + if err != nil { + return fmt.Errorf("failed to replay WAL during finalize: %w", err) + } + if !replayResult.recovered { + return errors.New("failed to replay WAL during finalize: WAL file not found or stale") + } + if len(replayResult.corruptedEntries) > 0 { + first := replayResult.corruptedEntries[0] + return fmt.Errorf("failed to replay WAL during finalize: corrupted entry at line %d: %v", first.lineNumber, first.parseErr) + } + } + + if db.Data.Lineage == "" && !hadOpenWAL && len(db.Data.State) == 0 { + return nil + } + if db.Data.Lineage == "" { db.Data.Lineage = uuid.New().String() } db.Data.Serial++ - err := db.unlockedSave() - if err != nil { + if err := db.unlockedSave(); err != nil { return err } - if db.wal != nil { - if err := db.wal.truncate(); err != nil { - return fmt.Errorf("failed to truncate WAL: %w", err) - } - db.wal = nil - } else { - // No WAL was opened, but we should still clean up any stale WAL file - wp := walPath(db.Path) - if err := os.Remove(wp); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove stale WAL file: %w", err) + if hadOpenWAL { + if err := cleanupWAL(db.Path); err != nil { + return err } } diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 37dd1bffa2..494c181833 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -2,12 +2,16 @@ package dstate import ( "bufio" + "bytes" "context" "encoding/json" "errors" "fmt" "os" + "path/filepath" + "strings" + "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" ) @@ -22,37 +26,40 @@ type WALEntry struct { } type WAL struct { - path string file *os.File } +type corruptedWALEntry struct { + lineNumber int + rawLine string + parseErr error +} + +type walReplayResult struct { + hasWAL bool + recovered bool + stale 
bool + entriesRecovered int + corruptedEntries []corruptedWALEntry +} + +var errWALRead = errors.New("wal read error") + func walPath(statePath string) string { return statePath + ".wal" } +func walCorruptedPath(statePath string) string { + return walPath(statePath) + ".corrupted" +} + func openWAL(statePath string) (*WAL, error) { wp := walPath(statePath) - f, err := os.OpenFile(wp, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o600) + f, err := os.OpenFile(wp, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return nil, fmt.Errorf("failed to open WAL file %q: %w", wp, err) } - return &WAL{path: wp, file: f}, nil -} - -func (w *WAL) writeHeader(lineage string, serial int) error { - header := WALHeader{ - Lineage: lineage, - Serial: serial, - } - return w.writeJSON(header) -} - -func (w *WAL) writeEntry(key string, entry *ResourceEntry) error { - walEntry := WALEntry{ - K: key, - V: entry, - } - return w.writeJSON(walEntry) + return &WAL{file: f}, nil } func (w *WAL) writeJSON(v any) error { @@ -67,6 +74,10 @@ func (w *WAL) writeJSON(v any) error { return fmt.Errorf("failed to write WAL entry: %w", err) } + if err := w.file.Sync(); err != nil { + return fmt.Errorf("failed to sync WAL entry: %w", err) + } + return nil } @@ -77,122 +88,267 @@ func (w *WAL) close() error { return nil } -func (w *WAL) truncate() error { - if w.file != nil { - w.file.Close() - w.file = nil - } - err := os.Remove(w.path) +func cleanupWAL(statePath string) error { + err := os.Remove(walPath(statePath)) if err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove WAL file %q: %w", w.path, err) + return fmt.Errorf("failed to remove WAL file %q: %w", walPath(statePath), err) + } + return nil +} + +func moveWALToCorrupted(statePath string) error { + source := walPath(statePath) + target := walCorruptedPath(statePath) + _ = os.Remove(target) + if err := os.Rename(source, target); err != nil { + return fmt.Errorf("failed to move WAL file %q to %q: %w", source, target, err) + } 
+ return nil +} + +func writeCorruptedWALEntries(statePath string, corrupted []corruptedWALEntry) error { + if len(corrupted) == 0 { + return nil + } + + target := walCorruptedPath(statePath) + f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) + if err != nil { + return fmt.Errorf("failed to create corrupted WAL file %q: %w", target, err) + } + defer f.Close() + + for _, entry := range corrupted { + if _, err := f.WriteString(entry.rawLine + "\n"); err != nil { + return fmt.Errorf("failed to write corrupted WAL file %q: %w", target, err) + } + } + + if err := f.Sync(); err != nil { + return fmt.Errorf("failed to sync corrupted WAL file %q: %w", target, err) } + return nil } -func readWAL(ctx context.Context, statePath string) (*WALHeader, []WALEntry, error) { +func readWAL(statePath string) (*WALHeader, []WALEntry, []corruptedWALEntry, error) { wp := walPath(statePath) f, err := os.Open(wp) if err != nil { - return nil, nil, err + return nil, nil, nil, err } defer f.Close() scanner := bufio.NewScanner(f) - var lines [][]byte + scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) + var header *WALHeader + var entries []WALEntry + var corrupted []corruptedWALEntry + lineNumber := 0 for scanner.Scan() { - line := scanner.Bytes() + lineNumber++ + line := bytes.TrimSpace(scanner.Bytes()) if len(line) == 0 { continue } + lineCopy := make([]byte, len(line)) copy(lineCopy, line) - lines = append(lines, lineCopy) + if header == nil { + var h WALHeader + if err := json.Unmarshal(lineCopy, &h); err != nil { + return nil, nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) + } + header = &h + continue + } + + var e WALEntry + if err := json.Unmarshal(lineCopy, &e); err != nil { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + parseErr: err, + }) + continue + } + + if e.K == "" { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + 
parseErr: errors.New("entry has empty key"), + }) + continue + } + + entries = append(entries, e) } + if err := scanner.Err(); err != nil { - return nil, nil, fmt.Errorf("failed to read WAL file: %w", err) + return nil, nil, nil, fmt.Errorf("failed to read WAL file: %w", err) } - if len(lines) == 0 { - return nil, nil, errors.New("WAL file is empty") + if header == nil { + return nil, nil, nil, errors.New("WAL file is empty") } - var header WALHeader - if err := json.Unmarshal(lines[0], &header); err != nil { - return nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) + return header, entries, corrupted, nil +} + +func replayWAL(statePath string, db *Database) (walReplayResult, error) { + result := walReplayResult{} + wp := walPath(statePath) + + if _, err := os.Stat(wp); os.IsNotExist(err) { + return result, nil } + result.hasWAL = true - var entries []WALEntry - for i := 1; i < len(lines); i++ { - lineNum := i + 1 - isLastLine := i == len(lines)-1 + f, err := os.Open(wp) + if err != nil { + return result, fmt.Errorf("%w: %v", errWALRead, err) + } + defer f.Close() - var e WALEntry - if err := json.Unmarshal(lines[i], &e); err != nil { - if isLastLine { - log.Debugf(ctx, "WAL line %d: skipping corrupted last entry: %v", lineNum, err) - continue - } - return nil, nil, fmt.Errorf("WAL line %d: corrupted entry in middle of WAL: %w", lineNum, err) + scanner := bufio.NewScanner(f) + scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) + var header *WALHeader + lineNumber := 0 + var corrupted []corruptedWALEntry + for scanner.Scan() { + lineNumber++ + line := bytes.TrimSpace(scanner.Bytes()) + if len(line) == 0 { + continue } - if e.K == "" { - if isLastLine { - log.Debugf(ctx, "WAL line %d: skipping last entry with empty key", lineNum) - continue + lineCopy := make([]byte, len(line)) + copy(lineCopy, line) + if header == nil { + var h WALHeader + if err := json.Unmarshal(lineCopy, &h); err != nil { + return result, fmt.Errorf("%w: failed to parse WAL header: 
%w", errWALRead, err) + } + header = &h + + expectedSerial := db.Serial + 1 + if header.Serial < expectedSerial { + result.stale = true + return result, nil + } + + if header.Serial > expectedSerial { + return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + } + + if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { + return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + } + + if db.Lineage == "" && header.Lineage != "" { + db.Lineage = header.Lineage + } + + if db.State == nil { + db.State = make(map[string]ResourceEntry) } - return nil, nil, fmt.Errorf("WAL line %d: entry with empty key in middle of WAL", lineNum) + continue } - entries = append(entries, e) + var entry WALEntry + if err := json.Unmarshal(lineCopy, &entry); err != nil { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + parseErr: err, + }) + continue + } + + if entry.K == "" { + corrupted = append(corrupted, corruptedWALEntry{ + lineNumber: lineNumber, + rawLine: string(lineCopy), + parseErr: errors.New("entry has empty key"), + }) + continue + } + + if entry.V != nil { + db.State[entry.K] = *entry.V + } else { + delete(db.State, entry.K) + } + result.entriesRecovered++ + } + + if err := scanner.Err(); err != nil { + return result, fmt.Errorf("%w: failed to read WAL file: %w", errWALRead, err) + } + + if header == nil { + return result, fmt.Errorf("%w: WAL file is empty", errWALRead) } - return &header, entries, nil + result.recovered = true + result.corruptedEntries = corrupted + return result, nil } func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, error) { - wp := walPath(statePath) - - if _, err := os.Stat(wp); os.IsNotExist(err) { - return false, nil + replayResult, err := replayWAL(statePath, db) + if err != nil { + if errors.Is(err, errWALRead) { + if 
moveErr := moveWALToCorrupted(statePath); moveErr != nil { + return false, moveErr + } + log.Warnf(ctx, "Failed to read WAL file, moved it to %s and proceeding: %s", relativePathForLog(walCorruptedPath(statePath)), strings.TrimPrefix(err.Error(), errWALRead.Error()+": ")) + return false, nil + } + return false, err } - header, entries, err := readWAL(ctx, statePath) - if err != nil { - log.Warnf(ctx, "Failed to read WAL file, deleting and proceeding: %v", err) - os.Remove(wp) + if replayResult.stale { + log.Debugf(ctx, "Deleting stale WAL (serial behind current state)") + if err := cleanupWAL(statePath); err != nil { + return false, err + } return false, nil } - expectedSerial := db.Serial + 1 - if header.Serial < expectedSerial { - log.Debugf(ctx, "Deleting stale WAL (serial %d < expected %d)", header.Serial, expectedSerial) - os.Remove(wp) + if !replayResult.recovered { return false, nil } - if header.Serial > expectedSerial { - return false, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + logRecoveryProgress(ctx, fmt.Sprintf("Recovering state from WAL file: %s", relativePathForLog(walPath(statePath)))) + walLogPath := relativePathForLog(walPath(statePath)) + for _, corrupted := range replayResult.corruptedEntries { + log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d: %s: %v", walLogPath, corrupted.lineNumber, corrupted.rawLine, corrupted.parseErr) } - if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { - return false, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + if err := writeCorruptedWALEntries(statePath, replayResult.corruptedEntries); err != nil { + return false, err } - - if db.Lineage == "" && header.Lineage != "" { - db.Lineage = header.Lineage + if len(replayResult.corruptedEntries) > 0 { + log.Warnf(ctx, "Saved corrupted WAL entries to %s", relativePathForLog(walCorruptedPath(statePath))) } - if 
db.State == nil { - db.State = make(map[string]ResourceEntry) - } + logRecoveryProgress(ctx, fmt.Sprintf("Recovered %d entries from WAL file.", replayResult.entriesRecovered)) + return true, nil +} - for _, entry := range entries { - if entry.V != nil { - db.State[entry.K] = *entry.V - } else { - delete(db.State, entry.K) - } +func relativePathForLog(path string) string { + rel, err := filepath.Rel(".", path) + if err != nil { + return path } + return filepath.ToSlash(rel) +} - return true, nil +func logRecoveryProgress(ctx context.Context, message string) { + defer func() { + _ = recover() + }() + cmdio.LogString(ctx, message) } diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index fb4cab1a19..9d4533eba7 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -12,10 +12,6 @@ import ( "github.com/stretchr/testify/require" ) -func TestWALPath(t *testing.T) { - assert.Equal(t, "/path/to/state.json.wal", walPath("/path/to/state.json")) -} - func TestWALWriteAndRead(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -23,31 +19,30 @@ func TestWALWriteAndRead(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) entry1 := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), } - err = wal.writeEntry("resources.jobs.job1", entry1) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry1}) require.NoError(t, err) entry2 := &ResourceEntry{ ID: "67890", State: json.RawMessage(`{"name":"job2"}`), } - err = wal.writeEntry("resources.jobs.job2", entry2) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job2", V: entry2}) require.NoError(t, err) - err = wal.writeEntry("resources.jobs.old_job", nil) + err = wal.writeJSON(WALEntry{K: "resources.jobs.old_job", V: nil}) require.NoError(t, err) err = 
wal.close() require.NoError(t, err) - ctx := context.Background() - header, entries, err := readWAL(ctx, statePath) + header, entries, _, err := readWAL(statePath) require.NoError(t, err) assert.Equal(t, "test-lineage", header.Lineage) @@ -67,26 +62,41 @@ func TestWALWriteAndRead(t *testing.T) { assert.Nil(t, entries[2].V) } -func TestWALTruncate(t *testing.T) { +func TestCleanupWAL(t *testing.T) { dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) _, err = os.Stat(walFilePath) require.NoError(t, err) - err = wal.truncate() + err = wal.close() + require.NoError(t, err) + err = cleanupWAL(statePath) require.NoError(t, err) _, err = os.Stat(walFilePath) assert.True(t, os.IsNotExist(err)) } +func TestOpenWALFailsIfFileAlreadyExists(t *testing.T) { + dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + + wal, err := openWAL(statePath) + require.NoError(t, err) + require.NoError(t, wal.close()) + + _, err = openWAL(statePath) + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to open WAL file") +} + func TestRecoverFromWAL_NoWAL(t *testing.T) { ctx := context.Background() dir := t.TempDir() @@ -105,14 +115,14 @@ func TestRecoverFromWAL_ValidWAL(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) entry := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), } - err = wal.writeEntry("resources.jobs.job1", entry) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -136,7 +146,7 @@ func TestRecoverFromWAL_StaleWAL(t *testing.T) { wal, err := 
openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -158,7 +168,7 @@ func TestRecoverFromWAL_FutureWAL(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 5) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 5}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -177,7 +187,7 @@ func TestRecoverFromWAL_LineageMismatch(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("lineage-A", 1) + err = wal.writeJSON(WALHeader{Lineage: "lineage-A", Serial: 1}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -196,17 +206,17 @@ func TestRecoverFromWAL_DeleteOperation(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) entry := &ResourceEntry{ ID: "12345", State: json.RawMessage(`{"name":"job1"}`), } - err = wal.writeEntry("resources.jobs.job1", entry) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) require.NoError(t, err) - err = wal.writeEntry("resources.jobs.job1", nil) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: nil}) require.NoError(t, err) err = wal.close() @@ -237,7 +247,7 @@ func TestDeploymentState_WALIntegration(t *testing.T) { _, err = os.Stat(walFilePath) require.NoError(t, err) - header, entries, err := readWAL(ctx, statePath) + header, entries, _, err := readWAL(statePath) require.NoError(t, err) assert.Equal(t, 1, header.Serial) require.Len(t, entries, 1) @@ -276,13 +286,13 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 6) + err = 
wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 6}) require.NoError(t, err) entry := &ResourceEntry{ ID: "new-id", State: json.RawMessage(`{"name":"new"}`), } - err = wal.writeEntry("resources.jobs.new", entry) + err = wal.writeJSON(WALEntry{K: "resources.jobs.new", V: entry}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -311,7 +321,7 @@ func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { err = db.DeleteState("resources.jobs.job1") require.NoError(t, err) - _, entries, err := readWAL(ctx, statePath) + _, entries, _, err := readWAL(statePath) require.NoError(t, err) require.Len(t, entries, 2) @@ -346,7 +356,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) require.NoError(t, err) - _, entries, err := readWAL(ctx, statePath) + _, entries, _, err := readWAL(statePath) require.NoError(t, err) require.Len(t, entries, 1) @@ -372,11 +382,19 @@ not valid json db := NewDatabase("", 0) recovered, err := recoverFromWAL(ctx, statePath, &db) require.NoError(t, err) - assert.False(t, recovered) - assert.Empty(t, db.State) + assert.True(t, recovered) + assert.Len(t, db.State, 2) + assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) + assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) + corruptedPath := walCorruptedPath(statePath) + _, err = os.Stat(corruptedPath) + require.NoError(t, err) + contentBytes, err := os.ReadFile(corruptedPath) + require.NoError(t, err) + assert.Equal(t, "not valid json\n", string(contentBytes)) _, err = os.Stat(walFilePath) - assert.True(t, os.IsNotExist(err)) + require.NoError(t, err) } func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { @@ -402,6 +420,13 @@ not valid json assert.Contains(t, db.State, "resources.jobs.job2") assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) + + corruptedPath := 
walCorruptedPath(statePath) + _, err = os.Stat(corruptedPath) + require.NoError(t, err) + contentBytes, err := os.ReadFile(corruptedPath) + require.NoError(t, err) + assert.Equal(t, "not valid json\n", string(contentBytes)) } func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { @@ -417,9 +442,9 @@ func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { wal, err := openWAL(statePath) require.NoError(t, err) - err = wal.writeHeader("test-lineage", 1) + err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) require.NoError(t, err) - err = wal.writeEntry("resources.jobs.job1", &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}) + err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}}) require.NoError(t, err) err = wal.close() require.NoError(t, err) @@ -451,7 +476,6 @@ func TestRecoverFromWAL_LineageAdoption(t *testing.T) { } func TestReadWAL_EmptyFile(t *testing.T) { - ctx := context.Background() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -459,7 +483,7 @@ func TestReadWAL_EmptyFile(t *testing.T) { err := os.WriteFile(walFilePath, []byte(""), 0o600) require.NoError(t, err) - _, _, err = readWAL(ctx, statePath) + _, _, _, err = readWAL(statePath) assert.Error(t, err) assert.Contains(t, err.Error(), "empty") } @@ -482,7 +506,7 @@ func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { err = db.SaveState("resources.jobs.job1", "222", map[string]string{"v": "2"}, nil) require.NoError(t, err) - _, entries, err := readWAL(ctx, statePath) + _, entries, _, err := readWAL(statePath) require.NoError(t, err) require.Len(t, entries, 3) assert.Equal(t, "111", entries[0].V.ID) @@ -496,3 +520,31 @@ func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { require.True(t, ok) assert.Equal(t, "222", entry.ID) } + +func TestDeploymentState_FinalizeFailsOnCorruptedWAL(t *testing.T) { + ctx := context.Background() + 
dir := t.TempDir() + statePath := filepath.Join(dir, "resources.json") + walFilePath := walPath(statePath) + + var db DeploymentState + err := db.Open(ctx, statePath) + require.NoError(t, err) + + err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) + require.NoError(t, err) + + f, err := os.OpenFile(walFilePath, os.O_WRONLY|os.O_APPEND, 0) + require.NoError(t, err) + _, err = f.WriteString("{\"k\":\"resources.jobs.partial_write\",\"v\":{\"__id__\":\"999\",\"state\":{\"name\":\"partial-\n") + require.NoError(t, err) + require.NoError(t, f.Sync()) + require.NoError(t, f.Close()) + + err = db.Finalize() + require.Error(t, err) + assert.Contains(t, err.Error(), "failed to replay WAL during finalize: corrupted entry at line") + + _, err = os.Stat(walFilePath) + require.NoError(t, err) +} From 338ae0edab8e2094f738c8ffca08e08e5472c5e8 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 9 Feb 2026 23:51:25 +0530 Subject: [PATCH 06/85] fixed tests Signed-off-by: Varun Deep Saini --- .../bundle/artifacts/build_and_files_whl/test.toml | 1 + .../bundle/artifacts/shell/cmd/out.deploy.direct.txt | 7 +++++++ .../bundle/artifacts/shell/cmd/out.deploy.terraform.txt | 6 ++++++ acceptance/bundle/artifacts/shell/cmd/output.txt | 6 ------ acceptance/bundle/artifacts/shell/cmd/script | 2 +- acceptance/bundle/deploy/wal/test.toml | 9 +++++++++ bundle/direct/dstate/wal.go | 2 +- 7 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index e69de29bb2..a030353d57 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -0,0 +1 @@ +RecordRequests = false diff --git 
a/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt new file mode 100644 index 0000000000..e034bae7db --- /dev/null +++ b/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt new file mode 100644 index 0000000000..8ebed9f66d --- /dev/null +++ b/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/output.txt b/acceptance/bundle/artifacts/shell/cmd/output.txt index 8ebed9f66d..e69de29bb2 100644 --- a/acceptance/bundle/artifacts/shell/cmd/output.txt +++ b/acceptance/bundle/artifacts/shell/cmd/output.txt @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/cmd/script b/acceptance/bundle/artifacts/shell/cmd/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/cmd/script +++ b/acceptance/bundle/artifacts/shell/cmd/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 1632ddb195..df700645f7 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -22,6 +22,15 @@ Exit code:""" Old = 'Exit code: (137|1)' New = 'Exit code: [KILLED]' +# On Windows, no bash "Killed" message appears when CLI has produced output before termination. +# Insert [PROCESS_KILLED] between last output line and exit code for consistency. +[[Repls]] +Old = '(Deploying resources\.\.\.)\n\nExit code: \[KILLED\]' +New = """${1} +[PROCESS_KILLED] + +Exit code: [KILLED]""" + [[Repls]] Old = "\r" New = '' diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 494c181833..cd422c37df 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -321,7 +321,7 @@ func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, return false, nil } - logRecoveryProgress(ctx, fmt.Sprintf("Recovering state from WAL file: %s", relativePathForLog(walPath(statePath)))) + logRecoveryProgress(ctx, "Recovering state from WAL file: "+relativePathForLog(walPath(statePath))) walLogPath := relativePathForLog(walPath(statePath)) for _, corrupted := range replayResult.corruptedEntries { log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d: %s: %v", walLogPath, corrupted.lineNumber, corrupted.rawLine, corrupted.parseErr) From ebb16ae8f0a2e1e060d6d78c6772826b382903a1 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Tue, 24 Mar 2026 23:50:01 +0530 Subject: [PATCH 07/85] updated tests Signed-off-by: Varun Deep Saini --- 
.../test.toml | 1 - .../shell/bash/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 .../bundle/artifacts/shell/basic/script | 2 +- .../artifacts/shell/cmd/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 acceptance/bundle/artifacts/shell/cmd/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 - ...ut.deploy.terraform.txt => out.deploy.txt} | 0 acceptance/bundle/artifacts/shell/sh/script | 2 +- acceptance/bundle/artifacts/shell/test.toml | 4 + .../deploy/wal/chain-10-jobs/output.txt | 359 +++++++++++++++++- .../bundle/deploy/wal/chain-10-jobs/script | 2 +- 19 files changed, 359 insertions(+), 52 deletions(-) delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/bash/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/basic/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/cmd/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/default/{out.deploy.terraform.txt => out.deploy.txt} (100%) delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt rename acceptance/bundle/artifacts/shell/sh/{out.deploy.terraform.txt => out.deploy.txt} (100%) diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml 
b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index b6c55dac31..a0a680e9d1 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,5 +1,4 @@ RecordRequests = true -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt deleted file mode 100644 index f311959abd..0000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/bash/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 09bb41643c..eae0837850 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt deleted file mode 100644 index 3a4ff9138b..0000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/basic/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 09bb41643c..eae0837850 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt deleted file mode 100644 index e034bae7db..0000000000 --- a/acceptance/bundle/artifacts/shell/cmd/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/cmd/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/cmd/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/cmd/script b/acceptance/bundle/artifacts/shell/cmd/script index 09bb41643c..eae0837850 100644 --- a/acceptance/bundle/artifacts/shell/cmd/script +++ b/acceptance/bundle/artifacts/shell/cmd/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt deleted file mode 100644 index f311959abd..0000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/default/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 09bb41643c..eae0837850 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt deleted file mode 100644 index 98820986f5..0000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.txt similarity index 100% rename from acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt rename to acceptance/bundle/artifacts/shell/sh/out.deploy.txt diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 09bb41643c..eae0837850 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy > out.deploy.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/test.toml b/acceptance/bundle/artifacts/shell/test.toml index 9796804e9a..df72afb6c8 100644 --- a/acceptance/bundle/artifacts/shell/test.toml +++ b/acceptance/bundle/artifacts/shell/test.toml @@ -1,3 +1,7 @@ Local = true Cloud = false RecordRequests = false + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt index d391548fa8..b172c4fc06 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -8,16 +8,355 @@ Deploying resources... 
Exit code: [KILLED] === WAL content after crash === -{"lineage":"[UUID]","serial": [SERIAL]} -{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}} -{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}} -{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}} -{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}} -{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}} -{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}} -{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}} -{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}} -{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 
1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}} +{ + "lineage": "[UUID]", + "serial": [SERIAL] +} +{ + "k": "resources.jobs.job_01", + "v": { + "__id__": "[ID]", + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "first in chain", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-01", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_02", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_01.id}", + "node": "resources.jobs.job_01" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-02", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + 
"k": "resources.jobs.job_03", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_02.id}", + "node": "resources.jobs.job_02" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-03", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_04", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_03.id}", + "node": "resources.jobs.job_03" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-04", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_05", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_04.id}", + "node": "resources.jobs.job_04" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + 
"format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-05", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_06", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_05.id}", + "node": "resources.jobs.job_05" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-06", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_07", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_06.id}", + "node": "resources.jobs.job_06" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-07", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] 
+ } + } +} +{ + "k": "resources.jobs.job_08", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_07.id}", + "node": "resources.jobs.job_07" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-08", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_09", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_08.id}", + "node": "resources.jobs.job_08" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-09", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} === Number of jobs saved in WAL === 9 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/script b/acceptance/bundle/deploy/wal/chain-10-jobs/script index 6cf2dd32f0..1f829232ad 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/script @@ -3,7 +3,7 @@ trace errcode $CLI bundle deploy echo "" echo "=== WAL content 
after crash ===" -cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file" +jq -S . .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file" echo "" echo "=== Number of jobs saved in WAL ===" From 184d4a496ee15f1d3f09e233189481aaa4b29a35 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Wed, 25 Mar 2026 00:18:39 +0530 Subject: [PATCH 08/85] dedup Signed-off-by: Varun Deep Saini --- .../out.deploy.direct.txt | 7 -- .../out.deploy.terraform.txt | 6 - .../output.txt | 6 + .../script | 2 +- .../test.toml | 4 + .../build_and_files_whl/out.deploy.direct.txt | 8 -- .../out.deploy.terraform.txt | 7 -- .../artifacts/build_and_files_whl/output.txt | 7 ++ .../artifacts/build_and_files_whl/script | 2 +- .../artifacts/build_and_files_whl/test.toml | 4 + .../artifacts/shell/bash/out.deploy.txt | 6 - .../bundle/artifacts/shell/bash/output.txt | 6 + acceptance/bundle/artifacts/shell/bash/script | 2 +- .../artifacts/shell/basic/out.deploy.txt | 6 - .../bundle/artifacts/shell/basic/output.txt | 6 + .../bundle/artifacts/shell/basic/script | 2 +- .../bundle/artifacts/shell/cmd/out.deploy.txt | 6 - .../bundle/artifacts/shell/cmd/output.txt | 6 + acceptance/bundle/artifacts/shell/cmd/script | 2 +- .../artifacts/shell/default/out.deploy.txt | 6 - .../bundle/artifacts/shell/default/output.txt | 6 + .../bundle/artifacts/shell/default/script | 2 +- .../bundle/artifacts/shell/sh/out.deploy.txt | 6 - .../bundle/artifacts/shell/sh/output.txt | 6 + acceptance/bundle/artifacts/shell/sh/script | 2 +- .../deploy/empty-bundle/out.deploy.direct.txt | 6 - .../empty-bundle/out.deploy.terraform.txt | 5 - .../bundle/deploy/empty-bundle/output.txt | 5 + acceptance/bundle/deploy/empty-bundle/script | 2 +- .../bundle/deploy/empty-bundle/test.toml | 3 + .../deploy/wal/future-serial-wal/test.toml | 4 - .../deploy/wal/lineage-mismatch/test.toml | 4 - .../deploy/wal/wal-with-delete/test.toml | 7 -- .../bundle/scripts/out.deploy.direct.txt | 24 ---- 
.../bundle/scripts/out.deploy.terraform.txt | 23 ---- acceptance/bundle/scripts/output.txt | 23 ++++ .../out.deploy.direct.txt | 18 --- .../out.deploy.terraform.txt | 17 --- .../scripts/restricted-execution/output.txt | 17 +++ .../scripts/restricted-execution/script | 2 +- .../scripts/restricted-execution/test.toml | 3 + acceptance/bundle/scripts/script | 2 +- acceptance/bundle/scripts/test.toml | 3 + .../out.deploy-one.direct.txt | 6 - .../out.deploy-one.terraform.txt | 5 - .../out.deploy-two.direct.txt | 6 - .../out.deploy-two.terraform.txt | 5 - .../deploy-artifact-path-type/output.txt | 10 ++ .../deploy-artifact-path-type/script | 4 +- .../deploy-artifact-path-type/test.toml | 4 + .../out.deploy.direct.txt | 6 - .../out.deploy.terraform.txt | 5 - .../deploy-config-file-count/output.txt | 5 + .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-config-file-count/test.toml | 3 + .../deploy-mode/out.deploy-dev.direct.txt | 6 - .../deploy-mode/out.deploy-dev.terraform.txt | 5 - .../deploy-mode/out.deploy-prod.direct.txt | 12 -- .../deploy-mode/out.deploy-prod.terraform.txt | 11 -- .../bundle/telemetry/deploy-mode/output.txt | 16 +++ .../bundle/telemetry/deploy-mode/script | 4 +- .../bundle/telemetry/deploy-mode/test.toml | 3 + .../deploy-target-count/out.deploy.direct.txt | 6 - .../out.deploy.terraform.txt | 5 - .../telemetry/deploy-target-count/output.txt | 5 + .../telemetry/deploy-target-count/script | 2 +- .../telemetry/deploy-target-count/test.toml | 3 + .../out.deploy.direct.txt | 6 - .../out.deploy.terraform.txt | 5 - .../deploy-variable-count/output.txt | 5 + .../telemetry/deploy-variable-count/script | 2 +- .../telemetry/deploy-variable-count/test.toml | 4 + .../out.deploy-one.direct.txt | 8 -- .../out.deploy-one.terraform.txt | 7 -- .../out.deploy-two.direct.txt | 8 -- .../out.deploy-two.terraform.txt | 7 -- .../telemetry/deploy-whl-artifacts/output.txt | 14 +++ .../telemetry/deploy-whl-artifacts/script | 4 +- 
.../telemetry/deploy-whl-artifacts/test.toml | 4 + .../sync_patterns/out.deploy.direct.txt | 6 - .../sync_patterns/out.deploy.terraform.txt | 5 - .../bundle/validate/sync_patterns/output.txt | 5 + .../bundle/validate/sync_patterns/script | 2 +- .../bundle/validate/sync_patterns/test.toml | 4 + acceptance/cache/simple/out.deploy.direct.txt | 6 - .../cache/simple/out.deploy.terraform.txt | 5 - acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 7 +- acceptance/internal/prepare_server.go | 3 - bundle/direct/bundle_apply.go | 1 - bundle/direct/dstate/state.go | 29 ++--- bundle/direct/dstate/wal.go | 103 ++++-------------- bundle/direct/dstate/wal_test.go | 33 +++--- 93 files changed, 264 insertions(+), 451 deletions(-) delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/cmd/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.txt delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/test.toml delete mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/test.toml delete mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/test.toml delete mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt delete mode 100644 
acceptance/bundle/scripts/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/test.toml create mode 100644 acceptance/bundle/scripts/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt delete 
mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt delete mode 100644 acceptance/cache/simple/out.deploy.direct.txt delete mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt deleted file mode 100644 index f75a5428b1..0000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt deleted file mode 100644 index 8ec9c52db6..0000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6c8bd962a5..6d24880e6c 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,4 +1,10 @@ +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! + === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index fba3a77700..883601185c 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index a0a680e9d1..8185d0df6e 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -8,6 +8,10 @@ Ignore = [ '*.whl', ] +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' + [[Server]] Pattern = "GET /api/2.1/clusters/get" Response.Body = ''' diff --git 
a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt deleted file mode 100644 index 4039d5917e..0000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt deleted file mode 100644 index 9894e5b89f..0000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index d44a21b582..b618de6b89 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,3 +7,10 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 9aa0d870e7..2d7d63f7fe 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index a030353d57..8b65645e5a 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1 +1,5 @@ RecordRequests = false + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.txt deleted file mode 100644 index fa5d7b76bc..0000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index e69de29bb2..fa5d7b76bc 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index eae0837850..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.txt deleted file mode 100644 index b5e01c79e6..0000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index e69de29bb2..b5e01c79e6 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index eae0837850..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt b/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt deleted file mode 100644 index 8ebed9f66d..0000000000 --- a/acceptance/bundle/artifacts/shell/cmd/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/output.txt b/acceptance/bundle/artifacts/shell/cmd/output.txt index e69de29bb2..8ebed9f66d 100644 --- a/acceptance/bundle/artifacts/shell/cmd/output.txt +++ b/acceptance/bundle/artifacts/shell/cmd/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-cmd/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/cmd/script b/acceptance/bundle/artifacts/shell/cmd/script index eae0837850..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/cmd/script +++ b/acceptance/bundle/artifacts/shell/cmd/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.txt deleted file mode 100644 index fa5d7b76bc..0000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index e69de29bb2..fa5d7b76bc 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index eae0837850..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.txt deleted file mode 100644 index 5117e6e9fc..0000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index e69de29bb2..5117e6e9fc 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index eae0837850..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt deleted file mode 100644 index 81dddfcb9f..0000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... 
-Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt deleted file mode 100644 index 494f76c84f..0000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 8498653a6e..919accb661 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! + >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index b74818f1b1..775ccd0def 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/empty-bundle/test.toml b/acceptance/bundle/deploy/empty-bundle/test.toml index f64800a163..84da5529dc 100644 --- a/acceptance/bundle/deploy/empty-bundle/test.toml +++ b/acceptance/bundle/deploy/empty-bundle/test.toml @@ -2,3 +2,6 @@ Cloud = true [EnvMatrix] DATABRICKS_BUNDLE_ENABLE_EXPERIMENTAL_YAML_SYNC = ["", "true"] +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff 
--git a/acceptance/bundle/deploy/wal/future-serial-wal/test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/test.toml deleted file mode 100644 index 424fe2f127..0000000000 --- a/acceptance/bundle/deploy/wal/future-serial-wal/test.toml +++ /dev/null @@ -1,4 +0,0 @@ -# WAL with serial ahead of state - indicates corruption, should error. -# State has serial=2, WAL has serial=5 (expected would be 3). - -# No server stubs needed - deploy should fail before any API calls. diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml deleted file mode 100644 index 509cc82f09..0000000000 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml +++ /dev/null @@ -1,4 +0,0 @@ -# WAL with different lineage than state - should error. -# State has lineage "state-lineage-aaa", WAL has lineage "wal-lineage-bbb". - -# No server stubs needed - deploy should fail before any API calls. diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/test.toml deleted file mode 100644 index 4f81ae4695..0000000000 --- a/acceptance/bundle/deploy/wal/wal-with-delete/test.toml +++ /dev/null @@ -1,7 +0,0 @@ -# WAL recovery after crash during delete operation (simulated). -# Delete was recorded in WAL but not finalized. Deploy should complete the delete. -# Note: Real crash testing for delete is not possible because there's no API call -# after DeleteState (unlike create which has refreshRemoteState after SaveState). 
- -# No server stubs needed - the delete was already done (recorded in WAL) -# and the job no longer needs API calls diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt b/acceptance/bundle/scripts/out.deploy.direct.txt deleted file mode 100644 index 037f609f94..0000000000 --- a/acceptance/bundle/scripts/out.deploy.direct.txt +++ /dev/null @@ -1,24 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt deleted file mode 100644 index a3d9ba342c..0000000000 --- a/acceptance/bundle/scripts/out.deploy.terraform.txt +++ /dev/null @@ -1,23 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! 
-Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index a39a0b0aa9..68afb2fecc 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,3 +25,26 @@ Name: scripts Found 1 error Exit code: 1 + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt deleted file mode 100644 index d8fed9e4e6..0000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt +++ /dev/null @@ -1,18 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt deleted file mode 100644 index efcf1281cb..0000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt +++ /dev/null @@ -1,17 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! 
-Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index 2186ac68f0..f377edba7c 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,5 +1,22 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env + === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 2e31cce2ee..7a3dcb068b 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git 
a/acceptance/bundle/scripts/restricted-execution/test.toml b/acceptance/bundle/scripts/restricted-execution/test.toml new file mode 100644 index 0000000000..2a2e9c2033 --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index 3acb85f9cd..de07d277ea 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace EXITCODE=0 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/test.toml b/acceptance/bundle/scripts/test.toml new file mode 100644 index 0000000000..2a2e9c2033 --- /dev/null +++ b/acceptance/bundle/scripts/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt deleted file mode 100644 index 0e133547de..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt deleted file mode 100644 index 65960fa86d..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt deleted file mode 100644 index 120e590201..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt deleted file mode 100644 index fabdebb399..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index 69c6730b46..a03920c3fd 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,4 +1,14 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index 4f3bd7c3cf..d1a63928a6 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml index 32b75237a1..d4126948d3 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml @@ -20,3 +20,7 @@ Response.Body = '{}' # I'm adding 405 because that's what this test originally do. 
It's somewhat # surprising though that CLI can receive 405 and that does not result in error anywhere. Response.StatusCode = 405 + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b916..0000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37..0000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 1637965310..909e8d6c70 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index 7fbdd0e677..c495bdcb07 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/test.toml b/acceptance/bundle/telemetry/deploy-config-file-count/test.toml new file mode 100644 index 0000000000..2a2e9c2033 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt deleted file mode 100644 index e86795abf5..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt deleted file mode 100644 index ee47fabbb6..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... 
-Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt deleted file mode 100644 index 5957e33b91..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt +++ /dev/null @@ -1,12 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt deleted file mode 100644 index ac2e13efb9..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt +++ /dev/null @@ -1,11 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 89be65f195..99e7fbb699 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,4 +1,20 @@ +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... 
+Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index 0a9d57a1a4..f7257769ac 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t dev -trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t prod trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-mode/test.toml b/acceptance/bundle/telemetry/deploy-mode/test.toml new file mode 100644 index 0000000000..2a2e9c2033 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt deleted file mode 100644 index 0e133547de..0000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt deleted file mode 100644 index 65960fa86d..0000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 9c59c43023..31581169f2 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 6e9d2f7378..3022a2b5e4 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/test.toml b/acceptance/bundle/telemetry/deploy-target-count/test.toml new file mode 100644 index 0000000000..2a2e9c2033 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/test.toml @@ -0,0 +1,3 @@ +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b916..0000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37..0000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index e8580d71b3..be4840e69e 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index caaf8c1f39..dad762899a 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/test.toml b/acceptance/bundle/telemetry/deploy-variable-count/test.toml index 855ecdd39e..0a40c794b3 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/test.toml +++ b/acceptance/bundle/telemetry/deploy-variable-count/test.toml @@ -14,3 +14,7 @@ Response.Body = ''' ] } ''' + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt deleted file mode 100644 index f8db617c00..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Building test... 
-Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt deleted file mode 100644 index 048d0f07b5..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt deleted file mode 100644 index b786de11fe..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... -Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt deleted file mode 100644 index 651d315f77..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index ed89628d98..a9b8ce4ae6 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,4 +1,18 @@ +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 5bc513afb8..078fa94cdd 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml index 0d48150706..317e12a834 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml @@ -6,3 +6,7 @@ Ignore = [ '.databricks', "__pycache__", ] + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b916..0000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37..0000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index 0c061fbe31..b35859d86a 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,3 +20,8 @@ Validation OK! "." ] } + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index 485556d28a..d2aae85444 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/bundle/validate/sync_patterns/test.toml b/acceptance/bundle/validate/sync_patterns/test.toml index 159efe0269..abc1014fd6 100644 --- a/acceptance/bundle/validate/sync_patterns/test.toml +++ b/acceptance/bundle/validate/sync_patterns/test.toml @@ -1 +1,5 @@ RecordRequests = true + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt deleted file mode 100644 index 
945da6d144..0000000000 --- a/acceptance/cache/simple/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt deleted file mode 100644 index 41cfbc2a2d..0000000000 --- a/acceptance/cache/simple/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index 524c077f46..a2907174bf 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -p dogfood trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index 75759db680..f791f9a03c 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,9 +3,6 @@ Local = true RecordRequests = true -# Enable engine-specific output files -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" - # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' @@ -14,3 +11,7 @@ New = '' [[Repls]] Old = ' mutator=[A-Za-z]+' New = '' + +[[Repls]] +Old = 'Updating deployment state...\n' +New = '' diff --git a/acceptance/internal/prepare_server.go 
b/acceptance/internal/prepare_server.go index dfa89ef748..2f1b6712a2 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -183,7 +183,6 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } - // Track remaining kill counts and offset counts per pattern (for KillCaller > 0) killCounters := make(map[string]int) offsetCounters := make(map[string]int) killCountersMu := &sync.Mutex{} @@ -196,7 +195,6 @@ func startLocalServer(t *testing.T, items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) - // Initialize kill counter and offset counter for this pattern if stub.KillCaller > 0 { killCounters[stub.Pattern] = stub.KillCaller offsetCounters[stub.Pattern] = stub.KillCallerOffset @@ -241,7 +239,6 @@ func shouldKillCaller(stub ServerStub, offsetCounters, killCounters map[string]i mu.Lock() defer mu.Unlock() - // Still in offset period? Let this request pass. if offsetCounters[stub.Pattern] > 0 { offsetCounters[stub.Pattern]-- return false diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index aec6e7cc52..1b686519c6 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -21,7 +21,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } if len(plan.Plan) == 0 { - // Still need to finalize if WAL recovery happened to commit the recovered state if b.StateDB.RecoveredFromWAL() { if err := b.StateDB.Finalize(); err != nil { logdiag.LogError(ctx, err) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index a54da010f1..3f5a5c4f50 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -104,8 +104,6 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } -// ensureWALOpen opens the WAL file and writes the header if not already done. -// Must be called while holding db.mu. 
func (db *DeploymentState) ensureWALOpen() error { if db.wal != nil { return nil @@ -122,7 +120,6 @@ func (db *DeploymentState) ensureWALOpen() error { db.Data.Lineage = lineage } - // WAL serial is the NEXT serial (current + 1) walSerial := db.Data.Serial + 1 if err := wal.writeJSON(WALHeader{Lineage: lineage, Serial: walSerial}); err != nil { @@ -169,15 +166,8 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { data, err := os.ReadFile(path) if err != nil { if errors.Is(err, fs.ErrNotExist) { - // Create new database with serial=0, will be incremented to 1 in Finalize() db.Data = NewDatabase("", 0) db.Path = path - - // Write state file immediately to ensure it exists before any WAL operations. - // This guarantees we have a base state file for recovery validation. - if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { - return fmt.Errorf("failed to create state directory: %w", err) - } if err := db.unlockedSave(); err != nil { return err } @@ -196,6 +186,11 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { if err != nil { return fmt.Errorf("WAL recovery failed: %w", err) } + + if err := migrateState(&db.Data); err != nil { + return fmt.Errorf("migrating state %s: %w", path, err) + } + if recovered { if err := db.unlockedSave(); err != nil { return err @@ -205,10 +200,6 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { } db.recoveredFromWAL = true } - - if err := migrateState(&db.Data); err != nil { - return fmt.Errorf("migrating state %s: %w", path, err) - } return nil } @@ -223,7 +214,8 @@ func (db *DeploymentState) Finalize() error { } db.wal = nil - replayResult, err := replayWAL(db.Path, &db.Data) + validationDB := db.Data + replayResult, err := replayWAL(db.Path, &validationDB) if err != nil { return fmt.Errorf("failed to replay WAL during finalize: %w", err) } @@ -259,8 +251,7 @@ func (db *DeploymentState) Finalize() error { return nil } -// Close closes the WAL file 
handle without finalizing or truncating. -// Use this in tests or when you need to abort without saving state. +// Close closes the WAL file without saving state. func (db *DeploymentState) Close() error { db.mu.Lock() defer db.mu.Unlock() @@ -280,8 +271,7 @@ func (db *DeploymentState) AssertOpened() { } } -// RecoveredFromWAL returns true if state was recovered from WAL during Open(). -// This is used to determine if Finalize() should be called even with an empty plan. +// RecoveredFromWAL reports whether Open recovered state from the WAL. func (db *DeploymentState) RecoveredFromWAL() bool { return db.recoveredFromWAL } @@ -316,7 +306,6 @@ func (db *DeploymentState) unlockedSave() error { return err } - // Create parent directories if they don't exist dir := filepath.Dir(db.Path) if err := os.MkdirAll(dir, 0o755); err != nil { return fmt.Errorf("failed to create directory %#v: %w", dir, err) diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index cd422c37df..25bb3feaea 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -36,7 +36,6 @@ type corruptedWALEntry struct { } type walReplayResult struct { - hasWAL bool recovered bool stale bool entriesRecovered int @@ -198,100 +197,46 @@ func readWAL(statePath string) (*WALHeader, []WALEntry, []corruptedWALEntry, err func replayWAL(statePath string, db *Database) (walReplayResult, error) { result := walReplayResult{} - wp := walPath(statePath) - - if _, err := os.Stat(wp); os.IsNotExist(err) { - return result, nil - } - result.hasWAL = true - - f, err := os.Open(wp) + header, entries, corrupted, err := readWAL(statePath) if err != nil { + if os.IsNotExist(err) { + return result, nil + } return result, fmt.Errorf("%w: %v", errWALRead, err) } - defer f.Close() - - scanner := bufio.NewScanner(f) - scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) - var header *WALHeader - lineNumber := 0 - var corrupted []corruptedWALEntry - for scanner.Scan() { - lineNumber++ - line := 
bytes.TrimSpace(scanner.Bytes()) - if len(line) == 0 { - continue - } - - lineCopy := make([]byte, len(line)) - copy(lineCopy, line) - if header == nil { - var h WALHeader - if err := json.Unmarshal(lineCopy, &h); err != nil { - return result, fmt.Errorf("%w: failed to parse WAL header: %w", errWALRead, err) - } - header = &h - - expectedSerial := db.Serial + 1 - if header.Serial < expectedSerial { - result.stale = true - return result, nil - } - - if header.Serial > expectedSerial { - return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) - } - if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { - return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) - } + expectedSerial := db.Serial + 1 + if header.Serial < expectedSerial { + result.stale = true + return result, nil + } - if db.Lineage == "" && header.Lineage != "" { - db.Lineage = header.Lineage - } + if header.Serial > expectedSerial { + return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expectedSerial) + } - if db.State == nil { - db.State = make(map[string]ResourceEntry) - } - continue - } + if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { + return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) + } - var entry WALEntry - if err := json.Unmarshal(lineCopy, &entry); err != nil { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: err, - }) - continue - } + if db.Lineage == "" && header.Lineage != "" { + db.Lineage = header.Lineage + } - if entry.K == "" { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: errors.New("entry has empty key"), - }) - continue - } + if db.State == nil { + db.State 
= make(map[string]ResourceEntry) + } + for _, entry := range entries { if entry.V != nil { db.State[entry.K] = *entry.V } else { delete(db.State, entry.K) } - result.entriesRecovered++ - } - - if err := scanner.Err(); err != nil { - return result, fmt.Errorf("%w: failed to read WAL file: %w", errWALRead, err) - } - - if header == nil { - return result, fmt.Errorf("%w: WAL file is empty", errWALRead) } result.recovered = true + result.entriesRecovered = len(entries) result.corruptedEntries = corrupted return result, nil } @@ -324,7 +269,7 @@ func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, logRecoveryProgress(ctx, "Recovering state from WAL file: "+relativePathForLog(walPath(statePath))) walLogPath := relativePathForLog(walPath(statePath)) for _, corrupted := range replayResult.corruptedEntries { - log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d: %s: %v", walLogPath, corrupted.lineNumber, corrupted.rawLine, corrupted.parseErr) + log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d", walLogPath, corrupted.lineNumber) } if err := writeCorruptedWALEntries(statePath, replayResult.corruptedEntries); err != nil { diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go index 9d4533eba7..d8a5f23345 100644 --- a/bundle/direct/dstate/wal_test.go +++ b/bundle/direct/dstate/wal_test.go @@ -1,7 +1,6 @@ package dstate import ( - "context" "encoding/json" "os" "path/filepath" @@ -98,7 +97,7 @@ func TestOpenWALFailsIfFileAlreadyExists(t *testing.T) { } func TestRecoverFromWAL_NoWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -109,7 +108,7 @@ func TestRecoverFromWAL_NoWAL(t *testing.T) { } func TestRecoverFromWAL_ValidWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -139,7 +138,7 @@ func 
TestRecoverFromWAL_ValidWAL(t *testing.T) { } func TestRecoverFromWAL_StaleWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -162,7 +161,7 @@ func TestRecoverFromWAL_StaleWAL(t *testing.T) { } func TestRecoverFromWAL_FutureWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -181,7 +180,7 @@ func TestRecoverFromWAL_FutureWAL(t *testing.T) { } func TestRecoverFromWAL_LineageMismatch(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -200,7 +199,7 @@ func TestRecoverFromWAL_LineageMismatch(t *testing.T) { } func TestRecoverFromWAL_DeleteOperation(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -232,7 +231,7 @@ func TestRecoverFromWAL_DeleteOperation(t *testing.T) { } func TestDeploymentState_WALIntegration(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -270,7 +269,7 @@ func TestDeploymentState_WALIntegration(t *testing.T) { } func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -307,7 +306,7 @@ func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { } func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -340,7 +339,7 @@ func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { } func TestDeploymentState_WALWithDependsOn(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := 
t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -366,7 +365,7 @@ func TestDeploymentState_WALWithDependsOn(t *testing.T) { } func TestRecoverFromWAL_CorruptedMiddleLine(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -398,7 +397,7 @@ not valid json } func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -430,7 +429,7 @@ not valid json } func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -457,7 +456,7 @@ func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { } func TestRecoverFromWAL_LineageAdoption(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) @@ -489,7 +488,7 @@ func TestReadWAL_EmptyFile(t *testing.T) { } func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") @@ -522,7 +521,7 @@ func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { } func TestDeploymentState_FinalizeFailsOnCorruptedWAL(t *testing.T) { - ctx := context.Background() + ctx := t.Context() dir := t.TempDir() statePath := filepath.Join(dir, "resources.json") walFilePath := walPath(statePath) From 784f7c569ea92a869bb8a4303594b5e8ba4234f7 Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Thu, 26 Mar 2026 20:59:25 +0530 Subject: [PATCH 09/85] Update WAL corrupted entry outputs --- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 2 +- 
acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index f7ebf7bfd2..ee28d6391e 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -10,7 +10,7 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4 Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Recovered 2 entries from WAL file. Deploying resources... diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt index bf9236c1f9..ffc7ef7d04 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -10,7 +10,7 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... 
Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3: {"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-: unexpected end of JSON input +Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3 Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Recovered 2 entries from WAL file. Deploying resources... From 02412321c505b6325d6d0c29f7e5741de3e1bb51 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 17:11:26 +0100 Subject: [PATCH 10/85] WIP --- bundle/direct/bind.go | 6 +- bundle/direct/bundle_apply.go | 6 +- bundle/direct/dstate/state.go | 197 +++++++++++++++------------------- bundle/direct/dstate/wal.go | 12 +-- 4 files changed, 89 insertions(+), 132 deletions(-) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 08d849d14c..7f11a8674d 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -96,7 +96,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Finalize() + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -138,7 +138,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Finalize() + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -216,5 +216,5 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Finalize() + return b.StateDB.Finalize(ctx) } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 1b686519c6..a7f3ee65fc 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -21,11 
+21,7 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa } if len(plan.Plan) == 0 { - if b.StateDB.RecoveredFromWAL() { - if err := b.StateDB.Finalize(); err != nil { - logdiag.LogError(ctx, err) - } - } + // Avoid creating state file if nothing to deploy return } diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 3f5a5c4f50..cfa7ec2114 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -3,7 +3,6 @@ package dstate import ( "context" "encoding/json" - "errors" "fmt" "io/fs" "os" @@ -20,11 +19,10 @@ import ( const currentStateVersion = 2 type DeploymentState struct { - Path string - Data Database - mu sync.Mutex - wal *WAL - recoveredFromWAL bool + Path string + Data Database + mu sync.Mutex + walFile *os.File } type Database struct { @@ -41,6 +39,18 @@ type ResourceEntry struct { DependsOn []deployplan.DependsOnEntry `json:"depends_on,omitempty"` } +type WALHeader struct { + Lineage string `json:"lineage"` + Serial int `json:"serial"` + StateVersion int `json:"state_version"` + CLIVersion string `json:"cli_version"` +} + +type WALEntry struct { + K string `json:"k"` + V *ResourceEntry `json:"v,omitempty"` // nil means delete +} + func NewDatabase(lineage string, serial int) Database { return Database{ StateVersion: currentStateVersion, @@ -52,7 +62,7 @@ func NewDatabase(lineage string, serial int) Database { } func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []deployplan.DependsOnEntry) error { - db.AssertOpened() + db.AssertOpenedForWrite() db.mu.Lock() defer db.mu.Unlock() @@ -60,7 +70,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d db.Data.State = make(map[string]ResourceEntry) } - jsonMessage, err := json.MarshalIndent(state, "", " ") + jsonMessage, err := json.Marshal(state) if err != nil { return err } @@ -71,20 +81,12 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d 
DependsOn: dependsOn, } - if err := db.ensureWALOpen(); err != nil { - return fmt.Errorf("failed to open WAL: %w", err) - } - if err := db.wal.writeJSON(WALEntry{K: key, V: &entry}); err != nil { - return fmt.Errorf("failed to write WAL entry: %w", err) - } - db.Data.State[key] = entry - - return nil + return appendJSONLine(db.walFile, WALEntry{K: key, V: &entry}) } func (db *DeploymentState) DeleteState(key string) error { - db.AssertOpened() + db.AssertOpenedForWrite() db.mu.Lock() defer db.mu.Unlock() @@ -92,43 +94,8 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } - if err := db.ensureWALOpen(); err != nil { - return fmt.Errorf("failed to open WAL: %w", err) - } - if err := db.wal.writeJSON(WALEntry{K: key}); err != nil { - return fmt.Errorf("failed to write WAL entry: %w", err) - } - delete(db.Data.State, key) - - return nil -} - -func (db *DeploymentState) ensureWALOpen() error { - if db.wal != nil { - return nil - } - - wal, err := openWAL(db.Path) - if err != nil { - return err - } - - lineage := db.Data.Lineage - if lineage == "" { - lineage = uuid.New().String() - db.Data.Lineage = lineage - } - - walSerial := db.Data.Serial + 1 - - if err := wal.writeJSON(WALHeader{Lineage: lineage, Serial: walSerial}); err != nil { - wal.close() - return err - } - - db.wal = wal - return nil + return appendJSONLine(db.walFile, WALEntry{K: key}) } func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { @@ -155,7 +122,12 @@ func (db *DeploymentState) GetResourceID(key string) string { return entry.ID } -func (db *DeploymentState) Open(ctx context.Context, path string) error { +type ( + WithRecovery bool + WithWrite bool +) + +func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery WithRecovery, withWrite WithWrite) error { db.mu.Lock() defer db.mu.Unlock() @@ -166,11 +138,9 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { data, err := os.ReadFile(path) if err != nil { if 
errors.Is(err, fs.ErrNotExist) { + // Not initializing lineage yet, we might have that saved in WAL db.Data = NewDatabase("", 0) db.Path = path - if err := db.unlockedSave(); err != nil { - return err - } } else { return err } @@ -182,73 +152,60 @@ func (db *DeploymentState) Open(ctx context.Context, path string) error { db.Path = path } - recovered, err := recoverFromWAL(ctx, path, &db.Data) - if err != nil { - return fmt.Errorf("WAL recovery failed: %w", err) + walPath := walPath(db.Path) + _, walError := os.Stat(walPath) + if walError == nil { + if withRecovery { + err := db.mergeWalIntoState(ctx) + if err != nil { + return err + } + } else { + return fmt.Errorf("unprocessed WAL exists: %s", walPath) + } } if err := migrateState(&db.Data); err != nil { return fmt.Errorf("migrating state %s: %w", path, err) } - if recovered { - if err := db.unlockedSave(); err != nil { - return err + if withWrite { + db.walFile, err = os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + if err != nil { + return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } - if err := cleanupWAL(path); err != nil { - return err + lineage := db.Data.Lineage + if lineage == "" { + lineage = uuid.New().String() } - db.recoveredFromWAL = true - } - return nil -} - -func (db *DeploymentState) Finalize() error { - db.mu.Lock() - defer db.mu.Unlock() - - hadOpenWAL := db.wal != nil - if hadOpenWAL { - if err := db.wal.close(); err != nil { - return err + // Set our Serial to the next one + db.Data.Serial += 1 + walHead := WALHeader{ + Lineage: lineage, + Serial: db.Data.Serial, // next serial + StateVersion: currentStateVersion, + CLIVersion: build.GetInfo().Version, } - db.wal = nil - - validationDB := db.Data - replayResult, err := replayWAL(db.Path, &validationDB) + err := appendJSONLine(db.walFile, walHead) if err != nil { - return fmt.Errorf("failed to replay WAL during finalize: %w", err) - } - if !replayResult.recovered { - return errors.New("failed to replay WAL during 
finalize: WAL file not found or stale") - } - if len(replayResult.corruptedEntries) > 0 { - first := replayResult.corruptedEntries[0] - return fmt.Errorf("failed to replay WAL during finalize: corrupted entry at line %d: %v", first.lineNumber, first.parseErr) + return err } } - if db.Data.Lineage == "" && !hadOpenWAL && len(db.Data.State) == 0 { - return nil - } - - if db.Data.Lineage == "" { - db.Data.Lineage = uuid.New().String() - } - - db.Data.Serial++ + return nil +} - if err := db.unlockedSave(); err != nil { - return err - } +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { +} - if hadOpenWAL { - if err := cleanupWAL(db.Path); err != nil { - return err - } - } +func (db *DeploymentState) Finalize(ctx context.Context) error { + db.AssertOpenedForWrite() + db.mu.Lock() + defer db.mu.Unlock() - return nil + db.walFile.Close() + db.walFile = nil + return db.mergeWalIntoState(ctx) } // Close closes the WAL file without saving state. @@ -271,9 +228,11 @@ func (db *DeploymentState) AssertOpened() { } } -// RecoveredFromWAL reports whether Open recovered state from the WAL. 
-func (db *DeploymentState) RecoveredFromWAL() bool { - return db.recoveredFromWAL +func (db *DeploymentState) AssertOpenedForWrite() { + db.AssertOpened() + if db.walFile == nil { + panic("internal error: DeploymentState must be opened in write mode") + } } func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { @@ -300,7 +259,7 @@ func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.Export } func (db *DeploymentState) unlockedSave() error { - db.AssertOpened() + db.AssertOpenedForWrite() data, err := json.MarshalIndent(db.Data, "", " ") if err != nil { return err @@ -318,3 +277,15 @@ func (db *DeploymentState) unlockedSave() error { return nil } + +func appendJSONLine(file *os.File, obj any) error { + data, err := json.Marshal(obj) + if err != nil { + return err + } + data = append(data, '\n') + + _, err = file.Write(data) + // no fsync here, not needed + return err +} diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go index 25bb3feaea..9ccb12303d 100644 --- a/bundle/direct/dstate/wal.go +++ b/bundle/direct/dstate/wal.go @@ -15,16 +15,6 @@ import ( "github.com/databricks/cli/libs/log" ) -type WALHeader struct { - Lineage string `json:"lineage"` - Serial int `json:"serial"` -} - -type WALEntry struct { - K string `json:"k"` - V *ResourceEntry `json:"v,omitempty"` // nil means delete -} - type WAL struct { file *os.File } @@ -255,7 +245,7 @@ func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, } if replayResult.stale { - log.Debugf(ctx, "Deleting stale WAL (serial behind current state)") + log.Warnf(ctx, "Deleting stale WAL (serial=%s behind current state serial=)") if err := cleanupWAL(statePath); err != nil { return false, err } From 8b186314b162f28d023d9db42fe9bb712651322c Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Mon, 12 Jan 2026 21:32:47 +0530 Subject: [PATCH 11/85] Updated tests and enhanced kill caller with an offset Signed-off-by: Varun 
Deep Saini --- acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml | 1 + acceptance/bundle/deploy/wal/multiple-crashes/test.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 6245c19840..9c9ab5a30b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -11,3 +11,4 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' + diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index 474177b804..c5981d6720 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -16,3 +16,4 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' + From 0dd57abc37c46ca4389097d95ab5f24e4d573d4e Mon Sep 17 00:00:00 2001 From: Varun Deep Saini Date: Sat, 24 Jan 2026 00:51:38 +0530 Subject: [PATCH 12/85] Updated existing tests Signed-off-by: Varun Deep Saini --- .../out.deploy.direct.txt | 7 ++++++ .../out.deploy.terraform.txt | 6 +++++ .../output.txt | 6 ----- .../script | 2 +- .../test.toml | 1 + .../build_and_files_whl/out.deploy.direct.txt | 8 +++++++ .../out.deploy.terraform.txt | 7 ++++++ .../artifacts/build_and_files_whl/output.txt | 7 ------ .../artifacts/build_and_files_whl/script | 2 +- .../shell/bash/out.deploy.direct.txt | 7 ++++++ .../shell/bash/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/bash/output.txt | 5 ---- acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 ++++++ .../shell/basic/out.deploy.terraform.txt | 6 +++++ 
.../bundle/artifacts/shell/basic/output.txt | 5 ---- .../bundle/artifacts/shell/basic/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 ++++++ .../shell/default/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/default/output.txt | 5 ---- .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 ++++++ .../shell/sh/out.deploy.terraform.txt | 6 +++++ .../bundle/artifacts/shell/sh/output.txt | 5 ---- acceptance/bundle/artifacts/shell/sh/script | 2 +- .../deploy/empty-bundle/out.deploy.direct.txt | 6 +++++ .../empty-bundle/out.deploy.terraform.txt | 5 ++++ .../bundle/deploy/empty-bundle/output.txt | 5 ---- acceptance/bundle/deploy/empty-bundle/script | 2 +- .../deploy/wal/corrupted-wal-entry/test.toml | 1 - .../deploy/wal/multiple-crashes/test.toml | 1 - .../bundle/scripts/out.deploy.direct.txt | 24 +++++++++++++++++++ .../bundle/scripts/out.deploy.terraform.txt | 23 ++++++++++++++++++ acceptance/bundle/scripts/output.txt | 23 ------------------ .../out.deploy.direct.txt | 18 ++++++++++++++ .../out.deploy.terraform.txt | 17 +++++++++++++ .../scripts/restricted-execution/output.txt | 17 ------------- .../scripts/restricted-execution/script | 2 +- acceptance/bundle/scripts/script | 2 +- .../out.deploy-one.direct.txt | 6 +++++ .../out.deploy-one.terraform.txt | 5 ++++ .../out.deploy-two.direct.txt | 6 +++++ .../out.deploy-two.terraform.txt | 5 ++++ .../deploy-artifact-path-type/output.txt | 10 -------- .../deploy-artifact-path-type/script | 4 ++-- .../out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../deploy-config-file-count/output.txt | 5 ---- .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-mode/out.deploy-dev.direct.txt | 6 +++++ .../deploy-mode/out.deploy-dev.terraform.txt | 5 ++++ .../deploy-mode/out.deploy-prod.direct.txt | 12 ++++++++++ .../deploy-mode/out.deploy-prod.terraform.txt | 11 +++++++++ .../bundle/telemetry/deploy-mode/output.txt | 16 ------------- 
.../bundle/telemetry/deploy-mode/script | 4 ++-- .../deploy-target-count/out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../telemetry/deploy-target-count/output.txt | 5 ---- .../telemetry/deploy-target-count/script | 2 +- .../out.deploy.direct.txt | 6 +++++ .../out.deploy.terraform.txt | 5 ++++ .../deploy-variable-count/output.txt | 5 ---- .../telemetry/deploy-variable-count/script | 2 +- .../out.deploy-one.direct.txt | 8 +++++++ .../out.deploy-one.terraform.txt | 7 ++++++ .../out.deploy-two.direct.txt | 8 +++++++ .../out.deploy-two.terraform.txt | 7 ++++++ .../telemetry/deploy-whl-artifacts/output.txt | 14 ----------- .../telemetry/deploy-whl-artifacts/script | 4 ++-- .../sync_patterns/out.deploy.direct.txt | 6 +++++ .../sync_patterns/out.deploy.terraform.txt | 5 ++++ .../bundle/validate/sync_patterns/output.txt | 5 ---- .../bundle/validate/sync_patterns/script | 2 +- acceptance/cache/simple/out.deploy.direct.txt | 6 +++++ .../cache/simple/out.deploy.terraform.txt | 5 ++++ acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 3 +++ 77 files changed, 338 insertions(+), 160 deletions(-) create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt create mode 100644 
acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt create mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt create mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/out.deploy.terraform.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt create mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt create mode 100644 
acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt create mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt create mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt create mode 100644 acceptance/cache/simple/out.deploy.direct.txt create mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt new file mode 100644 index 0000000000..f75a5428b1 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt new file mode 100644 index 0000000000..8ec9c52db6 --- /dev/null +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6d24880e6c..6c8bd962a5 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,10 +1,4 @@ ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! - === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index 883601185c..fba3a77700 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index 8185d0df6e..67a9da6c97 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,4 +1,5 @@ RecordRequests = true +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt 
b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt new file mode 100644 index 0000000000..4039d5917e --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt @@ -0,0 +1,8 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt new file mode 100644 index 0000000000..9894e5b89f --- /dev/null +++ b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt @@ -0,0 +1,7 @@ + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index b618de6b89..d44a21b582 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,10 +7,3 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 2d7d63f7fe..9aa0d870e7 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt new file mode 100644 index 0000000000..f311959abd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt new file mode 100644 index 0000000000..fa5d7b76bc --- /dev/null +++ b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index fa5d7b76bc..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt new file mode 100644 index 0000000000..3a4ff9138b --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt new file mode 100644 index 0000000000..b5e01c79e6 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index b5e01c79e6..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt new file mode 100644 index 0000000000..f311959abd --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt new file mode 100644 index 0000000000..fa5d7b76bc --- /dev/null +++ b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index fa5d7b76bc..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt new file mode 100644 index 0000000000..98820986f5 --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt new file mode 100644 index 0000000000..5117e6e9fc --- /dev/null +++ b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 5117e6e9fc..8b13789179 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1,6 +1 @@ ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 68ebb78d77..09bb41643c 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt new file mode 100644 index 0000000000..81dddfcb9f --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt new file mode 100644 index 0000000000..494f76c84f --- /dev/null +++ b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 919accb661..8498653a6e 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! 
- >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index 775ccd0def..b74818f1b1 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml index 9c9ab5a30b..6245c19840 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml @@ -11,4 +11,3 @@ Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' [[Server]] Pattern = "GET /api/2.2/jobs/get?job_id=2222" Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' - diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml index c5981d6720..474177b804 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml @@ -16,4 +16,3 @@ Response.Body = '{}' Pattern = "GET /api/2.2/jobs/get" KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' - diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt b/acceptance/bundle/scripts/out.deploy.direct.txt new file mode 100644 index 0000000000..037f609f94 --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.direct.txt @@ -0,0 +1,24 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! 
+Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt new file mode 100644 index 0000000000..a3d9ba342c --- /dev/null +++ b/acceptance/bundle/scripts/out.deploy.terraform.txt @@ -0,0 +1,23 @@ + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index 68afb2fecc..a39a0b0aa9 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,26 +25,3 @@ Name: scripts Found 1 error Exit code: 1 - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt new file mode 100644 index 0000000000..d8fed9e4e6 --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt @@ -0,0 +1,18 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
+Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt new file mode 100644 index 0000000000..efcf1281cb --- /dev/null +++ b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt @@ -0,0 +1,17 @@ + +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index f377edba7c..2186ac68f0 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,22 +1,5 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! 
-Executing 'postdeploy' script -postdeploy value_from_env - === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 7a3dcb068b..2e31cce2ee 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy +trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index de07d277ea..3acb85f9cd 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy +trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt new file mode 100644 index 0000000000..0e133547de --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... 
+Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt new file mode 100644 index 0000000000..65960fa86d --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt new file mode 100644 index 0000000000..120e590201 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt new file mode 100644 index 0000000000..fabdebb399 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index a03920c3fd..69c6730b46 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,14 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index d1a63928a6..4f3bd7c3cf 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt new file mode 100644 index 0000000000..1b73d1b916 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... 
+Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt new file mode 100644 index 0000000000..5c6aad5b37 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 909e8d6c70..1637965310 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index c495bdcb07..7fbdd0e677 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt new file mode 100644 index 0000000000..e86795abf5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt new file mode 100644 index 0000000000..ee47fabbb6 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt new file mode 100644 index 0000000000..5957e33b91 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt @@ -0,0 +1,12 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt new file mode 100644 index 0000000000..ac2e13efb9 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt @@ -0,0 +1,11 @@ + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 99e7fbb699..89be65f195 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,20 +1,4 @@ ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... 
-Deployment complete! - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index f7257769ac..0a9d57a1a4 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev +trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t prod +trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt new file mode 100644 index 0000000000..0e133547de --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt new file mode 100644 index 0000000000..65960fa86d --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 31581169f2..9c59c43023 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 3022a2b5e4..6e9d2f7378 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt new file mode 100644 index 0000000000..1b73d1b916 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt new file mode 100644 index 0000000000..5c6aad5b37 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index be4840e69e..e8580d71b3 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,9 +1,4 @@ ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! 
- >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index dad762899a..caaf8c1f39 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt new file mode 100644 index 0000000000..f8db617c00 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt new file mode 100644 index 0000000000..048d0f07b5 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt new file mode 100644 index 0000000000..b786de11fe --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt @@ -0,0 +1,8 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Updating deployment state... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt new file mode 100644 index 0000000000..651d315f77 --- /dev/null +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt @@ -0,0 +1,7 @@ + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index a9b8ce4ae6..ed89628d98 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,18 +1,4 @@ ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! - >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 078fa94cdd..5bc513afb8 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one +trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 -trace $CLI bundle deploy -t two +trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt new file mode 100644 index 0000000000..1b73d1b916 --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt new file mode 100644 index 0000000000..5c6aad5b37 --- /dev/null +++ b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index b35859d86a..0c061fbe31 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,8 +20,3 @@ Validation OK! "." ] } - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index d2aae85444..485556d28a 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy +trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt new file mode 100644 index 0000000000..945da6d144 --- /dev/null +++ b/acceptance/cache/simple/out.deploy.direct.txt @@ -0,0 +1,6 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! 
diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt new file mode 100644 index 0000000000..41cfbc2a2d --- /dev/null +++ b/acceptance/cache/simple/out.deploy.terraform.txt @@ -0,0 +1,5 @@ + +>>> [CLI] bundle deploy -p dogfood +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index a2907174bf..524c077f46 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood +trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index f791f9a03c..2601c79f82 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,6 +3,9 @@ Local = true RecordRequests = true +# Enable engine-specific output files +EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" + # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' From 2029281974bdc03db27f368bc96fc323bc13858b Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 21:43:22 +0100 Subject: [PATCH 13/85] Merge simplified WAL handling into state.go fix Open() calls; replace Finalize() with Close(); close state file in plan --- bundle/direct/bind.go | 12 +- bundle/direct/bundle_apply.go | 1 - bundle/direct/bundle_plan.go | 2 +- bundle/direct/dstate/state.go | 243 ++++++++++++++++++++-------- bundle/direct/dstate/wal.go | 289 ---------------------------------- 
cmd/bundle/utils/process.go | 2 +- 6 files changed, 189 insertions(+), 360 deletions(-) delete mode 100644 bundle/direct/dstate/wal.go diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 7f11a8674d..74389313af 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -62,7 +62,7 @@ type BindResult struct { func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root, statePath, resourceKey, resourceID string) (*BindResult, error) { // Check if the resource is already managed (bound to a different ID) var checkStateDB dstate.DeploymentState - if err := checkStateDB.Open(ctx, statePath); err == nil { + if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(true)); err == nil { if existingID := checkStateDB.GetResourceID(resourceKey); existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -82,7 +82,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Open temp state - err := b.StateDB.Open(ctx, tmpStatePath) + err := b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(true)) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -96,7 +96,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Finalize(ctx) + err = b.StateDB.Close(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -138,7 +138,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Finalize(ctx) + err = b.StateDB.Close(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -188,7 +188,7 @@ func (result *BindResult) Cancel() { // Unbind removes a resource from direct engine state without deleting // the workspace resource. Also removes associated permissions/grants entries. 
func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey string) error { - err := b.StateDB.Open(ctx, statePath) + err := b.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(true)) if err != nil { return err } @@ -216,5 +216,5 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Finalize(ctx) + return b.StateDB.Close(ctx) } diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index a7f3ee65fc..7a77968515 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -25,7 +25,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return } - b.StateDB.AssertOpened() b.RemoteStateCache.Clear() g, err := makeGraph(plan) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 1fb70123b9..03fe2b87bb 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -40,7 +40,7 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { // ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. // This should be called early in the deployment process, before any file operations. // If the plan has no lineage (first deployment), validation is skipped. 
-func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { +func OpenStateWithPlanCheck(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { // If plan has no lineage, this is a first deployment before any state exists // No validation needed if plan.Lineage == "" { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index cfa7ec2114..e409c3f6e8 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -1,6 +1,7 @@ package dstate import ( + "bufio" "context" "encoding/json" "fmt" @@ -16,13 +17,19 @@ import ( "github.com/google/uuid" ) -const currentStateVersion = 2 +const ( + currentStateVersion = 2 + initialBufferSize = 64 * 1024 + maxWalEntrySize = 1024 * 1024 + walSuffix = ".WAL" +) type DeploymentState struct { - Path string - Data Database - mu sync.Mutex - walFile *os.File + Path string + Data Database + mu sync.Mutex + walFile *os.File + stateIDs map[string]string } type Database struct { @@ -47,8 +54,8 @@ type WALHeader struct { } type WALEntry struct { - K string `json:"k"` - V *ResourceEntry `json:"v,omitempty"` // nil means delete + Key string `json:"k"` + Value *ResourceEntry `json:"v,omitempty"` // nil means delete } func NewDatabase(lineage string, serial int) Database { @@ -70,6 +77,7 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d db.Data.State = make(map[string]ResourceEntry) } + // don't indent so that every WAL entry remains on a single line jsonMessage, err := json.Marshal(state) if err != nil { return err @@ -81,8 +89,11 @@ func (db *DeploymentState) SaveState(key, newID string, state any, dependsOn []d DependsOn: dependsOn, } - db.Data.State[key] = entry - return appendJSONLine(db.walFile, WALEntry{K: key, V: &entry}) + err = appendJSONLine(db.walFile, WALEntry{Key: key, Value: &entry}) + if err == nil { + db.stateIDs[key] = newID + } + return err } func (db 
*DeploymentState) DeleteState(key string) error { @@ -94,12 +105,15 @@ func (db *DeploymentState) DeleteState(key string) error { return nil } - delete(db.Data.State, key) - return appendJSONLine(db.walFile, WALEntry{K: key}) + err := appendJSONLine(db.walFile, WALEntry{Key: key}) + if err == nil { + delete(db.stateIDs, key) + } + return err } -func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { - db.AssertOpened() +func (db *DeploymentState) GetResourceEntry(key string) (ResourceEntry, bool) { + db.AssertOpenedForRead() db.mu.Lock() defer db.mu.Unlock() @@ -111,14 +125,28 @@ func (db *DeploymentState) getResourceEntry(key string) (ResourceEntry, bool) { return result, ok } -// GetResourceEntry returns the full resource entry for the given key. -func (db *DeploymentState) GetResourceEntry(key string) (ResourceEntry, bool) { - return db.getResourceEntry(key) -} - // GetResourceID returns the ID of the resource for the given key, or an empty string if not found. 
func (db *DeploymentState) GetResourceID(key string) string { - entry, _ := db.getResourceEntry(key) + db.AssertOpenedForReadOrWrite() + db.mu.Lock() + defer db.mu.Unlock() + + if db.walFile != nil { + // in write-mode new IDs are written to WAL and stored in this map + id := db.stateIDs[key] + if id != "" { + return id + } + } + + // in read mode State is the source of IDs for all requests + // in write mode State is the source of IDs for all resources that were not updated + + if db.Data.State == nil { + return "" + } + + entry, _ := db.Data.State[key] return entry.ID } @@ -135,33 +163,19 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W panic(fmt.Sprintf("state already opened: %v, cannot open %v", db.Path, path)) } - data, err := os.ReadFile(path) - if err != nil { - if errors.Is(err, fs.ErrNotExist) { - // Not initializing lineage yet, we might have that saved in WAL - db.Data = NewDatabase("", 0) - db.Path = path - } else { - return err - } - } else { - err = json.Unmarshal(data, &db.Data) - if err != nil { - return err - } - db.Path = path - } + db.Path = path + db.Reload(ctx) - walPath := walPath(db.Path) + walPath := db.Path + walSuffix _, walError := os.Stat(walPath) if walError == nil { if withRecovery { - err := db.mergeWalIntoState(ctx) + err := db.replayWAL(ctx) if err != nil { return err } } else { - return fmt.Errorf("unprocessed WAL exists: %s", walPath) + return fmt.Errorf("Unexpected WAL file found at %s", walPath) } } @@ -170,66 +184,171 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } if withWrite { - db.walFile, err = os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } + db.walFile = walFile lineage := db.Data.Lineage if lineage == "" { + // state file is new, does not have lineage yet; store lineage 
in the WAL only lineage = uuid.New().String() } - // Set our Serial to the next one - db.Data.Serial += 1 walHead := WALHeader{ Lineage: lineage, - Serial: db.Data.Serial, // next serial + Serial: db.Data.Serial + 1, StateVersion: currentStateVersion, CLIVersion: build.GetInfo().Version, } - err := appendJSONLine(db.walFile, walHead) - if err != nil { + return appendJSONLine(db.walFile, walHead) + } + + return nil +} + +func (db *DeploymentState) Reload(ctx context.Context) error { + + + data, err := os.ReadFile(db.Path) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + // Not initializing lineage yet, we might have that saved in WAL + db.Data = NewDatabase("", 0) + } else { return err } + } else { + return json.Unmarshal(data, &db.Data) } + return nil +} +func (db *DeploymentState) replayWAL(ctx context.Context) error { + walPath := db.Path + walSuffix + hasUpdates, err := db.mergeWalIntoState(ctx) + if err != nil { + return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) + } + if hasUpdates { + if err := db.unlockedSave(); err != nil { + return err + } + } + err = os.Remove(walPath) + if err != nil { + return fmt.Errorf("failed to remove WAL file %s: %w", walPath, err) + } return nil } -func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { +func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHeader) error { + if header.CLIVersion != db.Data.CLIVersion { + return fmt.Errorf("cli_version in the header (%q) does not match the one in the state (%q)", header.CLIVersion, db.Data.CLIVersion) + } + + if header.StateVersion != db.Data.StateVersion { + return fmt.Errorf("state_version in the header (%q) does not match the one in the state (%q)", header.StateVersion, db.Data.StateVersion) + } + + if header.Lineage != db.Data.Lineage { + return fmt.Errorf("lineage in the header (%q) does not match the one in the state (%q)", header.Lineage, db.Data.Lineage) + } + + if header.Serial != db.Data.Serial+1 { + 
return fmt.Errorf("serial in the header (%q) is not one higher than the one in the state (%q)", header.Serial, db.Data.Serial) + } + + return nil } -func (db *DeploymentState) Finalize(ctx context.Context) error { - db.AssertOpenedForWrite() - db.mu.Lock() - defer db.mu.Unlock() +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) { + if db.walFile != nil { + panic("internal error: walFile must be closed") + } + + hasUpdates := false + walPath := db.Path + walSuffix + walFile, err := os.Open(walPath) + if err != nil { + return false, fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + } + defer walFile.Close() + + scanner := bufio.NewScanner(walFile) + scanner.Buffer(make([]byte, 0, initialBufferSize), maxWalEntrySize) + lineNumber := 0 + + for scanner.Scan() { + lineNumber += 1 + line := scanner.Bytes() + if lineNumber == 1 { + var header WALHeader + if err := json.Unmarshal(line, &header); err != nil { + return hasUpdates, fmt.Errorf("failed to parse WAL header: %w", err) + } + if err := db.validateWALHeader(ctx, &header); err != nil { + return hasUpdates, err + } + } else { + var entry WALEntry + if err := json.Unmarshal(line, &entry); err != nil { + return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%s: %q: %w", walPath, lineNumber, entry, err) + } + hasUpdates = true + if entry.Value == nil { + delete(db.Data.State, entry.Key) + } else { + db.Data.State[entry.Key] = *entry.Value + } + } + } + + if err := scanner.Err(); err != nil { + return hasUpdates, err + } - db.walFile.Close() - db.walFile = nil - return db.mergeWalIntoState(ctx) + if hasUpdates { + // only assume WAL file's serial if we read any data from it + db.Data.Serial += 1 + } + + return hasUpdates, nil } -// Close closes the WAL file without saving state. 
-func (db *DeploymentState) Close() error { +func (db *DeploymentState) Close(ctx context.Context) error { db.mu.Lock() defer db.mu.Unlock() - if db.wal != nil { - if err := db.wal.close(); err != nil { - return err - } - db.wal = nil + var err error + + if db.walFile != nil { + db.walFile.Close() + db.walFile = nil + err = db.replayWAL(ctx) } - return nil + + db.Path = "" + db.Data = Database{} + db.stateIDs = make(map[string]string) + + return err } -func (db *DeploymentState) AssertOpened() { +func (db *DeploymentState) AssertOpenedForReadOrWrite() { if db.Path == "" { panic("internal error: DeploymentState must be opened first") } } +func (db *DeploymentState) AssertOpenedForRead() { + db.AssertOpenedForReadOrWrite() + if db.walFile != nil { + panic("internal error: DeploymentState must be opened in read mode") + } +} + func (db *DeploymentState) AssertOpenedForWrite() { - db.AssertOpened() + db.AssertOpenedForReadOrWrite() if db.walFile == nil { panic("internal error: DeploymentState must be opened in write mode") } diff --git a/bundle/direct/dstate/wal.go b/bundle/direct/dstate/wal.go deleted file mode 100644 index 9ccb12303d..0000000000 --- a/bundle/direct/dstate/wal.go +++ /dev/null @@ -1,289 +0,0 @@ -package dstate - -import ( - "bufio" - "bytes" - "context" - "encoding/json" - "errors" - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/databricks/cli/libs/cmdio" - "github.com/databricks/cli/libs/log" -) - -type WAL struct { - file *os.File -} - -type corruptedWALEntry struct { - lineNumber int - rawLine string - parseErr error -} - -type walReplayResult struct { - recovered bool - stale bool - entriesRecovered int - corruptedEntries []corruptedWALEntry -} - -var errWALRead = errors.New("wal read error") - -func walPath(statePath string) string { - return statePath + ".wal" -} - -func walCorruptedPath(statePath string) string { - return walPath(statePath) + ".corrupted" -} - -func openWAL(statePath string) (*WAL, error) { - wp := 
walPath(statePath) - f, err := os.OpenFile(wp, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) - if err != nil { - return nil, fmt.Errorf("failed to open WAL file %q: %w", wp, err) - } - return &WAL{file: f}, nil -} - -func (w *WAL) writeJSON(v any) error { - data, err := json.Marshal(v) - if err != nil { - return fmt.Errorf("failed to marshal WAL entry: %w", err) - } - data = append(data, '\n') - - _, err = w.file.Write(data) - if err != nil { - return fmt.Errorf("failed to write WAL entry: %w", err) - } - - if err := w.file.Sync(); err != nil { - return fmt.Errorf("failed to sync WAL entry: %w", err) - } - - return nil -} - -func (w *WAL) close() error { - if w.file != nil { - return w.file.Close() - } - return nil -} - -func cleanupWAL(statePath string) error { - err := os.Remove(walPath(statePath)) - if err != nil && !os.IsNotExist(err) { - return fmt.Errorf("failed to remove WAL file %q: %w", walPath(statePath), err) - } - return nil -} - -func moveWALToCorrupted(statePath string) error { - source := walPath(statePath) - target := walCorruptedPath(statePath) - _ = os.Remove(target) - if err := os.Rename(source, target); err != nil { - return fmt.Errorf("failed to move WAL file %q to %q: %w", source, target, err) - } - return nil -} - -func writeCorruptedWALEntries(statePath string, corrupted []corruptedWALEntry) error { - if len(corrupted) == 0 { - return nil - } - - target := walCorruptedPath(statePath) - f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o600) - if err != nil { - return fmt.Errorf("failed to create corrupted WAL file %q: %w", target, err) - } - defer f.Close() - - for _, entry := range corrupted { - if _, err := f.WriteString(entry.rawLine + "\n"); err != nil { - return fmt.Errorf("failed to write corrupted WAL file %q: %w", target, err) - } - } - - if err := f.Sync(); err != nil { - return fmt.Errorf("failed to sync corrupted WAL file %q: %w", target, err) - } - - return nil -} - -func readWAL(statePath string) (*WALHeader, 
[]WALEntry, []corruptedWALEntry, error) { - wp := walPath(statePath) - f, err := os.Open(wp) - if err != nil { - return nil, nil, nil, err - } - defer f.Close() - - scanner := bufio.NewScanner(f) - scanner.Buffer(make([]byte, 0, 64*1024), 10*1024*1024) - var header *WALHeader - var entries []WALEntry - var corrupted []corruptedWALEntry - lineNumber := 0 - for scanner.Scan() { - lineNumber++ - line := bytes.TrimSpace(scanner.Bytes()) - if len(line) == 0 { - continue - } - - lineCopy := make([]byte, len(line)) - copy(lineCopy, line) - if header == nil { - var h WALHeader - if err := json.Unmarshal(lineCopy, &h); err != nil { - return nil, nil, nil, fmt.Errorf("failed to parse WAL header: %w", err) - } - header = &h - continue - } - - var e WALEntry - if err := json.Unmarshal(lineCopy, &e); err != nil { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: err, - }) - continue - } - - if e.K == "" { - corrupted = append(corrupted, corruptedWALEntry{ - lineNumber: lineNumber, - rawLine: string(lineCopy), - parseErr: errors.New("entry has empty key"), - }) - continue - } - - entries = append(entries, e) - } - - if err := scanner.Err(); err != nil { - return nil, nil, nil, fmt.Errorf("failed to read WAL file: %w", err) - } - - if header == nil { - return nil, nil, nil, errors.New("WAL file is empty") - } - - return header, entries, corrupted, nil -} - -func replayWAL(statePath string, db *Database) (walReplayResult, error) { - result := walReplayResult{} - header, entries, corrupted, err := readWAL(statePath) - if err != nil { - if os.IsNotExist(err) { - return result, nil - } - return result, fmt.Errorf("%w: %v", errWALRead, err) - } - - expectedSerial := db.Serial + 1 - if header.Serial < expectedSerial { - result.stale = true - return result, nil - } - - if header.Serial > expectedSerial { - return result, fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, 
expectedSerial) - } - - if db.Lineage != "" && header.Lineage != "" && db.Lineage != header.Lineage { - return result, fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Lineage) - } - - if db.Lineage == "" && header.Lineage != "" { - db.Lineage = header.Lineage - } - - if db.State == nil { - db.State = make(map[string]ResourceEntry) - } - - for _, entry := range entries { - if entry.V != nil { - db.State[entry.K] = *entry.V - } else { - delete(db.State, entry.K) - } - } - - result.recovered = true - result.entriesRecovered = len(entries) - result.corruptedEntries = corrupted - return result, nil -} - -func recoverFromWAL(ctx context.Context, statePath string, db *Database) (bool, error) { - replayResult, err := replayWAL(statePath, db) - if err != nil { - if errors.Is(err, errWALRead) { - if moveErr := moveWALToCorrupted(statePath); moveErr != nil { - return false, moveErr - } - log.Warnf(ctx, "Failed to read WAL file, moved it to %s and proceeding: %s", relativePathForLog(walCorruptedPath(statePath)), strings.TrimPrefix(err.Error(), errWALRead.Error()+": ")) - return false, nil - } - return false, err - } - - if replayResult.stale { - log.Warnf(ctx, "Deleting stale WAL (serial=%s behind current state serial=)") - if err := cleanupWAL(statePath); err != nil { - return false, err - } - return false, nil - } - - if !replayResult.recovered { - return false, nil - } - - logRecoveryProgress(ctx, "Recovering state from WAL file: "+relativePathForLog(walPath(statePath))) - walLogPath := relativePathForLog(walPath(statePath)) - for _, corrupted := range replayResult.corruptedEntries { - log.Warnf(ctx, "Could not read state file WAL entry in %s: line %d", walLogPath, corrupted.lineNumber) - } - - if err := writeCorruptedWALEntries(statePath, replayResult.corruptedEntries); err != nil { - return false, err - } - if len(replayResult.corruptedEntries) > 0 { - log.Warnf(ctx, "Saved corrupted WAL entries to %s", 
relativePathForLog(walCorruptedPath(statePath))) - } - - logRecoveryProgress(ctx, fmt.Sprintf("Recovered %d entries from WAL file.", replayResult.entriesRecovered)) - return true, nil -} - -func relativePathForLog(path string) string { - rel, err := filepath.Rel(".", path) - if err != nil { - return path - } - return filepath.ToSlash(rel) -} - -func logRecoveryProgress(ctx context.Context, message string) { - defer func() { - _ = recover() - }() - cmdio.LogString(ctx, message) -} diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 75081de56e..54391ec4d6 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -236,7 +236,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) + err = direct.OpenStateWithPlanCheck(ctx, &b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) return b, stateDesc, root.ErrAlreadyPrinted From f79fa29cb6732862fc77a2cb63020ac4acce9912 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 22:22:02 +0100 Subject: [PATCH 14/85] fixes --- bundle/direct/dstate/state.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index e409c3f6e8..df237d97cb 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -209,6 +209,7 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W func (db *DeploymentState) Reload(ctx context.Context) error { + db.stateIDs = make(map[string]string) data, err := os.ReadFile(db.Path) if err != nil { if errors.Is(err, fs.ErrNotExist) { @@ -250,7 +251,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea return fmt.Errorf("state_version in the header (%q) 
does not match the one in the state (%q)", header.StateVersion, db.Data.StateVersion) } - if header.Lineage != db.Data.Lineage { + if header.Lineage != db.Data.Lineage && db.Data.Lineage != "" { return fmt.Errorf("lineage in the header (%q) does not match the one in the state (%q)", header.Lineage, db.Data.Lineage) } From bb11b78a2b78d346b59c95bbf2a1adef11dd0a12 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 22:38:05 +0100 Subject: [PATCH 15/85] fixes --- bundle/direct/bundle_apply.go | 1 + bundle/direct/bundle_plan.go | 2 +- bundle/direct/dstate/state.go | 10 +- bundle/direct/dstate/wal_test.go | 549 ------------------------------- bundle/phases/deploy.go | 9 + cmd/bundle/utils/process.go | 2 +- 6 files changed, 18 insertions(+), 555 deletions(-) delete mode 100644 bundle/direct/dstate/wal_test.go diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 7a77968515..6bad809146 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -25,6 +25,7 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return } + b.StateDB.AssertOpenedForWrite() b.RemoteStateCache.Clear() g, err := makeGraph(plan) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 03fe2b87bb..1fb70123b9 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -40,7 +40,7 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { // ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. // This should be called early in the deployment process, before any file operations. // If the plan has no lineage (first deployment), validation is skipped. 
-func OpenStateWithPlanCheck(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { +func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { // If plan has no lineage, this is a first deployment before any state exists // No validation needed if plan.Lineage == "" { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index df237d97cb..1e88228427 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -146,7 +146,7 @@ func (db *DeploymentState) GetResourceID(key string) string { return "" } - entry, _ := db.Data.State[key] + entry := db.Data.State[key] return entry.ID } @@ -164,7 +164,9 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } db.Path = path - db.Reload(ctx) + if err := db.Reload(ctx); err != nil { + return err + } walPath := db.Path + walSuffix _, walError := os.Stat(walPath) @@ -175,7 +177,7 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W return err } } else { - return fmt.Errorf("Unexpected WAL file found at %s", walPath) + return fmt.Errorf("unexpected WAL file found at %s", walPath) } } @@ -293,7 +295,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%s: %q: %w", walPath, lineNumber, entry, err) + return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) } hasUpdates = true if entry.Value == nil { diff --git a/bundle/direct/dstate/wal_test.go b/bundle/direct/dstate/wal_test.go deleted file mode 100644 index d8a5f23345..0000000000 --- a/bundle/direct/dstate/wal_test.go +++ /dev/null @@ -1,549 +0,0 @@ -package dstate - -import ( - "encoding/json" - "os" - "path/filepath" - "testing" - - 
"github.com/databricks/cli/bundle/deployplan" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestWALWriteAndRead(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - entry1 := &ResourceEntry{ - ID: "12345", - State: json.RawMessage(`{"name":"job1"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry1}) - require.NoError(t, err) - - entry2 := &ResourceEntry{ - ID: "67890", - State: json.RawMessage(`{"name":"job2"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job2", V: entry2}) - require.NoError(t, err) - - err = wal.writeJSON(WALEntry{K: "resources.jobs.old_job", V: nil}) - require.NoError(t, err) - - err = wal.close() - require.NoError(t, err) - - header, entries, _, err := readWAL(statePath) - require.NoError(t, err) - - assert.Equal(t, "test-lineage", header.Lineage) - assert.Equal(t, 1, header.Serial) - - require.Len(t, entries, 3) - - assert.Equal(t, "resources.jobs.job1", entries[0].K) - require.NotNil(t, entries[0].V) - assert.Equal(t, "12345", entries[0].V.ID) - - assert.Equal(t, "resources.jobs.job2", entries[1].K) - require.NotNil(t, entries[1].V) - assert.Equal(t, "67890", entries[1].V.ID) - - assert.Equal(t, "resources.jobs.old_job", entries[2].K) - assert.Nil(t, entries[2].V) -} - -func TestCleanupWAL(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - require.NoError(t, err) - - err = wal.close() - require.NoError(t, err) - err = cleanupWAL(statePath) - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - 
assert.True(t, os.IsNotExist(err)) -} - -func TestOpenWALFailsIfFileAlreadyExists(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - require.NoError(t, wal.close()) - - _, err = openWAL(statePath) - require.Error(t, err) - assert.Contains(t, err.Error(), "failed to open WAL file") -} - -func TestRecoverFromWAL_NoWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - db := NewDatabase("", 0) - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.False(t, recovered) -} - -func TestRecoverFromWAL_ValidWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - entry := &ResourceEntry{ - ID: "12345", - State: json.RawMessage(`{"name":"job1"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("", 0) - - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - - assert.Equal(t, "test-lineage", db.Lineage) - require.Contains(t, db.State, "resources.jobs.job1") - assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) -} - -func TestRecoverFromWAL_StaleWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("test-lineage", 2) // serial 2 makes WAL stale - - recovered, err := 
recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.False(t, recovered) - - _, err = os.Stat(walFilePath) - assert.True(t, os.IsNotExist(err)) -} - -func TestRecoverFromWAL_FutureWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 5}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("test-lineage", 0) - - _, err = recoverFromWAL(ctx, statePath, &db) - assert.Error(t, err) - assert.Contains(t, err.Error(), "ahead of expected") -} - -func TestRecoverFromWAL_LineageMismatch(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "lineage-A", Serial: 1}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("lineage-B", 0) - - _, err = recoverFromWAL(ctx, statePath, &db) - assert.Error(t, err) - assert.Contains(t, err.Error(), "lineage") -} - -func TestRecoverFromWAL_DeleteOperation(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - - entry := &ResourceEntry{ - ID: "12345", - State: json.RawMessage(`{"name":"job1"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: entry}) - require.NoError(t, err) - - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: nil}) - require.NoError(t, err) - - err = wal.close() - require.NoError(t, err) - - db := NewDatabase("", 0) - - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - - assert.NotContains(t, 
db.State, "resources.jobs.job1") -} - -func TestDeploymentState_WALIntegration(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - require.NoError(t, err) - - header, entries, _, err := readWAL(statePath) - require.NoError(t, err) - assert.Equal(t, 1, header.Serial) - require.Len(t, entries, 1) - assert.Equal(t, "resources.jobs.job1", entries[0].K) - assert.Equal(t, "12345", entries[0].V.ID) - - err = db.Finalize() - require.NoError(t, err) - - _, err = os.Stat(walFilePath) - assert.True(t, os.IsNotExist(err)) - - data, err := os.ReadFile(statePath) - require.NoError(t, err) - var savedDB Database - err = json.Unmarshal(data, &savedDB) - require.NoError(t, err) - assert.Equal(t, 1, savedDB.Serial) - assert.Contains(t, savedDB.State, "resources.jobs.job1") -} - -func TestDeploymentState_WALRecoveryOnOpen(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - initialDB := NewDatabase("test-lineage", 5) - initialDB.State["resources.jobs.existing"] = ResourceEntry{ - ID: "existing-id", - State: json.RawMessage(`{"name":"existing"}`), - } - data, err := json.Marshal(initialDB) - require.NoError(t, err) - err = os.WriteFile(statePath, data, 0o600) - require.NoError(t, err) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 6}) - require.NoError(t, err) - entry := &ResourceEntry{ - ID: "new-id", - State: json.RawMessage(`{"name":"new"}`), - } - err = wal.writeJSON(WALEntry{K: "resources.jobs.new", V: entry}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - var db DeploymentState - 
err = db.Open(ctx, statePath) - require.NoError(t, err) - - assert.Contains(t, db.Data.State, "resources.jobs.existing") - assert.Contains(t, db.Data.State, "resources.jobs.new") - assert.Equal(t, "new-id", db.Data.State["resources.jobs.new"].ID) -} - -func TestDeploymentState_DeleteStateWritesWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) - require.NoError(t, err) - - err = db.DeleteState("resources.jobs.job1") - require.NoError(t, err) - - _, entries, _, err := readWAL(statePath) - require.NoError(t, err) - - require.Len(t, entries, 2) - assert.Equal(t, "resources.jobs.job1", entries[1].K) - assert.Nil(t, entries[1].V) - - err = db.Finalize() - require.NoError(t, err) - - data, err := os.ReadFile(statePath) - require.NoError(t, err) - var savedDB Database - err = json.Unmarshal(data, &savedDB) - require.NoError(t, err) - assert.NotContains(t, savedDB.State, "resources.jobs.job1") -} - -func TestDeploymentState_WALWithDependsOn(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - t.Cleanup(func() { db.Close() }) - - dependsOn := []deployplan.DependsOnEntry{ - {Node: "resources.clusters.cluster1", Label: "${resources.clusters.cluster1.id}"}, - } - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, dependsOn) - require.NoError(t, err) - - _, entries, _, err := readWAL(statePath) - require.NoError(t, err) - - require.Len(t, entries, 1) - require.NotNil(t, entries[0].V) - require.Len(t, entries[0].V.DependsOn, 1) - assert.Equal(t, "resources.clusters.cluster1", entries[0].V.DependsOn[0].Node) -} - -func TestRecoverFromWAL_CorruptedMiddleLine(t 
*testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - content := `{"lineage":"test","serial":1} -{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} -not valid json -{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} -` - err := os.WriteFile(walFilePath, []byte(content), 0o600) - require.NoError(t, err) - - db := NewDatabase("", 0) - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - assert.Len(t, db.State, 2) - assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) - assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) - - corruptedPath := walCorruptedPath(statePath) - _, err = os.Stat(corruptedPath) - require.NoError(t, err) - contentBytes, err := os.ReadFile(corruptedPath) - require.NoError(t, err) - assert.Equal(t, "not valid json\n", string(contentBytes)) - _, err = os.Stat(walFilePath) - require.NoError(t, err) -} - -func TestRecoverFromWAL_CorruptedLastLine(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - content := `{"lineage":"test","serial":1} -{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} -{"k":"resources.jobs.job2","v":{"__id__":"67890","state":{}}} -not valid json -` - err := os.WriteFile(walFilePath, []byte(content), 0o600) - require.NoError(t, err) - - db := NewDatabase("", 0) - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - - assert.Contains(t, db.State, "resources.jobs.job1") - assert.Contains(t, db.State, "resources.jobs.job2") - assert.Equal(t, "12345", db.State["resources.jobs.job1"].ID) - assert.Equal(t, "67890", db.State["resources.jobs.job2"].ID) - - corruptedPath := walCorruptedPath(statePath) - _, err = os.Stat(corruptedPath) - require.NoError(t, err) - contentBytes, err := 
os.ReadFile(corruptedPath) - require.NoError(t, err) - assert.Equal(t, "not valid json\n", string(contentBytes)) -} - -func TestDeploymentState_RecoveredFromWALFlag(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - - initialDB := NewDatabase("test-lineage", 0) - data, err := json.Marshal(initialDB) - require.NoError(t, err) - err = os.WriteFile(statePath, data, 0o600) - require.NoError(t, err) - - wal, err := openWAL(statePath) - require.NoError(t, err) - err = wal.writeJSON(WALHeader{Lineage: "test-lineage", Serial: 1}) - require.NoError(t, err) - err = wal.writeJSON(WALEntry{K: "resources.jobs.job1", V: &ResourceEntry{ID: "123", State: json.RawMessage(`{}`)}}) - require.NoError(t, err) - err = wal.close() - require.NoError(t, err) - - var db DeploymentState - err = db.Open(ctx, statePath) - require.NoError(t, err) - - assert.True(t, db.RecoveredFromWAL()) -} - -func TestRecoverFromWAL_LineageAdoption(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - content := `{"lineage":"adopted-lineage","serial":1} -{"k":"resources.jobs.job1","v":{"__id__":"12345","state":{}}} -` - err := os.WriteFile(walFilePath, []byte(content), 0o600) - require.NoError(t, err) - - db := NewDatabase("", 0) // empty lineage - recovered, err := recoverFromWAL(ctx, statePath, &db) - require.NoError(t, err) - assert.True(t, recovered) - assert.Equal(t, "adopted-lineage", db.Lineage) -} - -func TestReadWAL_EmptyFile(t *testing.T) { - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - err := os.WriteFile(walFilePath, []byte(""), 0o600) - require.NoError(t, err) - - _, _, _, err = readWAL(statePath) - assert.Error(t, err) - assert.Contains(t, err.Error(), "empty") -} - -func TestDeploymentState_MultipleOperationsSameKey(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - 
statePath := filepath.Join(dir, "resources.json") - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "111", map[string]string{"v": "1"}, nil) - require.NoError(t, err) - - err = db.DeleteState("resources.jobs.job1") - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "222", map[string]string{"v": "2"}, nil) - require.NoError(t, err) - - _, entries, _, err := readWAL(statePath) - require.NoError(t, err) - require.Len(t, entries, 3) - assert.Equal(t, "111", entries[0].V.ID) - assert.Nil(t, entries[1].V) - assert.Equal(t, "222", entries[2].V.ID) - - err = db.Finalize() - require.NoError(t, err) - - entry, ok := db.GetResourceEntry("resources.jobs.job1") - require.True(t, ok) - assert.Equal(t, "222", entry.ID) -} - -func TestDeploymentState_FinalizeFailsOnCorruptedWAL(t *testing.T) { - ctx := t.Context() - dir := t.TempDir() - statePath := filepath.Join(dir, "resources.json") - walFilePath := walPath(statePath) - - var db DeploymentState - err := db.Open(ctx, statePath) - require.NoError(t, err) - - err = db.SaveState("resources.jobs.job1", "12345", map[string]string{"name": "job1"}, nil) - require.NoError(t, err) - - f, err := os.OpenFile(walFilePath, os.O_WRONLY|os.O_APPEND, 0) - require.NoError(t, err) - _, err = f.WriteString("{\"k\":\"resources.jobs.partial_write\",\"v\":{\"__id__\":\"999\",\"state\":{\"name\":\"partial-\n") - require.NoError(t, err) - require.NoError(t, f.Sync()) - require.NoError(t, f.Close()) - - err = db.Finalize() - require.Error(t, err) - assert.Contains(t, err.Error(), "failed to replay WAL during finalize: corrupted entry at line") - - _, err = os.Stat(walFilePath) - require.NoError(t, err) -} diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index b4d70ede5a..2a79d6d209 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -15,6 +15,7 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" 
"github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/bundle/metrics" "github.com/databricks/cli/bundle/permissions" @@ -149,6 +150,8 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } + _, localPath := b.StateFilenameDirect(ctx) + if plan != nil { // Initialize DeploymentBundle for applying the loaded plan err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(ctx), plan) @@ -158,6 +161,12 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand } } else { plan = RunPlan(ctx, b, engine) + err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(false), dstate.WithWrite(true)) + if err != nil { + logdiag.LogError(ctx, err) + return + } + } if logdiag.HasError(ctx) { diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 54391ec4d6..75081de56e 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -236,7 +236,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.OpenStateWithPlanCheck(ctx, &b.DeploymentBundle.StateDB, plan) + err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) return b, stateDesc, root.ErrAlreadyPrinted From fb00793df116dc7f3720fbf6c1f1e041b07c17bd Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 27 Mar 2026 22:40:38 +0100 Subject: [PATCH 16/85] rm unnecessary assert --- bundle/direct/dstate/state.go | 1 - 1 file changed, 1 deletion(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 1e88228427..59d0804bbf 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ 
-381,7 +381,6 @@ func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.Export } func (db *DeploymentState) unlockedSave() error { - db.AssertOpenedForWrite() data, err := json.MarshalIndent(db.Data, "", " ") if err != nil { return err From 9f3d0ec14d2edb6d384bc89af9c639b1e293a905 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sat, 28 Mar 2026 08:05:55 +0100 Subject: [PATCH 17/85] Centralize state open/close lifecycle for direct engine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move state open/close management to process.go so the lifecycle is transparent. process.go opens state for read (with WAL recovery) after PullResourcesState and defers close. Deploy/destroy upgrade to write mode via the new UpgradeToWrite() method which initializes the WAL without re-reading state JSON. Internal functions (CalculatePlan, ExportState, InitForApply, ValidatePlanAgainstState) no longer manage their own open/close — they expect state to already be open. Self-managed callers (bind, migrate, yaml_sync, diff) handle their own state lifecycle. Plan command uses ProcessBundleRetWithPlan to compute the plan while state is still open for read inside processBundleRetInternal. 
Co-authored-by: Isaac --- bundle/configsync/diff.go | 6 ++ bundle/direct/bind.go | 24 ++++- bundle/direct/bundle_apply.go | 2 + bundle/direct/bundle_plan.go | 19 ++-- bundle/direct/dstate/state.go | 73 ++++++++++----- bundle/direct/pkg.go | 3 +- bundle/phases/deploy.go | 43 +++++++-- bundle/phases/destroy.go | 13 +++ .../statemgmt/upload_state_for_yaml_sync.go | 24 ++++- cmd/bundle/deployment/migrate.go | 28 +++++- cmd/bundle/plan.go | 6 +- cmd/bundle/utils/process.go | 91 ++++++++++++------- 12 files changed, 242 insertions(+), 90 deletions(-) diff --git a/bundle/configsync/diff.go b/bundle/configsync/diff.go index dee7fa4811..f767966c16 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -14,6 +14,7 @@ import ( "github.com/databricks/cli/bundle/deploy" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/dyn" "github.com/databricks/cli/libs/dyn/convert" "github.com/databricks/cli/libs/log" @@ -139,6 +140,11 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy } } + if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return nil, fmt.Errorf("failed to open state: %w", err) + } + defer deployBundle.StateDB.Close(ctx) + plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return nil, fmt.Errorf("failed to calculate plan: %w", err) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 74389313af..fe8ced6d22 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -62,8 +62,10 @@ type BindResult struct { func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root, statePath, resourceKey, resourceID string) (*BindResult, error) { // Check if the resource is already managed (bound to a different ID) var checkStateDB 
dstate.DeploymentState - if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(true)); err == nil { - if existingID := checkStateDB.GetResourceID(resourceKey); existingID != "" { + if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { + existingID := checkStateDB.GetResourceID(resourceKey) + checkStateDB.Close(ctx) + if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, ExistingID: existingID, @@ -105,11 +107,17 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac log.Infof(ctx, "Bound %s to id=%s (in temp state)", resourceKey, resourceID) // First plan + update: populate state with resolved config + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } plan, err := b.CalculatePlan(ctx, client, configRoot) if err != nil { os.Remove(tmpStatePath) return nil, err } + b.StateDB.Close(ctx) // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -132,6 +140,12 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } } + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(true)) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } + err = b.StateDB.SaveState(resourceKey, resourceID, sv.Value, dependsOn) if err != nil { os.Remove(tmpStatePath) @@ -146,7 +160,13 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Second plan: this is the plan to present to the user (change between remote resource and config) + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } plan, err = b.CalculatePlan(ctx, client, configRoot) + b.StateDB.Close(ctx) if err != nil { os.Remove(tmpStatePath) return 
nil, err diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 6bad809146..6b84f40775 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -151,6 +151,8 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return true }) + + // Note: caller is responsible for closing StateDB after Apply returns. } func (b *DeploymentBundle) LookupReferencePostDeploy(ctx context.Context, path *structpath.PathNode) (any, error) { diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 1fb70123b9..4f21d0fa06 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -37,24 +37,17 @@ func (b *DeploymentBundle) init(client *databricks.WorkspaceClient) error { return err } -// ValidatePlanAgainstState validates that a plan's lineage and serial match the current state. -// This should be called early in the deployment process, before any file operations. +// ValidatePlanAgainstState validates that a plan's lineage and serial match the given state. // If the plan has no lineage (first deployment), validation is skipped. 
-func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { - // If plan has no lineage, this is a first deployment before any state exists - // No validation needed +func ValidatePlanAgainstState(stateDB *dstate.DeploymentState, plan *deployplan.Plan) error { if plan.Lineage == "" { return nil } - stateDB.AssertOpened() - - // Validate that the plan's lineage matches the current state's lineage if plan.Lineage != stateDB.Data.Lineage { return fmt.Errorf("plan lineage %q does not match state lineage %q; the state may have been modified by another process", plan.Lineage, stateDB.Data.Lineage) } - // Validate that the plan's serial matches the current state's serial if plan.Serial != stateDB.Data.Serial { return fmt.Errorf("plan serial %d does not match state serial %d; the state has been modified since the plan was created. Please run 'bundle plan' again", plan.Serial, stateDB.Data.Serial) } @@ -63,9 +56,9 @@ func ValidatePlanAgainstState(ctx context.Context, stateDB *dstate.DeploymentSta } // InitForApply initializes the DeploymentBundle for applying a pre-computed plan. -// This is used when --plan is specified to skip the planning phase. +// StateDB must already be open for write before calling this function. func (b *DeploymentBundle) InitForApply(ctx context.Context, client *databricks.WorkspaceClient, plan *deployplan.Plan) error { - b.StateDB.AssertOpened() + b.StateDB.AssertOpenedForWrite() err := b.init(client) if err != nil { @@ -97,8 +90,10 @@ func (b *DeploymentBundle) InitForApply(ctx context.Context, client *databricks. return nil } +// CalculatePlan computes the deployment plan by comparing local config against remote state. +// StateDB must already be open for read before calling this function. 
func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks.WorkspaceClient, configRoot *config.Root) (*deployplan.Plan, error) { - b.StateDB.AssertOpened() + b.StateDB.AssertOpenedForRead() err := b.init(client) if err != nil { diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 59d0804bbf..bd59c131ea 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -228,14 +228,12 @@ func (db *DeploymentState) Reload(ctx context.Context) error { func (db *DeploymentState) replayWAL(ctx context.Context) error { walPath := db.Path + walSuffix - hasUpdates, err := db.mergeWalIntoState(ctx) + err := db.mergeWalIntoState(ctx) if err != nil { return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) } - if hasUpdates { - if err := db.unlockedSave(); err != nil { - return err - } + if err := db.unlockedSave(); err != nil { + return err } err = os.Remove(walPath) if err != nil { @@ -264,16 +262,15 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea return nil } -func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) { +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { if db.walFile != nil { panic("internal error: walFile must be closed") } - hasUpdates := false walPath := db.Path + walSuffix walFile, err := os.Open(walPath) if err != nil { - return false, fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } defer walFile.Close() @@ -287,17 +284,19 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) if lineNumber == 1 { var header WALHeader if err := json.Unmarshal(line, &header); err != nil { - return hasUpdates, fmt.Errorf("failed to parse WAL header: %w", err) + return fmt.Errorf("failed to parse WAL header: %w", err) } if err := db.validateWALHeader(ctx, &header); err != nil { - return hasUpdates, err + return 
err } + // Apply header metadata to state (lineage may be new for first deploy) + db.Data.Lineage = header.Lineage + db.Data.Serial = header.Serial } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return hasUpdates, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) + return fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) } - hasUpdates = true if entry.Value == nil { delete(db.Data.State, entry.Key) } else { @@ -306,22 +305,19 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } } - if err := scanner.Err(); err != nil { - return hasUpdates, err - } - - if hasUpdates { - // only assume WAL file's serial if we read any data from it - db.Data.Serial += 1 - } - - return hasUpdates, nil + return scanner.Err() } +// Close replays the WAL (if open for write) and resets the state. +// Safe to call multiple times or on an already-closed state. func (db *DeploymentState) Close(ctx context.Context) error { db.mu.Lock() defer db.mu.Unlock() + if db.Path == "" { + return nil + } + var err error if db.walFile != nil { @@ -337,6 +333,39 @@ func (db *DeploymentState) Close(ctx context.Context) error { return err } +// UpgradeToWrite transitions from read mode to write mode without re-reading state. +// State must already be open for read. This initializes the WAL for writing. 
+func (db *DeploymentState) UpgradeToWrite() error { + db.mu.Lock() + defer db.mu.Unlock() + + if db.Path == "" { + return fmt.Errorf("internal error: DeploymentState must be opened first") + } + if db.walFile != nil { + return fmt.Errorf("internal error: DeploymentState is already open for write") + } + + walPath := db.Path + walSuffix + walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) + if err != nil { + return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + } + db.walFile = walFile + + lineage := db.Data.Lineage + if lineage == "" { + lineage = uuid.New().String() + } + walHead := WALHeader{ + Lineage: lineage, + Serial: db.Data.Serial + 1, + StateVersion: currentStateVersion, + CLIVersion: build.GetInfo().Version, + } + return appendJSONLine(db.walFile, walHead) +} + func (db *DeploymentState) AssertOpenedForReadOrWrite() { if db.Path == "" { panic("internal error: DeploymentState must be opened first") diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 58b9bc6b4b..50beda36f5 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -64,7 +64,8 @@ func (d *DeploymentUnit) SetRemoteState(remoteState any) error { return nil } +// ExportState exports the current deployment state as a resource map. +// StateDB must already be open for read before calling this function. func (b *DeploymentBundle) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { - b.StateDB.AssertOpened() return b.StateDB.ExportState(ctx) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 2a79d6d209..2b9c115e8a 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -82,6 +82,18 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } + // Close state to replay WAL into state file, then reopen for read. + // PushResourcesState needs the file on disk, Load needs the state in memory. 
+ if targetEngine.IsDirect() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + _, localPath := b.StateFilenameDirect(ctx) + if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + logdiag.LogError(ctx, err) + } + } + // Even if deployment failed, there might be updates in states that we need to upload statemgmt.PushResourcesState(ctx, b, targetEngine) if logdiag.HasError(ctx) { @@ -150,9 +162,19 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - _, localPath := b.StateFilenameDirect(ctx) - if plan != nil { + if engine.IsDirect() { + // Upgrade from read (opened by process.go) to write mode + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + }() + } // Initialize DeploymentBundle for applying the loaded plan err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(ctx), plan) if err != nil { @@ -160,13 +182,20 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } } else { + // State is already open for read by process.go (for direct engine) plan = RunPlan(ctx, b, engine) - err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(false), dstate.WithWrite(true)) - if err != nil { - logdiag.LogError(ctx, err) - return + if engine.IsDirect() { + // Upgrade from read to write mode (Apply needs write access) + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + }() } - } if logdiag.HasError(ctx) { diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 91640ac6ca..fe93d23081 100644 --- 
a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -13,6 +13,7 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" @@ -168,6 +169,18 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { } if hasApproval { + if engine.IsDirect() { + // Upgrade from read (opened by process.go) to write mode + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + logdiag.LogError(ctx, err) + } + }() + } destroyCore(ctx, b, plan, engine) } else { cmdio.LogString(ctx, "Destroy cancelled!") diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 74def3174f..86c9a0c37b 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -141,13 +141,17 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun migratedDB := dstate.NewDatabase(tfState.Lineage, tfState.Serial+1) migratedDB.State = state - deploymentBundle := &direct.DeploymentBundle{ - StateDB: dstate.DeploymentState{ - Path: snapshotPath, - Data: migratedDB, - }, + // Write the migrated state to disk so CalculatePlan can read it via Open. 
+ migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") + if err != nil { + return diag.FromErr(fmt.Errorf("marshaling migrated state: %w", err)) + } + if err := os.WriteFile(snapshotPath, migratedStateJSON, 0o600); err != nil { + return diag.FromErr(fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err)) } + deploymentBundle := &direct.DeploymentBundle{} + // Apply SecretScopeFixups so the config matches what the direct engine expects. // This adds MANAGE ACL for the current user to all secret scopes, ensuring // the migrated state and config agree on .permissions entries. @@ -173,6 +177,11 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun return false, fmt.Errorf("failed to create uninterpolated config: %w", err) } + if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return diag.FromErr(fmt.Errorf("failed to open state: %w", err)) + } + defer deploymentBundle.StateDB.Close(ctx) + plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) if err != nil { return false, err @@ -197,6 +206,11 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } } + err = deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)) + if err != nil { + return diag.FromErr(fmt.Errorf("reopening state for apply: %w", err)) + } + deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) if err := deploymentBundle.StateDB.Finalize(); err != nil { return false, err diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index 5020d88e73..4c657c1166 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -8,6 +8,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strings" "github.com/databricks/cli/bundle" @@ -227,12 +228,19 @@ To start using direct engine, set "engine: 
direct" under bundle in your databric migratedDB := dstate.NewDatabase(stateDesc.Lineage, stateDesc.Serial+1) migratedDB.State = state - deploymentBundle := &direct.DeploymentBundle{ - StateDB: dstate.DeploymentState{ - Path: tempStatePath, - Data: migratedDB, - }, + // Write the migrated state to disk so CalculatePlan can read it via Open. + migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") + if err != nil { + return fmt.Errorf("marshaling migrated state: %w", err) + } + if err := os.MkdirAll(filepath.Dir(tempStatePath), 0o755); err != nil { + return fmt.Errorf("creating state directory: %w", err) } + if err := os.WriteFile(tempStatePath, migratedStateJSON, 0o600); err != nil { + return fmt.Errorf("writing migrated state to %s: %w", tempStatePath, err) + } + + deploymentBundle := &direct.DeploymentBundle{} tempStatePathAutoRemove := true @@ -250,6 +258,10 @@ To start using direct engine, set "engine: direct" under bundle in your databric return root.ErrAlreadyPrinted } + if err := deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return fmt.Errorf("failed to open state: %w", err) + } + plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return err @@ -281,6 +293,12 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } + deploymentBundle.StateDB.Close(ctx) + err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) + if err != nil { + return fmt.Errorf("reopening state for apply: %w", err) + } + deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) if err := deploymentBundle.StateDB.Finalize(); err != nil { logdiag.LogError(ctx, err) diff --git a/cmd/bundle/plan.go b/cmd/bundle/plan.go index e3dd63929e..d14f820f4e 100644 --- a/cmd/bundle/plan.go +++ b/cmd/bundle/plan.go @@ -7,7 +7,6 @@ import ( 
"github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/deployplan" - "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/cmd/bundle/utils" "github.com/databricks/cli/cmd/root" "github.com/databricks/cli/libs/flags" @@ -56,14 +55,13 @@ It is useful for previewing changes before running 'bundle deploy'.`, } } - b, stateDesc, err := utils.ProcessBundleRet(cmd, opts) + _, _, plan, err := utils.ProcessBundleRetWithPlan(cmd, opts) if err != nil { return err } ctx := cmd.Context() - plan := phases.RunPlan(ctx, b, stateDesc.Engine) - if logdiag.HasError(ctx) { + if plan == nil || logdiag.HasError(ctx) { return root.ErrAlreadyPrinted } diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 75081de56e..9948b77a34 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -13,6 +13,7 @@ import ( "github.com/databricks/cli/bundle/config/validate" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/cmd/root" @@ -75,16 +76,33 @@ type ProcessOptions struct { // (after state is opened and IDs loaded, before deferred Finalize). PostStateFunc func(ctx context.Context, b *bundle.Bundle, stateDesc *statemgmt.StateDesc) error + // If true, compute the deployment plan and return it via ProcessBundleRetWithPlan. + // The plan is computed after PreDeployChecks while state is still open for read. 
+ ComputePlan bool + + // Indicate whether the bundle operation originates from the pipelines CLI IsPipelinesCLI bool } func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, error) { - b, _, err := ProcessBundleRet(cmd, opts) + b, _, _, err := processBundleRetInternal(cmd, opts) return b, err } -func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { +func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { + b, stateDesc, _, err := processBundleRetInternal(cmd, opts) + return b, stateDesc, err +} + +// ProcessBundleRetWithPlan is like ProcessBundleRet but also computes and returns a deployment plan. +// opts.ComputePlan must be true. +func ProcessBundleRetWithPlan(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, *deployplan.Plan, error) { + opts.ComputePlan = true + return processBundleRetInternal(cmd, opts) +} + +func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, plan *deployplan.Plan, retErr error) { var err error ctx := cmd.Context() if opts.SkipInitContext { @@ -116,20 +134,20 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle } if logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } variables, err := cmd.Flags().GetStringSlice("var") if err != nil { logdiag.LogDiag(ctx, diag.FromErr(err)[0]) - return b, nil, err + return b, nil, nil, err } // Initialize variables by assigning them values passed as command line flags configureVariables(cmd, b, variables) if b == nil || logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } ctx = cmd.Context() @@ -152,19 +170,19 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if opts.IncludeLocations 
{ bundle.ApplyContext(ctx, b, mutator.PopulateLocations()) if logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } } } if logdiag.HasError(ctx) { - return b, nil, root.ErrAlreadyPrinted + return b, nil, nil, root.ErrAlreadyPrinted } if opts.PostInitFunc != nil { err := opts.PostInitFunc(ctx, b) if err != nil { - return b, nil, err + return b, nil, nil, err } } @@ -173,24 +191,27 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if shouldReadState { requiredEngine, err := ResolveEngineSetting(ctx, b) if err != nil { - return b, nil, err + return b, nil, nil, err } // PullResourcesState depends on stateFiler which needs b.Config.Workspace.StatePath which is set in phases.Initialize ctx, stateDesc = statemgmt.PullResourcesState(ctx, b, statemgmt.AlwaysPull(opts.AlwaysPull), requiredEngine) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } cmd.SetContext(ctx) - // Open direct engine state once for all subsequent operations (ExportState, CalculatePlan, Apply, etc.) - needDirectState := stateDesc.Engine.IsDirect() && (opts.InitIDs || opts.ErrorOnEmptyState || opts.Deploy || opts.ReadPlanPath != "" || opts.PreDeployChecks || opts.PostStateFunc != nil) - if needDirectState { + // Open state for read (with WAL recovery) so that ExportState, CalculatePlan, etc. can access it. + // Caller is responsible for closing state when done (Deploy closes read + reopens for write). 
+ if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) - if err := b.DeploymentBundle.StateDB.Open(localPath); err != nil { - logdiag.LogError(ctx, err) - return b, stateDesc, root.ErrAlreadyPrinted + if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + return b, stateDesc, nil, err } + defer func() { + // Close is idempotent — no-op if already closed by Deploy + b.DeploymentBundle.StateDB.Close(ctx) + }() } // These are not safe in plan/deploy because they insert empty config settings for deleted resources. @@ -208,17 +229,15 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle } bundle.ApplySeqContext(ctx, b, mutators...) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } } - var plan *deployplan.Plan - if opts.ReadPlanPath != "" { if !stateDesc.Engine.IsDirect() { logdiag.LogError(ctx, errors.New("--plan is only supported with direct engine (set bundle.engine to \"direct\" or DATABRICKS_BUNDLE_ENGINE=direct)")) - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } opts.Build = false opts.PreDeployChecks = false @@ -227,7 +246,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle plan, err = deployplan.LoadPlanFromFile(opts.ReadPlanPath) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } currentVersion := build.GetInfo().Version if plan.CLIVersion != currentVersion { @@ -236,10 +255,10 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle // Validate that the plan's lineage and serial match the current state // This must happen before any file operations - err = direct.ValidatePlanAgainstState(ctx, &b.DeploymentBundle.StateDB, plan) + err = 
direct.ValidatePlanAgainstState(&b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } else if opts.Deploy { opts.Build = true @@ -255,14 +274,14 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle }) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } // Pipeline CLI only validation. if opts.IsPipelinesCLI { rejectDefinitions(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } } @@ -270,7 +289,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if opts.Validate { validate.Validate(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } @@ -285,7 +304,7 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle }) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } @@ -294,7 +313,15 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle phases.PreDeployChecks(ctx, b, downgradeWarningToError, stateDesc.Engine) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted + } + } + + // Compute plan while state is open for read (before Deploy upgrades to write) + if opts.ComputePlan && plan == nil { + plan = phases.RunPlan(ctx, b, stateDesc.Engine) + if logdiag.HasError(ctx) { + return b, stateDesc, nil, root.ErrAlreadyPrinted } } @@ -314,25 +341,25 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle }) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } 
if b != nil && stateDesc != nil && stateDesc.Engine.IsDirect() && stateDesc.HasRemoteTerraformState() { statemgmt.BackupRemoteTerraformState(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, root.ErrAlreadyPrinted + return b, stateDesc, nil, root.ErrAlreadyPrinted } } } if opts.PostStateFunc != nil { if err := opts.PostStateFunc(ctx, b, stateDesc); err != nil { - return b, stateDesc, err + return b, stateDesc, nil, err } } - return b, stateDesc, nil + return b, stateDesc, plan, nil } // ResolveEngineSetting determines the effective engine setting by combining bundle config and env var. From adf0621da11d7b21cc39aafa32daaa047d796202 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sat, 28 Mar 2026 08:08:09 +0100 Subject: [PATCH 18/85] lint --- bundle/direct/dstate/state.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index bd59c131ea..1433bd35db 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -4,6 +4,7 @@ import ( "bufio" "context" "encoding/json" + "errors" "fmt" "io/fs" "os" @@ -340,10 +341,10 @@ func (db *DeploymentState) UpgradeToWrite() error { defer db.mu.Unlock() if db.Path == "" { - return fmt.Errorf("internal error: DeploymentState must be opened first") + return errors.New("internal error: DeploymentState must be opened first") } if db.walFile != nil { - return fmt.Errorf("internal error: DeploymentState is already open for write") + return errors.New("internal error: DeploymentState is already open for write") } walPath := db.Path + walSuffix From cdc8e2a3cb76185e7fc5c78fa52b7ffecaf2ce35 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Wed, 29 Apr 2026 16:43:00 +0200 Subject: [PATCH 19/85] fixes Co-authored-by: Denis Bilenko --- bundle/configsync/diff.go | 8 ++--- bundle/direct/dstate/state.go | 8 ++--- bundle/direct/dstate/state_test.go | 33 +++++++++++-------- bundle/phases/deploy.go | 7 ---- bundle/phases/destroy.go | 7 
---- .../statemgmt/upload_state_for_yaml_sync.go | 19 ++++++----- cmd/bundle/deployment/migrate.go | 2 +- cmd/bundle/generate/dashboard.go | 4 ++- 8 files changed, 39 insertions(+), 49 deletions(-) diff --git a/bundle/configsync/diff.go b/bundle/configsync/diff.go index f767966c16..1770d94549 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -135,16 +135,12 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy } else { deployBundle = &direct.DeploymentBundle{} _, statePath := b.StateFilenameConfigSnapshot(ctx) - if err := deployBundle.StateDB.Open(statePath); err != nil { + if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return nil, fmt.Errorf("failed to open state: %w", err) } + defer deployBundle.StateDB.Close(ctx) } - if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return nil, fmt.Errorf("failed to open state: %w", err) - } - defer deployBundle.StateDB.Close(ctx) - plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return nil, fmt.Errorf("failed to calculate plan: %w", err) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 1433bd35db..3061927731 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -210,8 +210,6 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } func (db *DeploymentState) Reload(ctx context.Context) error { - - db.stateIDs = make(map[string]string) data, err := os.ReadFile(db.Path) if err != nil { @@ -249,7 +247,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea } if header.StateVersion != db.Data.StateVersion { - return fmt.Errorf("state_version in the header (%q) does not match the one in the state (%q)", header.StateVersion, db.Data.StateVersion) + return 
fmt.Errorf("state_version in the header (%d) does not match the one in the state (%d)", header.StateVersion, db.Data.StateVersion) } if header.Lineage != db.Data.Lineage && db.Data.Lineage != "" { @@ -257,7 +255,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea } if header.Serial != db.Data.Serial+1 { - return fmt.Errorf("serial in the header (%q) is not one higher than the one in the state (%q)", header.Serial, db.Data.Serial) + return fmt.Errorf("serial in the header (%d) is not one higher than the one in the state (%d)", header.Serial, db.Data.Serial) } return nil @@ -280,7 +278,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { lineNumber := 0 for scanner.Scan() { - lineNumber += 1 + lineNumber++ line := scanner.Bytes() if lineNumber == 1 { var header WALHeader diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index acd2a9e533..8f5b04bfe9 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -8,46 +8,51 @@ import ( "github.com/stretchr/testify/require" ) -func TestOpenSaveFinalizeRoundTrip(t *testing.T) { +func TestOpenCloseRoundTrip(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState - require.NoError(t, db.Open(path)) + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{"key": "val"}, nil)) - require.NoError(t, db.Finalize()) + require.NoError(t, db.Close(t.Context())) // Re-open and verify persisted data. 
var db2 DeploymentState - require.NoError(t, db2.Open(path)) + require.NoError(t, db2.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 1, db2.Data.Serial) assert.Equal(t, "123", db2.GetResourceID("jobs.my_job")) + require.NoError(t, db2.Close(t.Context())) } func TestPanicOnDoubleOpen(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState - require.NoError(t, db.Open(path)) + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) assert.Panics(t, func() { - _ = db.Open(path) + _ = db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) + db.Close(t.Context()) } func TestDeleteState(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState - require.NoError(t, db.Open(path)) + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) - require.NoError(t, db.Finalize()) - - require.NoError(t, db.DeleteState("jobs.my_job")) - require.NoError(t, db.Finalize()) + require.NoError(t, db.Close(t.Context())) var db2 DeploymentState - require.NoError(t, db2.Open(path)) - assert.Equal(t, 2, db2.Data.Serial) - assert.Equal(t, "", db2.GetResourceID("jobs.my_job")) + require.NoError(t, db2.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) + require.NoError(t, db2.DeleteState("jobs.my_job")) + require.NoError(t, db2.Close(t.Context())) + + var db3 DeploymentState + require.NoError(t, db3.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) + assert.Equal(t, 2, db3.Data.Serial) + assert.Equal(t, "", db3.GetResourceID("jobs.my_job")) + require.NoError(t, db3.Close(t.Context())) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 2b9c115e8a..70a81d7460 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -71,13 +71,6 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan 
*deployplan.Plan, ta if targetEngine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false)) - // Finalize state: write to disk even if deploy failed, so partial progress is saved. - // Skip for empty plans to avoid creating a state file when nothing was deployed. - if len(plan.Plan) > 0 { - if err := b.DeploymentBundle.StateDB.Finalize(); err != nil { - logdiag.LogError(ctx, err) - } - } } else { bundle.ApplyContext(ctx, b, terraform.Apply()) } diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index fe93d23081..b9ff5873bf 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -13,7 +13,6 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" - "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/cmdio" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" @@ -77,12 +76,6 @@ func approvalForDestroy(ctx context.Context, b *bundle.Bundle, plan *deployplan. func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, engine engine.EngineType) { if engine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false)) - // Skip Finalize for empty plans to avoid creating a state file when nothing was destroyed. - if len(plan.Plan) > 0 { - if err := b.DeploymentBundle.StateDB.Finalize(); err != nil { - logdiag.LogError(ctx, err) - } - } } else { // Core destructive mutators for destroy. These require informed user consent. 
bundle.ApplyContext(ctx, b, terraform.Apply()) diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 86c9a0c37b..75314c1422 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -144,10 +144,10 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun // Write the migrated state to disk so CalculatePlan can read it via Open. migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") if err != nil { - return diag.FromErr(fmt.Errorf("marshaling migrated state: %w", err)) + return false, fmt.Errorf("marshaling migrated state: %w", err) } if err := os.WriteFile(snapshotPath, migratedStateJSON, 0o600); err != nil { - return diag.FromErr(fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err)) + return false, fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err) } deploymentBundle := &direct.DeploymentBundle{} @@ -178,12 +178,12 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return diag.FromErr(fmt.Errorf("failed to open state: %w", err)) + return false, fmt.Errorf("failed to open state: %w", err) } - defer deploymentBundle.StateDB.Close(ctx) plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) if err != nil { + deploymentBundle.StateDB.Close(ctx) return false, err } @@ -206,13 +206,16 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } } - err = deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)) - if err != nil { - return diag.FromErr(fmt.Errorf("reopening state for apply: %w", err)) + // Close read state and reopen for write so Apply can record state changes via WAL. 
+ if err := deploymentBundle.StateDB.Close(ctx); err != nil { + return false, fmt.Errorf("closing state after plan: %w", err) + } + if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)); err != nil { + return false, fmt.Errorf("reopening state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(); err != nil { + if err := deploymentBundle.StateDB.Close(ctx); err != nil { return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index 4c657c1166..fb2dbf6ad5 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -300,7 +300,7 @@ To start using direct engine, set "engine: direct" under bundle in your databric } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(); err != nil { + if err := deploymentBundle.StateDB.Close(ctx); err != nil { logdiag.LogError(ctx, err) } if logdiag.HasError(ctx) { diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 70de46225c..7c9510de17 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -17,6 +17,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/generate" + "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" @@ -391,10 +392,11 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) - if err := b.DeploymentBundle.StateDB.Open(localPath); err != nil { + if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { logdiag.LogError(ctx, err) 
return } + defer b.DeploymentBundle.StateDB.Close(ctx) } bundle.ApplySeqContext(ctx, b, From 9c77b425ae07b5b64d5bd40f6c96e2c60b969d9a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Wed, 29 Apr 2026 16:47:12 +0200 Subject: [PATCH 20/85] lint Co-authored-by: Denis Bilenko --- cmd/bundle/generate/dashboard.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 7c9510de17..609b48f981 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -16,8 +16,8 @@ import ( "time" "github.com/databricks/cli/bundle" - "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/direct/dstate" + "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" From 0669b83eb5d2b2703910d1fb3c478697225d4ac9 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 11:52:17 +0200 Subject: [PATCH 21/85] restore test --- .../out.deploy.direct.txt | 7 ------- .../out.deploy.terraform.txt | 6 ------ .../artifact_upload_with_no_library_reference/output.txt | 6 ++++++ .../artifact_upload_with_no_library_reference/script | 2 +- .../artifact_upload_with_no_library_reference/test.toml | 5 ----- 5 files changed, 7 insertions(+), 19 deletions(-) delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt deleted file mode 100644 index f75a5428b1..0000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ 
- ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt deleted file mode 100644 index 8ec9c52db6..0000000000 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading whl/source.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt index 6c8bd962a5..6d24880e6c 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/output.txt @@ -1,4 +1,10 @@ +>>> [CLI] bundle deploy +Uploading whl/source.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
+ === Expecting wheel to be uploaded >>> jq .path "/api/2.0/workspace-files/import-file/Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files/whl/source.whl" diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script index fba3a77700..883601185c 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/script @@ -2,7 +2,7 @@ mkdir -p whl echo "test wheel content" > whl/source.whl -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy title "Expecting wheel to be uploaded" trace jq .path < out.requests.txt | grep import | grep whl | sort diff --git a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml index 67a9da6c97..a0a680e9d1 100644 --- a/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml +++ b/acceptance/bundle/artifacts/artifact_upload_with_no_library_reference/test.toml @@ -1,5 +1,4 @@ RecordRequests = true -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" Ignore = [ '.venv', 'dist', @@ -9,10 +8,6 @@ Ignore = [ '*.whl', ] -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' - [[Server]] Pattern = "GET /api/2.1/clusters/get" Response.Body = ''' From 57a43a73fae8aa6fc5113c86610af8e033a08d9f Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:01:55 +0200 Subject: [PATCH 22/85] Skip state file write when WAL has no resource entries If only the WAL header was written (no resource changes), replayWAL now discards the WAL without saving the state file. This avoids the spurious "Updating deployment state..." message on no-op deploys in the direct engine. 
Co-authored-by: Denis Bilenko --- bundle/direct/dstate/state.go | 20 +++++++++++--------- bundle/direct/dstate/state_test.go | 12 ++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 3061927731..b0110d519d 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -227,12 +227,14 @@ func (db *DeploymentState) Reload(ctx context.Context) error { func (db *DeploymentState) replayWAL(ctx context.Context) error { walPath := db.Path + walSuffix - err := db.mergeWalIntoState(ctx) + hasEntries, err := db.mergeWalIntoState(ctx) if err != nil { return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) } - if err := db.unlockedSave(); err != nil { - return err + if hasEntries { + if err := db.unlockedSave(); err != nil { + return err + } } err = os.Remove(walPath) if err != nil { @@ -261,7 +263,7 @@ func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHea return nil } -func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { +func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) { if db.walFile != nil { panic("internal error: walFile must be closed") } @@ -269,7 +271,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { walPath := db.Path + walSuffix walFile, err := os.Open(walPath) if err != nil { - return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) + return false, fmt.Errorf("failed to open WAL file %s: %w", walPath, err) } defer walFile.Close() @@ -283,10 +285,10 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { if lineNumber == 1 { var header WALHeader if err := json.Unmarshal(line, &header); err != nil { - return fmt.Errorf("failed to parse WAL header: %w", err) + return false, fmt.Errorf("failed to parse WAL header: %w", err) } if err := db.validateWALHeader(ctx, &header); err != nil { - return err + return false, 
err } // Apply header metadata to state (lineage may be new for first deploy) db.Data.Lineage = header.Lineage @@ -294,7 +296,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) + return false, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) } if entry.Value == nil { delete(db.Data.State, entry.Key) @@ -304,7 +306,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) error { } } - return scanner.Err() + return lineNumber > 1, scanner.Err() } // Close replays the WAL (if open for write) and resets the state. diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 8f5b04bfe9..8e817dd198 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -1,6 +1,7 @@ package dstate import ( + "os" "path/filepath" "testing" @@ -25,6 +26,17 @@ func TestOpenCloseRoundTrip(t *testing.T) { require.NoError(t, db2.Close(t.Context())) } +func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { + path := filepath.Join(t.TempDir(), "state.json") + + var db DeploymentState + require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) + require.NoError(t, db.Close(t.Context())) + + _, err := os.Stat(path) + assert.ErrorIs(t, err, os.ErrNotExist) +} + func TestPanicOnDoubleOpen(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") From f7d6a5c746831c131fb693af9d646650a368ad45 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:07:47 +0200 Subject: [PATCH 23/85] Revert per-engine test splits for no-resource deploys The splits were introduced because the direct engine printed "Updating deployment state..." on deploys with no resource changes, while terraform did not. 
The preceding commit fixes the root cause (WAL without entries no longer writes the state file), so both engines now produce identical output for these tests. Co-authored-by: Denis Bilenko --- .../build_and_files_whl/out.deploy.direct.txt | 8 ------- .../out.deploy.terraform.txt | 7 ------ .../artifacts/build_and_files_whl/output.txt | 7 ++++++ .../artifacts/build_and_files_whl/script | 2 +- .../artifacts/build_and_files_whl/test.toml | 4 ---- .../shell/bash/out.deploy.direct.txt | 7 ------ .../shell/bash/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/bash/output.txt | 5 ++++ acceptance/bundle/artifacts/shell/bash/script | 2 +- .../shell/basic/out.deploy.direct.txt | 7 ------ .../shell/basic/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/basic/output.txt | 5 ++++ .../bundle/artifacts/shell/basic/script | 2 +- .../shell/default/out.deploy.direct.txt | 7 ------ .../shell/default/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/default/output.txt | 5 ++++ .../bundle/artifacts/shell/default/script | 2 +- .../artifacts/shell/sh/out.deploy.direct.txt | 7 ------ .../shell/sh/out.deploy.terraform.txt | 6 ----- .../bundle/artifacts/shell/sh/output.txt | 5 ++++ acceptance/bundle/artifacts/shell/sh/script | 2 +- acceptance/bundle/artifacts/shell/test.toml | 4 ---- .../deploy/empty-bundle/out.deploy.direct.txt | 6 ----- .../empty-bundle/out.deploy.terraform.txt | 5 ---- .../bundle/deploy/empty-bundle/output.txt | 5 ++++ acceptance/bundle/deploy/empty-bundle/script | 2 +- .../bundle/deploy/empty-bundle/test.toml | 3 --- .../bundle/scripts/out.deploy.direct.txt | 24 ------------------- .../bundle/scripts/out.deploy.terraform.txt | 23 ------------------ acceptance/bundle/scripts/output.txt | 23 ++++++++++++++++++ .../out.deploy.direct.txt | 18 -------------- .../out.deploy.terraform.txt | 17 ------------- .../scripts/restricted-execution/output.txt | 17 +++++++++++++ .../scripts/restricted-execution/script | 2 +- 
.../scripts/restricted-execution/test.toml | 3 --- acceptance/bundle/scripts/script | 2 +- acceptance/bundle/scripts/test.toml | 3 --- .../out.deploy-one.direct.txt | 6 ----- .../out.deploy-one.terraform.txt | 5 ---- .../out.deploy-two.direct.txt | 6 ----- .../out.deploy-two.terraform.txt | 5 ---- .../deploy-artifact-path-type/output.txt | 10 ++++++++ .../deploy-artifact-path-type/script | 4 ++-- .../deploy-artifact-path-type/test.toml | 4 ---- .../out.deploy.direct.txt | 6 ----- .../out.deploy.terraform.txt | 5 ---- .../deploy-config-file-count/output.txt | 5 ++++ .../telemetry/deploy-config-file-count/script | 2 +- .../deploy-config-file-count/test.toml | 3 --- .../deploy-mode/out.deploy-dev.direct.txt | 6 ----- .../deploy-mode/out.deploy-dev.terraform.txt | 5 ---- .../deploy-mode/out.deploy-prod.direct.txt | 12 ---------- .../deploy-mode/out.deploy-prod.terraform.txt | 11 --------- .../bundle/telemetry/deploy-mode/output.txt | 16 +++++++++++++ .../bundle/telemetry/deploy-mode/script | 4 ++-- .../bundle/telemetry/deploy-mode/test.toml | 3 --- .../deploy-target-count/out.deploy.direct.txt | 6 ----- .../out.deploy.terraform.txt | 5 ---- .../telemetry/deploy-target-count/output.txt | 5 ++++ .../telemetry/deploy-target-count/script | 2 +- .../telemetry/deploy-target-count/test.toml | 3 --- .../out.deploy.direct.txt | 6 ----- .../out.deploy.terraform.txt | 5 ---- .../deploy-variable-count/output.txt | 5 ++++ .../telemetry/deploy-variable-count/script | 2 +- .../telemetry/deploy-variable-count/test.toml | 4 ---- .../out.deploy-one.direct.txt | 8 ------- .../out.deploy-one.terraform.txt | 7 ------ .../out.deploy-two.direct.txt | 8 ------- .../out.deploy-two.terraform.txt | 7 ------ .../telemetry/deploy-whl-artifacts/output.txt | 14 +++++++++++ .../telemetry/deploy-whl-artifacts/script | 4 ++-- .../telemetry/deploy-whl-artifacts/test.toml | 4 ---- .../sync_patterns/out.deploy.direct.txt | 6 ----- .../sync_patterns/out.deploy.terraform.txt | 5 ---- 
.../bundle/validate/sync_patterns/output.txt | 5 ++++ .../bundle/validate/sync_patterns/script | 2 +- .../bundle/validate/sync_patterns/test.toml | 4 ---- acceptance/cache/simple/out.deploy.direct.txt | 6 ----- .../cache/simple/out.deploy.terraform.txt | 5 ---- acceptance/cache/simple/script | 2 +- acceptance/cache/simple/test.toml | 7 ------ 82 files changed, 151 insertions(+), 369 deletions(-) delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt delete mode 100644 acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt delete mode 100644 acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/out.deploy.direct.txt delete mode 100644 acceptance/bundle/scripts/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/scripts/restricted-execution/test.toml delete mode 100644 acceptance/bundle/scripts/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt delete mode 100644 
acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-config-file-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-mode/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-target-count/test.toml delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt delete mode 100644 acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt delete mode 100644 acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt delete mode 100644 
acceptance/cache/simple/out.deploy.direct.txt delete mode 100644 acceptance/cache/simple/out.deploy.terraform.txt diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt deleted file mode 100644 index 4039d5917e..0000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt b/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt deleted file mode 100644 index 9894e5b89f..0000000000 --- a/acceptance/bundle/artifacts/build_and_files_whl/out.deploy.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Building artifact_with_custom_dist... -Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/build_and_files_whl/output.txt b/acceptance/bundle/artifacts/build_and_files_whl/output.txt index d44a21b582..b618de6b89 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/output.txt +++ b/acceptance/bundle/artifacts/build_and_files_whl/output.txt @@ -7,3 +7,10 @@ Workspace: Path: /Workspace/Users/[USERNAME]/.bundle/test-bundle/default Validation OK! + +>>> errcode [CLI] bundle deploy +Building artifact_with_custom_dist... +Uploading mydist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
diff --git a/acceptance/bundle/artifacts/build_and_files_whl/script b/acceptance/bundle/artifacts/build_and_files_whl/script index 9aa0d870e7..2d7d63f7fe 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/script +++ b/acceptance/bundle/artifacts/build_and_files_whl/script @@ -1,5 +1,5 @@ cp -r $TESTDIR/../whl_explicit/my_test_code/{setup.py,src} . trace $CLI bundle validate # I expect this deploy to work because I explicitly told where to find the wheel, but it does not: -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy rm mydist/my_test_code-0.0.1-py3-none-any.whl setup.py src/*.py diff --git a/acceptance/bundle/artifacts/build_and_files_whl/test.toml b/acceptance/bundle/artifacts/build_and_files_whl/test.toml index 8b65645e5a..a030353d57 100644 --- a/acceptance/bundle/artifacts/build_and_files_whl/test.toml +++ b/acceptance/bundle/artifacts/build_and_files_whl/test.toml @@ -1,5 +1 @@ RecordRequests = false - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt deleted file mode 100644 index f311959abd..0000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt deleted file mode 100644 index fa5d7b76bc..0000000000 --- a/acceptance/bundle/artifacts/shell/bash/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... 
-Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/output.txt b/acceptance/bundle/artifacts/shell/bash/output.txt index 8b13789179..fa5d7b76bc 100644 --- a/acceptance/bundle/artifacts/shell/bash/output.txt +++ b/acceptance/bundle/artifacts/shell/bash/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/bash/script b/acceptance/bundle/artifacts/shell/bash/script index 09bb41643c..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/bash/script +++ b/acceptance/bundle/artifacts/shell/bash/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt deleted file mode 100644 index 3a4ff9138b..0000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt deleted file mode 100644 index b5e01c79e6..0000000000 --- a/acceptance/bundle/artifacts/shell/basic/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/basic/output.txt b/acceptance/bundle/artifacts/shell/basic/output.txt index 8b13789179..b5e01c79e6 100644 --- a/acceptance/bundle/artifacts/shell/basic/output.txt +++ b/acceptance/bundle/artifacts/shell/basic/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-basic/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/basic/script b/acceptance/bundle/artifacts/shell/basic/script index 09bb41643c..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/basic/script +++ b/acceptance/bundle/artifacts/shell/basic/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt deleted file mode 100644 index f311959abd..0000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt deleted file mode 100644 index fa5d7b76bc..0000000000 --- a/acceptance/bundle/artifacts/shell/default/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/default/output.txt b/acceptance/bundle/artifacts/shell/default/output.txt index 8b13789179..fa5d7b76bc 100644 --- a/acceptance/bundle/artifacts/shell/default/output.txt +++ b/acceptance/bundle/artifacts/shell/default/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-bash/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/default/script b/acceptance/bundle/artifacts/shell/default/script index 09bb41643c..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/default/script +++ b/acceptance/bundle/artifacts/shell/default/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt deleted file mode 100644 index 98820986f5..0000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.direct.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt b/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt deleted file mode 100644 index 5117e6e9fc..0000000000 --- a/acceptance/bundle/artifacts/shell/sh/out.deploy.terraform.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Building my_artifact... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/artifacts/shell/sh/output.txt b/acceptance/bundle/artifacts/shell/sh/output.txt index 8b13789179..5117e6e9fc 100644 --- a/acceptance/bundle/artifacts/shell/sh/output.txt +++ b/acceptance/bundle/artifacts/shell/sh/output.txt @@ -1 +1,6 @@ +>>> [CLI] bundle deploy +Building my_artifact... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/shell-sh/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/artifacts/shell/sh/script b/acceptance/bundle/artifacts/shell/sh/script index 09bb41643c..68ebb78d77 100644 --- a/acceptance/bundle/artifacts/shell/sh/script +++ b/acceptance/bundle/artifacts/shell/sh/script @@ -1 +1 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/artifacts/shell/test.toml b/acceptance/bundle/artifacts/shell/test.toml index df72afb6c8..9796804e9a 100644 --- a/acceptance/bundle/artifacts/shell/test.toml +++ b/acceptance/bundle/artifacts/shell/test.toml @@ -1,7 +1,3 @@ Local = true Cloud = false RecordRequests = false - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt deleted file mode 100644 index 81dddfcb9f..0000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt b/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt deleted file mode 100644 index 494f76c84f..0000000000 --- a/acceptance/bundle/deploy/empty-bundle/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/deploy/empty-bundle/output.txt b/acceptance/bundle/deploy/empty-bundle/output.txt index 8498653a6e..919accb661 100644 --- a/acceptance/bundle/deploy/empty-bundle/output.txt +++ b/acceptance/bundle/deploy/empty-bundle/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/files... +Deploying resources... +Deployment complete! + >>> [CLI] bundle destroy --auto-approve All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default diff --git a/acceptance/bundle/deploy/empty-bundle/script b/acceptance/bundle/deploy/empty-bundle/script index b74818f1b1..775ccd0def 100644 --- a/acceptance/bundle/deploy/empty-bundle/script +++ b/acceptance/bundle/deploy/empty-bundle/script @@ -4,4 +4,4 @@ cleanup() { } trap cleanup EXIT -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy diff --git a/acceptance/bundle/deploy/empty-bundle/test.toml b/acceptance/bundle/deploy/empty-bundle/test.toml index 84da5529dc..f64800a163 100644 --- a/acceptance/bundle/deploy/empty-bundle/test.toml +++ b/acceptance/bundle/deploy/empty-bundle/test.toml @@ -2,6 +2,3 @@ Cloud = true [EnvMatrix] DATABRICKS_BUNDLE_ENABLE_EXPERIMENTAL_YAML_SYNC = ["", "true"] -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/scripts/out.deploy.direct.txt 
b/acceptance/bundle/scripts/out.deploy.direct.txt deleted file mode 100644 index 037f609f94..0000000000 --- a/acceptance/bundle/scripts/out.deploy.direct.txt +++ /dev/null @@ -1,24 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! -from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/out.deploy.terraform.txt b/acceptance/bundle/scripts/out.deploy.terraform.txt deleted file mode 100644 index a3d9ba342c..0000000000 --- a/acceptance/bundle/scripts/out.deploy.terraform.txt +++ /dev/null @@ -1,23 +0,0 @@ - ->>> EXITCODE=0 errcode [CLI] bundle deploy -Executing 'preinit' script -from myscript.py 0 preinit: hello stdout! -from myscript.py 0 preinit: hello stderr! -Executing 'postinit' script -from myscript.py 0 postinit: hello stdout! -from myscript.py 0 postinit: hello stderr! -Executing 'prebuild' script -from myscript.py 0 prebuild: hello stdout! -from myscript.py 0 prebuild: hello stderr! -Executing 'postbuild' script -from myscript.py 0 postbuild: hello stdout! -from myscript.py 0 postbuild: hello stderr! -Executing 'predeploy' script -from myscript.py 0 predeploy: hello stdout! 
-from myscript.py 0 predeploy: hello stderr! -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... -Deploying resources... -Deployment complete! -Executing 'postdeploy' script -from myscript.py 0 postdeploy: hello stdout! -from myscript.py 0 postdeploy: hello stderr! diff --git a/acceptance/bundle/scripts/output.txt b/acceptance/bundle/scripts/output.txt index a39a0b0aa9..68afb2fecc 100644 --- a/acceptance/bundle/scripts/output.txt +++ b/acceptance/bundle/scripts/output.txt @@ -25,3 +25,26 @@ Name: scripts Found 1 error Exit code: 1 + +>>> EXITCODE=0 errcode [CLI] bundle deploy +Executing 'preinit' script +from myscript.py 0 preinit: hello stdout! +from myscript.py 0 preinit: hello stderr! +Executing 'postinit' script +from myscript.py 0 postinit: hello stdout! +from myscript.py 0 postinit: hello stderr! +Executing 'prebuild' script +from myscript.py 0 prebuild: hello stdout! +from myscript.py 0 prebuild: hello stderr! +Executing 'postbuild' script +from myscript.py 0 postbuild: hello stdout! +from myscript.py 0 postbuild: hello stderr! +Executing 'predeploy' script +from myscript.py 0 predeploy: hello stdout! +from myscript.py 0 predeploy: hello stderr! +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +from myscript.py 0 postdeploy: hello stdout! +from myscript.py 0 postdeploy: hello stderr! 
diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt deleted file mode 100644 index d8fed9e4e6..0000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.direct.txt +++ /dev/null @@ -1,18 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt b/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt deleted file mode 100644 index efcf1281cb..0000000000 --- a/acceptance/bundle/scripts/restricted-execution/out.deploy.terraform.txt +++ /dev/null @@ -1,17 +0,0 @@ - ->>> errcode [CLI] bundle deploy -Executing 'preinit' script -preinit value_from_env -Executing 'postinit' script -postinit value_from_env -Executing 'prebuild' script -prebuild value_from_env -Executing 'postbuild' script -postbuild value_from_env -Executing 'predeploy' script -predeploy value_from_env -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... -Deploying resources... -Deployment complete! 
-Executing 'postdeploy' script -postdeploy value_from_env diff --git a/acceptance/bundle/scripts/restricted-execution/output.txt b/acceptance/bundle/scripts/restricted-execution/output.txt index 2186ac68f0..f377edba7c 100644 --- a/acceptance/bundle/scripts/restricted-execution/output.txt +++ b/acceptance/bundle/scripts/restricted-execution/output.txt @@ -1,5 +1,22 @@ === Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible +>>> errcode [CLI] bundle deploy +Executing 'preinit' script +preinit value_from_env +Executing 'postinit' script +postinit value_from_env +Executing 'prebuild' script +prebuild value_from_env +Executing 'postbuild' script +postbuild value_from_env +Executing 'predeploy' script +predeploy value_from_env +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/scripts_with_restricted_execution/default/files... +Deploying resources... +Deployment complete! +Executing 'postdeploy' script +postdeploy value_from_env + === With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible >>> DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode [CLI] bundle deploy Error: failed to execute script: running scripts is not allowed when DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION is set diff --git a/acceptance/bundle/scripts/restricted-execution/script b/acceptance/bundle/scripts/restricted-execution/script index 2e31cce2ee..7a3dcb068b 100644 --- a/acceptance/bundle/scripts/restricted-execution/script +++ b/acceptance/bundle/scripts/restricted-execution/script @@ -1,7 +1,7 @@ export SOME_ENV_VAR="value_from_env" title "Without DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION, all envs are accessible" -trace errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace errcode $CLI bundle deploy title "With DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1, no envs are accessible" trace DATABRICKS_BUNDLE_RESTRICTED_CODE_EXECUTION=1 errcode $CLI bundle deploy diff --git 
a/acceptance/bundle/scripts/restricted-execution/test.toml b/acceptance/bundle/scripts/restricted-execution/test.toml deleted file mode 100644 index 2a2e9c2033..0000000000 --- a/acceptance/bundle/scripts/restricted-execution/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/scripts/script b/acceptance/bundle/scripts/script index 3acb85f9cd..de07d277ea 100644 --- a/acceptance/bundle/scripts/script +++ b/acceptance/bundle/scripts/script @@ -1,3 +1,3 @@ trace EXITCODE=0 errcode $CLI bundle validate trace EXITCODE=1 errcode $CLI bundle validate -trace EXITCODE=0 errcode $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace EXITCODE=0 errcode $CLI bundle deploy diff --git a/acceptance/bundle/scripts/test.toml b/acceptance/bundle/scripts/test.toml deleted file mode 100644 index 2a2e9c2033..0000000000 --- a/acceptance/bundle/scripts/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt deleted file mode 100644 index 0e133547de..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt deleted file mode 100644 index 65960fa86d..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-one.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt deleted file mode 100644 index 120e590201..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt deleted file mode 100644 index fabdebb399..0000000000 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/out.deploy-two.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! 
diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt index 69c6730b46..a03920c3fd 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/output.txt @@ -1,4 +1,14 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "workspace_artifact_path_type": "WORKSPACE_FILE_SYSTEM" diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/script b/acceptance/bundle/telemetry/deploy-artifact-path-type/script index 4f3bd7c3cf..d1a63928a6 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/script +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {workspace_artifact_path_type}' diff --git a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml index d4126948d3..32b75237a1 100644 --- a/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml +++ b/acceptance/bundle/telemetry/deploy-artifact-path-type/test.toml @@ -20,7 +20,3 @@ Response.Body = '{}' # I'm adding 405 because that's what this test originally do. 
It's somewhat # surprising though that CLI can receive 405 and that does not result in error anywhere. Response.StatusCode = 405 - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b916..0000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37..0000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt index 1637965310..909e8d6c70 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-config-file-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "configuration_file_count": 4 diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/script b/acceptance/bundle/telemetry/deploy-config-file-count/script index 7fbdd0e677..c495bdcb07 100644 --- a/acceptance/bundle/telemetry/deploy-config-file-count/script +++ b/acceptance/bundle/telemetry/deploy-config-file-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {configuration_file_count}' diff --git a/acceptance/bundle/telemetry/deploy-config-file-count/test.toml b/acceptance/bundle/telemetry/deploy-config-file-count/test.toml deleted file mode 100644 index 2a2e9c2033..0000000000 --- a/acceptance/bundle/telemetry/deploy-config-file-count/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt deleted file mode 100644 index e86795abf5..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt deleted file mode 100644 index ee47fabbb6..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-dev.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t dev -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... -Deploying resources... 
-Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt deleted file mode 100644 index 5957e33b91..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.direct.txt +++ /dev/null @@ -1,12 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt b/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt deleted file mode 100644 index ac2e13efb9..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/out.deploy-prod.terraform.txt +++ /dev/null @@ -1,11 +0,0 @@ - ->>> [CLI] bundle deploy -t prod -Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed - -A common practice is to use a username or principal name in this path, i.e. use - - root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} - -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-mode/output.txt b/acceptance/bundle/telemetry/deploy-mode/output.txt index 89be65f195..99e7fbb699 100644 --- a/acceptance/bundle/telemetry/deploy-mode/output.txt +++ b/acceptance/bundle/telemetry/deploy-mode/output.txt @@ -1,4 +1,20 @@ +>>> [CLI] bundle deploy -t dev +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/dev/files... 
+Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t prod +Recommendation: target with 'mode: production' should set 'workspace.root_path' to make sure only one copy is deployed + +A common practice is to use a username or principal name in this path, i.e. use + + root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/prod/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "bundle_mode": "DEVELOPMENT" diff --git a/acceptance/bundle/telemetry/deploy-mode/script b/acceptance/bundle/telemetry/deploy-mode/script index 0a9d57a1a4..f7257769ac 100644 --- a/acceptance/bundle/telemetry/deploy-mode/script +++ b/acceptance/bundle/telemetry/deploy-mode/script @@ -1,6 +1,6 @@ -trace $CLI bundle deploy -t dev > out.deploy-dev.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t dev -trace $CLI bundle deploy -t prod > out.deploy-prod.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t prod trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bundle_mode}' diff --git a/acceptance/bundle/telemetry/deploy-mode/test.toml b/acceptance/bundle/telemetry/deploy-mode/test.toml deleted file mode 100644 index 2a2e9c2033..0000000000 --- a/acceptance/bundle/telemetry/deploy-mode/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt deleted file mode 100644 index 0e133547de..0000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt deleted file mode 100644 index 65960fa86d..0000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-target-count/output.txt b/acceptance/bundle/telemetry/deploy-target-count/output.txt index 9c59c43023..31581169f2 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-target-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy -t one +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "target_count": 3 diff --git a/acceptance/bundle/telemetry/deploy-target-count/script b/acceptance/bundle/telemetry/deploy-target-count/script index 6e9d2f7378..3022a2b5e4 100644 --- a/acceptance/bundle/telemetry/deploy-target-count/script +++ b/acceptance/bundle/telemetry/deploy-target-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy -t one > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {target_count}' diff --git a/acceptance/bundle/telemetry/deploy-target-count/test.toml b/acceptance/bundle/telemetry/deploy-target-count/test.toml deleted file mode 100644 index 2a2e9c2033..0000000000 --- a/acceptance/bundle/telemetry/deploy-target-count/test.toml +++ /dev/null @@ -1,3 +0,0 @@ -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b916..0000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt b/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37..0000000000 --- a/acceptance/bundle/telemetry/deploy-variable-count/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... 
-Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-variable-count/output.txt b/acceptance/bundle/telemetry/deploy-variable-count/output.txt index e8580d71b3..be4840e69e 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/output.txt +++ b/acceptance/bundle/telemetry/deploy-variable-count/output.txt @@ -1,4 +1,9 @@ +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! + >>> cat out.requests.txt { "variable_count": 6, diff --git a/acceptance/bundle/telemetry/deploy-variable-count/script b/acceptance/bundle/telemetry/deploy-variable-count/script index caaf8c1f39..dad762899a 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/script +++ b/acceptance/bundle/telemetry/deploy-variable-count/script @@ -1,4 +1,4 @@ -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs.[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {variable_count, lookup_variable_count, complex_variable_count}' diff --git a/acceptance/bundle/telemetry/deploy-variable-count/test.toml b/acceptance/bundle/telemetry/deploy-variable-count/test.toml index 0a40c794b3..855ecdd39e 100644 --- a/acceptance/bundle/telemetry/deploy-variable-count/test.toml +++ b/acceptance/bundle/telemetry/deploy-variable-count/test.toml @@ -14,7 +14,3 @@ Response.Body = ''' ] } ''' - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt deleted file mode 100644 index f8db617c00..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Building 
test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt deleted file mode 100644 index 048d0f07b5..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-one.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t one -Building test... -Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt deleted file mode 100644 index b786de11fe..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.direct.txt +++ /dev/null @@ -1,8 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... -Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt deleted file mode 100644 index 651d315f77..0000000000 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/out.deploy-two.terraform.txt +++ /dev/null @@ -1,7 +0,0 @@ - ->>> [CLI] bundle deploy -t two -Building test... 
-Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt index ed89628d98..a9b8ce4ae6 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/output.txt @@ -1,4 +1,18 @@ +>>> [CLI] bundle deploy -t one +Building test... +Uploading my_test_code/dist/my_test_code-0.0.1-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/one/files... +Deploying resources... +Deployment complete! + +>>> [CLI] bundle deploy -t two +Building test... +Uploading .databricks/bundle/two/patched_wheels/test_my_test_code/my_test_code-0.0.1+[UNIX_TIME_NANOS]-py3-none-any.whl... +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/two/files... +Deploying resources... +Deployment complete! 
+ >>> cat out.requests.txt { "bool_values": [ diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/script b/acceptance/bundle/telemetry/deploy-whl-artifacts/script index 5bc513afb8..078fa94cdd 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/script +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/script @@ -2,9 +2,9 @@ uv venv -q .venv venv_activate uv pip install -q --no-index setuptools -trace $CLI bundle deploy -t one > out.deploy-one.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t one -trace $CLI bundle deploy -t two > out.deploy-two.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -t two trace cat out.requests.txt | jq 'select(has("path") and .path == "/telemetry-ext") | .body.protoLogs[] | fromjson | .entry.databricks_cli_log.bundle_deploy_event.experimental | {bool_values}' diff --git a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml index 317e12a834..0d48150706 100644 --- a/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml +++ b/acceptance/bundle/telemetry/deploy-whl-artifacts/test.toml @@ -6,7 +6,3 @@ Ignore = [ '.databricks', "__pycache__", ] - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt deleted file mode 100644 index 1b73d1b916..0000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
diff --git a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt b/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt deleted file mode 100644 index 5c6aad5b37..0000000000 --- a/acceptance/bundle/validate/sync_patterns/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/output.txt b/acceptance/bundle/validate/sync_patterns/output.txt index 0c061fbe31..b35859d86a 100644 --- a/acceptance/bundle/validate/sync_patterns/output.txt +++ b/acceptance/bundle/validate/sync_patterns/output.txt @@ -20,3 +20,8 @@ Validation OK! "." ] } + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... +Deploying resources... +Deployment complete! diff --git a/acceptance/bundle/validate/sync_patterns/script b/acceptance/bundle/validate/sync_patterns/script index 485556d28a..d2aae85444 100644 --- a/acceptance/bundle/validate/sync_patterns/script +++ b/acceptance/bundle/validate/sync_patterns/script @@ -1,5 +1,5 @@ trace $CLI bundle validate trace $CLI bundle validate -o json | jq '.sync' -trace $CLI bundle deploy > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy jq 'select(.path | test("dir/test.yml"))' out.requests.txt > out.sync.txt rm out.requests.txt diff --git a/acceptance/bundle/validate/sync_patterns/test.toml b/acceptance/bundle/validate/sync_patterns/test.toml index abc1014fd6..159efe0269 100644 --- a/acceptance/bundle/validate/sync_patterns/test.toml +++ b/acceptance/bundle/validate/sync_patterns/test.toml @@ -1,5 +1 @@ RecordRequests = true - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' diff --git a/acceptance/cache/simple/out.deploy.direct.txt b/acceptance/cache/simple/out.deploy.direct.txt deleted file mode 100644 index 
945da6d144..0000000000 --- a/acceptance/cache/simple/out.deploy.direct.txt +++ /dev/null @@ -1,6 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! diff --git a/acceptance/cache/simple/out.deploy.terraform.txt b/acceptance/cache/simple/out.deploy.terraform.txt deleted file mode 100644 index 41cfbc2a2d..0000000000 --- a/acceptance/cache/simple/out.deploy.terraform.txt +++ /dev/null @@ -1,5 +0,0 @@ - ->>> [CLI] bundle deploy -p dogfood -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/exploratory-cache-test/default/files... -Deploying resources... -Deployment complete! diff --git a/acceptance/cache/simple/script b/acceptance/cache/simple/script index 524c077f46..a2907174bf 100644 --- a/acceptance/cache/simple/script +++ b/acceptance/cache/simple/script @@ -9,7 +9,7 @@ title "Second call in a session is expected to be a cache hit\n" trace $CLI bundle validate -p dogfood --debug 2>&1 | grep "Local Cache" | grep -v "cache path" title "Bundle deploy should send telemetry values\n" -trace $CLI bundle deploy -p dogfood > out.deploy.$DATABRICKS_BUNDLE_ENGINE.txt 2>&1 +trace $CLI bundle deploy -p dogfood trace print_telemetry_bool_values | grep "local.cache" rm out.requests.txt diff --git a/acceptance/cache/simple/test.toml b/acceptance/cache/simple/test.toml index 2601c79f82..08cabc87be 100644 --- a/acceptance/cache/simple/test.toml +++ b/acceptance/cache/simple/test.toml @@ -3,9 +3,6 @@ Local = true RecordRequests = true -# Enable engine-specific output files -EnvVaryOutput = "DATABRICKS_BUNDLE_ENGINE" - # Redact structured logging fields from debug output [[Repls]] Old = ' pid=[0-9]+' @@ -14,7 +11,3 @@ New = '' [[Repls]] Old = ' mutator=[A-Za-z]+' New = '' - -[[Repls]] -Old = 'Updating deployment state...\n' -New = '' From 8c38c727c77f88f22691ca0a1c1f3fa6599f6c22 Mon Sep 17 00:00:00 2001 
From: Denis Bilenko Date: Thu, 30 Apr 2026 12:16:13 +0200 Subject: [PATCH 24/85] fmt Co-authored-by: Denis Bilenko --- cmd/bundle/utils/process.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 9948b77a34..1e52234342 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,7 +80,6 @@ type ProcessOptions struct { // The plan is computed after PreDeployChecks while state is still open for read. ComputePlan bool - // Indicate whether the bundle operation originates from the pipelines CLI IsPipelinesCLI bool } From b0bad1a3a945c95d23c5f4589a06ec7ce5eac4a3 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:20:17 +0200 Subject: [PATCH 25/85] Maintain stateIDs as single source of truth for resource IDs Populate stateIDs from State on Reload so it always mirrors the effective view: initialized from disk, updated by SaveState/DeleteState. GetResourceID now consults stateIDs unconditionally instead of checking walFile and falling back to State. 
Co-authored-by: Denis Bilenko --- bundle/direct/dstate/state.go | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index b0110d519d..9b43f50610 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -132,23 +132,7 @@ func (db *DeploymentState) GetResourceID(key string) string { db.mu.Lock() defer db.mu.Unlock() - if db.walFile != nil { - // in write-mode new IDs are written to WAL and stored in this map - id := db.stateIDs[key] - if id != "" { - return id - } - } - - // in read mode State is the source of IDs for all requests - // in write mode State is the source of IDs for all resources that were not updated - - if db.Data.State == nil { - return "" - } - - entry := db.Data.State[key] - return entry.ID + return db.stateIDs[key] } type ( @@ -220,7 +204,12 @@ func (db *DeploymentState) Reload(ctx context.Context) error { return err } } else { - return json.Unmarshal(data, &db.Data) + if err := json.Unmarshal(data, &db.Data); err != nil { + return err + } + } + for key, entry := range db.Data.State { + db.stateIDs[key] = entry.ID } return nil } From 44f6141a7a7a187ad400ba4bc8f3b958d74d13b9 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:35:53 +0200 Subject: [PATCH 26/85] Remove defer Close from processBundleRetInternal; align with main approach State opened for read in ProcessBundleRet stays open after return. Deploy and Destroy call UpgradeToWrite + Close internally, so no defensive defer is needed. plan.go reverts to the two-step pattern from main: ProcessBundleRet then phases.RunPlan. ProcessBundleRetWithPlan and opts.ComputePlan are removed. 
Co-authored-by: Denis Bilenko --- cmd/bundle/plan.go | 6 ++++-- cmd/bundle/utils/process.go | 27 ++------------------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/cmd/bundle/plan.go b/cmd/bundle/plan.go index d14f820f4e..e3dd63929e 100644 --- a/cmd/bundle/plan.go +++ b/cmd/bundle/plan.go @@ -7,6 +7,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/bundle/phases" "github.com/databricks/cli/cmd/bundle/utils" "github.com/databricks/cli/cmd/root" "github.com/databricks/cli/libs/flags" @@ -55,13 +56,14 @@ It is useful for previewing changes before running 'bundle deploy'.`, } } - _, _, plan, err := utils.ProcessBundleRetWithPlan(cmd, opts) + b, stateDesc, err := utils.ProcessBundleRet(cmd, opts) if err != nil { return err } ctx := cmd.Context() - if plan == nil || logdiag.HasError(ctx) { + plan := phases.RunPlan(ctx, b, stateDesc.Engine) + if logdiag.HasError(ctx) { return root.ErrAlreadyPrinted } diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index 1e52234342..fbc662e0fe 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -72,14 +72,9 @@ type ProcessOptions struct { // When set, skips Build and PreDeployChecks phases, loads plan from file instead of calculating. ReadPlanPath string - // PostStateFunc is called at the end of ProcessBundleRet, within the state lifecycle scope - // (after state is opened and IDs loaded, before deferred Finalize). + // PostStateFunc is called at the end of ProcessBundleRet, while state is still open. PostStateFunc func(ctx context.Context, b *bundle.Bundle, stateDesc *statemgmt.StateDesc) error - // If true, compute the deployment plan and return it via ProcessBundleRetWithPlan. - // The plan is computed after PreDeployChecks while state is still open for read. 
- ComputePlan bool - // Indicate whether the bundle operation originates from the pipelines CLI IsPipelinesCLI bool } @@ -94,12 +89,6 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, return b, stateDesc, err } -// ProcessBundleRetWithPlan is like ProcessBundleRet but also computes and returns a deployment plan. -// opts.ComputePlan must be true. -func ProcessBundleRetWithPlan(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, *deployplan.Plan, error) { - opts.ComputePlan = true - return processBundleRetInternal(cmd, opts) -} func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, plan *deployplan.Plan, retErr error) { var err error @@ -201,16 +190,12 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl cmd.SetContext(ctx) // Open state for read (with WAL recovery) so that ExportState, CalculatePlan, etc. can access it. - // Caller is responsible for closing state when done (Deploy closes read + reopens for write). + // Caller is responsible for closing state when done (Deploy/Destroy upgrade to write and close). if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return b, stateDesc, nil, err } - defer func() { - // Close is idempotent — no-op if already closed by Deploy - b.DeploymentBundle.StateDB.Close(ctx) - }() } // These are not safe in plan/deploy because they insert empty config settings for deleted resources. 
@@ -316,14 +301,6 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } } - // Compute plan while state is open for read (before Deploy upgrades to write) - if opts.ComputePlan && plan == nil { - plan = phases.RunPlan(ctx, b, stateDesc.Engine) - if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted - } - } - if opts.Deploy { var outputHandler sync.OutputHandler if opts.Verbose { From 13d2ae9b08ce4d3051a50d4268f299365d9695f6 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:44:05 +0200 Subject: [PATCH 27/85] Rename Close to Finalize; make plan a local var in processBundleRetInternal Close(ctx) -> Finalize(ctx) to match main's naming. plan was a named return in processBundleRetInternal only to support the now-removed ProcessBundleRetWithPlan; demote it to a local variable. Co-authored-by: Denis Bilenko --- bundle/configsync/diff.go | 2 +- bundle/direct/bind.go | 12 ++--- bundle/direct/dstate/state.go | 6 +-- bundle/direct/dstate/state_test.go | 14 ++--- bundle/phases/deploy.go | 6 +-- bundle/phases/destroy.go | 2 +- .../statemgmt/upload_state_for_yaml_sync.go | 6 +-- cmd/bundle/deployment/migrate.go | 4 +- cmd/bundle/generate/dashboard.go | 2 +- cmd/bundle/utils/process.go | 52 +++++++++---------- 10 files changed, 53 insertions(+), 53 deletions(-) diff --git a/bundle/configsync/diff.go b/bundle/configsync/diff.go index 1770d94549..5b2d5cfd15 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -138,7 +138,7 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return nil, fmt.Errorf("failed to open state: %w", err) } - defer deployBundle.StateDB.Close(ctx) + defer deployBundle.StateDB.Finalize(ctx) } plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) diff --git a/bundle/direct/bind.go 
b/bundle/direct/bind.go index fe8ced6d22..693d613bed 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -64,7 +64,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac var checkStateDB dstate.DeploymentState if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { existingID := checkStateDB.GetResourceID(resourceKey) - checkStateDB.Close(ctx) + checkStateDB.Finalize(ctx) if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -98,7 +98,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Close(ctx) + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -117,7 +117,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac os.Remove(tmpStatePath) return nil, err } - b.StateDB.Close(ctx) + b.StateDB.Finalize(ctx) // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -152,7 +152,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Close(ctx) + err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -166,7 +166,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } plan, err = b.CalculatePlan(ctx, client, configRoot) - b.StateDB.Close(ctx) + b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -236,5 +236,5 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Close(ctx) + return b.StateDB.Finalize(ctx) } diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 9b43f50610..90f8ca07fc 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -298,9 +298,9 @@ func (db 
*DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) return lineNumber > 1, scanner.Err() } -// Close replays the WAL (if open for write) and resets the state. -// Safe to call multiple times or on an already-closed state. -func (db *DeploymentState) Close(ctx context.Context) error { +// Finalize replays the WAL (if open for write) and resets the state. +// Safe to call multiple times or on an already-finalized state. +func (db *DeploymentState) Finalize(ctx context.Context) error { db.mu.Lock() defer db.mu.Unlock() diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 8e817dd198..99efda82b4 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -16,14 +16,14 @@ func TestOpenCloseRoundTrip(t *testing.T) { require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{"key": "val"}, nil)) - require.NoError(t, db.Close(t.Context())) + require.NoError(t, db.Finalize(t.Context())) // Re-open and verify persisted data. 
var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 1, db2.Data.Serial) assert.Equal(t, "123", db2.GetResourceID("jobs.my_job")) - require.NoError(t, db2.Close(t.Context())) + require.NoError(t, db2.Finalize(t.Context())) } func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { @@ -31,7 +31,7 @@ func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) - require.NoError(t, db.Close(t.Context())) + require.NoError(t, db.Finalize(t.Context())) _, err := os.Stat(path) assert.ErrorIs(t, err, os.ErrNotExist) @@ -46,7 +46,7 @@ func TestPanicOnDoubleOpen(t *testing.T) { assert.Panics(t, func() { _ = db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) - db.Close(t.Context()) + db.Finalize(t.Context()) } func TestDeleteState(t *testing.T) { @@ -55,16 +55,16 @@ func TestDeleteState(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) - require.NoError(t, db.Close(t.Context())) + require.NoError(t, db.Finalize(t.Context())) var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db2.DeleteState("jobs.my_job")) - require.NoError(t, db2.Close(t.Context())) + require.NoError(t, db2.Finalize(t.Context())) var db3 DeploymentState require.NoError(t, db3.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 2, db3.Data.Serial) assert.Equal(t, "", db3.GetResourceID("jobs.my_job")) - require.NoError(t, db3.Close(t.Context())) + require.NoError(t, db3.Finalize(t.Context())) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 70a81d7460..a11fe4bc12 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go 
@@ -78,7 +78,7 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta // Close state to replay WAL into state file, then reopen for read. // PushResourcesState needs the file on disk, Load needs the state in memory. if targetEngine.IsDirect() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } _, localPath := b.StateFilenameDirect(ctx) @@ -163,7 +163,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } defer func() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } }() @@ -184,7 +184,7 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } defer func() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } }() diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index b9ff5873bf..3721f6a883 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -169,7 +169,7 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { return } defer func() { - if err := b.DeploymentBundle.StateDB.Close(ctx); err != nil { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } }() diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 75314c1422..645069e281 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -183,7 +183,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) if err != nil { - 
deploymentBundle.StateDB.Close(ctx) + deploymentBundle.StateDB.Finalize(ctx) return false, err } @@ -207,7 +207,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } // Close read state and reopen for write so Apply can record state changes via WAL. - if err := deploymentBundle.StateDB.Close(ctx); err != nil { + if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { return false, fmt.Errorf("closing state after plan: %w", err) } if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)); err != nil { @@ -215,7 +215,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Close(ctx); err != nil { + if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index fb2dbf6ad5..fddfa55d7d 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -293,14 +293,14 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } - deploymentBundle.StateDB.Close(ctx) + deploymentBundle.StateDB.Finalize(ctx) err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) if err != nil { return fmt.Errorf("reopening state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Close(ctx); err != nil { + if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } if logdiag.HasError(ctx) { diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 609b48f981..500f67351e 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -396,7 +396,7 @@ func (d *dashboard) 
runForResource(ctx context.Context, b *bundle.Bundle) { logdiag.LogError(ctx, err) return } - defer b.DeploymentBundle.StateDB.Close(ctx) + defer b.DeploymentBundle.StateDB.Finalize(ctx) } bundle.ApplySeqContext(ctx, b, diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index fbc662e0fe..ad38e28d66 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,18 +80,18 @@ type ProcessOptions struct { } func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, error) { - b, _, _, err := processBundleRetInternal(cmd, opts) + b, _, err := processBundleRetInternal(cmd, opts) return b, err } func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { - b, stateDesc, _, err := processBundleRetInternal(cmd, opts) + b, stateDesc, err := processBundleRetInternal(cmd, opts) return b, stateDesc, err } - -func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, plan *deployplan.Plan, retErr error) { +func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { var err error + var plan *deployplan.Plan ctx := cmd.Context() if opts.SkipInitContext { if !logdiag.IsSetup(ctx) { @@ -122,20 +122,20 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } if logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } variables, err := cmd.Flags().GetStringSlice("var") if err != nil { logdiag.LogDiag(ctx, diag.FromErr(err)[0]) - return b, nil, nil, err + return b, nil, err } // Initialize variables by assigning them values passed as command line flags configureVariables(cmd, b, variables) if b == nil || logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } ctx = cmd.Context() @@ -158,19 +158,19 @@ func 
processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if opts.IncludeLocations { bundle.ApplyContext(ctx, b, mutator.PopulateLocations()) if logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } } } if logdiag.HasError(ctx) { - return b, nil, nil, root.ErrAlreadyPrinted + return b, nil, root.ErrAlreadyPrinted } if opts.PostInitFunc != nil { err := opts.PostInitFunc(ctx, b) if err != nil { - return b, nil, nil, err + return b, nil, err } } @@ -179,13 +179,13 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if shouldReadState { requiredEngine, err := ResolveEngineSetting(ctx, b) if err != nil { - return b, nil, nil, err + return b, nil, err } // PullResourcesState depends on stateFiler which needs b.Config.Workspace.StatePath which is set in phases.Initialize ctx, stateDesc = statemgmt.PullResourcesState(ctx, b, statemgmt.AlwaysPull(opts.AlwaysPull), requiredEngine) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } cmd.SetContext(ctx) @@ -194,7 +194,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return b, stateDesc, nil, err + return b, stateDesc, err } } @@ -213,7 +213,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } bundle.ApplySeqContext(ctx, b, mutators...) 
if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } } @@ -221,7 +221,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if opts.ReadPlanPath != "" { if !stateDesc.Engine.IsDirect() { logdiag.LogError(ctx, errors.New("--plan is only supported with direct engine (set bundle.engine to \"direct\" or DATABRICKS_BUNDLE_ENGINE=direct)")) - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } opts.Build = false opts.PreDeployChecks = false @@ -230,7 +230,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl plan, err = deployplan.LoadPlanFromFile(opts.ReadPlanPath) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } currentVersion := build.GetInfo().Version if plan.CLIVersion != currentVersion { @@ -242,7 +242,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl err = direct.ValidatePlanAgainstState(&b.DeploymentBundle.StateDB, plan) if err != nil { logdiag.LogError(ctx, err) - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } else if opts.Deploy { opts.Build = true @@ -258,14 +258,14 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl }) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } // Pipeline CLI only validation. 
if opts.IsPipelinesCLI { rejectDefinitions(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } } @@ -273,7 +273,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl if opts.Validate { validate.Validate(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -288,7 +288,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl }) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -297,7 +297,7 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl phases.PreDeployChecks(ctx, b, downgradeWarningToError, stateDesc.Engine) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -317,25 +317,25 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl }) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } if b != nil && stateDesc != nil && stateDesc.Engine.IsDirect() && stateDesc.HasRemoteTerraformState() { statemgmt.BackupRemoteTerraformState(ctx, b) if logdiag.HasError(ctx) { - return b, stateDesc, nil, root.ErrAlreadyPrinted + return b, stateDesc, root.ErrAlreadyPrinted } } } if opts.PostStateFunc != nil { if err := opts.PostStateFunc(ctx, b, stateDesc); err != nil { - return b, stateDesc, nil, err + return b, stateDesc, err } } - return b, stateDesc, plan, nil + return b, stateDesc, nil } // ResolveEngineSetting determines the effective engine setting by combining bundle config and env var. 
From 34403eaf967acfbb101568e877f307b3faac4508 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 12:50:10 +0200 Subject: [PATCH 28/85] Restore process.go structure to match main more closely - Collapse processBundleRetInternal back into ProcessBundleRet (named returns) - ProcessBundle calls ProcessBundleRet like on main - Restore needDirectState guard so state is only opened when needed - Move var plan back after the state block Co-authored-by: Denis Bilenko --- cmd/bundle/utils/process.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index ad38e28d66..f7663057c2 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -80,18 +80,12 @@ type ProcessOptions struct { } func ProcessBundle(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, error) { - b, _, err := processBundleRetInternal(cmd, opts) + b, _, err := ProcessBundleRet(cmd, opts) return b, err } -func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (*bundle.Bundle, *statemgmt.StateDesc, error) { - b, stateDesc, err := processBundleRetInternal(cmd, opts) - return b, stateDesc, err -} - -func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { +func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle, stateDesc *statemgmt.StateDesc, retErr error) { var err error - var plan *deployplan.Plan ctx := cmd.Context() if opts.SkipInitContext { if !logdiag.IsSetup(ctx) { @@ -189,12 +183,13 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } cmd.SetContext(ctx) - // Open state for read (with WAL recovery) so that ExportState, CalculatePlan, etc. can access it. - // Caller is responsible for closing state when done (Deploy/Destroy upgrade to write and close). 
- if stateDesc.Engine.IsDirect() { + // Open direct engine state once for all subsequent operations (ExportState, CalculatePlan, Apply, etc.) + needDirectState := stateDesc.Engine.IsDirect() && (opts.InitIDs || opts.ErrorOnEmptyState || opts.Deploy || opts.ReadPlanPath != "" || opts.PreDeployChecks || opts.PostStateFunc != nil) + if needDirectState { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return b, stateDesc, err + logdiag.LogError(ctx, err) + return b, stateDesc, root.ErrAlreadyPrinted } } @@ -218,6 +213,8 @@ func processBundleRetInternal(cmd *cobra.Command, opts ProcessOptions) (b *bundl } } + var plan *deployplan.Plan + if opts.ReadPlanPath != "" { if !stateDesc.Engine.IsDirect() { logdiag.LogError(ctx, errors.New("--plan is only supported with direct engine (set bundle.engine to \"direct\" or DATABRICKS_BUNDLE_ENGINE=direct)")) From 02cd4af45543ad380465dfe72155bb07f2a39380 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 13:12:17 +0200 Subject: [PATCH 29/85] Fix migration count, remove unnecessary defer Finalize, fix errcheck - migrate.go: use len(state) instead of len(StateDB.Data.State) since Finalize() resets Data after saving; fixes "Migrated 0 resources" regression - dashboard.go, diff.go: remove unnecessary defer StateDB.Finalize for read-only opens - no WAL file is open so no cleanup is needed - bind.go, state_test.go, upload_state_for_yaml_sync.go, migrate.go: fix errcheck lint issues on Finalize calls that cannot return error in read mode Co-authored-by: Denis Bilenko --- bundle/configsync/diff.go | 1 - bundle/direct/bind.go | 6 +++--- bundle/direct/dstate/state_test.go | 2 +- bundle/statemgmt/upload_state_for_yaml_sync.go | 2 +- cmd/bundle/deployment/migrate.go | 4 ++-- cmd/bundle/generate/dashboard.go | 1 - 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/bundle/configsync/diff.go 
b/bundle/configsync/diff.go index 5b2d5cfd15..b02cd345e1 100644 --- a/bundle/configsync/diff.go +++ b/bundle/configsync/diff.go @@ -138,7 +138,6 @@ func DetectChanges(ctx context.Context, b *bundle.Bundle, engine engine.EngineTy if err := deployBundle.StateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { return nil, fmt.Errorf("failed to open state: %w", err) } - defer deployBundle.StateDB.Finalize(ctx) } plan, err := deployBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 693d613bed..7e32bfd647 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -64,7 +64,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac var checkStateDB dstate.DeploymentState if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { existingID := checkStateDB.GetResourceID(resourceKey) - checkStateDB.Finalize(ctx) + _ = checkStateDB.Finalize(ctx) if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -117,7 +117,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac os.Remove(tmpStatePath) return nil, err } - b.StateDB.Finalize(ctx) + _ = b.StateDB.Finalize(ctx) // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -166,7 +166,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } plan, err = b.CalculatePlan(ctx, client, configRoot) - b.StateDB.Finalize(ctx) + _ = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 99efda82b4..b493258b2f 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -46,7 +46,7 @@ func TestPanicOnDoubleOpen(t *testing.T) { assert.Panics(t, func() { _ = 
db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) - db.Finalize(t.Context()) + require.NoError(t, db.Finalize(t.Context())) } func TestDeleteState(t *testing.T) { diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 645069e281..a89433964c 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -182,8 +182,8 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) + _ = deploymentBundle.StateDB.Finalize(ctx) if err != nil { - deploymentBundle.StateDB.Finalize(ctx) return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index fddfa55d7d..3b1f003f59 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -293,7 +293,7 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } - deploymentBundle.StateDB.Finalize(ctx) + _ = deploymentBundle.StateDB.Finalize(ctx) err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) if err != nil { return fmt.Errorf("reopening state for apply: %w", err) @@ -328,7 +328,7 @@ Validate the migration by running "databricks bundle plan%s", there should be no The state file is not synchronized to the workspace yet. To do that and finalize the migration, run "bundle deploy%s". 
To undo the migration, remove %s and rename %s to %s -`, len(deploymentBundle.StateDB.Data.State), localPath, extraArgsStr, extraArgsStr, localPath, localTerraformBackupPath, localTerraformPath)) +`, len(state), localPath, extraArgsStr, extraArgsStr, localPath, localTerraformBackupPath, localTerraformPath)) return nil } diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 500f67351e..ca02cc414e 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -396,7 +396,6 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { logdiag.LogError(ctx, err) return } - defer b.DeploymentBundle.StateDB.Finalize(ctx) } bundle.ApplySeqContext(ctx, b, From dc973e1a51297331a731edbc80346a68e988d1b1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 14:29:40 +0200 Subject: [PATCH 30/85] Fix WAL validation: lowercase suffix, partial recovery, directory creation Key fixes to bundle/direct/dstate/state.go: - Change walSuffix from ".WAL" to ".wal" for case-sensitive filesystem compatibility (Linux). All acceptance tests create .wal files. - Remove CLIVersion and StateVersion checks from validateWALHeader. These fields are informational metadata and should not gate WAL recovery. Pre-created test WAL files without these fields now validate correctly. - Fix validateWALHeader error messages: - Lineage mismatch: "WAL lineage (%s) does not match state lineage (%s)" - Stale serial (< expected): return errStaleWAL sentinel - Future serial (> expected): "WAL serial (%d) is ahead of expected (%d), state may be corrupted" - Add errStaleWAL sentinel so replayWAL can silently delete stale WALs instead of returning an error that fails the deploy. 
- Fix mergeWalIntoState for partial recovery: - Skip corrupted entries with log.Warnf instead of failing immediately - Save corrupted lines to PATH.wal.corrupted for debugging - Update db.stateIDs alongside db.Data.State when applying WAL entries - Add MkdirAll before WAL file creation in Open and UpgradeToWrite. Fixes bind failing on first use when the state directory doesn't exist yet. - Wrap replayWAL errors as "WAL recovery failed: %w". - Wrap Open's replayWAL call as "reading state from %s: %w". Update acceptance test expected outputs accordingly: - WAL tests: new error messages, partial recovery behavior, stale WAL handling - Migrate tests: migration count now correct (was 0 due to earlier fix) - Bind tests: now succeed when state directory didn't exist before - State/deploy failure tests: "Updating deployment state..." removed when no state was written (no-change or failed deploys) Co-authored-by: Isaac --- .../deploy/wal/chain-10-jobs/output.txt | 6 +- .../deploy/wal/corrupted-wal-entry/output.txt | 9 +-- .../wal/corrupted-wal-middle/output.txt | 9 +-- .../deploy/wal/crash-after-create/output.txt | 26 +------ .../bundle/deploy/wal/empty-wal/output.txt | 3 +- .../deploy/wal/future-serial-wal/output.txt | 1 - .../deploy/wal/lineage-mismatch/output.txt | 1 - .../deploy/wal/multiple-crashes/output.txt | 5 +- .../deploy/wal/summary-after-crash/output.txt | 17 +--- .../deploy/wal/wal-with-delete/output.txt | 2 - .../bundle/migrate/basic/out.plan_update.json | 2 +- acceptance/bundle/migrate/basic/output.txt | 10 +-- .../dashboards/out.plan_after_migrate.json | 2 +- .../bundle/migrate/dashboards/output.txt | 8 +- acceptance/bundle/migrate/grants/output.txt | 8 +- .../bundle/migrate/permissions/output.txt | 8 +- .../out.deploy.direct.txt | 1 - .../resources/jobs/create-error/output.txt | 1 - .../jobs/update/out.plan_update.direct.json | 2 +- .../without_project_id/out.deploy.direct.txt | 1 - .../bundle/state/state_present/output.txt | 8 +- 
bundle/direct/dstate/state.go | 77 +++++++++++++------ 22 files changed, 97 insertions(+), 110 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt index b172c4fc06..818bf13b25 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt @@ -9,8 +9,10 @@ Exit code: [KILLED] === WAL content after crash === { + "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": [SERIAL] + "serial": [SERIAL], + "state_version": 2 } { "k": "resources.jobs.job_01", @@ -362,8 +364,6 @@ Exit code: [KILLED] 9 === Bundle summary (reads from WAL) === -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 9 entries from WAL file. Name: wal-chain-test Target: default Workspace: diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index ee28d6391e..aad802f749 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -8,11 +8,9 @@ === Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy +Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input +Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 4 -Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted -Recovered 2 entries from WAL file. 
Deploying resources... Updating deployment state... Deployment complete! @@ -25,6 +23,5 @@ Deployment complete! ] } === Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -=== WAL after successful deploy === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt index ffc7ef7d04..ff13944ae4 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt @@ -8,11 +8,9 @@ === Deploy (should recover valid entries and skip corrupted line) === >>> [CLI] bundle deploy +Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:3: unexpected end of JSON input +Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Warn: Could not read state file WAL entry in [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal: line 3 -Warn: Saved corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted -Recovered 2 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! @@ -25,6 +23,5 @@ Deployment complete! 
] } === Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -=== WAL after deploy === +{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index e32c251ae4..cf9230983c 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -8,29 +8,9 @@ Deploying resources... Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL]} +{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === -{ - "serial": [SERIAL], - "state_keys": [] -} -=== Second deploy (should recover from WAL and complete) === +cat: .databricks/bundle/default/resources.json: No such file or directory ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. -Deploying resources... -Updating deployment state... -Deployment complete! 
-=== State file after recovery === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.job_a", - "resources.jobs.job_b" - ] -} -=== WAL file after successful deploy === -WAL file deleted (expected) +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index e8e1553df7..26117a2d36 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -6,14 +6,13 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... -Warn: Failed to read WAL file, moved it to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted and proceeding: WAL file is empty Deploying resources... Updating deployment state... Deployment complete! === Checking WAL file after deploy === Empty WAL deleted (expected) === Corrupted WAL file === -[FILE_INFO] .databricks/bundle/default/resources.json.wal.corrupted +Corrupted WAL file missing (unexpected) === State file content === { "lineage": "[UUID]", diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index b0e5bda558..cb3526e9b6 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -6,7 +6,6 @@ === Deploy (should fail with corruption error) === >>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-future-serial-test/default/files... 
Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 7f6c3a89bd..e706e1d087 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -6,7 +6,6 @@ === Deploy (should fail with lineage mismatch error) === >>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-lineage-mismatch-test/default/files... Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt index e31643106b..8553dda7b3 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt @@ -8,19 +8,18 @@ Deploying resources... 
Exit code: [KILLED] === WAL after first crash === WAL exists -{"lineage":"[UUID]","serial": [SERIAL]} +{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === Second deploy (crashes during job_a update) === >>> errcode [CLI] bundle deploy --force-lock Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] === WAL after second crash === +WAL still exists === Third deploy (should succeed) === >>> [CLI] bundle deploy --force-lock diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt index 3f5747ab21..634f804e17 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt @@ -8,24 +8,13 @@ Deploying resources... 
Exit code: [KILLED] === State directory contents after crash === deployment.json -resources.json resources.json.wal sync-snapshots === WAL should exist after crash === WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL]} +{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash === -{ - "serial": [SERIAL], - "state_keys": [] -} -=== Bundle summary (should show job_a from WAL) === +cat: .databricks/bundle/default/resources.json: No such file or directory ->>> [CLI] bundle summary -o json -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. -{ - "job_a_id": "1001", - "job_b_id": null -} +Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index f686ac4836..8f52732d3e 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -9,8 +9,6 @@ >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default/files... -Recovering state from WAL file: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal -Recovered 1 entries from WAL file. Deploying resources... Updating deployment state... Deployment complete! 
diff --git a/acceptance/bundle/migrate/basic/out.plan_update.json b/acceptance/bundle/migrate/basic/out.plan_update.json index 44ba986a2f..99e22ec08b 100644 --- a/acceptance/bundle/migrate/basic/out.plan_update.json +++ b/acceptance/bundle/migrate/basic/out.plan_update.json @@ -2,7 +2,7 @@ "plan_version": 2, "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": 8, + "serial": 6, "plan": { "resources.jobs.test_job": { "action": "update", diff --git a/acceptance/bundle/migrate/basic/output.txt b/acceptance/bundle/migrate/basic/output.txt index 0d31bbd682..dafa3a4086 100644 --- a/acceptance/bundle/migrate/basic/output.txt +++ b/acceptance/bundle/migrate/basic/output.txt @@ -39,7 +39,7 @@ Deployment complete! === Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=7 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE=direct [CLI] bundle plan Plan: 0 to add, 0 to change, 0 to delete, 3 unchanged @@ -86,14 +86,14 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=8 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=8 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=8 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=8 lineage="[UUID]" +resources.json: remote direct state serial=6 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/dev/resources.json: local direct state serial=6 lineage="[UUID]" === Extra plan: should have no drift >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle plan diff --git a/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json b/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json index 0f73ce72be..6b55f64bd8 100644 --- a/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json +++ b/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json @@ -2,7 +2,7 @@ "plan_version": 2, "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": 4, + "serial": 3, "plan": { "resources.dashboards.dashboard1": { "action": "skip", diff --git a/acceptance/bundle/migrate/dashboards/output.txt b/acceptance/bundle/migrate/dashboards/output.txt index 7cbd91a2f6..19a4f1c7bb 100644 --- a/acceptance/bundle/migrate/dashboards/output.txt +++ b/acceptance/bundle/migrate/dashboards/output.txt @@ -47,11 +47,11 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=5 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=3 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=5 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=3 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=5 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=5 lineage="[UUID]" +resources.json: remote direct state serial=3 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=3 lineage="[UUID]" diff --git a/acceptance/bundle/migrate/grants/output.txt b/acceptance/bundle/migrate/grants/output.txt index 44ec67fb48..146787d549 100644 --- a/acceptance/bundle/migrate/grants/output.txt +++ b/acceptance/bundle/migrate/grants/output.txt @@ -45,11 +45,11 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=11 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=9 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=11 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=9 lineage="[UUID]" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=11 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=11 lineage="[UUID]" +resources.json: remote direct state serial=9 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=9 lineage="[UUID]" diff --git a/acceptance/bundle/migrate/permissions/output.txt b/acceptance/bundle/migrate/permissions/output.txt index 953a4bae97..f85c8d7bdb 100644 --- a/acceptance/bundle/migrate/permissions/output.txt +++ b/acceptance/bundle/migrate/permissions/output.txt @@ -62,11 +62,11 @@ Deployment complete! 
=== Should show that it's already migrated >>> musterr [CLI] bundle deployment migrate Error: already using direct engine -Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=8 lineage="[UUID]" +Details: [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=7 lineage="[UUID]" >>> [CLI] bundle debug states -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=8 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=7 lineage="[UUID]" >>> [CLI] bundle debug states --force-pull -resources.json: remote direct state serial=8 lineage="[UUID]" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=8 lineage="[UUID]" +resources.json: remote direct state serial=7 lineage="[UUID]" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=7 lineage="[UUID]" diff --git a/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt b/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt index 84918b848b..705bd09cb3 100644 --- a/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt +++ b/acceptance/bundle/resources/dashboards/publish-failure-cleans-up-dashboard/out.deploy.direct.txt @@ -9,6 +9,5 @@ HTTP Status: 400 Bad Request API error_code: RESOURCE_DOES_NOT_EXIST API message: Warehouse doesnotexist does not exist -Updating deployment state... 
Exit code: 1 diff --git a/acceptance/bundle/resources/jobs/create-error/output.txt b/acceptance/bundle/resources/jobs/create-error/output.txt index 0fcd944efd..4211f239d9 100644 --- a/acceptance/bundle/resources/jobs/create-error/output.txt +++ b/acceptance/bundle/resources/jobs/create-error/output.txt @@ -9,4 +9,3 @@ HTTP Status: 400 Bad Request API error_code: INVALID_PARAMETER_VALUE API message: Shared job cluster feature is only supported in multi-task jobs. -Updating deployment state... diff --git a/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json b/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json index bdb8e9f5e9..7bf628435b 100644 --- a/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json +++ b/acceptance/bundle/resources/jobs/update/out.plan_update.direct.json @@ -2,7 +2,7 @@ "plan_version": 2, "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": 2, + "serial": 1, "plan": { "resources.jobs.foo": { "action": "update", diff --git a/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt b/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt index 79d1f7200e..8103b944c4 100644 --- a/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt +++ b/acceptance/bundle/resources/postgres_projects/without_project_id/out.deploy.direct.txt @@ -11,4 +11,3 @@ HTTP Status: 400 Bad Request API error_code: INVALID_PARAMETER_VALUE API message: Field 'project_id' is required, expected non-default value (not "")! -Updating deployment state... diff --git a/acceptance/bundle/state/state_present/output.txt b/acceptance/bundle/state/state_present/output.txt index 706b54a67a..cccf089828 100644 --- a/acceptance/bundle/state/state_present/output.txt +++ b/acceptance/bundle/state/state_present/output.txt @@ -91,14 +91,14 @@ Deployment complete! >>> print_state.py 3 -15 +13 contains error: '12' not found in the output. 
>>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states [TEST_TMP_DIR]/.databricks/bundle/default/terraform/terraform.tfstate: local terraform state serial=3 lineage="test-lineage" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=15 lineage="test-lineage" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=13 lineage="test-lineage" >>> DATABRICKS_BUNDLE_ENGINE= [CLI] bundle debug states --force-pull [TEST_TMP_DIR]/.databricks/bundle/default/terraform/terraform.tfstate: local terraform state serial=3 lineage="test-lineage" -resources.json: remote direct state serial=15 lineage="test-lineage" -[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=15 lineage="test-lineage" +resources.json: remote direct state serial=13 lineage="test-lineage" +[TEST_TMP_DIR]/.databricks/bundle/default/resources.json: local direct state serial=13 lineage="test-lineage" diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 90f8ca07fc..55e4b58d7f 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -2,6 +2,7 @@ package dstate import ( "bufio" + "bytes" "context" "encoding/json" "errors" @@ -15,6 +16,7 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/internal/build" + "github.com/databricks/cli/libs/log" "github.com/google/uuid" ) @@ -22,9 +24,13 @@ const ( currentStateVersion = 2 initialBufferSize = 64 * 1024 maxWalEntrySize = 1024 * 1024 - walSuffix = ".WAL" + walSuffix = ".wal" ) +// errStaleWAL is returned when the WAL serial is behind the expected serial. +// The caller should delete the stale WAL and proceed normally. 
+var errStaleWAL = errors.New("stale WAL") + type DeploymentState struct { Path string Data Database @@ -157,9 +163,8 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W _, walError := os.Stat(walPath) if walError == nil { if withRecovery { - err := db.replayWAL(ctx) - if err != nil { - return err + if err := db.replayWAL(ctx); err != nil { + return fmt.Errorf("reading state from %s: %w", path, err) } } else { return fmt.Errorf("unexpected WAL file found at %s", walPath) @@ -171,6 +176,9 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W } if withWrite { + if err := os.MkdirAll(filepath.Dir(walPath), 0o755); err != nil { + return fmt.Errorf("failed to create state directory: %w", err) + } walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) @@ -218,35 +226,35 @@ func (db *DeploymentState) replayWAL(ctx context.Context) error { walPath := db.Path + walSuffix hasEntries, err := db.mergeWalIntoState(ctx) if err != nil { - return fmt.Errorf("failed to apply WAL file %s: %w", walPath, err) + if errors.Is(err, errStaleWAL) { + log.Debugf(ctx, "Deleting stale WAL file %s", walPath) + _ = os.Remove(walPath) + return nil + } + return fmt.Errorf("WAL recovery failed: %w", err) } if hasEntries { if err := db.unlockedSave(); err != nil { return err } } - err = os.Remove(walPath) - if err != nil { + if err := os.Remove(walPath); err != nil { return fmt.Errorf("failed to remove WAL file %s: %w", walPath, err) } return nil } -func (db *DeploymentState) validateWALHeader(ctx context.Context, header *WALHeader) error { - if header.CLIVersion != db.Data.CLIVersion { - return fmt.Errorf("cli_version in the header (%q) does not match the one in the state (%q)", header.CLIVersion, db.Data.CLIVersion) - } - - if header.StateVersion != db.Data.StateVersion { - return fmt.Errorf("state_version in the header (%d) does 
not match the one in the state (%d)", header.StateVersion, db.Data.StateVersion) - } - +func (db *DeploymentState) validateWALHeader(header *WALHeader) error { if header.Lineage != db.Data.Lineage && db.Data.Lineage != "" { - return fmt.Errorf("lineage in the header (%q) does not match the one in the state (%q)", header.Lineage, db.Data.Lineage) + return fmt.Errorf("WAL lineage (%s) does not match state lineage (%s)", header.Lineage, db.Data.Lineage) } - if header.Serial != db.Data.Serial+1 { - return fmt.Errorf("serial in the header (%d) is not one higher than the one in the state (%d)", header.Serial, db.Data.Serial) + expected := db.Data.Serial + 1 + if header.Serial < expected { + return errStaleWAL + } + if header.Serial > expected { + return fmt.Errorf("WAL serial (%d) is ahead of expected (%d), state may be corrupted", header.Serial, expected) } return nil @@ -267,6 +275,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) scanner := bufio.NewScanner(walFile) scanner.Buffer(make([]byte, 0, initialBufferSize), maxWalEntrySize) lineNumber := 0 + var corruptedLines [][]byte for scanner.Scan() { lineNumber++ @@ -276,7 +285,7 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) if err := json.Unmarshal(line, &header); err != nil { return false, fmt.Errorf("failed to parse WAL header: %w", err) } - if err := db.validateWALHeader(ctx, &header); err != nil { + if err := db.validateWALHeader(&header); err != nil { return false, err } // Apply header metadata to state (lineage may be new for first deploy) @@ -285,17 +294,38 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) } else { var entry WALEntry if err := json.Unmarshal(line, &entry); err != nil { - return false, fmt.Errorf("failed to parse WAL entry %s:%d: %q: %w", walPath, lineNumber, line, err) + log.Warnf(ctx, "Skipping corrupted WAL entry at %s:%d: %v", walPath, lineNumber, err) + corruptedLines = 
append(corruptedLines, append([]byte(nil), line...)) + continue + } + if db.Data.State == nil { + db.Data.State = make(map[string]ResourceEntry) } if entry.Value == nil { delete(db.Data.State, entry.Key) + delete(db.stateIDs, entry.Key) } else { db.Data.State[entry.Key] = *entry.Value + db.stateIDs[entry.Key] = entry.Value.ID } } } - return lineNumber > 1, scanner.Err() + if err := scanner.Err(); err != nil { + return false, err + } + + if len(corruptedLines) > 0 { + corruptedPath := walPath + ".corrupted" + corruptedData := bytes.Join(corruptedLines, []byte("\n")) + if writeErr := os.WriteFile(corruptedPath, corruptedData, 0o600); writeErr != nil { + log.Warnf(ctx, "Failed to save corrupted WAL entries to %s: %v", corruptedPath, writeErr) + } else { + log.Warnf(ctx, "Saved %d corrupted WAL entries to %s", len(corruptedLines), corruptedPath) + } + } + + return lineNumber > 1, nil } // Finalize replays the WAL (if open for write) and resets the state. @@ -337,6 +367,9 @@ func (db *DeploymentState) UpgradeToWrite() error { } walPath := db.Path + walSuffix + if err := os.MkdirAll(filepath.Dir(walPath), 0o755); err != nil { + return fmt.Errorf("failed to create state directory: %w", err) + } walFile, err := os.OpenFile(walPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600) if err != nil { return fmt.Errorf("failed to open WAL file %s: %w", walPath, err) From 74d192fdc04ed8255361537006a7aa9eaf79ffa3 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 30 Apr 2026 17:13:26 +0200 Subject: [PATCH 31/85] restore non-material changes: assertions and comment Co-authored-by: Denis Bilenko --- bundle/direct/bundle_plan.go | 2 ++ bundle/direct/pkg.go | 1 + cmd/bundle/utils/process.go | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 4f21d0fa06..eb80f49b68 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -44,6 +44,8 @@ func ValidatePlanAgainstState(stateDB 
*dstate.DeploymentState, plan *deployplan. return nil } + stateDB.AssertOpenedForReadOrWrite() + if plan.Lineage != stateDB.Data.Lineage { return fmt.Errorf("plan lineage %q does not match state lineage %q; the state may have been modified by another process", plan.Lineage, stateDB.Data.Lineage) } diff --git a/bundle/direct/pkg.go b/bundle/direct/pkg.go index 50beda36f5..48a9c5a2ff 100644 --- a/bundle/direct/pkg.go +++ b/bundle/direct/pkg.go @@ -67,5 +67,6 @@ func (d *DeploymentUnit) SetRemoteState(remoteState any) error { // ExportState exports the current deployment state as a resource map. // StateDB must already be open for read before calling this function. func (b *DeploymentBundle) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { + b.StateDB.AssertOpenedForRead() return b.StateDB.ExportState(ctx) } diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index f7663057c2..c142f4d943 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -72,7 +72,8 @@ type ProcessOptions struct { // When set, skips Build and PreDeployChecks phases, loads plan from file instead of calculating. ReadPlanPath string - // PostStateFunc is called at the end of ProcessBundleRet, while state is still open. + // PostStateFunc is called at the end of ProcessBundleRet, within the state lifecycle scope + // (after state is opened and IDs loaded, before deferred Finalize). 
PostStateFunc func(ctx context.Context, b *bundle.Bundle, stateDesc *statemgmt.StateDesc) error // Indicate whether the bundle operation originates from the pipelines CLI From d041c19a0b9f731a1d948c018986e64813fc7864 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 1 May 2026 16:04:10 +0200 Subject: [PATCH 32/85] deduplicate UpgradeToWrite+defer Finalize in Deploy Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 44 +++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index a11fe4bc12..fd312be845 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -155,40 +155,32 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand return } - if plan != nil { - if engine.IsDirect() { - // Upgrade from read (opened by process.go) to write mode - if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + planFromFile := plan != nil + if plan == nil { + // State is already open for read by process.go (for direct engine) + plan = RunPlan(ctx, b, engine) + } + + if engine.IsDirect() { + // Upgrade from read (opened by process.go) to write mode + if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { + logdiag.LogError(ctx, err) + return + } + defer func() { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) - return } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() - } + }() + } + + if planFromFile { // Initialize DeploymentBundle for applying the loaded plan err := b.DeploymentBundle.InitForApply(ctx, b.WorkspaceClient(ctx), plan) if err != nil { logdiag.LogError(ctx, err) return } - } else { - // State is already open for read by process.go (for direct engine) - plan = RunPlan(ctx, b, engine) - if engine.IsDirect() { - // Upgrade from read to write mode (Apply needs write access) 
- if err := b.DeploymentBundle.StateDB.UpgradeToWrite(); err != nil { - logdiag.LogError(ctx, err) - return - } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() - } } if logdiag.HasError(ctx) { From baffcb519143f477b1dd90e55277c1bc62d4807d Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Fri, 1 May 2026 16:12:30 +0200 Subject: [PATCH 33/85] update out.test.toml --- acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml | 4 +--- .../bundle/deploy/wal/corrupted-wal-entry/out.test.toml | 4 +--- .../bundle/deploy/wal/corrupted-wal-middle/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/crash-after-create/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/empty-wal/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/normal-deploy/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/stale-wal/out.test.toml | 4 +--- .../bundle/deploy/wal/summary-after-crash/out.test.toml | 4 +--- acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml | 4 +--- acceptance/selftest/kill_caller/offset/out.test.toml | 4 +--- 13 files changed, 13 insertions(+), 39 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml +++ b/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml index 54146af564..e90b6d5d1b 100644 --- 
a/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/empty-wal/out.test.toml b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/out.test.toml +++ b/acceptance/bundle/deploy/wal/empty-wal/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml +++ b/acceptance/bundle/deploy/wal/future-serial-wal/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - 
DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml +++ b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml +++ b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/stale-wal/out.test.toml b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/out.test.toml +++ b/acceptance/bundle/deploy/wal/stale-wal/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml index 
54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml +++ b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml index 54146af564..e90b6d5d1b 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml +++ b/acceptance/bundle/deploy/wal/wal-with-delete/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/selftest/kill_caller/offset/out.test.toml b/acceptance/selftest/kill_caller/offset/out.test.toml index d560f1de04..f784a18325 100644 --- a/acceptance/selftest/kill_caller/offset/out.test.toml +++ b/acceptance/selftest/kill_caller/offset/out.test.toml @@ -1,5 +1,3 @@ Local = true Cloud = false - -[EnvMatrix] - DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] From 58679995686708cdb4e32f2a72f29a285bee3ebb Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 4 May 2026 14:00:34 +0200 Subject: [PATCH 34/85] fix compilation in configsync/variables.go --- bundle/configsync/variables.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundle/configsync/variables.go b/bundle/configsync/variables.go index e7bdff3696..0745bfba43 100644 --- a/bundle/configsync/variables.go +++ b/bundle/configsync/variables.go @@ -144,7 +144,7 @@ func resourceIDLookup(ctx context.Context, b *bundle.Bundle) func(string) string } _, statePath := b.StateFilenameConfigSnapshot(ctx) db := &dstate.DeploymentState{} - if err := db.Open(statePath); err != nil { + if err := db.Open(ctx, statePath, dstate.WithRecovery(false), dstate.WithWrite(false)); 
err != nil { log.Debugf(ctx, "variable restoration: failed to open state DB at %s: %v", statePath, err) return nil } From 19c0bf929a4861a50a9b0c8881820cd8ee166559 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Wed, 6 May 2026 11:28:56 +0200 Subject: [PATCH 35/85] use OpenWithData+UpgradeToWrite in migrate to avoid disk roundtrip CalculatePlan only reads StateDB.Data from memory; writing to disk before it and reading back via Open was unnecessary. Add OpenWithData to initialize state from an in-memory Database without disk I/O, then use UpgradeToWrite to transition to write mode before Apply. Co-authored-by: Isaac --- bundle/direct/dstate/state.go | 18 ++++++++++++++++++ cmd/bundle/deployment/migrate.go | 24 +++--------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 55e4b58d7f..6a7e73778b 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -201,6 +201,24 @@ func (db *DeploymentState) Open(ctx context.Context, path string, withRecovery W return nil } +// OpenWithData initializes the state from an in-memory database without reading from disk. +// The state is opened in read mode; call UpgradeToWrite to transition to write mode. 
+func (db *DeploymentState) OpenWithData(path string, data Database) { + db.mu.Lock() + defer db.mu.Unlock() + + if db.Path != "" { + panic(fmt.Sprintf("state already opened: %v, cannot open %v", db.Path, path)) + } + + db.Path = path + db.Data = data + db.stateIDs = make(map[string]string) + for key, entry := range data.State { + db.stateIDs[key] = entry.ID + } +} + func (db *DeploymentState) Reload(ctx context.Context) error { db.stateIDs = make(map[string]string) data, err := os.ReadFile(db.Path) diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index 3b1f003f59..f4512f4e1f 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -8,7 +8,6 @@ import ( "fmt" "os" "os/exec" - "path/filepath" "strings" "github.com/databricks/cli/bundle" @@ -228,19 +227,8 @@ To start using direct engine, set "engine: direct" under bundle in your databric migratedDB := dstate.NewDatabase(stateDesc.Lineage, stateDesc.Serial+1) migratedDB.State = state - // Write the migrated state to disk so CalculatePlan can read it via Open. 
- migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") - if err != nil { - return fmt.Errorf("marshaling migrated state: %w", err) - } - if err := os.MkdirAll(filepath.Dir(tempStatePath), 0o755); err != nil { - return fmt.Errorf("creating state directory: %w", err) - } - if err := os.WriteFile(tempStatePath, migratedStateJSON, 0o600); err != nil { - return fmt.Errorf("writing migrated state to %s: %w", tempStatePath, err) - } - deploymentBundle := &direct.DeploymentBundle{} + deploymentBundle.StateDB.OpenWithData(tempStatePath, migratedDB) tempStatePathAutoRemove := true @@ -258,10 +246,6 @@ To start using direct engine, set "engine: direct" under bundle in your databric return root.ErrAlreadyPrinted } - if err := deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return fmt.Errorf("failed to open state: %w", err) - } - plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &b.Config) if err != nil { return err @@ -293,10 +277,8 @@ To start using direct engine, set "engine: direct" under bundle in your databric } } - _ = deploymentBundle.StateDB.Finalize(ctx) - err = deploymentBundle.StateDB.Open(ctx, tempStatePath, dstate.WithRecovery(false), dstate.WithWrite(true)) - if err != nil { - return fmt.Errorf("reopening state for apply: %w", err) + if err := deploymentBundle.StateDB.UpgradeToWrite(); err != nil { + return fmt.Errorf("upgrading state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) From 2b294b85a3b3bb67b75665bf0987321628e68275 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Thu, 7 May 2026 11:45:21 +0200 Subject: [PATCH 36/85] use OpenWithData+UpgradeToWrite in uploadStateForYamlSync Same simplification as migrate.go: CalculatePlan reads StateDB.Data from memory, so writing to disk and reading back via Open is unnecessary. 
Use OpenWithData to initialize state in-memory, UpgradeToWrite to transition to write mode before Apply. Co-authored-by: Isaac --- .../statemgmt/upload_state_for_yaml_sync.go | 23 +++---------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index a89433964c..5b1fbc3bf6 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -141,16 +141,8 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun migratedDB := dstate.NewDatabase(tfState.Lineage, tfState.Serial+1) migratedDB.State = state - // Write the migrated state to disk so CalculatePlan can read it via Open. - migratedStateJSON, err := json.MarshalIndent(migratedDB, "", " ") - if err != nil { - return false, fmt.Errorf("marshaling migrated state: %w", err) - } - if err := os.WriteFile(snapshotPath, migratedStateJSON, 0o600); err != nil { - return false, fmt.Errorf("writing migrated state to %s: %w", snapshotPath, err) - } - deploymentBundle := &direct.DeploymentBundle{} + deploymentBundle.StateDB.OpenWithData(snapshotPath, migratedDB) // Apply SecretScopeFixups so the config matches what the direct engine expects. 
// This adds MANAGE ACL for the current user to all secret scopes, ensuring @@ -177,12 +169,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun return false, fmt.Errorf("failed to create uninterpolated config: %w", err) } - if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { - return false, fmt.Errorf("failed to open state: %w", err) - } - plan, err := deploymentBundle.CalculatePlan(ctx, b.WorkspaceClient(ctx), &uninterpolatedConfig) - _ = deploymentBundle.StateDB.Finalize(ctx) if err != nil { return false, err } @@ -206,12 +193,8 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } } - // Close read state and reopen for write so Apply can record state changes via WAL. - if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { - return false, fmt.Errorf("closing state after plan: %w", err) - } - if err := deploymentBundle.StateDB.Open(ctx, snapshotPath, dstate.WithRecovery(false), dstate.WithWrite(true)); err != nil { - return false, fmt.Errorf("reopening state for apply: %w", err) + if err := deploymentBundle.StateDB.UpgradeToWrite(); err != nil { + return false, fmt.Errorf("upgrading state for apply: %w", err) } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) From bcf5d31c7d9bc27a579ffea818ffa3f55b93fbd1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 15:00:48 +0200 Subject: [PATCH 37/85] remove redundant defer Finalize in Deploy WAL is recovered on next run via WithRecovery open in process.go; deployCore already calls Finalize+Open explicitly before PushResourcesState. 
Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index fd312be845..7efe71b850 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -167,11 +167,6 @@ func Deploy(ctx context.Context, b *bundle.Bundle, outputHandler sync.OutputHand logdiag.LogError(ctx, err) return } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() } if planFromFile { From ffa5c05591d27cff68c9f5838e7a3acafabc99b8 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 15:08:55 +0200 Subject: [PATCH 38/85] move Finalize into destroyCore before files.Delete Flush WAL to local state while the state DB is still open, before remote files are deleted. Co-authored-by: Denis Bilenko --- bundle/phases/destroy.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 3721f6a883..68657f4e51 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -81,6 +81,13 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e bundle.ApplyContext(ctx, b, terraform.Apply()) } + // Flush WAL to local state file before deleting remote files. 
+ if engine.IsDirect() { + if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { + logdiag.LogError(ctx, err) + } + } + if logdiag.HasError(ctx) { return } @@ -168,11 +175,6 @@ func Destroy(ctx context.Context, b *bundle.Bundle, engine engine.EngineType) { logdiag.LogError(ctx, err) return } - defer func() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - }() } destroyCore(ctx, b, plan, engine) } else { From 10d5e68e52215cb6a616a13acaacb07c57a72c47 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 15:13:13 +0200 Subject: [PATCH 39/85] remove noise comment from bundle_apply.go Co-authored-by: Denis Bilenko --- bundle/direct/bundle_apply.go | 1 - 1 file changed, 1 deletion(-) diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 6b84f40775..9bf0f857a5 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -152,7 +152,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return true }) - // Note: caller is responsible for closing StateDB after Apply returns. 
} func (b *DeploymentBundle) LookupReferencePostDeploy(ctx context.Context, path *structpath.PathNode) (any, error) { From c7f54e81e69757154de65f683f084c9698a8953d Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 19:55:59 +0200 Subject: [PATCH 40/85] fix gofumpt and test output Co-authored-by: Denis Bilenko --- acceptance/selftest/kill_caller/offset/output.txt | 12 ++++++------ bundle/direct/bundle_apply.go | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index 03407dd0d8..cb87595a2c 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 5 done - success (past kill window) diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index 9bf0f857a5..6bad809146 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -151,7 +151,6 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa return true }) - } func (b *DeploymentBundle) LookupReferencePostDeploy(ctx context.Context, path *structpath.PathNode) (any, error) { From 57ad5710b2ad3d56bc5cf875e7e1ffbd3a2d2052 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 19:59:02 +0200 Subject: [PATCH 41/85] shrink chain-10-jobs to chain-3-jobs 3 jobs exercise the same DAG + partial-WAL recovery path with 3x fewer output lines. 
Co-authored-by: Denis Bilenko --- .../deploy/wal/chain-10-jobs/databricks.yml | 117 ----- .../deploy/wal/chain-10-jobs/output.txt | 414 ------------------ .../deploy/wal/chain-3-jobs/databricks.yml | 40 ++ .../out.test.toml | 0 .../bundle/deploy/wal/chain-3-jobs/output.txt | 120 +++++ .../{chain-10-jobs => chain-3-jobs}/script | 0 .../{chain-10-jobs => chain-3-jobs}/test.py | 0 .../{chain-10-jobs => chain-3-jobs}/test.toml | 6 +- 8 files changed, 163 insertions(+), 534 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/chain-10-jobs/output.txt create mode 100644 acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/out.test.toml (100%) create mode 100644 acceptance/bundle/deploy/wal/chain-3-jobs/output.txt rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/script (100%) rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/test.py (100%) rename acceptance/bundle/deploy/wal/{chain-10-jobs => chain-3-jobs}/test.toml (69%) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml deleted file mode 100644 index 2652cdbed6..0000000000 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/databricks.yml +++ /dev/null @@ -1,117 +0,0 @@ -bundle: - name: wal-chain-test - -resources: - jobs: - # Linear chain: job_01 -> job_02 -> ... 
-> job_10 - # Execution order: job_01 first, job_10 last - job_01: - name: "job-01" - description: "first in chain" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_02: - name: "job-02" - description: "depends on ${resources.jobs.job_01.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_03: - name: "job-03" - description: "depends on ${resources.jobs.job_02.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_04: - name: "job-04" - description: "depends on ${resources.jobs.job_03.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_05: - name: "job-05" - description: "depends on ${resources.jobs.job_04.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_06: - name: "job-06" - description: "depends on ${resources.jobs.job_05.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_07: - name: "job-07" - description: "depends on ${resources.jobs.job_06.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_08: - name: "job-08" - description: "depends on ${resources.jobs.job_07.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - 
node_type_id: i3.xlarge - num_workers: 0 - job_09: - name: "job-09" - description: "depends on ${resources.jobs.job_08.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_10: - name: "job-10" - description: "depends on ${resources.jobs.job_09.id}" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt deleted file mode 100644 index 818bf13b25..0000000000 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/output.txt +++ /dev/null @@ -1,414 +0,0 @@ -=== First deploy (crashes on job_10) === - ->>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... -Deploying resources... 
-[PROCESS_KILLED] - -Exit code: [KILLED] - -=== WAL content after crash === -{ - "cli_version": "[DEV_VERSION]", - "lineage": "[UUID]", - "serial": [SERIAL], - "state_version": 2 -} -{ - "k": "resources.jobs.job_01", - "v": { - "__id__": "[ID]", - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "first in chain", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-01", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_02", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_01.id}", - "node": "resources.jobs.job_01" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-02", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_03", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_02.id}", - "node": "resources.jobs.job_02" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": 
"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-03", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_04", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_03.id}", - "node": "resources.jobs.job_03" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-04", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_05", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_04.id}", - "node": "resources.jobs.job_04" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-05", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - 
}, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_06", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_05.id}", - "node": "resources.jobs.job_05" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-06", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_07", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_06.id}", - "node": "resources.jobs.job_06" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-07", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_08", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_07.id}", - "node": "resources.jobs.job_07" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - 
"metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-08", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} -{ - "k": "resources.jobs.job_09", - "v": { - "__id__": "[ID]", - "depends_on": [ - { - "label": "${resources.jobs.job_08.id}", - "node": "resources.jobs.job_08" - } - ], - "state": { - "deployment": { - "kind": "BUNDLE", - "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" - }, - "description": "depends on 1001", - "edit_mode": "UI_LOCKED", - "format": "MULTI_TASK", - "max_concurrent_runs": 1, - "name": "job-09", - "queue": { - "enabled": true - }, - "tasks": [ - { - "new_cluster": { - "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, - "spark_version": "15.4.x-scala2.12" - }, - "spark_python_task": { - "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" - }, - "task_key": "task" - } - ] - } - } -} - -=== Number of jobs saved in WAL === -9 - -=== Bundle summary (reads from WAL) === -Name: wal-chain-test -Target: default -Workspace: - User: [USERNAME] - Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default -Resources: - Jobs: - job_01: - Name: job-01 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_02: - Name: job-02 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_03: - Name: job-03 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_04: - Name: job-04 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_05: - Name: job-05 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_06: - Name: job-06 - 
URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_07: - Name: job-07 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_08: - Name: job-08 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_09: - Name: job-09 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] - job_10: - Name: job-10 - URL: (not deployed) - -=== Second deploy (recovery) === - ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! - -=== WAL after successful deploy === -WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml new file mode 100644 index 0000000000..fc3a46205b --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml @@ -0,0 +1,40 @@ +bundle: + name: wal-chain-test + +resources: + jobs: + # Linear chain: job_01 -> job_02 -> job_03 + # Execution order: job_01 first, job_03 last + job_01: + name: "job-01" + description: "first in chain" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_02: + name: "job-02" + description: "depends on ${resources.jobs.job_01.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 + job_03: + name: "job-03" + description: "depends on ${resources.jobs.job_02.id}" + tasks: + - task_key: "task" + spark_python_task: + python_file: ./test.py + new_cluster: + spark_version: 15.4.x-scala2.12 + node_type_id: i3.xlarge + num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/out.test.toml similarity index 100% rename from acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml rename to 
acceptance/bundle/deploy/wal/chain-3-jobs/out.test.toml diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt new file mode 100644 index 0000000000..ef56c8e098 --- /dev/null +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -0,0 +1,120 @@ +=== First deploy (crashes on job_10) === + +>>> errcode [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... +[PROCESS_KILLED] + +Exit code: [KILLED] + +=== WAL content after crash === +{ + "cli_version": "[DEV_VERSION]", + "lineage": "[UUID]", + "serial": [SERIAL], + "state_version": 2 +} +{ + "k": "resources.jobs.job_01", + "v": { + "__id__": "[ID]", + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "first in chain", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-01", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + "spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} +{ + "k": "resources.jobs.job_02", + "v": { + "__id__": "[ID]", + "depends_on": [ + { + "label": "${resources.jobs.job_01.id}", + "node": "resources.jobs.job_01" + } + ], + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" + }, + "description": "depends on 1001", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "job-02", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, + 
"spark_version": "15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py" + }, + "task_key": "task" + } + ] + } + } +} + +=== Number of jobs saved in WAL === +2 + +=== Bundle summary (reads from WAL) === +Name: wal-chain-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default +Resources: + Jobs: + job_01: + Name: job-01 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_02: + Name: job-02 + URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + job_03: + Name: job-03 + URL: (not deployed) + +=== Second deploy (recovery) === + +>>> [CLI] bundle deploy --force-lock +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +=== WAL after successful deploy === +WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script similarity index 100% rename from acceptance/bundle/deploy/wal/chain-10-jobs/script rename to acceptance/bundle/deploy/wal/chain-3-jobs/script diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.py b/acceptance/bundle/deploy/wal/chain-3-jobs/test.py similarity index 100% rename from acceptance/bundle/deploy/wal/chain-10-jobs/test.py rename to acceptance/bundle/deploy/wal/chain-3-jobs/test.py diff --git a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml similarity index 69% rename from acceptance/bundle/deploy/wal/chain-10-jobs/test.toml rename to acceptance/bundle/deploy/wal/chain-3-jobs/test.toml index 36076f3df5..2425c89dea 100644 --- a/acceptance/bundle/deploy/wal/chain-10-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml @@ -1,9 +1,9 @@ -# Linear chain: job_01 -> job_02 -> ... 
-> job_10 -# Let first 9 jobs/create succeed, then kill on the 10th +# Linear chain: job_01 -> job_02 -> job_03 +# Let first 2 jobs/create succeed, then kill on the 3rd [[Server]] Pattern = "POST /api/2.2/jobs/create" -KillCallerOffset = 9 +KillCallerOffset = 2 KillCaller = 1 Response.Body = '{"job_id": 1001}' From 3bd8efefa2bfdd5a6b6d672de3ac25e4cce9cb4a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 20:00:26 +0200 Subject: [PATCH 42/85] fix test names in state_test.go: Close -> Finalize, restore SaveFinalize Co-authored-by: Denis Bilenko --- bundle/direct/dstate/state_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index b493258b2f..3f0f614cd3 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -9,7 +9,7 @@ import ( "github.com/stretchr/testify/require" ) -func TestOpenCloseRoundTrip(t *testing.T) { +func TestOpenSaveFinalizeRoundTrip(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState @@ -26,7 +26,7 @@ func TestOpenCloseRoundTrip(t *testing.T) { require.NoError(t, db2.Finalize(t.Context())) } -func TestCloseWithNoEntriesDoesNotWriteStateFile(t *testing.T) { +func TestFinalizeWithNoEntriesDoesNotWriteStateFile(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") var db DeploymentState From 870d434974d4fb91755272152c214f03445c3e9c Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 20:03:29 +0200 Subject: [PATCH 43/85] clean up WAL acceptance tests - drop corrupted-wal-middle (same code path as corrupted-wal-entry) - drop multiple-crashes (covered by crash-after-create) - drop summary-after-crash (incomplete output; crash coverage in crash-after-create) - fix empty-wal echo: (unexpected) -> (expected) - fix parent test.toml: exit code 137 -> [KILLED] only; errors show Exit code: 1 Co-authored-by: Denis Bilenko --- 
.../wal/corrupted-wal-middle/databricks.yml | 25 ----------- .../wal/corrupted-wal-middle/out.test.toml | 3 -- .../wal/corrupted-wal-middle/output.txt | 27 ------------ .../deploy/wal/corrupted-wal-middle/script | 43 ------------------- .../deploy/wal/corrupted-wal-middle/test.py | 1 - .../deploy/wal/corrupted-wal-middle/test.toml | 10 ----- .../bundle/deploy/wal/empty-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/empty-wal/script | 2 +- .../deploy/wal/future-serial-wal/output.txt | 2 +- .../deploy/wal/lineage-mismatch/output.txt | 2 +- .../wal/multiple-crashes/databricks.yml | 27 ------------ .../deploy/wal/multiple-crashes/out.test.toml | 3 -- .../deploy/wal/multiple-crashes/output.txt | 39 ----------------- .../bundle/deploy/wal/multiple-crashes/script | 29 ------------- .../deploy/wal/multiple-crashes/test.py | 1 - .../deploy/wal/multiple-crashes/test.toml | 18 -------- .../wal/summary-after-crash/databricks.yml | 27 ------------ .../wal/summary-after-crash/out.test.toml | 3 -- .../deploy/wal/summary-after-crash/output.txt | 20 --------- .../deploy/wal/summary-after-crash/script | 19 -------- .../deploy/wal/summary-after-crash/test.py | 1 - .../deploy/wal/summary-after-crash/test.toml | 13 ------ acceptance/bundle/deploy/wal/test.toml | 2 +- 23 files changed, 5 insertions(+), 314 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/script delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml delete mode 100644 
acceptance/bundle/deploy/wal/multiple-crashes/output.txt delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/script delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.py delete mode 100644 acceptance/bundle/deploy/wal/multiple-crashes/test.toml delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/output.txt delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/script delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.py delete mode 100644 acceptance/bundle/deploy/wal/summary-after-crash/test.toml diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml deleted file mode 100644 index aef2c714ec..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/databricks.yml +++ /dev/null @@ -1,25 +0,0 @@ -bundle: - name: wal-corrupted-middle-test - -resources: - jobs: - job_one: - name: "job-one" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_two: - name: "job-two" - tasks: - - task_key: "task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml deleted file mode 100644 index e90b6d5d1b..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt 
b/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt deleted file mode 100644 index ff13944ae4..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/output.txt +++ /dev/null @@ -1,27 +0,0 @@ -=== Creating state file with serial 5 === -=== Creating WAL with corrupted MIDDLE entry === -=== WAL content === -{"lineage":"test-lineage-456","serial": [SERIAL]} -{"k":"resources.jobs.job_one","v":{"__id__": "[ID]","state":{"name":"job-one"}}} -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -{"k":"resources.jobs.job_two","v":{"__id__": "[ID]","state":{"name":"job-two"}}} -=== Deploy (should recover valid entries and skip corrupted line) === - ->>> [CLI] bundle deploy -Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:3: unexpected end of JSON input -Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-middle-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
-=== Final state (should have recovered entries) === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.job_one", - "resources.jobs.job_two" - ] -} -=== Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after deploy === -WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script b/acceptance/bundle/deploy/wal/corrupted-wal-middle/script deleted file mode 100644 index 6307d7fbf7..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/script +++ /dev/null @@ -1,43 +0,0 @@ -echo "=== Creating state file with serial 5 ===" -mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "test-lineage-456", - "serial": 5, - "state": {} -} -EOF - -echo "=== Creating WAL with corrupted MIDDLE entry ===" -# Corrupted middle line is expected (truncated JSON from crash) and should be skipped. 
-cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-456","serial":6} -{"k":"resources.jobs.job_one","v":{"__id__":"1111","state":{"name":"job-one"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- -{"k":"resources.jobs.job_two","v":{"__id__":"2222","state":{"name":"job-two"}}} -EOF - -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal - -echo "=== Deploy (should recover valid entries and skip corrupted line) ===" -trace $CLI bundle deploy 2>&1 - -echo "=== Final state (should have recovered entries) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' - -echo "=== Corrupted WAL entries file ===" -if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then - cat .databricks/bundle/default/resources.json.wal.corrupted -else - echo "Missing corrupted WAL entries file (unexpected)" -fi - -echo "=== WAL after deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (unexpected)" -else - echo "WAL deleted (expected)" -fi diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py deleted file mode 100644 index 1ff8e07c70..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml deleted file mode 100644 index d5f0b1bbb6..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-middle/test.toml +++ /dev/null @@ -1,10 +0,0 @@ -# WAL with corrupted MIDDLE entry - valid entries are recovered and corrupted entries are skipped. - -# Since valid entries are recovered, jobs will be updated (not created fresh). 
-[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 9999}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get?job_id=9999" -Response.Body = '{"job_id": 9999, "settings": {"name": "fresh-job"}}' diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 26117a2d36..884f502744 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -12,7 +12,7 @@ Deployment complete! === Checking WAL file after deploy === Empty WAL deleted (expected) === Corrupted WAL file === -Corrupted WAL file missing (unexpected) +Corrupted WAL file missing (expected) === State file content === { "lineage": "[UUID]", diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script index 2c66d213aa..3929de8eb1 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/script +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -21,7 +21,7 @@ echo "=== Corrupted WAL file ===" if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then ls -la .databricks/bundle/default/resources.json.wal.corrupted else - echo "Corrupted WAL file missing (unexpected)" + echo "Corrupted WAL file missing (expected)" fi echo "=== State file content ===" diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index cb3526e9b6..2b93423e1b 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -9,4 +9,4 @@ Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL serial (5) is ahead of expected (3), state may be corrupted -Exit code: [KILLED] +Exit code: 1 diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index e706e1d087..a539a2fb0c 100644 
--- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -9,4 +9,4 @@ Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) -Exit code: [KILLED] +Exit code: 1 diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml b/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml deleted file mode 100644 index 3dc96ed856..0000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/databricks.yml +++ /dev/null @@ -1,27 +0,0 @@ -bundle: - name: wal-multi-crash-test - -resources: - jobs: - job_a: - name: "test-job-a" - description: "first job" - tasks: - - task_key: "task-a" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_b: - name: "test-job-b" - description: "depends on ${resources.jobs.job_a.id}" - tasks: - - task_key: "task-b" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml deleted file mode 100644 index e90b6d5d1b..0000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt b/acceptance/bundle/deploy/wal/multiple-crashes/output.txt deleted file mode 100644 index 8553dda7b3..0000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/output.txt +++ /dev/null @@ -1,39 +0,0 @@ -=== First deploy (crashes after job_a create) === - ->>> errcode [CLI] bundle deploy -Uploading bundle files to 
/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Deploying resources... -[PROCESS_KILLED] - -Exit code: [KILLED] -=== WAL after first crash === -WAL exists -{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} -=== Second deploy (crashes during job_a update) === - ->>> errcode [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Deploying resources... -[PROCESS_KILLED] - -Exit code: [KILLED] -=== WAL after second crash === -WAL still exists -=== Third deploy (should succeed) === - ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-multi-crash-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
-=== Final state === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.job_a", - "resources.jobs.job_b" - ] -} -=== WAL after successful deploy === -WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/script b/acceptance/bundle/deploy/wal/multiple-crashes/script deleted file mode 100644 index 0adcd2a980..0000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/script +++ /dev/null @@ -1,29 +0,0 @@ -echo "=== First deploy (crashes after job_a create) ===" -trace errcode $CLI bundle deploy - -echo "=== WAL after first crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists" - cat .databricks/bundle/default/resources.json.wal -fi - -echo "=== Second deploy (crashes during job_a update) ===" -trace errcode $CLI bundle deploy --force-lock - -echo "=== WAL after second crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL still exists" -fi - -echo "=== Third deploy (should succeed) ===" -trace $CLI bundle deploy --force-lock - -echo "=== Final state ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' - -echo "=== WAL after successful deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (unexpected)" -else - echo "WAL deleted (expected)" -fi diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.py b/acceptance/bundle/deploy/wal/multiple-crashes/test.py deleted file mode 100644 index 1ff8e07c70..0000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml b/acceptance/bundle/deploy/wal/multiple-crashes/test.toml deleted file mode 100644 index 474177b804..0000000000 --- a/acceptance/bundle/deploy/wal/multiple-crashes/test.toml +++ /dev/null @@ -1,18 +0,0 @@ -# Multiple real crashes during deployment - WAL should persist until 
successful finalize. -# First deploy: crashes after job_a create (kill on jobs/get) -# Second deploy: crashes during job_a update (kill on jobs/reset) -# Third deploy: succeeds (both counters exhausted) - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -KillCaller = 1 -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -KillCaller = 1 -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml b/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml deleted file mode 100644 index 86376fd7ba..0000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/databricks.yml +++ /dev/null @@ -1,27 +0,0 @@ -bundle: - name: wal-summary-test - -resources: - jobs: - job_a: - name: "job-a" - description: "first job" - tasks: - - task_key: "task-a" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 - job_b: - name: "job-b" - description: "depends on ${resources.jobs.job_a.id}" - tasks: - - task_key: "task-b" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml deleted file mode 100644 index e90b6d5d1b..0000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt b/acceptance/bundle/deploy/wal/summary-after-crash/output.txt deleted file mode 100644 index 634f804e17..0000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/output.txt +++ /dev/null @@ -1,20 +0,0 
@@ -=== Deploy (job_a created and saved, then crash on jobs/get) === - ->>> errcode [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files... -Deploying resources... -[PROCESS_KILLED] - -Exit code: [KILLED] -=== State directory contents after crash === -deployment.json -resources.json.wal -sync-snapshots -=== WAL should exist after crash === -WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-summary-test/default/files/test.py"},"task_key":"task-a"}]}}} -=== State file after crash === -cat: .databricks/bundle/default/resources.json: No such file or directory - -Exit code: [KILLED] diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/script b/acceptance/bundle/deploy/wal/summary-after-crash/script deleted file mode 100644 index 3b007062c6..0000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/script +++ /dev/null @@ -1,19 +0,0 @@ -echo "=== Deploy (job_a created and saved, then crash on jobs/get) ===" -trace errcode $CLI bundle deploy - -echo "=== State directory contents after crash ===" -ls .databricks/bundle/default/ - -echo "=== WAL should exist after crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (expected)" - cat .databricks/bundle/default/resources.json.wal -else - echo "WAL missing (unexpected)" -fi - -echo "=== State file after crash ===" -cat 
.databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' - -echo "=== Bundle summary (should show job_a from WAL) ===" -trace $CLI bundle summary -o json | jq '{job_a_id: .resources.jobs.job_a.id, job_b_id: .resources.jobs.job_b.id}' diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.py b/acceptance/bundle/deploy/wal/summary-after-crash/test.py deleted file mode 100644 index 1ff8e07c70..0000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml b/acceptance/bundle/deploy/wal/summary-after-crash/test.toml deleted file mode 100644 index f14cbbfcbc..0000000000 --- a/acceptance/bundle/deploy/wal/summary-after-crash/test.toml +++ /dev/null @@ -1,13 +0,0 @@ -# Bundle summary should show resources recovered from WAL after a real crash. -# job_b depends on job_a, so after job_a is created and SaveState is called, -# refreshRemoteState calls jobs/get to fetch job_a's state for job_b's reference. -# We kill on jobs/get - AFTER job_a's SaveState, so WAL contains job_a. - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -KillCaller = 1 -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index df700645f7..50b50dbcba 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -19,7 +19,7 @@ New = """${1}[PROCESS_KILLED] Exit code:""" [[Repls]] -Old = 'Exit code: (137|1)' +Old = 'Exit code: 137' New = 'Exit code: [KILLED]' # On Windows, no bash "Killed" message appears when CLI has produced output before termination. 
From 174803603e6b245e557247a0b24e4b4455745409 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 20:58:02 +0200 Subject: [PATCH 44/85] fix crash-after-create: handle Linux exit code 1 after KillCaller On Linux, KillCaller (SIGKILL) may produce exit code 1 instead of 137. Add a context-sensitive replacement to normalise exit code 1 only when it directly follows [PROCESS_KILLED], so genuine error exits (exit code 1 from cat/jq) remain visible as Exit code: 1 in the output. Co-authored-by: Denis Bilenko --- acceptance/bundle/deploy/wal/crash-after-create/output.txt | 2 +- acceptance/bundle/deploy/wal/test.toml | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index cf9230983c..b3250b2db1 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -13,4 +13,4 @@ WAL exists (expected) === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory -Exit code: [KILLED] +Exit code: 1 diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 50b50dbcba..266d748049 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -22,6 +22,13 @@ Exit code:""" Old = 'Exit code: 137' New = 'Exit code: [KILLED]' +# On Linux, a KillCaller kill may surface as exit code 1 rather than 137. +# Only normalise exit code 1 when it directly follows [PROCESS_KILLED] to +# avoid masking genuine error exits (lineage-mismatch, future-serial-wal). +[[Repls]] +Old = '(\[PROCESS_KILLED\]\n\nExit code: )1' +New = '${1}[KILLED]' + # On Windows, no bash "Killed" message appears when CLI has produced output before termination. # Insert [PROCESS_KILLED] between last output line and exit code for consistency. 
[[Repls]] From 01a3610c68b571b841f3bf9484bfd7e486cfc8b5 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Sun, 10 May 2026 21:39:43 +0200 Subject: [PATCH 45/85] update selftest --- acceptance/selftest/kill_caller/offset/output.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index cb87595a2c..03407dd0d8 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id":"123", + "userName":"test@example.com" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id":"123", + "userName":"test@example.com" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id":"123", + "userName":"test@example.com" } Attempt 5 done - success (past kill window) From 69dfc3ea2226616324b9ed0498848992dcc053b5 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 10:41:27 +0200 Subject: [PATCH 46/85] fix WAL acceptance test hygiene - chain-3-jobs: fix stale echo "job_10" -> "job_03" - corrupted-wal-entry, future-serial-wal, lineage-mismatch, stale-wal, wal-with-delete: commit static fixture files (resources.json, resources.json.wal) instead of creating them inline in script; wal-with-delete: commit databricks.yml as resources: {} instead of overwriting it at runtime Co-authored-by: Denis Bilenko --- .../bundle/deploy/wal/chain-3-jobs/output.txt | 2 +- .../bundle/deploy/wal/chain-3-jobs/script | 2 +- .../deploy/wal/corrupted-wal-entry/output.txt | 2 -- .../wal/corrupted-wal-entry/resources.json | 7 ++++ .../corrupted-wal-entry/resources.json.wal | 4 +++ .../deploy/wal/corrupted-wal-entry/script | 22 ++----------- 
.../deploy/wal/future-serial-wal/output.txt | 2 -- .../wal/future-serial-wal/resources.json | 12 +++++++ .../wal/future-serial-wal/resources.json.wal | 2 ++ .../deploy/wal/future-serial-wal/script | 23 ++----------- .../deploy/wal/lineage-mismatch/output.txt | 2 -- .../wal/lineage-mismatch/resources.json | 12 +++++++ .../wal/lineage-mismatch/resources.json.wal | 2 ++ .../bundle/deploy/wal/lineage-mismatch/script | 23 ++----------- .../bundle/deploy/wal/stale-wal/output.txt | 3 -- .../deploy/wal/stale-wal/resources.json | 12 +++++++ .../deploy/wal/stale-wal/resources.json.wal | 2 ++ acceptance/bundle/deploy/wal/stale-wal/script | 25 ++------------ .../deploy/wal/wal-with-delete/databricks.yml | 13 +------- .../deploy/wal/wal-with-delete/output.txt | 4 --- .../deploy/wal/wal-with-delete/resources.json | 12 +++++++ .../wal/wal-with-delete/resources.json.wal | 2 ++ .../bundle/deploy/wal/wal-with-delete/script | 33 ++----------------- 23 files changed, 80 insertions(+), 143 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/resources.json create mode 100644 acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/resources.json create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/stale-wal/resources.json create mode 100644 acceptance/bundle/deploy/wal/stale-wal/resources.json.wal create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/resources.json create mode 100644 acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index ef56c8e098..1f4b53f7cf 100644 --- 
a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -1,4 +1,4 @@ -=== First deploy (crashes on job_10) === +=== First deploy (crashes on job_03) === >>> errcode [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index 1f829232ad..6c9993c280 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -1,4 +1,4 @@ -echo "=== First deploy (crashes on job_10) ===" +echo "=== First deploy (crashes on job_03) ===" trace errcode $CLI bundle deploy echo "" diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index aad802f749..bd886c153f 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,5 +1,3 @@ -=== Creating state file with serial 5 === -=== Creating WAL with corrupted LAST entry === === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json new file mode 100644 index 0000000000..f9f4e54d1e --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json @@ -0,0 +1,7 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 5, + "state": {} +} diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal new file mode 100644 index 0000000000..4791ba1281 --- /dev/null +++ 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal @@ -0,0 +1,4 @@ +{"lineage":"test-lineage-123","serial":6} +{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} +{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index dde17995da..191a62f01f 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -1,24 +1,6 @@ -echo "=== Creating state file with serial 5 ===" mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "test-lineage-123", - "serial": 5, - "state": {} -} -EOF - -echo "=== Creating WAL with corrupted LAST entry ===" -# Corrupted last line is expected (truncated JSON from crash) and should be skipped. -# Valid entries before it should be recovered. 
-cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-123","serial":6} -{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index 2b93423e1b..8fc16565fe 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -1,5 +1,3 @@ -=== Creating state file (serial=2) === -=== Creating WAL with future serial (serial=5, expected=3) === === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} {"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/resources.json b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json new file mode 100644 index 0000000000..f2f06b34bf --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json @@ -0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "test-lineage-123", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal new file mode 100644 index 0000000000..98a8e48802 --- /dev/null +++ b/acceptance/bundle/deploy/wal/future-serial-wal/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"test-lineage-123","serial":5} 
+{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/script b/acceptance/bundle/deploy/wal/future-serial-wal/script index 7b1784b0c6..f7a5719225 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/script +++ b/acceptance/bundle/deploy/wal/future-serial-wal/script @@ -1,25 +1,6 @@ -echo "=== Creating state file (serial=2) ===" mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "test-lineage-123", - "serial": 2, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating WAL with future serial (serial=5, expected=3) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"test-lineage-123","serial":5} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index a539a2fb0c..f090a16163 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,5 +1,3 @@ -=== Creating state file with lineage-A === -=== Creating WAL with lineage-B (mismatch) === === WAL content === {"lineage":"wal-lineage-bbb","serial": [SERIAL]} {"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json new file mode 100644 index 0000000000..444a9ea888 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json @@ -0,0 
+1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "state-lineage-aaa", + "serial": 1, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal new file mode 100644 index 0000000000..d14fb4a971 --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"wal-lineage-bbb","serial":2} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/script b/acceptance/bundle/deploy/wal/lineage-mismatch/script index b241246e6c..4617c338fe 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/script +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/script @@ -1,25 +1,6 @@ -echo "=== Creating state file with lineage-A ===" mkdir -p .databricks/bundle/default -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "state-lineage-aaa", - "serial": 1, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating WAL with lineage-B (mismatch) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"wal-lineage-bbb","serial":2} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt index 682534de7c..a2066ccdd8 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -1,6 +1,3 @@ -=== 
Creating state directory === -=== Creating state file (serial=2) === -=== Creating stale WAL with old serial (serial=1) === === WAL content before deploy === {"lineage":"stale-test-lineage","serial": [SERIAL]} {"k":"resources.jobs.stale_job","v":{"__id__": "[ID]","state":{"name":"stale-job"}}} diff --git a/acceptance/bundle/deploy/wal/stale-wal/resources.json b/acceptance/bundle/deploy/wal/stale-wal/resources.json new file mode 100644 index 0000000000..6fd38b67ae --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/resources.json @@ -0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "stale-test-lineage", + "serial": 2, + "state": { + "resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/stale-wal/resources.json.wal b/acceptance/bundle/deploy/wal/stale-wal/resources.json.wal new file mode 100644 index 0000000000..ef5f380ed8 --- /dev/null +++ b/acceptance/bundle/deploy/wal/stale-wal/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"stale-test-lineage","serial":1} +{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} diff --git a/acceptance/bundle/deploy/wal/stale-wal/script b/acceptance/bundle/deploy/wal/stale-wal/script index d814639a00..4de1bc1e92 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/script +++ b/acceptance/bundle/deploy/wal/stale-wal/script @@ -1,27 +1,6 @@ -echo "=== Creating state directory ===" mkdir -p .databricks/bundle/default - -echo "=== Creating state file (serial=2) ===" -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "stale-test-lineage", - "serial": 2, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating stale WAL with old serial (serial=1) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' 
-{"lineage":"stale-test-lineage","serial":1} -{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content before deploy ===" cat .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml index 457a2d3e96..128bbe37f5 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml +++ b/acceptance/bundle/deploy/wal/wal-with-delete/databricks.yml @@ -1,15 +1,4 @@ bundle: name: wal-delete-test -resources: - jobs: - test_job: - name: "test-job" - tasks: - - task_key: "test-task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge - num_workers: 0 +resources: {} diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index 8f52732d3e..a7960906d3 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -1,10 +1,6 @@ -=== Creating state directory === -=== Creating state file (job exists) === -=== Creating WAL with delete entry (simulating crash during delete) === === WAL content === {"lineage":"delete-test-lineage","serial": [SERIAL]} {"k":"resources.jobs.test_job","v":null} -=== Updating config to remove job === === Deploy (should recover delete from WAL) === >>> [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/resources.json b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json new file mode 100644 index 0000000000..04263ec36f --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json @@ -0,0 +1,12 @@ +{ + "state_version": 1, + "cli_version": "0.0.0", + "lineage": "delete-test-lineage", + "serial": 1, + "state": { + 
"resources.jobs.test_job": { + "__id__": "1001", + "state": {"name": "test-job"} + } + } +} diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal new file mode 100644 index 0000000000..9b5c6169e3 --- /dev/null +++ b/acceptance/bundle/deploy/wal/wal-with-delete/resources.json.wal @@ -0,0 +1,2 @@ +{"lineage":"delete-test-lineage","serial":2} +{"k":"resources.jobs.test_job","v":null} diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/script b/acceptance/bundle/deploy/wal/wal-with-delete/script index f840355267..5d5a78a885 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/script +++ b/acceptance/bundle/deploy/wal/wal-with-delete/script @@ -1,39 +1,10 @@ -echo "=== Creating state directory ===" mkdir -p .databricks/bundle/default - -echo "=== Creating state file (job exists) ===" -cat > .databricks/bundle/default/resources.json << 'EOF' -{ - "state_version": 1, - "cli_version": "0.0.0", - "lineage": "delete-test-lineage", - "serial": 1, - "state": { - "resources.jobs.test_job": { - "__id__": "1001", - "state": {"name": "test-job"} - } - } -} -EOF - -echo "=== Creating WAL with delete entry (simulating crash during delete) ===" -cat > .databricks/bundle/default/resources.json.wal << 'EOF' -{"lineage":"delete-test-lineage","serial":2} -{"k":"resources.jobs.test_job","v":null} -EOF +cp resources.json .databricks/bundle/default/ +cp resources.json.wal .databricks/bundle/default/ echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal -echo "=== Updating config to remove job ===" -cat > databricks.yml << 'EOF' -bundle: - name: wal-delete-test - -resources: {} -EOF - echo "=== Deploy (should recover delete from WAL) ===" trace $CLI bundle deploy From 97bd52c50c7bc82dde5ffdf6c57ad514e5f39a87 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 11:28:58 +0200 Subject: [PATCH 47/85] update test output after rebase --- 
acceptance/selftest/kill_caller/offset/output.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index 03407dd0d8..cb87595a2c 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id":"123", - "userName":"test@example.com" + "id": "123", + "userName": "test@example.com" } Attempt 5 done - success (past kill window) From db53353b90b5512cd1a6f82d35826def7455315f Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 11:36:16 +0200 Subject: [PATCH 48/85] destroyCore: warn on Finalize failure instead of aborting Resources are already deleted at this point; failing hard prevents the file-cleanup step from running. Downgrade to a warning so destroyCore continues to delete the remote files regardless. 
Co-authored-by: Denis Bilenko --- bundle/phases/destroy.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index 68657f4e51..a130c7a1c6 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -14,6 +14,7 @@ import ( "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" "github.com/databricks/cli/libs/cmdio" + "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/logdiag" "github.com/databricks/databricks-sdk-go/apierr" @@ -82,9 +83,14 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e } // Flush WAL to local state file before deleting remote files. + // Warn instead of hard-error: resources are already deleted, so proceed + // with file cleanup regardless of whether state flush succeeds. if engine.IsDirect() { if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) + diags := diag.WarningFromErr(err) + if len(diags) > 0 { + logdiag.LogDiag(ctx, diags[0]) + } } } From 4ef7c16226438628cabf95d1a8cf872302ee9f73 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 11:37:07 +0200 Subject: [PATCH 49/85] update test output --- .../bundle/resources/apps/create_already_exists/output.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/acceptance/bundle/resources/apps/create_already_exists/output.txt b/acceptance/bundle/resources/apps/create_already_exists/output.txt index e4438d47b0..82deb4ab43 100644 --- a/acceptance/bundle/resources/apps/create_already_exists/output.txt +++ b/acceptance/bundle/resources/apps/create_already_exists/output.txt @@ -37,7 +37,6 @@ HTTP Status: 409 Conflict API error_code: RESOURCE_ALREADY_EXISTS API message: An app with the same name already exists: test-app-already-exists -Updating deployment state... 
>>> [CLI] apps delete test-app-already-exists { From 47e11ad9cdb0d3089d15b86f06d8735854faf3a1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 12:44:19 +0200 Subject: [PATCH 50/85] deployCore: use Finalize return value instead of re-opening state Finalize now returns (ExportedResourcesMap, error), capturing the merged state before clearing. This lets deploy.go use LoadFromState directly instead of closing and re-opening the state file from disk. LoadFromState is a new statemgmt constructor that accepts pre-computed state and skips the engine dispatch in Load.Apply. Co-authored-by: Denis Bilenko --- bundle/direct/bind.go | 19 +++++--- bundle/direct/dstate/state.go | 46 ++++++++++++------- bundle/direct/dstate/state_test.go | 20 +++++--- bundle/phases/deploy.go | 22 +++++---- bundle/phases/destroy.go | 2 +- bundle/statemgmt/state_load.go | 37 +++++++++++---- .../statemgmt/upload_state_for_yaml_sync.go | 2 +- cmd/bundle/deployment/migrate.go | 2 +- 8 files changed, 100 insertions(+), 50 deletions(-) diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 7e32bfd647..c16f763afc 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -64,7 +64,9 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac var checkStateDB dstate.DeploymentState if err := checkStateDB.Open(ctx, statePath, dstate.WithRecovery(true), dstate.WithWrite(false)); err == nil { existingID := checkStateDB.GetResourceID(resourceKey) - _ = checkStateDB.Finalize(ctx) + if _, err := checkStateDB.Finalize(ctx); err != nil { + log.Warnf(ctx, "failed to finalize state: %v", err) + } if existingID != "" { return nil, ErrResourceAlreadyBound{ ResourceKey: resourceKey, @@ -98,7 +100,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } // Finalize to persist temp state to disk - err = b.StateDB.Finalize(ctx) + _, err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -117,7 
+119,9 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac os.Remove(tmpStatePath) return nil, err } - _ = b.StateDB.Finalize(ctx) + if _, err := b.StateDB.Finalize(ctx); err != nil { + log.Warnf(ctx, "failed to finalize state: %v", err) + } // Populate the state with the resolved config entry := plan.Plan[resourceKey] @@ -152,7 +156,7 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } - err = b.StateDB.Finalize(ctx) + _, err = b.StateDB.Finalize(ctx) if err != nil { os.Remove(tmpStatePath) return nil, err @@ -166,7 +170,9 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac return nil, err } plan, err = b.CalculatePlan(ctx, client, configRoot) - _ = b.StateDB.Finalize(ctx) + if _, ferr := b.StateDB.Finalize(ctx); ferr != nil { + log.Warnf(ctx, "failed to finalize state: %v", ferr) + } if err != nil { os.Remove(tmpStatePath) return nil, err @@ -236,5 +242,6 @@ func (b *DeploymentBundle) Unbind(ctx context.Context, statePath, resourceKey st } } - return b.StateDB.Finalize(ctx) + _, err = b.StateDB.Finalize(ctx) + return err } diff --git a/bundle/direct/dstate/state.go b/bundle/direct/dstate/state.go index 6a7e73778b..7dc9b97f98 100644 --- a/bundle/direct/dstate/state.go +++ b/bundle/direct/dstate/state.go @@ -32,19 +32,25 @@ const ( var errStaleWAL = errors.New("stale WAL") type DeploymentState struct { - Path string - Data Database - mu sync.Mutex - walFile *os.File + Path string + Data Database + mu sync.Mutex + walFile *os.File + + // Maps resource key to ID. Unlike Data.State, this is up to date during writes (deploys).
stateIDs map[string]string } type Database struct { - StateVersion int `json:"state_version"` - CLIVersion string `json:"cli_version"` - Lineage string `json:"lineage"` - Serial int `json:"serial"` - State map[string]ResourceEntry `json:"state"` + StateVersion int `json:"state_version"` + CLIVersion string `json:"cli_version"` + Lineage string `json:"lineage"` + Serial int `json:"serial"` + + // Maps resource key to ResourceEntry which includes ID + full serialized state. + // This is not updated during write/deploy, those writes go to WAL instead. + // The State is then reconstructed from WAL. + State map[string]ResourceEntry `json:"state"` } type ResourceEntry struct { @@ -346,14 +352,15 @@ func (db *DeploymentState) mergeWalIntoState(ctx context.Context) (bool, error) return lineNumber > 1, nil } -// Finalize replays the WAL (if open for write) and resets the state. +// Finalize replays the WAL (if open for write), captures the resulting state, and resets. // Safe to call multiple times or on an already-finalized state. -func (db *DeploymentState) Finalize(ctx context.Context) error { +// Returns the exported state as of the end of this operation. +func (db *DeploymentState) Finalize(ctx context.Context) (resourcestate.ExportedResourcesMap, error) { db.mu.Lock() defer db.mu.Unlock() if db.Path == "" { - return nil + return nil, nil } var err error @@ -364,11 +371,13 @@ func (db *DeploymentState) Finalize(ctx context.Context) error { err = db.replayWAL(ctx) } + state := ExportStateFromData(db.Data) + db.Path = "" db.Data = Database{} - db.stateIDs = make(map[string]string) + db.stateIDs = nil - return err + return state, err } // UpgradeToWrite transitions from read mode to write mode without re-reading state. @@ -427,9 +436,10 @@ func (db *DeploymentState) AssertOpenedForWrite() { } } -func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { +// ExportStateFromData extracts resource IDs and ETags from a database snapshot. 
+func ExportStateFromData(data Database) resourcestate.ExportedResourcesMap { result := make(resourcestate.ExportedResourcesMap) - for key, entry := range db.Data.State { + for key, entry := range data.State { var etag string // Extract etag for dashboards. // covered by test case: bundle/deploy/dashboard/detect-change @@ -450,6 +460,10 @@ func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.Export return result } +func (db *DeploymentState) ExportState(ctx context.Context) resourcestate.ExportedResourcesMap { + return ExportStateFromData(db.Data) +} + func (db *DeploymentState) unlockedSave() error { data, err := json.MarshalIndent(db.Data, "", " ") if err != nil { diff --git a/bundle/direct/dstate/state_test.go b/bundle/direct/dstate/state_test.go index 3f0f614cd3..afe8634790 100644 --- a/bundle/direct/dstate/state_test.go +++ b/bundle/direct/dstate/state_test.go @@ -9,6 +9,12 @@ import ( "github.com/stretchr/testify/require" ) +func mustFinalize(t *testing.T, db *DeploymentState) { + t.Helper() + _, err := db.Finalize(t.Context()) + require.NoError(t, err) +} + func TestOpenSaveFinalizeRoundTrip(t *testing.T) { path := filepath.Join(t.TempDir(), "state.json") @@ -16,14 +22,14 @@ func TestOpenSaveFinalizeRoundTrip(t *testing.T) { require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{"key": "val"}, nil)) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) // Re-open and verify persisted data. 
var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 1, db2.Data.Serial) assert.Equal(t, "123", db2.GetResourceID("jobs.my_job")) - require.NoError(t, db2.Finalize(t.Context())) + mustFinalize(t, &db2) } func TestFinalizeWithNoEntriesDoesNotWriteStateFile(t *testing.T) { @@ -31,7 +37,7 @@ func TestFinalizeWithNoEntriesDoesNotWriteStateFile(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) _, err := os.Stat(path) assert.ErrorIs(t, err, os.ErrNotExist) @@ -46,7 +52,7 @@ func TestPanicOnDoubleOpen(t *testing.T) { assert.Panics(t, func() { _ = db.Open(t.Context(), path, WithRecovery(true), WithWrite(true)) }) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) } func TestDeleteState(t *testing.T) { @@ -55,16 +61,16 @@ func TestDeleteState(t *testing.T) { var db DeploymentState require.NoError(t, db.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db.SaveState("jobs.my_job", "123", map[string]string{}, nil)) - require.NoError(t, db.Finalize(t.Context())) + mustFinalize(t, &db) var db2 DeploymentState require.NoError(t, db2.Open(t.Context(), path, WithRecovery(true), WithWrite(true))) require.NoError(t, db2.DeleteState("jobs.my_job")) - require.NoError(t, db2.Finalize(t.Context())) + mustFinalize(t, &db2) var db3 DeploymentState require.NoError(t, db3.Open(t.Context(), path, WithRecovery(false), WithWrite(false))) assert.Equal(t, 2, db3.Data.Serial) assert.Equal(t, "", db3.GetResourceID("jobs.my_job")) - require.NoError(t, db3.Finalize(t.Context())) + mustFinalize(t, &db3) } diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 7efe71b850..e318fa1ffe 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -15,7 +15,6 @@ import ( "github.com/databricks/cli/bundle/deploy/terraform" 
"github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" - "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/libraries" "github.com/databricks/cli/bundle/metrics" "github.com/databricks/cli/bundle/permissions" @@ -75,14 +74,12 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } - // Close state to replay WAL into state file, then reopen for read. - // PushResourcesState needs the file on disk, Load needs the state in memory. + // Flush WAL to state file on disk; capture the resulting state for Load below. + var directState statemgmt.ExportedResourcesMap if targetEngine.IsDirect() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { - logdiag.LogError(ctx, err) - } - _, localPath := b.StateFilenameDirect(ctx) - if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { + var err error + directState, err = b.DeploymentBundle.StateDB.Finalize(ctx) + if err != nil { logdiag.LogError(ctx, err) } } @@ -93,8 +90,15 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta return } + var loadMutator bundle.Mutator + if targetEngine.IsDirect() { + loadMutator = statemgmt.LoadFromState(directState) + } else { + loadMutator = statemgmt.Load(targetEngine) + } + bundle.ApplySeqContext(ctx, b, - statemgmt.Load(targetEngine), + loadMutator, metadata.Compute(), metadata.Upload(), statemgmt.UploadStateForYamlSync(targetEngine), diff --git a/bundle/phases/destroy.go b/bundle/phases/destroy.go index a130c7a1c6..98e6f7fee2 100644 --- a/bundle/phases/destroy.go +++ b/bundle/phases/destroy.go @@ -86,7 +86,7 @@ func destroyCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, e // Warn instead of hard-error: resources are already deleted, so proceed // with file cleanup regardless of whether state flush succeeds. 
if engine.IsDirect() { - if err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { + if _, err := b.DeploymentBundle.StateDB.Finalize(ctx); err != nil { diags := diag.WarningFromErr(err) if len(diags) > 0 { logdiag.LogDiag(ctx, diags[0]) diff --git a/bundle/statemgmt/state_load.go b/bundle/statemgmt/state_load.go index 3345792c29..4894fc08a6 100644 --- a/bundle/statemgmt/state_load.go +++ b/bundle/statemgmt/state_load.go @@ -35,7 +35,6 @@ func (l *load) Name() string { } func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - var err error var state ExportedResourcesMap if l.engine.IsDirect() { @@ -48,14 +47,29 @@ func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { } } - err = l.validateState(state) - if err != nil { + return applyState(ctx, b, state, l.modes) +} + +type loadFromState struct { + state ExportedResourcesMap + modes []LoadMode +} + +func (l *loadFromState) Name() string { + return "statemgmt.Load" +} + +func (l *loadFromState) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + return applyState(ctx, b, l.state, l.modes) +} + +// applyState merges the exported resource state into the bundle configuration. +func applyState(ctx context.Context, b *bundle.Bundle, state ExportedResourcesMap, modes []LoadMode) diag.Diagnostics { + if err := validateLoadedState(state, modes); err != nil { return diag.FromErr(err) } - // Merge state into configuration. 
- err = StateToBundle(ctx, state, &b.Config) - if err != nil { + if err := StateToBundle(ctx, state, &b.Config); err != nil { return diag.FromErr(err) } @@ -160,14 +174,19 @@ func StateToBundle(ctx context.Context, state ExportedResourcesMap, config *conf }) } -func (l *load) validateState(state ExportedResourcesMap) error { - if len(state) == 0 && slices.Contains(l.modes, ErrorOnEmptyState) { +func validateLoadedState(state ExportedResourcesMap, modes []LoadMode) error { + if len(state) == 0 && slices.Contains(modes, ErrorOnEmptyState) { return errors.New("resource not found or not yet deployed. Did you forget to run 'databricks bundle deploy'?") } - return nil } func Load(engine engine.EngineType, modes ...LoadMode) bundle.Mutator { return &load{modes: modes, engine: engine} } + +// LoadFromState returns a mutator that loads the provided pre-computed state into the bundle, +// skipping the engine-specific state retrieval step. +func LoadFromState(state ExportedResourcesMap, modes ...LoadMode) bundle.Mutator { + return &loadFromState{state: state, modes: modes} +} diff --git a/bundle/statemgmt/upload_state_for_yaml_sync.go b/bundle/statemgmt/upload_state_for_yaml_sync.go index 5b1fbc3bf6..0399c7b31f 100644 --- a/bundle/statemgmt/upload_state_for_yaml_sync.go +++ b/bundle/statemgmt/upload_state_for_yaml_sync.go @@ -198,7 +198,7 @@ func (m *uploadStateForYamlSync) convertState(ctx context.Context, b *bundle.Bun } deploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { + if _, err := deploymentBundle.StateDB.Finalize(ctx); err != nil { return false, err } diff --git a/cmd/bundle/deployment/migrate.go b/cmd/bundle/deployment/migrate.go index f4512f4e1f..77d95e3533 100644 --- a/cmd/bundle/deployment/migrate.go +++ b/cmd/bundle/deployment/migrate.go @@ -282,7 +282,7 @@ To start using direct engine, set "engine: direct" under bundle in your databric } deploymentBundle.Apply(ctx, 
b.WorkspaceClient(ctx), plan, direct.MigrateMode(true)) - if err := deploymentBundle.StateDB.Finalize(ctx); err != nil { + if _, err := deploymentBundle.StateDB.Finalize(ctx); err != nil { logdiag.LogError(ctx, err) } if logdiag.HasError(ctx) { From deae01f8a215b9c323d4489f511edaa8ea8bed36 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 13:52:02 +0200 Subject: [PATCH 51/85] statemgmt.Load: accept state directly instead of engine Callers now extract state before calling Load (ExportState for direct, ParseResourcesState for terraform). This removes the engine dispatch from inside the mutator and makes the data flow explicit. Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 18 ++++++++------ bundle/statemgmt/state_load.go | 42 ++++---------------------------- cmd/bundle/generate/dashboard.go | 15 +++++++++++- cmd/bundle/utils/process.go | 14 ++++++++++- 4 files changed, 42 insertions(+), 47 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index e318fa1ffe..00452346df 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -75,10 +75,10 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta } // Flush WAL to state file on disk; capture the resulting state for Load below. 
- var directState statemgmt.ExportedResourcesMap + var state statemgmt.ExportedResourcesMap if targetEngine.IsDirect() { var err error - directState, err = b.DeploymentBundle.StateDB.Finalize(ctx) + state, err = b.DeploymentBundle.StateDB.Finalize(ctx) if err != nil { logdiag.LogError(ctx, err) } @@ -90,15 +90,17 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta return } - var loadMutator bundle.Mutator - if targetEngine.IsDirect() { - loadMutator = statemgmt.LoadFromState(directState) - } else { - loadMutator = statemgmt.Load(targetEngine) + if !targetEngine.IsDirect() { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return + } } bundle.ApplySeqContext(ctx, b, - loadMutator, + statemgmt.Load(state), metadata.Compute(), metadata.Upload(), statemgmt.UploadStateForYamlSync(targetEngine), diff --git a/bundle/statemgmt/state_load.go b/bundle/statemgmt/state_load.go index 4894fc08a6..573c69126c 100644 --- a/bundle/statemgmt/state_load.go +++ b/bundle/statemgmt/state_load.go @@ -9,9 +9,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" - "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/statemgmt/resourcestate" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" @@ -26,40 +24,15 @@ type ( const ErrorOnEmptyState LoadMode = 0 type load struct { - modes []LoadMode - engine engine.EngineType -} - -func (l *load) Name() string { - return "statemgmt.Load" -} - -func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - var state ExportedResourcesMap - - if l.engine.IsDirect() { - state = b.DeploymentBundle.ExportState(ctx) - } else { - var err error - state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - return diag.FromErr(err) - } - } 
- - return applyState(ctx, b, state, l.modes) -} - -type loadFromState struct { state ExportedResourcesMap modes []LoadMode } -func (l *loadFromState) Name() string { +func (l *load) Name() string { return "statemgmt.Load" } -func (l *loadFromState) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { +func (l *load) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { return applyState(ctx, b, l.state, l.modes) } @@ -181,12 +154,7 @@ func validateLoadedState(state ExportedResourcesMap, modes []LoadMode) error { return nil } -func Load(engine engine.EngineType, modes ...LoadMode) bundle.Mutator { - return &load{modes: modes, engine: engine} -} - -// LoadFromState returns a mutator that loads the provided pre-computed state into the bundle, -// skipping the engine-specific state retrieval step. -func LoadFromState(state ExportedResourcesMap, modes ...LoadMode) bundle.Mutator { - return &loadFromState{state: state, modes: modes} +// Load returns a mutator that merges the provided resource state into the bundle configuration. 
+func Load(state ExportedResourcesMap, modes ...LoadMode) bundle.Mutator { + return &load{state: state, modes: modes} } diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index ca02cc414e..fefcd0f6e6 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -19,6 +19,7 @@ import ( "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/phases" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/cmd/bundle/deployment" @@ -398,8 +399,20 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { } } + var state statemgmt.ExportedResourcesMap + if stateDesc.Engine.IsDirect() { + state = b.DeploymentBundle.ExportState(ctx) + } else { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return + } + } + bundle.ApplySeqContext(ctx, b, - statemgmt.Load(stateDesc.Engine), + statemgmt.Load(state), ) if logdiag.HasError(ctx) { return diff --git a/cmd/bundle/utils/process.go b/cmd/bundle/utils/process.go index c142f4d943..5f43cff6ac 100644 --- a/cmd/bundle/utils/process.go +++ b/cmd/bundle/utils/process.go @@ -11,6 +11,7 @@ import ( "github.com/databricks/cli/bundle/config/engine" "github.com/databricks/cli/bundle/config/mutator" "github.com/databricks/cli/bundle/config/validate" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/deployplan" "github.com/databricks/cli/bundle/direct" "github.com/databricks/cli/bundle/direct/dstate" @@ -200,8 +201,19 @@ func ProcessBundleRet(cmd *cobra.Command, opts ProcessOptions) (b *bundle.Bundle if opts.ErrorOnEmptyState { modes = append(modes, statemgmt.ErrorOnEmptyState) } + var state statemgmt.ExportedResourcesMap + if stateDesc.Engine.IsDirect() { + state = 
b.DeploymentBundle.ExportState(ctx) + } else { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + return b, stateDesc, root.ErrAlreadyPrinted + } + } mutators := []bundle.Mutator{ - statemgmt.Load(stateDesc.Engine, modes...), + statemgmt.Load(state, modes...), } // InitializeURLs makes an extra API call; only run it when URLs are needed. if opts.InitIDs { From f896d361dbef068b4ed2f8a89fc2f48130b459ce Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:00:47 +0200 Subject: [PATCH 52/85] fmt --- cmd/bundle/generate/dashboard.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index fefcd0f6e6..71f4f573cf 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -16,10 +16,10 @@ import ( "time" "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/bundle/generate" "github.com/databricks/cli/bundle/phases" - "github.com/databricks/cli/bundle/deploy/terraform" "github.com/databricks/cli/bundle/resources" "github.com/databricks/cli/bundle/statemgmt" "github.com/databricks/cli/cmd/bundle/deployment" From 08b304e5651975b518f524c02a2b6f0698c1d75a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:03:35 +0200 Subject: [PATCH 53/85] deployCore: move ParseResourcesState before PushResourcesState Both engines now capture post-apply state in the same location, before pushing. The two operations are independent reads of the terraform state file, so order relative to PushResourcesState does not matter. 
Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 00452346df..6c03ac8870 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -74,7 +74,9 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta bundle.ApplyContext(ctx, b, terraform.Apply()) } - // Flush WAL to state file on disk; capture the resulting state for Load below. + // Capture post-apply state for Load below. + // For direct: flush WAL to disk (Finalize) and capture the result. + // For terraform: parse the state file written by terraform.Apply. var state statemgmt.ExportedResourcesMap if targetEngine.IsDirect() { var err error @@ -82,6 +84,12 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta if err != nil { logdiag.LogError(ctx, err) } + } else { + var err error + state, err = terraform.ParseResourcesState(ctx, b) + if err != nil { + logdiag.LogError(ctx, err) + } } // Even if deployment failed, there might be updates in states that we need to upload @@ -90,15 +98,6 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta return } - if !targetEngine.IsDirect() { - var err error - state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - logdiag.LogError(ctx, err) - return - } - } - bundle.ApplySeqContext(ctx, b, statemgmt.Load(state), metadata.Compute(), From 32b498845b2998c753683b04f2c4794ed524c60c Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:26:00 +0200 Subject: [PATCH 54/85] simplify test --- .../bundle/deploy/wal/corrupted-wal-entry/script | 6 +----- .../bundle/deploy/wal/corrupted-wal-entry/test.toml | 13 ------------- 2 files changed, 1 insertion(+), 18 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml diff --git 
a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index 191a62f01f..043a13d997 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -12,11 +12,7 @@ echo "=== Final state (should have recovered entries) ===" cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' echo "=== Corrupted WAL entries file ===" -if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then - cat .databricks/bundle/default/resources.json.wal.corrupted -else - echo "Missing corrupted WAL entries file (unexpected)" -fi +cat .databricks/bundle/default/resources.json.wal.corrupted echo "=== WAL after successful deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml deleted file mode 100644 index 6245c19840..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/test.toml +++ /dev/null @@ -1,13 +0,0 @@ -# WAL with corrupted LAST entry - valid entries should be recovered, corrupted last line skipped. 
- -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get?job_id=1111" -Response.Body = '{"job_id": 1111, "settings": {"name": "valid-job"}}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get?job_id=2222" -Response.Body = '{"job_id": 2222, "settings": {"name": "another-valid"}}' From f628251fc0c3b6eb639e05d89486031a19340cef Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:47:46 +0200 Subject: [PATCH 55/85] update outputs --- .../bundle/deploy/wal/chain-3-jobs/output.txt | 4 +-- .../deploy/wal/corrupted-wal-entry/output.txt | 33 ++++++++++++------- .../corrupted-wal-entry/resources.json.wal | 4 --- .../deploy/wal/corrupted-wal-entry/script | 21 +++++++++--- .../deploy/wal/crash-after-create/output.txt | 2 +- .../deploy/wal/future-serial-wal/output.txt | 2 +- .../deploy/wal/lineage-mismatch/output.txt | 2 +- .../bundle/deploy/wal/stale-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/test.toml | 8 ----- 9 files changed, 44 insertions(+), 34 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 1f4b53f7cf..8ca8e388d3 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -17,7 +17,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_01", "v": { - "__id__": "[ID]", + "__id__": "1001", "state": { "deployment": { "kind": "BUNDLE", @@ -50,7 +50,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_02", "v": { - "__id__": "[ID]", + "__id__": "1001", "depends_on": [ { "label": "${resources.jobs.job_01.id}", diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index bd886c153f..f92d8a67ac 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,9 +1,8 @@ === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} -{"k":"resources.jobs.valid_job","v":{"__id__": "[ID]","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__": "[ID]","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial- -=== Deploy (should recover valid entries, skip corrupted last line) === +{"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== Deploy (should recover valid entries, skip corrupted last line) === >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input @@ -13,13 +12,23 @@ Deploying resources... Updating deployment state... Deployment complete! 
=== Final state (should have recovered entries) === -{ - "serial": [SERIAL], - "state_keys": [ - "resources.jobs.another_valid", - "resources.jobs.valid_job" - ] -} + +>>> [CLI] bundle summary +Name: wal-corrupted-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test/default +Resources: + Jobs: + another_valid: + Name: another-valid + URL: [DATABRICKS_URL]/jobs/[JOB2_ID]?o=[NUMID] + valid_job: + Name: valid-job + URL: [DATABRICKS_URL]/jobs/[JOB1_ID]?o=[NUMID] === Corrupted WAL entries file === -{"k":"resources.jobs.partial_write","v":{"__id__": "[ID]","state":{"name":"partial-=== WAL after successful deploy === + +>>> cat .databricks/bundle/default/resources.json.wal.corrupted +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal deleted file mode 100644 index 4791ba1281..0000000000 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal +++ /dev/null @@ -1,4 +0,0 @@ -{"lineage":"test-lineage-123","serial":6} -{"k":"resources.jobs.valid_job","v":{"__id__":"1111","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__":"2222","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"3333","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index 043a13d997..16cee304de 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -1,18 +1,31 @@ +# Create pre-existing jobs in the testserver so WAL recovery triggers DoUpdate (reset) instead of DoCreate +JOB1=$($CLI jobs create --json '{"name":"valid-job"}' | jq -r '.job_id') 
+JOB2=$($CLI jobs create --json '{"name":"another-valid"}' | jq -r '.job_id') +echo "$JOB1:JOB1_ID" >> ACC_REPLS +echo "$JOB2:JOB2_ID" >> ACC_REPLS + mkdir -p .databricks/bundle/default cp resources.json .databricks/bundle/default/ -cp resources.json.wal .databricks/bundle/default/ + +# Generate WAL with actual job IDs; truncate the partial_write entry to simulate corruption +{ + printf '{"lineage":"test-lineage-123","serial":6}\n' + printf '{"k":"resources.jobs.valid_job","v":{"__id__":"%s","state":{"name":"valid-job"}}}\n' "$JOB1" + printf '{"k":"resources.jobs.another_valid","v":{"__id__":"%s","state":{"name":"another-valid"}}}\n' "$JOB2" + printf '{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-' +} > .databricks/bundle/default/resources.json.wal echo "=== WAL content ===" cat .databricks/bundle/default/resources.json.wal echo "=== Deploy (should recover valid entries, skip corrupted last line) ===" -trace $CLI bundle deploy 2>&1 +trace $CLI bundle deploy echo "=== Final state (should have recovered entries) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys | sort)}' +trace $CLI bundle summary echo "=== Corrupted WAL entries file ===" -cat .databricks/bundle/default/resources.json.wal.corrupted +trace cat .databricks/bundle/default/resources.json.wal.corrupted echo "=== WAL after successful deploy ===" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index b3250b2db1..abc6d177f6 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -9,7 +9,7 @@ Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) {"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} 
-{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index 8fc16565fe..adb68c7c73 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -1,6 +1,6 @@ === WAL content === {"lineage":"test-lineage-123","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with corruption error) === >>> errcode [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt 
b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index f090a16163..53c517b583 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,6 +1,6 @@ === WAL content === {"lineage":"wal-lineage-bbb","serial": [SERIAL]} -{"k":"resources.jobs.test_job","v":{"__id__": "[ID]","state":{"name":"test-job"}}} +{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with lineage mismatch error) === >>> errcode [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt b/acceptance/bundle/deploy/wal/stale-wal/output.txt index a2066ccdd8..d51d94d965 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -1,6 +1,6 @@ === WAL content before deploy === {"lineage":"stale-test-lineage","serial": [SERIAL]} -{"k":"resources.jobs.stale_job","v":{"__id__": "[ID]","state":{"name":"stale-job"}}} +{"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} === Deploy (should ignore stale WAL) === >>> [CLI] bundle deploy diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 266d748049..c4b21c0113 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -50,14 +50,6 @@ New = '"lineage": "[UUID]"' Old = '"serial":\s*\d+' New = '"serial": [SERIAL]' -[[Repls]] -Old = '"__id__":\s*"\d+"' -New = '"__id__": "[ID]"' - -[[Repls]] -Old = '"job_id":\s*"\d+"' -New = '"job_id": "[ID]"' - # Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) [[Repls]] Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' From 2c6d40cc8b1c00eb45dc9fb80bf2ced246b6a1c9 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 14:55:51 +0200 Subject: [PATCH 56/85] fix Windows replacement for process kill during 
deployment The old Windows rule matched 'Exit code: [KILLED]' which was never present because 'Exit code: 1' (Windows exit code for kill) was never normalized to '[KILLED]' -- the [KILLED] normalization only fires via exit code 137 (Linux) or after [PROCESS_KILLED] is already inserted. Match 'Exit code: 1' directly (the raw Windows exit code), then insert [PROCESS_KILLED] and normalize in one step. Co-authored-by: Isaac --- acceptance/bundle/deploy/wal/test.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index c4b21c0113..23a203beb1 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -30,9 +30,9 @@ Old = '(\[PROCESS_KILLED\]\n\nExit code: )1' New = '${1}[KILLED]' # On Windows, no bash "Killed" message appears when CLI has produced output before termination. -# Insert [PROCESS_KILLED] between last output line and exit code for consistency. +# Match the raw exit code 1 (Windows never gets 137 or [PROCESS_KILLED] marker first). 
[[Repls]] -Old = '(Deploying resources\.\.\.)\n\nExit code: \[KILLED\]' +Old = '(Deploying resources\.\.\.)\n\nExit code: 1' New = """${1} [PROCESS_KILLED] From 38c175641fc1b0e0db0ba7568d16e8b2cc8f4f97 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:05:22 +0200 Subject: [PATCH 57/85] formatting --- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 3 ++- acceptance/bundle/deploy/wal/corrupted-wal-entry/script | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index f92d8a67ac..bf95cc1394 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -30,5 +30,6 @@ Resources: === Corrupted WAL entries file === >>> cat .databricks/bundle/default/resources.json.wal.corrupted -{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== WAL after successful deploy === +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- +=== WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index 16cee304de..b6b12c347b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -27,7 +27,7 @@ trace $CLI bundle summary echo "=== Corrupted WAL entries file ===" trace cat .databricks/bundle/default/resources.json.wal.corrupted -echo "=== WAL after successful deploy ===" +printf "\n=== WAL after successful deploy ===\n" if [ -f ".databricks/bundle/default/resources.json.wal" ]; then echo "WAL exists (unexpected)" else From f27e4ca1f896f55c2cd65173862ae55e71c247ba Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:07:45 +0200 Subject: [PATCH 58/85] clean up --- 
acceptance/bundle/deploy/wal/test.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 23a203beb1..de4389e6f8 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -38,10 +38,6 @@ New = """${1} Exit code: [KILLED]""" -[[Repls]] -Old = "\r" -New = '' - [[Repls]] Old = '"lineage":\s*"[0-9a-f-]+"' New = '"lineage": "[UUID]"' From e113406104bd9a98058ce7abdabcd72605cecfe8 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:09:11 +0200 Subject: [PATCH 59/85] rm unnecessary SERIAL replacement --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 2 +- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 2 +- acceptance/bundle/deploy/wal/crash-after-create/output.txt | 2 +- acceptance/bundle/deploy/wal/empty-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/future-serial-wal/output.txt | 2 +- acceptance/bundle/deploy/wal/lineage-mismatch/output.txt | 2 +- acceptance/bundle/deploy/wal/normal-deploy/output.txt | 2 +- acceptance/bundle/deploy/wal/stale-wal/output.txt | 4 ++-- acceptance/bundle/deploy/wal/test.toml | 4 ---- acceptance/bundle/deploy/wal/wal-with-delete/output.txt | 4 ++-- 10 files changed, 11 insertions(+), 15 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 8ca8e388d3..bb41f0784b 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -11,7 +11,7 @@ Exit code: [KILLED] { "cli_version": "[DEV_VERSION]", "lineage": "[UUID]", - "serial": [SERIAL], + "serial": 1, "state_version": 2 } { diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index bf95cc1394..d04d0389ec 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"test-lineage-123","serial": [SERIAL]} +{"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== Deploy (should recover valid entries, skip corrupted last line) === diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index abc6d177f6..cc6111ea9b 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -8,7 +8,7 @@ Deploying resources... Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) -{"lineage":"[UUID]","serial": [SERIAL],"state_version":2,"cli_version":"[DEV_VERSION]"} +{"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} {"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index 884f502744..b4ce67ee66 100644 --- 
a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -16,7 +16,7 @@ Corrupted WAL file missing (expected) === State file content === { "lineage": "[UUID]", - "serial": [SERIAL], + "serial": 1, "state_keys": [ "resources.jobs.test_job" ] diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt index adb68c7c73..48c23ddf84 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/output.txt +++ b/acceptance/bundle/deploy/wal/future-serial-wal/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"test-lineage-123","serial": [SERIAL]} +{"lineage":"test-lineage-123","serial":5} {"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with corruption error) === diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 53c517b583..00bc78cf28 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"wal-lineage-bbb","serial": [SERIAL]} +{"lineage":"wal-lineage-bbb","serial":2} {"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} === Deploy (should fail with lineage mismatch error) === diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt index ccb189ff09..2ca4f5f51c 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/output.txt +++ b/acceptance/bundle/deploy/wal/normal-deploy/output.txt @@ -9,7 +9,7 @@ WAL file deleted after successful deploy (expected) === State file content === { "lineage": "[UUID]", - "serial": [SERIAL], + "serial": 1, "state_keys": [ "resources.jobs.test_job" ] diff --git a/acceptance/bundle/deploy/wal/stale-wal/output.txt 
b/acceptance/bundle/deploy/wal/stale-wal/output.txt index d51d94d965..91a7a07643 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/output.txt +++ b/acceptance/bundle/deploy/wal/stale-wal/output.txt @@ -1,5 +1,5 @@ === WAL content before deploy === -{"lineage":"stale-test-lineage","serial": [SERIAL]} +{"lineage":"stale-test-lineage","serial":1} {"k":"resources.jobs.stale_job","v":{"__id__":"9999","state":{"name":"stale-job"}}} === Deploy (should ignore stale WAL) === @@ -12,7 +12,7 @@ Deployment complete! Stale WAL deleted (expected) === State file should NOT contain stale_job === { - "serial": [SERIAL], + "serial": 3, "state_keys": [ "resources.jobs.test_job" ] diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index de4389e6f8..0ee34873e0 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -42,10 +42,6 @@ Exit code: [KILLED]""" Old = '"lineage":\s*"[0-9a-f-]+"' New = '"lineage": "[UUID]"' -[[Repls]] -Old = '"serial":\s*\d+' -New = '"serial": [SERIAL]' - # Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) [[Repls]] Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index a7960906d3..c08e365177 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -1,5 +1,5 @@ === WAL content === -{"lineage":"delete-test-lineage","serial": [SERIAL]} +{"lineage":"delete-test-lineage","serial":2} {"k":"resources.jobs.test_job","v":null} === Deploy (should recover delete from WAL) === @@ -10,7 +10,7 @@ Updating deployment state... Deployment complete! 
=== Final state (should have no jobs) === { - "serial": [SERIAL], + "serial": 2, "state_keys": [] } === WAL after successful deploy === From 190ce16f319da3bcebf94cfb23daac86048fe25a Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:09:56 +0200 Subject: [PATCH 60/85] rm noop replacement for lineage --- acceptance/bundle/deploy/wal/test.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 0ee34873e0..0e2bc852dc 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -38,10 +38,6 @@ New = """${1} Exit code: [KILLED]""" -[[Repls]] -Old = '"lineage":\s*"[0-9a-f-]+"' -New = '"lineage": "[UUID]"' - # Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) [[Repls]] Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' From eca0376abddf928f2f190d86652b484d13205dd8 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 15:13:11 +0200 Subject: [PATCH 61/85] clean up --- acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml | 3 --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 2 -- .../bundle/deploy/wal/corrupted-wal-entry/databricks.yml | 2 -- .../bundle/deploy/wal/crash-after-create/databricks.yml | 2 -- acceptance/bundle/deploy/wal/crash-after-create/output.txt | 2 +- acceptance/bundle/deploy/wal/empty-wal/databricks.yml | 1 - .../bundle/deploy/wal/future-serial-wal/databricks.yml | 1 - acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml | 1 - acceptance/bundle/deploy/wal/normal-deploy/databricks.yml | 1 - acceptance/bundle/deploy/wal/stale-wal/databricks.yml | 1 - acceptance/bundle/deploy/wal/test.toml | 5 ----- 11 files changed, 1 insertion(+), 20 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml index fc3a46205b..342a451623 
100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/databricks.yml @@ -15,7 +15,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 job_02: name: "job-02" description: "depends on ${resources.jobs.job_01.id}" @@ -26,7 +25,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 job_03: name: "job-03" description: "depends on ${resources.jobs.job_02.id}" @@ -37,4 +35,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index bb41f0784b..8c70ebafa3 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -35,7 +35,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { @@ -74,7 +73,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml index cc9024fada..a7a5cc2dfe 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/databricks.yml @@ -12,7 +12,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 another_valid: name: "another-valid" tasks: @@ -22,4 +21,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml index 
31480454c5..25b2efe2f8 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml +++ b/acceptance/bundle/deploy/wal/crash-after-create/databricks.yml @@ -13,7 +13,6 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 job_b: name: "test-job-b" description: "depends on ${resources.jobs.job_a.id}" @@ -24,4 +23,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index cc6111ea9b..a5cdd4f40d 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -9,7 +9,7 @@ Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) {"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first 
job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/empty-wal/databricks.yml b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml index 147a1e1482..8da92255ff 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/databricks.yml +++ b/acceptance/bundle/deploy/wal/empty-wal/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml index 67079aaef8..56fa131337 100644 --- a/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml +++ b/acceptance/bundle/deploy/wal/future-serial-wal/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml index 014ec7f886..32461d1467 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml index 413705d40c..4439322e0e 100644 --- a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml +++ 
b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/stale-wal/databricks.yml b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml index 6b24f6fd26..443283607e 100644 --- a/acceptance/bundle/deploy/wal/stale-wal/databricks.yml +++ b/acceptance/bundle/deploy/wal/stale-wal/databricks.yml @@ -12,4 +12,3 @@ resources: new_cluster: spark_version: 15.4.x-scala2.12 node_type_id: i3.xlarge - num_workers: 0 diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 0e2bc852dc..2be1964ae6 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -37,8 +37,3 @@ New = """${1} [PROCESS_KILLED] Exit code: [KILLED]""" - -# Strip single-node cluster warnings (they appear in varying order and aren't relevant to WAL tests) -[[Repls]] -Old = '(?s)Warning: Single node cluster.*?ResourceClass: SingleNode\n \n\n' -New = '' From 78ef5f1a6c5430735971ce8e0fb5e74440201de2 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:24:11 +0200 Subject: [PATCH 62/85] testserver: replace KillCaller config with HTTP kill API Move kill-on-request behavior from test.toml fields (KillCaller, KillCallerOffset) to a POST /__testserver/kill endpoint. Kill rules are scoped by auth token so concurrent tests sharing a server don't interfere. acceptance/bin/kill_after.py is a convenience wrapper that posts to the endpoint, keeping scripts readable. The kill check is applied at the HTTP middleware layer (wrapping the entire router) so it fires for all requests, including those that would otherwise fall through to the not-found handler. 
Co-authored-by: Isaac --- acceptance/bin/kill_after.py | 39 +++++++ .../bundle/deploy/wal/chain-3-jobs/script | 2 + .../bundle/deploy/wal/chain-3-jobs/test.toml | 2 - .../deploy/wal/crash-after-create/script | 2 + .../deploy/wal/crash-after-create/test.toml | 1 - acceptance/internal/config.go | 12 -- acceptance/internal/prepare_server.go | 57 --------- .../selftest/kill_caller/currentuser/script | 1 + .../kill_caller/currentuser/test.toml | 3 - .../kill_caller/multi_pattern/output.txt | 4 +- .../selftest/kill_caller/multi_pattern/script | 3 + .../kill_caller/multi_pattern/test.toml | 16 +-- .../selftest/kill_caller/multiple/output.txt | 4 +- .../selftest/kill_caller/multiple/script | 2 + .../selftest/kill_caller/multiple/test.toml | 9 -- .../selftest/kill_caller/offset/output.txt | 12 +- acceptance/selftest/kill_caller/offset/script | 2 + .../selftest/kill_caller/offset/test.toml | 10 -- .../selftest/kill_caller/workspace/script | 1 + .../selftest/kill_caller/workspace/test.toml | 3 - libs/testserver/kill.go | 108 ++++++++++++++++++ .../testserver}/process_unix.go | 2 +- .../testserver}/process_windows.go | 2 +- libs/testserver/server.go | 26 ++++- 24 files changed, 198 insertions(+), 125 deletions(-) create mode 100755 acceptance/bin/kill_after.py create mode 100644 libs/testserver/kill.go rename {acceptance/internal => libs/testserver}/process_unix.go (94%) rename {acceptance/internal => libs/testserver}/process_windows.go (96%) diff --git a/acceptance/bin/kill_after.py b/acceptance/bin/kill_after.py new file mode 100755 index 0000000000..029123a13f --- /dev/null +++ b/acceptance/bin/kill_after.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +"""Set up a kill rule on the testserver for the current test token. + +Usage: kill_after.py PATTERN OFFSET TIMES + + PATTERN HTTP method and path, e.g. 
"POST /api/2.2/jobs/create" + OFFSET number of requests to let through before killing starts + TIMES number of times to kill the caller + +The rule is scoped to the current DATABRICKS_TOKEN so it only affects +the test that registers it, even when tests share a server. +""" + +import json +import os +import sys +import urllib.request + +host = os.environ.get("DATABRICKS_HOST", "") +token = os.environ.get("DATABRICKS_TOKEN", "") + +if not host: + print("DATABRICKS_HOST not set", file=sys.stderr) + sys.exit(1) + +if len(sys.argv) != 4: + print(f"usage: {sys.argv[0]} PATTERN OFFSET TIMES", file=sys.stderr) + sys.exit(1) + +pattern, offset, times = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]) + +data = json.dumps({"pattern": pattern, "offset": offset, "times": times}).encode() +req = urllib.request.Request( + f"{host}/__testserver/kill", + data=data, + headers={"Content-Type": "application/json", "Authorization": f"Bearer {token}"}, + method="POST", +) +urllib.request.urlopen(req) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index 6c9993c280..a1196f10c1 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -1,3 +1,5 @@ +kill_after.py "POST /api/2.2/jobs/create" 2 1 + echo "=== First deploy (crashes on job_03) ===" trace errcode $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml index 2425c89dea..746896a789 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml @@ -3,8 +3,6 @@ [[Server]] Pattern = "POST /api/2.2/jobs/create" -KillCallerOffset = 2 -KillCaller = 1 Response.Body = '{"job_id": 1001}' [[Server]] diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index d09f6ab06e..bb33d67870 100644 --- 
a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,3 +1,5 @@ +kill_after.py "GET /api/2.2/jobs/get" 0 1 + echo "=== First deploy (crashes after job_a create, before job_b) ===" trace errcode $CLI bundle deploy diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index eebad72de5..d1e99eadb7 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -12,5 +12,4 @@ Response.Body = '{}' [[Server]] Pattern = "GET /api/2.2/jobs/get" -KillCaller = 1 Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/internal/config.go b/acceptance/internal/config.go index dc63911173..559e11d0ca 100644 --- a/acceptance/internal/config.go +++ b/acceptance/internal/config.go @@ -153,18 +153,6 @@ type ServerStub struct { // Configure as "1ms", "2s", "3m", etc. // See [time.ParseDuration] for details. Delay time.Duration - - // Number of times to kill the caller process before returning normal responses. - // 0 = never kill (default), 1 = kill once then allow, 2 = kill twice then allow, etc. - // Useful for testing crash recovery scenarios where first deploy crashes but retry succeeds. - // Requires DATABRICKS_CLI_TEST_PID=1 to be set in the test environment. - KillCaller int - - // Number of requests to let pass before starting to kill. - // Combined with KillCaller, this creates a window: requests 1 to Offset succeed, - // requests Offset+1 to Offset+KillCaller are killed, rest succeed. - // Example: KillCallerOffset=9, KillCaller=1 means let 9 requests pass, kill the 10th. 
- KillCallerOffset int } // FindConfigs finds all the config relevant for this test, diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index 2f1b6712a2..f8be1ae947 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -183,10 +183,6 @@ func startLocalServer(t *testing.T, s.ResponseCallback = logResponseCallback(t) } - killCounters := make(map[string]int) - offsetCounters := make(map[string]int) - killCountersMu := &sync.Mutex{} - for ind := range stubs { // Later stubs take precedence over earlier ones (leaf configs override parent configs). // The first handler registered for a given pattern wins, so we reverse the order. @@ -195,11 +191,6 @@ func startLocalServer(t *testing.T, items := strings.Split(stub.Pattern, " ") require.Len(t, items, 2) - if stub.KillCaller > 0 { - killCounters[stub.Pattern] = stub.KillCaller - offsetCounters[stub.Pattern] = stub.KillCallerOffset - } - s.Handle(items[0], items[1], func(req testserver.Request) any { if stub.Delay > 0 { ctx := req.Context @@ -218,10 +209,6 @@ func startLocalServer(t *testing.T, } } - if shouldKillCaller(stub, offsetCounters, killCounters, killCountersMu) { - killCaller(t, stub.Pattern, req.Headers) - } - return stub.Response }) } @@ -232,50 +219,6 @@ func startLocalServer(t *testing.T, return s.URL } -func shouldKillCaller(stub ServerStub, offsetCounters, killCounters map[string]int, mu *sync.Mutex) bool { - if stub.KillCaller <= 0 { - return false - } - mu.Lock() - defer mu.Unlock() - - if offsetCounters[stub.Pattern] > 0 { - offsetCounters[stub.Pattern]-- - return false - } - - if killCounters[stub.Pattern] <= 0 { - return false - } - killCounters[stub.Pattern]-- - return true -} - -func killCaller(t *testing.T, pattern string, headers http.Header) { - pid := testserver.ExtractPidFromHeaders(headers) - if pid == 0 { - t.Errorf("KillCaller configured but test-pid not found in User-Agent") - return - } - - process, err := 
os.FindProcess(pid) - if err != nil { - t.Errorf("Failed to find process %d: %s", pid, err) - return - } - - // Use process.Kill() for cross-platform compatibility. - // On Unix, this sends SIGKILL. On Windows, this calls TerminateProcess. - if err := process.Kill(); err != nil { - t.Errorf("Failed to kill process %d: %s", pid, err) - return - } - - if !waitForProcessExit(pid, 2*time.Second) { - t.Logf("KillCaller: timed out waiting for PID %d to exit (pattern: %s)", pid, pattern) - } - t.Logf("KillCaller: killed PID %d (pattern: %s)", pid, pattern) -} func startProxyServer(t *testing.T, recordRequests bool, diff --git a/acceptance/selftest/kill_caller/currentuser/script b/acceptance/selftest/kill_caller/currentuser/script index 821c42d8cf..bbac4ab29a 100644 --- a/acceptance/selftest/kill_caller/currentuser/script +++ b/acceptance/selftest/kill_caller/currentuser/script @@ -1,2 +1,3 @@ +kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 1 trace errcode $CLI current-user me echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/currentuser/test.toml b/acceptance/selftest/kill_caller/currentuser/test.toml index b76fe401fc..f631136715 100644 --- a/acceptance/selftest/kill_caller/currentuser/test.toml +++ b/acceptance/selftest/kill_caller/currentuser/test.toml @@ -1,4 +1 @@ # Kill the CLI when it calls /Me endpoint (once, then allow) -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCaller = 1 diff --git a/acceptance/selftest/kill_caller/multi_pattern/output.txt b/acceptance/selftest/kill_caller/multi_pattern/output.txt index 9b41f23ec4..b352842835 100644 --- a/acceptance/selftest/kill_caller/multi_pattern/output.txt +++ b/acceptance/selftest/kill_caller/multi_pattern/output.txt @@ -13,8 +13,8 @@ Me attempt 2 done >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Me attempt 3 done - success! 
diff --git a/acceptance/selftest/kill_caller/multi_pattern/script b/acceptance/selftest/kill_caller/multi_pattern/script index ba9447a29a..e0b5523c45 100644 --- a/acceptance/selftest/kill_caller/multi_pattern/script +++ b/acceptance/selftest/kill_caller/multi_pattern/script @@ -1,3 +1,6 @@ +kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 2 +kill_after.py "GET /api/2.0/workspace/list" 0 1 + # Test pattern 1: /Me endpoint (kills first 2, then allows) trace errcode $CLI current-user me echo "Me attempt 1 done" diff --git a/acceptance/selftest/kill_caller/multi_pattern/test.toml b/acceptance/selftest/kill_caller/multi_pattern/test.toml index 08bdc17085..4565475423 100644 --- a/acceptance/selftest/kill_caller/multi_pattern/test.toml +++ b/acceptance/selftest/kill_caller/multi_pattern/test.toml @@ -1,17 +1,5 @@ -# Test that multiple patterns can have independent KillCaller counts -# Pattern 1: Kill first 2 requests to /Me endpoint -# Pattern 2: Kill first 1 request to /workspace/list endpoint - -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCaller = 2 -Response.Body = ''' -{ - "id": "123", - "userName": "test@example.com" -} -''' +# Test that multiple patterns can have independent kill counts [[Server]] Pattern = "GET /api/2.0/workspace/list" -KillCaller = 1 +Response.Body = '{"objects": []}' diff --git a/acceptance/selftest/kill_caller/multiple/output.txt b/acceptance/selftest/kill_caller/multiple/output.txt index 27b034cfcb..3b6aea849f 100644 --- a/acceptance/selftest/kill_caller/multiple/output.txt +++ b/acceptance/selftest/kill_caller/multiple/output.txt @@ -19,7 +19,7 @@ Attempt 3 done >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 4 done - success! 
diff --git a/acceptance/selftest/kill_caller/multiple/script b/acceptance/selftest/kill_caller/multiple/script index 03628e203e..a3659bf58f 100644 --- a/acceptance/selftest/kill_caller/multiple/script +++ b/acceptance/selftest/kill_caller/multiple/script @@ -1,3 +1,5 @@ +kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 3 + # First 3 attempts should be killed trace errcode $CLI current-user me echo "Attempt 1 done" diff --git a/acceptance/selftest/kill_caller/multiple/test.toml b/acceptance/selftest/kill_caller/multiple/test.toml index 5485fc6a6b..24f7ca1922 100644 --- a/acceptance/selftest/kill_caller/multiple/test.toml +++ b/acceptance/selftest/kill_caller/multiple/test.toml @@ -1,10 +1 @@ # Kill the CLI 3 times, then allow the 4th request to succeed -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCaller = 3 -Response.Body = ''' -{ - "id": "123", - "userName": "test@example.com" -} -''' diff --git a/acceptance/selftest/kill_caller/offset/output.txt b/acceptance/selftest/kill_caller/offset/output.txt index cb87595a2c..b6959aec5e 100644 --- a/acceptance/selftest/kill_caller/offset/output.txt +++ b/acceptance/selftest/kill_caller/offset/output.txt @@ -1,15 +1,15 @@ >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 1 done - success (offset) >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 2 done - success (offset) @@ -27,7 +27,7 @@ Attempt 4 done - killed >>> [CLI] current-user me { - "id": "123", - "userName": "test@example.com" + "id": "[USERID]", + "userName": "[USERNAME]" } Attempt 5 done - success (past kill window) diff --git a/acceptance/selftest/kill_caller/offset/script b/acceptance/selftest/kill_caller/offset/script index 3411e87480..1bf3d0d4c2 100644 --- a/acceptance/selftest/kill_caller/offset/script +++ b/acceptance/selftest/kill_caller/offset/script @@ -1,3 +1,5 @@ 
+kill_after.py "GET /api/2.0/preview/scim/v2/Me" 2 2 + # First 2 attempts should succeed (offset period) trace $CLI current-user me echo "Attempt 1 done - success (offset)" diff --git a/acceptance/selftest/kill_caller/offset/test.toml b/acceptance/selftest/kill_caller/offset/test.toml index 5eab09dbfa..7b8d50906c 100644 --- a/acceptance/selftest/kill_caller/offset/test.toml +++ b/acceptance/selftest/kill_caller/offset/test.toml @@ -1,11 +1 @@ # Let first 2 requests pass, kill next 2, then allow rest -[[Server]] -Pattern = "GET /api/2.0/preview/scim/v2/Me" -KillCallerOffset = 2 -KillCaller = 2 -Response.Body = ''' -{ - "id": "123", - "userName": "test@example.com" -} -''' diff --git a/acceptance/selftest/kill_caller/workspace/script b/acceptance/selftest/kill_caller/workspace/script index 076972136c..8fb9dab3f1 100644 --- a/acceptance/selftest/kill_caller/workspace/script +++ b/acceptance/selftest/kill_caller/workspace/script @@ -1,2 +1,3 @@ +kill_after.py "GET /api/2.0/workspace/list" 0 1 trace errcode $CLI workspace list / echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/workspace/test.toml b/acceptance/selftest/kill_caller/workspace/test.toml index eac10a6329..80d2fbbfd1 100644 --- a/acceptance/selftest/kill_caller/workspace/test.toml +++ b/acceptance/selftest/kill_caller/workspace/test.toml @@ -1,4 +1 @@ # Kill the CLI when it calls workspace list endpoint (once, then allow) -[[Server]] -Pattern = "GET /api/2.0/workspace/list" -KillCaller = 1 diff --git a/libs/testserver/kill.go b/libs/testserver/kill.go new file mode 100644 index 0000000000..e24b13a0f1 --- /dev/null +++ b/libs/testserver/kill.go @@ -0,0 +1,108 @@ +package testserver + +import ( + "encoding/json" + "net/http" + "os" + "sync" + "time" + + "github.com/databricks/cli/internal/testutil" +) + +type killRuleKey struct { + token string + pattern string // "METHOD /path" +} + +type killRule struct { + offset int + times int +} + +type killRules struct { + mu sync.Mutex + 
rules map[killRuleKey]*killRule +} + +func newKillRules() *killRules { + return &killRules{rules: make(map[killRuleKey]*killRule)} +} + +func (kr *killRules) set(token, pattern string, offset, times int) { + kr.mu.Lock() + defer kr.mu.Unlock() + kr.rules[killRuleKey{token: token, pattern: pattern}] = &killRule{offset: offset, times: times} +} + +// check returns true if the caller should be killed for this request. +// It also performs the kill. +func (kr *killRules) check(t testutil.TestingT, method, path, token string, headers http.Header) bool { + pattern := method + " " + path + key := killRuleKey{token: token, pattern: pattern} + + kr.mu.Lock() + rule, ok := kr.rules[key] + if !ok { + kr.mu.Unlock() + return false + } + if rule.offset > 0 { + rule.offset-- + kr.mu.Unlock() + return false + } + if rule.times <= 0 { + delete(kr.rules, key) + kr.mu.Unlock() + return false + } + rule.times-- + if rule.times == 0 { + delete(kr.rules, key) + } + kr.mu.Unlock() + + killProcess(t, pattern, headers) + return true +} + +func killProcess(t testutil.TestingT, pattern string, headers http.Header) { + pid := ExtractPidFromHeaders(headers) + if pid == 0 { + t.Errorf("kill rule matched %q but test-pid not found in User-Agent", pattern) + return + } + + process, err := os.FindProcess(pid) + if err != nil { + t.Errorf("Failed to find process %d: %s", pid, err) + return + } + + if err := process.Kill(); err != nil { + t.Errorf("Failed to kill process %d: %s", pid, err) + return + } + + if !waitForProcessExit(pid, 2*time.Second) { + t.Logf("kill: timed out waiting for PID %d to exit (pattern: %s)", pid, pattern) + } + t.Logf("kill: killed PID %d (pattern: %s)", pid, pattern) +} + +// killEndpointHandler returns a HandlerFunc for POST /__testserver/kill. 
+func killEndpointHandler(kr *killRules) HandlerFunc { + return func(req Request) any { + var body struct { + Pattern string `json:"pattern"` + Offset int `json:"offset"` + Times int `json:"times"` + } + if err := json.Unmarshal(req.Body, &body); err != nil { + return Response{StatusCode: 400, Body: map[string]string{"error": err.Error()}} + } + kr.set(req.Token, body.Pattern, body.Offset, body.Times) + return Response{StatusCode: 200} + } +} diff --git a/acceptance/internal/process_unix.go b/libs/testserver/process_unix.go similarity index 94% rename from acceptance/internal/process_unix.go rename to libs/testserver/process_unix.go index 1e0b0ead3e..8b82187580 100644 --- a/acceptance/internal/process_unix.go +++ b/libs/testserver/process_unix.go @@ -1,6 +1,6 @@ //go:build linux || darwin -package internal +package testserver import ( "syscall" diff --git a/acceptance/internal/process_windows.go b/libs/testserver/process_windows.go similarity index 96% rename from acceptance/internal/process_windows.go rename to libs/testserver/process_windows.go index fdad8b4f5e..2a32fe4ede 100644 --- a/acceptance/internal/process_windows.go +++ b/libs/testserver/process_windows.go @@ -1,6 +1,6 @@ //go:build windows -package internal +package testserver import ( "time" diff --git a/libs/testserver/server.go b/libs/testserver/server.go index 40556e5529..aa05aee5ab 100644 --- a/libs/testserver/server.go +++ b/libs/testserver/server.go @@ -46,6 +46,8 @@ type Server struct { fakeOidc *FakeOidc mu sync.Mutex + kills *killRules + RequestCallback func(request *Request) ResponseCallback func(request *Request, response *EncodedResponse) } @@ -58,6 +60,7 @@ type Request struct { Vars map[string]string Workspace *FakeWorkspace Context context.Context + Token string } type Response struct { @@ -200,7 +203,19 @@ func getHeaders(value []byte) http.Header { func New(t testutil.TestingT) *Server { router := NewRouter() - server := httptest.NewServer(router) + kills := newKillRules() + + // Wrap 
the router so kill rules fire for ALL requests, including those with + // no registered handler that would otherwise bypass serve() entirely. + killMiddleware := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + token := getToken(r) + if kills.check(t, r.Method, r.URL.Path, token, r.Header) { + return + } + router.ServeHTTP(w, r) + }) + + server := httptest.NewServer(killMiddleware) t.Cleanup(server.Close) s := &Server{ @@ -209,6 +224,7 @@ func New(t testutil.TestingT) *Server { t: t, fakeWorkspaces: map[string]*FakeWorkspace{}, fakeOidc: &FakeOidc{url: server.URL}, + kills: kills, } router.Dispatch = s.serve @@ -258,6 +274,9 @@ Response.Body = '' }) router.NotFound = notFoundFunc + // Register a test-only endpoint for setting up kill rules from scripts. + s.Handle("POST", "/__testserver/kill", killEndpointHandler(s.kills)) + // Register a default handler for the SDK's host metadata discovery endpoint. // The SDK resolves this during config initialization (as of v0.126.0) to // determine workspace/account IDs, cloud, and OIDC endpoints. Without this @@ -289,12 +308,15 @@ func (s *Server) getWorkspaceForToken(token string) *FakeWorkspace { } func (s *Server) serve(w http.ResponseWriter, r *http.Request, handler HandlerFunc, vars map[string]string) { + token := getToken(r) + // Each test uses unique DATABRICKS_TOKEN, we simulate each token having // it's own fake fakeWorkspace to avoid interference between tests. 
- fakeWorkspace := s.getWorkspaceForToken(getToken(r)) + fakeWorkspace := s.getWorkspaceForToken(token) request := NewRequest(s.t, r, fakeWorkspace) request.Vars = vars + request.Token = token if s.RequestCallback != nil { s.RequestCallback(&request) From 284c4db9da5acb2db48cd9fadbd5b4e9a4fa54c1 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:25:44 +0200 Subject: [PATCH 63/85] remove blank line --- acceptance/internal/prepare_server.go | 1 - 1 file changed, 1 deletion(-) diff --git a/acceptance/internal/prepare_server.go b/acceptance/internal/prepare_server.go index f8be1ae947..299d48f03e 100644 --- a/acceptance/internal/prepare_server.go +++ b/acceptance/internal/prepare_server.go @@ -219,7 +219,6 @@ func startLocalServer(t *testing.T, return s.URL } - func startProxyServer(t *testing.T, recordRequests bool, logRequests bool, From d2362e202ddf5b4a42fc56ee62af0f226a939c5c Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:36:14 +0200 Subject: [PATCH 64/85] wal tests: remove redundant server stubs covered by default handlers Co-authored-by: Isaac --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 10 +++++----- acceptance/bundle/deploy/wal/chain-3-jobs/test.toml | 12 ------------ .../bundle/deploy/wal/crash-after-create/output.txt | 2 +- .../bundle/deploy/wal/crash-after-create/test.toml | 11 ----------- 4 files changed, 6 insertions(+), 29 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 8c70ebafa3..e675bb689d 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -17,7 +17,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_01", "v": { - "__id__": "1001", + "__id__": "[NUMID]", "state": { "deployment": { "kind": "BUNDLE", @@ -49,7 +49,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_02", "v": { - "__id__": "1001", + "__id__": "[NUMID]", "depends_on": 
[ { "label": "${resources.jobs.job_01.id}", @@ -61,7 +61,7 @@ Exit code: [KILLED] "kind": "BUNDLE", "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" }, - "description": "depends on 1001", + "description": "depends on [NUMID]", "edit_mode": "UI_LOCKED", "format": "MULTI_TASK", "max_concurrent_runs": 1, @@ -98,10 +98,10 @@ Resources: Jobs: job_01: Name: job-01 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] job_02: Name: job-02 - URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] job_03: Name: job-03 URL: (not deployed) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml index 746896a789..932f3ae97a 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml @@ -1,14 +1,2 @@ # Linear chain: job_01 -> job_02 -> job_03 # Let first 2 jobs/create succeed, then kill on the 3rd - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index a5cdd4f40d..4eb2e1ea12 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -9,7 +9,7 @@ Exit code: [KILLED] === WAL should exist after crash === WAL exists (expected) {"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} 
-{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} +{"k":"resources.jobs.job_a","v":{"__id__":"[NUMID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} === State file after crash (should be empty) === cat: .databricks/bundle/default/resources.json: No such file or directory diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml index d1e99eadb7..8e4ca4a849 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -2,14 +2,3 @@ # Second deploy recovers from WAL and completes successfully. # job_b depends on job_a, so jobs/get is called after job_a's SaveState. 
-[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "POST /api/2.2/jobs/reset" -Response.Body = '{}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' From 22fd654cab29a827eb4eca1063dfd5b9aca2e3e5 Mon Sep 17 00:00:00 2001 From: Denis Bilenko Date: Mon, 11 May 2026 16:40:21 +0200 Subject: [PATCH 65/85] wal tests: move test.toml comments to script, remove empty test.toml files Co-authored-by: Isaac --- acceptance/bundle/deploy/wal/chain-3-jobs/script | 2 ++ acceptance/bundle/deploy/wal/chain-3-jobs/test.toml | 2 -- acceptance/bundle/deploy/wal/crash-after-create/script | 3 +++ acceptance/bundle/deploy/wal/crash-after-create/test.toml | 4 ---- 4 files changed, 5 insertions(+), 6 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/chain-3-jobs/test.toml delete mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.toml diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index a1196f10c1..2bd55befcd 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -1,3 +1,5 @@ +# Linear chain: job_01 -> job_02 -> job_03 +# Let first 2 jobs/create succeed, then kill on the 3rd kill_after.py "POST /api/2.2/jobs/create" 2 1 echo "=== First deploy (crashes on job_03) ===" diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml b/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml deleted file mode 100644 index 932f3ae97a..0000000000 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/test.toml +++ /dev/null @@ -1,2 +0,0 @@ -# Linear chain: job_01 -> job_02 -> job_03 -# Let first 2 jobs/create succeed, then kill on the 3rd diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index bb33d67870..f4dba936bb 100644 --- 
a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,3 +1,6 @@ +# WAL recovery after real crash. First deploy creates job_a then crashes. +# Second deploy recovers from WAL and completes successfully. +# job_b depends on job_a, so jobs/get is called after job_a's SaveState. kill_after.py "GET /api/2.2/jobs/get" 0 1 echo "=== First deploy (crashes after job_a create, before job_b) ===" diff --git a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml deleted file mode 100644 index 8e4ca4a849..0000000000 --- a/acceptance/bundle/deploy/wal/crash-after-create/test.toml +++ /dev/null @@ -1,4 +0,0 @@ -# WAL recovery after real crash. First deploy creates job_a then crashes. -# Second deploy recovers from WAL and completes successfully. -# job_b depends on job_a, so jobs/get is called after job_a's SaveState. - From 853b56b0dae0cfee46f09056a9dee040611bca83 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 16:55:17 +0200 Subject: [PATCH 66/85] Add databricks.yml --- databricks.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 databricks.yml diff --git a/databricks.yml b/databricks.yml new file mode 100644 index 0000000000..7cf210722a --- /dev/null +++ b/databricks.yml @@ -0,0 +1,19 @@ +bundle: + name: git + git: + # This is currently not supported + branch: ${var.deployment_branch} + +variables: + deployment_branch: + # By setting deployment_branch to "" we set bundle.git.branch to "" which is the same as unsetting it. + # This should make CLI read branch from git and update bundle.git.branch accordingly. It should + # also set bundle.git.inferred to true.
+ default: "" + +targets: + prod: + default: true + dev: + variables: + deployment_branch: dev-branch From 607f657270ef57ddd8b2fe73adc3810087d91329 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:01:35 +0200 Subject: [PATCH 67/85] clean up --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 8 -------- acceptance/bundle/deploy/wal/chain-3-jobs/script | 4 ---- 2 files changed, 12 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index e675bb689d..1458577432 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -106,13 +106,5 @@ Resources: Name: job-03 URL: (not deployed) -=== Second deploy (recovery) === - ->>> [CLI] bundle deploy --force-lock -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! 
- === WAL after successful deploy === WAL deleted (expected) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index 2bd55befcd..e874a0fcac 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -17,10 +17,6 @@ echo "" echo "=== Bundle summary (reads from WAL) ===" $CLI bundle summary -echo "" -echo "=== Second deploy (recovery) ===" -trace $CLI bundle deploy --force-lock - echo "" echo "=== WAL after successful deploy ===" cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)" From 4b4a022e29a6f464595e4f0e1c694f1091e4055b Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:02:03 +0200 Subject: [PATCH 68/85] add replace_ids.py --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 10 +++++----- acceptance/bundle/deploy/wal/chain-3-jobs/script | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index 1458577432..7e04ba4dae 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -17,7 +17,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_01", "v": { - "__id__": "[NUMID]", + "__id__": "[JOB_01_ID]", "state": { "deployment": { "kind": "BUNDLE", @@ -49,7 +49,7 @@ Exit code: [KILLED] { "k": "resources.jobs.job_02", "v": { - "__id__": "[NUMID]", + "__id__": "[JOB_02_ID]", "depends_on": [ { "label": "${resources.jobs.job_01.id}", @@ -61,7 +61,7 @@ Exit code: [KILLED] "kind": "BUNDLE", "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json" }, - "description": "depends on [NUMID]", + "description": "depends on [JOB_01_ID]", "edit_mode": "UI_LOCKED", "format": "MULTI_TASK", "max_concurrent_runs": 1, @@ -98,10 +98,10 @@ Resources: Jobs: job_01: Name: 
job-01 - URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[JOB_01_ID]?o=[NUMID] job_02: Name: job-02 - URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[JOB_02_ID]?o=[NUMID] job_03: Name: job-03 URL: (not deployed) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/script b/acceptance/bundle/deploy/wal/chain-3-jobs/script index e874a0fcac..a5afc6f51d 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/script +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/script @@ -20,3 +20,5 @@ $CLI bundle summary echo "" echo "=== WAL after successful deploy ===" cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)" + +replace_ids.py From 29e0ca52deeb2eddb62a9c30e81d0242c8816ece Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:03:39 +0200 Subject: [PATCH 69/85] clean up --- .../bundle/deploy/wal/corrupted-wal-entry/output.txt | 8 +++----- acceptance/bundle/deploy/wal/corrupted-wal-entry/script | 9 +-------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index d04d0389ec..afd717a27b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,9 +1,9 @@ -=== WAL content === + +>>> cat .databricks/bundle/default/resources.json.wal {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-=== Deploy (should recover valid entries, skip corrupted last line) === - +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at 
[TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted @@ -11,7 +11,6 @@ Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-corrupted-test Deploying resources... Updating deployment state... Deployment complete! -=== Final state (should have recovered entries) === >>> [CLI] bundle summary Name: wal-corrupted-test @@ -27,7 +26,6 @@ Resources: valid_job: Name: valid-job URL: [DATABRICKS_URL]/jobs/[JOB1_ID]?o=[NUMID] -=== Corrupted WAL entries file === >>> cat .databricks/bundle/default/resources.json.wal.corrupted {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index b6b12c347b..ae828cdb6b 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -15,16 +15,9 @@ cp resources.json .databricks/bundle/default/ printf '{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-' } > .databricks/bundle/default/resources.json.wal -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal - -echo "=== Deploy (should recover valid entries, skip corrupted last line) ===" +trace cat .databricks/bundle/default/resources.json.wal trace $CLI bundle deploy - -echo "=== Final state (should have recovered entries) ===" trace $CLI bundle summary - -echo "=== Corrupted WAL entries file ===" trace cat .databricks/bundle/default/resources.json.wal.corrupted printf "\n=== WAL after successful deploy ===\n" From c98b3dd01d771e5d51b70ad4ccdb9bc58575b002 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:07:56 +0200 Subject: [PATCH 70/85] test more commands for validation --- .../bundle/deploy/wal/lineage-mismatch/out.test.toml | 1 + 
.../bundle/deploy/wal/lineage-mismatch/output.txt | 11 ++++------- acceptance/bundle/deploy/wal/lineage-mismatch/script | 7 +++---- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml index e90b6d5d1b..9448f875df 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/out.test.toml @@ -1,3 +1,4 @@ Local = true Cloud = false +EnvMatrix.COMMAND = ["deploy", "plan", "summary"] EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt index 00bc78cf28..cae1ffac08 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/output.txt @@ -1,10 +1,7 @@ -=== WAL content === -{"lineage":"wal-lineage-bbb","serial":2} -{"k":"resources.jobs.test_job","v":{"__id__":"1001","state":{"name":"test-job"}}} -=== Deploy (should fail with lineage mismatch error) === - ->>> errcode [CLI] bundle deploy +Any command should fail with lineage mismatch error Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) -Exit code: 1 +>>> musterr [CLI] bundle destroy --auto-approve +Error: reading state from [TEST_TMP_DIR]/.databricks/bundle/default/resources.json: WAL recovery failed: WAL lineage (wal-lineage-bbb) does not match state lineage (state-lineage-aaa) + diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/script b/acceptance/bundle/deploy/wal/lineage-mismatch/script index 4617c338fe..0629a37c0f 100644 --- a/acceptance/bundle/deploy/wal/lineage-mismatch/script +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/script @@ -2,8 +2,7 @@ mkdir -p .databricks/bundle/default cp 
resources.json .databricks/bundle/default/ cp resources.json.wal .databricks/bundle/default/ -echo "=== WAL content ===" -cat .databricks/bundle/default/resources.json.wal +echo "Any command should fail with lineage mismatch error" +musterr $CLI bundle $COMMAND -echo "=== Deploy (should fail with lineage mismatch error) ===" -trace errcode $CLI bundle deploy +trace musterr $CLI bundle destroy --auto-approve From f151e71c6058a73064dc560557aaf472e62166d7 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:08:33 +0200 Subject: [PATCH 71/85] remove normal-deploy test --- .../deploy/wal/normal-deploy/databricks.yml | 14 -------------- .../deploy/wal/normal-deploy/out.test.toml | 3 --- .../bundle/deploy/wal/normal-deploy/output.txt | 16 ---------------- .../bundle/deploy/wal/normal-deploy/script | 12 ------------ .../bundle/deploy/wal/normal-deploy/test.py | 1 - .../bundle/deploy/wal/normal-deploy/test.toml | 9 --------- 6 files changed, 55 deletions(-) delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/databricks.yml delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/out.test.toml delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/output.txt delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/script delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.py delete mode 100644 acceptance/bundle/deploy/wal/normal-deploy/test.toml diff --git a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml b/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml deleted file mode 100644 index 4439322e0e..0000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/databricks.yml +++ /dev/null @@ -1,14 +0,0 @@ -bundle: - name: wal-test - -resources: - jobs: - test_job: - name: "test-job" - tasks: - - task_key: "test-task" - spark_python_task: - python_file: ./test.py - new_cluster: - spark_version: 15.4.x-scala2.12 - node_type_id: i3.xlarge diff --git a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml 
b/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml deleted file mode 100644 index e90b6d5d1b..0000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/out.test.toml +++ /dev/null @@ -1,3 +0,0 @@ -Local = true -Cloud = false -EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/deploy/wal/normal-deploy/output.txt b/acceptance/bundle/deploy/wal/normal-deploy/output.txt deleted file mode 100644 index 2ca4f5f51c..0000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/output.txt +++ /dev/null @@ -1,16 +0,0 @@ - ->>> [CLI] bundle deploy -Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-test/default/files... -Deploying resources... -Updating deployment state... -Deployment complete! -=== Checking WAL file after deploy === -WAL file deleted after successful deploy (expected) -=== State file content === -{ - "lineage": "[UUID]", - "serial": 1, - "state_keys": [ - "resources.jobs.test_job" - ] -} diff --git a/acceptance/bundle/deploy/wal/normal-deploy/script b/acceptance/bundle/deploy/wal/normal-deploy/script deleted file mode 100644 index 5acc4d9b58..0000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/script +++ /dev/null @@ -1,12 +0,0 @@ -trace $CLI bundle deploy - -echo "=== Checking WAL file after deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL file exists (unexpected - should be deleted after Finalize)" - cat .databricks/bundle/default/resources.json.wal -else - echo "WAL file deleted after successful deploy (expected)" -fi - -echo "=== State file content ===" -cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' diff --git a/acceptance/bundle/deploy/wal/normal-deploy/test.py b/acceptance/bundle/deploy/wal/normal-deploy/test.py deleted file mode 100644 index 1ff8e07c70..0000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/test.py +++ /dev/null @@ -1 +0,0 @@ -print("test") diff --git 
a/acceptance/bundle/deploy/wal/normal-deploy/test.toml b/acceptance/bundle/deploy/wal/normal-deploy/test.toml deleted file mode 100644 index 1299046974..0000000000 --- a/acceptance/bundle/deploy/wal/normal-deploy/test.toml +++ /dev/null @@ -1,9 +0,0 @@ -# WAL is created during deploy, used for state tracking, and deleted after Finalize. - -[[Server]] -Pattern = "POST /api/2.2/jobs/create" -Response.Body = '{"job_id": 1001}' - -[[Server]] -Pattern = "GET /api/2.2/jobs/get" -Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}' From cae10125780fcb3b2ad6ccd303c6bf8d6e91119d Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:10:07 +0200 Subject: [PATCH 72/85] clean up --- .../bundle/deploy/wal/wal-with-delete/output.txt | 14 ++++++++------ .../bundle/deploy/wal/wal-with-delete/script | 9 +-------- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt index c08e365177..4eb0fb5724 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/output.txt +++ b/acceptance/bundle/deploy/wal/wal-with-delete/output.txt @@ -9,9 +9,11 @@ Deploying resources... Updating deployment state... Deployment complete! 
=== Final state (should have no jobs) === -{ - "serial": 2, - "state_keys": [] -} -=== WAL after successful deploy === -WAL deleted (expected) + +>>> [CLI] bundle summary +Name: wal-delete-test +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/wal-delete-test/default +Resources: diff --git a/acceptance/bundle/deploy/wal/wal-with-delete/script b/acceptance/bundle/deploy/wal/wal-with-delete/script index 5d5a78a885..1b6708bc0f 100644 --- a/acceptance/bundle/deploy/wal/wal-with-delete/script +++ b/acceptance/bundle/deploy/wal/wal-with-delete/script @@ -9,11 +9,4 @@ echo "=== Deploy (should recover delete from WAL) ===" trace $CLI bundle deploy echo "=== Final state (should have no jobs) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' - -echo "=== WAL after successful deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (unexpected)" -else - echo "WAL deleted (expected)" -fi +trace $CLI bundle summary From 088ed09c06656d7db27ca5506d2c1c3ebb244eea Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:30:08 +0200 Subject: [PATCH 73/85] test recover in plan/deploy/summary --- .../wal/crash-after-create/out.test.toml | 1 + .../deploy/wal/crash-after-create/output.txt | 55 ++++++++++++++++--- .../deploy/wal/crash-after-create/script | 30 +++------- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml index e90b6d5d1b..1d895a16c9 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml +++ b/acceptance/bundle/deploy/wal/crash-after-create/out.test.toml @@ -1,3 +1,4 @@ Local = true Cloud = false +EnvMatrix.COMMAND = ["plan", "deploy --force-lock", "summary"] EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git 
a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 4eb2e1ea12..0a50333e72 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -6,11 +6,52 @@ Deploying resources... [PROCESS_KILLED] Exit code: [KILLED] -=== WAL should exist after crash === -WAL exists (expected) -{"lineage":"[UUID]","serial":1,"state_version":2,"cli_version":"[DEV_VERSION]"} -{"k":"resources.jobs.job_a","v":{"__id__":"[NUMID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}} -=== State file after crash (should be empty) === -cat: .databricks/bundle/default/resources.json: No such file or directory -Exit code: 1 +>>> assert_exists.py .databricks/bundle/default/resources.json.wal + +>>> assert_not_exists.py .databricks/bundle/default/resources.json + +>>> cat .databricks/bundle/default/resources.json.wal +{ + "lineage": "[UUID]", + "serial": 1, + "state_version": 2, + "cli_version": "[DEV_VERSION]" +} +{ + "k": "resources.jobs.job_a", + "v": { + "__id__": "[NUMID]", + "state": { + "deployment": { + "kind": "BUNDLE", + "metadata_file_path": "/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json" + }, + "description": "first job", + "edit_mode": "UI_LOCKED", + "format": "MULTI_TASK", + "max_concurrent_runs": 1, + "name": "test-job-a", + "queue": { + "enabled": true + }, + "tasks": [ + { + "new_cluster": { + "node_type_id": "[NODE_TYPE_ID]", + "spark_version": 
"15.4.x-scala2.12" + }, + "spark_python_task": { + "python_file": "/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py" + }, + "task_key": "task-a" + } + ] + } + } +} + +=== Any other command recovers state +>>> assert_exists.py .databricks/bundle/default/resources.json + +>>> assert_not_exists.py .databricks/bundle/default/resources.json.wal diff --git a/acceptance/bundle/deploy/wal/crash-after-create/script b/acceptance/bundle/deploy/wal/crash-after-create/script index f4dba936bb..264d84648d 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/script +++ b/acceptance/bundle/deploy/wal/crash-after-create/script @@ -1,31 +1,17 @@ # WAL recovery after real crash. First deploy creates job_a then crashes. # Second deploy recovers from WAL and completes successfully. # job_b depends on job_a, so jobs/get is called after job_a's SaveState. -kill_after.py "GET /api/2.2/jobs/get" 0 1 +kill_after.py "POST /api/2.2/jobs/create" 1 1 echo "=== First deploy (crashes after job_a create, before job_b) ===" trace errcode $CLI bundle deploy -echo "=== WAL should exist after crash ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL exists (expected)" - cat .databricks/bundle/default/resources.json.wal -else - echo "WAL missing (unexpected)" -fi +trace assert_exists.py .databricks/bundle/default/resources.json.wal +trace assert_not_exists.py .databricks/bundle/default/resources.json +trace cat .databricks/bundle/default/resources.json.wal | jq -echo "=== State file after crash (should be empty) ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}' +title "Any other command recovers state" +$CLI bundle $COMMAND &> LOG.COMMAND.txt -echo "=== Second deploy (should recover from WAL and complete) ===" -trace $CLI bundle deploy --force-lock - -echo "=== State file after recovery ===" -cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state 
| keys)}' - -echo "=== WAL file after successful deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL file exists (unexpected)" -else - echo "WAL file deleted (expected)" -fi +trace assert_exists.py .databricks/bundle/default/resources.json +trace assert_not_exists.py .databricks/bundle/default/resources.json.wal From a52fa51dd758810133cba25d2e29ae9a338dab11 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:33:26 +0200 Subject: [PATCH 74/85] clean up --- .../bundle/deploy/wal/empty-wal/output.txt | 17 ------------ acceptance/bundle/deploy/wal/empty-wal/script | 26 +------------------ 2 files changed, 1 insertion(+), 42 deletions(-) diff --git a/acceptance/bundle/deploy/wal/empty-wal/output.txt b/acceptance/bundle/deploy/wal/empty-wal/output.txt index b4ce67ee66..bba6d249fc 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/output.txt +++ b/acceptance/bundle/deploy/wal/empty-wal/output.txt @@ -1,23 +1,6 @@ -=== Creating state directory === -=== Creating empty WAL file === -=== Empty WAL file exists === -[FILE_INFO] .databricks/bundle/default/resources.json.wal -=== Deploy (should handle empty WAL gracefully) === >>> [CLI] bundle deploy Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-empty-test/default/files... Deploying resources... Updating deployment state... Deployment complete! 
-=== Checking WAL file after deploy === -Empty WAL deleted (expected) -=== Corrupted WAL file === -Corrupted WAL file missing (expected) -=== State file content === -{ - "lineage": "[UUID]", - "serial": 1, - "state_keys": [ - "resources.jobs.test_job" - ] -} diff --git a/acceptance/bundle/deploy/wal/empty-wal/script b/acceptance/bundle/deploy/wal/empty-wal/script index 3929de8eb1..ac104951c5 100644 --- a/acceptance/bundle/deploy/wal/empty-wal/script +++ b/acceptance/bundle/deploy/wal/empty-wal/script @@ -1,28 +1,4 @@ -echo "=== Creating state directory ===" mkdir -p .databricks/bundle/default - -echo "=== Creating empty WAL file ===" touch .databricks/bundle/default/resources.json.wal - -echo "=== Empty WAL file exists ===" -ls -la .databricks/bundle/default/resources.json.wal - -echo "=== Deploy (should handle empty WAL gracefully) ===" trace $CLI bundle deploy - -echo "=== Checking WAL file after deploy ===" -if [ -f ".databricks/bundle/default/resources.json.wal" ]; then - echo "WAL file exists (unexpected)" -else - echo "Empty WAL deleted (expected)" -fi - -echo "=== Corrupted WAL file ===" -if [ -f ".databricks/bundle/default/resources.json.wal.corrupted" ]; then - ls -la .databricks/bundle/default/resources.json.wal.corrupted -else - echo "Corrupted WAL file missing (expected)" -fi - -echo "=== State file content ===" -cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}' +assert_not_exists.py .databricks/bundle/default/resources.json.wal* From 16de5c1f50ffa0e61d17c2fd39c71fcbefe0d87c Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:33:59 +0200 Subject: [PATCH 75/85] add assert_*.py --- acceptance/bin/assert_exists.py | 12 ++++++++++++ acceptance/bin/assert_not_exists.py | 12 ++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 acceptance/bin/assert_exists.py create mode 100644 acceptance/bin/assert_not_exists.py diff --git a/acceptance/bin/assert_exists.py 
b/acceptance/bin/assert_exists.py new file mode 100644 index 0000000000..0d33b46d2a --- /dev/null +++ b/acceptance/bin/assert_exists.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +import os, sys + +errors = 0 + +for filename in sys.argv[1:]: + if not os.path.exists(filename): + sys.stderr.write(f"Unexpected: {filename} does not exist.\n") + errors += 1 + +if errors: + sys.exit(1) diff --git a/acceptance/bin/assert_not_exists.py b/acceptance/bin/assert_not_exists.py new file mode 100644 index 0000000000..76d467e451 --- /dev/null +++ b/acceptance/bin/assert_not_exists.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 +import os, sys + +errors = 0 + +for filename in sys.argv[1:]: + if os.path.exists(filename): + sys.stderr.write(f"Unexpected: {filename} exists.\n") + errors += 1 + +if errors: + sys.exit(1) From 3866db14378b59f573509596af25c97aca9b7726 Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:42:11 +0200 Subject: [PATCH 76/85] corrupted-wal-entry: use envsubst + template file for WAL generation Co-authored-by: Isaac --- .../bundle/deploy/wal/corrupted-wal-entry/output.txt | 8 ++++---- .../wal/corrupted-wal-entry/resources.json.wal.tmpl | 4 ++++ acceptance/bundle/deploy/wal/corrupted-wal-entry/script | 8 +------- 3 files changed, 9 insertions(+), 11 deletions(-) create mode 100644 acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index afd717a27b..fa6e081911 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -1,8 +1,8 @@ >>> cat .databricks/bundle/default/resources.json.wal {"lineage":"test-lineage-123","serial":6} -{"k":"resources.jobs.valid_job","v":{"__id__":"[JOB1_ID]","state":{"name":"valid-job"}}} -{"k":"resources.jobs.another_valid","v":{"__id__":"[JOB2_ID]","state":{"name":"another-valid"}}} 
+{"k":"resources.jobs.valid_job","v":{"__id__":"","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"","state":{"name":"another-valid"}}} {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input @@ -22,10 +22,10 @@ Resources: Jobs: another_valid: Name: another-valid - URL: [DATABRICKS_URL]/jobs/[JOB2_ID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] valid_job: Name: valid-job - URL: [DATABRICKS_URL]/jobs/[JOB1_ID]?o=[NUMID] + URL: [DATABRICKS_URL]/jobs/[NUMID]?o=[NUMID] >>> cat .databricks/bundle/default/resources.json.wal.corrupted {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl new file mode 100644 index 0000000000..44f3bbdaf4 --- /dev/null +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl @@ -0,0 +1,4 @@ +{"lineage":"test-lineage-123","serial":6} +{"k":"resources.jobs.valid_job","v":{"__id__":"$JOB1","state":{"name":"valid-job"}}} +{"k":"resources.jobs.another_valid","v":{"__id__":"$JOB2","state":{"name":"another-valid"}}} +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- \ No newline at end of file diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script index ae828cdb6b..d6f151a29c 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/script +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/script @@ -7,13 +7,7 @@ echo "$JOB2:JOB2_ID" >> ACC_REPLS mkdir -p .databricks/bundle/default cp resources.json .databricks/bundle/default/ -# Generate WAL with actual job IDs; truncate the partial_write entry to simulate 
corruption -{ - printf '{"lineage":"test-lineage-123","serial":6}\n' - printf '{"k":"resources.jobs.valid_job","v":{"__id__":"%s","state":{"name":"valid-job"}}}\n' "$JOB1" - printf '{"k":"resources.jobs.another_valid","v":{"__id__":"%s","state":{"name":"another-valid"}}}\n' "$JOB2" - printf '{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial-' -} > .databricks/bundle/default/resources.json.wal +envsubst < resources.json.wal.tmpl > .databricks/bundle/default/resources.json.wal trace cat .databricks/bundle/default/resources.json.wal trace $CLI bundle deploy From ae293dd3293ed15d74f03f2725089559676d2efd Mon Sep 17 00:00:00 2001 From: Tester Date: Mon, 11 May 2026 17:46:14 +0200 Subject: [PATCH 77/85] kill_caller selftests: move test.toml comments to script, remove empty test.toml files Co-authored-by: Isaac --- acceptance/selftest/kill_caller/currentuser/script | 1 + acceptance/selftest/kill_caller/currentuser/test.toml | 1 - acceptance/selftest/kill_caller/multiple/script | 1 + acceptance/selftest/kill_caller/multiple/test.toml | 1 - acceptance/selftest/kill_caller/offset/script | 1 + acceptance/selftest/kill_caller/offset/test.toml | 1 - acceptance/selftest/kill_caller/workspace/script | 1 + acceptance/selftest/kill_caller/workspace/test.toml | 1 - 8 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 acceptance/selftest/kill_caller/currentuser/test.toml delete mode 100644 acceptance/selftest/kill_caller/multiple/test.toml delete mode 100644 acceptance/selftest/kill_caller/offset/test.toml delete mode 100644 acceptance/selftest/kill_caller/workspace/test.toml diff --git a/acceptance/selftest/kill_caller/currentuser/script b/acceptance/selftest/kill_caller/currentuser/script index bbac4ab29a..dbd96b12a9 100644 --- a/acceptance/selftest/kill_caller/currentuser/script +++ b/acceptance/selftest/kill_caller/currentuser/script @@ -1,3 +1,4 @@ +# Kill the CLI when it calls /Me endpoint (once, then allow) kill_after.py "GET 
/api/2.0/preview/scim/v2/Me" 0 1 trace errcode $CLI current-user me echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/currentuser/test.toml b/acceptance/selftest/kill_caller/currentuser/test.toml deleted file mode 100644 index f631136715..0000000000 --- a/acceptance/selftest/kill_caller/currentuser/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Kill the CLI when it calls /Me endpoint (once, then allow) diff --git a/acceptance/selftest/kill_caller/multiple/script b/acceptance/selftest/kill_caller/multiple/script index a3659bf58f..1e089f3cc0 100644 --- a/acceptance/selftest/kill_caller/multiple/script +++ b/acceptance/selftest/kill_caller/multiple/script @@ -1,3 +1,4 @@ +# Kill the CLI 3 times, then allow the 4th request to succeed kill_after.py "GET /api/2.0/preview/scim/v2/Me" 0 3 # First 3 attempts should be killed diff --git a/acceptance/selftest/kill_caller/multiple/test.toml b/acceptance/selftest/kill_caller/multiple/test.toml deleted file mode 100644 index 24f7ca1922..0000000000 --- a/acceptance/selftest/kill_caller/multiple/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Kill the CLI 3 times, then allow the 4th request to succeed diff --git a/acceptance/selftest/kill_caller/offset/script b/acceptance/selftest/kill_caller/offset/script index 1bf3d0d4c2..6abee0dcac 100644 --- a/acceptance/selftest/kill_caller/offset/script +++ b/acceptance/selftest/kill_caller/offset/script @@ -1,3 +1,4 @@ +# Let first 2 requests pass, kill next 2, then allow rest kill_after.py "GET /api/2.0/preview/scim/v2/Me" 2 2 # First 2 attempts should succeed (offset period) diff --git a/acceptance/selftest/kill_caller/offset/test.toml b/acceptance/selftest/kill_caller/offset/test.toml deleted file mode 100644 index 7b8d50906c..0000000000 --- a/acceptance/selftest/kill_caller/offset/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Let first 2 requests pass, kill next 2, then allow rest diff --git a/acceptance/selftest/kill_caller/workspace/script 
b/acceptance/selftest/kill_caller/workspace/script index 8fb9dab3f1..5a21881ab3 100644 --- a/acceptance/selftest/kill_caller/workspace/script +++ b/acceptance/selftest/kill_caller/workspace/script @@ -1,3 +1,4 @@ +# Kill the CLI when it calls workspace list endpoint (once, then allow) kill_after.py "GET /api/2.0/workspace/list" 0 1 trace errcode $CLI workspace list / echo "Script continued after kill" diff --git a/acceptance/selftest/kill_caller/workspace/test.toml b/acceptance/selftest/kill_caller/workspace/test.toml deleted file mode 100644 index 80d2fbbfd1..0000000000 --- a/acceptance/selftest/kill_caller/workspace/test.toml +++ /dev/null @@ -1 +0,0 @@ -# Kill the CLI when it calls workspace list endpoint (once, then allow) From 202c0ac172c3a16884117aa90221afae6f2cb566 Mon Sep 17 00:00:00 2001 From: Tester Date: Tue, 12 May 2026 13:25:35 +0200 Subject: [PATCH 78/85] formatting --- acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt | 1 + .../deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt index fa6e081911..1aee4fe481 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/output.txt @@ -4,6 +4,7 @@ {"k":"resources.jobs.valid_job","v":{"__id__":"","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"","state":{"name":"another-valid"}}} {"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- + >>> [CLI] bundle deploy Warn: Skipping corrupted WAL entry at [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal:4: unexpected end of JSON input Warn: Saved 1 corrupted WAL entries to [TEST_TMP_DIR]/.databricks/bundle/default/resources.json.wal.corrupted diff --git a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl 
b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl index 44f3bbdaf4..7ef5773a4e 100644 --- a/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl +++ b/acceptance/bundle/deploy/wal/corrupted-wal-entry/resources.json.wal.tmpl @@ -1,4 +1,4 @@ {"lineage":"test-lineage-123","serial":6} {"k":"resources.jobs.valid_job","v":{"__id__":"$JOB1","state":{"name":"valid-job"}}} {"k":"resources.jobs.another_valid","v":{"__id__":"$JOB2","state":{"name":"another-valid"}}} -{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- \ No newline at end of file +{"k":"resources.jobs.partial_write","v":{"__id__":"33","state":{"name":"partial- From 9a1bb574007f24c39a76b8e1118bd9a54eec180d Mon Sep 17 00:00:00 2001 From: Tester Date: Tue, 12 May 2026 15:43:37 +0200 Subject: [PATCH 79/85] fix CI: commit missing test.tomls and fix assert_*.py permissions - acceptance/bundle/deploy/wal/crash-after-create/test.toml and lineage-mismatch/test.toml were untracked; scripts using $COMMAND failed with "unbound variable" on CI - assert_exists.py and assert_not_exists.py were tracked as 100644; CI ran them as non-executable, producing "Permission denied" errors Co-authored-by: Isaac --- acceptance/bin/assert_exists.py | 0 acceptance/bin/assert_not_exists.py | 0 acceptance/bundle/deploy/wal/crash-after-create/test.toml | 2 ++ acceptance/bundle/deploy/wal/lineage-mismatch/test.toml | 1 + 4 files changed, 3 insertions(+) mode change 100644 => 100755 acceptance/bin/assert_exists.py mode change 100644 => 100755 acceptance/bin/assert_not_exists.py create mode 100644 acceptance/bundle/deploy/wal/crash-after-create/test.toml create mode 100644 acceptance/bundle/deploy/wal/lineage-mismatch/test.toml diff --git a/acceptance/bin/assert_exists.py b/acceptance/bin/assert_exists.py old mode 100644 new mode 100755 diff --git a/acceptance/bin/assert_not_exists.py b/acceptance/bin/assert_not_exists.py old mode 100644 new mode 100755 diff --git 
a/acceptance/bundle/deploy/wal/crash-after-create/test.toml b/acceptance/bundle/deploy/wal/crash-after-create/test.toml new file mode 100644 index 0000000000..ecd87c31a8 --- /dev/null +++ b/acceptance/bundle/deploy/wal/crash-after-create/test.toml @@ -0,0 +1,2 @@ +EnvMatrix.COMMAND = ["plan", "deploy --force-lock", "summary"] +EnvRepl.COMMAND = false diff --git a/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml new file mode 100644 index 0000000000..0b3a9e0b7c --- /dev/null +++ b/acceptance/bundle/deploy/wal/lineage-mismatch/test.toml @@ -0,0 +1 @@ +EnvMatrix.COMMAND = ["deploy", "plan", "summary"] From 2d5eea9302626bbae6b38230267210e45a08433d Mon Sep 17 00:00:00 2001 From: Tester Date: Tue, 12 May 2026 16:43:20 +0200 Subject: [PATCH 80/85] fix: use TOML basic strings with \n escapes in Repls to avoid CRLF on Windows Multiline TOML basic strings (""") use literal newlines from the file. On Windows with autocrlf=true, these become CRLF. After NormalizeNewlines strips \r from the test output, the replacement re-introduces \r via the New string, causing the comparison to fail. Using single-line basic strings with \n escapes ensures the newlines in the replacement are always LF regardless of platform. Co-authored-by: Denis Bilenko --- acceptance/bundle/deploy/wal/test.toml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/acceptance/bundle/deploy/wal/test.toml b/acceptance/bundle/deploy/wal/test.toml index 2be1964ae6..e60e699245 100644 --- a/acceptance/bundle/deploy/wal/test.toml +++ b/acceptance/bundle/deploy/wal/test.toml @@ -14,9 +14,7 @@ New = '[PROCESS_KILLED]' [[Repls]] Old = '(\n>>> errcode [^\n]+\n)\nExit code:' -New = """${1}[PROCESS_KILLED] - -Exit code:""" +New = "${1}[PROCESS_KILLED]\n\nExit code:" [[Repls]] Old = 'Exit code: 137' @@ -33,7 +31,4 @@ New = '${1}[KILLED]' # Match the raw exit code 1 (Windows never gets 137 or [PROCESS_KILLED] marker first). 
[[Repls]] Old = '(Deploying resources\.\.\.)\n\nExit code: 1' -New = """${1} -[PROCESS_KILLED] - -Exit code: [KILLED]""" +New = "${1}\n[PROCESS_KILLED]\n\nExit code: [KILLED]" From c1f69a1a44c318c025713cf1df65ad51985888d4 Mon Sep 17 00:00:00 2001 From: Tester Date: Wed, 13 May 2026 13:35:27 +0200 Subject: [PATCH 81/85] refactor: merge duplicate IsDirect() blocks in dashboard.go Co-authored-by: Denis Bilenko --- cmd/bundle/generate/dashboard.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cmd/bundle/generate/dashboard.go b/cmd/bundle/generate/dashboard.go index 71f4f573cf..7af4e01e92 100644 --- a/cmd/bundle/generate/dashboard.go +++ b/cmd/bundle/generate/dashboard.go @@ -391,16 +391,13 @@ func (d *dashboard) runForResource(ctx context.Context, b *bundle.Bundle) { return } + var state statemgmt.ExportedResourcesMap if stateDesc.Engine.IsDirect() { _, localPath := b.StateFilenameDirect(ctx) if err := b.DeploymentBundle.StateDB.Open(ctx, localPath, dstate.WithRecovery(true), dstate.WithWrite(false)); err != nil { logdiag.LogError(ctx, err) return } - } - - var state statemgmt.ExportedResourcesMap - if stateDesc.Engine.IsDirect() { state = b.DeploymentBundle.ExportState(ctx) } else { var err error From 79f1af49129013bb823c05111f576a4e73eafee7 Mon Sep 17 00:00:00 2001 From: Tester Date: Wed, 13 May 2026 13:38:12 +0200 Subject: [PATCH 82/85] refactor: merge duplicate IsDirect() blocks in deploy.go Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 6c03ac8870..72ecf1a5b1 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -68,28 +68,22 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta // mutators need informed consent if they are potentially destructive. 
cmdio.LogString(ctx, "Deploying resources...") + // Apply resources and capture post-apply state. + // For direct: Finalize flushes the WAL to disk and returns the state. + // For terraform: ParseResourcesState reads the file written by terraform.Apply. + var ( + state statemgmt.ExportedResourcesMap + err error + ) if targetEngine.IsDirect() { b.DeploymentBundle.Apply(ctx, b.WorkspaceClient(ctx), plan, direct.MigrateMode(false)) - } else { - bundle.ApplyContext(ctx, b, terraform.Apply()) - } - - // Capture post-apply state for Load below. - // For direct: flush WAL to disk (Finalize) and capture the result. - // For terraform: parse the state file written by terraform.Apply. - var state statemgmt.ExportedResourcesMap - if targetEngine.IsDirect() { - var err error state, err = b.DeploymentBundle.StateDB.Finalize(ctx) - if err != nil { - logdiag.LogError(ctx, err) - } } else { - var err error + bundle.ApplyContext(ctx, b, terraform.Apply()) state, err = terraform.ParseResourcesState(ctx, b) - if err != nil { - logdiag.LogError(ctx, err) - } + } + if err != nil { + logdiag.LogError(ctx, err) } // Even if deployment failed, there might be updates in states that we need to upload From 888280c6788b48e29cd6c4d89946491264fd96e0 Mon Sep 17 00:00:00 2001 From: Tester Date: Wed, 13 May 2026 13:39:43 +0200 Subject: [PATCH 83/85] restore comment: Finalize is called even on Apply failure to save partial progress Co-authored-by: Denis Bilenko --- bundle/phases/deploy.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bundle/phases/deploy.go b/bundle/phases/deploy.go index 72ecf1a5b1..02c6d827f2 100644 --- a/bundle/phases/deploy.go +++ b/bundle/phases/deploy.go @@ -69,7 +69,8 @@ func deployCore(ctx context.Context, b *bundle.Bundle, plan *deployplan.Plan, ta cmdio.LogString(ctx, "Deploying resources...") // Apply resources and capture post-apply state. - // For direct: Finalize flushes the WAL to disk and returns the state. 
+ // For direct: Finalize flushes the WAL to disk and returns the state; + // called even if Apply failed so partial progress is saved. // For terraform: ParseResourcesState reads the file written by terraform.Apply. var ( state statemgmt.ExportedResourcesMap From f33f6ed53369e13fa0a907d71290562e356cf869 Mon Sep 17 00:00:00 2001 From: Tester Date: Wed, 13 May 2026 13:46:38 +0200 Subject: [PATCH 84/85] update NEXT_CHANGELOG.md --- NEXT_CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index b6bdb4d965..7eee0783f5 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -13,6 +13,7 @@ * Fix `bundle generate` job to preserve nested notebook directory structure ([#4596](https://github.com/databricks/cli/pull/4596)) * Propagate authentication environment (including `DATABRICKS_CONFIG_PROFILE`) to the `experimental.python` subprocess so bundle validate/deploy no longer fails with a multi-profile host ambiguity error when several profiles in `~/.databrickscfg` share the same host. * Fixed `--force-pull` on `bundle summary` and `bundle open` so the flag bypasses the local state cache and reads state from the workspace. 
+* engine/direct: Changes to the state file are now persisted to a .wal file right away instead of being saved at the end ([#5149](https://github.com/databricks/cli/pull/5149)) ### Dependency updates From e6f929007ecedf2a2a882ff0de0ff851cc303245 Mon Sep 17 00:00:00 2001 From: Tester Date: Wed, 13 May 2026 14:37:55 +0200 Subject: [PATCH 85/85] rm databricks.yml --- databricks.yml | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 databricks.yml diff --git a/databricks.yml b/databricks.yml deleted file mode 100644 index 7cf210722a..0000000000 --- a/databricks.yml +++ /dev/null @@ -1,19 +0,0 @@ -bundle: - name: git - git: - # This is currently not supported - branch: ${var.deployment_branch} - -variables: - deployment_branch: - # By setting deployment_branch to "" we set bundle.git.branch to "" which is the same unsetting it. - # This this should make CLI read branch from git and update bundle.git.branch accordingly. It should - # Also set bundle.git.inferred to true. - default: "" - -targets: - prod: - default: true - dev: - variables: - deployment_branch: dev-branch