From d16c208142d764b978fb0b08dbb0203b13f4eb17 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Tue, 2 Jun 2026 11:40:38 +0200 Subject: [PATCH] Track bundle resource state sizes in telemetry (direct engine) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `resources_metadata` field to the bundle deploy telemetry event with, per resource type, the count and the max/mean/median state size in bytes, plus the whole state file size. Only direct deploys are measured. The direct engine persists each resource's state as a JSON blob in resources.json, so sizes are read off directly as len(entry.State) after the deploy — no serialization or config-walking happens at telemetry time. Terraform stores state differently and is not collected (the field is absent there). This keeps collection trivial and cheap. The feature is one isolated module (bundle/phases/resources_metadata.go) plus one line at the telemetry-emission call site. Telemetry never fails a deploy: a missing or unparseable state file is logged at debug level and treated as no data. The universe proto (resources_metadata, BundleResourcesMetadata, ResourceMetadata) is already merged, so this is ingested rather than dropped. Co-authored-by: Isaac --- acceptance/bundle/telemetry/deploy/script | 5 +- bundle/phases/resources_metadata.go | 147 +++++++++++++++++++++ bundle/phases/resources_metadata_test.go | 151 ++++++++++++++++++++++ bundle/phases/telemetry.go | 2 + libs/telemetry/protos/bundle_deploy.go | 37 ++++++ 5 files changed, 341 insertions(+), 1 deletion(-) create mode 100644 bundle/phases/resources_metadata.go create mode 100644 bundle/phases/resources_metadata_test.go diff --git a/acceptance/bundle/telemetry/deploy/script b/acceptance/bundle/telemetry/deploy/script index de0ac667187..6b9fa7dcb98 100644 --- a/acceptance/bundle/telemetry/deploy/script +++ b/acceptance/bundle/telemetry/deploy/script @@ -10,7 +10,10 @@ trace cat telemetry.json | jq ' .entry.databricks_cli_log.bundle_deploy_event.ex # bundle_mutator_execution_time_ms can have variable number of entries depending upon the runtime of the mutators. Thus we omit it from # being asserted here. -cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms)' > out.telemetry.txt +# resources_metadata is only emitted for direct deploys (it reads the direct +# engine's resources.json), so it diverges across the DATABRICKS_BUNDLE_ENGINE +# matrix; omit it here to keep this golden engine-agnostic. +cat telemetry.json | jq 'del(.entry.databricks_cli_log.bundle_deploy_event.experimental.bundle_mutator_execution_time_ms, .entry.databricks_cli_log.bundle_deploy_event.resources_metadata)' > out.telemetry.txt cmd_exec_id=$(extract_command_exec_id.py) deployment_id=$(cat .databricks/bundle/default/deployment.json | jq -r .id) diff --git a/bundle/phases/resources_metadata.go b/bundle/phases/resources_metadata.go new file mode 100644 index 00000000000..2411d117850 --- /dev/null +++ b/bundle/phases/resources_metadata.go @@ -0,0 +1,147 @@ +package phases + +import ( + "context" + "encoding/json" + "errors" + "io/fs" + "os" + "slices" + "strings" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/direct/dstate" + "github.com/databricks/cli/libs/log" + "github.com/databricks/cli/libs/telemetry/protos" +) + +// collectResourcesMetadata builds a BundleResourcesMetadata from the direct +// engine's deployment state file. +// +// Only direct deploys are measured. The direct engine persists each resource's +// state as a JSON blob in resources.json, so per-resource sizes are read off +// directly as len(entry.State) — no serialization or config-walking is done at +// telemetry time. Terraform stores state in a different shape and is not +// collected; for those deploys this returns nil. +// +// Telemetry must never fail a deploy, so a missing or unparseable state file is +// logged at debug level and treated as no data (returns nil). +func collectResourcesMetadata(ctx context.Context, b *bundle.Bundle) *protos.BundleResourcesMetadata { + // resource_*_count covers both engines; this state-size metadata only + // applies to the direct engine's resources.json. + if resolveDeployEngine(ctx, b) != string(engine.EngineDirect) { + return nil + } + + // A target must be selected to resolve the local state path. + if b.Target == nil { + return nil + } + + _, localPath := b.StateFilenameDirect(ctx) + raw, err := os.ReadFile(localPath) + if err != nil { + if !errors.Is(err, fs.ErrNotExist) { + log.Debugf(ctx, "resources-metadata telemetry: skipping direct state at %s: %s", localPath, err) + } + return nil + } + + var db dstate.Database + if err := json.Unmarshal(raw, &db); err != nil { + log.Debugf(ctx, "resources-metadata telemetry: failed to parse direct state: %s", err) + return nil + } + + counts := make(map[string]int64) + sizesByType := make(map[string][]int64) + for key, entry := range db.State { + t := resourceTypeFromKey(key) + if t == "" { + continue + } + counts[t]++ + sizesByType[t] = append(sizesByType[t], int64(len(entry.State))) + } + if len(counts) == 0 { + return nil + } + + types := make([]string, 0, len(counts)) + for t := range counts { + types = append(types, t) + } + slices.Sort(types) + + resources := make([]protos.ResourceMetadata, 0, len(types)) + for _, t := range types { + sizes := sizesByType[t] + slices.Sort(sizes) + resources = append(resources, protos.ResourceMetadata{ + ResourceType: t, + Count: counts[t], + StateSizeMaxBytes: statMax(sizes), + StateSizeMeanBytes: statMean(sizes), + StateSizeMedianBytes: statMedian(sizes), + }) + } + + return &protos.BundleResourcesMetadata{ + StateEngine: string(engine.EngineDirect), + StateFileSizeBytes: int64(len(raw)), + Resources: resources, + } +} + +// resourceTypeFromKey extracts the resource type from a direct-engine state +// key. Keys are "resources..", or "resources..." +// for sub-resources like permissions / grants / secret_acls. Sub-resources are +// tracked under the sub-resource type so they aggregate across resource +// families. Returns "" for keys that don't match. +func resourceTypeFromKey(key string) string { + parts := strings.SplitN(key, ".", 4) + if len(parts) < 3 || parts[0] != "resources" { + return "" + } + if len(parts) == 4 { + return parts[3] + } + return parts[1] +} + +// resolveDeployEngine returns the effective deploy engine ("direct" or +// "terraform"). Mirrors cmd/bundle/utils.ResolveEngineSetting but is inlined +// here to avoid a layering import (bundle/phases must not depend on cmd/). +func resolveDeployEngine(ctx context.Context, b *bundle.Bundle) string { + if b.Config.Bundle.Engine != engine.EngineNotSet { + return string(b.Config.Bundle.Engine.ThisOrDefault()) + } + envEngine, _ := engine.FromEnv(ctx) + return string(envEngine.ThisOrDefault()) +} + +func statMax(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + return sortedSizes[len(sortedSizes)-1] +} + +func statMean(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + var total int64 + for _, s := range sortedSizes { + total += s + } + return total / int64(len(sortedSizes)) +} + +func statMedian(sortedSizes []int64) int64 { + if len(sortedSizes) == 0 { + return 0 + } + return sortedSizes[(len(sortedSizes)-1)/2] +} diff --git a/bundle/phases/resources_metadata_test.go b/bundle/phases/resources_metadata_test.go new file mode 100644 index 00000000000..1089e0b4700 --- /dev/null +++ b/bundle/phases/resources_metadata_test.go @@ -0,0 +1,151 @@ +package phases + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/config/engine" + "github.com/databricks/cli/bundle/direct/dstate" + "github.com/databricks/cli/libs/env" + "github.com/databricks/cli/libs/telemetry/protos" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestResourceTypeFromKey(t *testing.T) { + cases := []struct { + in string + want string + }{ + {"resources.jobs.foo", "jobs"}, + {"resources.pipelines.bar", "pipelines"}, + {"resources.jobs.foo.permissions", "permissions"}, + {"resources.secret_scopes.s.permissions", "permissions"}, + {"not-a-state-key", ""}, + {"resources.jobs", ""}, + } + for _, c := range cases { + assert.Equal(t, c.want, resourceTypeFromKey(c.in), "key=%q", c.in) + } +} + +func TestResolveDeployEngine(t *testing.T) { + cases := []struct { + name string + configEng engine.EngineType + envEng string + want string + }{ + {"config wins over env", engine.EngineDirect, "terraform", "direct"}, + {"env used when config unset", engine.EngineNotSet, "direct", "direct"}, + {"default when neither set", engine.EngineNotSet, "", "terraform"}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + b := &bundle.Bundle{} + b.Config.Bundle.Engine = c.configEng + ctx := env.Set(t.Context(), engine.EnvVar, c.envEng) + assert.Equal(t, c.want, resolveDeployEngine(ctx, b)) + }) + } +} + +func TestStatHelpers(t *testing.T) { + assert.Equal(t, int64(3), statMax([]int64{1, 2, 3})) + assert.Equal(t, int64(2), statMean([]int64{1, 2, 3})) + assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3})) + // Lower-middle for even count: sorted [1,2,3,4] -> index (4-1)/2 = 1 -> 2. + assert.Equal(t, int64(2), statMedian([]int64{1, 2, 3, 4})) + assert.Equal(t, int64(0), statMax(nil)) + assert.Equal(t, int64(0), statMean(nil)) + assert.Equal(t, int64(0), statMedian(nil)) +} + +// directStateBundle writes a resources.json with the given per-key state blobs +// and returns a bundle wired to read it via StateFilenameDirect. +func directStateBundle(t *testing.T, state map[string]dstate.ResourceEntry) *bundle.Bundle { + t.Helper() + b := &bundle.Bundle{ + BundleRootPath: t.TempDir(), + Config: config.Root{ + Bundle: config.Bundle{ + Engine: engine.EngineDirect, + Target: "default", + }, + Workspace: config.Workspace{ + StatePath: "/Workspace/state", + }, + }, + Target: &config.Target{}, + } + _, localPath := b.StateFilenameDirect(t.Context()) + require.NoError(t, os.MkdirAll(filepath.Dir(localPath), 0o755)) + raw, err := json.Marshal(dstate.Database{State: state}) + require.NoError(t, err) + require.NoError(t, os.WriteFile(localPath, raw, 0o600)) + return b +} + +func TestCollectResourcesMetadata_GroupsByTypeFromState(t *testing.T) { + b := directStateBundle(t, map[string]dstate.ResourceEntry{ + "resources.jobs.foo": {State: json.RawMessage(`{"name":"foo","x":1}`)}, // 20 + "resources.jobs.bar": {State: json.RawMessage(`{"n":"bar"}`)}, // 11 + "resources.jobs.foo.permissions": {State: json.RawMessage(`[]`)}, // 2 + "resources.pipelines.qux": {State: json.RawMessage(`{"name":"qux"}`)}, // 14 + }) + + md := collectResourcesMetadata(t.Context(), b) + require.NotNil(t, md) + assert.Equal(t, "direct", md.StateEngine) + assert.Positive(t, md.StateFileSizeBytes) + + byType := make(map[string]protos.ResourceMetadata) + for _, r := range md.Resources { + byType[r.ResourceType] = r + } + assert.Equal(t, int64(2), byType["jobs"].Count) + assert.Equal(t, int64(20), byType["jobs"].StateSizeMaxBytes) + assert.Equal(t, int64(15), byType["jobs"].StateSizeMeanBytes) // (20+11)/2 + assert.Equal(t, int64(11), byType["jobs"].StateSizeMedianBytes) + assert.Equal(t, int64(1), byType["pipelines"].Count) + assert.Equal(t, int64(1), byType["permissions"].Count) +} + +func TestCollectResourcesMetadata_NilForTerraform(t *testing.T) { + b := directStateBundle(t, map[string]dstate.ResourceEntry{ + "resources.jobs.foo": {State: json.RawMessage(`{}`)}, + }) + b.Config.Bundle.Engine = engine.EngineTerraform + assert.Nil(t, collectResourcesMetadata(t.Context(), b)) +} + +func TestCollectResourcesMetadata_NilWhenNoStateFile(t *testing.T) { + b := &bundle.Bundle{ + BundleRootPath: t.TempDir(), + Config: config.Root{ + Bundle: config.Bundle{Engine: engine.EngineDirect, Target: "default"}, + Workspace: config.Workspace{StatePath: "/Workspace/state"}, + }, + Target: &config.Target{}, + } + assert.Nil(t, collectResourcesMetadata(t.Context(), b)) +} + +func TestCollectResourcesMetadata_NilOnMalformedState(t *testing.T) { + b := &bundle.Bundle{ + BundleRootPath: t.TempDir(), + Config: config.Root{ + Bundle: config.Bundle{Engine: engine.EngineDirect, Target: "default"}, + Workspace: config.Workspace{StatePath: "/Workspace/state"}, + }, + Target: &config.Target{}, + } + _, localPath := b.StateFilenameDirect(t.Context()) + require.NoError(t, os.MkdirAll(filepath.Dir(localPath), 0o755)) + require.NoError(t, os.WriteFile(localPath, []byte("not json"), 0o600)) + assert.Nil(t, collectResourcesMetadata(t.Context(), b)) +} diff --git a/bundle/phases/telemetry.go b/bundle/phases/telemetry.go index e64b736fc72..cac25eceee1 100644 --- a/bundle/phases/telemetry.go +++ b/bundle/phases/telemetry.go @@ -202,6 +202,8 @@ func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) { ResourceClusterIDs: clusterIds, ResourceDashboardIDs: dashboardIds, + ResourcesMetadata: collectResourcesMetadata(ctx, b), + Experimental: &protos.BundleDeployExperimental{ BundleMode: mode, ConfigurationFileCount: b.Metrics.ConfigurationFileCount, diff --git a/libs/telemetry/protos/bundle_deploy.go b/libs/telemetry/protos/bundle_deploy.go index d9439437d9b..73562782f1a 100644 --- a/libs/telemetry/protos/bundle_deploy.go +++ b/libs/telemetry/protos/bundle_deploy.go @@ -32,6 +32,9 @@ type BundleDeployEvent struct { ResourceClusterIDs []string `json:"resource_cluster_ids,omitempty"` ResourceDashboardIDs []string `json:"resource_dashboard_ids,omitempty"` + // Per-resource-type metadata (counts and state-size statistics). + ResourcesMetadata *BundleResourcesMetadata `json:"resources_metadata,omitempty"` + Experimental *BundleDeployExperimental `json:"experimental,omitempty"` } @@ -88,6 +91,40 @@ type BundleDeployExperimental struct { LocalCacheMeasurementsMs []IntMapEntry `json:"local_cache_measurements_ms,omitempty"` } +// BundleResourcesMetadata mirrors the universe proto. Per-resource-type +// state-size metadata for one bundle deployment. +// +// Only direct deploys are measured: the direct engine persists each resource's +// state as a JSON blob in resources.json, so sizes are read off directly as +// len(state) — no serialization happens at telemetry time. Terraform stores +// state in a different shape and is not collected (the field is absent there). +type BundleResourcesMetadata struct { + // Always "direct"; terraform deploys do not populate this message. + StateEngine string `json:"state_engine,omitempty"` + + // Size in bytes of the direct engine's resources.json state file on disk. + StateFileSizeBytes int64 `json:"state_file_size_bytes,omitempty"` + + // One entry per resource type present in the deployment state. + Resources []ResourceMetadata `json:"resources,omitempty"` +} + +// ResourceMetadata holds metadata about resources of a single type within one +// bundle deployment. +type ResourceMetadata struct { + // Resource type name: "jobs", "pipelines", "schemas", ... + ResourceType string `json:"resource_type,omitempty"` + + // Number of resources of this type tracked in the deployment state. + Count int64 `json:"count,omitempty"` + + // State-size statistics across resources of this type, each measured as + // len(state) of the JSON blob stored in resources.json. + StateSizeMaxBytes int64 `json:"state_size_max_bytes,omitempty"` + StateSizeMeanBytes int64 `json:"state_size_mean_bytes,omitempty"` + StateSizeMedianBytes int64 `json:"state_size_median_bytes,omitempty"` +} + type BoolMapEntry struct { Key string `json:"key,omitempty"` Value bool `json:"value"`