Skip to content

Commit 90c0ee1

Browse files
committed
feat: add sandbox_creation_latency metric
Adds a new histogram metric to track the time it takes for a sandbox to become ready after it has been created.
1 parent dca369b commit 90c0ee1

File tree

3 files changed

+144
-1
lines changed

3 files changed

+144
-1
lines changed

controllers/sandbox_controller.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"reflect"
2323
"time"
2424

25+
"github.com/prometheus/client_golang/prometheus"
2526
corev1 "k8s.io/api/core/v1"
2627
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2728
"k8s.io/apimachinery/pkg/api/meta"
@@ -36,6 +37,7 @@ import (
3637
"sigs.k8s.io/controller-runtime/pkg/client"
3738
"sigs.k8s.io/controller-runtime/pkg/handler"
3839
"sigs.k8s.io/controller-runtime/pkg/log"
40+
"sigs.k8s.io/controller-runtime/pkg/metrics"
3941
"sigs.k8s.io/controller-runtime/pkg/predicate"
4042

4143
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
@@ -45,16 +47,27 @@ const (
4547
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
4648
SanboxPodNameAnnotation = "agents.x-k8s.io/pod-name"
4749
sandboxControllerFieldOwner = "sandbox-controller"
50+
readinessObserved = "agents.x-k8s.io/readiness-observed"
4851
)
4952

5053
var (
5154
// Scheme for use by sandbox controllers. Registers required types for client.
5255
Scheme = runtime.NewScheme()
56+
57+
sandboxCreationLatency = prometheus.NewHistogram(
58+
prometheus.HistogramOpts{
59+
Name: "sandbox_creation_latency",
60+
Help: "Time taken from sandbox creation to sandbox ready in milliseconds",
61+
// 50, 100, 200, 300, 500, 700, 1000, 1500, 2000, 3000, 4500, 6000, 9000, 12000, 18000, 30000
62+
Buckets: []float64{50, 100, 200, 300, 500, 700, 1000, 1500, 2000, 3000, 4500, 6000, 9000, 12000, 18000, 30000},
63+
},
64+
)
5365
)
5466

5567
func init() {
5668
utilruntime.Must(clientgoscheme.AddToScheme(Scheme))
5769
utilruntime.Must(sandboxv1alpha1.AddToScheme(Scheme))
70+
metrics.Registry.MustRegister(sandboxCreationLatency)
5871
}
5972

6073
// SandboxReconciler reconciles a Sandbox object
@@ -116,13 +129,48 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
116129
// Update status
117130
if statusUpdateErr := r.updateStatus(ctx, oldStatus, sandbox); statusUpdateErr != nil {
118131
// Surface update error
132+
statusUpdateErr = fmt.Errorf("faild to update status: %w", statusUpdateErr)
119133
err = errors.Join(err, statusUpdateErr)
120134
}
121135

136+
if recordErr := r.recordFirstReadyMetric(ctx, sandbox); recordErr != nil {
137+
err = errors.Join(err, recordErr)
138+
}
139+
122140
// return errors seen
123141
return ctrl.Result{RequeueAfter: requeueAfter}, err
124142
}
125143

144+
func (r *SandboxReconciler) recordFirstReadyMetric(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
145+
log := log.FromContext(ctx)
146+
147+
// If already recorded ready dont record metric
148+
if sandbox.Annotations != nil && sandbox.Annotations[readinessObserved] != "" {
149+
return nil
150+
}
151+
152+
// If not ready dont record metric
153+
if !meta.IsStatusConditionTrue(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady)) {
154+
return nil
155+
}
156+
157+
// record metric
158+
latency := time.Since(sandbox.CreationTimestamp.Time).Milliseconds()
159+
sandboxCreationLatency.Observe(float64(latency))
160+
161+
// add annotation
162+
patch := client.MergeFrom(sandbox.DeepCopy())
163+
if sandbox.Annotations == nil {
164+
sandbox.Annotations = make(map[string]string)
165+
}
166+
sandbox.Annotations[readinessObserved] = "true"
167+
if err := r.Patch(ctx, sandbox, patch); err != nil {
168+
log.Error(err, "Failed to add first-ready-metric annotation")
169+
return err
170+
}
171+
return nil
172+
}
173+
126174
func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
127175
// Create a hash from the sandbox.Name and use it as label value
128176
nameHash := NameHash(sandbox.Name)

controllers/sandbox_controller_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,17 @@ package controllers
1616

1717
import (
1818
"errors"
19+
"strings"
1920
"testing"
2021
"time"
2122

2223
"github.com/google/go-cmp/cmp"
2324
"github.com/google/go-cmp/cmp/cmpopts"
25+
"github.com/prometheus/client_golang/prometheus/testutil"
2426
"github.com/stretchr/testify/require"
2527
corev1 "k8s.io/api/core/v1"
2628
k8serrors "k8s.io/apimachinery/pkg/api/errors"
29+
"k8s.io/apimachinery/pkg/api/meta"
2730
"k8s.io/apimachinery/pkg/api/resource"
2831
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2932
"k8s.io/apimachinery/pkg/runtime"
@@ -846,3 +849,94 @@ func TestSandboxExpiry(t *testing.T) {
846849
})
847850
}
848851
}
852+
853+
func TestSandboxCreationLatencyMetric(t *testing.T) {
854+
sandboxName := "sandbox-name"
855+
sandboxNs := "sandbox-ns"
856+
sb := &sandboxv1alpha1.Sandbox{}
857+
sb.Name = sandboxName
858+
sb.Namespace = sandboxNs
859+
sb.Generation = 1
860+
sb.CreationTimestamp = metav1.NewTime(time.Now())
861+
sb.Spec = sandboxv1alpha1.SandboxSpec{
862+
PodTemplate: sandboxv1alpha1.PodTemplate{
863+
Spec: corev1.PodSpec{
864+
Containers: []corev1.Container{
865+
{
866+
Name: "test-container",
867+
},
868+
},
869+
},
870+
},
871+
}
872+
873+
r := SandboxReconciler{
874+
Client: newFakeClient(sb),
875+
Scheme: Scheme,
876+
}
877+
878+
_, err := r.Reconcile(t.Context(), ctrl.Request{
879+
NamespacedName: types.NamespacedName{
880+
Name: sandboxName,
881+
Namespace: sandboxNs,
882+
},
883+
})
884+
require.NoError(t, err)
885+
886+
// get pod and mark it ready
887+
pod := &corev1.Pod{}
888+
require.NoError(t, r.Get(t.Context(), types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}, pod))
889+
pod.Status.Phase = corev1.PodRunning
890+
pod.Status.Conditions = []corev1.PodCondition{
891+
{
892+
Type: corev1.PodReady,
893+
Status: corev1.ConditionTrue,
894+
},
895+
}
896+
require.NoError(t, r.Status().Update(t.Context(), pod))
897+
898+
_, err = r.Reconcile(t.Context(), ctrl.Request{
899+
NamespacedName: types.NamespacedName{
900+
Name: sandboxName,
901+
Namespace: sandboxNs,
902+
},
903+
})
904+
require.NoError(t, err)
905+
906+
// Validate Sandbox status
907+
liveSandbox := &sandboxv1alpha1.Sandbox{}
908+
require.NoError(t, r.Get(t.Context(), types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}, liveSandbox))
909+
require.True(t, meta.IsStatusConditionTrue(liveSandbox.Status.Conditions, "Ready"))
910+
require.NotNil(t, liveSandbox.Annotations)
911+
require.Equal(t, "true", liveSandbox.Annotations[readinessObserved])
912+
913+
// Check metric
914+
expected := `
915+
# HELP sandbox_creation_latency Time taken from sandbox creation to sandbox ready in milliseconds
916+
# TYPE sandbox_creation_latency histogram
917+
sandbox_creation_latency_bucket{le="50"} 1
918+
919+
sandbox_creation_latency_bucket{le="100"} 1
920+
sandbox_creation_latency_bucket{le="200"} 1
921+
sandbox_creation_latency_bucket{le="300"} 1
922+
sandbox_creation_latency_bucket{le="500"} 1
923+
sandbox_creation_latency_bucket{le="700"} 1
924+
sandbox_creation_latency_bucket{le="1000"} 1
925+
sandbox_creation_latency_bucket{le="1500"} 1
926+
sandbox_creation_latency_bucket{le="2000"} 1
927+
sandbox_creation_latency_bucket{le="3000"} 1
928+
sandbox_creation_latency_bucket{le="4500"} 1
929+
sandbox_creation_latency_bucket{le="6000"} 1
930+
sandbox_creation_latency_bucket{le="9000"} 1
931+
sandbox_creation_latency_bucket{le="12000"} 1
932+
sandbox_creation_latency_bucket{le="18000"} 1
933+
sandbox_creation_latency_bucket{le="30000"} 1
934+
sandbox_creation_latency_bucket{le="+Inf"} 1
935+
sandbox_creation_latency_count 1
936+
`
937+
err = testutil.CollectAndCompare(sandboxCreationLatency, strings.NewReader(expected), "sandbox_creation_latency")
938+
// We ignore the error because the sum is not deterministic
939+
if err != nil && !strings.Contains(err.Error(), "sandbox_creation_latency_sum") {
940+
require.NoError(t, err)
941+
}
942+
}

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go 1.24.4
44

55
require (
66
github.com/google/go-cmp v0.7.0
7+
github.com/prometheus/client_golang v1.23.2
78
github.com/stretchr/testify v1.11.1
89
k8s.io/api v0.34.1
910
k8s.io/apiextensions-apiserver v0.34.1
@@ -43,13 +44,13 @@ require (
4344
github.com/google/gnostic-models v0.7.0 // indirect
4445
github.com/google/uuid v1.6.0 // indirect
4546
github.com/json-iterator/go v1.1.12 // indirect
47+
github.com/kylelemons/godebug v1.1.0 // indirect
4648
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
4749
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
4850
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4951
github.com/onsi/ginkgo/v2 v2.23.3 // indirect
5052
github.com/onsi/gomega v1.37.0 // indirect
5153
github.com/pmezard/go-difflib v1.0.0 // indirect
52-
github.com/prometheus/client_golang v1.23.2 // indirect
5354
github.com/prometheus/client_model v0.6.2 // indirect
5455
github.com/prometheus/common v0.67.1 // indirect
5556
github.com/prometheus/procfs v0.17.0 // indirect

0 commit comments

Comments
 (0)