Skip to content

Commit c2cf5ae

Browse files
committed
feat: add sandbox_creation_latency metric
Adds a new histogram metric to track the time it takes for a sandbox to become ready after it has been created.
1 parent dca369b commit c2cf5ae

File tree

3 files changed

+143
-1
lines changed

3 files changed

+143
-1
lines changed

controllers/sandbox_controller.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"reflect"
2323
"time"
2424

25+
"github.com/prometheus/client_golang/prometheus"
2526
corev1 "k8s.io/api/core/v1"
2627
k8serrors "k8s.io/apimachinery/pkg/api/errors"
2728
"k8s.io/apimachinery/pkg/api/meta"
@@ -36,6 +37,7 @@ import (
3637
"sigs.k8s.io/controller-runtime/pkg/client"
3738
"sigs.k8s.io/controller-runtime/pkg/handler"
3839
"sigs.k8s.io/controller-runtime/pkg/log"
40+
"sigs.k8s.io/controller-runtime/pkg/metrics"
3941
"sigs.k8s.io/controller-runtime/pkg/predicate"
4042

4143
sandboxv1alpha1 "sigs.k8s.io/agent-sandbox/api/v1alpha1"
@@ -45,16 +47,26 @@ const (
4547
sandboxLabel = "agents.x-k8s.io/sandbox-name-hash"
4648
SanboxPodNameAnnotation = "agents.x-k8s.io/pod-name"
4749
sandboxControllerFieldOwner = "sandbox-controller"
50+
readinessObserved = "agents.x-k8s.io/readiness-observed"
4851
)
4952

5053
var (
5154
// Scheme for use by sandbox controllers. Registers required types for client.
5255
Scheme = runtime.NewScheme()
56+
57+
sandboxCreationLatency = prometheus.NewHistogram(
58+
prometheus.HistogramOpts{
59+
Name: "sandbox_creation_latency_ms",
60+
Help: "Time taken from sandbox creation to sandbox ready in milliseconds",
61+
Buckets: []float64{50, 100, 200, 300, 500, 700, 1000, 1500, 2000, 3000, 4500, 6000, 9000, 12000, 18000, 30000},
62+
},
63+
)
5364
)
5465

5566
func init() {
5667
utilruntime.Must(clientgoscheme.AddToScheme(Scheme))
5768
utilruntime.Must(sandboxv1alpha1.AddToScheme(Scheme))
69+
metrics.Registry.MustRegister(sandboxCreationLatency)
5870
}
5971

6072
// SandboxReconciler reconciles a Sandbox object
@@ -116,13 +128,48 @@ func (r *SandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
116128
// Update status
117129
if statusUpdateErr := r.updateStatus(ctx, oldStatus, sandbox); statusUpdateErr != nil {
118130
// Surface update error
131+
statusUpdateErr = fmt.Errorf("faild to update status: %w", statusUpdateErr)
119132
err = errors.Join(err, statusUpdateErr)
120133
}
121134

135+
if recordErr := r.recordFirstReadyMetric(ctx, sandbox); recordErr != nil {
136+
err = errors.Join(err, recordErr)
137+
}
138+
122139
// return errors seen
123140
return ctrl.Result{RequeueAfter: requeueAfter}, err
124141
}
125142

143+
func (r *SandboxReconciler) recordFirstReadyMetric(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
144+
log := log.FromContext(ctx)
145+
146+
// If readiness was observed already dont re-record the metric
147+
if sandbox.Annotations != nil && sandbox.Annotations[readinessObserved] != "" {
148+
return nil
149+
}
150+
151+
// If not ready dont record metric
152+
if !meta.IsStatusConditionTrue(sandbox.Status.Conditions, string(sandboxv1alpha1.SandboxConditionReady)) {
153+
return nil
154+
}
155+
156+
// record metric
157+
latency := time.Since(sandbox.CreationTimestamp.Time).Milliseconds()
158+
sandboxCreationLatency.Observe(float64(latency))
159+
160+
// add annotation
161+
patch := client.MergeFrom(sandbox.DeepCopy())
162+
if sandbox.Annotations == nil {
163+
sandbox.Annotations = make(map[string]string)
164+
}
165+
sandbox.Annotations[readinessObserved] = fmt.Sprintf("%d", latency)
166+
if err := r.Patch(ctx, sandbox, patch); err != nil {
167+
log.Error(err, "Failed to add first-ready-metric annotation")
168+
return err
169+
}
170+
return nil
171+
}
172+
126173
func (r *SandboxReconciler) reconcileChildResources(ctx context.Context, sandbox *sandboxv1alpha1.Sandbox) error {
127174
// Create a hash from the sandbox.Name and use it as label value
128175
nameHash := NameHash(sandbox.Name)

controllers/sandbox_controller_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,17 @@ package controllers
1616

1717
import (
1818
"errors"
19+
"strings"
1920
"testing"
2021
"time"
2122

2223
"github.com/google/go-cmp/cmp"
2324
"github.com/google/go-cmp/cmp/cmpopts"
25+
"github.com/prometheus/client_golang/prometheus/testutil"
2426
"github.com/stretchr/testify/require"
2527
corev1 "k8s.io/api/core/v1"
2628
k8serrors "k8s.io/apimachinery/pkg/api/errors"
29+
"k8s.io/apimachinery/pkg/api/meta"
2730
"k8s.io/apimachinery/pkg/api/resource"
2831
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2932
"k8s.io/apimachinery/pkg/runtime"
@@ -846,3 +849,94 @@ func TestSandboxExpiry(t *testing.T) {
846849
})
847850
}
848851
}
852+
853+
func TestSandboxCreationLatencyMetric(t *testing.T) {
854+
sandboxName := "sandbox-name"
855+
sandboxNs := "sandbox-ns"
856+
sb := &sandboxv1alpha1.Sandbox{}
857+
sb.Name = sandboxName
858+
sb.Namespace = sandboxNs
859+
sb.Generation = 1
860+
sb.CreationTimestamp = metav1.NewTime(time.Now())
861+
sb.Spec = sandboxv1alpha1.SandboxSpec{
862+
PodTemplate: sandboxv1alpha1.PodTemplate{
863+
Spec: corev1.PodSpec{
864+
Containers: []corev1.Container{
865+
{
866+
Name: "test-container",
867+
},
868+
},
869+
},
870+
},
871+
}
872+
873+
r := SandboxReconciler{
874+
Client: newFakeClient(sb),
875+
Scheme: Scheme,
876+
}
877+
878+
_, err := r.Reconcile(t.Context(), ctrl.Request{
879+
NamespacedName: types.NamespacedName{
880+
Name: sandboxName,
881+
Namespace: sandboxNs,
882+
},
883+
})
884+
require.NoError(t, err)
885+
886+
// get pod and mark it ready
887+
pod := &corev1.Pod{}
888+
require.NoError(t, r.Get(t.Context(), types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}, pod))
889+
pod.Status.Phase = corev1.PodRunning
890+
pod.Status.Conditions = []corev1.PodCondition{
891+
{
892+
Type: corev1.PodReady,
893+
Status: corev1.ConditionTrue,
894+
},
895+
}
896+
require.NoError(t, r.Status().Update(t.Context(), pod))
897+
898+
_, err = r.Reconcile(t.Context(), ctrl.Request{
899+
NamespacedName: types.NamespacedName{
900+
Name: sandboxName,
901+
Namespace: sandboxNs,
902+
},
903+
})
904+
require.NoError(t, err)
905+
906+
// Validate Sandbox status
907+
liveSandbox := &sandboxv1alpha1.Sandbox{}
908+
require.NoError(t, r.Get(t.Context(), types.NamespacedName{Name: sandboxName, Namespace: sandboxNs}, liveSandbox))
909+
require.True(t, meta.IsStatusConditionTrue(liveSandbox.Status.Conditions, "Ready"))
910+
require.NotNil(t, liveSandbox.Annotations)
911+
require.Equal(t, "true", liveSandbox.Annotations[readinessObserved])
912+
913+
// Check metric
914+
expected := `
915+
# HELP sandbox_creation_latency Time taken from sandbox creation to sandbox ready in milliseconds
916+
# TYPE sandbox_creation_latency histogram
917+
sandbox_creation_latency_bucket{le="50"} 1
918+
919+
sandbox_creation_latency_bucket{le="100"} 1
920+
sandbox_creation_latency_bucket{le="200"} 1
921+
sandbox_creation_latency_bucket{le="300"} 1
922+
sandbox_creation_latency_bucket{le="500"} 1
923+
sandbox_creation_latency_bucket{le="700"} 1
924+
sandbox_creation_latency_bucket{le="1000"} 1
925+
sandbox_creation_latency_bucket{le="1500"} 1
926+
sandbox_creation_latency_bucket{le="2000"} 1
927+
sandbox_creation_latency_bucket{le="3000"} 1
928+
sandbox_creation_latency_bucket{le="4500"} 1
929+
sandbox_creation_latency_bucket{le="6000"} 1
930+
sandbox_creation_latency_bucket{le="9000"} 1
931+
sandbox_creation_latency_bucket{le="12000"} 1
932+
sandbox_creation_latency_bucket{le="18000"} 1
933+
sandbox_creation_latency_bucket{le="30000"} 1
934+
sandbox_creation_latency_bucket{le="+Inf"} 1
935+
sandbox_creation_latency_count 1
936+
`
937+
err = testutil.CollectAndCompare(sandboxCreationLatency, strings.NewReader(expected), "sandbox_creation_latency")
938+
// We ignore the error because the sum is not deterministic
939+
if err != nil && !strings.Contains(err.Error(), "sandbox_creation_latency_sum") {
940+
require.NoError(t, err)
941+
}
942+
}

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go 1.24.4
44

55
require (
66
github.com/google/go-cmp v0.7.0
7+
github.com/prometheus/client_golang v1.23.2
78
github.com/stretchr/testify v1.11.1
89
k8s.io/api v0.34.1
910
k8s.io/apiextensions-apiserver v0.34.1
@@ -43,13 +44,13 @@ require (
4344
github.com/google/gnostic-models v0.7.0 // indirect
4445
github.com/google/uuid v1.6.0 // indirect
4546
github.com/json-iterator/go v1.1.12 // indirect
47+
github.com/kylelemons/godebug v1.1.0 // indirect
4648
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
4749
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
4850
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
4951
github.com/onsi/ginkgo/v2 v2.23.3 // indirect
5052
github.com/onsi/gomega v1.37.0 // indirect
5153
github.com/pmezard/go-difflib v1.0.0 // indirect
52-
github.com/prometheus/client_golang v1.23.2 // indirect
5354
github.com/prometheus/client_model v0.6.2 // indirect
5455
github.com/prometheus/common v0.67.1 // indirect
5556
github.com/prometheus/procfs v0.17.0 // indirect

0 commit comments

Comments
 (0)