@@ -23,6 +23,7 @@ import (
2323 "github.com/go-logr/logr"
2424 appsv1 "k8s.io/api/apps/v1"
2525 corev1 "k8s.io/api/core/v1"
26+ apierrors "k8s.io/apimachinery/pkg/api/errors"
2627 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2728 "k8s.io/apimachinery/pkg/labels"
2829 "k8s.io/apimachinery/pkg/types"
@@ -37,6 +38,7 @@ import (
3738 "sigs.k8s.io/controller-runtime/pkg/predicate"
3839 "sigs.k8s.io/controller-runtime/pkg/reconcile"
3940
41+ kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
4042 "sigs.k8s.io/kueue/pkg/controller/jobframework"
4143 podcontroller "sigs.k8s.io/kueue/pkg/controller/jobs/pod/constants"
4244 clientutil "sigs.k8s.io/kueue/pkg/util/client"
@@ -65,11 +67,78 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
6567 log := ctrl .LoggerFrom (ctx )
6668 log .V (2 ).Info ("Reconcile StatefulSet" )
6769
68- err := r .fetchAndFinalizePods (ctx , req )
70+ sts := & appsv1.StatefulSet {}
71+ if err := r .client .Get (ctx , req .NamespacedName , sts ); err != nil {
72+ if client .IgnoreNotFound (err ) != nil {
73+ return ctrl.Result {}, err
74+ }
75+ // StatefulSet was deleted, finalize orphaned pods.
76+ return ctrl.Result {}, r .fetchAndFinalizePods (ctx , req , nil )
77+ }
78+
79+ // Ensure the StatefulSet is an owner of the workload to prevent
80+ // Kubernetes GC from deleting it when pods are replaced during rolling updates.
81+ // We log errors but don't block pod finalization, as ownership is best-effort.
82+ if err := r .ensureWorkloadOwnership (ctx , sts ); err != nil {
83+ log .Error (err , "Failed to ensure workload ownership, will retry" )
84+ }
85+
86+ err := r .fetchAndFinalizePods (ctx , req , sts )
6987 return ctrl.Result {}, err
7088}
7189
72- func (r * Reconciler ) fetchAndFinalizePods (ctx context.Context , req reconcile.Request ) error {
90+ // ensureWorkloadOwnership ensures the StatefulSet is an owner of the workload.
91+ // This prevents Kubernetes GC from deleting the workload when pods are replaced
92+ // during rolling updates.
93+ func (r * Reconciler ) ensureWorkloadOwnership (ctx context.Context , sts * appsv1.StatefulSet ) error {
94+ if sts == nil {
95+ return nil
96+ }
97+
98+ log := ctrl .LoggerFrom (ctx )
99+ workloadName := GetWorkloadName (sts .Name )
100+
101+ wl := & kueue.Workload {}
102+ if err := r .client .Get (ctx , types.NamespacedName {Name : workloadName , Namespace : sts .Namespace }, wl ); err != nil {
103+ if apierrors .IsNotFound (err ) {
104+ return nil
105+ }
106+ return err
107+ }
108+
109+ // Check if StatefulSet is already an owner.
110+ for _ , ref := range wl .GetOwnerReferences () {
111+ if ref .UID == sts .UID {
112+ return nil
113+ }
114+ }
115+
116+ // Add StatefulSet as an owner.
117+ if err := controllerutil .SetOwnerReference (sts , wl , r .client .Scheme ()); err != nil {
118+ return err
119+ }
120+
121+ if err := r .client .Update (ctx , wl ); err != nil {
122+ if apierrors .IsConflict (err ) {
123+ // Conflict means the workload was modified concurrently; ownership may not be set.
124+ // Will retry on next reconciliation.
125+ log .V (4 ).Info ("Conflict updating workload ownership, will retry on next reconciliation" ,
126+ "statefulset" , klog .KObj (sts ),
127+ "workload" , klog .KObj (wl ),
128+ )
129+ return nil
130+ }
131+ return err
132+ }
133+ log .V (3 ).Info ("Added StatefulSet as owner of workload" ,
134+ "statefulset" , klog .KObj (sts ),
135+ "workload" , klog .KObj (wl ),
136+ )
137+
138+ return nil
139+ }
140+
141+ func (r * Reconciler ) fetchAndFinalizePods (ctx context.Context , req reconcile.Request , sts * appsv1.StatefulSet ) error {
73142 podList := & corev1.PodList {}
74143 if err := r .client .List (ctx , podList , client .InNamespace (req .Namespace ), client.MatchingLabels {
75144 podcontroller .GroupNameLabel : GetWorkloadName (req .Name ),
@@ -82,16 +151,6 @@ func (r *Reconciler) fetchAndFinalizePods(ctx context.Context, req reconcile.Req
82151 return nil
83152 }
84153
85- sts := & appsv1.StatefulSet {}
86- err := r .client .Get (ctx , req .NamespacedName , sts )
87- if client .IgnoreNotFound (err ) != nil {
88- return err
89- }
90-
91- if err != nil {
92- sts = nil
93- }
94-
95154 return r .finalizePods (ctx , sts , podList .Items )
96155}
97156
0 commit comments