Skip to content

Commit 3c7ebd9

Browse files
pkg/cache/scheduler: add exclusion stats to TAS failure messages
Include detailed node exclusion reasons (taints, nodeSelector, affinity, resources) in TAS scheduling failure messages to improve debuggability.

Fixes: #7854
Signed-off-by: Sohan Kunkerkar <[email protected]>
1 parent 81cae06 commit 3c7ebd9

File tree

5 files changed

+326
-55
lines changed

5 files changed

+326
-55
lines changed

pkg/cache/scheduler/tas_cache_test.go

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1329,7 +1329,7 @@ func TestFindTopologyAssignments(t *testing.T) {
13291329
corev1.ResourceCPU: 4000,
13301330
},
13311331
count: 1,
1332-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
1332+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 4; excluded: resource "cpu": 4`,
13331333
}},
13341334
},
13351335
"block required; too many Pods to fit requested; BestFit": {
@@ -1698,7 +1698,7 @@ func TestFindTopologyAssignments(t *testing.T) {
16981698
corev1.ResourceCPU: 600,
16991699
},
17001700
count: 1,
1701-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
1701+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: resource "cpu": 1`,
17021702
}},
17031703
},
17041704
"include usage from running non-TAS pods, blocked assignment; BestFit": {
@@ -1729,7 +1729,7 @@ func TestFindTopologyAssignments(t *testing.T) {
17291729
corev1.ResourceCPU: 600,
17301730
},
17311731
count: 1,
1732-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
1732+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: resource "cpu": 1`,
17331733
}},
17341734
},
17351735
"include usage from running non-TAS pods, found free capacity on another node; BestFit": {
@@ -1873,7 +1873,91 @@ func TestFindTopologyAssignments(t *testing.T) {
18731873
corev1.ResourceCPU: 1000,
18741874
},
18751875
count: 1,
1876-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
1876+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: taint "example.com/gpu=present:NoSchedule": 1`,
1877+
}},
1878+
},
1879+
"detailed failure message with exclusion stats": {
1880+
nodes: []corev1.Node{
1881+
*testingnode.MakeNode("x1").
1882+
Label(corev1.LabelHostname, "x1").
1883+
Taints(corev1.Taint{
1884+
Key: "key",
1885+
Value: "value",
1886+
Effect: corev1.TaintEffectNoSchedule,
1887+
}).
1888+
StatusAllocatable(corev1.ResourceList{
1889+
corev1.ResourceCPU: resource.MustParse("1"),
1890+
corev1.ResourcePods: resource.MustParse("10"),
1891+
}).
1892+
Ready().
1893+
Obj(),
1894+
*testingnode.MakeNode("x2").
1895+
Label(corev1.LabelHostname, "x2").
1896+
Label("zone", "zone-b"). // Wrong zone
1897+
StatusAllocatable(corev1.ResourceList{
1898+
corev1.ResourceCPU: resource.MustParse("1"),
1899+
corev1.ResourcePods: resource.MustParse("10"),
1900+
}).
1901+
Ready().
1902+
Obj(),
1903+
*testingnode.MakeNode("x3").
1904+
Label(corev1.LabelHostname, "x3").
1905+
Label("zone", "zone-b"). // Wrong zone for nodeSelector
1906+
StatusAllocatable(corev1.ResourceList{
1907+
corev1.ResourceCPU: resource.MustParse("2"),
1908+
corev1.ResourcePods: resource.MustParse("10"),
1909+
}).
1910+
Ready().
1911+
Obj(),
1912+
*testingnode.MakeNode("x4").
1913+
Label(corev1.LabelHostname, "x4").
1914+
Label("zone", "zone-a"). // Correct zone but insufficient CPU
1915+
StatusAllocatable(corev1.ResourceList{
1916+
corev1.ResourceCPU: resource.MustParse("100m"),
1917+
corev1.ResourcePods: resource.MustParse("10"),
1918+
}).
1919+
Ready().
1920+
Obj(),
1921+
},
1922+
levels: defaultOneLevel,
1923+
podSets: []PodSetTestCase{{
1924+
topologyRequest: &kueue.PodSetTopologyRequest{
1925+
Required: ptr.To(corev1.LabelHostname),
1926+
},
1927+
requests: resources.Requests{
1928+
corev1.ResourceCPU: 1000,
1929+
},
1930+
nodeSelector: map[string]string{
1931+
"zone": "zone-a",
1932+
},
1933+
count: 1,
1934+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 4; excluded: nodeSelector: 2, resource "cpu": 1, taint "key=value:NoSchedule": 1`,
1935+
}},
1936+
},
1937+
"resource exclusion picks most restrictive resource": {
1938+
nodes: []corev1.Node{
1939+
*testingnode.MakeNode("dual-shortage").
1940+
Label(corev1.LabelHostname, "dual-shortage").
1941+
StatusAllocatable(corev1.ResourceList{
1942+
corev1.ResourceCPU: resource.MustParse("500m"),
1943+
corev1.ResourcePods: resource.MustParse("10"),
1944+
corev1.ResourceName("example.com/gpu"): resource.MustParse("0"),
1945+
}).
1946+
Ready().
1947+
Obj(),
1948+
},
1949+
levels: defaultOneLevel,
1950+
podSets: []PodSetTestCase{{
1951+
topologyRequest: &kueue.PodSetTopologyRequest{
1952+
Required: ptr.To(corev1.LabelHostname),
1953+
},
1954+
requests: resources.Requests{
1955+
corev1.ResourceCPU: 1000,
1956+
corev1.ResourceName("example.com/gpu"): 1,
1957+
},
1958+
count: 1,
1959+
// When both resources give count=0, alphabetical tie-breaking picks "cpu"
1960+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: resource "cpu": 1`,
18771961
}},
18781962
},
18791963
"allow to schedule on node with tolerated taint; BestFit": {
@@ -1957,7 +2041,7 @@ func TestFindTopologyAssignments(t *testing.T) {
19572041
corev1.ResourceCPU: 300,
19582042
},
19592043
count: 1,
1960-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
2044+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: resource "pods": 1`,
19612045
}},
19622046
},
19632047
"skip node which doesn't match node selector, missing label; BestFit": {
@@ -1986,7 +2070,7 @@ func TestFindTopologyAssignments(t *testing.T) {
19862070
corev1.ResourceCPU: 300,
19872071
},
19882072
count: 1,
1989-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
2073+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: nodeSelector: 1`,
19902074
nodeSelector: map[string]string{
19912075
"custom-label-1": "custom-value-1",
19922076
},
@@ -2018,7 +2102,7 @@ func TestFindTopologyAssignments(t *testing.T) {
20182102
corev1.ResourceCPU: 300,
20192103
},
20202104
count: 1,
2021-
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s)`,
2105+
wantReason: `topology "default" doesn't allow to fit any of 1 pod(s). Total nodes: 1; excluded: nodeSelector: 1`,
20222106
nodeSelector: map[string]string{
20232107
"custom-label-1": "value-2",
20242108
},
@@ -4900,7 +4984,7 @@ func TestFindTopologyAssignments(t *testing.T) {
49004984
podSetGroupName: ptr.To("sameGroup"),
49014985
count: 1,
49024986
wantAssignment: nil,
4903-
wantReason: `topology "default" allows to fit only 4 out of 4 pod(s)`,
4987+
wantReason: `topology "default" allows to fit only 4 out of 4 pod(s). Total nodes: 2; excluded: resource "example.com/gpu": 1`,
49044988
},
49054989
{
49064990
podSetName: "workers",
@@ -4914,7 +4998,7 @@ func TestFindTopologyAssignments(t *testing.T) {
49144998
podSetGroupName: ptr.To("sameGroup"),
49154999
count: 4,
49165000
wantAssignment: nil,
4917-
wantReason: `topology "default" allows to fit only 4 out of 4 pod(s)`,
5001+
wantReason: `topology "default" allows to fit only 4 out of 4 pod(s). Total nodes: 2; excluded: resource "example.com/gpu": 1`,
49185002
},
49195003
},
49205004
},
@@ -5574,7 +5658,7 @@ func TestFindTopologyAssignments(t *testing.T) {
55745658
corev1.ResourceMemory: 1000,
55755659
},
55765660
count: 1,
5577-
wantReason: "topology \"default\" doesn't allow to fit any of 1 pod(s)",
5661+
wantReason: "topology \"default\" doesn't allow to fit any of 1 pod(s). Total nodes: 2; excluded: nodeSelector: 2",
55785662
},
55795663
},
55805664
},

0 commit comments

Comments
 (0)