diff --git a/docs/configuration/nodegroup.md b/docs/configuration/nodegroup.md index a1ba240..a5ab39d 100644 --- a/docs/configuration/nodegroup.md +++ b/docs/configuration/nodegroup.md @@ -29,12 +29,12 @@ node_groups: taint_effect: NoExecute max_node_age: 24h aws: - fleet_instance_ready_timeout: 1m - launch_template_id: lt-1a2b3c4d - launch_template_version: "1" - lifecycle: on-demand - instance_type_overrides: ["t2.large", "t3.large"] - resource_tagging: false + fleet_instance_ready_timeout: 1m + launch_template_id: lt-1a2b3c4d + launch_template_version: "1" + lifecycle: on-demand + instance_type_overrides: ["t2.large", "t3.large"] + resource_tagging: false ``` ## Options @@ -273,3 +273,31 @@ When not at the minimum, the natural scaling up and down of the node group will node group. This is an optional feature and by default is disabled. + +### `unhealthy_node_grace_period` + +Defines the minimum age of a node before it can be tested to check if it is unhealthy. + +When enabled, instances can be tested periodically to determine if they are healthy. Escalator will pause all scaling activity and flush out unhealthy instances if they go above a configured threshold for the nodegroup. It will continuously do this until enough instances in the nodegroup are healthy and normal scaling activity can resume. + +Cordoned nodes are skipped and can never be considered unhealthy. + +This is an optional field. The default value is empty, which disables the feature. + +### `health_check_newest_nodes_percent` + +**[Only used if `unhealthy_node_grace_period` is set.]** + +The percentage of nodes (ordered by age from newer to older) in the nodegroup that are considered when checking for the maximum allowed unhealthy nodes in the nodegroup. The nodes captured by this percentage form the "test set" to be checked. Only nodes which are older than `unhealthy_node_grace_period` will be included in the test set. + +This field is required. + +### `max_unhealthy_nodes_percent` + +**[Only used if `unhealthy_node_grace_period` is set.]** + +The maximum percentage of unhealthy nodes in the test set from `health_check_newest_nodes_percent`. Beyond this threshold all scaling activity is paused and unhealthy nodes are flushed out. + +> **Note:** The valid range for `max_unhealthy_nodes_percent` is `0%` to `99%`. + +This is an optional field. If not set, it will default to `0%`. diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index 008040f..0c44493 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -2,6 +2,7 @@ package controller import ( "math" + "sort" "time" "github.com/atlassian/escalator/pkg/cloudprovider" @@ -227,6 +228,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState) nodeGroup.memCapacity = *allNodes[0].Status.Allocatable.Memory() } + // Taint all instances considered to be unhealthy before filtering the nodes + // into groups. 
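+	// Tainting them here means the filter below counts them as tainted
+	// capacity, so they stop receiving new pods and are reaped once empty.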
+	if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 {
+		c.taintUnhealthyInstances(allNodes, nodeGroup)
+	}
+
 	// Filter into untainted and tainted nodes
 	untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes)
 
@@ -420,6 +427,22 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 		log.WithField("nodegroup", nodegroup).Error(forceActionErr)
 	}
 
+	// If the nodegroup is considered to be unhealthy, then prevent any scaling
+	// for the time being and instead try removing tainted nodes to get the
+	// nodegroup into a healthy state again. No healthy nodes should be removed
+	// and no new cloud provider nodes should be added.
+	nodeGroupIsHealthy := true
+
+	if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 {
+		if !c.isNodegroupHealthy(nodeGroup, allNodes) {
+			nodeGroupIsHealthy = false
+			log.WithField("nodegroup", nodegroup).Infof("NodegroupUnhealthy: nodesDelta overridden to 0 from %d because the nodegroup is unhealthy", nodesDelta)
+			nodesDelta = 0
+		}
+	}
+
+	c.reportNodeGroupHealthMetric(nodegroup, nodeGroupIsHealthy)
+
 	// Perform a scale up, do nothing or scale down based on the nodes delta
 	var nodesDeltaResult int
 	// actionErr keeps the error of any action below and checked after action
@@ -439,7 +462,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 		log.WithField("nodegroup", nodegroup).Info("No need to scale")
 		// reap any expired nodes
 		var removed int
-		removed, actionErr = c.TryRemoveTaintedNodes(scaleOptions)
+		removed, actionErr = c.TryRemoveTaintedNodes(scaleOptions, nodeGroupIsHealthy)
 		log.WithField("nodegroup", nodegroup).Infof("Reaper: There were %v empty nodes deleted this round", removed)
 	}
 
@@ -457,6 +480,131 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
 	return nodesDelta, err
 }
 
+// taintUnhealthyInstances finds all unhealthy instances in the nodegroup and
+// adds the taint to mark them for deletion.
+func (c *Controller) taintUnhealthyInstances(nodes []*v1.Node, state *NodeGroupState) []int {
+	bundles := make(nodesByOldestCreationTime, 0, len(nodes))
+
+	for i, node := range nodes {
+		// If the node is deemed healthy then there is nothing to do
+		if !k8s.IsNodeUnhealthy(node, state.Opts.unhealthyNodeGracePeriodDuration) {
+			continue
+		}
+
+		bundles = append(bundles, nodeIndexBundle{node, i})
+	}
+
+	return c.taintInstances(bundles, state, len(bundles))
+}
+
+func (c *Controller) reportNodeGroupHealthMetric(nodegroup string, nodeGroupHealthy bool) {
+	healthy := 1
+
+	if !nodeGroupHealthy {
+		healthy = 0
+	}
+
+	metrics.NodeGroupUnhealthy.WithLabelValues(nodegroup).Set(float64(healthy))
+}
+
+// isNodegroupHealthy checks if the nodegroup is healthy.
+// It does this by checking the health of the newest X% of nodes in the nodegroup that are older than the grace period.
+// If the percentage of unhealthy nodes in this newest set of nodes is greater than the configured threshold, the nodegroup is considered unhealthy.
+func (c *Controller) isNodegroupHealthy(state *NodeGroupState, nodes []*v1.Node) bool {
+	// Sort the nodes in reverse order based on age
+	reversedOrderedNodes := c.getNodesOrderedNewestFirst(nodes)
+
+	// Filter out any nodes which are not old enough for the test group
+	oldEnoughNodes := c.filterOutNodesTooNew(state, reversedOrderedNodes)
+
+	// Out of the nodes that are left, find the most recent configured
+	// percentage of nodes to do the test.
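+	// For example, with health_check_newest_nodes_percent: 50 and 8 nodes past
+	// the grace period, the 4 newest of those nodes form the test set.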
+	nodesForTest := c.getMostRecentNodes(state, oldEnoughNodes)
+
+	// If there are no nodes to test, then the nodegroup is considered healthy.
+	if len(nodesForTest) == 0 {
+		return true
+	}
+
+	// Get the total number of unhealthy nodes in the test set.
+	unhealthyNodesCount := c.countUnhealthyNodes(state, nodesForTest)
+
+	// If the number of unhealthy nodes in the test group exceeds the percentage
+	// allowed then the test has failed.
+	return (unhealthyNodesCount*100)/len(nodesForTest) <= state.Opts.MaxUnhealthyNodesPercent
+}
+
+func (c *Controller) getNodesOrderedNewestFirst(nodes []*v1.Node) []*v1.Node {
+	sortedNodes := make(nodesByOldestCreationTime, 0, len(nodes))
+
+	for i, node := range nodes {
+		sortedNodes = append(sortedNodes, nodeIndexBundle{node, i})
+	}
+
+	// Sort in reverse to get the newest instances at the front to make it
+	// easier to loop through.
+	sort.Sort(sort.Reverse(sortedNodes))
+
+	reverseOrderedNodes := make([]*v1.Node, 0, len(nodes))
+
+	for _, sortedNode := range sortedNodes {
+		reverseOrderedNodes = append(reverseOrderedNodes, sortedNode.node)
+	}
+
+	return reverseOrderedNodes
+}
+
+// filterOutNodesTooNew returns the list of nodes which are at least as old as
+// the health check grace period duration configured for the nodegroup. Younger
+// nodes are considered too new: they still have a chance to be not Ready for
+// legitimate startup reasons, so they should not be considered.
+func (c *Controller) filterOutNodesTooNew(state *NodeGroupState, nodes []*v1.Node) []*v1.Node {
+	now := time.Now()
+	newNodes := make([]*v1.Node, 0)
+
+	for _, node := range nodes {
+		// Check if the node is old enough to be included in the new list
+		if node.CreationTimestamp.Add(state.Opts.unhealthyNodeGracePeriodDuration).Before(now) {
+			newNodes = append(newNodes, node)
+		}
+	}
+
+	return newNodes
+}
+
+// getMostRecentNodes returns the most recent X% of instances from the given
+// list of nodes.
+func (c *Controller) getMostRecentNodes(state *NodeGroupState, nodes []*v1.Node) []*v1.Node {
+	// Round up rather than down from HealthCheckNewestNodesPercent so that if
+	// there is a single instance then a non-100% percentage will still result
+	// in testing the instance. We want to test more rather than less.
+	numberOfNodes := int(math.Ceil((float64(state.Opts.HealthCheckNewestNodesPercent) / 100) * float64(len(nodes))))
+	recentNodes := make([]*v1.Node, 0)
+
+	for i, node := range nodes {
+		if i == numberOfNodes {
+			break
+		}
+
+		recentNodes = append(recentNodes, node)
+	}
+
+	return recentNodes
+}
+
+func (c *Controller) countUnhealthyNodes(state *NodeGroupState, nodes []*v1.Node) int {
+	unhealthyNodesCount := 0
+
+	for _, node := range nodes {
+		// Include the unhealthyNodeGracePeriodDuration in the call to be 100%
+		// sure that we are not counting nodes which are too young as unhealthy.
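+		// Cordoned nodes are also skipped: IsNodeUnhealthy never reports an
+		// unschedulable node as unhealthy.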
+ if k8s.IsNodeUnhealthy(node, state.Opts.unhealthyNodeGracePeriodDuration) { + unhealthyNodesCount++ + } + } + + return unhealthyNodesCount +} + func (c *Controller) isScaleOnStarve( nodeGroup *NodeGroupState, podRequests k8s.PodRequestedUsage, diff --git a/pkg/controller/controller_test.go b/pkg/controller/controller_test.go index 3085465..55ea024 100644 --- a/pkg/controller/controller_test.go +++ b/pkg/controller/controller_test.go @@ -1,11 +1,16 @@ package controller import ( + "context" + "fmt" "testing" + "time" + "github.com/atlassian/escalator/pkg/k8s" "github.com/atlassian/escalator/pkg/test" "github.com/stretchr/testify/assert" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestControllerDryMode(t *testing.T) { @@ -174,3 +179,376 @@ func TestControllerFilterNodes(t *testing.T) { }) } } + +func TestGetNodesOrderedNewestFirst(t *testing.T) { + c := &Controller{} + now := time.Now() + + nodes := []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", Creation: now.Add(-20 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n2", Creation: now.Add(-5 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", Creation: now.Add(-15 * time.Minute)}), + } + + reversedNodes := c.getNodesOrderedNewestFirst(nodes) + + assert.Equal(t, "n2", reversedNodes[0].Name) + assert.Equal(t, "n3", reversedNodes[1].Name) + assert.Equal(t, "n1", reversedNodes[2].Name) +} + +func TestFilterOutNodesTooNew(t *testing.T) { + c := &Controller{} + now := time.Now() + + type args struct { + state *NodeGroupState + nodes []*v1.Node + } + tests := []struct { + name string + args args + want []string + }{ + { + "filter out new nodes", + args{ + &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: 10 * time.Minute, + }, + }, + []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", Creation: now.Add(-5 * time.Minute)}), + // The threshold needs to be exceeded for the instance to be considered unhealthy + test.BuildTestNode(test.NodeOpts{Name: "n2", Creation: now.Add(-10 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", Creation: now.Add(-15 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n4", Creation: now.Add(-20 * time.Minute)}), + }, + }, + []string{"n2", "n3", "n4"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := c.filterOutNodesTooNew(tt.args.state, tt.args.nodes) + gotNames := make([]string, len(got)) + + for i, node := range got { + gotNames[i] = node.Name + } + + assert.Equal(t, tt.want, gotNames) + }) + } +} + +func TestGetMostRecentNodes(t *testing.T) { + c := &Controller{} + now := time.Now() + + nodes1 := []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", Creation: now.Add(-5 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n2", Creation: now.Add(-15 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", Creation: now.Add(-20 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n4", Creation: now.Add(-30 * time.Minute)}), + } + + nodes2 := []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", Creation: now.Add(-5 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n2", Creation: now.Add(-15 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", Creation: now.Add(-20 * time.Minute)}), + } + + type args struct { + state *NodeGroupState + nodes []*v1.Node + } + tests := []struct { + name string + args args + want []string + }{ + { + "get most recent 50% (even starting nodes)", + args{ + &NodeGroupState{ + 
Opts: NodeGroupOptions{ + HealthCheckNewestNodesPercent: 50, + }, + }, + nodes1, + }, + []string{"n1", "n2"}, + }, + { + "get most recent 49% (even starting nodes)", + args{ + &NodeGroupState{ + Opts: NodeGroupOptions{ + HealthCheckNewestNodesPercent: 49, + }, + }, + nodes1, + }, + []string{"n1", "n2"}, + }, + { + "get most recent 75% (even starting nodes)", + args{ + &NodeGroupState{ + Opts: NodeGroupOptions{ + HealthCheckNewestNodesPercent: 75, + }, + }, + nodes1, + }, + []string{"n1", "n2", "n3"}, + }, + { + "get most recent 50% (odd starting nodes)", + args{ + &NodeGroupState{ + Opts: NodeGroupOptions{ + HealthCheckNewestNodesPercent: 50, + }, + }, + nodes2, + }, + []string{"n1", "n2"}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := c.getMostRecentNodes(tt.args.state, tt.args.nodes) + gotNames := make([]string, len(got)) + + for i, node := range got { + gotNames[i] = node.Name + } + + assert.Equal(t, tt.want, gotNames) + }) + } +} + +func TestCountUnhealthyNodes(t *testing.T) { + c := &Controller{} + now := time.Now() + + type args struct { + state *NodeGroupState + nodes []*v1.Node + } + tests := []struct { + name string + args args + want int + }{ + { + "count unhealthy nodes", + args{ + &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: 10 * time.Minute, + }, + }, + []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", NotReady: true, Creation: now.Add(-5 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n2", NotReady: true, Creation: now.Add(-15 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", NotReady: false, Creation: now.Add(-20 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n4", NotReady: true, Creation: now.Add(-30 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n5", NotReady: true, Creation: now.Add(-30 * time.Minute), Unschedulable: true}), + }, + }, + // n2 and n4 are unhealthy and old enough + // n5 is not because it is cordoned and is not taken into account + // escalator. 
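+			// Cordoned nodes are never counted as unhealthy, so only n2 and n4
+			// are expected below.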
+ 2, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := c.countUnhealthyNodes(tt.args.state, tt.args.nodes) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestTaintUnhealthyInstances(t *testing.T) { + now := time.Now() + + type args struct { + nodes []*v1.Node + state *NodeGroupState + } + tests := []struct { + name string + args args + want []string + }{ + { + "taint unhealthy instances", + args{ + []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", NotReady: true, Creation: now.Add(-5 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n2", NotReady: true, Creation: now.Add(-15 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", NotReady: false, Creation: now.Add(-20 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n4", NotReady: true, Creation: now.Add(-30 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n5", NotReady: true, Creation: now.Add(-30 * time.Minute), Unschedulable: true}), + }, + &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: 10 * time.Minute, + }, + }, + }, + []string{"n2", "n4"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient, _ := test.BuildFakeClient(tt.args.nodes, nil) + c := &Controller{ + Client: &Client{ + Interface: fakeClient, + }, + } + + tt.args.state.taintTracker = []string{} + + nodeLister, _ := test.NewTestNodeWatcher(tt.args.nodes, test.NodeListerOptions{}) + podLister, _ := test.NewTestPodWatcher(nil, test.PodListerOptions{}) + tt.args.state.NodeGroupLister = NewNodeGroupLister(podLister, nodeLister, tt.args.state.Opts) + + taintedIndices := c.taintUnhealthyInstances(tt.args.nodes, tt.args.state) + + var taintedNames []string + + for _, index := range taintedIndices { + taintedNames = append(taintedNames, tt.args.nodes[index].Name) + } + + assert.ElementsMatch(t, tt.want, taintedNames) + + for _, name := range tt.want { + updatedNode, err := c.Client.CoreV1().Nodes().Get(context.Background(), name, metav1.GetOptions{}) + assert.NoError(t, err) + + _, tainted := k8s.GetToBeRemovedTaint(updatedNode) + assert.True(t, tainted) + } + }) + } +} + +func TestIsNodegroupHealthy(t *testing.T) { + c := &Controller{} + now := time.Now() + gracePeriod := 10 * time.Minute + + buildNodes := func(total int, unhealthy int, creationTime time.Time) []*v1.Node { + nodes := make([]*v1.Node, total) + for i := 0; i < total; i++ { + notReady := i < unhealthy + nodes[i] = test.BuildTestNode(test.NodeOpts{ + Name: fmt.Sprintf("n%d", i+1), + NotReady: notReady, + Creation: creationTime, + }) + } + return nodes + } + + oldCreationTime := now.Add(-2 * gracePeriod) + newCreationTime := now.Add(-gracePeriod / 2) + + tests := []struct { + name string + state *NodeGroupState + nodes []*v1.Node + healthy bool + }{ + { + name: "0 nodes, healthy", + state: &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: gracePeriod, + HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 50, + }, + }, + nodes: buildNodes(0, 0, oldCreationTime), + healthy: true, + }, + { + name: "0 nodes, max unhealthy 0%, healthy", + state: &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: gracePeriod, + HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 0, + }, + }, + nodes: buildNodes(0, 0, oldCreationTime), + healthy: true, + }, + { + name: "4 nodes, all nodes too new, healthy", + state: &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: gracePeriod, + 
HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 50, + }, + }, + nodes: buildNodes(4, 4, newCreationTime), + healthy: true, + }, + { + name: "4 nodes, all nodes old, unhealthy", + state: &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: gracePeriod, + HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 50, + }, + }, + nodes: buildNodes(4, 4, oldCreationTime), + healthy: false, + }, + { + name: "4 nodes, all healthy, max unhealthy 0%, healthy", + state: &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: gracePeriod, + HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 0, + }, + }, + nodes: buildNodes(4, 0, oldCreationTime), + healthy: true, + }, + { + name: "4 nodes, all unhealthy, max unhealthy 99%, healthy", + state: &NodeGroupState{ + Opts: NodeGroupOptions{ + unhealthyNodeGracePeriodDuration: gracePeriod, + HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 99, + }, + }, + nodes: buildNodes(4, 4, oldCreationTime), + healthy: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := c.isNodegroupHealthy(tt.state, tt.nodes) + assert.Equal(t, tt.healthy, got) + }) + } +} diff --git a/pkg/controller/node_group.go b/pkg/controller/node_group.go index 1e22ada..97b9f06 100644 --- a/pkg/controller/node_group.go +++ b/pkg/controller/node_group.go @@ -49,11 +49,26 @@ type NodeGroupOptions struct { MaxNodeAge string `json:"max_node_age,omitempty" yaml:"max_node_age,omitempty"` + // UnhealthyNodeGracePeriod is the duration to wait before testing if a node + // can be considered unhealthy. + UnhealthyNodeGracePeriod string `json:"unhealthy_node_grace_period,omitempty" yaml:"unhealthy_node_grace_period,omitempty"` + + // HealthCheckNewestNodesPercent is the percentage of newest nodes to check + // for an unhealthy state. The number of instances checked is rounded up + // from the strict percentage supplied. E.g. if 50 is given and there is + // only one instance found, 1 will be used rather than 0. + HealthCheckNewestNodesPercent int `json:"health_check_newest_nodes_percent,omitempty" yaml:"health_check_newest_nodes_percent,omitempty"` + + // MaxUnhealthyNodesPercentage is the maximum percentage of unhealthy nodes + // allowed in the nodegroup at any given time. + MaxUnhealthyNodesPercent int `json:"max_unhealthy_nodes_percent,omitempty" yaml:"max_unhealthy_nodes_percent,omitempty"` + // Private variables for storing the parsed duration from the string - softDeleteGracePeriodDuration time.Duration - hardDeleteGracePeriodDuration time.Duration - scaleUpCoolDownPeriodDuration time.Duration - maxNodeAgeDuration time.Duration + softDeleteGracePeriodDuration time.Duration + hardDeleteGracePeriodDuration time.Duration + scaleUpCoolDownPeriodDuration time.Duration + maxNodeAgeDuration time.Duration + unhealthyNodeGracePeriodDuration time.Duration } // AWSNodeGroupOptions represents a nodegroup running on a cluster that is @@ -130,6 +145,15 @@ func ValidateNodeGroup(nodegroup NodeGroupOptions) []error { checkThat(validMaxNodeAgeDuration(nodegroup.MaxNodeAge), "max_node_age failed to parse into a time.Duration. Set to '0' or '' to disable, or a positive Go duration to enable.") + // UnhealthyNodeGracePeriod is an optional parameter. + if len(nodegroup.UnhealthyNodeGracePeriod) > 0 { + checkThat(nodegroup.UnhealthyNodeGracePeriodDuration() > 0, "unhealthy_node_grace_period failed to parse into a time.Duration. 
check your formatting.") + checkThat(nodegroup.HealthCheckNewestNodesPercent > 0, "health_check_newest_nodes_percent must be greater than 0") + checkThat(nodegroup.HealthCheckNewestNodesPercent <= 100, "health_check_newest_nodes_percent must be less than or equal to 100") + checkThat(nodegroup.MaxUnhealthyNodesPercent >= 0, "max_unhealthy_nodes_percent must be greater than or equal to 0") + checkThat(nodegroup.MaxUnhealthyNodesPercent < 100, "max_unhealthy_nodes_percent must be less than 100") + } + return problems } @@ -203,6 +227,21 @@ func (n *NodeGroupOptions) MaxNodeAgeDuration() time.Duration { return n.maxNodeAgeDuration } +// UnhealthyNodeGracePeriodDuration lazily returns/parses the unhealthyNodeGracePeriod string into a duration +func (n *NodeGroupOptions) UnhealthyNodeGracePeriodDuration() time.Duration { + if n.unhealthyNodeGracePeriodDuration != 0 { + return n.unhealthyNodeGracePeriodDuration + } + + duration, err := time.ParseDuration(n.UnhealthyNodeGracePeriod) + if err != nil { + return 0 + } + + n.unhealthyNodeGracePeriodDuration = duration + return n.unhealthyNodeGracePeriodDuration +} + // autoDiscoverMinMaxNodeOptions returns whether the min_nodes and max_nodes options should be "auto-discovered" from the cloud provider func (n *NodeGroupOptions) autoDiscoverMinMaxNodeOptions() bool { return n.MinNodes == 0 && n.MaxNodes == 0 diff --git a/pkg/controller/node_group_test.go b/pkg/controller/node_group_test.go index 16dd80a..3faba50 100644 --- a/pkg/controller/node_group_test.go +++ b/pkg/controller/node_group_test.go @@ -453,6 +453,9 @@ func TestValidateNodeGroup(t *testing.T) { ScaleUpCoolDownPeriod: "55m", TaintEffect: "NoExecute", MaxNodeAge: "12h", + UnhealthyNodeGracePeriod: "10m", + HealthCheckNewestNodesPercent: 50, + MaxUnhealthyNodesPercent: 50, }, }, nil, @@ -476,6 +479,9 @@ func TestValidateNodeGroup(t *testing.T) { HardDeleteGracePeriod: "1h10m", ScaleUpCoolDownPeriod: "55m", TaintEffect: "", + UnhealthyNodeGracePeriod: "10m", + HealthCheckNewestNodesPercent: 100, + MaxUnhealthyNodesPercent: 1, }, }, nil, @@ -498,7 +504,9 @@ func TestValidateNodeGroup(t *testing.T) { SoftDeleteGracePeriod: "10m", HardDeleteGracePeriod: "1h10m", ScaleUpCoolDownPeriod: "55m", - MaxNodeAge: "", + UnhealthyNodeGracePeriod: "500h", + HealthCheckNewestNodesPercent: 1, + MaxUnhealthyNodesPercent: 99, }, }, nil, @@ -523,12 +531,15 @@ func TestValidateNodeGroup(t *testing.T) { ScaleUpCoolDownPeriod: "55m", TaintEffect: "", MaxNodeAge: "0", + UnhealthyNodeGracePeriod: "", // Equivalent to not being set + HealthCheckNewestNodesPercent: 0, // Equivalent to not being set + MaxUnhealthyNodesPercent: 0, // Equivalent to not being set }, }, nil, }, { - "invalid nodegroup", + "invalid nodegroup 1", args{ NodeGroupOptions{ Name: "", @@ -547,6 +558,9 @@ func TestValidateNodeGroup(t *testing.T) { ScaleUpCoolDownPeriod: "21h21m21s", TaintEffect: "invalid", MaxNodeAge: "bla", + UnhealthyNodeGracePeriod: "bla", + HealthCheckNewestNodesPercent: 101, + MaxUnhealthyNodesPercent: 100, }, }, []string{ @@ -557,6 +571,45 @@ func TestValidateNodeGroup(t *testing.T) { "soft_delete_grace_period failed to parse into a time.Duration. check your formatting.", "taint_effect must be valid kubernetes taint", "max_node_age failed to parse into a time.Duration. Set to '0' or '' to disable, or a positive Go duration to enable.", + "unhealthy_node_grace_period failed to parse into a time.Duration. 
check your formatting.", + "health_check_newest_nodes_percent must be less than or equal to 100", + "max_unhealthy_nodes_percent must be less than 100", + }, + }, + { + "invalid nodegroup 2", + args{ + NodeGroupOptions{ + Name: "", + LabelKey: "customer", + LabelValue: "buileng", + CloudProviderGroupName: "somegroup", + TaintUpperCapacityThresholdPercent: 70, + TaintLowerCapacityThresholdPercent: 90, + ScaleUpThresholdPercent: 100, + MinNodes: 1, + MaxNodes: 0, + SlowNodeRemovalRate: 1, + FastNodeRemovalRate: 2, + SoftDeleteGracePeriod: "10", + HardDeleteGracePeriod: "1h10m", + ScaleUpCoolDownPeriod: "21h21m21s", + TaintEffect: "invalid", + MaxNodeAge: "bla", + UnhealthyNodeGracePeriod: "1m", + HealthCheckNewestNodesPercent: 0, + MaxUnhealthyNodesPercent: 0, + }, + }, + []string{ + "name cannot be empty", + "taint_lower_capacity_threshold_percent must be less than taint_upper_capacity_threshold_percent", + "min_nodes must be less than max_nodes", + "max_nodes must be larger than 0", + "soft_delete_grace_period failed to parse into a time.Duration. check your formatting.", + "taint_effect must be valid kubernetes taint", + "max_node_age failed to parse into a time.Duration. Set to '0' or '' to disable, or a positive Go duration to enable.", + "health_check_newest_nodes_percent must be greater than 0", }, }, } diff --git a/pkg/controller/scale_down.go b/pkg/controller/scale_down.go index 19ac93d..49652e9 100644 --- a/pkg/controller/scale_down.go +++ b/pkg/controller/scale_down.go @@ -21,7 +21,7 @@ const ( // ScaleDown performs the taint and remove node logic func (c *Controller) ScaleDown(opts scaleOpts) (int, error) { - removed, err := c.TryRemoveTaintedNodes(opts) + removed, err := c.TryRemoveTaintedNodes(opts, true) if err != nil { switch err.(type) { // early return when node not in expected autoscaling group is found @@ -68,9 +68,17 @@ func (c *Controller) TryRemoveForceTaintedNodes(opts scaleOpts) (int, error) { // TryRemoveTaintedNodes attempts to remove nodes are // * tainted and empty // * have passed their grace period -func (c *Controller) TryRemoveTaintedNodes(opts scaleOpts) (int, error) { +func (c *Controller) TryRemoveTaintedNodes(opts scaleOpts, healthyNodesAllowedToBeRemoved bool) (int, error) { var toBeDeleted []*v1.Node for _, candidate := range opts.taintedNodes { + if opts.nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 { + // If healthy nodes should not be removed and the node is healthy + // then the node is no longer considered a candidate for deletion. 
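+			// Unhealthy tainted nodes are still removed, which is how the
+			// nodegroup flushes out bad capacity while scaling is paused.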
+ if !healthyNodesAllowedToBeRemoved && !k8s.IsNodeUnhealthy(candidate, opts.nodeGroup.Opts.UnhealthyNodeGracePeriodDuration()) { + log.Infof("skip node %s because it is healthy and healthy nodes cannot be deleted right now", candidate.Name) + continue + } + } // skip any nodes marked with the NodeEscalatorIgnore condition which is true // filter these nodes out as late as possible to ensure rest of escalator scaling calculations remain unaffected @@ -194,36 +202,11 @@ func (c *Controller) scaleDownTaint(opts scaleOpts) (int, error) { // indices are from the parameter nodes indexes, not the sorted index func (c *Controller) taintOldestN(nodes []*v1.Node, nodeGroup *NodeGroupState, n int) []int { sorted := make(nodesByOldestCreationTime, 0, len(nodes)) + for i, node := range nodes { sorted = append(sorted, nodeIndexBundle{node, i}) } - sort.Sort(sorted) - - taintedIndices := make([]int, 0, n) - for _, bundle := range sorted { - // stop at N (or when array is fully iterated) - if len(taintedIndices) >= n { - break - } - - // only actually taint in dry mode - if !c.dryMode(nodeGroup) { - log.WithField("drymode", "off").WithField("nodegroup", nodeGroup.Opts.Name).Infof("Tainting node %v", bundle.node.Name) - // Taint the node - updatedNode, err := k8s.AddToBeRemovedTaint(bundle.node, c.Client, nodeGroup.Opts.TaintEffect) - if err != nil { - log.Errorf("While tainting %v: %v", bundle.node.Name, err) - } else { - bundle.node = updatedNode - taintedIndices = append(taintedIndices, bundle.index) - } - } else { - nodeGroup.taintTracker = append(nodeGroup.taintTracker, bundle.node.Name) - taintedIndices = append(taintedIndices, bundle.index) - log.WithField("drymode", "on").WithField("nodegroup", nodeGroup.Opts.Name).Infof("Tainting node %v", bundle.node.Name) - } - } - - return taintedIndices + sort.Sort(sorted) + return c.taintInstances(sorted, nodeGroup, n) } diff --git a/pkg/controller/scale_down_test.go b/pkg/controller/scale_down_test.go index 4474789..577c247 100644 --- a/pkg/controller/scale_down_test.go +++ b/pkg/controller/scale_down_test.go @@ -2,10 +2,11 @@ package controller import ( "fmt" - "github.com/stretchr/testify/require" "testing" "time" + "github.com/stretchr/testify/require" + "github.com/atlassian/escalator/pkg/k8s" "github.com/atlassian/escalator/pkg/test" "github.com/stretchr/testify/assert" @@ -440,16 +441,31 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { continue } + // Make odd nodes unhealthy + var ready = v1.NodeReady + + if i%2 == 0 { + // This is used arbitrarily to mean "not ready" + ready = v1.NodeNetworkUnavailable + } + + for i, condition := range n.Status.Conditions { + if condition.Type == v1.NodeReady { + n.Status.Conditions[i].Type = ready + } + } + untaintedNodes = append(untaintedNodes, nodes[i]) } assert.Equal(t, len(nodes)-2, len(untaintedNodes)) tests := []struct { - name string - opts scaleOpts - annotateFirstTainted bool - want int - wantErr bool + name string + opts scaleOpts + annotateFirstTainted bool + healthyNodesAllowedToBeRemoved bool + want int + wantErr bool }{ { "test normal delete all tainted", @@ -462,6 +478,7 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { 0, // not used in TryRemoveTaintedNodes }, false, + true, -2, false, }, @@ -476,6 +493,7 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { 0, // not used in TryRemoveTaintedNodes }, true, + true, -1, false, }, @@ -490,9 +508,25 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { 0, // not used in TryRemoveTaintedNodes }, false, + true, 0, 
false, }, + { + "test normal delete without unhealthy nodes", + scaleOpts{ + nodes, + taintedNodes, + []*v1.Node{}, + untaintedNodes, + nodeGroupsState[testNodeGroup.ID()], + 0, // not used in TryRemoveTaintedNodes + }, + false, + false, + -1, + false, + }, } for _, tt := range tests { @@ -502,7 +536,7 @@ func TestController_TryRemoveTaintedNodes(t *testing.T) { NodeEscalatorIgnoreAnnotation: "skip for testing", } } - got, err := c.TryRemoveTaintedNodes(tt.opts) + got, err := c.TryRemoveTaintedNodes(tt.opts, tt.healthyNodesAllowedToBeRemoved) if tt.wantErr { assert.Error(t, err) } else { diff --git a/pkg/controller/scale_up.go b/pkg/controller/scale_up.go index 5e5e7bd..2e8b356 100644 --- a/pkg/controller/scale_up.go +++ b/pkg/controller/scale_up.go @@ -12,7 +12,6 @@ import ( // ScaleUp performs the untaint and increase cloud provider node group logic func (c *Controller) ScaleUp(opts scaleOpts) (int, error) { - untainted, err := c.scaleUpUntaint(opts) // No nodes were untainted, so we need to scale up cloud provider node group if err != nil { @@ -23,25 +22,20 @@ func (c *Controller) ScaleUp(opts scaleOpts) (int, error) { // remove the number of nodes that were just untainted and the remaining is how much to increase the cloud provider node group by opts.nodesDelta -= untainted - if opts.nodesDelta > 0 { - // check that untainting the nodes doesn't do bring us over max nodes - if opts.nodesDelta <= 0 { - log.Warnf("Scale up delta is less than or equal to 0 after clamping: %v. Will not scale up cloud provider.", opts.nodesDelta) - return 0, nil - } + // check that untainting the nodes doesn't do bring us over max nodes + if opts.nodesDelta <= 0 { + log.Warnf("Scale up delta is less than or equal to 0 after clamping: %v. Will not scale up cloud provider.", opts.nodesDelta) + return untainted, nil + } - if opts.nodesDelta > 0 { - added, err := c.scaleUpCloudProviderNodeGroup(opts) - if err != nil { - log.Errorf("Failed to add nodes because of an error. Skipping cloud provider node group scaleup: %v", err) - return 0, err - } - opts.nodeGroup.scaleUpLock.lock(added) - return untainted + added, nil - } + added, err := c.scaleUpCloudProviderNodeGroup(opts) + if err != nil { + log.Errorf("Failed to add nodes because of an error. Skipping cloud provider node group scaleup: %v", err) + return 0, err } - return untainted, nil + opts.nodeGroup.scaleUpLock.lock(added) + return untainted + added, nil } // Calulates how many new nodes need to be created @@ -124,6 +118,17 @@ func (c *Controller) untaintNewestN(nodes []*v1.Node, nodeGroup *NodeGroupState, untaintedIndices := make([]int, 0, n) for _, bundle := range sorted { + // Check if the node is ready before untainting. Not ready nodes should + // be left to be removed instead. If a previously unhealthy node has + // become healthy, it will pass this test and as such be untainted and + // able to receive pods. 
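+		// This check only runs when the health check feature is enabled for
+		// the nodegroup, so existing behaviour is otherwise unchanged.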
+ if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 { + if k8s.IsNodeUnhealthy(bundle.node, nodeGroup.Opts.UnhealthyNodeGracePeriodDuration()) { + log.WithField("drymode", c.dryMode(nodeGroup)).Infof("Skipping untaint of unhealthy node %v", bundle.node.Name) + continue + } + } + // stop at N (or when array is fully iterated) if len(untaintedIndices) >= n { break @@ -131,7 +136,7 @@ func (c *Controller) untaintNewestN(nodes []*v1.Node, nodeGroup *NodeGroupState, // only actually taint in dry mode if !c.dryMode(nodeGroup) { if _, tainted := k8s.GetToBeRemovedTaint(bundle.node); tainted { - log.WithField("drymode", "off").Infof("Untainting node %v", bundle.node.Name) + log.WithField("drymode", c.dryMode(nodeGroup)).Infof("Untainting node %v", bundle.node.Name) // Remove the taint from the node updatedNode, err := k8s.DeleteToBeRemovedTaint(bundle.node, c.Client) @@ -154,7 +159,7 @@ func (c *Controller) untaintNewestN(nodes []*v1.Node, nodeGroup *NodeGroupState, // Delete from tracker nodeGroup.taintTracker = append(nodeGroup.taintTracker[:deleteIndex], nodeGroup.taintTracker[deleteIndex+1:]...) untaintedIndices = append(untaintedIndices, bundle.index) - log.WithField("drymode", "on").Infof("Untainting node %v", bundle.node.Name) + log.WithField("drymode", c.dryMode(nodeGroup)).Infof("Untainting node %v", bundle.node.Name) } } } diff --git a/pkg/controller/scale_up_test.go b/pkg/controller/scale_up_test.go index 96c8289..dc67d12 100644 --- a/pkg/controller/scale_up_test.go +++ b/pkg/controller/scale_up_test.go @@ -2,10 +2,11 @@ package controller import ( "fmt" - "github.com/stretchr/testify/require" "testing" "time" + "github.com/stretchr/testify/require" + "github.com/atlassian/escalator/pkg/k8s" "github.com/atlassian/escalator/pkg/test" "github.com/stretchr/testify/assert" @@ -21,6 +22,7 @@ func TestControllerUntaintNewestN(t *testing.T) { 0: test.BuildTestNode(test.NodeOpts{ Name: "n1", Creation: time.Date(2011, 3, 3, 13, 0, 0, 0, time.UTC), + NotReady: true, }), 1: test.BuildTestNode(test.NodeOpts{ Name: "n2", @@ -29,6 +31,7 @@ func TestControllerUntaintNewestN(t *testing.T) { 2: test.BuildTestNode(test.NodeOpts{ Name: "n3", Creation: time.Date(2010, 3, 3, 13, 0, 0, 0, time.UTC), + NotReady: true, }), 3: test.BuildTestNode(test.NodeOpts{ Name: "n4", @@ -37,6 +40,7 @@ func TestControllerUntaintNewestN(t *testing.T) { 4: test.BuildTestNode(test.NodeOpts{ Name: "n5", Creation: time.Date(2005, 3, 3, 13, 0, 0, 0, time.UTC), + NotReady: true, }), 5: test.BuildTestNode(test.NodeOpts{ Name: "n6", @@ -69,9 +73,9 @@ func TestControllerUntaintNewestN(t *testing.T) { } type args struct { - nodes []*v1.Node - nodeGroup *NodeGroupState - n int + nodes []*v1.Node + n int + unhealthyNodeGracePeriod string } tests := []struct { name string @@ -82,8 +86,8 @@ func TestControllerUntaintNewestN(t *testing.T) { "first 3 nodes. untaint 3", args{ nodes[:3], - nodeGroupsState["example"], 3, + "", }, []int{0, 2, 1}, }, @@ -91,17 +95,35 @@ func TestControllerUntaintNewestN(t *testing.T) { "first 3 nodes. untaint 2", args{ nodes[:3], - nodeGroupsState["example"], 2, + "", }, []int{0, 2}, }, + { + "first 3 nodes. untaint 1 (with grace period)", + args{ + nodes[:3], + 1, + "1m", + }, + []int{1}, + }, + { + "first 3 nodes. untaint 2 (with grace period)", + args{ + nodes[:3], + 2, + "1m", + }, + []int{1}, + }, { "6 nodes. untaint 0", args{ nodes, - nodeGroupsState["example"], 0, + "", }, []int{}, }, @@ -109,8 +131,8 @@ func TestControllerUntaintNewestN(t *testing.T) { "6 nodes. 
untaint 2", args{ nodes, - nodeGroupsState["example"], 2, + "", }, []int{3, 0}, }, @@ -118,17 +140,26 @@ func TestControllerUntaintNewestN(t *testing.T) { "6 nodes. untaint 6", args{ nodes, - nodeGroupsState["example"], 6, + "", }, []int{3, 0, 2, 1, 5, 4}, }, + { + "6 nodes. untaint 6 (with grace period)", + args{ + nodes, + 6, + "1m", + }, + []int{3, 1, 5}, + }, { "6 nodes. untaint 5", args{ nodes, - nodeGroupsState["example"], 5, + "", }, []int{3, 0, 2, 1, 5}, }, @@ -136,8 +167,8 @@ func TestControllerUntaintNewestN(t *testing.T) { "6 nodes. untaint 7", args{ nodes, - nodeGroupsState["example"], 7, + "", }, []int{3, 0, 2, 1, 5, 4}, }, @@ -145,8 +176,8 @@ func TestControllerUntaintNewestN(t *testing.T) { "4 nodes. untaint 1", args{ nodes[:4], - nodeGroupsState["example"], 1, + "", }, []int{3}, }, @@ -160,6 +191,10 @@ func TestControllerUntaintNewestN(t *testing.T) { nodeGroups: nodeGroupsState, } + // Reset the node grace period for each test case + nodeGroupsState["example"].Opts.unhealthyNodeGracePeriodDuration = 0 + nodeGroupsState["example"].Opts.UnhealthyNodeGracePeriod = tt.args.unhealthyNodeGracePeriod + // taint all var tc int for _, node := range nodes { @@ -174,7 +209,7 @@ func TestControllerUntaintNewestN(t *testing.T) { // test wet mode c.Opts.DryMode = false - got := c.untaintNewestN(tt.args.nodes, tt.args.nodeGroup, tt.args.n) + got := c.untaintNewestN(tt.args.nodes, nodeGroupsState["example"], tt.args.n) eq := assert.Equal(t, tt.want, got) if eq { for _, i := range got { @@ -192,7 +227,7 @@ func TestControllerUntaintNewestN(t *testing.T) { // test dry mode c.Opts.DryMode = true - got = c.untaintNewestN(tt.args.nodes, tt.args.nodeGroup, tt.args.n) + got = c.untaintNewestN(tt.args.nodes, nodeGroupsState["example"], tt.args.n) assert.Equal(t, tt.want, got) }) } diff --git a/pkg/controller/util.go b/pkg/controller/util.go index 7d7675c..41f3baf 100644 --- a/pkg/controller/util.go +++ b/pkg/controller/util.go @@ -3,6 +3,7 @@ package controller import ( "math" + "github.com/atlassian/escalator/pkg/k8s" "github.com/pkg/errors" log "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" @@ -79,3 +80,39 @@ func calcPercentUsage(cpuRequest, memRequest, cpuCapacity, memCapacity resource. memPercent := float64(memRequest.MilliValue()) / float64(memCapacity.MilliValue()) * 100 return cpuPercent, memPercent, nil } + +// taintInstances taints the nodes in the node group. It will taint a maximum of `max` nodes. +// It returns the indexes of the nodes that were tainted. 
+func (c *Controller) taintInstances(sortedNodes nodesByOldestCreationTime, nodeGroup *NodeGroupState, max int) []int { + taintedIndices := make([]int, 0) + + for _, bundle := range sortedNodes { + // stop at max (or when array is fully iterated) + if len(taintedIndices) >= max { + break + } + + // only actually taint in non-dry mode + if c.dryMode(nodeGroup) { + nodeGroup.taintTracker = append(nodeGroup.taintTracker, bundle.node.Name) + taintedIndices = append(taintedIndices, bundle.index) + + log.WithField("drymode", c.dryMode(nodeGroup)).WithField("nodegroup", nodeGroup.Opts.Name).Infof("Tainting node %v", bundle.node.Name) + continue + } + + log.WithField("drymode", c.dryMode(nodeGroup)).WithField("nodegroup", nodeGroup.Opts.Name).Infof("Tainting node %v", bundle.node.Name) + + // Taint the node + updatedNode, err := k8s.AddToBeRemovedTaint(bundle.node, c.Client, nodeGroup.Opts.TaintEffect) + if err != nil { + log.Errorf("While tainting %v: %v", bundle.node.Name, err) + continue + } + + bundle.node = updatedNode + taintedIndices = append(taintedIndices, bundle.index) + } + + return taintedIndices +} diff --git a/pkg/controller/util_test.go b/pkg/controller/util_test.go index 08cd6e3..2a5b740 100644 --- a/pkg/controller/util_test.go +++ b/pkg/controller/util_test.go @@ -1,7 +1,9 @@ package controller import ( + "context" "testing" + "time" "github.com/atlassian/escalator/pkg/k8s" "github.com/atlassian/escalator/pkg/k8s/resource" @@ -10,6 +12,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestCalcScaleUpDeltaBelowThreshold(t *testing.T) { @@ -311,3 +314,116 @@ func TestCalcPercentUsage(t *testing.T) { }) } } + +func TestTaintInstances(t *testing.T) { + now := time.Now() + nodes := []*v1.Node{ + test.BuildTestNode(test.NodeOpts{Name: "n1", Creation: now.Add(-5 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n2", Creation: now.Add(-15 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n3", Creation: now.Add(-20 * time.Minute)}), + test.BuildTestNode(test.NodeOpts{Name: "n4", Creation: now.Add(-30 * time.Minute)}), + } + + type args struct { + sortedNodes nodesByOldestCreationTime + nodeGroup *NodeGroupState + max int + dryMode bool + } + tests := []struct { + name string + args args + want []int + }{ + { + "taint 2 oldest nodes", + args{ + nodesByOldestCreationTime{ + {nodes[3], 3}, + {nodes[2], 2}, + {nodes[1], 1}, + {nodes[0], 0}, + }, + &NodeGroupState{ + Opts: NodeGroupOptions{ + Name: "default", + }, + }, + 2, + false, + }, + []int{3, 2}, + }, + { + "taint 2 oldest nodes in dry mode", + args{ + nodesByOldestCreationTime{ + {nodes[3], 3}, + {nodes[2], 2}, + {nodes[1], 1}, + {nodes[0], 0}, + }, + &NodeGroupState{ + Opts: NodeGroupOptions{ + Name: "default", + }, + }, + 2, + true, + }, + []int{3, 2}, + }, + { + "taint 4 oldest nodes in dry mode even with max > len(nodes)", + args{ + nodesByOldestCreationTime{ + {nodes[3], 3}, + {nodes[2], 2}, + {nodes[1], 1}, + {nodes[0], 0}, + }, + &NodeGroupState{ + Opts: NodeGroupOptions{ + Name: "default", + }, + }, + 5, + true, + }, + []int{3, 2, 1, 0}, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fakeClient, _ := test.BuildFakeClient(nodes, nil) + c := &Controller{ + Client: &Client{ + Interface: fakeClient, + }, + Opts: Opts{ + DryMode: tt.args.dryMode, + }, + } + + tt.args.nodeGroup.taintTracker = []string{} + + got := c.taintInstances(tt.args.sortedNodes, tt.args.nodeGroup, 
tt.args.max) + assert.Equal(t, tt.want, got) + + // Check that the nodes were actually tainted if not in dry mode + if !tt.args.dryMode { + for _, index := range got { + node := nodes[index] + updatedNode, err := c.Client.CoreV1().Nodes().Get(context.Background(), node.Name, metav1.GetOptions{}) + assert.NoError(t, err) + + _, tainted := k8s.GetToBeRemovedTaint(updatedNode) + assert.True(t, tainted) + } + } else { + // In dry mode, the taint tracker should be updated + assert.Len(t, tt.args.nodeGroup.taintTracker, len(tt.want)) + } + }) + } +} diff --git a/pkg/k8s/node.go b/pkg/k8s/node.go index 5a5c70a..01b93e8 100644 --- a/pkg/k8s/node.go +++ b/pkg/k8s/node.go @@ -2,6 +2,7 @@ package k8s import ( "context" + "time" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -24,3 +25,28 @@ func DeleteNodes(nodes []*v1.Node, client kubernetes.Interface) error { } return nil } + +// IsNodeUnhealthy returns true if the node is not ready by the amount of time +// allowed. +func IsNodeUnhealthy(node *v1.Node, gracePeriod time.Duration) bool { + // If a node is cordoned then do not consider it unhealthy + if node.Spec.Unschedulable { + return false + } + + // If the grace period expiry time is in the future then the instance is not + // deemed to be unhealthy even if it is not ready. + if node.CreationTimestamp.Add(gracePeriod).After(time.Now()) { + return false + } + + for _, condition := range node.Status.Conditions { + if condition.Type == v1.NodeReady { + // ConditionTrue shows whether the node is ready, anything else + // and we can assume the node is not. + return condition.Status != v1.ConditionTrue + } + } + + return true +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index b6b948c..8433e17 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -72,6 +72,15 @@ var ( }, []string{"node_group"}, ) + // NodeGroupUnhealthy nodes considered by specific node groups that are unhealthy + NodeGroupUnhealthy = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "node_group_unhealthy_nodes", + Namespace: NAMESPACE, + Help: "node group considered to be unhealthy", + }, + []string{"node_group"}, + ) // NodeGroupPodsEvicted pods evicted during a scale down NodeGroupPodsEvicted = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -316,6 +325,7 @@ func init() { prometheus.MustRegister(NodeGroupNodesCordoned) prometheus.MustRegister(NodeGroupNodesUntainted) prometheus.MustRegister(NodeGroupNodesTainted) + prometheus.MustRegister(NodeGroupUnhealthy) prometheus.MustRegister(NodeGroupPods) prometheus.MustRegister(NodeGroupPodsEvicted) prometheus.MustRegister(NodeGroupsMemPercent) diff --git a/pkg/test/builder.go b/pkg/test/builder.go index 84d447f..f353b75 100644 --- a/pkg/test/builder.go +++ b/pkg/test/builder.go @@ -15,14 +15,19 @@ import ( // NodeOpts minimal options for configuring a node object in testing type NodeOpts struct { - Name string - CPU int64 - Mem int64 - LabelKey string - LabelValue string - Creation time.Time - Tainted bool - ForceTainted bool + Name string + CPU int64 + Mem int64 + LabelKey string + LabelValue string + Creation time.Time + Tainted bool + ForceTainted bool + Unschedulable bool + + // Not using Ready because bool defaults to false and the default should + // be that the node is ready + NotReady bool } // BuildFakeClient creates a fake client @@ -131,16 +136,31 @@ func BuildTestNode(opts NodeOpts) *apiv1.Node { CreationTimestamp: metav1.NewTime(opts.Creation), }, Spec: apiv1.NodeSpec{ - ProviderID: opts.Name, - Taints: 
taints, + ProviderID: opts.Name, + Taints: taints, + Unschedulable: opts.Unschedulable, }, Status: apiv1.NodeStatus{ Capacity: apiv1.ResourceList{ apiv1.ResourcePods: *resource.NewPodQuantity(100), }, + Conditions: []apiv1.NodeCondition{ + { + Type: apiv1.NodeReady, + Status: apiv1.ConditionTrue, + }, + }, }, } + if opts.NotReady { + for i, c := range node.Status.Conditions { + if c.Type == apiv1.NodeReady { + node.Status.Conditions[i].Status = apiv1.ConditionFalse + } + } + } + if opts.CPU >= 0 { node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewCPUQuantity(opts.CPU) }