Merged
38 changes: 32 additions & 6 deletions docs/configuration/nodegroup.md
@@ -29,12 +29,12 @@ node_groups:
taint_effect: NoExecute
max_node_age: 24h
aws:
-  fleet_instance_ready_timeout: 1m
-  launch_template_id: lt-1a2b3c4d
-  launch_template_version: "1"
-  lifecycle: on-demand
-  instance_type_overrides: ["t2.large", "t3.large"]
-  resource_tagging: false
+  fleet_instance_ready_timeout: 1m
+  launch_template_id: lt-1a2b3c4d
+  launch_template_version: "1"
+  lifecycle: on-demand
+  instance_type_overrides: ["t2.large", "t3.large"]
+  resource_tagging: false
```

## Options
@@ -273,3 +273,29 @@ When not at the minimum, the natural scaling up and down of the node group will
node group.

This is an optional feature and by default is disabled.

### `unhealthy_node_grace_period`

> **Collaborator:** nit: list the default values for all of these for quick reference. As I believe these default to being turned off, list some good starting points.

> **Member Author:** `health_check_newest_nodes_percent` is the only one that is technically required if `unhealthy_node_grace_period` is set. I don't see the point of setting a default value for `unhealthy_node_grace_period` and `health_check_newest_nodes_percent` because those are required to use the feature. Setting a default for `max_unhealthy_nodes_percent` makes sense because that one can still be used when not set.

Defines the minimum age of a node before it can be tested to check if it is unhealthy.

When enabled, instances are tested periodically to determine whether they are healthy. If the proportion of unhealthy instances rises above the configured threshold for the nodegroup, Escalator pauses all scaling activity and flushes out the unhealthy instances. It keeps doing this until enough instances in the nodegroup are healthy for normal scaling activity to resume.

Cordoned nodes are skipped and can never be considered unhealthy.

This is an optional field. The default value is empty, which disables the feature.

### `health_check_newest_nodes_percent`

**[Only used if `unhealthy_node_grace_period` is set.]**

The percentage of nodes (ordered from newest to oldest) in the nodegroup that are considered when checking for the maximum allowed unhealthy nodes in the nodegroup. The nodes captured by this percentage form the "test set" to be checked. Only nodes older than `unhealthy_node_grace_period` are included in the test set.

This field is required.

### `max_unhealthy_nodes_percent`

**[Only used if `unhealthy_node_grace_period` is set.]**

The maximum percentage of unhealthy nodes allowed in the test set formed by `health_check_newest_nodes_percent`. Beyond this threshold, all scaling activity is paused and unhealthy nodes are flushed out.

This is an optional field. If not set, it will default to `0%`.
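
Taken together, a minimal sketch of the three options in a nodegroup config (the values are illustrative starting points rather than defaults, and the `node_groups` layout is assumed to match the example at the top of this page):

```yaml
node_groups:
  - name: "example"
    # Nodes younger than 30m are never counted as unhealthy.
    unhealthy_node_grace_period: 30m
    # Test only the newest 50% of the nodes that have passed the grace period.
    health_check_newest_nodes_percent: 50
    # Pause scaling and flush unhealthy nodes once more than 20% of the test
    # set is unhealthy.
    max_unhealthy_nodes_percent: 20
```

With 10 nodes past the grace period, for example, the 5 newest form the test set; 1 unhealthy node (20%) keeps the nodegroup healthy, while 2 (40%) pause all scaling.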

> **Collaborator:** Just to clarify, if set to 0% this means any unhealthy node will pause scaling?

> **Member Author:** Yes.


150 changes: 149 additions & 1 deletion pkg/controller/controller.go
@@ -2,6 +2,7 @@ package controller

import (
"math"
"sort"
"time"

"github.com/atlassian/escalator/pkg/cloudprovider"
@@ -227,6 +228,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
nodeGroup.memCapacity = *allNodes[0].Status.Allocatable.Memory()
}

// Taint all instances considered to be unhealthy before filtering the nodes
// into groups.
if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 {
c.taintUnhealthyInstances(allNodes, nodeGroup)
}

// Filter into untainted and tainted nodes
untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes)

@@ -420,6 +427,22 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
log.WithField("nodegroup", nodegroup).Error(forceActionErr)
}

// If the nodegroup is considered to be unhealthy, then prevent any scaling
// for the time being and instead try removing tainted nodes to get the
// nodegroup into a healthy state again. No healthy nodes should be removed
// and no new cloud provider nodes should be added.
nodeGroupIsHealthy := true

if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 {
if !c.isNodegroupHealthy(nodeGroup, allNodes) {
nodeGroupIsHealthy = false
log.WithField("nodegroup", nodegroup).Infof("NodegroupUnhealthy: nodesDelta overridden to 0 from %d because the nodegroup is unhealthy", nodesDelta)
nodesDelta = 0
}
}

c.reportNodeGroupHealthMetric(nodegroup, nodeGroupIsHealthy)

// Perform a scale up, do nothing or scale down based on the nodes delta
var nodesDeltaResult int
// actionErr keeps the error of any action below and checked after action
@@ -439,7 +462,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
log.WithField("nodegroup", nodegroup).Info("No need to scale")
// reap any expired nodes
var removed int
-removed, actionErr = c.TryRemoveTaintedNodes(scaleOptions)
+removed, actionErr = c.TryRemoveTaintedNodes(scaleOptions, nodeGroupIsHealthy)
log.WithField("nodegroup", nodegroup).Infof("Reaper: There were %v empty nodes deleted this round", removed)
}

Expand All @@ -457,6 +480,131 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
return nodesDelta, err
}

// Finds all unhealthy instances in the nodegroup and adds the taint to mark
// them for deletion.
func (c *Controller) taintUnhealthyInstances(nodes []*v1.Node, state *NodeGroupState) []int {
bundles := make(nodesByOldestCreationTime, 0, len(nodes))

for i, node := range nodes {
// If the node is deemed healthy then there is nothing to do
if !k8s.IsNodeUnhealthy(node, state.Opts.unhealthyNodeGracePeriodDuration) {
continue
}

bundles = append(bundles, nodeIndexBundle{node, i})
}

return c.taintInstances(bundles, state, len(bundles))
}

func (c *Controller) reportNodeGroupHealthMetric(nodegroup string, nodeGroupHealthy bool) {
healthy := 1

if !nodeGroupHealthy {
healthy = 0
}

metrics.NodeGroupUnhealthy.WithLabelValues(nodegroup).Set(float64(healthy))
}
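
Note that despite the `NodeGroupUnhealthy` name, the gauge reports `1` while the nodegroup is healthy and `0` while it is unhealthy. The gauge itself is declared in the metrics package, outside this diff; a plausible client_golang sketch of it (assumed metric name and help text, not the actual declaration) is:

```go
import "github.com/prometheus/client_golang/prometheus"

// NodeGroupUnhealthy reports 1 while the nodegroup passes the health check
// and 0 while it does not. Registered with prometheus.MustRegister at startup.
// (Assumed declaration; the real one lives in pkg/metrics.)
var NodeGroupUnhealthy = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "escalator_node_group_unhealthy",
		Help: "Whether the nodegroup is currently healthy (1) or unhealthy (0)",
	},
	[]string{"node_group"},
)
```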

// isNodegroupHealthy checks if the nodegroup is healthy.
// It does this by checking the health of the newest X% of nodes in the nodegroup that are older than the grace period.
// If the percentage of unhealthy nodes in this newest set of nodes is greater than the configured threshold, the nodegroup is considered unhealthy.
func (c *Controller) isNodegroupHealthy(state *NodeGroupState, nodes []*v1.Node) bool {
// Sort the nodes in reverse order based on age (newest first)
reversedOrderedNodes := c.getNodesOrderedNewestFirst(nodes)

// Filter out any nodes which are not old enough for the test group
oldEnoughNodes := c.filterOutNodesTooNew(state, reversedOrderedNodes)

// Out of the nodes that are left, find the most recent configured
// percentage of nodes to do the test.
nodesForTest := c.getMostRecentNodes(state, oldEnoughNodes)

// If there are no nodes to test, then the nodegroup is considered healthy.
if len(nodesForTest) == 0 {
return true
}

// Get the total number of unhealthy nodes in the test set.
unhealthyNodesCount := c.countUnhealthyNodes(state, nodesForTest)

// The nodegroup is healthy only while the percentage of unhealthy nodes
// in the test set stays at or below the configured maximum.
return (unhealthyNodesCount*100)/len(nodesForTest) <= state.Opts.MaxUnhealthyNodesPercent
}

func (c *Controller) getNodesOrderedNewestFirst(nodes []*v1.Node) []*v1.Node {
sortedNodes := make(nodesByOldestCreationTime, 0, len(nodes))

for i, node := range nodes {
sortedNodes = append(sortedNodes, nodeIndexBundle{node, i})
}

// Sort in reverse to get the newest instances at the front to make it
// easier to loop through.
sort.Sort(sort.Reverse(sortedNodes))

reverseOrderedNodes := make([]*v1.Node, 0, len(nodes))

for _, sortedNode := range sortedNodes {
reverseOrderedNodes = append(reverseOrderedNodes, sortedNode.node)
}

return reverseOrderedNodes
}
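
The `nodeIndexBundle` and `nodesByOldestCreationTime` types used above are defined elsewhere in the package; a minimal sketch of what such a `sort.Interface` implementation looks like (field names taken from this diff, bodies assumed) is:

```go
// nodeIndexBundle pairs a node with its index in the original slice.
type nodeIndexBundle struct {
	node  *v1.Node
	index int
}

// nodesByOldestCreationTime orders bundles oldest-first by creation time,
// so wrapping it in sort.Reverse yields newest-first.
type nodesByOldestCreationTime []nodeIndexBundle

func (n nodesByOldestCreationTime) Len() int      { return len(n) }
func (n nodesByOldestCreationTime) Swap(i, j int) { n[i], n[j] = n[j], n[i] }
func (n nodesByOldestCreationTime) Less(i, j int) bool {
	return n[i].node.CreationTimestamp.Before(&n[j].node.CreationTimestamp)
}
```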

// Returns the list of nodes which are at least as old as the health check
// grace period duration configured for the nodegroup. Younger nodes are
// considered too new and may still be not Ready for legitimate reasons,
// so they are filtered out.
func (c *Controller) filterOutNodesTooNew(state *NodeGroupState, nodes []*v1.Node) []*v1.Node {
now := time.Now()
newNodes := make([]*v1.Node, 0)

for _, node := range nodes {
// Check if the node is old enough to be included in the new list
if node.CreationTimestamp.Add(state.Opts.unhealthyNodeGracePeriodDuration).Before(now) {
newNodes = append(newNodes, node)
}
}

return newNodes
}

// Returns the most recent X% of instances from the given list of nodes.
func (c *Controller) getMostRecentNodes(state *NodeGroupState, nodes []*v1.Node) []*v1.Node {
// Round up rather than down from HealthCheckNewestNodesPercent so that if
// there is a single instance then a non-100% percentage will still result
// in testing the instance. We want to test more rather than less.
numberOfNodes := int(math.Ceil((float64(state.Opts.HealthCheckNewestNodesPercent) / 100) * float64(len(nodes))))
recentNodes := make([]*v1.Node, 0)

for i, node := range nodes {
if i == numberOfNodes {
break
}

recentNodes = append(recentNodes, node)
}

return recentNodes
}
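
As a concrete check of the rounding above: with 5 eligible nodes and a HealthCheckNewestNodesPercent of 30, math.Ceil(0.3 * 5) = math.Ceil(1.5) = 2, so the 2 newest nodes are tested, whereas rounding down would test only 1.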

func (c *Controller) countUnhealthyNodes(state *NodeGroupState, nodes []*v1.Node) int {
unhealthyNodesCount := 0

for _, node := range nodes {
// Pass the grace period into the call to be 100% sure that we are
// not counting nodes which are too young as unhealthy.
if k8s.IsNodeUnhealthy(node, state.Opts.unhealthyNodeGracePeriodDuration) {
unhealthyNodesCount++
}
}

return unhealthyNodesCount
}
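
The `k8s.IsNodeUnhealthy` helper lives in the pkg/k8s package rather than in this diff; a hedged sketch of the check it plausibly performs (a node past the grace period whose Ready condition is not True) is:

```go
// IsNodeUnhealthy is a sketch of the helper used above (assumed shape, not
// the actual implementation). A node counts as unhealthy only once it is
// older than gracePeriod and its NodeReady condition is not True.
func IsNodeUnhealthy(node *v1.Node, gracePeriod time.Duration) bool {
	if node.CreationTimestamp.Add(gracePeriod).After(time.Now()) {
		return false // still inside the grace period
	}
	for _, cond := range node.Status.Conditions {
		if cond.Type == v1.NodeReady {
			return cond.Status != v1.ConditionTrue
		}
	}
	// A node that reports no Ready condition at all is treated as unhealthy.
	return true
}
```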

func (c *Controller) isScaleOnStarve(
nodeGroup *NodeGroupState,
podRequests k8s.PodRequestedUsage,