Mirror of https://github.com/kubernetes-sigs/descheduler.git
Merge pull request #413 from damemi/podtopologyspread
Add PodTopologySpread strategy
README.md (59 changed lines)
@@ -37,6 +37,7 @@ Table of Contents
 * [RemovePodsViolatingInterPodAntiAffinity](#removepodsviolatinginterpodantiaffinity)
 * [RemovePodsViolatingNodeAffinity](#removepodsviolatingnodeaffinity)
 * [RemovePodsViolatingNodeTaints](#removepodsviolatingnodetaints)
+* [RemovePodsViolatingTopologySpreadConstraint](#removepodsviolatingtopologyspreadconstraint)
 * [RemovePodsHavingTooManyRestarts](#removepodshavingtoomanyrestarts)
 * [PodLifeTime](#podlifetime)
 * [Filter Pods](#filter-pods)
@@ -102,17 +103,17 @@ See the [user guide](docs/user-guide.md) in the `/docs` directory.
 ## Policy and Strategies
 
 Descheduler's policy is configurable and includes strategies that can be enabled or disabled.
-Seven strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`,
-`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsHavingTooManyRestarts`, and `PodLifeTime`
-are currently implemented. As part of the policy, the parameters associated with the strategies can be configured too.
-By default, all strategies are enabled.
+Eight strategies `RemoveDuplicates`, `LowNodeUtilization`, `RemovePodsViolatingInterPodAntiAffinity`,
+`RemovePodsViolatingNodeAffinity`, `RemovePodsViolatingNodeTaints`, `RemovePodsViolatingTopologySpreadConstraint`,
+`RemovePodsHavingTooManyRestarts`, and `PodLifeTime` are currently implemented. As part of the policy, the
+parameters associated with the strategies can be configured too. By default, all strategies are enabled.
 
 The policy also includes common configuration for all the strategies:
 - `nodeSelector` - limiting the nodes which are processed
 - `evictLocalStoragePods` - allowing to evict pods with local storage
 - `maxNoOfPodsToEvictPerNode` - maximum number of pods evicted from each node (summed through all strategies)
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 nodeSelector: prod=dev
@@ -144,7 +145,7 @@ has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considere
 |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
 
 **Example:**
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -186,7 +187,7 @@ These thresholds, `thresholds` and `targetThresholds`, could be tuned as per you
 
 **Example:**
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -236,7 +237,7 @@ node.
 
 **Example:**
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -273,7 +274,7 @@ podA gets evicted from nodeA.
 
 **Example:**
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -301,7 +302,7 @@ and will be evicted.
 
 **Example:**
 
-````
+````yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -309,6 +310,31 @@ strategies:
      enabled: true
 ````
 
+### RemovePodsViolatingTopologySpreadConstraint
+
+This strategy makes sure that pods violating [topology spread constraints](https://kubernetes.io/docs/concepts/workloads/pods/pod-topology-spread-constraints/)
+are evicted from nodes. Specifically, it tries to evict the minimum number of pods required to balance topology domains to within each constraint's `maxSkew`.
+This strategy requires k8s version 1.18 at a minimum.
+
+**Parameters:**
+
+|Name|Type|
+|---|---|
+|`thresholdPriority`|int (see [priority filtering](#priority-filtering))|
+|`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))|
+|`namespaces`|(see [namespace filtering](#namespace-filtering))|
+
+**Example:**
+
+```yaml
+apiVersion: "descheduler/v1alpha1"
+kind: "DeschedulerPolicy"
+strategies:
+  "RemovePodsViolatingTopologySpreadConstraint":
+     enabled: true
+```
+
+
 ### RemovePodsHavingTooManyRestarts
 
 This strategy makes sure that pods having too many restarts are removed from nodes. For example a pod with EBS/PD that
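The `RemovePodsViolatingTopologySpreadConstraint` section added above describes eviction in terms of each constraint's `maxSkew`: a hard (`DoNotSchedule`) constraint is out of balance when the most populated topology domain exceeds the least populated one by more than `maxSkew`. The sketch below illustrates only that check; it is not code from this commit, and the per-domain pod counts are assumed inputs (the strategy itself derives them by listing pods that match the constraint's label selector).

```go
// Minimal sketch: a DoNotSchedule topology spread constraint is violated when
// the gap between the most- and least-populated topology domains exceeds maxSkew.
// The domain counts are assumed inputs for illustration.
package main

import "fmt"

func violatesMaxSkew(podsPerDomain map[string]int, maxSkew int) bool {
    if len(podsPerDomain) == 0 {
        return false
    }
    min, max := int(^uint(0)>>1), 0
    for _, n := range podsPerDomain {
        if n < min {
            min = n
        }
        if n > max {
            max = n
        }
    }
    return max-min > maxSkew
}

func main() {
    // zoneA has 3 matching pods, zoneB has 1: the skew is 2, violating maxSkew=1.
    fmt.Println(violatesMaxSkew(map[string]int{"zoneA": 3, "zoneB": 1}, 1)) // true
}
```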
@@ -328,7 +354,7 @@ which determines whether init container restarts should be factored into that ca
 
 **Example:**
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -359,7 +385,7 @@ to `Running` and `Pending`.
 
 **Example:**
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -383,10 +409,11 @@ The following strategies accept a `namespaces` parameter which allows to specify
 * `RemovePodsViolatingNodeAffinity`
 * `RemovePodsViolatingInterPodAntiAffinity`
 * `RemoveDuplicates`
+* `RemovePodsViolatingTopologySpreadConstraint`
 
 For example:
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -404,7 +431,7 @@ strategies:
 In the examples `PodLifeTime` gets executed only over `namespace1` and `namespace2`.
 The similar holds for `exclude` field:
 
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -432,7 +459,7 @@ is set to the value of `system-cluster-critical` priority class.
 E.g.
 
 Setting `thresholdPriority`
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
@@ -445,7 +472,7 @@ strategies:
 ```
 
 Setting `thresholdPriorityClassName`
-```
+```yaml
 apiVersion: "descheduler/v1alpha1"
 kind: "DeschedulerPolicy"
 strategies:
go.sum (1 changed line)
@@ -144,6 +144,7 @@ github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7a
 github.com/gogo/protobuf v1.2.1/go.mod h1:hp+jE20tsWTFYpLwKvXlhS1hjn+gTNwPg2I6zVXpSg4=
 github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls=
 github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o=
+github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
 github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8=
 github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
@@ -10,6 +10,9 @@ rules:
 - apiGroups: [""]
   resources: ["nodes"]
   verbs: ["get", "watch", "list"]
+- apiGroups: [""]
+  resources: ["namespaces"]
+  verbs: ["get", "list"]
 - apiGroups: [""]
   resources: ["pods"]
   verbs: ["get", "watch", "list", "delete"]
@@ -70,13 +70,14 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
     sharedInformerFactory.WaitForCacheSync(stopChannel)
 
     strategyFuncs := map[string]strategyFunction{
         "RemoveDuplicates": strategies.RemoveDuplicatePods,
         "LowNodeUtilization": strategies.LowNodeUtilization,
         "RemovePodsViolatingInterPodAntiAffinity": strategies.RemovePodsViolatingInterPodAntiAffinity,
         "RemovePodsViolatingNodeAffinity": strategies.RemovePodsViolatingNodeAffinity,
         "RemovePodsViolatingNodeTaints": strategies.RemovePodsViolatingNodeTaints,
         "RemovePodsHavingTooManyRestarts": strategies.RemovePodsHavingTooManyRestarts,
         "PodLifeTime": strategies.PodLifeTime,
+        "RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint,
     }
 
     nodeSelector := rs.NodeSelector
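The added map entry registers the new strategy under the same key used in the `DeschedulerPolicy` file. For context, a name-to-function map like this is normally consumed by looking up each strategy enabled in the policy and calling the matching function. The loop below is a simplified, hypothetical sketch of that dispatch pattern only; it is not the actual `RunDeschedulerStrategies` loop (which is outside this hunk), and `policy`, `strategy`, and `strategyFunction` here are stand-in types.

```go
// Hypothetical sketch of dispatching strategies through a name->function map.
// All types are stand-ins so the example stays self-contained and runnable.
package main

import "fmt"

type strategy struct{ Enabled bool }
type strategyFunction func(name string)

func main() {
    // Stand-in for the strategies section of a DeschedulerPolicy.
    policy := map[string]strategy{
        "RemovePodsViolatingTopologySpreadConstraint": {Enabled: true},
        "PodLifeTime": {Enabled: false},
    }
    // Stand-in for the strategyFuncs map shown in the hunk above.
    strategyFuncs := map[string]strategyFunction{
        "RemovePodsViolatingTopologySpreadConstraint": func(name string) { fmt.Println("running", name) },
        "PodLifeTime": func(name string) { fmt.Println("running", name) },
    }
    // Run only the strategies that are both enabled and registered.
    for name, s := range policy {
        if f, ok := strategyFuncs[name]; ok && s.Enabled {
            f(name)
        }
    }
}
```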
pkg/descheduler/strategies/topologyspreadconstraint.go (new file, 355 lines)
@@ -0,0 +1,355 @@
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package strategies

import (
    "context"
    "fmt"
    "math"
    "sort"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/labels"
    "k8s.io/apimachinery/pkg/util/sets"
    clientset "k8s.io/client-go/kubernetes"
    "k8s.io/klog/v2"
    "sigs.k8s.io/descheduler/pkg/api"
    "sigs.k8s.io/descheduler/pkg/descheduler/evictions"
    nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
    "sigs.k8s.io/descheduler/pkg/utils"
)

// AntiAffinityTerm's topology key value used in predicate metadata
type topologyPair struct {
    key   string
    value string
}

type topology struct {
    pair topologyPair
    pods []*v1.Pod
}

func validateAndParseTopologySpreadParams(ctx context.Context, client clientset.Interface, params *api.StrategyParameters) (int32, sets.String, sets.String, error) {
    var includedNamespaces, excludedNamespaces sets.String
    if params == nil {
        return 0, includedNamespaces, excludedNamespaces, nil
    }
    // At most one of include/exclude can be set
    if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 {
        return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of Include/Exclude namespaces can be set")
    }
    if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" {
        return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set")
    }
    thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, params)
    if err != nil {
        return 0, includedNamespaces, excludedNamespaces, fmt.Errorf("failed to get threshold priority from strategy's params: %+v", err)
    }
    if params.Namespaces != nil {
        includedNamespaces = sets.NewString(params.Namespaces.Include...)
        excludedNamespaces = sets.NewString(params.Namespaces.Exclude...)
    }

    return thresholdPriority, includedNamespaces, excludedNamespaces, nil
}

func RemovePodsViolatingTopologySpreadConstraint(
    ctx context.Context,
    client clientset.Interface,
    strategy api.DeschedulerStrategy,
    nodes []*v1.Node,
    podEvictor *evictions.PodEvictor,
) {
    thresholdPriority, includedNamespaces, excludedNamespaces, err := validateAndParseTopologySpreadParams(ctx, client, strategy.Params)
    if err != nil {
        klog.ErrorS(err, "Invalid PodLifeTime parameters")
        return
    }

    nodeMap := make(map[string]*v1.Node, len(nodes))
    for _, node := range nodes {
        nodeMap[node.Name] = node
    }
    evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority))

    // 1. for each namespace for which there is Topology Constraint
    // 2. for each TopologySpreadyConstraint in that namespace
    // { find all evictable pods in that namespace
    // { 3. for each evictable pod in that namespace
    // 4. If the pod matches this TopologySpreadConstraint LabelSelector
    // 5. If the pod nodeName is present in the nodeMap
    // 6. create a topoPair with key as this TopologySpreadConstraint.TopologyKey and value as this pod's Node Label Value for this TopologyKey
    // 7. add the pod with key as this topoPair
    // 8. find the min number of pods in any topoPair for this topologyKey
    // iterate through all topoPairs for this topologyKey and diff currentPods -minPods <=maxSkew
    // if diff > maxSkew, add this pod in the current bucket for eviction

    // First record all of the constraints by namespace
    namespaces, err := client.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
    if err != nil {
        klog.ErrorS(err, "Couldn't list namespaces")
        return
    }
    podsForEviction := make(map[*v1.Pod]struct{})
    // 1. for each namespace...
    for _, namespace := range namespaces.Items {
        if (!includedNamespaces.Has(namespace.Name) || excludedNamespaces.Has(namespace.Name)) && (includedNamespaces.Len()+excludedNamespaces.Len() > 0) {
            continue
        }
        namespacePods, err := client.CoreV1().Pods(namespace.Name).List(ctx, metav1.ListOptions{})
        if err != nil {
            klog.ErrorS(err, "Couldn't list pods in namespace", "namespace", namespace)
            continue
        }

        // ...where there is a topology constraint
        //namespaceTopologySpreadConstrainPods := make([]v1.Pod, 0, len(namespacePods.Items))
        namespaceTopologySpreadConstraints := make(map[v1.TopologySpreadConstraint]struct{})
        for _, pod := range namespacePods.Items {
            for _, constraint := range pod.Spec.TopologySpreadConstraints {
                // Only deal with hard topology constraints
                // TODO(@damemi): add support for soft constraints
                if constraint.WhenUnsatisfiable != v1.DoNotSchedule {
                    continue
                }
                namespaceTopologySpreadConstraints[constraint] = struct{}{}
            }
        }
        if len(namespaceTopologySpreadConstraints) == 0 {
            continue
        }

        // 2. for each topologySpreadConstraint in that namespace
        for constraint := range namespaceTopologySpreadConstraints {
            constraintTopologies := make(map[topologyPair][]*v1.Pod)
            // pre-populate the topologyPair map with all the topologies available from the nodeMap
            // (we can't just build it from existing pods' nodes because a topology may have 0 pods)
            for _, node := range nodeMap {
                if val, ok := node.Labels[constraint.TopologyKey]; ok {
                    constraintTopologies[topologyPair{key: constraint.TopologyKey, value: val}] = make([]*v1.Pod, 0)
                }
            }

            selector, err := metav1.LabelSelectorAsSelector(constraint.LabelSelector)
            if err != nil {
                klog.ErrorS(err, "Couldn't parse label selector as selector", "selector", constraint.LabelSelector)
                continue
            }

            // 3. for each evictable pod in that namespace
            // (this loop is where we count the number of pods per topologyValue that match this constraint's selector)
            var sumPods float64
            for i := range namespacePods.Items {
                // 4. if the pod matches this TopologySpreadConstraint LabelSelector
                if !selector.Matches(labels.Set(namespacePods.Items[i].Labels)) {
                    continue
                }

                // 5. If the pod's node matches this constraint's topologyKey, create a topoPair and add the pod
                node, ok := nodeMap[namespacePods.Items[i].Spec.NodeName]
                if !ok {
                    // If ok is false, node is nil in which case node.Labels will panic. In which case a pod is yet to be scheduled. So it's safe to just continue here.
                    continue
                }
                nodeValue, ok := node.Labels[constraint.TopologyKey]
                if !ok {
                    continue
                }
                // 6. create a topoPair with key as this TopologySpreadConstraint
                topoPair := topologyPair{key: constraint.TopologyKey, value: nodeValue}
                // 7. add the pod with key as this topoPair
                constraintTopologies[topoPair] = append(constraintTopologies[topoPair], &namespacePods.Items[i])
                sumPods++
            }
            balanceDomains(podsForEviction, constraint, constraintTopologies, sumPods, evictable.IsEvictable, nodeMap)
        }
    }

    for pod := range podsForEviction {
        if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[pod.Spec.NodeName], "PodTopologySpread"); err != nil && !evictable.IsEvictable(pod) {
            klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod))
            break
        }
    }
}

// balanceDomains determines how many pods (minimum) should be evicted from large domains to achieve an ideal balance within maxSkew
// To actually determine how many pods need to be moved, we sort the topology domains in ascending length
// [2, 5, 3, 8, 5, 7]
//
// Would end up like:
// [2, 3, 5, 5, 7, 8]
//
// We then start at i=[0] and j=[len(list)-1] and compare the 2 topology sizes.
// If the diff of the size of the domains is more than the maxSkew, we will move up to half that skew,
// or the available pods from the higher domain, or the number required to bring the smaller domain up to the average,
// whichever number is less.
//
// (Note, we will only move as many pods from a domain as possible without bringing it below the ideal average,
// and we will not bring any smaller domain above the average)
// If the diff is within the skew, we move to the next highest domain.
// If the higher domain can't give any more without falling below the average, we move to the next lowest "high" domain
//
// Following this, the above topology domains end up "sorted" as:
// [5, 5, 5, 5, 5, 5]
// (assuming even distribution by the scheduler of the evicted pods)
func balanceDomains(
    podsForEviction map[*v1.Pod]struct{},
    constraint v1.TopologySpreadConstraint,
    constraintTopologies map[topologyPair][]*v1.Pod,
    sumPods float64,
    isEvictable func(*v1.Pod) bool,
    nodeMap map[string]*v1.Node) {
    idealAvg := sumPods / float64(len(constraintTopologies))
    sortedDomains := sortDomains(constraintTopologies, isEvictable)
    // i is the index for belowOrEqualAvg
    // j is the index for aboveAvg
    i := 0
    j := len(sortedDomains) - 1
    for i < j {
        // if j has no more to give without falling below the ideal average, move to next aboveAvg
        if float64(len(sortedDomains[j].pods)) < idealAvg {
            j--
        }

        // skew = actual difference between the domains
        skew := float64(len(sortedDomains[j].pods) - len(sortedDomains[i].pods))

        // if k and j are within the maxSkew of each other, move to next belowOrEqualAvg
        if int32(skew) <= constraint.MaxSkew {
            i++
            continue
        }

        // the most that can be given from aboveAvg is:
        // 1. up to half the distance between them, minus MaxSkew, rounded up
        // 2. how many it has remaining without falling below the average rounded up, or
        // 3. how many can be added without bringing the smaller domain above the average rounded up,
        // whichever is less
        // (This is the basic principle of keeping all sizes within ~skew of the average)
        aboveAvg := math.Ceil(float64(len(sortedDomains[j].pods)) - idealAvg)
        belowAvg := math.Ceil(idealAvg - float64(len(sortedDomains[i].pods)))
        smallestDiff := math.Min(aboveAvg, belowAvg)
        halfSkew := math.Ceil((skew - float64(constraint.MaxSkew)) / 2)
        movePods := int(math.Min(smallestDiff, halfSkew))
        if movePods <= 0 {
            i++
            continue
        }

        // remove pods from the higher topology and add them to the list of pods to be evicted
        // also (just for tracking), add them to the list of pods in the lower topology
        aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
        for k := range aboveToEvict {
            // if the pod has a hard nodeAffinity or nodeSelector that only matches this node,
            // don't bother evicting it as it will just end up back on the same node
            // however we still account for it "being evicted" so the algorithm can complete
            // TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
            // but still try to get the required # of movePods (instead of just chopping that value off the slice above)
            if aboveToEvict[k].Spec.NodeSelector != nil ||
                (aboveToEvict[k].Spec.Affinity != nil &&
                    aboveToEvict[k].Spec.Affinity.NodeAffinity != nil &&
                    aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil &&
                    nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0) {
                continue
            }
            podsForEviction[aboveToEvict[k]] = struct{}{}
        }
        sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods]
        sortedDomains[i].pods = append(sortedDomains[i].pods, aboveToEvict...)
    }
}

// nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity
// It excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int {
    count := 0
    for _, node := range nodeMap {
        if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] {
            count++
        }
    }
    return count
}

// sortDomains sorts and splits the list of topology domains based on their size
// it also sorts the list of pods within the domains based on their node affinity/selector and priority in the following order:
// 1. non-evictable pods
// 2. pods with selectors or affinity
// 3. pods in descending priority
// 4. all other pods
// We then pop pods off the back of the list for eviction
func sortDomains(constraintTopologyPairs map[topologyPair][]*v1.Pod, isEvictable func(*v1.Pod) bool) []topology {
    sortedTopologies := make([]topology, 0, len(constraintTopologyPairs))
    // sort the topologies and return 2 lists: those <= the average and those > the average (> list inverted)
    for pair, list := range constraintTopologyPairs {
        // Sort the pods within the domain so that the lowest priority pods are considered first for eviction,
        // followed by the highest priority,
        // followed by the lowest priority pods with affinity or nodeSelector,
        // followed by the highest priority pods with affinity or nodeSelector
        sort.Slice(list, func(i, j int) bool {
            // any non-evictable pods should be considered last (ie, first in the list)
            if !isEvictable(list[i]) || !isEvictable(list[j]) {
                // false - i is the only non-evictable, so return true to put it first
                // true - j is non-evictable, so return false to put j before i
                // if true and both are non-evictable, order doesn't matter
                return !(isEvictable(list[i]) && !isEvictable(list[j]))
            }

            // if both pods have selectors/affinity, compare them by their priority
            if hasSelectorOrAffinity(*list[i]) == hasSelectorOrAffinity(*list[j]) {
                return comparePodsByPriority(list[i], list[j])
            }
            return hasSelectorOrAffinity(*list[i]) && !hasSelectorOrAffinity(*list[j])
        })
        sortedTopologies = append(sortedTopologies, topology{pair: pair, pods: list})
    }

    // create an ascending slice of all key-value topology pairs
    sort.Slice(sortedTopologies, func(i, j int) bool {
        return len(sortedTopologies[i].pods) < len(sortedTopologies[j].pods)
    })

    return sortedTopologies
}

func hasSelectorOrAffinity(pod v1.Pod) bool {
    return pod.Spec.NodeSelector != nil || (pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil)
}

// comparePodsByPriority is a helper to the sort function to compare 2 pods based on their priority values
// It will sort the pods in DESCENDING order of priority, since in our logic we evict pods from the back
// of the list first.
func comparePodsByPriority(iPod, jPod *v1.Pod) bool {
    if iPod.Spec.Priority != nil && jPod.Spec.Priority != nil {
        // a LOWER priority value should be evicted FIRST
        return *iPod.Spec.Priority > *jPod.Spec.Priority
    } else if iPod.Spec.Priority != nil && jPod.Spec.Priority == nil {
        // i should come before j
        return true
    } else if iPod.Spec.Priority == nil && jPod.Spec.Priority != nil {
        // j should come before i
        return false
    } else {
        // it doesn't matter. just return true
        return true
    }
}
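The doc comment on `balanceDomains` above spells out how many pods may be taken from an over-populated domain: the smaller of its excess above the ideal average, the small domain's deficit below that average, and half the skew beyond `maxSkew`, each rounded up. The standalone sketch below re-derives that arithmetic for one low/high pair of domains; it is an illustration under those stated rules, not code from this commit.

```go
// Standalone illustration of the movePods arithmetic used by balanceDomains.
package main

import (
    "fmt"
    "math"
)

// podsToMove returns how many pods the balancing step would take from the
// larger of two domains, given the ideal average for the whole constraint.
func podsToMove(low, high int, maxSkew int32, idealAvg float64) int {
    skew := float64(high - low)
    if int32(skew) <= maxSkew {
        return 0 // the pair is already within the allowed skew
    }
    aboveAvg := math.Ceil(float64(high) - idealAvg) // what the large domain can give
    belowAvg := math.Ceil(idealAvg - float64(low))  // what the small domain can absorb
    halfSkew := math.Ceil((skew - float64(maxSkew)) / 2)
    return int(math.Min(math.Min(aboveAvg, belowAvg), halfSkew))
}

func main() {
    // Two zones hold 3 and 1 matching pods and maxSkew is 1: the ideal average
    // is 2, so one pod should be evicted from the larger zone to reach [2,2].
    fmt.Println(podsToMove(1, 3, 1, 2.0)) // prints 1
}
```

For more than two domains the strategy repeats this pairwise step, working inward from the smallest and largest domains, which is how the `[0, 1, 100]` case in the new test file ends up moving 66 pods.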
pkg/descheduler/strategies/topologyspreadconstraint_test.go (new file, 340 lines)
@@ -0,0 +1,340 @@
package strategies

import (
    "context"
    "fmt"
    "sigs.k8s.io/descheduler/pkg/api"
    "testing"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    "k8s.io/client-go/kubernetes/fake"
    core "k8s.io/client-go/testing"
    "sigs.k8s.io/descheduler/pkg/descheduler/evictions"
    "sigs.k8s.io/descheduler/test"
)

func TestTopologySpreadConstraint(t *testing.T) {
    ctx := context.Background()
    testCases := []struct {
        name                 string
        pods                 []*v1.Pod
        expectedEvictedCount int
        nodes                []*v1.Node
        strategy             api.DeschedulerStrategy
        namespaces           []string
    }{
        {
            name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2]",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 1,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                },
                {
                    count: 2,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                },
                {
                    count: 1,
                    node: "n2",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 1,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
        {
            name: "2 domains, sizes [5,2], maxSkew=1, move 1 pod to achieve [4,3]",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 1,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                },
                {
                    count: 4,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                },
                {
                    count: 2,
                    node: "n2",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 1,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
        {
            name: "2 domains, sizes [4,0], maxSkew=1, move 2 pods to achieve [2,2]",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 1,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                },
                {
                    count: 3,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 2,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
        {
            name: "2 domains, sizes [4,0], maxSkew=1, only move 1 pod since pods with nodeSelector and nodeAffinity aren't evicted",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 1,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                    nodeSelector: map[string]string{"zone": "zoneA"},
                },
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    nodeSelector: map[string]string{"zone": "zoneA"},
                },
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    nodeAffinity: &v1.Affinity{NodeAffinity: &v1.NodeAffinity{
                        RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{
                            {MatchExpressions: []v1.NodeSelectorRequirement{{Key: "foo", Values: []string{"bar"}, Operator: v1.NodeSelectorOpIn}}},
                        }},
                    }},
                },
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 1,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
        {
            name: "3 domains, sizes [0, 1, 100], maxSkew=1, move 66 pods to get [34, 33, 34]",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
                test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n2",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 1,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                },
                {
                    count: 100,
                    node: "n3",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 66,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
        {
            name: "4 domains, sizes [0, 1, 3, 5], should move 3 to get [2, 2, 3, 2]",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
                test.BuildTestNode("n3", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneC" }),
                test.BuildTestNode("n4", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneD" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n2",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 1,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                },
                {
                    count: 3,
                    node: "n3",
                    labels: map[string]string{"foo": "bar"},
                },
                {
                    count: 5,
                    node: "n4",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 3,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
        {
            name: "2 domains size [2 6], maxSkew=2, should move 1 to get [3 5]",
            nodes: []*v1.Node{
                test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
                test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }),
            },
            pods: createTestPods([]testPodList{
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                    constraints: []v1.TopologySpreadConstraint{
                        {
                            MaxSkew: 2,
                            TopologyKey: "zone",
                            WhenUnsatisfiable: v1.DoNotSchedule,
                            LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
                        },
                    },
                },
                {
                    count: 1,
                    node: "n1",
                    labels: map[string]string{"foo": "bar"},
                },
                {
                    count: 6,
                    node: "n2",
                    labels: map[string]string{"foo": "bar"},
                },
            }),
            expectedEvictedCount: 1,
            strategy: api.DeschedulerStrategy{},
            namespaces: []string{"ns1"},
        },
    }

    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            fakeClient := &fake.Clientset{}
            fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
                podList := make([]v1.Pod, 0, len(tc.pods))
                for _, pod := range tc.pods {
                    podList = append(podList, *pod)
                }
                return true, &v1.PodList{Items: podList}, nil
            })
            fakeClient.Fake.AddReactor("list", "namespaces", func(action core.Action) (bool, runtime.Object, error) {
                return true, &v1.NamespaceList{Items: []v1.Namespace{{ObjectMeta: metav1.ObjectMeta{Name: "ns1", Namespace: "ns1"}}}}, nil
            })

            podEvictor := evictions.NewPodEvictor(
                fakeClient,
                "v1",
                false,
                100,
                tc.nodes,
                false,
            )
            RemovePodsViolatingTopologySpreadConstraint(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor)
            podsEvicted := podEvictor.TotalEvicted()
            if podsEvicted != tc.expectedEvictedCount {
                t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.name, tc.expectedEvictedCount, podsEvicted)
            }
        })
    }
}

type testPodList struct {
    count        int
    node         string
    labels       map[string]string
    constraints  []v1.TopologySpreadConstraint
    nodeSelector map[string]string
    nodeAffinity *v1.Affinity
}

func createTestPods(testPods []testPodList) []*v1.Pod {
    ownerRef1 := test.GetReplicaSetOwnerRefList()
    pods := make([]*v1.Pod, 0)
    podNum := 0
    for _, tp := range testPods {
        for i := 0; i < tp.count; i++ {
            pods = append(pods,
                test.BuildTestPod(fmt.Sprintf("pod-%d", podNum), 100, 0, tp.node, func(p *v1.Pod) {
                    p.Labels = make(map[string]string)
                    p.Labels = tp.labels
                    p.Namespace = "ns1"
                    p.ObjectMeta.OwnerReferences = ownerRef1
                    p.Spec.TopologySpreadConstraints = tp.constraints
                    p.Spec.NodeSelector = tp.nodeSelector
                    p.Spec.Affinity = tp.nodeAffinity
                }))
            podNum++
        }
    }
    return pods
}