Notes from things I looked into while reviewing the PR about DRA's DeviceTaint and ReservedFor.
PR: devicetainteviction: continue processing ReservedFor consumers
KEP-5055: DRA: device taints and tolerations (enhancements/keps/sig-scheduling/5055-dra-device-taints-and-tolerations at master · kubernetes/enhancements)
When a DeviceTaint is applied to a device, the Pods using that device get evicted. The eviction is carried out by the following code.
func (tc *Controller) handlePod(pod *v1.Pod) {
	eviction := tc.podEvictionTime(pod)
	podRef := newObject(pod)
	if eviction == nil {
		if tc.cancelWorkWithEvent(podRef) {
			tc.logger.V(3).Info("Canceled pod eviction", "pod", podRef)
		}
		return
	}
	tc.logger.V(3).Info("Going to evict pod", "pod", podRef, "eviction", eviction)
	tc.evictPod(podRef, *eviction)
	// If any reason is because of a taint, then eviction is in progress and the status may need to be updated.
	for _, reason := range eviction.reason {
		if reason.rule != nil {
			tc.workqueue.Add(workItemForRule(reason.rule))
		}
	}
}

However, because handlePods used return inside its for loop, there was a bug where the remaining Pods were never processed:
func (tc *Controller) handlePods(claim *resourceapi.ResourceClaim) {
	for _, consumer := range claim.Status.ReservedFor {
		if consumer.APIGroup == "" && consumer.Resource == "pods" {
			pod, err := tc.podLister.Pods(claim.Namespace).Get(consumer.Name)
			if err != nil {
				if apierrors.IsNotFound(err) {
					return
				}
				// Should not happen.
				utilruntime.HandleErrorWithLogger(tc.logger, err, "retrieve pod from cache")
				return
			}
			if pod.UID != consumer.UID {
				// Not the pod we were looking for.
				return
			}
			tc.handlePod(pod)
		}
	}
}

The Pods to be processed come from ReservedFor, which is exposed through the API as part of the ResourceClaim status. Because a device can be shared, ReservedFor may contain more than one Pod.
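Judging from the PR title ("devicetainteviction: continue processing ReservedFor consumers"), the fix is to move on to the next consumer instead of returning out of the loop. A rough sketch of that shape, reusing the controller fields from the snippets above (this is my reading of the change, not the actual diff):

func (tc *Controller) handlePods(claim *resourceapi.ResourceClaim) {
	for _, consumer := range claim.Status.ReservedFor {
		if consumer.APIGroup != "" || consumer.Resource != "pods" {
			continue
		}
		pod, err := tc.podLister.Pods(claim.Namespace).Get(consumer.Name)
		if err != nil {
			if !apierrors.IsNotFound(err) {
				// Should not happen.
				utilruntime.HandleErrorWithLogger(tc.logger, err, "retrieve pod from cache")
			}
			// Keep going with the other consumers instead of aborting the whole loop.
			continue
		}
		if pod.UID != consumer.UID {
			// Not the pod we were looking for; check the next consumer.
			continue
		}
		tc.handlePod(pod)
	}
}

For reference, ReservedFor is part of ResourceClaimStatus: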
// ResourceClaimStatus tracks whether the resource has been allocated and what
// the result of that was.
type ResourceClaimStatus struct {
	// Allocation is set once the claim has been allocated successfully.
	//
	// +optional
	Allocation *AllocationResult

	// ReservedFor indicates which entities are currently allowed to use
	// the claim. A Pod which references a ResourceClaim which is not
	// reserved for that Pod will not be started. A claim that is in
	// use or might be in use because it has been reserved must not get
	// deallocated.
	//
	// In a cluster with multiple scheduler instances, two pods might get
	// scheduled concurrently by different schedulers. When they reference
	// the same ResourceClaim which already has reached its maximum number
	// of consumers, only one pod can be scheduled.
	//
	// Both schedulers try to add their pod to the claim.status.reservedFor
	// field, but only the update that reaches the API server first gets
	// stored. The other one fails with an error and the scheduler
	// which issued it knows that it must put the pod back into the queue,
	// waiting for the ResourceClaim to become usable again.
	//
	// There can be at most 256 such reservations. This may get increased in
	// the future, but not reduced.
	//
	// +optional
	// +listType=map
	// +listMapKey=uid
	// +patchStrategy=merge
	// +patchMergeKey=uid
	ReservedFor []ResourceClaimConsumerReference

	// DeallocationRequested is tombstoned since Kubernetes 1.32 where
	// it got removed. May be reused once decoding v1alpha3 is no longer
	// supported.
	// DeallocationRequested bool

	// Devices contains the status of each device allocated for this
	// claim, as reported by the driver. This can include driver-specific
	// information. Entries are owned by their respective drivers.
	//
	// +optional
	// +listType=map
	// +listMapKey=driver
	// +listMapKey=device
	// +listMapKey=pool
	// +listMapKey=shareID
	// +featureGate=DRAResourceClaimDeviceStatus
	Devices []AllocatedDeviceStatus
}

handlePod itself would probably be better run asynchronously.
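One possible shape for that, sketched with a plain buffered channel and a worker goroutine. The podCh field, the podRef type, and the worker wiring are my own assumptions, not the actual controller; a client-go workqueue (like the one handlePod already uses for rule work items) would be the more idiomatic choice. The sketch assumes "context" and types.UID from k8s.io/apimachinery/pkg/types, plus the controller fields already shown above.

// podRef is an assumed queue item; it carries just enough to re-fetch the Pod.
type podRef struct {
	namespace string
	name      string
	uid       types.UID
}

// Hypothetical async variant: handlePods only enqueues a reference per
// consumer; tc.podCh is an assumed buffered channel owned by the controller.
func (tc *Controller) handlePodsAsync(claim *resourceapi.ResourceClaim) {
	for _, consumer := range claim.Status.ReservedFor {
		if consumer.APIGroup != "" || consumer.Resource != "pods" {
			continue
		}
		tc.podCh <- podRef{namespace: claim.Namespace, name: consumer.Name, uid: consumer.UID}
	}
}

// A worker goroutine drains the channel, re-checks the Pod against the lister,
// and runs the existing handlePod logic.
func (tc *Controller) runPodWorker(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case ref := <-tc.podCh:
			pod, err := tc.podLister.Pods(ref.namespace).Get(ref.name)
			if err != nil || pod.UID != ref.uid {
				// Pod is gone or was replaced; nothing to evict.
				continue
			}
			tc.handlePod(pod)
		}
	}
}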