Notes from reviewing the PR on DRA's DeviceTaint and ReservedFor.
PR under review: "devicetainteviction: continue processing ReservedFor consumers".
Related design: KEP-5055 "DRA: device taints and tolerations" (kubernetes/enhancements, keps/sig-scheduling/5055-dra-device-taints-and-tolerations — the enhancements repo tracks Kubernetes feature proposals).
When a DeviceTaint is applied, Pods using that device are evicted. The following code performs the eviction.
func (tc *Controller) handlePod(pod *v1.Pod) {
eviction := tc.podEvictionTime(pod)
podRef := newObject(pod)
if eviction == nil {
if tc.cancelWorkWithEvent(podRef) {
tc.logger.V(3).Info("Canceled pod eviction", "pod", podRef)
}
return
}
tc.logger.V(3).Info("Going to evict pod", "pod", podRef, "eviction", eviction)
tc.evictPod(podRef, *eviction)
// If any reason is because of a taint, then eviction is in progress and the status may need to be updated.
for _, reason := range eviction.reason {
if reason.rule != nil {
tc.workqueue.Add(workItemForRule(reason.rule))
}
}
}However, there was a bug where using return in the for loop within handlePods caused subsequent Pods to not be processed.
func (tc *Controller) handlePods(claim *resourceapi.ResourceClaim) {
for _, consumer := range claim.Status.ReservedFor {
if consumer.APIGroup == "" && consumer.Resource == "pods" {
pod, err := tc.podLister.Pods(claim.Namespace).Get(consumer.Name)
if err != nil {
if apierrors.IsNotFound(err) {
return
}
// Should not happen.
utilruntime.HandleErrorWithLogger(tc.logger, err, "retrieve pod from cache")
return
}
if pod.UID != consumer.UID {
// Not the pod we were looking for.
return
}
tc.handlePod(pod)
}
}
}The Pods to be processed are stored in ReservedFor, which is exposed via the API as part of ResourceClaim's Status. Since devices can be shared, ReservedFor may contain multiple Pods.
// ResourceClaimStatus tracks whether the resource has been allocated and what
// the result of that was.
type ResourceClaimStatus struct {
	// Allocation is set once the claim has been allocated successfully.
	//
	// +optional
	Allocation *AllocationResult

	// ReservedFor indicates which entities are currently allowed to use
	// the claim. A Pod which references a ResourceClaim which is not
	// reserved for that Pod will not be started. A claim that is in
	// use or might be in use because it has been reserved must not get
	// deallocated.
	//
	// In a cluster with multiple scheduler instances, two pods might get
	// scheduled concurrently by different schedulers. When they reference
	// the same ResourceClaim which already has reached its maximum number
	// of consumers, only one pod can be scheduled.
	//
	// Both schedulers try to add their pod to the claim.status.reservedFor
	// field, but only the update that reaches the API server first gets
	// stored. The other one fails with an error and the scheduler
	// which issued it knows that it must put the pod back into the queue,
	// waiting for the ResourceClaim to become usable again.
	//
	// There can be at most 256 such reservations. This may get increased in
	// the future, but not reduced.
	//
	// +optional
	// +listType=map
	// +listMapKey=uid
	// +patchStrategy=merge
	// +patchMergeKey=uid
	ReservedFor []ResourceClaimConsumerReference

	// DeallocationRequested is tombstoned since Kubernetes 1.32 where
	// it got removed. May be reused once decoding v1alpha3 is no longer
	// supported.
	// DeallocationRequested bool

	// Devices contains the status of each device allocated for this
	// claim, as reported by the driver. This can include driver-specific
	// information. Entries are owned by their respective drivers.
	//
	// +optional
	// +listType=map
	// +listMapKey=driver
	// +listMapKey=device
	// +listMapKey=pool
	// +listMapKey=shareID
	// +featureGate=DRAResourceClaimDeviceStatus
	Devices []AllocatedDeviceStatus
}

handlePod should probably be executed asynchronously.