kubelet version: v1.27.1
Earlier articles covered deleting and adding pods, whose main flow runs through the HandlePodAdditions function in kubelet.go and through pod_workers.go. This article picks up after the pod_workers.go flow finishes, when the SyncPod, SyncTerminatingPod, and SyncTerminatedPod functions in kubelet.go are triggered to do the actual work.
These three functions perform the kubelet layer's final validation before calling into the CRI to create containers.
This article covers the main work of SyncPod, which runs after managePodLoop (renamed podWorkerLoop in v1.27).
This is also where my PR comes in: the TODO references #113606, an issue opened by a kubelet maintainer, and the proposed solution links to my PR.
Below is the whole function; its branch functions are covered in the steps that follow.
Step 1
- Set up tracing and record the pod's state
- Record when the pod was first seen
- If the pod is being created, record the pod worker's startup latency
- Convert the pod and the podStatus (the cached object state) into the running API status; see Step 2
- Copy the pod's IPs back into podStatus.IPs (for host-network pods these are set inside generateAPIPodStatus)
- If the phase is Succeeded or Failed, the pod's status may no longer change: record the status and return
- Check whether the pod can run; the pod-addition flow has a similar check
- If it may not run, record the final status along with the reason
- Record the latest status in the status manager (covered in a later article; it is a fairly large component whose job is to sync the latest state with the apiserver)
- If the pod is not allowed to run, kill it by calling down into the killPod function (covered in a later article)
- If the network plugin is not ready yet, only start the pod if it uses host networking; otherwise return an error
- Ensure the kubelet knows about the secrets and ConfigMaps the pod references
- Create the pcm (pod container manager) used for cgroup creation; if the pod is about to be deleted, nothing is created or updated
- Check whether this pod is running for the first time; if it is not the first run but its cgroup does not exist, the pod must be killed; after killing it, recreate the cgroup unless the restart policy is RestartPolicyNever, and verify the creation succeeded
- Handle static pods: if the mirror pod exists but is being deleted or no longer matches the static pod, delete it; if it has been deleted or does not exist, recreate the mirror pod
- Create the pod's data directories
- Register the pod with the probe manager
- New in v1.27: if InPlacePodVerticalScaling is enabled, the pod's resource requests can be changed without restarting it; a later article on the InPlacePodVerticalScaling feature will cover this
- Start syncing the pod (call the container runtime, which ultimately calls the CRI to create containers); covered in a later article
- Check again whether the pod's resources were resized; if so, update the cache
func (kl *Kubelet) SyncPod(_ context.Context, updateType kubetypes.SyncPodType, pod, mirrorPod *v1.Pod, podStatus *kubecontainer.PodStatus) (isTerminal bool, err error) {
// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
// Set up tracing and record the pod's state
ctx, otelSpan := kl.tracer.Start(context.TODO(), "syncPod", trace.WithAttributes(
attribute.String("k8s.pod.uid", string(pod.UID)),
attribute.String("k8s.pod", klog.KObj(pod).String()),
attribute.String("k8s.pod.name", pod.Name),
attribute.String("k8s.pod.update_type", updateType.String()),
attribute.String("k8s.namespace.name", pod.Namespace),
))
klog.V(4).InfoS("SyncPod enter", "pod", klog.KObj(pod), "podUID", pod.UID)
defer func() {
klog.V(4).InfoS("SyncPod exit", "pod", klog.KObj(pod), "podUID", pod.UID, "isTerminal", isTerminal)
otelSpan.End()
}()
// Record when the pod was first seen
var firstSeenTime time.Time
if firstSeenTimeStr, ok := pod.Annotations[kubetypes.ConfigFirstSeenAnnotationKey]; ok {
firstSeenTime = kubetypes.ConvertToTimestamp(firstSeenTimeStr).Get()
}
// If the pod is being created, record the pod worker's startup latency
if updateType == kubetypes.SyncPodCreate {
if !firstSeenTime.IsZero() {
metrics.PodWorkerStartDuration.Observe(metrics.SinceInSeconds(firstSeenTime))
} else {
klog.V(3).InfoS("First seen time not recorded for pod",
"podUID", pod.UID,
"pod", klog.KObj(pod))
}
}
// Convert the pod and the podStatus (cached object state) into the API status; see Step 2
apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false)
// Copy the pod's IPs back into podStatus.IPs (set inside generateAPIPodStatus for host-network pods)
podStatus.IPs = make([]string, 0, len(apiPodStatus.PodIPs))
for _, ipInfo := range apiPodStatus.PodIPs {
podStatus.IPs = append(podStatus.IPs, ipInfo.IP)
}
if len(podStatus.IPs) == 0 && len(apiPodStatus.PodIP) > 0 {
podStatus.IPs = []string{apiPodStatus.PodIP}
}
// If the phase is Succeeded or Failed, the pod status may no longer change: record it and return
if apiPodStatus.Phase == v1.PodSucceeded || apiPodStatus.Phase == v1.PodFailed {
kl.statusManager.SetPodStatus(pod, apiPodStatus)
isTerminal = true
return isTerminal, nil
}
// Check whether the pod can run; the pod-addition flow has a similar check
runnable := kl.canRunPod(pod)
if !runnable.Admit {
// If it may not run, record the final status along with the reason
if apiPodStatus.Phase != v1.PodFailed && apiPodStatus.Phase != v1.PodSucceeded {
apiPodStatus.Phase = v1.PodPending
}
apiPodStatus.Reason = runnable.Reason
apiPodStatus.Message = runnable.Message
const waitingReason = "Blocked"
for _, cs := range apiPodStatus.InitContainerStatuses {
if cs.State.Waiting != nil {
cs.State.Waiting.Reason = waitingReason
}
}
for _, cs := range apiPodStatus.ContainerStatuses {
if cs.State.Waiting != nil {
cs.State.Waiting.Reason = waitingReason
}
}
}
// Record the latest status in the statusManager (covered in a later article; a fairly large component that syncs the latest state with the apiserver)
kl.statusManager.SetPodStatus(pod, apiPodStatus)
if !runnable.Admit {
klog.V(2).InfoS("Pod is not runnable and must have running containers stopped", "pod", klog.KObj(pod), "podUID", pod.UID, "message", runnable.Message)
var syncErr error
// Convert to a running-pod type
p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
// This pod may not run, so kill it by calling down into the killPod function (covered in a later article)
if err := kl.killPod(ctx, pod, p, nil); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
syncErr = fmt.Errorf("error killing pod: %v", err)
utilruntime.HandleError(syncErr)
} else {
syncErr = fmt.Errorf("pod cannot be run: %s", runnable.Message)
}
return false, syncErr
}
// If the network plugin is not ready yet, only start the pod if it uses host networking; otherwise return an error
if err := kl.runtimeState.networkErrors(); err != nil && !kubecontainer.IsHostNetworkPod(pod) {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.NetworkNotReady, "%s: %v", NetworkNotReadyErrorMsg, err)
return false, fmt.Errorf("%s: %v", NetworkNotReadyErrorMsg, err)
}
// Ensure the kubelet knows about the secrets and ConfigMaps the pod references
if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
if kl.secretManager != nil {
kl.secretManager.RegisterPod(pod)
}
if kl.configMapManager != nil {
kl.configMapManager.RegisterPod(pod)
}
}
// Create the pod container manager (pcm) used for cgroup creation
pcm := kl.containerManager.NewPodContainerManager()
// If the pod is about to be deleted, do not create or update anything
if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
firstSync := true
// Check whether this pod is running for the first time
for _, containerStatus := range apiPodStatus.ContainerStatuses {
if containerStatus.State.Running != nil {
firstSync = false
break
}
}
podKilled := false
// If this is not the pod's first run but its cgroup does not exist, the pod must be killed
if !pcm.Exists(pod) && !firstSync {
p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
if err := kl.killPod(ctx, pod, p, nil); err == nil {
podKilled = true
} else {
klog.ErrorS(err, "KillPod failed", "pod", klog.KObj(pod), "podStatus", podStatus)
}
}
// After killing the pod, recreate the cgroup unless the restart policy is RestartPolicyNever
if !(podKilled && pod.Spec.RestartPolicy == v1.RestartPolicyNever) {
if !pcm.Exists(pod) {
if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
klog.V(2).InfoS("Failed to update QoS cgroups while syncing pod", "pod", klog.KObj(pod), "err", err)
}
// Ensure the cgroup was created successfully
if err := pcm.EnsureExists(pod); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToCreatePodContainer, "unable to ensure pod container exists: %v", err)
return false, fmt.Errorf("failed to ensure that the pod: %v cgroups exist and are correctly applied: %v", pod.UID, err)
}
}
}
}
// Is this a static pod?
if kubetypes.IsStaticPod(pod) {
deleted := false
// If the static pod has a mirror pod that is being deleted or is out of date, delete it
if mirrorPod != nil {
if mirrorPod.DeletionTimestamp != nil || !kl.podManager.IsMirrorPodOf(mirrorPod, pod) {
klog.InfoS("Trying to delete pod", "pod", klog.KObj(pod), "podUID", mirrorPod.ObjectMeta.UID)
podFullName := kubecontainer.GetPodFullName(pod)
var err error
deleted, err = kl.podManager.DeleteMirrorPod(podFullName, &mirrorPod.ObjectMeta.UID)
if deleted {
klog.InfoS("Deleted mirror pod because it is outdated", "pod", klog.KObj(mirrorPod))
} else if err != nil {
klog.ErrorS(err, "Failed deleting mirror pod", "pod", klog.KObj(mirrorPod))
}
}
}
// If the mirror pod was deleted or does not exist, recreate it
if mirrorPod == nil || deleted {
node, err := kl.GetNode()
if err != nil || node.DeletionTimestamp != nil {
klog.V(4).InfoS("No need to create a mirror pod, since node has been removed from the cluster", "node", klog.KRef("", string(kl.nodeName)))
} else {
klog.V(4).InfoS("Creating a mirror pod for static pod", "pod", klog.KObj(pod))
if err := kl.podManager.CreateMirrorPod(pod); err != nil {
klog.ErrorS(err, "Failed creating a mirror pod for", "pod", klog.KObj(pod))
}
}
}
}
// Create the pod's data directories
if err := kl.makePodDataDirs(pod); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToMakePodDataDirectories, "error making pod data directories: %v", err)
klog.ErrorS(err, "Unable to make pod data directories for pod", "pod", klog.KObj(pod))
return false, err
}
if !kl.podWorkers.IsPodTerminationRequested(pod.UID) {
// Wait for volumes to attach and mount
if err := kl.volumeManager.WaitForAttachAndMount(pod); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedMountVolume, "Unable to attach or mount volumes: %v", err)
klog.ErrorS(err, "Unable to attach or mount volumes for pod; skipping pod", "pod", klog.KObj(pod))
return false, err
}
}
// Get the pod's image pull secrets
pullSecrets := kl.getPullSecretsForPod(pod)
// Register the pod with the probe manager
kl.probeManager.AddPod(pod)
// New in v1.27: if InPlacePodVerticalScaling is enabled, the pod's resource requests can be changed without a restart (covered in a later article)
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if kl.podWorkers.CouldHaveRunningContainers(pod.UID) && !kubetypes.IsStaticPod(pod) {
pod = kl.handlePodResourcesResize(pod)
}
}
// Start syncing the pod (call the container runtime, which ultimately calls the CRI to create containers); covered in a later article
result := kl.containerRuntime.SyncPod(ctx, pod, podStatus, pullSecrets, kl.backOff)
kl.reasonCache.Update(pod.UID, result)
if err := result.Error(); err != nil {
for _, r := range result.SyncResults {
if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff {
return false, err
}
}
return false, nil
}
// Check again whether the pod was resized; if so, update the cache
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && isPodResizeInProgress(pod, &apiPodStatus) {
runningPod := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
if err, _ := kl.pleg.UpdateCache(&runningPod, pod.UID); err != nil {
klog.ErrorS(err, "Failed to update pod cache", "pod", klog.KObj(pod))
return false, err
}
}
return false, nil
}
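Before moving on to Step 2, here is a small illustration of the in-place resize branch above. This is a hedged sketch rather than kubelet code: it assumes the v1.27 core/v1 API names ContainerResizePolicy, NotRequired, and RestartContainer, and the variable name resizableContainer is mine.
package example

import v1 "k8s.io/api/core/v1"

// A container that (assuming the v1.27 core/v1 resize API) lets the kubelet
// apply CPU changes in place, while a memory change restarts the container.
// With InPlacePodVerticalScaling enabled, handlePodResourcesResize above can
// then accept a changed CPU request without recreating the pod.
var resizableContainer = v1.Container{
	Name:  "app",
	Image: "nginx",
	ResizePolicy: []v1.ContainerResizePolicy{
		{ResourceName: v1.ResourceCPU, RestartPolicy: v1.NotRequired},
		{ResourceName: v1.ResourceMemory, RestartPolicy: v1.RestartContainer},
	},
}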
Step 2. The generateAPIPodStatus function (pkg/kubelet/kubelet_pods.go)
- Get the previous status
- Convert the status; see Step 3
- Determine the resize status for in-place resource updates
- Determine the final Phase
- If the previous or cached phase is Succeeded or Failed, the current phase must match it; no further change is allowed
- If the phase is unchanged, carry over the previous reason and message
- If the pod is being evicted, record the reason
- Update the status with the probe results
- Record several required conditions (disruption, containers ready, ready, init containers, etc.)
- If host networking is used, set the host IP, and substitute the host IP for the pod IP if the pod IP is empty.
func (kl *Kubelet) generateAPIPodStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, podIsTerminal bool) v1.PodStatus {
klog.V(3).InfoS("Generating pod status", "podIsTerminal", podIsTerminal, "pod", klog.KObj(pod))
// Get the previous status
oldPodStatus, found := kl.statusManager.GetPodStatus(pod.UID)
if !found {
oldPodStatus = pod.Status
}
// Convert the status; see Step 3
s := kl.convertStatusToAPIStatus(pod, podStatus, oldPodStatus)
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
// Determine the resize status for in-place resource updates
s.Resize = kl.determinePodResizeStatus(pod, s)
}
allStatus := append(append([]v1.ContainerStatus{}, s.ContainerStatuses...), s.InitContainerStatuses...)
// Determine the final Phase
s.Phase = getPhase(pod, allStatus, podIsTerminal)
klog.V(4).InfoS("Got phase for pod", "pod", klog.KObj(pod), "oldPhase", oldPodStatus.Phase, "phase", s.Phase)
// If the previous phase (from the status manager or the API) was terminal, keep it; the phase may not regress
if s.Phase != v1.PodFailed && s.Phase != v1.PodSucceeded {
switch {
case oldPodStatus.Phase == v1.PodFailed || oldPodStatus.Phase == v1.PodSucceeded:
klog.V(4).InfoS("Status manager phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", oldPodStatus.Phase)
s.Phase = oldPodStatus.Phase
case pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded:
klog.V(4).InfoS("API phase was terminal, updating phase to match", "pod", klog.KObj(pod), "phase", pod.Status.Phase)
s.Phase = pod.Status.Phase
}
}
// If the phase is unchanged, carry over the previous reason and message
if s.Phase == oldPodStatus.Phase {
// preserve the reason and message which is associated with the phase
s.Reason = oldPodStatus.Reason
s.Message = oldPodStatus.Message
if len(s.Reason) == 0 {
s.Reason = pod.Status.Reason
}
if len(s.Message) == 0 {
s.Message = pod.Status.Message
}
}
for _, podSyncHandler := range kl.PodSyncHandlers {
// If the pod is being evicted, record the reason
if result := podSyncHandler.ShouldEvict(pod); result.Evict {
s.Phase = v1.PodFailed
s.Reason = result.Reason
s.Message = result.Message
break
}
}
if pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded {
if s.Phase != pod.Status.Phase {
klog.ErrorS(nil, "Pod attempted illegal phase transition", "pod", klog.KObj(pod), "originalStatusPhase", pod.Status.Phase, "apiStatusPhase", s.Phase, "apiStatus", s)
s.Phase = pod.Status.Phase
}
}
// Update the status with the probe results
kl.probeManager.UpdatePodStatus(pod.UID, s)
// Record several required conditions (disruption, containers ready, ready, init containers, etc.)
s.Conditions = make([]v1.PodCondition, 0, len(pod.Status.Conditions)+1)
for _, c := range pod.Status.Conditions {
if !kubetypes.PodConditionByKubelet(c.Type) {
s.Conditions = append(s.Conditions, c)
}
}
if utilfeature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
cType := v1.DisruptionTarget
if _, condition := podutil.GetPodConditionFromList(oldPodStatus.Conditions, cType); condition != nil {
s.Conditions = utilpod.ReplaceOrAppendPodCondition(s.Conditions, condition)
}
}
if utilfeature.DefaultFeatureGate.Enabled(features.PodHasNetworkCondition) {
s.Conditions = append(s.Conditions, status.GeneratePodHasNetworkCondition(pod, podStatus))
}
s.Conditions = append(s.Conditions, status.GeneratePodInitializedCondition(&pod.Spec, s.InitContainerStatuses, s.Phase))
s.Conditions = append(s.Conditions, status.GeneratePodReadyCondition(&pod.Spec, s.Conditions, s.ContainerStatuses, s.Phase))
s.Conditions = append(s.Conditions, status.GenerateContainersReadyCondition(&pod.Spec, s.ContainerStatuses, s.Phase))
s.Conditions = append(s.Conditions, v1.PodCondition{
Type: v1.PodScheduled,
Status: v1.ConditionTrue,
})
// If host networking is used, set the host IP, and substitute the host IP for the pod IP if the pod IP is empty
if kl.kubeClient != nil {
hostIPs, err := kl.getHostIPsAnyWay()
if err != nil {
klog.V(4).InfoS("Cannot get host IPs", "err", err)
} else {
s.HostIP = hostIPs[0].String()
if kubecontainer.IsHostNetworkPod(pod) {
if s.PodIP == "" {
s.PodIP = hostIPs[0].String()
s.PodIPs = []v1.PodIP{{IP: s.PodIP}}
}
if len(hostIPs) == 2 && len(s.PodIPs) == 1 {
s.PodIPs = append(s.PodIPs, v1.PodIP{IP: hostIPs[1].String()})
}
}
}
}
return *s
}
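getPhase itself is not shown in this article. As a rough reference, it derives the phase from the aggregated container states along the following lines; this is a simplified sketch (the real implementation also handles init containers and the unknown state in more detail), and phaseSketch is a hypothetical name.
package example

import v1 "k8s.io/api/core/v1"

// phaseSketch is a simplified model of getPhase; counts are over the pod's
// app containers.
func phaseSketch(waiting, running, succeeded, failed, total int, restartPolicy v1.RestartPolicy, podIsTerminal bool) v1.PodPhase {
	switch {
	case waiting > 0 || total == 0:
		return v1.PodPending // some containers have not started yet
	case running > 0:
		return v1.PodRunning // at least one container is still running
	case succeeded == total && (podIsTerminal || restartPolicy != v1.RestartPolicyAlways):
		return v1.PodSucceeded // all containers exited 0 and will not be restarted
	case failed > 0 && restartPolicy == v1.RestartPolicyNever:
		return v1.PodFailed // something failed and will never be restarted
	default:
		return v1.PodRunning // stopped containers will be restarted
	}
}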
Step 3. convertStatusToAPIStatus, which mainly fills in several important fields of pod.Status
- Set the pod's IPs
- Set the QoS class
- Set the statuses of the containers and init containers; see Step 4
- Set the ephemeral container statuses
func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontainer.PodStatus, oldPodStatus v1.PodStatus) *v1.PodStatus {
var apiPodStatus v1.PodStatus
// Set the pod's IPs
podIPs := make([]string, len(podStatus.IPs))
copy(podIPs, podStatus.IPs)
podIPs = kl.sortPodIPs(podIPs)
for _, ip := range podIPs {
apiPodStatus.PodIPs = append(apiPodStatus.PodIPs, v1.PodIP{IP: ip})
}
if len(apiPodStatus.PodIPs) > 0 {
apiPodStatus.PodIP = apiPodStatus.PodIPs[0].IP
}
// Set the QoS class
apiPodStatus.QOSClass = v1qos.GetPodQOS(pod)
// Set the statuses of the containers and init containers; see Step 4
apiPodStatus.ContainerStatuses = kl.convertToAPIContainerStatuses(
pod, podStatus,
oldPodStatus.ContainerStatuses,
pod.Spec.Containers,
len(pod.Spec.InitContainers) > 0,
false,
)
apiPodStatus.InitContainerStatuses = kl.convertToAPIContainerStatuses(
pod, podStatus,
oldPodStatus.InitContainerStatuses,
pod.Spec.InitContainers,
len(pod.Spec.InitContainers) > 0,
true,
)
var ecSpecs []v1.Container
for i := range pod.Spec.EphemeralContainers {
ecSpecs = append(ecSpecs, v1.Container(pod.Spec.EphemeralContainers[i].EphemeralContainerCommon))
}
// Set the ephemeral container statuses
apiPodStatus.EphemeralContainerStatuses = kl.convertToAPIContainerStatuses(
pod, podStatus,
oldPodStatus.EphemeralContainerStatuses,
ecSpecs,
len(pod.Spec.InitContainers) > 0,
false,
)
return &apiPodStatus
}
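v1qos.GetPodQOS, used above, classifies the pod into one of three QoS classes. The rules are well known; here is a minimal sketch (simplified: the real implementation also merges init containers and handles partially-specified resources more carefully), with qosSketch as a hypothetical name.
package example

import v1 "k8s.io/api/core/v1"

// qosSketch mirrors the familiar QoS rules: BestEffort when no container sets
// any requests or limits, Guaranteed when every container has CPU and memory
// limits equal to its requests, Burstable otherwise.
func qosSketch(pod *v1.Pod) v1.PodQOSClass {
	anySet, guaranteed := false, true
	for _, c := range pod.Spec.Containers {
		if len(c.Resources.Requests)+len(c.Resources.Limits) > 0 {
			anySet = true
		}
		for _, r := range []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory} {
			limit, hasLimit := c.Resources.Limits[r]
			request, hasRequest := c.Resources.Requests[r]
			if !hasLimit || (hasRequest && request.Cmp(limit) != 0) {
				guaranteed = false
			}
		}
	}
	switch {
	case !anySet:
		return v1.PodQOSBestEffort
	case guaranteed:
		return v1.PodQOSGuaranteed
	default:
		return v1.PodQOSBurstable
	}
}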
Step 4. convertToAPIContainerStatuses, which sets the container statuses
- Define two helper closures (not covered in detail):
  - convertContainerStatus sets a container's run state
  - convertContainerStatusResources sets a container's resource configuration
- Default every container's status to the waiting state
- Initialize the required status fields
- Iterate over all containers: if a container is absent from the runtime cache but its previous status shows it was running (and not terminated), record "ContainerStatusUnknown" as its last termination state
- Loop over the cached container statuses and match them against the current containers; for each match, fetch its previous status, call the first helper, and store the converted run state in the current status
- For init containers: if the container exited with code 0, it is complete and needs no further handling
- If a container should be restarted in the next SyncPod, report it as waiting
- If there is a failure record, record the failure reason
func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus {
// The convertContainerStatus helper sets a container's run state
convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus {
cid := cs.ID.String()
status := &v1.ContainerStatus{
Name: cs.Name,
RestartCount: int32(cs.RestartCount),
Image: cs.Image,
ImageID: cs.ImageID,
ContainerID: cid,
}
switch {
case cs.State == kubecontainer.ContainerStateRunning:
status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
case cs.State == kubecontainer.ContainerStateCreated:
status.State.Waiting = &v1.ContainerStateWaiting{}
case cs.State == kubecontainer.ContainerStateExited:
status.State.Terminated = &v1.ContainerStateTerminated{
ExitCode: int32(cs.ExitCode),
Reason: cs.Reason,
Message: cs.Message,
StartedAt: metav1.NewTime(cs.StartedAt),
FinishedAt: metav1.NewTime(cs.FinishedAt),
ContainerID: cid,
}
case cs.State == kubecontainer.ContainerStateUnknown &&
oldStatus != nil &&
oldStatus.State.Running != nil:
status.State.Terminated = &v1.ContainerStateTerminated{
Reason: "ContainerStatusUnknown",
Message: "The container could not be located when the pod was terminated",
ExitCode: 137, // this code indicates an error
}
status.RestartCount = oldStatus.RestartCount + 1
default:
status.State.Waiting = &v1.ContainerStateWaiting{}
}
return status
}
// The convertContainerStatusResources helper sets a container's resource configuration
convertContainerStatusResources := func(cName string, status *v1.ContainerStatus, cStatus *kubecontainer.Status, oldStatuses map[string]v1.ContainerStatus) *v1.ResourceRequirements {
var requests, limits v1.ResourceList
oldStatus, oldStatusFound := oldStatuses[cName]
determineResource := func(rName v1.ResourceName, v1ContainerResource, oldStatusResource, resource v1.ResourceList) {
if oldStatusFound {
if oldStatus.State.Running == nil || status.ContainerID != oldStatus.ContainerID {
if r, exists := v1ContainerResource[rName]; exists {
resource[rName] = r.DeepCopy()
}
} else {
if oldStatusResource != nil {
if r, exists := oldStatusResource[rName]; exists {
resource[rName] = r.DeepCopy()
}
}
}
}
}
container := kubecontainer.GetContainerSpec(pod, cName)
found := false
status.AllocatedResources, found = kl.statusManager.GetContainerResourceAllocation(string(pod.UID), cName)
if !(container.Resources.Requests == nil && container.Resources.Limits == nil) && !found {
klog.ErrorS(nil, "resource allocation not found in checkpoint store", "pod", pod.Name, "container", cName)
if oldStatusFound {
status.AllocatedResources = oldStatus.AllocatedResources
}
}
if oldStatus.Resources == nil {
oldStatus.Resources = &v1.ResourceRequirements{}
}
if container.Resources.Limits != nil {
limits = make(v1.ResourceList)
if cStatus.Resources != nil && cStatus.Resources.CPULimit != nil {
limits[v1.ResourceCPU] = cStatus.Resources.CPULimit.DeepCopy()
} else {
determineResource(v1.ResourceCPU, container.Resources.Limits, oldStatus.Resources.Limits, limits)
}
if cStatus.Resources != nil && cStatus.Resources.MemoryLimit != nil {
limits[v1.ResourceMemory] = cStatus.Resources.MemoryLimit.DeepCopy()
} else {
determineResource(v1.ResourceMemory, container.Resources.Limits, oldStatus.Resources.Limits, limits)
}
if ephemeralStorage, found := container.Resources.Limits[v1.ResourceEphemeralStorage]; found {
limits[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy()
}
}
if status.AllocatedResources != nil {
requests = make(v1.ResourceList)
if cStatus.Resources != nil && cStatus.Resources.CPURequest != nil {
requests[v1.ResourceCPU] = cStatus.Resources.CPURequest.DeepCopy()
} else {
determineResource(v1.ResourceCPU, status.AllocatedResources, oldStatus.Resources.Requests, requests)
}
if memory, found := status.AllocatedResources[v1.ResourceMemory]; found {
requests[v1.ResourceMemory] = memory.DeepCopy()
}
if ephemeralStorage, found := status.AllocatedResources[v1.ResourceEphemeralStorage]; found {
requests[v1.ResourceEphemeralStorage] = ephemeralStorage.DeepCopy()
}
}
resources := &v1.ResourceRequirements{
Limits: limits,
Requests: requests,
}
return resources
}
// Build a map of the previous statuses
oldStatuses := make(map[string]v1.ContainerStatus, len(containers))
for _, status := range previousStatus {
oldStatuses[status.Name] = status
}
// Default every container's status to the waiting state
statuses := make(map[string]*v1.ContainerStatus, len(containers))
defaultWaitingState := v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: ContainerCreating}}
if hasInitContainers {
defaultWaitingState = v1.ContainerState{Waiting: &v1.ContainerStateWaiting{Reason: PodInitializing}}
}
// Initialize the required status fields
for _, container := range containers {
status := &v1.ContainerStatus{
Name: container.Name,
Image: container.Image,
State: defaultWaitingState,
}
oldStatus, found := oldStatuses[container.Name]
if found {
if oldStatus.State.Terminated != nil {
status = &oldStatus
} else {
status.RestartCount = oldStatus.RestartCount
status.LastTerminationState = oldStatus.LastTerminationState
}
}
statuses[container.Name] = status
}
for _, container := range containers {
// If a container is absent from the runtime cache but its previous status shows it was running (and not terminated), record "ContainerStatusUnknown" as its last termination state
found := false
for _, cStatus := range podStatus.ContainerStatuses {
if container.Name == cStatus.Name {
found = true
break
}
}
if found {
continue
}
oldStatus, ok := oldStatuses[container.Name]
if !ok {
continue
}
if oldStatus.State.Terminated != nil {
continue
}
if oldStatus.State.Running == nil {
continue
}
status := statuses[container.Name]
isDefaultWaitingStatus := status.State.Waiting != nil && status.State.Waiting.Reason == ContainerCreating
if hasInitContainers {
isDefaultWaitingStatus = status.State.Waiting != nil && status.State.Waiting.Reason == PodInitializing
}
if !isDefaultWaitingStatus {
continue
}
if status.LastTerminationState.Terminated != nil {
continue
}
status.LastTerminationState.Terminated = &v1.ContainerStateTerminated{
Reason: "ContainerStatusUnknown",
Message: "The container could not be located when the pod was deleted. The container used to be Running",
ExitCode: 137,
}
if pod.DeletionTimestamp == nil {
status.RestartCount += 1
}
statuses[container.Name] = status
}
containerStatusesCopy := make([]*kubecontainer.Status, len(podStatus.ContainerStatuses))
copy(containerStatusesCopy, podStatus.ContainerStatuses)
sort.Sort(sort.Reverse(kubecontainer.SortContainerStatusesByCreationTime(containerStatusesCopy)))
containerSeen := map[string]int{}
// Match the cached container statuses against the current containers; for each match, fetch its previous status, convert it via the helper above, and store the result (or record it as the last termination state)
for _, cStatus := range containerStatusesCopy {
cName := cStatus.Name
if _, ok := statuses[cName]; !ok {
continue
}
if containerSeen[cName] >= 2 {
continue
}
var oldStatusPtr *v1.ContainerStatus
if oldStatus, ok := oldStatuses[cName]; ok {
oldStatusPtr = &oldStatus
}
status := convertContainerStatus(cStatus, oldStatusPtr)
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if status.State.Running != nil {
status.Resources = convertContainerStatusResources(cName, status, cStatus, oldStatuses)
}
}
if containerSeen[cName] == 0 {
statuses[cName] = status
} else {
statuses[cName].LastTerminationState = status.State
}
containerSeen[cName] = containerSeen[cName] + 1
}
for _, container := range containers {
if isInitContainer {
// For init containers: if the container exited with code 0, it is complete and needs no further handling
s := podStatus.FindContainerStatusByName(container.Name)
if s != nil && s.State == kubecontainer.ContainerStateExited && s.ExitCode == 0 {
continue
}
}
// If the container should be restarted in the next SyncPod, report it as waiting
if !kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
continue
}
status := statuses[container.Name]
// If there is a failure record, record the failure reason
reason, ok := kl.reasonCache.Get(pod.UID, container.Name)
if !ok {
continue
}
if status.State.Terminated != nil {
status.LastTerminationState = status.State
}
status.State = v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{
Reason: reason.Err.Error(),
Message: reason.Message,
},
}
statuses[container.Name] = status
}
// Sort and return
if isInitContainer {
return kubetypes.SortStatusesOfInitContainers(pod, statuses)
}
containerStatuses := make([]v1.ContainerStatus, 0, len(statuses))
for _, status := range statuses {
containerStatuses = append(containerStatuses, *status)
}
sort.Sort(kubetypes.SortedContainerStatuses(containerStatuses))
return containerStatuses
}
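kubecontainer.ShouldContainerBeRestarted, called near the end of the function above, encodes the restart-policy rules. A minimal sketch of those rules (the real helper also restarts containers that were never started or are in the created/unknown state; restartSketch is a hypothetical name):
package example

import v1 "k8s.io/api/core/v1"

// restartSketch: a container is not restarted if the pod is being deleted or
// the container is still running; otherwise the pod's restart policy decides.
func restartSketch(policy v1.RestartPolicy, podDeleting, running bool, exitCode int) bool {
	if podDeleting || running {
		return false
	}
	switch policy {
	case v1.RestartPolicyNever:
		return false
	case v1.RestartPolicyOnFailure:
		return exitCode != 0 // restart only if the last run failed
	default: // v1.RestartPolicyAlways
		return true
	}
}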
II. SyncTerminatingPod
The code is nearly identical to SyncPod. The difference is that SyncPod ultimately calls the SyncPod in kuberuntime_manager.go, while SyncTerminatingPod calls killPod.
func (kl *Kubelet) SyncTerminatingPod(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, gracePeriod *int64, podStatusFn func(*v1.PodStatus)) error {
// TODO(#113606): connect this with the incoming context parameter, which comes from the pod worker.
ctx, otelSpan := kl.tracer.Start(ctx, "syncTerminatingPod", trace.WithAttributes(
attribute.String("k8s.pod.uid", string(pod.UID)),
attribute.String("k8s.pod", klog.KObj(pod).String()),
attribute.String("k8s.pod.name", pod.Name),
attribute.String("k8s.namespace.name", pod.Namespace),
))
defer otelSpan.End()
apiPodStatus := kl.generateAPIPodStatus(pod, podStatus, false)
if podStatusFn != nil {
podStatusFn(&apiPodStatus)
}
kl.statusManager.SetPodStatus(pod, apiPodStatus)
kl.probeManager.StopLivenessAndStartup(pod)
p := kubecontainer.ConvertPodStatusToRunningPod(kl.getRuntime().Type(), podStatus)
if err := kl.killPod(ctx, pod, p, gracePeriod); err != nil {
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.FailedToKillPod, "error killing pod: %v", err)
utilruntime.HandleError(err)
return err
}
//****//
}
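The gracePeriod parameter here is an override supplied by the pod worker and is passed straight into killPod. As a hedged sketch of how an effective grace period can be resolved (the real resolution happens further down the kill path in the runtime manager; gracePeriodSketch and its exact ordering are illustrative assumptions):
package example

import v1 "k8s.io/api/core/v1"

// gracePeriodSketch shows one plausible resolution order for the termination
// grace period handed to killPod: an explicit override wins, then the
// deletion grace period stamped on the object, then the pod spec, then the
// API default of 30 seconds (v1.DefaultTerminationGracePeriodSeconds).
func gracePeriodSketch(pod *v1.Pod, override *int64) int64 {
	switch {
	case override != nil:
		return *override
	case pod.DeletionGracePeriodSeconds != nil:
		return *pod.DeletionGracePeriodSeconds
	case pod.Spec.TerminationGracePeriodSeconds != nil:
		return *pod.Spec.TerminationGracePeriodSeconds
	default:
		return 30
	}
}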
These two functions share essentially the same handling logic and ultimately call killPod. (SyncPod can also end up calling killPod, so the downstream handling converges into a single flow, which will be covered in one place later.)
III. Summary
This article is fairly long and touches many other functions; I pulled out only a few of the more complex ones.
It mentioned the v1.27 in-place pod resize, which will be covered in a later article on the InPlacePodVerticalScaling feature, and the status manager, a component that also deserves an article of its own.
I have not pasted SyncTerminatingPod and SyncTerminatedPod in full; their flow matches SyncPod, and SyncPod already contains their main flow, the killPod function, so there is no need to repeat it. The next articles will cover killPod and SyncPod at the container runtime layer.