-
Notifications
You must be signed in to change notification settings - Fork 73
minor errors in pod.go #100
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -24,7 +24,7 @@ import ( | |||||||||||||||||||||||||||||||||
"github.com/prometheus/client_golang/prometheus" | ||||||||||||||||||||||||||||||||||
"github.com/prometheus/client_golang/prometheus/promauto" | ||||||||||||||||||||||||||||||||||
log "github.com/sirupsen/logrus" | ||||||||||||||||||||||||||||||||||
"k8s.io/api/core/v1" | ||||||||||||||||||||||||||||||||||
v1 "k8s.io/api/core/v1" | ||||||||||||||||||||||||||||||||||
"k8s.io/apimachinery/pkg/api/errors" | ||||||||||||||||||||||||||||||||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||||||||||||||||||||||||||||||||||
"k8s.io/apimachinery/pkg/labels" | ||||||||||||||||||||||||||||||||||
|
@@ -86,7 +86,7 @@ func (jc *JobController) AddPod(obj interface{}) { | |||||||||||||||||||||||||||||||||
logger := commonutil.LoggerForPod(pod, jc.Controller.GetAPIGroupVersionKind().Kind) | ||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||
if job == nil { | ||||||||||||||||||||||||||||||||||
if pod.Labels[apiv1.GroupNameLabel] == jc.Controller.GetGroupNameLabelValue() { | ||||||||||||||||||||||||||||||||||
if pod.Labels[apiv1.GroupNameLabel] != jc.Controller.GetGroupNameLabelValue() { | ||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good catch up There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please check logics here. common/pkg/controller.v1/common/job_controller.go Lines 321 to 336 in ac0c6e1
If we can not find the job, we will end reconcile loop and directly There're several reason
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Jeffwan There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @xfate123 You are right. I didn't follow logic in
I may not explain this clearly. you are right. It could be an orphan pod (job has been deleted - case 2) or a pod point to different job (UID mismatch - case 3). case 1 seems match the criterion as well. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Jeffwan Thank you so much for your clear explanation |
||||||||||||||||||||||||||||||||||
logger.Info("This pod's job does not exist") | ||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||
return | ||||||||||||||||||||||||||||||||||
|
@@ -392,9 +392,9 @@ func (jc *JobController) ReconcilePods( | |||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||
// Check if the pod is retryable. | ||||||||||||||||||||||||||||||||||
if spec.RestartPolicy == apiv1.RestartPolicyExitCode { | ||||||||||||||||||||||||||||||||||
if pod.Status.Phase == v1.PodFailed && trainutil.IsRetryableExitCode(exitCode) { | ||||||||||||||||||||||||||||||||||
if pod.Status.Phase == v1.PodFailed && !trainutil.IsRetryableExitCode(exitCode) { | ||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this was intended and correct but we probably need to improve the log message here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The original logic here is to provide flexibility to define your own retryable exit code. This should not be changed. Since kubernetes doesn't have restart method, That's the reason we |
||||||||||||||||||||||||||||||||||
failedPodsCount.Inc() | ||||||||||||||||||||||||||||||||||
logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name) | ||||||||||||||||||||||||||||||||||
logger.Infof("Need to delete the pod: %v.%v", pod.Namespace, pod.Name) | ||||||||||||||||||||||||||||||||||
if err := jc.PodControl.DeletePod(pod.Namespace, pod.Name, runtimeObject); err != nil { | ||||||||||||||||||||||||||||||||||
return err | ||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change is unnecessary
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is created by gofmt I think, maybe we can keep it.