This repository has been archived by the owner on Sep 12, 2023. It is now read-only.

minor errors in pod.go #100

Open · wants to merge 1 commit into base: master
8 changes: 4 additions & 4 deletions pkg/controller.v1/common/pod.go
@@ -24,7 +24,7 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
Member

This change is unnecessary

Member

I think it is created by gofmt, so maybe we can keep it.

"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
@@ -86,7 +86,7 @@ func (jc *JobController) AddPod(obj interface{}) {
logger := commonutil.LoggerForPod(pod, jc.Controller.GetAPIGroupVersionKind().Kind)

if job == nil {
- if pod.Labels[apiv1.GroupNameLabel] == jc.Controller.GetGroupNameLabelValue() {
+ if pod.Labels[apiv1.GroupNameLabel] != jc.Controller.GetGroupNameLabelValue() {
Contributor

Good catch.

Member
@Jeffwan, Aug 10, 2020

@xfate123

Please check the logic here.

func (jc *JobController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) metav1.Object {
	// We can't look up by UID, so look up by Name and then verify UID.
	// Don't even try to look up by Name if it's the wrong Kind.
	if controllerRef.Kind != jc.Controller.GetAPIGroupVersionKind().Kind {
		return nil
	}
	job, err := jc.Controller.GetJobFromInformerCache(namespace, controllerRef.Name)
	if err != nil {
		return nil
	}
	if job.GetUID() != controllerRef.UID {
		// The controller we found with this Name is not the same one that the
		// ControllerRef points to.
		return nil
	}
	return job
}

If we cannot find the job, we end the reconcile loop and return directly. The only thing that matters is whether we want to have a meaningful log.

There are several reasons job == nil here:

  1. Kind doesn't match
  2. GroupNameLabel doesn't match
  3. Job doesn't exist. In this case, the pod is an orphan pod.

The 3rd one is the only case where we want to persist the log. Does that make sense?
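
For reference, here is a minimal sketch of how the nil-job branch in AddPod relates to those cases. It is illustrative only, assuming the surrounding structure shown in the diff above, not the exact upstream code:

// Illustrative sketch of the nil-job branch in AddPod; not the exact upstream code.
// resolveControllerRef returns nil for a wrong Kind, a job missing from the
// informer cache, or a UID mismatch; the caller cannot tell these apart.
if job == nil {
	// The group-name label is the only extra signal available here: it says
	// whether the pod was ever meant for this operator's group at all.
	if pod.Labels[apiv1.GroupNameLabel] == jc.Controller.GetGroupNameLabelValue() {
		// The pod claims to belong to us but its job cannot be found,
		// so this is the case worth logging.
		logger.Info("This pod's job does not exist")
	}
	return
}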

Author

@Jeffwan
Thank you so much, Jeff. It's much clearer after your patient explanation.
But I am still a little confused. I think the three cases should be:

  1. Kind doesn't match
  2. Name unmatched
  3. Name matched but UID unmatched

My understanding is that the third case cannot prove this pod is an orphan pod. It can only prove that the controllerRef points to a different job.
Thank you again. Really appreciate your help.

Member
@Jeffwan, Aug 12, 2020

@xfate123 You are right. I didn't follow the logic in resolveControllerRef. UID mismatch is one of the cases. We check GroupNameLabel again on the caller side, which I think is unnecessary. We use the fixed value kubeflow.org for most of the operators.

My understanding is that the third case cannot prove this pod is an orphan pod. It can only prove that the controllerRef points to a different job.

I may not have explained this clearly. You are right. It could be an orphan pod (the job has been deleted, case 2) or a pod pointing to a different job (UID mismatch, case 3). Case 1 seems to match the criterion as well.

Author

@Jeffwan Thank you so much for your clear explanation.

logger.Info("This pod's job does not exist")
}
return
@@ -392,9 +392,9 @@ func (jc *JobController) ReconcilePods(
}
// Check if the pod is retryable.
if spec.RestartPolicy == apiv1.RestartPolicyExitCode {
- if pod.Status.Phase == v1.PodFailed && trainutil.IsRetryableExitCode(exitCode) {
+ if pod.Status.Phase == v1.PodFailed && !trainutil.IsRetryableExitCode(exitCode) {
Member
@terrytangyuan, Aug 4, 2020

I think this was intended and correct, but we probably need to improve the log message here.

Member

The original logic here is to provide the flexibility to define your own retryable exit codes. This should not be changed.

Since Kubernetes doesn't have a restart method, we delete the pod and wait for the next reconcile loop to create a new one. It's asynchronous, so "restart" here may be confusing to users.
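
As a rough illustration of what such a retryable-exit-code check might look like (an assumption for this thread; the real trainutil.IsRetryableExitCode may differ):

// Illustrative sketch only; the real trainutil.IsRetryableExitCode may differ.
// Exit codes of 128+ usually mean the container was killed by a signal
// (e.g. SIGKILL from the OOM killer), so a retry can reasonably succeed,
// while codes 1-127 are treated as permanent application errors.
func IsRetryableExitCode(exitCode int32) bool {
	return exitCode >= 128
}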

failedPodsCount.Inc()
logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name)
logger.Infof("Need to delete the pod: %v.%v", pod.Namespace, pod.Name)
if err := jc.PodControl.DeletePod(pod.Namespace, pod.Name, runtimeObject); err != nil {
return err
}