Skip to content

Commit 3440c90

Browse files
authored
Merge pull request #4456 from Azure/hawkowl/canary-e2e-cert-fix
[ARO-22479] Attempt to wait for all cluster operators to settle before finishing install
2 parents cf1f547 + c41a919 commit 3440c90

File tree

4 files changed

+248
-84
lines changed

4 files changed

+248
-84
lines changed

pkg/cluster/condition.go

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package cluster
66
import (
77
"context"
88
"errors"
9+
"slices"
10+
"strings"
911
"time"
1012

1113
corev1 "k8s.io/api/core/v1"
@@ -22,6 +24,8 @@ const workerMachineRoleLabel = "machine.openshift.io/cluster-api-machine-role=wo
2224
const workerNodeRoleLabel = "node-role.kubernetes.io/worker"
2325
const phaseRunning = "Running"
2426

27+
var clusterOperatorsToRequireSettled = []string{"kube-controller-manager", "kube-apiserver", "kube-scheduler", "console", "authentication"}
28+
2529
// condition functions should return an error only if it's not able to be retried
2630
// if a condition function encounters a error when retrying it should return false, nil.
2731

@@ -33,39 +37,6 @@ func (m *manager) apiServersReady(ctx context.Context) (bool, error) {
3337
return clusteroperators.IsOperatorAvailable(apiserver), nil
3438
}
3539

36-
// apiServersReadyAfterCertificateConfig provides a more robust check for apiserver readiness
37-
// after certificate configuration. It waits for the operator to be available and not progressing,
38-
// and also ensures that any new revisions triggered by certificate changes have completed.
39-
func (m *manager) apiServersReadyAfterCertificateConfig(ctx context.Context) (bool, error) {
40-
apiserver, err := m.configcli.ConfigV1().ClusterOperators().Get(ctx, "kube-apiserver", metav1.GetOptions{})
41-
if err != nil {
42-
return false, nil
43-
}
44-
45-
// First check if the operator is available and not progressing
46-
if !clusteroperators.IsOperatorAvailable(apiserver) {
47-
m.log.Infof("kube-apiserver operator not yet available: %s", clusteroperators.OperatorStatusText(apiserver))
48-
return false, nil
49-
}
50-
51-
// Additional check: ensure that all conditions are stable
52-
// This helps catch cases where the operator reports as available but is still
53-
// processing certificate-related revisions
54-
for _, condition := range apiserver.Status.Conditions {
55-
if condition.Type == configv1.OperatorProgressing && condition.Status == configv1.ConditionTrue {
56-
m.log.Infof("kube-apiserver operator still progressing: %s", clusteroperators.OperatorStatusText(apiserver))
57-
return false, nil
58-
}
59-
if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue {
60-
m.log.Infof("kube-apiserver operator degraded: %s", clusteroperators.OperatorStatusText(apiserver))
61-
return false, nil
62-
}
63-
}
64-
65-
m.log.Infof("kube-apiserver operator is ready: %s", clusteroperators.OperatorStatusText(apiserver))
66-
return true, nil
67-
}
68-
6940
func (m *manager) minimumWorkerNodesReady(ctx context.Context) (bool, error) {
7041
machines, err := m.maocli.MachineV1beta1().Machines("openshift-machine-api").List(ctx, metav1.ListOptions{
7142
LabelSelector: workerMachineRoleLabel,
@@ -194,3 +165,31 @@ func (m *manager) aroCredentialsRequestReconciled(ctx context.Context) (bool, er
194165
timeSinceLastSync := time.Since(timestamp)
195166
return timeSinceLastSync.Minutes() < 5, nil
196167
}
168+
169+
// Check if all ClusterOperators have settled (i.e. are available and not
170+
// progressing).
171+
func (m *manager) clusterOperatorsHaveSettled(ctx context.Context) (bool, error) {
172+
coList := &configv1.ClusterOperatorList{}
173+
174+
err := m.ch.List(ctx, coList)
175+
if err != nil {
176+
// Be resilient to failures as kube-apiserver might drop connections while it's reconciling
177+
m.log.Errorf("failure listing cluster operators, retrying: %s", err.Error())
178+
return false, nil
179+
}
180+
181+
allSettled := true
182+
183+
// Only check the COs we care about to prevent added ones in new OpenShift
184+
// versions perhaps tripping us up later
185+
for _, co := range coList.Items {
186+
if slices.Contains(clusterOperatorsToRequireSettled, strings.ToLower(co.Name)) {
187+
if !clusteroperators.IsOperatorAvailable(&co) {
188+
allSettled = false
189+
m.log.Warnf("ClusterOperator not yet settled: %s", clusteroperators.OperatorStatusText(&co))
190+
}
191+
}
192+
}
193+
194+
return allSettled, nil
195+
}

pkg/cluster/condition_test.go

Lines changed: 206 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ import (
2020
"k8s.io/client-go/kubernetes/scheme"
2121
ktesting "k8s.io/client-go/testing"
2222

23+
"sigs.k8s.io/controller-runtime/pkg/client"
24+
clientfake "sigs.k8s.io/controller-runtime/pkg/client/fake"
25+
2326
configv1 "github.com/openshift/api/config/v1"
2427
machinev1beta1 "github.com/openshift/api/machine/v1beta1"
2528
operatorv1 "github.com/openshift/api/operator/v1"
@@ -29,8 +32,10 @@ import (
2932
cloudcredentialv1 "github.com/openshift/cloud-credential-operator/pkg/apis/cloudcredential/v1"
3033

3134
"github.com/Azure/ARO-RP/pkg/api"
35+
"github.com/Azure/ARO-RP/pkg/util/clienthelper"
3236
"github.com/Azure/ARO-RP/pkg/util/pointerutils"
3337
utilerror "github.com/Azure/ARO-RP/test/util/error"
38+
testlog "github.com/Azure/ARO-RP/test/util/log"
3439
)
3540

3641
const errMustBeNilMsg = "err must be nil; condition is retried until timeout"
@@ -413,74 +418,229 @@ func TestAroCredentialsRequestReconciled(t *testing.T) {
413418
}
414419
}
415420

416-
func TestApiServersReadyAfterCertificateConfig(t *testing.T) {
421+
func TestHaveClusterOperatorsSettled(t *testing.T) {
417422
ctx := context.Background()
418423

419424
for _, tt := range []struct {
420-
name string
421-
availableCondition configv1.ConditionStatus
422-
progressingCondition configv1.ConditionStatus
423-
degradedCondition configv1.ConditionStatus
424-
want bool
425+
name string
426+
apiserverConditions []configv1.ClusterOperatorStatusCondition
427+
kubeControllerConditions []configv1.ClusterOperatorStatusCondition
428+
want bool
425429
}{
426430
{
427-
name: "Available, not progressing, not degraded - ready",
428-
availableCondition: configv1.ConditionTrue,
429-
progressingCondition: configv1.ConditionFalse,
430-
degradedCondition: configv1.ConditionFalse,
431-
want: true,
431+
name: "APIServer Available, not progressing, KCM OK - ready",
432+
apiserverConditions: []configv1.ClusterOperatorStatusCondition{
433+
{
434+
Type: configv1.OperatorAvailable,
435+
Status: configv1.ConditionTrue,
436+
},
437+
{
438+
Type: configv1.OperatorProgressing,
439+
Status: configv1.ConditionFalse,
440+
},
441+
{
442+
Type: configv1.OperatorDegraded,
443+
Status: configv1.ConditionFalse,
444+
},
445+
},
446+
kubeControllerConditions: []configv1.ClusterOperatorStatusCondition{
447+
{
448+
Type: configv1.OperatorAvailable,
449+
Status: configv1.ConditionTrue,
450+
},
451+
{
452+
Type: configv1.OperatorProgressing,
453+
Status: configv1.ConditionFalse,
454+
},
455+
{
456+
Type: configv1.OperatorDegraded,
457+
Status: configv1.ConditionFalse,
458+
},
459+
},
460+
want: true,
432461
},
433462
{
434-
name: "Available but still progressing - not ready",
435-
availableCondition: configv1.ConditionTrue,
436-
progressingCondition: configv1.ConditionTrue,
437-
degradedCondition: configv1.ConditionFalse,
438-
want: false,
463+
name: "APIServer OK, KCM degraded - not ready",
464+
apiserverConditions: []configv1.ClusterOperatorStatusCondition{
465+
{
466+
Type: configv1.OperatorAvailable,
467+
Status: configv1.ConditionTrue,
468+
},
469+
{
470+
Type: configv1.OperatorProgressing,
471+
Status: configv1.ConditionFalse,
472+
},
473+
{
474+
Type: configv1.OperatorDegraded,
475+
Status: configv1.ConditionFalse,
476+
},
477+
},
478+
kubeControllerConditions: []configv1.ClusterOperatorStatusCondition{
479+
{
480+
Type: configv1.OperatorAvailable,
481+
Status: configv1.ConditionTrue,
482+
},
483+
{
484+
Type: configv1.OperatorProgressing,
485+
Status: configv1.ConditionFalse,
486+
},
487+
{
488+
Type: configv1.OperatorDegraded,
489+
Status: configv1.ConditionTrue,
490+
},
491+
},
492+
want: false,
439493
},
440494
{
441-
name: "Available but degraded - not ready",
442-
availableCondition: configv1.ConditionTrue,
443-
progressingCondition: configv1.ConditionFalse,
444-
degradedCondition: configv1.ConditionTrue,
445-
want: false,
495+
name: "APIServer Available but still progressing, KCM OK - not ready",
496+
apiserverConditions: []configv1.ClusterOperatorStatusCondition{
497+
{
498+
Type: configv1.OperatorAvailable,
499+
Status: configv1.ConditionTrue,
500+
},
501+
{
502+
Type: configv1.OperatorProgressing,
503+
Status: configv1.ConditionTrue,
504+
},
505+
{
506+
Type: configv1.OperatorDegraded,
507+
Status: configv1.ConditionFalse,
508+
},
509+
},
510+
kubeControllerConditions: []configv1.ClusterOperatorStatusCondition{
511+
{
512+
Type: configv1.OperatorAvailable,
513+
Status: configv1.ConditionTrue,
514+
},
515+
{
516+
Type: configv1.OperatorProgressing,
517+
Status: configv1.ConditionFalse,
518+
},
519+
{
520+
Type: configv1.OperatorDegraded,
521+
Status: configv1.ConditionFalse,
522+
},
523+
},
524+
want: false,
525+
},
526+
{
527+
name: "APIServer Available but degraded, KCM OK - not ready",
528+
apiserverConditions: []configv1.ClusterOperatorStatusCondition{
529+
{
530+
Type: configv1.OperatorAvailable,
531+
Status: configv1.ConditionTrue,
532+
},
533+
{
534+
Type: configv1.OperatorProgressing,
535+
Status: configv1.ConditionFalse,
536+
},
537+
{
538+
Type: configv1.OperatorDegraded,
539+
Status: configv1.ConditionTrue,
540+
},
541+
},
542+
kubeControllerConditions: []configv1.ClusterOperatorStatusCondition{
543+
{
544+
Type: configv1.OperatorAvailable,
545+
Status: configv1.ConditionTrue,
546+
},
547+
{
548+
Type: configv1.OperatorProgressing,
549+
Status: configv1.ConditionFalse,
550+
},
551+
{
552+
Type: configv1.OperatorDegraded,
553+
Status: configv1.ConditionFalse,
554+
},
555+
},
556+
want: false,
446557
},
447558
{
448-
name: "Not available - not ready",
449-
availableCondition: configv1.ConditionFalse,
450-
progressingCondition: configv1.ConditionFalse,
451-
degradedCondition: configv1.ConditionFalse,
452-
want: false,
559+
name: "APIServer Not available, KCM OK - not ready",
560+
apiserverConditions: []configv1.ClusterOperatorStatusCondition{
561+
{
562+
Type: configv1.OperatorAvailable,
563+
Status: configv1.ConditionFalse,
564+
},
565+
{
566+
Type: configv1.OperatorProgressing,
567+
Status: configv1.ConditionFalse,
568+
},
569+
{
570+
Type: configv1.OperatorDegraded,
571+
Status: configv1.ConditionFalse,
572+
},
573+
},
574+
kubeControllerConditions: []configv1.ClusterOperatorStatusCondition{
575+
{
576+
Type: configv1.OperatorAvailable,
577+
Status: configv1.ConditionTrue,
578+
},
579+
{
580+
Type: configv1.OperatorProgressing,
581+
Status: configv1.ConditionFalse,
582+
},
583+
{
584+
Type: configv1.OperatorDegraded,
585+
Status: configv1.ConditionFalse,
586+
},
587+
},
588+
want: false,
453589
},
454590
} {
455591
t.Run(tt.name, func(t *testing.T) {
456-
configcli := configfake.NewSimpleClientset(&configv1.ClusterOperator{
457-
ObjectMeta: metav1.ObjectMeta{
458-
Name: "kube-apiserver",
592+
objects := []client.Object{
593+
&configv1.ClusterOperator{
594+
ObjectMeta: metav1.ObjectMeta{
595+
Name: "kube-apiserver",
596+
},
597+
Status: configv1.ClusterOperatorStatus{
598+
Conditions: tt.apiserverConditions,
599+
},
459600
},
460-
Status: configv1.ClusterOperatorStatus{
461-
Conditions: []configv1.ClusterOperatorStatusCondition{
462-
{
463-
Type: configv1.OperatorAvailable,
464-
Status: tt.availableCondition,
465-
},
466-
{
467-
Type: configv1.OperatorProgressing,
468-
Status: tt.progressingCondition,
469-
},
470-
{
471-
Type: configv1.OperatorDegraded,
472-
Status: tt.degradedCondition,
601+
&configv1.ClusterOperator{
602+
ObjectMeta: metav1.ObjectMeta{
603+
Name: "kube-controller-manager",
604+
},
605+
Status: configv1.ClusterOperatorStatus{
606+
Conditions: tt.kubeControllerConditions,
607+
},
608+
},
609+
// this being degraded should not affect the APIServer
610+
&configv1.ClusterOperator{
611+
ObjectMeta: metav1.ObjectMeta{
612+
Name: "kube-widget-operator",
613+
},
614+
Status: configv1.ClusterOperatorStatus{
615+
Conditions: []configv1.ClusterOperatorStatusCondition{
616+
{
617+
Type: configv1.OperatorAvailable,
618+
Status: configv1.ConditionFalse,
619+
},
620+
{
621+
Type: configv1.OperatorProgressing,
622+
Status: configv1.ConditionTrue,
623+
},
624+
{
625+
Type: configv1.OperatorDegraded,
626+
Status: configv1.ConditionTrue,
627+
},
473628
},
474629
},
475630
},
476-
})
631+
}
632+
_, log := testlog.New()
633+
ch := clienthelper.NewWithClient(log, clientfake.
634+
NewClientBuilder().
635+
WithObjects(objects...).
636+
Build())
477637

478638
m := &manager{
479-
log: logrus.NewEntry(logrus.StandardLogger()),
480-
configcli: configcli,
639+
log: log,
640+
ch: ch,
481641
}
482642

483-
result, err := m.apiServersReadyAfterCertificateConfig(ctx)
643+
result, err := m.clusterOperatorsHaveSettled(ctx)
484644
if err != nil {
485645
t.Errorf("Unexpected error: %v", err)
486646
}

0 commit comments

Comments
 (0)