From 3eb2b524c65167565c595d01e475497b066ef22a Mon Sep 17 00:00:00 2001 From: Dharaneeshwaran Ravichandran <17947543+dharaneeshvrd@users.noreply.github.com> Date: Wed, 28 Aug 2024 09:56:56 +0530 Subject: [PATCH] Refactor infra creation to improve the overall infra setup time (#1869) --- cloud/scope/powervs_cluster.go | 20 +- controllers/ibmpowervscluster_controller.go | 232 ++++++++++++++------ 2 files changed, 172 insertions(+), 80 deletions(-) diff --git a/cloud/scope/powervs_cluster.go b/cloud/scope/powervs_cluster.go index bbec94f2f..a8587a765 100644 --- a/cloud/scope/powervs_cluster.go +++ b/cloud/scope/powervs_cluster.go @@ -823,13 +823,13 @@ func (s *PowerVSClusterScope) createServiceInstance() (*resourcecontrollerv2.Res func (s *PowerVSClusterScope) ReconcileNetwork() (bool, error) { if s.GetDHCPServerID() != nil { s.V(3).Info("DHCP server ID is set, fetching details", "id", s.GetDHCPServerID()) - requeue, err := s.isDHCPServerActive() + active, err := s.isDHCPServerActive() if err != nil { return false, err } // if dhcp server exist and in active state, its assumed that dhcp network exist // TODO(Phase 2): Verify that dhcp network is exist. - return requeue, nil + return active, nil // TODO(karthik-k-n): If needed set dhcp status here } // check network exist in cloud @@ -851,7 +851,7 @@ func (s *PowerVSClusterScope) ReconcileNetwork() (bool, error) { s.Info("Created DHCP Server", "id", *dhcpServer) s.SetStatus(infrav1beta2.ResourceTypeDHCPServer, infrav1beta2.ResourceReference{ID: dhcpServer, ControllerCreated: ptr.To(true)}) - return true, nil + return false, nil } // checkNetwork checks the network exist in cloud. @@ -913,25 +913,25 @@ func (s *PowerVSClusterScope) isDHCPServerActive() (bool, error) { return false, err } - requeue, err := s.checkDHCPServerStatus(*dhcpServer) + active, err := s.checkDHCPServerStatus(*dhcpServer) if err != nil { return false, err } - return requeue, nil + return active, nil } // checkDHCPServerStatus checks the state of a DHCP server. -// If state is BUILD, true is returned indicating a requeue for reconciliation. +// If state is active, true is returned. // In all other cases, it returns false. func (s *PowerVSClusterScope) checkDHCPServerStatus(dhcpServer models.DHCPServerDetail) (bool, error) { s.V(3).Info("Checking the status of DHCP server", "id", *dhcpServer.ID) switch *dhcpServer.Status { case string(infrav1beta2.DHCPServerStateActive): s.V(3).Info("DHCP server is in active state") - return false, nil + return true, nil case string(infrav1beta2.DHCPServerStateBuild): s.V(3).Info("DHCP server is in build state") - return true, nil + return false, nil case string(infrav1beta2.DHCPServerStateError): return false, fmt.Errorf("DHCP server creation failed and is in error state") } @@ -1917,9 +1917,9 @@ func (s *PowerVSClusterScope) ReconcileLoadBalancers() (bool, error) { } s.Info("Created VPC load balancer", "id", loadBalancerStatus.ID) s.SetLoadBalancerStatus(loadBalancer.Name, *loadBalancerStatus) - return true, nil + return false, nil } - return false, nil + return true, nil } // checkLoadBalancerStatus checks the state of a VPC load balancer. diff --git a/controllers/ibmpowervscluster_controller.go b/controllers/ibmpowervscluster_controller.go index de90429ef..ec7d385e9 100644 --- a/controllers/ibmpowervscluster_controller.go +++ b/controllers/ibmpowervscluster_controller.go @@ -20,10 +20,12 @@ import ( "context" "fmt" "strings" + "sync" "time" "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime" @@ -114,6 +116,116 @@ func (r *IBMPowerVSClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re return r.reconcile(clusterScope) } +type powerVSCluster struct { + cluster *infrav1beta2.IBMPowerVSCluster + mu sync.Mutex +} + +type reconcileResult struct { + reconcile.Result + error +} + +func (update *powerVSCluster) updateCondition(condition bool, conditionArgs ...interface{}) { + update.mu.Lock() + defer update.mu.Unlock() + if condition { + conditions.MarkTrue(update.cluster, conditionArgs[0].(capiv1beta1.ConditionType)) + return + } + + conditions.MarkFalse(update.cluster, conditionArgs[0].(capiv1beta1.ConditionType), conditionArgs[1].(string), conditionArgs[2].(capiv1beta1.ConditionSeverity), conditionArgs[3].(string), conditionArgs[4:]...) +} + +func (r *IBMPowerVSClusterReconciler) reconcilePowerVSResources(clusterScope *scope.PowerVSClusterScope, powerVSCluster *powerVSCluster, ch chan reconcileResult, wg *sync.WaitGroup) { + defer wg.Done() + powerVSLog := clusterScope.WithName("powervs") + // reconcile PowerVS service instance + powerVSLog.Info("Reconciling PowerVS service instance") + if requeue, err := clusterScope.ReconcilePowerVSServiceInstance(); err != nil { + powerVSLog.Error(err, "failed to reconcile PowerVS service instance") + powerVSCluster.updateCondition(false, infrav1beta2.ServiceInstanceReadyCondition, infrav1beta2.ServiceInstanceReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + ch <- reconcileResult{reconcile.Result{}, err} + return + } else if requeue { + powerVSLog.Info("PowerVS service instance creation is pending, requeuing") + ch <- reconcileResult{reconcile.Result{Requeue: true}, nil} + return + } + powerVSCluster.updateCondition(true, infrav1beta2.ServiceInstanceReadyCondition) + + clusterScope.IBMPowerVSClient.WithClients(powervs.ServiceOptions{CloudInstanceID: clusterScope.GetServiceInstanceID()}) + + // reconcile network + powerVSLog.Info("Reconciling network") + if dhcpServerActive, err := clusterScope.ReconcileNetwork(); err != nil { + powerVSLog.Error(err, "failed to reconcile PowerVS network") + powerVSCluster.updateCondition(false, infrav1beta2.NetworkReadyCondition, infrav1beta2.NetworkReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + ch <- reconcileResult{reconcile.Result{}, err} + return + } else if dhcpServerActive { + powerVSCluster.updateCondition(true, infrav1beta2.NetworkReadyCondition) + return + } + // Do not want to block the reconciliation of other resources like setting up TG and COS, so skipping the requeue and only logging the info. + powerVSLog.Info("PowerVS network creation is pending") +} + +func (r *IBMPowerVSClusterReconciler) reconcileVPCResources(clusterScope *scope.PowerVSClusterScope, powerVSCluster *powerVSCluster, ch chan reconcileResult, wg *sync.WaitGroup) { + defer wg.Done() + vpcLog := clusterScope.WithName("vpc") + vpcLog.Info("Reconciling VPC") + if requeue, err := clusterScope.ReconcileVPC(); err != nil { + clusterScope.Error(err, "failed to reconcile VPC") + powerVSCluster.updateCondition(false, infrav1beta2.VPCReadyCondition, infrav1beta2.VPCReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + ch <- reconcileResult{reconcile.Result{}, err} + return + } else if requeue { + vpcLog.Info("VPC creation is pending, requeuing") + ch <- reconcileResult{reconcile.Result{Requeue: true}, nil} + return + } + powerVSCluster.updateCondition(true, infrav1beta2.VPCReadyCondition) + + // reconcile VPC Subnet + vpcLog.Info("Reconciling VPC subnets") + if requeue, err := clusterScope.ReconcileVPCSubnets(); err != nil { + vpcLog.Error(err, "failed to reconcile VPC subnets") + powerVSCluster.updateCondition(false, infrav1beta2.VPCSubnetReadyCondition, infrav1beta2.VPCSubnetReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + ch <- reconcileResult{reconcile.Result{}, err} + return + } else if requeue { + vpcLog.Info("VPC subnet creation is pending, requeuing") + ch <- reconcileResult{reconcile.Result{Requeue: true}, nil} + return + } + powerVSCluster.updateCondition(true, infrav1beta2.VPCSubnetReadyCondition) + + // reconcile VPC security group + vpcLog.Info("Reconciling VPC security group") + if err := clusterScope.ReconcileVPCSecurityGroups(); err != nil { + vpcLog.Error(err, "failed to reconcile VPC security groups") + powerVSCluster.updateCondition(false, infrav1beta2.VPCSecurityGroupReadyCondition, infrav1beta2.VPCSecurityGroupReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + ch <- reconcileResult{reconcile.Result{}, err} + return + } + powerVSCluster.updateCondition(true, infrav1beta2.VPCSecurityGroupReadyCondition) + + // reconcile LoadBalancer + vpcLog.Info("Reconciling VPC load balancers") + if loadBalancerReady, err := clusterScope.ReconcileLoadBalancers(); err != nil { + vpcLog.Error(err, "failed to reconcile VPC load balancers") + powerVSCluster.updateCondition(false, infrav1beta2.LoadBalancerReadyCondition, infrav1beta2.LoadBalancerReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + ch <- reconcileResult{reconcile.Result{}, err} + return + } else if loadBalancerReady { + powerVSCluster.updateCondition(true, infrav1beta2.LoadBalancerReadyCondition) + return + } + // Do not want to block the reconciliation of other resources like setting up TG and COS, so skipping the requeue and only logging the info. + vpcLog.Info("VPC load balancer creation is pending") +} + func (r *IBMPowerVSClusterReconciler) reconcile(clusterScope *scope.PowerVSClusterScope) (ctrl.Result, error) { //nolint:gocyclo if controllerutil.AddFinalizer(clusterScope.IBMPowerVSCluster, infrav1beta2.IBMPowerVSClusterFinalizer) { return ctrl.Result{}, nil @@ -140,97 +252,82 @@ func (r *IBMPowerVSClusterReconciler) reconcile(clusterScope *scope.PowerVSClust return reconcile.Result{}, err } - powerVSCluster := clusterScope.IBMPowerVSCluster - // reconcile PowerVS service instance - clusterScope.Info("Reconciling PowerVS service instance") - if requeue, err := clusterScope.ReconcilePowerVSServiceInstance(); err != nil { - clusterScope.Error(err, "failed to reconcile PowerVS service instance") - conditions.MarkFalse(powerVSCluster, infrav1beta2.ServiceInstanceReadyCondition, infrav1beta2.ServiceInstanceReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) - return reconcile.Result{}, err - } else if requeue { - clusterScope.Info("PowerVS service instance creation is pending, requeuing") - return reconcile.Result{RequeueAfter: 1 * time.Minute}, nil + powerVSCluster := &powerVSCluster{ + cluster: clusterScope.IBMPowerVSCluster, } - conditions.MarkTrue(powerVSCluster, infrav1beta2.ServiceInstanceReadyCondition) - clusterScope.IBMPowerVSClient.WithClients(powervs.ServiceOptions{CloudInstanceID: clusterScope.GetServiceInstanceID()}) + var wg sync.WaitGroup + ch := make(chan reconcileResult) - // reconcile network - clusterScope.Info("Reconciling network") - if requeue, err := clusterScope.ReconcileNetwork(); err != nil { - clusterScope.Error(err, "failed to reconcile PowerVS network") - conditions.MarkFalse(powerVSCluster, infrav1beta2.NetworkReadyCondition, infrav1beta2.NetworkReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) - return reconcile.Result{}, err - } else if requeue { - clusterScope.Info("PowerVS network creation is pending, requeuing") - return reconcile.Result{RequeueAfter: 1 * time.Minute}, nil - } - conditions.MarkTrue(powerVSCluster, infrav1beta2.NetworkReadyCondition) + // reconcile PowerVS resources + wg.Add(1) + go r.reconcilePowerVSResources(clusterScope, powerVSCluster, ch, &wg) // reconcile VPC - clusterScope.Info("Reconciling VPC") - if requeue, err := clusterScope.ReconcileVPC(); err != nil { - clusterScope.Error(err, "failed to reconcile VPC") - conditions.MarkFalse(powerVSCluster, infrav1beta2.VPCReadyCondition, infrav1beta2.VPCReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) - return reconcile.Result{}, err - } else if requeue { - clusterScope.Info("VPC creation is pending, requeuing") - return reconcile.Result{RequeueAfter: 15 * time.Second}, nil - } - conditions.MarkTrue(powerVSCluster, infrav1beta2.VPCReadyCondition) + wg.Add(1) + go r.reconcileVPCResources(clusterScope, powerVSCluster, ch, &wg) - // reconcile VPC Subnet - clusterScope.Info("Reconciling VPC subnets") - if requeue, err := clusterScope.ReconcileVPCSubnets(); err != nil { - clusterScope.Error(err, "failed to reconcile VPC subnets") - conditions.MarkFalse(powerVSCluster, infrav1beta2.VPCSubnetReadyCondition, infrav1beta2.VPCSubnetReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) - return reconcile.Result{}, err - } else if requeue { - clusterScope.Info("VPC subnet creation is pending, requeuing") - return reconcile.Result{RequeueAfter: 15 * time.Second}, nil + // wait for above reconcile to complete and close the channel + go func() { + wg.Wait() + close(ch) + }() + + var requeue bool + var errList []error + // receive return values from the channel and decide the requeue + for val := range ch { + if val.Requeue { + requeue = true + } + if val.error != nil { + errList = append(errList, val.error) + } } - conditions.MarkTrue(powerVSCluster, infrav1beta2.VPCSubnetReadyCondition) - // reconcile VPC security group - clusterScope.Info("Reconciling VPC security group") - if err := clusterScope.ReconcileVPCSecurityGroups(); err != nil { - clusterScope.Error(err, "failed to reconcile VPC security groups") - conditions.MarkFalse(powerVSCluster, infrav1beta2.VPCSecurityGroupReadyCondition, infrav1beta2.VPCSecurityGroupReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) - return reconcile.Result{}, err + if requeue && len(errList) > 1 { + return ctrl.Result{RequeueAfter: 30 * time.Second}, kerrors.NewAggregate(errList) + } else if requeue { + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } else if len(errList) > 1 { + return ctrl.Result{}, kerrors.NewAggregate(errList) } - conditions.MarkTrue(powerVSCluster, infrav1beta2.VPCSecurityGroupReadyCondition) // reconcile Transit Gateway clusterScope.Info("Reconciling Transit Gateway") if requeue, err := clusterScope.ReconcileTransitGateway(); err != nil { clusterScope.Error(err, "failed to reconcile transit gateway") - conditions.MarkFalse(powerVSCluster, infrav1beta2.TransitGatewayReadyCondition, infrav1beta2.TransitGatewayReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + conditions.MarkFalse(powerVSCluster.cluster, infrav1beta2.TransitGatewayReadyCondition, infrav1beta2.TransitGatewayReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) return reconcile.Result{}, err } else if requeue { clusterScope.Info("Transit gateway creation is pending, requeuing") return reconcile.Result{RequeueAfter: 1 * time.Minute}, nil } - conditions.MarkTrue(powerVSCluster, infrav1beta2.TransitGatewayReadyCondition) - - // reconcile LoadBalancer - clusterScope.Info("Reconciling VPC load balancers") - if requeue, err := clusterScope.ReconcileLoadBalancers(); err != nil { - clusterScope.Error(err, "failed to reconcile VPC load balancers") - conditions.MarkFalse(powerVSCluster, infrav1beta2.LoadBalancerReadyCondition, infrav1beta2.LoadBalancerReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) - return reconcile.Result{}, err - } else if requeue { - clusterScope.Info("VPC load balancer creation is pending, requeuing") - return reconcile.Result{RequeueAfter: 1 * time.Minute}, nil - } + conditions.MarkTrue(powerVSCluster.cluster, infrav1beta2.TransitGatewayReadyCondition) // reconcile COSInstance if clusterScope.IBMPowerVSCluster.Spec.Ignition != nil { clusterScope.Info("Reconciling COS service instance") if err := clusterScope.ReconcileCOSInstance(); err != nil { - conditions.MarkFalse(powerVSCluster, infrav1beta2.COSInstanceReadyCondition, infrav1beta2.COSInstanceReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) + conditions.MarkFalse(powerVSCluster.cluster, infrav1beta2.COSInstanceReadyCondition, infrav1beta2.COSInstanceReconciliationFailedReason, capiv1beta1.ConditionSeverityError, err.Error()) return reconcile.Result{}, err } - conditions.MarkTrue(powerVSCluster, infrav1beta2.COSInstanceReadyCondition) + conditions.MarkTrue(powerVSCluster.cluster, infrav1beta2.COSInstanceReadyCondition) + } + + var networkReady, loadBalancerReady bool + for _, cond := range clusterScope.IBMPowerVSCluster.Status.Conditions { + if cond.Type == infrav1beta2.NetworkReadyCondition && cond.Status == corev1.ConditionTrue { + networkReady = true + } + if cond.Type == infrav1beta2.LoadBalancerReadyCondition && cond.Status == corev1.ConditionTrue { + loadBalancerReady = true + } + } + + if !networkReady || !loadBalancerReady { + clusterScope.Info("Network or LoadBalancer still not ready, requeuing") + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil } // update cluster object with loadbalancer host @@ -238,10 +335,6 @@ func (r *IBMPowerVSClusterReconciler) reconcile(clusterScope *scope.PowerVSClust if loadBalancer == nil { return reconcile.Result{}, fmt.Errorf("failed to fetch public loadbalancer") } - if clusterScope.GetLoadBalancerState(loadBalancer.Name) == nil || *clusterScope.GetLoadBalancerState(loadBalancer.Name) != infrav1beta2.VPCLoadBalancerStateActive { - clusterScope.Info("LoadBalancer state is not active") - return reconcile.Result{RequeueAfter: time.Minute}, nil - } clusterScope.Info("Getting load balancer host") hostName := clusterScope.GetLoadBalancerHostName(loadBalancer.Name) @@ -249,7 +342,6 @@ func (r *IBMPowerVSClusterReconciler) reconcile(clusterScope *scope.PowerVSClust clusterScope.Info("LoadBalancer hostname is not yet available, requeuing") return reconcile.Result{RequeueAfter: time.Minute}, nil } - conditions.MarkTrue(powerVSCluster, infrav1beta2.LoadBalancerReadyCondition) clusterScope.IBMPowerVSCluster.Spec.ControlPlaneEndpoint.Host = *clusterScope.GetLoadBalancerHostName(loadBalancer.Name) clusterScope.IBMPowerVSCluster.Spec.ControlPlaneEndpoint.Port = clusterScope.APIServerPort()