Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions internal/controller/metal3.io/baremetalhost_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/predicate"
)

Expand Down Expand Up @@ -1457,6 +1458,51 @@ func (r *BareMetalHostReconciler) actionDeprovisioning(ctx context.Context, prov
return actionComplete{}
}

// hostFirmwareSpecsExist reports whether firmware servicing specs are still present
// on the host or related HFS/HFC objects. Used by the ServicingError recovery fast-path
// before calling getHostFirmwareSettings/Components, which block on generation mismatch.
// Only NotFound errors from Get are treated as absent resources; other errors are returned.
func (r *BareMetalHostReconciler) hostFirmwareSpecsExist(
ctx context.Context,
info *reconcileInfo,
liveFirmwareSettingsAllowed, liveFirmwareUpdatesAllowed bool,
) (bool, error) {
specsExist := false

if liveFirmwareSettingsAllowed {
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) &&
info.host.Spec.Firmware != nil {
specsExist = true
}

if !specsExist {
hfsCheck := &metal3api.HostFirmwareSettings{}
err := r.Get(ctx, info.request.NamespacedName, hfsCheck)
if err != nil {
if !k8serrors.IsNotFound(err) {
return false, fmt.Errorf("could not load host firmware settings: %w", err)
}
} else if len(hfsCheck.Spec.Settings) > 0 {
specsExist = true
}
}
}

if liveFirmwareUpdatesAllowed {
hfcCheck := &metal3api.HostFirmwareComponents{}
err := r.Get(ctx, info.request.NamespacedName, hfcCheck)
if err != nil {
if !k8serrors.IsNotFound(err) {
return false, fmt.Errorf("could not load host firmware components: %w", err)
}
} else if len(hfcCheck.Spec.Updates) > 0 {
specsExist = true
}
}

return specsExist, nil
}

func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov provisioner.Provisioner, info *reconcileInfo, hup *metal3api.HostUpdatePolicy) (result actionResult) {
servicingData := provisioner.ServicingData{}

Expand All @@ -1474,6 +1520,36 @@ func (r *BareMetalHostReconciler) doServiceIfNeeded(ctx context.Context, prov pr
liveFirmwareUpdatesAllowed = (hup.Spec.FirmwareUpdates == metal3api.HostUpdatePolicyOnReboot)
}

// Fast-path: when recovering from a servicing error, check if specs are
// cleared before calling getHostFirmwareSettings/Components. Those
// functions fail on generation mismatch (sub-controller hasn't reconciled
// yet), which would block the abort/recovery path indefinitely.
if info.host.Status.ErrorType == metal3api.ServicingError &&
(liveFirmwareSettingsAllowed || liveFirmwareUpdatesAllowed) {
specsExist, err := r.hostFirmwareSpecsExist(ctx, info, liveFirmwareSettingsAllowed, liveFirmwareUpdatesAllowed)
if err != nil {
return actionError{fmt.Errorf("could not determine if firmware specs exist: %w", err)}
}
if !specsExist {
info.log.Info("specs cleared while in servicing error state, attempting abort")
provResult, _, err := prov.Service(ctx, servicingData, false, false)
if err != nil {
return actionError{fmt.Errorf("failed to abort servicing: %w", err)}
}
if provResult.ErrorMessage != "" {
return actionError{fmt.Errorf("failed to abort servicing: %s", provResult.ErrorMessage)}
}
if provResult.Dirty {
return actionContinue{provResult.RequeueAfter}
}
info.log.Info("successfully recovered from servicing error")
if clearErrorWithStatus(info.host, metal3api.OperationalStatusOK) {
return actionUpdate{}
}
return actionComplete{}
}
}

if liveFirmwareSettingsAllowed {
// handling pre-HFS FirmwareSettings here
if !reflect.DeepEqual(info.host.Status.Provisioning.Firmware, info.host.Spec.Firmware) {
Expand Down Expand Up @@ -2538,6 +2614,23 @@ func (r *BareMetalHostReconciler) SetupWithManager(mgr ctrl.Manager, preprovImgE
controller.Owns(&metal3api.PreprovisioningImage{})
}

// Watch HFC/HFS for spec changes (generation bumps) so that the BMH
// controller reconciles promptly when a user clears firmware specs,
// rather than waiting for error-state exponential backoff to expire.
// HFS and HFC use the same name and namespace as their BareMetalHost.
firmwareEventHandler := handler.EnqueueRequestsFromMapFunc(
func(_ context.Context, obj client.Object) []ctrl.Request {
return []ctrl.Request{{NamespacedName: client.ObjectKey{
Name: obj.GetName(),
Namespace: obj.GetNamespace(),
}}}
},
)
controller.Watches(&metal3api.HostFirmwareSettings{}, firmwareEventHandler,
builder.WithPredicates(predicate.GenerationChangedPredicate{}))
controller.Watches(&metal3api.HostFirmwareComponents{}, firmwareEventHandler,
builder.WithPredicates(predicate.GenerationChangedPredicate{}))

return controller.Complete(r)
}

Expand Down
101 changes: 101 additions & 0 deletions internal/controller/metal3.io/baremetalhost_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3053,6 +3053,107 @@ func TestHostFirmwareSettings(t *testing.T) {
}
}

func TestHostFirmwareSpecsExist(t *testing.T) {
var trueVal = true
var falseVal = false

host := newDefaultHost(t)
info := makeReconcileInfo(host)
info.request = newRequest(host)

t.Run("legacy firmware spec pending", func(t *testing.T) {
host := host.DeepCopy()
host.Status.Provisioning.Firmware = &metal3api.FirmwareConfig{
VirtualizationEnabled: &falseVal,
}
host.Spec.Firmware = &metal3api.FirmwareConfig{
VirtualizationEnabled: &trueVal,
}
info := makeReconcileInfo(host)
info.request = newRequest(host)
r := newTestReconciler(t, host)

exists, err := r.hostFirmwareSpecsExist(t.Context(), info, true, false)
require.NoError(t, err)
assert.True(t, exists)
})

t.Run("HFS with settings", func(t *testing.T) {
hfs := newHostFirmwareSettings(host, nil)
r := newTestReconciler(t, host, hfs)

exists, err := r.hostFirmwareSpecsExist(t.Context(), info, true, false)
require.NoError(t, err)
assert.True(t, exists)
})

t.Run("HFS not found and no legacy spec", func(t *testing.T) {
r := newTestReconciler(t, host)

exists, err := r.hostFirmwareSpecsExist(t.Context(), info, true, false)
require.NoError(t, err)
assert.False(t, exists)
})

t.Run("HFS empty settings", func(t *testing.T) {
hfs := newHostFirmwareSettings(host, nil)
hfs.Spec.Settings = nil
r := newTestReconciler(t, host, hfs)

exists, err := r.hostFirmwareSpecsExist(t.Context(), info, true, false)
require.NoError(t, err)
assert.False(t, exists)
})

t.Run("HFC with updates", func(t *testing.T) {
hfc := &metal3api.HostFirmwareComponents{
ObjectMeta: metav1.ObjectMeta{
Name: host.Name,
Namespace: host.Namespace,
},
Spec: metal3api.HostFirmwareComponentsSpec{
Updates: []metal3api.FirmwareUpdate{
{Component: "bmc", URL: "http://example.com/bmc"},
},
},
}
r := newTestReconciler(t, host, hfc)

exists, err := r.hostFirmwareSpecsExist(t.Context(), info, false, true)
require.NoError(t, err)
assert.True(t, exists)
})

t.Run("neither HFS nor HFC when both paths enabled", func(t *testing.T) {
r := newTestReconciler(t, host)

exists, err := r.hostFirmwareSpecsExist(t.Context(), info, true, true)
require.NoError(t, err)
assert.False(t, exists)
})

t.Run("Get error is propagated", func(t *testing.T) {
r := newTestReconciler(t, host)
r.Client = &getErrorClient{Client: r.Client, err: fmt.Errorf("apiserver unavailable")}

_, err := r.hostFirmwareSpecsExist(t.Context(), info, true, false)
require.Error(t, err)
assert.Contains(t, err.Error(), "could not load host firmware settings")
})
}

type getErrorClient struct {
client.Client
err error
}

func (c *getErrorClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
if c.err != nil {
return c.err
}
return c.Client.Get(ctx, key, obj, opts...)
}

func TestBMHTransitionToPreparing(t *testing.T) {
var True = true
var False = false
Expand Down
19 changes: 7 additions & 12 deletions internal/controller/metal3.io/hostfirmwaresettings_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,15 +247,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info
generation := info.hfs.GetGeneration()

if specMismatch {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionTrue, reason, "") {
dirty = true
}

// Run validation on the Spec to detect invalid values entered by user, including Spec settings not in Status
// Eventually this will be handled by a webhook
errors := r.validateHostFirmwareSettings(info, &newStatus, schema)
if len(errors) == 0 {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
dirty = true
}
} else {
Expand All @@ -266,15 +266,15 @@ func (r *HostFirmwareSettingsReconciler) updateStatus(ctx context.Context, info
}
}
reason = reasonConfigurationError
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionFalse, reason, "Invalid BIOS setting") {
dirty = true
}
}
} else {
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsValid, metav1.ConditionTrue, reason, "") {
dirty = true
}
if setCondition(generation, &newStatus, info, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") {
if setCondition(generation, &newStatus, metal3api.FirmwareSettingsChangeDetected, metav1.ConditionFalse, reason, "") {
dirty = true
}
}
Expand Down Expand Up @@ -425,7 +425,7 @@ func (r *HostFirmwareSettingsReconciler) publishEvent(ctx context.Context, reque
}
}

func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus, info *rInfo,
func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus,
cond metal3api.SettingsConditionType, newStatus metav1.ConditionStatus,
reason conditionReason, message string) bool {
newCondition := metav1.Condition{
Expand All @@ -435,13 +435,8 @@ func setCondition(generation int64, status *metal3api.HostFirmwareSettingsStatus
Reason: string(reason),
Message: message,
}
meta.SetStatusCondition(&status.Conditions, newCondition)

currCond := meta.FindStatusCondition(info.hfs.Status.Conditions, string(cond))
if currCond == nil || currCond.Status != newStatus {
return true
}
return false
return meta.SetStatusCondition(&status.Conditions, newCondition)
}

// Generate a name based on the schema key and values which should be the same for similar hardware.
Expand Down
26 changes: 26 additions & 0 deletions internal/controller/metal3.io/hostfirmwaresettings_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,32 @@ func TestStoreHostFirmwareSettings(t *testing.T) {
}
}

func TestSetConditionObservedGeneration(t *testing.T) {
status := &metal3api.HostFirmwareSettingsStatus{}

changed := setCondition(1, status,
metal3api.FirmwareSettingsValid, metav1.ConditionTrue,
reasonSuccess, "")
assert.True(t, changed, "first call should report a change")

cond := status.Conditions[0]
assert.Equal(t, int64(1), cond.ObservedGeneration)

changed = setCondition(1, status,
metal3api.FirmwareSettingsValid, metav1.ConditionTrue,
reasonSuccess, "")
assert.False(t, changed, "same generation and status should not report a change")

// This is the scenario that was broken: generation bumps but status
// stays the same. The old code compared only Status and missed the
// ObservedGeneration difference, returning false.
changed = setCondition(2, status,
metal3api.FirmwareSettingsValid, metav1.ConditionTrue,
reasonSuccess, "")
assert.True(t, changed, "generation change alone must report a change")
assert.Equal(t, int64(2), status.Conditions[0].ObservedGeneration)
}

// Test the function to validate hostFirmwareSettings.
func TestValidateHostFirmwareSettings(t *testing.T) {
testCases := []struct {
Expand Down
3 changes: 3 additions & 0 deletions test/e2e/config/ironic.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ intervals:
default/wait-secret-deletion: ["1m", "1s"]
default/wait-connect-ssh: ["2m", "10s"]
default/wait-externally-provisioned: ["1m", "10ms"]
default/wait-servicing: [ "1m", "100ms" ]
default/wait-firmware-settings: [ "15m", "1s" ]
default/wait-reconcile: [ "1s", "10ms" ]

kindExtraPortMappings:
# Expose Ironic ports so they are reachable outside of kind
Expand Down
Loading