linkpoolio · jleeh · Jun 9, 2026 · Jun 9, 2026 · Jun 9, 2026
diff --git a/module/bdev/nvme/bdev_nvme.c b/module/bdev/nvme/bdev_nvme.c
@@ -2015,7 +2015,23 @@ bdev_nvme_poll_adminq(void *arg)
 			bdev_nvme_change_adminq_poll_period(nvme_ctrlr,
 							    g_opts.nvme_adminq_poll_period_us);
 			disconnected_cb(nvme_ctrlr);
-		} else {
+		} else if (!nvme_ctrlr->resetting) {
+			/*
+			 * Only initiate failover when no reset/failover is already
+			 * in flight. While resetting is true the reset state machine
+			 * owns recovery (reconnect attempts, ctrlr_loss_timeout, and
+			 * advancing to the next trid on reset completion). Without
+			 * this guard, a stuck reconnect (e.g. a remote replica target
+			 * that accepts the TCP connection but never completes the
+			 * admin handshake) makes process_admin_completions return < 0
+			 * on every poll, and each poll re-calls bdev_nvme_failover_ctrlr
+			 * only to hit the "already in progress" path, return -EBUSY,
+			 * and emit a NOTICE. Observed in production as ~231k
+			 * "Unable to perform failover, already in progress." lines in
+			 * ~15 min (~1.1 GB of log), saturating the SPDK reactor until
+			 * the liveness probe SIGKILLed spdk_tgt. Skipping the redundant
+			 * re-drive is safe: failover-while-resetting is a no-op anyway.
+			 */
 			bdev_nvme_failover_ctrlr(nvme_ctrlr);
 		}
 	} else if (spdk_nvme_ctrlr_get_admin_qp_failure_reason(nvme_ctrlr->ctrlr) !=

diff --git a/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c b/test/unit/lib/bdev/nvme/bdev_nvme.c/bdev_nvme_ut.c
@@ -1838,6 +1838,100 @@ test_failover_ctrlr(void)
 	CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
 }
 
+static void
+test_failover_not_redriven_while_resetting(void)
+{
+	struct spdk_nvme_transport_id trid = {};
+	struct spdk_nvme_ctrlr ctrlr = {};
+	struct nvme_ctrlr *nvme_ctrlr = NULL;
+	struct spdk_io_channel *ch1, *ch2;
+	int rc;
+
+	ut_init_trid(&trid);
+	TAILQ_INIT(&ctrlr.active_io_qpairs);
+
+	set_thread(0);
+
+	rc = nvme_ctrlr_create(&ctrlr, "nvme0", &trid, NULL);
+	CU_ASSERT(rc == 0);
+
+	nvme_ctrlr = nvme_ctrlr_get_by_name("nvme0");
+	SPDK_CU_ASSERT_FATAL(nvme_ctrlr != NULL);
+
+	ch1 = spdk_get_io_channel(nvme_ctrlr);
+	SPDK_CU_ASSERT_FATAL(ch1 != NULL);
+
+	set_thread(1);
+
+	ch2 = spdk_get_io_channel(nvme_ctrlr);
+	SPDK_CU_ASSERT_FATAL(ch2 != NULL);
+
+	set_thread(1);
+
+	/*
+	 * Case 1: a reset/failover is already in progress (resetting == true).
+	 * The admin queue is disconnected, so bdev_nvme_poll_adminq() sees
+	 * spdk_nvme_ctrlr_process_admin_completions() < 0 with no disconnected_cb
+	 * pending. It must NOT re-drive failover while resetting -- doing so spins
+	 * a no-op every poll, which in production logged ~231k "already in
+	 * progress" NOTICEs and saturated the reactor until liveness SIGKILLed
+	 * spdk_tgt. We observe the skip via pending_failover: an un-guarded
+	 * re-drive reaches bdev_nvme_failover_ctrlr_unsafe's "resetting &&
+	 * !in_failover" branch and sets pending_failover = true.
+	 */
+	nvme_ctrlr->resetting = true;
+	nvme_ctrlr->in_failover = false;
+	nvme_ctrlr->pending_failover = false;
+	nvme_ctrlr->disconnected_cb = NULL;
+	ctrlr.adminq.is_connected = false;
+
+	bdev_nvme_poll_adminq(nvme_ctrlr);
+
+	CU_ASSERT(nvme_ctrlr->pending_failover == false);
+	CU_ASSERT(nvme_ctrlr->resetting == true);
+
+	/* End the simulated in-flight reset before the next case. */
+	nvme_ctrlr->resetting = false;
+
+	/*
+	 * Case 2: no reset in progress. The same admin-queue failure MUST still
+	 * initiate failover, so the guard does not suppress legitimate recovery.
+	 * The controller transitions to resetting, and the reset reconnects and
+	 * completes on its own (reconnect_poll restores adminq connectivity).
+	 */
+	nvme_ctrlr->in_failover = false;
+	nvme_ctrlr->pending_failover = false;
+	nvme_ctrlr->disconnected_cb = NULL;
+	ctrlr.adminq.is_connected = false;
+
+	bdev_nvme_poll_adminq(nvme_ctrlr);
+
+	CU_ASSERT(nvme_ctrlr->resetting == true);
+
+	poll_threads();
+	spdk_delay_us(g_opts.nvme_adminq_poll_period_us);
+	poll_threads();
+
+	CU_ASSERT(nvme_ctrlr->resetting == false);
+
+	spdk_put_io_channel(ch2);
+
+	set_thread(0);
+
+	spdk_put_io_channel(ch1);
+
+	poll_threads();
+
+	rc = spdk_bdev_nvme_delete("nvme0", &g_any_path, NULL, NULL);
+	CU_ASSERT(rc == 0);
+
+	poll_threads();
+	spdk_delay_us(1000);
+	poll_threads();
+
+	CU_ASSERT(nvme_ctrlr_get_by_name("nvme0") == NULL);
+}
+
 /* We had a bug when running test/nvmf/host/multipath.sh. The bug was the following.
  *
  * A nvme_ctrlr had trid1 and trid2 first. trid1 was active. A connection to trid1 was
@@ -8385,6 +8479,7 @@ main(int argc, char **argv)
 	CU_ADD_TEST(suite, test_reset_ctrlr);
 	CU_ADD_TEST(suite, test_race_between_reset_and_destruct_ctrlr);
 	CU_ADD_TEST(suite, test_failover_ctrlr);
+	CU_ADD_TEST(suite, test_failover_not_redriven_while_resetting);
 	CU_ADD_TEST(suite, test_race_between_failover_and_add_secondary_trid);
 	CU_ADD_TEST(suite, test_pending_reset);
 	CU_ADD_TEST(suite, test_attach_ctrlr);