From 6848b54373e88d0324f314c3e7e318f5a3a6d5da Mon Sep 17 00:00:00 2001
From: Maksim Dimitrov <dimitrov.maksim@gmail.com>
Date: Tue, 14 Apr 2026 17:58:09 +0300
Subject: [PATCH] core, graph, store: Fix unfail retry when deployment head
 behind error block

When a subgraph encounters a non-deterministic error and restarts from a
checkpoint before the error block, the unfail mechanism would incorrectly
stop retrying after the first attempt returned Noop.

This happened because `should_try_unfail_non_deterministic` was
unconditionally set to `false` after the first unfail attempt, even when
the deployment head hadn't reached the error block yet.

Fix: Add `UnfailOutcome::BehindErrorBlock` variant to distinguish
"deployment head behind error block" (keep retrying) from other Noop
cases (stop retrying).

Fixes #6205
---
 core/src/subgraph/runner/mod.rs             | 12 ++--
 graph/src/components/store/mod.rs           |  4 ++
 store/postgres/src/deployment_store.rs      |  5 +-
 store/test-store/tests/postgres/subgraph.rs |  4 +-
 tests/tests/runner_tests.rs                 | 74 +++++++++++++++++++++
 5 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/core/src/subgraph/runner/mod.rs b/core/src/subgraph/runner/mod.rs
index 5f6170556ca..19643c818ff 100644
--- a/core/src/subgraph/runner/mod.rs
+++ b/core/src/subgraph/runner/mod.rs
@@ -1303,12 +1303,14 @@ where
                         .unfail_non_deterministic_error(&block_ptr)
                         .await?;
 
-                    // Stop trying to unfail.
-                    self.state.should_try_unfail_non_deterministic = false;
+                    // Stop trying unless we're still behind the error block.
+                    if outcome != UnfailOutcome::BehindErrorBlock {
+                        self.state.should_try_unfail_non_deterministic = false;
 
-                    if let UnfailOutcome::Unfailed = outcome {
-                        self.metrics.subgraph.deployment_status.running();
-                        self.state.backoff.reset();
+                        if outcome == UnfailOutcome::Unfailed {
+                            self.metrics.subgraph.deployment_status.running();
+                            self.state.backoff.reset();
+                        }
                     }
                 }
 
diff --git a/graph/src/components/store/mod.rs b/graph/src/components/store/mod.rs
index d9a4e120dff..a3a85b51cc2 100644
--- a/graph/src/components/store/mod.rs
+++ b/graph/src/components/store/mod.rs
@@ -668,8 +668,12 @@ pub enum EntityOperation {
 
 #[derive(Debug, PartialEq)]
 pub enum UnfailOutcome {
+    /// Nothing to do - no error exists, or error is of wrong type (e.g., deterministic).
     Noop,
+    /// Successfully unfailed the subgraph.
     Unfailed,
+    /// The deployment head is still behind the error block, retry on subsequent blocks.
+    BehindErrorBlock,
 }
 
 #[derive(Clone, PartialEq, Eq, Debug)]
diff --git a/store/postgres/src/deployment_store.rs b/store/postgres/src/deployment_store.rs
index 22cff2d4f34..df4e34c448a 100644
--- a/store/postgres/src/deployment_store.rs
+++ b/store/postgres/src/deployment_store.rs
@@ -1872,7 +1872,8 @@ impl DeploymentStore {
 
                         Ok(UnfailOutcome::Unfailed)
                     }
-                // NOOP, the deployment head is still before where non-deterministic error happened.
+                // The deployment head is still before where non-deterministic error happened.
+                // Return BehindErrorBlock so the caller knows to retry on subsequent blocks.
                 block_range => {
                     info!(
                         self.logger,
@@ -1884,7 +1885,7 @@ impl DeploymentStore {
                         "error_block_hash" => subgraph_error.block_hash.as_ref().map(|hash| format!("0x{}", hex::encode(hash))),
                     );
 
-                    Ok(UnfailOutcome::Noop)
+                    Ok(UnfailOutcome::BehindErrorBlock)
                 }
             }
         }.scope_boxed()).await
diff --git a/store/test-store/tests/postgres/subgraph.rs b/store/test-store/tests/postgres/subgraph.rs
index e4679753e1e..5dd3c273160 100644
--- a/store/test-store/tests/postgres/subgraph.rs
+++ b/store/test-store/tests/postgres/subgraph.rs
@@ -1215,14 +1215,14 @@ fn fail_unfail_non_deterministic_error_noop() {
         // Fail the subgraph with a non-deterministic error, but with an advanced block.
         writable.fail_subgraph(error).await.unwrap();
 
-        // Since the block range of the block won't match the deployment head, this will be NOOP.
+        // Since the deployment head is behind the error block, this returns BehindErrorBlock.
         let outcome = writable
             .unfail_non_deterministic_error(&BLOCKS[1])
             .await
             .unwrap();
 
         // State continues the same besides a new error added to the database.
-        assert_eq!(outcome, UnfailOutcome::Noop);
+        assert_eq!(outcome, UnfailOutcome::BehindErrorBlock);
         assert_eq!(count().await, 2);
         let vi = get_version_info(&store, NAME).await;
         assert_eq!(NAME, vi.deployment_id.as_str());
diff --git a/tests/tests/runner_tests.rs b/tests/tests/runner_tests.rs
index 743f5714df3..1d44657c441 100644
--- a/tests/tests/runner_tests.rs
+++ b/tests/tests/runner_tests.rs
@@ -1418,3 +1418,77 @@ async fn skip_duplicates() {
         })
     );
 }
+
+/// Test that the unfail mechanism retries until the deployment head reaches
+/// the error block. This is a regression test for issue #6205.
+///
+/// Scenario:
+/// 1. Sync to block 1
+/// 2. Inject non-deterministic error at block 3 (ahead of head)
+/// 3. Run runner to block 3
+/// 4. At blocks 1-2: unfail returns BehindErrorBlock, keeps trying
+/// 5. At block 3: unfail succeeds, health → Healthy
+#[graph::test]
+async fn non_deterministic_unfail_retries_until_error_block() -> anyhow::Result<()> {
+    let RunnerTestRecipe { stores, test_info } =
+        RunnerTestRecipe::new("non_deterministic_unfail_retries", "end-block").await;
+
+    let blocks = {
+        let block_0 = genesis();
+        let mut block_1 = empty_block(block_0.ptr(), test_ptr(1));
+        let mut block_2 = empty_block(block_1.ptr(), test_ptr(2));
+        let mut block_3 = empty_block(block_2.ptr(), test_ptr(3));
+
+        // Add triggers to exercise normal block processing.
+        push_test_log(&mut block_1, "a");
+        push_test_log(&mut block_2, "b");
+        push_test_log(&mut block_3, "c");
+
+        vec![block_0, block_1, block_2, block_3]
+    };
+
+    let chain = chain(&test_info.test_name, blocks, &stores, None).await;
+    let ctx = fixture::setup(&test_info, &stores, &chain, None, None).await;
+
+    // Advance head to block 1.
+    ctx.start_and_sync_to(test_ptr(1)).await;
+
+    // Inject non-deterministic fatal error at block 3 (ahead of current head).
+    let writable = ctx
+        .store
+        .clone()
+        .writable(ctx.logger.clone(), ctx.deployment.id, Arc::new(vec![]))
+        .await
+        .unwrap();
+
+    writable
+        .fail_subgraph(SubgraphError {
+            subgraph_id: ctx.deployment.hash.clone(),
+            message: "injected transient error".to_string(),
+            block_ptr: Some(test_ptr(3)),
+            handler: None,
+            deterministic: false,
+        })
+        .await
+        .unwrap();
+
+    writable.flush().await.unwrap();
+
+    // Precondition: deployment is failed with non-deterministic error.
+    let status = ctx.indexing_status().await;
+    assert!(status.health == SubgraphHealth::Failed);
+    assert!(!status.fatal_error.as_ref().unwrap().deterministic);
+
+    // Run runner to block 3. The unfail mechanism should:
+    // - Return BehindErrorBlock at blocks 1-2 (keep trying)
+    // - Return Unfailed at block 3 (success)
+    let stop_block = test_ptr(3);
+    let _runner = ctx.runner(stop_block).await.run_for_test(false).await?;
+
+    // Postcondition: deployment is healthy, no fatal error.
+    let status = ctx.indexing_status().await;
+    assert!(status.health == SubgraphHealth::Healthy);
+    assert!(status.fatal_error.is_none());
+
+    Ok(())
+}