From 6848b54373e88d0324f314c3e7e318f5a3a6d5da Mon Sep 17 00:00:00 2001 From: Maksim Dimitrov Date: Tue, 14 Apr 2026 17:58:09 +0300 Subject: [PATCH] core, graph, store: Fix unfail retry when deployment head behind error block When a subgraph encounters a non-deterministic error and restarts from a checkpoint before the error block, the unfail mechanism would incorrectly stop retrying after the first attempt returned Noop. This happened because `should_try_unfail_non_deterministic` was unconditionally set to `false` after the first unfail attempt, even when the deployment head hadn't reached the error block yet. Fix: Add `UnfailOutcome::BehindErrorBlock` variant to distinguish "deployment head behind error block" (keep retrying) from other Noop cases (stop retrying). Fixes #6205 --- core/src/subgraph/runner/mod.rs | 12 ++-- graph/src/components/store/mod.rs | 4 ++ store/postgres/src/deployment_store.rs | 5 +- store/test-store/tests/postgres/subgraph.rs | 4 +- tests/tests/runner_tests.rs | 74 +++++++++++++++++++++ 5 files changed, 90 insertions(+), 9 deletions(-) diff --git a/core/src/subgraph/runner/mod.rs b/core/src/subgraph/runner/mod.rs index 5f6170556ca..19643c818ff 100644 --- a/core/src/subgraph/runner/mod.rs +++ b/core/src/subgraph/runner/mod.rs @@ -1303,12 +1303,14 @@ where .unfail_non_deterministic_error(&block_ptr) .await?; - // Stop trying to unfail. - self.state.should_try_unfail_non_deterministic = false; + // Stop trying unless we're still behind the error block. + if outcome != UnfailOutcome::BehindErrorBlock { + self.state.should_try_unfail_non_deterministic = false; - if let UnfailOutcome::Unfailed = outcome { - self.metrics.subgraph.deployment_status.running(); - self.state.backoff.reset(); + if outcome == UnfailOutcome::Unfailed { + self.metrics.subgraph.deployment_status.running(); + self.state.backoff.reset(); + } } } diff --git a/graph/src/components/store/mod.rs b/graph/src/components/store/mod.rs index d9a4e120dff..a3a85b51cc2 100644 --- a/graph/src/components/store/mod.rs +++ b/graph/src/components/store/mod.rs @@ -668,8 +668,12 @@ pub enum EntityOperation { #[derive(Debug, PartialEq)] pub enum UnfailOutcome { + /// Nothing to do - no error exists, or error is of wrong type (e.g., deterministic). Noop, + /// Successfully unfailed the subgraph. Unfailed, + /// The deployment head is still behind the error block, retry on subsequent blocks. + BehindErrorBlock, } #[derive(Clone, PartialEq, Eq, Debug)] diff --git a/store/postgres/src/deployment_store.rs b/store/postgres/src/deployment_store.rs index 22cff2d4f34..df4e34c448a 100644 --- a/store/postgres/src/deployment_store.rs +++ b/store/postgres/src/deployment_store.rs @@ -1872,7 +1872,8 @@ impl DeploymentStore { Ok(UnfailOutcome::Unfailed) } - // NOOP, the deployment head is still before where non-deterministic error happened. + // The deployment head is still before where non-deterministic error happened. + // Return BehindErrorBlock so the caller knows to retry on subsequent blocks. block_range => { info!( self.logger, @@ -1884,7 +1885,7 @@ impl DeploymentStore { "error_block_hash" => subgraph_error.block_hash.as_ref().map(|hash| format!("0x{}", hex::encode(hash))), ); - Ok(UnfailOutcome::Noop) + Ok(UnfailOutcome::BehindErrorBlock) } } }.scope_boxed()).await diff --git a/store/test-store/tests/postgres/subgraph.rs b/store/test-store/tests/postgres/subgraph.rs index e4679753e1e..5dd3c273160 100644 --- a/store/test-store/tests/postgres/subgraph.rs +++ b/store/test-store/tests/postgres/subgraph.rs @@ -1215,14 +1215,14 @@ fn fail_unfail_non_deterministic_error_noop() { // Fail the subgraph with a non-deterministic error, but with an advanced block. writable.fail_subgraph(error).await.unwrap(); - // Since the block range of the block won't match the deployment head, this will be NOOP. + // Since the deployment head is behind the error block, this returns BehindErrorBlock. let outcome = writable .unfail_non_deterministic_error(&BLOCKS[1]) .await .unwrap(); // State continues the same besides a new error added to the database. - assert_eq!(outcome, UnfailOutcome::Noop); + assert_eq!(outcome, UnfailOutcome::BehindErrorBlock); assert_eq!(count().await, 2); let vi = get_version_info(&store, NAME).await; assert_eq!(NAME, vi.deployment_id.as_str()); diff --git a/tests/tests/runner_tests.rs b/tests/tests/runner_tests.rs index 743f5714df3..1d44657c441 100644 --- a/tests/tests/runner_tests.rs +++ b/tests/tests/runner_tests.rs @@ -1418,3 +1418,77 @@ async fn skip_duplicates() { }) ); } + +/// Test that the unfail mechanism retries until the deployment head reaches +/// the error block. This is a regression test for issue #6205. +/// +/// Scenario: +/// 1. Sync to block 1 +/// 2. Inject non-deterministic error at block 3 (ahead of head) +/// 3. Run runner to block 3 +/// 4. At blocks 1-2: unfail returns BehindErrorBlock, keeps trying +/// 5. At block 3: unfail succeeds, health → Healthy +#[graph::test] +async fn non_deterministic_unfail_retries_until_error_block() -> anyhow::Result<()> { + let RunnerTestRecipe { stores, test_info } = + RunnerTestRecipe::new("non_deterministic_unfail_retries", "end-block").await; + + let blocks = { + let block_0 = genesis(); + let mut block_1 = empty_block(block_0.ptr(), test_ptr(1)); + let mut block_2 = empty_block(block_1.ptr(), test_ptr(2)); + let mut block_3 = empty_block(block_2.ptr(), test_ptr(3)); + + // Add triggers to exercise normal block processing. + push_test_log(&mut block_1, "a"); + push_test_log(&mut block_2, "b"); + push_test_log(&mut block_3, "c"); + + vec![block_0, block_1, block_2, block_3] + }; + + let chain = chain(&test_info.test_name, blocks, &stores, None).await; + let ctx = fixture::setup(&test_info, &stores, &chain, None, None).await; + + // Advance head to block 1. + ctx.start_and_sync_to(test_ptr(1)).await; + + // Inject non-deterministic fatal error at block 3 (ahead of current head). + let writable = ctx + .store + .clone() + .writable(ctx.logger.clone(), ctx.deployment.id, Arc::new(vec![])) + .await + .unwrap(); + + writable + .fail_subgraph(SubgraphError { + subgraph_id: ctx.deployment.hash.clone(), + message: "injected transient error".to_string(), + block_ptr: Some(test_ptr(3)), + handler: None, + deterministic: false, + }) + .await + .unwrap(); + + writable.flush().await.unwrap(); + + // Precondition: deployment is failed with non-deterministic error. + let status = ctx.indexing_status().await; + assert!(status.health == SubgraphHealth::Failed); + assert!(!status.fatal_error.as_ref().unwrap().deterministic); + + // Run runner to block 3. The unfail mechanism should: + // - Return BehindErrorBlock at blocks 1-2 (keep trying) + // - Return Unfailed at block 3 (success) + let stop_block = test_ptr(3); + let _runner = ctx.runner(stop_block).await.run_for_test(false).await?; + + // Postcondition: deployment is healthy, no fatal error. + let status = ctx.indexing_status().await; + assert!(status.health == SubgraphHealth::Healthy); + assert!(status.fatal_error.is_none()); + + Ok(()) +}