From f4811a9337a0955dc9461eb930e8662da8e4842d Mon Sep 17 00:00:00 2001 From: William Fawcett Date: Tue, 28 Apr 2026 13:32:34 +0100 Subject: [PATCH 1/3] fix(helpers): correct LLM-output parsing in factuality helpers Four related bugs in the prompt-and-parse layer surfaced by the post-v0.0.1 deep review: 1. extract_tag used a greedy `(.*)` which captured across multiple ``` fences, returning a string spanning intervening prose. Switch to non-greedy `(.*?)` so it stops at the first closing fence. 2. _assess_factuality_issue cleaned up the answer by string-replacing `code_recommendations` after potentially unwrapping its inner ```json fence, which left the wrapping tags (and the fence) behind in the answer body. Refactor to splice the full ... block out of the answer first, then unwrap the inner fence for the structured output field. 3. _assess_factuality_issue raised NoTagFoundError when the LLM omitted the tag, even though the prompt explicitly says recommendations are optional. Catch that case and return an empty `code_recommendations` field. 4. Both prompts had typos ("asessment", "udpate", "attemps", "You task") and the assess prompt's example JSON used single quotes plus comma-less object separators, so the model was being shown invalid JSON as the schema. Fix typos and rewrite the example to be valid double-quoted JSON. Also fix the extract prompt's implicit string concatenation and missing array commas. Tests updated: - TestAssessFactualityIssue: assertions now check that the entire block (including tags) is removed from `answer`. Added cases for the no-recs path and the malformed inner-fence path. - TestExtractTag: added a regression test for the greedy-match bug using a string with two ```json fences. Coverage stays at 100%. --- src/eve_mcp/server/helpers.py | 114 +++++++++++++++++------------- tests/test_server/test_helpers.py | 102 +++++++++++++++++++------- 2 files changed, 141 insertions(+), 75 deletions(-) diff --git a/src/eve_mcp/server/helpers.py b/src/eve_mcp/server/helpers.py index 018bc95..252ca7e 100644 --- a/src/eve_mcp/server/helpers.py +++ b/src/eve_mcp/server/helpers.py @@ -17,6 +17,8 @@ logger = logging.getLogger(__name__) +JSON_FENCE_MARKER = "```json" + class NoTagFoundError(Exception): """Exception to raise when no tag is found.""" @@ -33,12 +35,14 @@ def extract_xml_tag(text, tag): def extract_tag(text, tag): - """Extract tag from text.""" - pattern = rf"```{tag}(.*)```" + """Extract the first ```{tag} ... ``` markdown fence from text. + + Uses a non-greedy match so that subsequent fences in the same text + are not consumed. + """ + pattern = rf"```{tag}(.*?)```" if match := re.search(pattern, text, flags=re.DOTALL): - # extract the graph definition - content = match.group(1) - return content + return match.group(1) raise NoTagFoundError(f"no {tag} found in genai response") @@ -206,34 +210,31 @@ async def _extract_factuality_issues(question: str, python_code: str) -> str: {question} - Your task is to analyze the Google Earth Engine Python code below and extract what aspects - or issues are making scientific or data assumptions either explicitly or implicitly and might require - factual verification. + Your task is to analyze the Google Earth Engine Python code below and extract aspects + or issues that are making scientific or data assumptions, either explicitly or implicitly, + and that might require factual verification. {python_code} - Your response must be a list of json structures, each one describing a specific aspect you identify - and containing the following fields: - [ - {{ - "title": "A short title describing the aspect or assumption", - "description": "A detailed description of the aspect or assumption, " - "why it might require factual verification", - "facts": "Data, information, constants or facts to be verified", - "question_for_expert": "The question that should be posed to an expert " - "to verify the aspect or assumption" - }} - {{ ... more issues ...}} - ] + Wrap your response in a ```json fenced code block containing a JSON array. Each + array element describes one aspect, with these fields: + ```json + [ + {{ + "title": "A short title describing the aspect or assumption", + "description": "Why this aspect might require factual verification", + "facts": "Data, information, constants or facts to be verified", + "question_for_expert": "The question to pose to an expert" + }} + ] + ``` """ - r = await _query_eve(prompt) - r = json.loads(r) - issues = extract_tag(r["answer"], "json") - r.update({"issues": issues}) - return json.dumps(r) + payload = json.loads(await _query_eve(prompt)) + payload["issues"] = extract_tag(payload["answer"], "json") + return json.dumps(payload) async def _assess_factuality_issue( # pylint: disable=too-many-positional-arguments @@ -245,7 +246,7 @@ async def _assess_factuality_issue( # pylint: disable=too-many-positional-argum issue_question_for_expert: str, ) -> str: prompt = f""" - I am trying solve the following Earth Observation question + I am trying to solve the following Earth Observation question {question} @@ -271,37 +272,54 @@ async def _assess_factuality_issue( # pylint: disable=too-many-positional-argum {issue_facts} - You task, as an Earth Observation expert, is to answer the following question + Your task, as an Earth Observation expert, is to answer the following question {issue_question_for_expert} - Express your asessment as free Markdown text. + Express your assessment as free Markdown text. - If you have recommendations to fix or update the code, add a json string within - an xml tag with the following structure + If you have recommendations to fix or update the code, add a JSON object + inside an xml tag with the following structure + (use double-quoted JSON, not single quotes): {{ - 'recommendation_1_title': {{ - 'explanation': an explanation of the recommendation, - 'code_snippet': a string with python code with the suggested code udpate + "recommendation_1_title": {{ + "explanation": "an explanation of the recommendation", + "code_snippet": "a string with python code with the suggested update" }}, - 'recommendation_2_title': {{ - 'explanation': an explanation of the recommendation, - 'code_snippet': a string with python code with the suggested code udpate + "recommendation_2_title": {{ + "explanation": "an explanation of the recommendation", + "code_snippet": "a string with python code with the suggested update" }} }} + + If you have no recommendations, omit the tag entirely. """ - r = await _query_eve(prompt) - r = json.loads(r) - code_recommendations = extract_xml_tag(r["answer"], "CODE_RECOMMENDATIONS") - if ( - "```json" # pylint: disable=magic-value-comparison - in code_recommendations - ): - code_recommendations = extract_tag(code_recommendations, "json") - r["answer"] = r["answer"].replace(code_recommendations, "") - r.update({"code_recommendations": code_recommendations}) - return json.dumps(r) + payload = json.loads(await _query_eve(prompt)) + answer = payload["answer"] + + code_recommendations = "" + try: + inner = extract_xml_tag(answer, "CODE_RECOMMENDATIONS") + except NoTagFoundError: + # Recommendations are optional per the prompt. + pass + else: + block = f"{inner}" + answer = answer.replace(block, "") + code_recommendations = inner + if JSON_FENCE_MARKER in code_recommendations: + try: + code_recommendations = extract_tag( + code_recommendations, "json" + ) + except NoTagFoundError: + # Leave as-is if the inner fence is malformed. + pass + + payload["answer"] = answer + payload["code_recommendations"] = code_recommendations + return json.dumps(payload) diff --git a/tests/test_server/test_helpers.py b/tests/test_server/test_helpers.py index 47cb671..5116497 100644 --- a/tests/test_server/test_helpers.py +++ b/tests/test_server/test_helpers.py @@ -54,6 +54,17 @@ def test_raises_when_no_fence(helpers_mod): with pytest.raises(helpers_mod.NoTagFoundError): helpers_mod.extract_tag("plain text", "json") + @staticmethod + def test_does_not_span_multiple_fences(helpers_mod): + """Non-greedy match stops at the first closing fence.""" + text = ( + 'first ```json {"x": 1} ``` middle prose ' + '```json {"y": 2} ``` last' + ) + captured = helpers_mod.extract_tag(text, "json") + assert '"x": 1' in captured + assert '"y": 2' not in captured + class TestGetEveClient: """Test the get_eve_client lazy singleton.""" @@ -260,59 +271,96 @@ async def test_extracts_json_block_from_answer(helpers_mod): class TestAssessFactualityIssue: """Test the _assess_factuality_issue helper.""" + @staticmethod + async def _run_assess(helpers_mod, answer): + """Drive _assess_factuality_issue with a stubbed _query_eve.""" + query_payload = json.dumps( + {"answer": answer, "sources": [], "conversation_id": "c"} + ) + with patch( + f"{PKG}.helpers._query_eve", + new=AsyncMock(return_value=query_payload), + ): + raw = await helpers_mod._assess_factuality_issue( + "Q?", "code", "t", "d", "f", "eq" + ) + return json.loads(raw) + @pytest.mark.asyncio @staticmethod - async def test_strips_recommendations_block_from_answer( + async def test_strips_full_recommendations_block_from_answer( helpers_mod, ): - """Removes from answer; surfaces it - separately.""" + """Removes the entire ... block, + tags included, from the answer body.""" recs = '{"r1": {"explanation": "e", "code_snippet": "x = 1"}}' answer = ( "Markdown body. " f"{recs}" " trailer" ) - query_payload = json.dumps( - {"answer": answer, "sources": [], "conversation_id": "c"} + result = await TestAssessFactualityIssue._run_assess( + helpers_mod, answer ) - with patch( - f"{PKG}.helpers._query_eve", - new=AsyncMock(return_value=query_payload), - ): - raw = await helpers_mod._assess_factuality_issue( # pylint: disable=protected-access - "Q?", "code", "t", "d", "f", "eq" - ) - result = json.loads(raw) - assert "" in result["answer"] - # The recs payload itself is removed from the answer body. + assert "" not in result["answer"] + assert "" not in result["answer"] assert recs not in result["answer"] + assert "Markdown body." in result["answer"] + assert "trailer" in result["answer"] assert result["code_recommendations"] == recs @pytest.mark.asyncio @staticmethod - async def test_unwraps_inner_json_fence_in_recommendations( + async def test_unwraps_inner_json_fence_and_cleans_answer( helpers_mod, ): - """If wraps a ```json fence, it is unwrapped.""" + """If wraps a ```json fence, the inner + JSON is unwrapped AND the wrapping tags + fence are removed + from the answer.""" inner = '{"r1": {"explanation": "e", "code_snippet": "x = 1"}}' recs_with_fence = f"```json {inner} ```" answer = ( "Body. " f"{recs_with_fence}" ) - query_payload = json.dumps( - {"answer": answer, "sources": [], "conversation_id": "c"} + result = await TestAssessFactualityIssue._run_assess( + helpers_mod, answer ) - with patch( - f"{PKG}.helpers._query_eve", - new=AsyncMock(return_value=query_payload), - ): - raw = await helpers_mod._assess_factuality_issue( # pylint: disable=protected-access - "Q?", "code", "t", "d", "f", "eq" - ) - result = json.loads(raw) assert inner in result["code_recommendations"] + assert "" not in result["answer"] + assert "```json" not in result["answer"] + assert "Body." in result["answer"] + + @pytest.mark.asyncio + @staticmethod + async def test_returns_empty_recommendations_when_tag_absent( + helpers_mod, + ): + """If the LLM omits , the tool returns + an empty recommendations field rather than raising.""" + answer = "Body of the assessment with no recommendations section." + result = await TestAssessFactualityIssue._run_assess( + helpers_mod, answer + ) + assert result["code_recommendations"] == "" + assert result["answer"] == answer + + @pytest.mark.asyncio + @staticmethod + async def test_keeps_malformed_inner_fence_as_is( + helpers_mod, + ): + """If the inner fence claims to be json but doesn't close, + leave the recommendations as the raw block content.""" + broken = '```json {"r1": 1}' # missing closing fence + answer = ( + "Body. " f"{broken}" + ) + result = await TestAssessFactualityIssue._run_assess( + helpers_mod, answer + ) + assert "```json" in result["code_recommendations"] + assert "" not in result["answer"] def test_helpers_module_importable(): From 49a1809db743c9ee8fd782804e5188ad5ec692f1 Mon Sep 17 00:00:00 2001 From: William Fawcett Date: Fri, 1 May 2026 15:26:11 +0100 Subject: [PATCH 2/3] Add extra assert statements --- tests/test_server/test_helpers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_server/test_helpers.py b/tests/test_server/test_helpers.py index 5116497..0d3d788 100644 --- a/tests/test_server/test_helpers.py +++ b/tests/test_server/test_helpers.py @@ -62,7 +62,9 @@ def test_does_not_span_multiple_fences(helpers_mod): '```json {"y": 2} ``` last' ) captured = helpers_mod.extract_tag(text, "json") + assert "first" not in captured assert '"x": 1' in captured + assert "middle prose" not in captured assert '"y": 2' not in captured From ee1516ef3291cd8de27fd7f15e5dae54818d0fe3 Mon Sep 17 00:00:00 2001 From: William Fawcett Date: Fri, 1 May 2026 15:30:31 +0100 Subject: [PATCH 3/3] bump submodule version --- lib/eve-api | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/eve-api b/lib/eve-api index 1f8de11..57265a8 160000 --- a/lib/eve-api +++ b/lib/eve-api @@ -1 +1 @@ -Subproject commit 1f8de11795beff29597416a1ce2dde2a7ed66fa5 +Subproject commit 57265a8ebbe6cb4c1f826a33f84ad5487bbc427c