From f4811a9337a0955dc9461eb930e8662da8e4842d Mon Sep 17 00:00:00 2001
From: William Fawcett <willfaw@gmail.com>
Date: Tue, 28 Apr 2026 13:32:34 +0100
Subject: [PATCH 1/3] fix(helpers): correct LLM-output parsing in factuality
 helpers

Four related bugs in the prompt-and-parse layer surfaced by the
post-v0.0.1 deep review:

1. extract_tag used a greedy `(.*)` which captured across multiple
   ``` fences, returning a string spanning intervening prose. Switch
   to non-greedy `(.*?)` so it stops at the first closing fence.

2. _assess_factuality_issue cleaned up the answer by string-replacing
   `code_recommendations` after potentially unwrapping its inner
   ```json fence, which left the wrapping <CODE_RECOMMENDATIONS> tags
   (and the fence) behind in the answer body. Refactor to splice the
   full <CODE_RECOMMENDATIONS>...</CODE_RECOMMENDATIONS> block out of
   the answer first, then unwrap the inner fence for the structured
   output field.

3. _assess_factuality_issue raised NoTagFoundError when the LLM
   omitted the <CODE_RECOMMENDATIONS> tag, even though the prompt
   explicitly says recommendations are optional. Catch that case and
   return an empty `code_recommendations` field.

4. Both prompts had typos ("asessment", "udpate", "attemps", "You
   task") and the assess prompt's example JSON used single quotes
   plus comma-less object separators, so the model was being shown
   invalid JSON as the schema. Fix typos and rewrite the example to
   be valid double-quoted JSON. Also fix the extract prompt's
   implicit string concatenation and missing array commas.

Tests updated:
- TestAssessFactualityIssue: assertions now check that the entire
  <CODE_RECOMMENDATIONS> block (including tags) is removed from
  `answer`. Added cases for the no-recs path and the malformed
  inner-fence path.
- TestExtractTag: added a regression test for the greedy-match bug
  using a string with two ```json fences.

Coverage stays at 100%.
---
 src/eve_mcp/server/helpers.py     | 114 +++++++++++++++++-------------
 tests/test_server/test_helpers.py | 102 +++++++++++++++++++-------
 2 files changed, 141 insertions(+), 75 deletions(-)
diff --git a/src/eve_mcp/server/helpers.py b/src/eve_mcp/server/helpers.py
index 018bc95..252ca7e 100644
--- a/src/eve_mcp/server/helpers.py
+++ b/src/eve_mcp/server/helpers.py
@@ -17,6 +17,8 @@
 
 logger = logging.getLogger(__name__)
 
+JSON_FENCE_MARKER = "```json"
+
 
 class NoTagFoundError(Exception):
     """Exception to raise when no tag is found."""
@@ -33,12 +35,14 @@ def extract_xml_tag(text, tag):
 
 
 def extract_tag(text, tag):
-    """Extract tag from text."""
-    pattern = rf"```{tag}(.*)```"
+    """Extract the first ```{tag} ... ``` markdown fence from text.
+
+    Uses a non-greedy match so that subsequent fences in the same text
+    are not consumed.
+    """
+    pattern = rf"```{tag}(.*?)```"
     if match := re.search(pattern, text, flags=re.DOTALL):
-        # extract the graph definition
-        content = match.group(1)
-        return content
+        return match.group(1)
 
     raise NoTagFoundError(f"no {tag} found in genai response")
 
@@ -206,34 +210,31 @@ async def _extract_factuality_issues(question: str, python_code: str) -> str:
     {question}
     </QUESTION>
 
-    Your task is to analyze the Google Earth Engine Python code below and extract what aspects 
-    or issues are making scientific or data assumptions either explicitly or implicitly and might require 
-    factual verification.
+    Your task is to analyze the Google Earth Engine Python code below and extract aspects
+    or issues that are making scientific or data assumptions, either explicitly or implicitly,
+    and that might require factual verification.
 
     <PYTHON_CODE>
     {python_code}
     </PYTHON_CODE>
 
-    Your response must be a list of json structures, each one describing a specific aspect you identify
-    and containing the following fields:
-    [ 
-    {{
-        "title": "A short title describing the aspect or assumption",
-        "description": "A detailed description of the aspect or assumption, "
-            "why it might require factual verification",
-        "facts": "Data, information, constants or facts to be verified",
-        "question_for_expert": "The question that should be posed to an expert "
-            "to verify the aspect or assumption"
-    }}
-    {{ ... more issues ...}}
-    ]
+    Wrap your response in a ```json fenced code block containing a JSON array. Each
+    array element describes one aspect, with these fields:
 
+    ```json
+    [
+        {{
+            "title": "A short title describing the aspect or assumption",
+            "description": "Why this aspect might require factual verification",
+            "facts": "Data, information, constants or facts to be verified",
+            "question_for_expert": "The question to pose to an expert"
+        }}
+    ]
+    ```
     """
-    r = await _query_eve(prompt)
-    r = json.loads(r)
-    issues = extract_tag(r["answer"], "json")
-    r.update({"issues": issues})
-    return json.dumps(r)
+    payload = json.loads(await _query_eve(prompt))
+    payload["issues"] = extract_tag(payload["answer"], "json")
+    return json.dumps(payload)
 
 
 async def _assess_factuality_issue(  # pylint: disable=too-many-positional-arguments
@@ -245,7 +246,7 @@ async def _assess_factuality_issue(  # pylint: disable=too-many-positional-argum
     issue_question_for_expert: str,
 ) -> str:
     prompt = f"""
-    I am trying solve the following Earth Observation question
+    I am trying to solve the following Earth Observation question
 
     <EO_QUESTION>
     {question}
@@ -271,37 +272,54 @@ async def _assess_factuality_issue(  # pylint: disable=too-many-positional-argum
     {issue_facts}
     </ISSUE_FACTS>
 
-    You task, as an Earth Observation expert, is to answer the following question
+    Your task, as an Earth Observation expert, is to answer the following question
 
     {issue_question_for_expert}
 
-    Express your asessment as free Markdown text. 
+    Express your assessment as free Markdown text.
 
-    If you have recommendations to fix or update the code, add a json string within
-    an xml tag <CODE_RECOMMENDATIONS> with the following structure
+    If you have recommendations to fix or update the code, add a JSON object
+    inside an xml tag <CODE_RECOMMENDATIONS> with the following structure
+    (use double-quoted JSON, not single quotes):
 
     <CODE_RECOMMENDATIONS>
     {{
-        'recommendation_1_title': {{
-            'explanation': an explanation of the recommendation,
-            'code_snippet': a string with python code with the suggested code udpate
+        "recommendation_1_title": {{
+            "explanation": "an explanation of the recommendation",
+            "code_snippet": "a string with python code with the suggested update"
         }},
-        'recommendation_2_title': {{
-            'explanation': an explanation of the recommendation,
-            'code_snippet': a string with python code with the suggested code udpate
+        "recommendation_2_title": {{
+            "explanation": "an explanation of the recommendation",
+            "code_snippet": "a string with python code with the suggested update"
         }}
     }}
     </CODE_RECOMMENDATIONS>
+
+    If you have no recommendations, omit the <CODE_RECOMMENDATIONS> tag entirely.
     """
 
-    r = await _query_eve(prompt)
-    r = json.loads(r)
-    code_recommendations = extract_xml_tag(r["answer"], "CODE_RECOMMENDATIONS")
-    if (
-        "```json"  # pylint: disable=magic-value-comparison
-        in code_recommendations
-    ):
-        code_recommendations = extract_tag(code_recommendations, "json")
-    r["answer"] = r["answer"].replace(code_recommendations, "")
-    r.update({"code_recommendations": code_recommendations})
-    return json.dumps(r)
+    payload = json.loads(await _query_eve(prompt))
+    answer = payload["answer"]
+
+    code_recommendations = ""
+    try:
+        inner = extract_xml_tag(answer, "CODE_RECOMMENDATIONS")
+    except NoTagFoundError:
+        # Recommendations are optional per the prompt.
+        pass
+    else:
+        block = f"<CODE_RECOMMENDATIONS>{inner}</CODE_RECOMMENDATIONS>"
+        answer = answer.replace(block, "")
+        code_recommendations = inner
+        if JSON_FENCE_MARKER in code_recommendations:
+            try:
+                code_recommendations = extract_tag(
+                    code_recommendations, "json"
+                )
+            except NoTagFoundError:
+                # Leave as-is if the inner fence is malformed.
+                pass
+
+    payload["answer"] = answer
+    payload["code_recommendations"] = code_recommendations
+    return json.dumps(payload)
diff --git a/tests/test_server/test_helpers.py b/tests/test_server/test_helpers.py
index 47cb671..5116497 100644
--- a/tests/test_server/test_helpers.py
+++ b/tests/test_server/test_helpers.py
@@ -54,6 +54,17 @@ def test_raises_when_no_fence(helpers_mod):
         with pytest.raises(helpers_mod.NoTagFoundError):
             helpers_mod.extract_tag("plain text", "json")
 
+    @staticmethod
+    def test_does_not_span_multiple_fences(helpers_mod):
+        """Non-greedy match stops at the first closing fence."""
+        text = (
+            'first ```json {"x": 1} ``` middle prose '
+            '```json {"y": 2} ``` last'
+        )
+        captured = helpers_mod.extract_tag(text, "json")
+        assert '"x": 1' in captured
+        assert '"y": 2' not in captured
+
 
 class TestGetEveClient:
     """Test the get_eve_client lazy singleton."""
@@ -260,59 +271,96 @@ async def test_extracts_json_block_from_answer(helpers_mod):
 class TestAssessFactualityIssue:
     """Test the _assess_factuality_issue helper."""
 
+    @staticmethod
+    async def _run_assess(helpers_mod, answer):
+        """Drive _assess_factuality_issue with a stubbed _query_eve."""
+        query_payload = json.dumps(
+            {"answer": answer, "sources": [], "conversation_id": "c"}
+        )
+        with patch(
+            f"{PKG}.helpers._query_eve",
+            new=AsyncMock(return_value=query_payload),
+        ):
+            raw = await helpers_mod._assess_factuality_issue(
+                "Q?", "code", "t", "d", "f", "eq"
+            )
+        return json.loads(raw)
+
     @pytest.mark.asyncio
     @staticmethod
-    async def test_strips_recommendations_block_from_answer(
+    async def test_strips_full_recommendations_block_from_answer(
         helpers_mod,
     ):
-        """Removes <CODE_RECOMMENDATIONS> from answer; surfaces it
-        separately."""
+        """Removes the entire <CODE_RECOMMENDATIONS>...</...> block,
+        tags included, from the answer body."""
         recs = '{"r1": {"explanation": "e", "code_snippet": "x = 1"}}'
         answer = (
             "Markdown body. "
             f"<CODE_RECOMMENDATIONS>{recs}</CODE_RECOMMENDATIONS>"
             " trailer"
         )
-        query_payload = json.dumps(
-            {"answer": answer, "sources": [], "conversation_id": "c"}
+        result = await TestAssessFactualityIssue._run_assess(
+            helpers_mod, answer
         )
-        with patch(
-            f"{PKG}.helpers._query_eve",
-            new=AsyncMock(return_value=query_payload),
-        ):
-            raw = await helpers_mod._assess_factuality_issue(  # pylint: disable=protected-access
-                "Q?", "code", "t", "d", "f", "eq"
-            )
-        result = json.loads(raw)
-        assert "<CODE_RECOMMENDATIONS>" in result["answer"]
-        # The recs payload itself is removed from the answer body.
+        assert "<CODE_RECOMMENDATIONS>" not in result["answer"]
+        assert "</CODE_RECOMMENDATIONS>" not in result["answer"]
         assert recs not in result["answer"]
+        assert "Markdown body." in result["answer"]
+        assert "trailer" in result["answer"]
         assert result["code_recommendations"] == recs
 
     @pytest.mark.asyncio
     @staticmethod
-    async def test_unwraps_inner_json_fence_in_recommendations(
+    async def test_unwraps_inner_json_fence_and_cleans_answer(
         helpers_mod,
     ):
-        """If <CODE_RECOMMENDATIONS> wraps a ```json fence, it is unwrapped."""
+        """If <CODE_RECOMMENDATIONS> wraps a ```json fence, the inner
+        JSON is unwrapped AND the wrapping tags + fence are removed
+        from the answer."""
         inner = '{"r1": {"explanation": "e", "code_snippet": "x = 1"}}'
         recs_with_fence = f"```json {inner} ```"
         answer = (
             "Body. "
             f"<CODE_RECOMMENDATIONS>{recs_with_fence}</CODE_RECOMMENDATIONS>"
         )
-        query_payload = json.dumps(
-            {"answer": answer, "sources": [], "conversation_id": "c"}
+        result = await TestAssessFactualityIssue._run_assess(
+            helpers_mod, answer
         )
-        with patch(
-            f"{PKG}.helpers._query_eve",
-            new=AsyncMock(return_value=query_payload),
-        ):
-            raw = await helpers_mod._assess_factuality_issue(  # pylint: disable=protected-access
-                "Q?", "code", "t", "d", "f", "eq"
-            )
-        result = json.loads(raw)
         assert inner in result["code_recommendations"]
+        assert "<CODE_RECOMMENDATIONS>" not in result["answer"]
+        assert "```json" not in result["answer"]
+        assert "Body." in result["answer"]
+
+    @pytest.mark.asyncio
+    @staticmethod
+    async def test_returns_empty_recommendations_when_tag_absent(
+        helpers_mod,
+    ):
+        """If the LLM omits <CODE_RECOMMENDATIONS>, the tool returns
+        an empty recommendations field rather than raising."""
+        answer = "Body of the assessment with no recommendations section."
+        result = await TestAssessFactualityIssue._run_assess(
+            helpers_mod, answer
+        )
+        assert result["code_recommendations"] == ""
+        assert result["answer"] == answer
+
+    @pytest.mark.asyncio
+    @staticmethod
+    async def test_keeps_malformed_inner_fence_as_is(
+        helpers_mod,
+    ):
+        """If the inner fence claims to be json but doesn't close,
+        leave the recommendations as the raw block content."""
+        broken = '```json {"r1": 1}'  # missing closing fence
+        answer = (
+            "Body. " f"<CODE_RECOMMENDATIONS>{broken}</CODE_RECOMMENDATIONS>"
+        )
+        result = await TestAssessFactualityIssue._run_assess(
+            helpers_mod, answer
+        )
+        assert "```json" in result["code_recommendations"]
+        assert "<CODE_RECOMMENDATIONS>" not in result["answer"]
 
 
 def test_helpers_module_importable():

From 49a1809db743c9ee8fd782804e5188ad5ec692f1 Mon Sep 17 00:00:00 2001
From: William Fawcett <willfaw@gmail.com>
Date: Fri, 1 May 2026 15:26:11 +0100
Subject: [PATCH 2/3] Add extra assert statements

---
 tests/test_server/test_helpers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_server/test_helpers.py b/tests/test_server/test_helpers.py
index 5116497..0d3d788 100644
--- a/tests/test_server/test_helpers.py
+++ b/tests/test_server/test_helpers.py
@@ -62,7 +62,9 @@ def test_does_not_span_multiple_fences(helpers_mod):
             '```json {"y": 2} ``` last'
         )
         captured = helpers_mod.extract_tag(text, "json")
+        assert "first" not in captured
         assert '"x": 1' in captured
+        assert "middle prose" not in captured
         assert '"y": 2' not in captured
 
 

From ee1516ef3291cd8de27fd7f15e5dae54818d0fe3 Mon Sep 17 00:00:00 2001
From: William Fawcett <willfaw@gmail.com>
Date: Fri, 1 May 2026 15:30:31 +0100
Subject: [PATCH 3/3] bump submodule version

---
 lib/eve-api | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/eve-api b/lib/eve-api
index 1f8de11..57265a8 160000
--- a/lib/eve-api
+++ b/lib/eve-api
@@ -1 +1 @@
-Subproject commit 1f8de11795beff29597416a1ce2dde2a7ed66fa5
+Subproject commit 57265a8ebbe6cb4c1f826a33f84ad5487bbc427c