MemTensor · Memtensor-AI · Jun 29, 2026
diff --git a/src/memos/mem_reader/multi_modal_struct.py b/src/memos/mem_reader/multi_modal_struct.py
@@ -14,7 +14,7 @@
 from memos.mem_reader.read_pref_memory.process_preference_memory import process_preference_fine
 from memos.mem_reader.read_skill_memory.process_skill_memory import process_skill_memory_fine
 from memos.mem_reader.simple_struct import PROMPT_DICT, SimpleStructMemReader
-from memos.mem_reader.utils import parse_json_result
+from memos.mem_reader.utils import build_chat_extraction_messages, parse_json_result
 from memos.memories.textual.item import TextualMemoryItem, TreeNodeTextualMemoryMetadata
 from memos.plugins.hook_defs import H
 from memos.plugins.hooks import trigger_hook, trigger_single_hook
@@ -490,7 +490,12 @@ def _get_llm_response(
 
         logger.info(f"[MultiModalParser] Process String Fine Prompt: {prompt}")
 
-        messages = [{"role": "user", "content": prompt}]
+        # Split into system + user messages for chat-mode prompts so weak
+        # LLMs don't continue the embedded ``user: ...`` conversation
+        # (issue #1269). Doc / general_string templates have no
+        # ``Conversation:`` marker; the helper passes them through as a
+        # single ``user`` message — call shape preserved.
+        messages = build_chat_extraction_messages(prompt)
         try:
             response_text = self.llm.generate(messages)
             response_json = parse_json_result(response_text)

diff --git a/src/memos/mem_reader/simple_struct.py b/src/memos/mem_reader/simple_struct.py
@@ -25,6 +25,7 @@
     from memos.types.general_types import UserContext
 from memos.mem_reader.read_multi_modal import coerce_scene_data, detect_lang
 from memos.mem_reader.utils import (
+    build_chat_extraction_messages,
     count_tokens_text,
     derive_key,
     parse_json_result,
@@ -280,7 +281,12 @@ def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict
 
         if self.config.remove_prompt_example:
             prompt = prompt.replace(examples, "")
-        messages = [{"role": "user", "content": prompt}]
+        # Split into system + user messages so weak instruction-following
+        # LLMs do not interpret the embedded ``user: ...`` lines at the
+        # tail of the prompt as a chat they should continue, which would
+        # cause ``summary`` to be a chat reply instead of a real summary
+        # (issue #1269).
+        messages = build_chat_extraction_messages(prompt)
 
         response_text = self._safe_generate(messages)
         response_json = self._safe_parse(response_text)

diff --git a/src/memos/mem_reader/strategy_struct.py b/src/memos/mem_reader/strategy_struct.py
@@ -7,6 +7,7 @@
 from memos.configs.parser import ParserConfigFactory
 from memos.mem_reader.read_multi_modal import detect_lang
 from memos.mem_reader.simple_struct import SimpleStructMemReader
+from memos.mem_reader.utils import build_chat_extraction_messages
 from memos.parsers.factory import ParserFactory
 from memos.templates.mem_reader_prompts import (
     CUSTOM_TAGS_INSTRUCTION,
@@ -57,7 +58,9 @@ def _get_llm_response(self, mem_str: str, custom_tags: list[str] | None) -> dict
 
         if self.config.remove_prompt_example:  # TODO unused
             prompt = prompt.replace(examples, "")
-        messages = [{"role": "user", "content": prompt}]
+        # Split into system + user messages — see simple_struct for the
+        # rationale (issue #1269).
+        messages = build_chat_extraction_messages(prompt)
         try:
             response_text = self.llm.generate(messages)
             response_json = self.parse_json_result(response_text)

diff --git a/src/memos/mem_reader/utils.py b/src/memos/mem_reader/utils.py
@@ -35,6 +35,62 @@ def derive_key(text: str, max_len: int = 80) -> str:
     return (sent[:max_len]).strip()
 
 
+# Text constants used to split a rendered mem_reader chat-extraction prompt
+# (English + Chinese): conversation headers, output cues, JSON-only trailer.
+_CHAT_CONVO_MARKERS: tuple[str, ...] = ("\nConversation:\n", "\n对话：\n")
+_CHAT_OUTPUT_SUFFIXES: tuple[str, ...] = ("Your Output:", "您的输出：")
+_CHAT_JSON_ONLY_TRAILER: str = (
+    "\n\nReturn ONLY the single JSON object described above. "
+    "Do not reply to or continue the conversation."
+)
+
+
+def build_chat_extraction_messages(prompt: str) -> list[dict[str, str]]:
+    """Split a rendered chat-extraction prompt into ``system`` + ``user`` messages.
+
+    The mem_reader chat prompt templates end with::
+
+        Conversation:
+        <rendered conversation>
+
+        Your Output:
+
+    Sent as one ``user`` message, that tail leads weak instruction-following
+    LLMs (small Ollama models, qwen2.5:1.5b, phi4-mini, etc.) to *continue*
+    the trailing ``user: ...`` lines instead of emitting the structured JSON
+    (issue #1269).  The text before the conversation marker (instructions /
+    examples / format spec) therefore goes into a ``system`` message, and
+    the conversation block plus an explicit JSON-only trailer into ``user``.
+
+    Args:
+        prompt: Fully rendered prompt text; ``None`` or ``""`` is tolerated.
+
+    Returns:
+        ``[system, user]`` when a marker with preceding text is found; else a
+        single ``user`` message, so non-chat call sites are unaffected.
+    """
+    if not prompt:
+        return [{"role": "user", "content": prompt or ""}]
+    for marker in _CHAT_CONVO_MARKERS:
+        # The live conversation is the tail of the template, so split on the
+        # LAST marker: an earlier one (e.g. in an example) stays in ``system``.
+        idx = prompt.rfind(marker)
+        if idx == -1:
+            continue
+        system_part = prompt[:idx].rstrip()
+        # Skip only the marker's leading newline so its header line is kept.
+        user_part = prompt[idx + 1 :].rstrip()
+        for suffix in _CHAT_OUTPUT_SUFFIXES:
+            if user_part.endswith(suffix):
+                user_part = user_part[: -len(suffix)].rstrip()
+        user_msg = {"role": "user", "content": f"{user_part}{_CHAT_JSON_ONLY_TRAILER}"}
+        # An empty system message carries no instructions; send only ``user``.
+        if not system_part:
+            return [user_msg]
+        return [{"role": "system", "content": system_part}, user_msg]
+    return [{"role": "user", "content": prompt}]
+
+
 def parse_json_result(response_text: str) -> dict:
     s = (response_text or "").strip()
 

diff --git a/tests/mem_reader/test_simple_structure.py b/tests/mem_reader/test_simple_structure.py
@@ -7,7 +7,7 @@
 from memos.embedders.factory import EmbedderFactory
 from memos.llms.factory import LLMFactory
 from memos.mem_reader.simple_struct import SimpleStructMemReader
-from memos.mem_reader.utils import parse_json_result
+from memos.mem_reader.utils import build_chat_extraction_messages, parse_json_result
 from memos.memories.textual.item import TextualMemoryItem
 
 
@@ -118,6 +118,67 @@ def test_parse_json_result_failure(self):
 
         self.assertEqual(result, {})
 
+    def test_get_llm_response_uses_system_and_user_messages(self):
+        """Regression for #1269: weak models reply to the conversation
+        instead of summarising because the entire prompt (instructions +
+        examples + conversation + ``Your Output:`` trailer) was being sent
+        as a single ``user`` message. The chat extraction path must split
+        the prompt into a system message (instructions / examples / format)
+        and a user message (conversation block + JSON-only trailer).
+        """
+        captured: list[list[dict]] = []
+
+        def fake_generate(messages):
+            captured.append(messages)
+            return (
+                '{"memory list": [{"key": "k", "memory_type": "UserMemory", '
+                '"value": "v", "tags": []}], "summary": "real summary"}'
+            )
+
+        self.reader.llm.generate.side_effect = fake_generate
+        self.reader.embedder.embed.return_value = [[0.0]]
+
+        self.reader._get_llm_response(
+            "user: [2025-06-29 10:00]: Hello, how are you?\n"
+            "assistant: I'm fine, thanks. And you?\n"
+            "user: [2025-06-29 10:01]: Pretty good.\n",
+            custom_tags=None,
+        )
+
+        self.assertTrue(captured, "LLM was not called")
+        messages = captured[0]
+        roles = [m["role"] for m in messages]
+        self.assertEqual(
+            roles[:2],
+            ["system", "user"],
+            f"chat extraction must send a system+user pair, got {roles!r}",
+        )
+
+        system_content = messages[0]["content"]
+        user_content = messages[1]["content"]
+
+        # Instructions / examples / format spec must live in the system
+        # message, not the user message.
+        self.assertIn("memory extraction expert", system_content)
+        self.assertIn('"summary"', system_content)
+        self.assertNotIn("memory extraction expert", user_content)
+
+        # User message must carry the conversation block + an explicit
+        # "JSON only, do not reply" trailer that prevents small models
+        # from continuing the embedded chat.
+        self.assertIn("Conversation:", user_content)
+        self.assertIn("Pretty good.", user_content)
+        self.assertIn("JSON", user_content)
+        self.assertNotIn("Your Output:", user_content)
+
+    def test_build_chat_extraction_messages_fallback(self):
+        """Without a ``Conversation:`` marker (doc / general string
+        templates) the prompt must come back as one unchanged ``user``
+        message, so callers outside the chat path are unaffected."""
+        plain = "plain prompt without marker"
+        expected = [{"role": "user", "content": plain}]
+        self.assertEqual(build_chat_extraction_messages(plain), expected)
+
 
 if __name__ == "__main__":
     unittest.main()