From 9b8a9599d5e00f61f9b2c2e883a02ecf1b0aa90c Mon Sep 17 00:00:00 2001 From: Thomas Adair <34932535+AdairBear@users.noreply.github.com> Date: Fri, 12 Jun 2026 10:52:50 -0700 Subject: [PATCH 1/4] feat(qm_mcp): research-corpus surface on QuantMind ingestion (#1) QuantMind v0.2 ships ingestion + LLM extraction only; its persistence, embedding, semantic-query, and Data-MCP layers are unbuilt future PRs. This adds that missing Stage-2 layer as a self-contained package that reuses QuantMind's own venv and fetch+format layer: - store.py filesystem CorpusStore (JSON + .npy vectors, stable-hash dedup) - embed.py OpenAI embeddings + grounded answer synthesis + summarizer - ingest.py fetch_arxiv/url/local -> markdown -> summarize -> embed -> store (skips the brittle paper_flow Paper-tree: gpt-4o-mini emits non-UUID node ids that the Paper schema rejects) - query.py embed question -> cosine top-k -> grounded, cited answer - server.py FastMCP stdio server: qm_ingest_arxiv/url/pdf/text, qm_query, qm_list_corpus, qm_delete_item - cli.py seeding + shell use; seed_corpus.txt; _smoke_mcp.py handshake test Secrets load from ~/.hermes/.env; uses VOICE_TOOLS_OPENAI_KEY (real OpenAI) since Hermes OPENAI_API_KEY is an OpenRouter key with no embeddings endpoint. Co-authored-by: Claude Opus 4.8 (1M context) --- .gitignore | 1 + qm_mcp/README.md | 86 ++++++++++++++ qm_mcp/__init__.py | 18 +++ qm_mcp/_smoke_mcp.py | 32 +++++ qm_mcp/cli.py | 126 ++++++++++++++++++++ qm_mcp/config.py | 97 +++++++++++++++ qm_mcp/embed.py | 130 +++++++++++++++++++++ qm_mcp/ingest.py | 260 +++++++++++++++++++++++++++++++++++++++++ qm_mcp/query.py | 66 +++++++++++ qm_mcp/seed_corpus.txt | 25 ++++ qm_mcp/server.py | 91 +++++++++++++++ qm_mcp/store.py | 115 ++++++++++++++++++ 12 files changed, 1047 insertions(+) create mode 100644 qm_mcp/README.md create mode 100644 qm_mcp/__init__.py create mode 100644 qm_mcp/_smoke_mcp.py create mode 100644 qm_mcp/cli.py create mode 100644 qm_mcp/config.py create mode 100644 qm_mcp/embed.py create mode 100644 qm_mcp/ingest.py create mode 100644 qm_mcp/query.py create mode 100644 qm_mcp/seed_corpus.txt create mode 100644 qm_mcp/server.py create mode 100644 qm_mcp/store.py diff --git a/.gitignore b/.gitignore index 539aa6c..4097ce9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,4 @@ docs/superpowers/ .coverage htmlcov/ coverage.xml +.venv/ diff --git a/qm_mcp/README.md b/qm_mcp/README.md new file mode 100644 index 0000000..fd93fba --- /dev/null +++ b/qm_mcp/README.md @@ -0,0 +1,86 @@ +# qm_mcp — QuantMind research-corpus surface + +This package turns [QuantMind](../README.md) into a **queryable research +corpus** for Thomas's trading + AVST work, exposed over MCP so Personal +Hermes, Dispatch sessions, the Conductor, and future Akazi AVST all read the +same knowledge base. + +## Why this exists + +QuantMind v0.2 ships **ingestion + LLM extraction only** — `paper_flow` +fetches an arXiv id / URL / PDF / raw text, converts it to markdown, and +extracts a typed `Paper` tree. Its persistence, embedding, semantic-query, +and "Data MCP" layers are still **vision / future PRs** (PR6/PR7 per their +README). `qm_mcp` supplies exactly that missing Stage-2 layer: + +``` +ingest (QuantMind paper_flow) + → CorpusStore (~/.quantmind/corpus : one JSON + one vector per item) + → semantic query (OpenAI embeddings → cosine top-k → grounded answer) + → MCP server (qm_ingest_*, qm_query, qm_list_corpus, qm_delete_item) +``` + +It is dependency-light: it reuses QuantMind's own venv (`openai`, `numpy`, +`pydantic`, `httpx`, `mcp`) and stores everything on the local filesystem. + +## Secrets + +Loaded from `~/.hermes/.env` at runtime — nothing is hard-coded. Embeddings +and `paper_flow` extraction need a **real platform.openai.com** key. Hermes' +`OPENAI_API_KEY` is an OpenRouter key (`sk-or-…`, no embeddings endpoint), so +`qm_mcp` uses `VOICE_TOOLS_OPENAI_KEY` (the real OpenAI key kept for Whisper) +and forces it for this process only. + +## Run the MCP server + +```bash +/Users/thomasadair/projects/quant-mind/.venv/bin/python -m qm_mcp.server +``` + +Registered in Hermes `~/.hermes/config.yaml` under `mcp_servers: quantmind` +(see `docs/quantmind_brain_boundary.md` in the hermes-agent repo). + +## CLI (seeding + shell use) + +```bash +PY=/Users/thomasadair/projects/quant-mind/.venv/bin/python +$PY -m qm_mcp.cli ingest-arxiv 1105.3115 +$PY -m qm_mcp.cli ingest-pdf ~/papers/foo.pdf +$PY -m qm_mcp.cli ingest-url https://example.com/article +$PY -m qm_mcp.cli seed qm_mcp/seed_corpus.txt +$PY -m qm_mcp.cli query "What does Stoikov say about gamma?" +$PY -m qm_mcp.cli list +$PY -m qm_mcp.cli delete +``` + +## MCP tools + +| Tool | Purpose | +|---|---| +| `qm_ingest_arxiv(arxiv_id)` | Ingest an arXiv paper by id or URL | +| `qm_ingest_url(url)` | Ingest a web page / hosted PDF | +| `qm_ingest_pdf(path)` | Ingest a local PDF / HTML / Markdown file | +| `qm_ingest_text(text, title?)` | Ingest pasted text | +| `qm_query(question, k=5)` | Grounded natural-language answer + top-k sources | +| `qm_list_corpus()` | List all ingested items (metadata) | +| `qm_delete_item(item_id)` | Remove one item | + +## Storage + +`~/.quantmind/corpus/` (outside both git repos — never committed): +- `items/.json` — record: metadata + flattened context + full Paper tree +- `vectors/.npy` — 1536-dim embedding (aligned by id) +- `ingestion_log.jsonl` — append-only ledger of ingestion events + +`id` is a stable hash of the source, so re-ingesting is idempotent (dedup). + +## Known QuantMind quirks handled here + +- **Strict-schema rejection.** `Agent(output_type=Paper)` fails under OpenAI + strict structured output (recursive UUID-keyed tree). We pass a non-strict + `AgentOutputSchema(Paper, strict_json_schema=False)`. +- **No news flow.** QuantMind has `knowledge/news.py` types but no + `news_flow`. News/blog URLs go through the generic `HttpUrl` → `paper_flow` + path (trafilatura HTML → markdown → extraction). +- **DOI unsupported.** `paper_flow` raises `NotImplementedError` on DOI + inputs upstream; use arXiv id or a direct URL. diff --git a/qm_mcp/__init__.py b/qm_mcp/__init__.py new file mode 100644 index 0000000..3d04c3a --- /dev/null +++ b/qm_mcp/__init__.py @@ -0,0 +1,18 @@ +"""qm_mcp — the research-corpus surface built on top of QuantMind ingestion. + +QuantMind v0.2 ships ingestion + LLM extraction only (``paper_flow``); the +persistence, embedding, semantic-query, and MCP layers (its "Stage 2 / +Data MCP" vision) are not yet built upstream. This package supplies exactly +that missing layer so QuantMind becomes a usable, queryable corpus for +Thomas's trading + AVST research: + + ingest (paper_flow) -> CorpusStore (JSON + vectors) -> semantic query + \\-> MCP server (Hermes / Dispatch / Conductor) + +It is intentionally self-contained and dependency-light: it reuses +QuantMind's own venv (openai, numpy, pydantic, httpx, mcp) and stores the +corpus on the local filesystem under ``QM_CORPUS_DIR``. +""" + +__all__ = ["__version__"] +__version__ = "0.1.0" diff --git a/qm_mcp/_smoke_mcp.py b/qm_mcp/_smoke_mcp.py new file mode 100644 index 0000000..f7d24d4 --- /dev/null +++ b/qm_mcp/_smoke_mcp.py @@ -0,0 +1,32 @@ +"""Standalone MCP stdio smoke test: spawn the server, list tools, list corpus. + +Run under the QuantMind venv: + python -m qm_mcp._smoke_mcp +""" + +from __future__ import annotations + +import asyncio +import os + +from mcp import ClientSession, StdioServerParameters +from mcp.client.stdio import stdio_client + + +async def main() -> None: + params = StdioServerParameters( + command=os.sys.executable, + args=["-m", "qm_mcp.server"], + env={**os.environ, "PYTHONPATH": os.getcwd()}, + ) + async with stdio_client(params) as (read, write): + async with ClientSession(read, write) as session: + await session.initialize() + tools = await session.list_tools() + print("TOOLS:", [t.name for t in tools.tools]) + res = await session.call_tool("qm_list_corpus", {}) + print("LIST_CORPUS:", res.content[0].text[:400]) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/qm_mcp/cli.py b/qm_mcp/cli.py new file mode 100644 index 0000000..64e3250 --- /dev/null +++ b/qm_mcp/cli.py @@ -0,0 +1,126 @@ +"""Command-line surface for the QuantMind corpus. + +Used for seeding the initial corpus, manual queries, and as a shell-callable +backend for any tool that prefers a subprocess over MCP. Examples:: + + python -m qm_mcp.cli ingest-arxiv 1105.3115 + python -m qm_mcp.cli ingest-pdf ~/papers/foo.pdf + python -m qm_mcp.cli seed papers.txt # one source per line + python -m qm_mcp.cli query "What is gamma in Avellaneda-Stoikov?" + python -m qm_mcp.cli list + python -m qm_mcp.cli delete +""" + +from __future__ import annotations + +import argparse +import asyncio +import json +import sys +from pathlib import Path + +from qm_mcp import ingest as I +from qm_mcp.query import query as run_query +from qm_mcp.store import CorpusStore + + +def _print(obj) -> None: + print(json.dumps(obj, indent=2, default=str)) + + +async def _dispatch_source(src: str, *, force: bool): + """Route one seed line to the right ingest fn by simple heuristics.""" + s = src.strip() + if not s or s.startswith("#"): + return None + # Strip inline "# comment" trailers (seed files annotate ids). URLs may + # legitimately contain '#', so only strip for non-URL lines. + if not s.lower().startswith("http"): + s = s.split("#", 1)[0].strip() + if not s: + return None + low = s.lower() + if ( + low.startswith("arxiv:") + or low.startswith("http") + and "arxiv.org" in low + ): + return await I.ingest_arxiv( + s.split("arxiv:", 1)[-1].strip(), force=force + ) + if low.startswith("http://") or low.startswith("https://"): + return await I.ingest_url(s, force=force) + if Path(s).expanduser().is_file(): + return await I.ingest_pdf(s, force=force) + # bare token -> treat as arxiv id + return await I.ingest_arxiv(s, force=force) + + +async def _amain(args: argparse.Namespace) -> int: + if args.cmd == "ingest-arxiv": + _print(await I.ingest_arxiv(args.value, force=args.force)) + elif args.cmd == "ingest-url": + _print(await I.ingest_url(args.value, force=args.force)) + elif args.cmd == "ingest-pdf": + _print(await I.ingest_pdf(args.value, force=args.force)) + elif args.cmd == "ingest-text": + _print(await I.ingest_text(args.value, force=args.force)) + elif args.cmd == "seed": + lines = Path(args.value).read_text(encoding="utf-8").splitlines() + results = [] + for line in lines: + try: + res = await _dispatch_source(line, force=args.force) + except Exception as exc: # one bad source must not sink the batch + res = { + "source": line.strip(), + "status": "error", + "error": str(exc), + } + if res is not None: + results.append(res) + print( + f" [{res.get('status'):>8}] {res.get('title') or res.get('source') or res.get('error')}", + file=sys.stderr, + ) + _print({"seeded": results, "total": len(results)}) + elif args.cmd == "query": + _print(await run_query(args.value, k=args.k)) + elif args.cmd == "list": + store = CorpusStore() + _print({"count": len(store), "items": store.list_records(light=True)}) + elif args.cmd == "delete": + _print({"id": args.value, "deleted": CorpusStore().delete(args.value)}) + else: # pragma: no cover + return 2 + return 0 + + +def main() -> int: + p = argparse.ArgumentParser( + prog="qm_mcp.cli", description="QuantMind corpus CLI" + ) + p.add_argument( + "--force", action="store_true", help="re-ingest even if present" + ) + sub = p.add_subparsers(dest="cmd", required=True) + for name in ( + "ingest-arxiv", + "ingest-url", + "ingest-pdf", + "ingest-text", + "seed", + "delete", + ): + sp = sub.add_parser(name) + sp.add_argument("value") + qp = sub.add_parser("query") + qp.add_argument("value") + qp.add_argument("-k", type=int, default=5) + sub.add_parser("list") + args = p.parse_args() + return asyncio.run(_amain(args)) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/qm_mcp/config.py b/qm_mcp/config.py new file mode 100644 index 0000000..6830f6f --- /dev/null +++ b/qm_mcp/config.py @@ -0,0 +1,97 @@ +"""Configuration + secret loading for the QuantMind corpus surface. + +Secrets are NOT hard-coded here. The OpenAI key (used by QuantMind's +``paper_flow`` extraction and by our embedding/synthesis calls) is loaded +from the canonical Hermes gateway env file ``~/.hermes/.env`` if present, +then from the process environment. This mirrors the Phase 3 Doppler/`.env` +pattern: the running gateway already owns these secrets. +""" + +from __future__ import annotations + +import os +from pathlib import Path + +# Canonical secret source: the always-on Hermes gateway env file. +_HERMES_ENV = Path.home() / ".hermes" / ".env" + +# Embedding + synthesis models. text-embedding-3-small is 1536-dim, cheap, +# and good enough for a coarse semantic pre-filter over a research corpus. +EMBED_MODEL = os.environ.get("QM_EMBED_MODEL", "text-embedding-3-small") +SYNTH_MODEL = os.environ.get("QM_SYNTH_MODEL", "gpt-4o-mini") +# Extraction model for paper_flow. gpt-4o-mini keeps per-paper cost to cents. +EXTRACT_MODEL = os.environ.get("QM_EXTRACT_MODEL", "gpt-4o-mini") + +# Embedding input ceiling (chars). text-embedding-3-small caps at ~8191 +# tokens; ~24k chars (~6k tokens) leaves comfortable headroom. +EMBED_CHAR_LIMIT = 24_000 +# Synthesis context ceiling (chars) across all retrieved sources. +SYNTH_CONTEXT_CHAR_LIMIT = 14_000 + + +def corpus_dir() -> Path: + """Root directory for the persisted corpus (items + vectors).""" + raw = os.environ.get("QM_CORPUS_DIR") + base = ( + Path(raw).expanduser() + if raw + else (Path.home() / ".quantmind" / "corpus") + ) + base.mkdir(parents=True, exist_ok=True) + return base + + +def load_secrets() -> None: + """Load OPENAI_API_KEY (and friends) from ~/.hermes/.env into os.environ. + + Existing process-env values win — we only fill gaps. This never prints + or returns the secret value. + """ + if not _HERMES_ENV.is_file(): + return + try: + for line in _HERMES_ENV.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, _, val = line.partition("=") + key = key.strip() + val = val.strip().strip('"').strip("'") + if key and key not in os.environ: + os.environ[key] = val + except OSError: + # Secret file unreadable — fall through to whatever is already in + # the environment. The OpenAI client will raise a clear error if the + # key is genuinely absent. + pass + + # CRITICAL: Hermes' OPENAI_API_KEY is an OpenRouter key (sk-or-...). That + # 401s against api.openai.com and OpenRouter exposes no embeddings + # endpoint. The real platform.openai.com key is stored separately as + # VOICE_TOOLS_OPENAI_KEY (used for Whisper). Force it as the OpenAI key + # for THIS process only so both QuantMind's openai-agents extraction and + # our embeddings/synthesis hit real OpenAI. We also clear any OpenAI base + # URL so the client cannot be redirected to OpenRouter. + real = os.environ.get("VOICE_TOOLS_OPENAI_KEY", "").strip() + if real: + os.environ["OPENAI_API_KEY"] = real + os.environ.pop("OPENAI_BASE_URL", None) + + +def require_openai_key() -> str: + """Return the real OpenAI key or raise a clear, actionable error.""" + load_secrets() + key = os.environ.get("OPENAI_API_KEY", "").strip() + if not key: + raise RuntimeError( + "No OpenAI key available. QuantMind ingestion + corpus embedding " + "need a real platform.openai.com key. Set VOICE_TOOLS_OPENAI_KEY " + "(preferred) or OPENAI_API_KEY in ~/.hermes/.env." + ) + if key.startswith("sk-or-"): + raise RuntimeError( + "The active OpenAI key is an OpenRouter key (sk-or-...), which " + "cannot do embeddings or reach api.openai.com. Set " + "VOICE_TOOLS_OPENAI_KEY to a real platform.openai.com key." + ) + return key diff --git a/qm_mcp/embed.py b/qm_mcp/embed.py new file mode 100644 index 0000000..0b0b15b --- /dev/null +++ b/qm_mcp/embed.py @@ -0,0 +1,130 @@ +"""Embedding + answer-synthesis via OpenAI (the key QuantMind already needs). + +Kept tiny and synchronous; callers on the async MCP path wrap these in +``asyncio.to_thread`` so the event loop is never blocked. +""" + +from __future__ import annotations + +import json + +import numpy as np +from openai import OpenAI + +from qm_mcp.config import ( + EMBED_CHAR_LIMIT, + EMBED_MODEL, + SYNTH_MODEL, + require_openai_key, +) + +# Markdown handed to the summarizer is truncated to keep the call cheap and +# inside context limits; the head of a paper carries abstract + intro + +# method, which is what the summary needs. +_SUMMARY_INPUT_CHAR_LIMIT = 30_000 + +_client: OpenAI | None = None + + +def _get_client() -> OpenAI: + global _client + if _client is None: + _client = OpenAI(api_key=require_openai_key()) + return _client + + +def embed_text(text: str) -> np.ndarray: + """Embed one string -> float32 vector.""" + payload = (text or "").strip()[:EMBED_CHAR_LIMIT] or "(empty)" + resp = _get_client().embeddings.create(model=EMBED_MODEL, input=payload) + return np.asarray(resp.data[0].embedding, dtype=np.float32) + + +def synthesize_answer(question: str, contexts: list[dict]) -> str: + """Answer ``question`` grounded in retrieved corpus contexts. + + ``contexts`` is a list of {title, source, text}. The model is told to + answer ONLY from the provided material and to name the papers it used, + so the corpus stays the source of truth (no free-floating hallucination). + """ + if not contexts: + return ( + "No matching items in the corpus yet — ingest some research first." + ) + + blocks = [] + for i, c in enumerate(contexts, 1): + blocks.append( + f"[{i}] {c.get('title', 'untitled')} " + f"({c.get('source', '?')})\n{c.get('text', '')}" + ) + corpus_block = "\n\n".join(blocks) + + system = ( + "You are a quantitative-finance research assistant answering from a " + "private corpus of ingested papers. Answer ONLY from the provided " + "sources. Cite the bracketed source numbers you use, e.g. [1]. If the " + "corpus does not contain the answer, say so plainly rather than " + "guessing." + ) + user = f"Question: {question}\n\n--- Corpus sources ---\n{corpus_block}" + + resp = _get_client().chat.completions.create( + model=SYNTH_MODEL, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + temperature=0.2, + ) + return (resp.choices[0].message.content or "").strip() + + +def summarize_markdown(title: str, source: str, markdown: str) -> dict: + """Extract a structured research summary from markdown. + + Returns ``{summary, key_findings[], tags[], asset_classes[]}``. This is + our own robust replacement for QuantMind's brittle ``Paper`` tree + extraction (which demands UUID node ids the LLM won't reliably emit). + Defensive: malformed JSON degrades to a plain-text summary. + """ + body = (markdown or "").strip()[:_SUMMARY_INPUT_CHAR_LIMIT] or "(empty)" + system = ( + "You extract a structured summary of a quantitative-finance research " + "document for a searchable corpus. Respond with a JSON object with " + "keys: title (the document's real title — recover it from the content; " + "ignore library watermarks, download stamps, headers/footers), summary " + "(a dense 150-250 word abstract covering the problem, method, and main " + "result), key_findings (list of 3-7 concise bullet strings), tags (list " + "of short topic tags), asset_classes (list, e.g. " + "equities/futures/fx/crypto/rates, empty if unspecified). Be faithful " + "to the source; do not invent results." + ) + user = f"Title hint: {title}\nSource: {source}\n\n--- Document ---\n{body}" + resp = _get_client().chat.completions.create( + model=SYNTH_MODEL, + messages=[ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ], + temperature=0.2, + response_format={"type": "json_object"}, + ) + raw = (resp.choices[0].message.content or "").strip() + try: + data = json.loads(raw) + except json.JSONDecodeError: + return { + "title": "", + "summary": raw, + "key_findings": [], + "tags": [], + "asset_classes": [], + } + return { + "title": str(data.get("title", "")).strip(), + "summary": str(data.get("summary", "")).strip(), + "key_findings": [str(x) for x in (data.get("key_findings") or [])], + "tags": [str(x) for x in (data.get("tags") or [])], + "asset_classes": [str(x) for x in (data.get("asset_classes") or [])], + } diff --git a/qm_mcp/ingest.py b/qm_mcp/ingest.py new file mode 100644 index 0000000..334a7de --- /dev/null +++ b/qm_mcp/ingest.py @@ -0,0 +1,260 @@ +"""Ingestion — QuantMind fetch+format + our own robust summarization. + +Design note: QuantMind's *fetch + format* layer is solid (arxiv API + httpx +download + pymupdf PDF->markdown + trafilatura HTML->markdown), so we use it +directly. Its *structured Paper-tree extraction* (``paper_flow``) is brittle +under OpenAI structured output (it demands UUID node ids the LLM won't emit), +so we skip it and run our own summarizer instead. The corpus item therefore +carries: source metadata, the cleaned markdown, a structured summary, and an +embedding — everything the query layer needs, none of the fragile tree. + +All entry points are async (the fetch/format layer is async); the MCP server +awaits them and the CLI wraps them in ``asyncio.run``. +""" + +from __future__ import annotations + +import asyncio +import hashlib +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from qm_mcp.config import ( + SYNTH_CONTEXT_CHAR_LIMIT, + corpus_dir, + load_secrets, +) +from qm_mcp.embed import embed_text, summarize_markdown +from qm_mcp.store import CorpusStore, make_id +from quantmind.preprocess.fetch import ( + Fetched, + fetch_arxiv, + fetch_url, + read_local_file, +) +from quantmind.preprocess.format import html_to_markdown, pdf_to_markdown + +_INGEST_LOG = "ingestion_log.jsonl" +# Cap the markdown we persist per item (text only, lives outside git). +_MARKDOWN_STORE_LIMIT = 400_000 + + +# ── format helper ────────────────────────────────────────────────────── +async def _to_markdown(raw: Fetched) -> str: + ct = (raw.content_type or "").lower() + if ct.startswith("application/pdf"): + return await pdf_to_markdown(raw.bytes) + if ct.startswith("text/html"): + return await html_to_markdown( + raw.bytes.decode("utf-8", errors="replace") + ) + return raw.bytes.decode("utf-8", errors="replace") + + +def _derive_title(markdown: str, fallback: str) -> str: + for line in markdown.splitlines(): + s = line.strip() + if s.startswith("#"): + return s.lstrip("#").strip() or fallback + if s: + return s[:160] + return fallback + + +# ── ledger ───────────────────────────────────────────────────────────── +def _append_ingestion_log(record: dict[str, Any]) -> None: + entry = { + "id": record["id"], + "title": record.get("title"), + "source_type": record.get("source_type"), + "source": record.get("source"), + "ingested_at": record.get("ingested_at"), + "event": "research.ingest", + } + with (corpus_dir() / _INGEST_LOG).open("a", encoding="utf-8") as fh: + fh.write(json.dumps(entry) + "\n") + + +# ── core persist ─────────────────────────────────────────────────────── +async def _persist( + *, + source_type: str, + source: str, + markdown: str, + meta: dict[str, Any], + store: CorpusStore, + force: bool, +) -> dict[str, Any]: + item_id = make_id(source_type, source) + if store.exists(item_id) and not force: + existing = store.get(item_id) or {} + return { + "id": item_id, + "status": "exists", + "title": existing.get("title"), + "source_type": source_type, + "source": source, + } + + title_hint = meta.get("title") or _derive_title(markdown, fallback=source) + structured = await asyncio.to_thread( + summarize_markdown, title_hint, source, markdown + ) + # arXiv metadata title is authoritative; otherwise prefer the LLM-recovered + # title (it ignores library watermarks / download stamps that the + # first-line heuristic would otherwise grab), then the heuristic hint. + title = meta.get("title") or structured.get("title") or title_hint + + summary = structured["summary"] + key_findings = structured["key_findings"] + abstract = meta.get("abstract") or "" + + embed_blob = "\n".join( + [title, abstract, summary, " ".join(key_findings)] + ).strip() + full_context = "\n\n".join( + [ + f"# {title}", + f"Abstract: {abstract}" if abstract else "", + f"Summary: {summary}", + "Key findings:\n- " + "\n- ".join(key_findings) + if key_findings + else "", + "--- Source excerpt ---\n" + markdown, + ] + )[:SYNTH_CONTEXT_CHAR_LIMIT] + + as_of = meta.get("published_at") or datetime.now(timezone.utc).isoformat() + record: dict[str, Any] = { + "id": item_id, + "source_type": source_type, + "source": source, + "item_type": "paper" + if source_type in ("arxiv", "local") + else "document", + "title": title, + "authors": list(meta.get("authors") or []), + "arxiv_id": meta.get("arxiv_id"), + "abstract": abstract, + "summary": summary, + "key_findings": key_findings, + "tags": structured["tags"], + "asset_classes": structured["asset_classes"], + "categories": list(meta.get("categories") or []), + "as_of": as_of, + "embedding_text": embed_blob, + "full_context": full_context, + "markdown": markdown[:_MARKDOWN_STORE_LIMIT], + "markdown_chars": len(markdown), + "ingested_at": datetime.now(timezone.utc).isoformat(), + } + + vector = await asyncio.to_thread(embed_text, embed_blob) + store.add(record, vector) + _append_ingestion_log(record) + return { + "id": item_id, + "status": "ingested", + "title": title, + "source_type": source_type, + "source": source, + "authors": record["authors"], + "tags": record["tags"], + "asset_classes": record["asset_classes"], + } + + +# ── public entry points ──────────────────────────────────────────────── +async def ingest_arxiv( + arxiv_id: str, *, store: CorpusStore | None = None, force: bool = False +): + load_secrets() + store = store or CorpusStore() + raw = await fetch_arxiv(arxiv_id) + markdown = await pdf_to_markdown(raw.bytes) + meta = { + "title": raw.title, + "authors": list(raw.authors), + "arxiv_id": raw.arxiv_id, + "abstract": raw.abstract, + "published_at": raw.published_at.isoformat() + if raw.published_at + else None, + "categories": list(raw.categories), + } + return await _persist( + source_type="arxiv", + source=raw.arxiv_id or arxiv_id, + markdown=markdown, + meta=meta, + store=store, + force=force, + ) + + +async def ingest_url( + url: str, *, store: CorpusStore | None = None, force: bool = False +): + load_secrets() + store = store or CorpusStore() + raw = await fetch_url(url) + markdown = await _to_markdown(raw) + return await _persist( + source_type="url", + source=url, + markdown=markdown, + meta={"content_type": raw.content_type}, + store=store, + force=force, + ) + + +async def ingest_pdf( + path: str, *, store: CorpusStore | None = None, force: bool = False +): + load_secrets() + store = store or CorpusStore() + abspath = str(Path(path).expanduser().resolve()) + if not Path(abspath).is_file(): + return { + "id": None, + "status": "error", + "error": f"file not found: {abspath}", + } + raw = await read_local_file(abspath) + markdown = await _to_markdown(raw) + return await _persist( + source_type="local", + source=abspath, + markdown=markdown, + meta={"content_type": raw.content_type}, + store=store, + force=force, + ) + + +async def ingest_text( + text: str, + *, + title_hint: str | None = None, + store: CorpusStore | None = None, + force: bool = False, +): + load_secrets() + store = store or CorpusStore() + key = ( + (title_hint or "text") + + ":" + + hashlib.sha1(text.encode("utf-8")).hexdigest()[:12] + ) + meta = {"title": title_hint} if title_hint else {} + return await _persist( + source_type="text", + source=key, + markdown=text, + meta=meta, + store=store, + force=force, + ) diff --git a/qm_mcp/query.py b/qm_mcp/query.py new file mode 100644 index 0000000..20f53f7 --- /dev/null +++ b/qm_mcp/query.py @@ -0,0 +1,66 @@ +"""Semantic query over the corpus: embed -> cosine top-k -> grounded answer.""" + +from __future__ import annotations + +import asyncio +from typing import Any + +from qm_mcp.config import load_secrets +from qm_mcp.embed import embed_text, synthesize_answer +from qm_mcp.store import CorpusStore + + +async def query( + question: str, + *, + k: int = 6, + synthesize: bool = True, + store: CorpusStore | None = None, +) -> dict[str, Any]: + """Answer ``question`` from the corpus. + + Returns ``{question, answer, sources:[{id,title,score,source,authors}]}``. + ``answer`` is None when ``synthesize=False`` (retrieval-only mode). + """ + load_secrets() + store = store or CorpusStore() + + if len(store) == 0: + return { + "question": question, + "answer": "The corpus is empty — ingest some research first.", + "sources": [], + } + + q_vec = await asyncio.to_thread(embed_text, question) + hits = store.search(q_vec, k=k) + + sources: list[dict[str, Any]] = [] + contexts: list[dict[str, str]] = [] + for item_id, score in hits: + rec = store.get(item_id) + if not rec: + continue + sources.append( + { + "id": item_id, + "title": rec.get("title"), + "score": round(score, 4), + "source_type": rec.get("source_type"), + "source": rec.get("source"), + "authors": rec.get("authors", []), + } + ) + contexts.append( + { + "title": rec.get("title", "untitled"), + "source": rec.get("arxiv_id") or rec.get("source", "?"), + "text": rec.get("full_context") or rec.get("summary", ""), + } + ) + + answer = None + if synthesize: + answer = await asyncio.to_thread(synthesize_answer, question, contexts) + + return {"question": question, "answer": answer, "sources": sources} diff --git a/qm_mcp/seed_corpus.txt b/qm_mcp/seed_corpus.txt new file mode 100644 index 0000000..f2be7e6 --- /dev/null +++ b/qm_mcp/seed_corpus.txt @@ -0,0 +1,25 @@ +# QuantMind initial corpus seed list — quant / market-making / microstructure. +# One source per line. Lines are routed by qm_mcp.cli.seed: +# - bare token or arxiv: prefix or arxiv.org URL -> arXiv ingestion +# - http(s) URL -> web/PDF ingestion +# - existing filesystem path -> local PDF/HTML/MD ingestion +# Comments (#) and blank lines are skipped. + +# ── Local references (trident-forge) ──────────────────────────────────── +/Users/thomasadair/projects/p1-growth/trident-forge/docs/references/avellaneda_stoikov_2008_HFT_limit_order_book.pdf + +# ── arXiv: market making / inventory ──────────────────────────────────── +1105.3115 # Gueant, Lehalle, Fernandez-Tapia — Dealing with inventory risk +1605.01862 # Gueant — Optimal market making +1405.4974 # Cartea, Jaimungal — Optimal execution with limit & market orders +1610.00261 # Lehalle, Mounjid — Limit order strategic placement + +# ── arXiv: limit order book / microstructure ──────────────────────────── +0712.0332 # Cont, Stoikov, Talreja — A stochastic model for order book dynamics +1210.1625 # Cont, Kukanov — Optimal order placement in limit order markets +1705.00109 # Stoikov — The micro-price +1105.1694 # Toth et al — Anomalous price impact / critical nature of liquidity + +# ── arXiv: volatility / point processes ───────────────────────────────── +1410.3394 # Gatheral, Jaisson, Rosenbaum — Volatility is rough +1502.04592 # Bacry, Mastromatteo, Muzy — Hawkes processes in finance diff --git a/qm_mcp/server.py b/qm_mcp/server.py new file mode 100644 index 0000000..88e30ea --- /dev/null +++ b/qm_mcp/server.py @@ -0,0 +1,91 @@ +"""QuantMind research-corpus MCP server (stdio). + +Exposes the corpus to any MCP client — Personal Hermes (the #research +channel), Dispatch sessions, the Conductor, future Akazi AVST. Tools are the +contract the brief asked for: + + qm_ingest_url(url) ingest a web page / hosted PDF + qm_ingest_pdf(path) ingest a local PDF / HTML / markdown file + qm_ingest_arxiv(arxiv_id) ingest an arXiv paper by id or URL + qm_ingest_text(text,title) ingest pasted text + qm_query(question, k) natural-language query (grounded answer + sources) + qm_list_corpus() list everything ingested + qm_delete_item(item_id) remove one item + +Run (under QuantMind's venv):: + + /Users/thomasadair/projects/quant-mind/.venv/bin/python -m qm_mcp.server +""" + +from __future__ import annotations + +from typing import Any + +from mcp.server.fastmcp import FastMCP + +from qm_mcp import ingest as _ingest +from qm_mcp.query import query as _query +from qm_mcp.store import CorpusStore + +mcp = FastMCP("quantmind-research") + + +@mcp.tool() +async def qm_ingest_arxiv(arxiv_id: str) -> dict[str, Any]: + """Ingest an arXiv paper into the research corpus. + + Args: + arxiv_id: arXiv id (e.g. "1105.3115") or a full arxiv.org URL. + """ + return await _ingest.ingest_arxiv(arxiv_id) + + +@mcp.tool() +async def qm_ingest_url(url: str) -> dict[str, Any]: + """Ingest a web page or hosted PDF (news, blog, report) by URL.""" + return await _ingest.ingest_url(url) + + +@mcp.tool() +async def qm_ingest_pdf(path: str) -> dict[str, Any]: + """Ingest a local PDF / HTML / Markdown file by filesystem path.""" + return await _ingest.ingest_pdf(path) + + +@mcp.tool() +async def qm_ingest_text(text: str, title: str | None = None) -> dict[str, Any]: + """Ingest pasted raw text as a corpus item.""" + return await _ingest.ingest_text(text, title_hint=title) + + +@mcp.tool() +async def qm_query(question: str, k: int = 6) -> dict[str, Any]: + """Ask the corpus a natural-language question. + + Returns a grounded answer (cited to ingested sources) plus the top-k + matching items. + """ + return await _query(question, k=k) + + +@mcp.tool() +def qm_list_corpus() -> dict[str, Any]: + """List every item in the research corpus (metadata only).""" + store = CorpusStore() + items = store.list_records(light=True) + return {"count": len(items), "items": items} + + +@mcp.tool() +def qm_delete_item(item_id: str) -> dict[str, Any]: + """Delete one corpus item by its id.""" + removed = CorpusStore().delete(item_id) + return {"id": item_id, "deleted": removed} + + +def main() -> None: + mcp.run() + + +if __name__ == "__main__": + main() diff --git a/qm_mcp/store.py b/qm_mcp/store.py new file mode 100644 index 0000000..53ab6ad --- /dev/null +++ b/qm_mcp/store.py @@ -0,0 +1,115 @@ +"""CorpusStore — filesystem persistence for ingested knowledge + vectors. + +Layout under ``QM_CORPUS_DIR`` (default ``~/.quantmind/corpus``):: + + items/.json one record per ingested item (metadata + full Paper) + vectors/.npy aligned embedding vector (float32) + +Item ``id`` is a stable hash of the source, so re-ingesting the same arXiv +id / URL / file is idempotent (dedup). The store has no global index file to +corrupt: listing globs ``items/``, and search loads the vectors on demand. +This is deliberately simple and good for hundreds–low-thousands of items. +""" + +from __future__ import annotations + +import hashlib +import json +from pathlib import Path +from typing import Any + +import numpy as np + +from qm_mcp.config import corpus_dir + + +def make_id(source_type: str, source: str) -> str: + """Stable dedup id for a source (sha1 of ``type:source``, 16 hex chars).""" + digest = hashlib.sha1(f"{source_type}:{source}".encode("utf-8")).hexdigest() + return digest[:16] + + +class CorpusStore: + """Filesystem-backed corpus of extracted knowledge + embeddings.""" + + def __init__(self, root: Path | None = None) -> None: + self.root = root or corpus_dir() + self.items_dir = self.root / "items" + self.vectors_dir = self.root / "vectors" + self.items_dir.mkdir(parents=True, exist_ok=True) + self.vectors_dir.mkdir(parents=True, exist_ok=True) + + # ── paths ────────────────────────────────────────────────────────── + def _item_path(self, item_id: str) -> Path: + return self.items_dir / f"{item_id}.json" + + def _vec_path(self, item_id: str) -> Path: + return self.vectors_dir / f"{item_id}.npy" + + # ── writes ───────────────────────────────────────────────────────── + def exists(self, item_id: str) -> bool: + return self._item_path(item_id).is_file() + + def add(self, record: dict[str, Any], vector: np.ndarray) -> None: + """Persist one record + its embedding vector atomically-ish.""" + item_id = record["id"] + tmp = self._item_path(item_id).with_suffix(".json.tmp") + tmp.write_text( + json.dumps(record, indent=2, default=str), encoding="utf-8" + ) + tmp.replace(self._item_path(item_id)) + np.save(self._vec_path(item_id), vector.astype(np.float32)) + + def delete(self, item_id: str) -> bool: + removed = False + for p in (self._item_path(item_id), self._vec_path(item_id)): + if p.is_file(): + p.unlink() + removed = True + return removed + + # ── reads ────────────────────────────────────────────────────────── + def get(self, item_id: str) -> dict[str, Any] | None: + p = self._item_path(item_id) + if not p.is_file(): + return None + return json.loads(p.read_text(encoding="utf-8")) + + def list_records(self, *, light: bool = True) -> list[dict[str, Any]]: + """All records. ``light=True`` drops the heavy ``paper`` tree.""" + out: list[dict[str, Any]] = [] + for p in sorted(self.items_dir.glob("*.json")): + try: + rec = json.loads(p.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + continue + if light: + rec = { + k: v + for k, v in rec.items() + if k not in ("paper", "full_context") + } + out.append(rec) + return out + + def __len__(self) -> int: + return sum(1 for _ in self.items_dir.glob("*.json")) + + # ── search ───────────────────────────────────────────────────────── + def search( + self, query_vec: np.ndarray, k: int = 5 + ) -> list[tuple[str, float]]: + """Cosine top-k. Returns [(id, score)] sorted desc.""" + ids: list[str] = [] + mats: list[np.ndarray] = [] + for vp in self.vectors_dir.glob("*.npy"): + ids.append(vp.stem) + mats.append(np.load(vp)) + if not ids: + return [] + matrix = np.vstack(mats).astype(np.float32) + q = query_vec.astype(np.float32) + denom = (np.linalg.norm(matrix, axis=1) * np.linalg.norm(q)) + 1e-9 + scores = (matrix @ q) / denom + order = np.argsort(-scores)[:k] + return [(ids[i], float(scores[i])) for i in order] From 615c4fb647cbb2a94b1766c1495a8385bb62b698 Mon Sep 17 00:00:00 2001 From: Thomas Adair Date: Fri, 12 Jun 2026 11:06:06 -0700 Subject: [PATCH 2/4] =?UTF-8?q?docs:=20qm=5Fmcp=20engineering=20log=20?= =?UTF-8?q?=E2=80=94=20record=20Phase=204=20merge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- QM_MCP_ENGINEERING_LOG.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 QM_MCP_ENGINEERING_LOG.md diff --git a/QM_MCP_ENGINEERING_LOG.md b/QM_MCP_ENGINEERING_LOG.md new file mode 100644 index 0000000..ceb0aac --- /dev/null +++ b/QM_MCP_ENGINEERING_LOG.md @@ -0,0 +1,23 @@ +# qm_mcp engineering log + +Append-only record of notable changes to the `qm_mcp/` research-corpus layer +(Thomas's additive layer on top of LLMQuant/quant-mind). Upstream `quantmind/` +history lives in the normal git log. + +## 2026-06-12 — Phase 4 landing: qm_mcp merged to master + +- **PR [#1](https://github.com/AdairBear/quant-mind/pull/1)** squash-merged → + `9b8a9599d5e00f61f9b2c2e883a02ecf1b0aa90c`. +- Adds the persistence + embedding + semantic-query + MCP layer + (`store.py`, `embed.py`, `ingest.py`, `query.py`, `server.py`, `cli.py`, + `seed_corpus.txt`, `_smoke_mcp.py`) that QuantMind v0.2 does not yet ship. +- Companion hermes-agent side: PR + [#10](https://github.com/AdairBear/hermes-agent/pull/10) → + `84314fa7eec991eccea8a59024c79f3cef53efbc` (the `#research` channel router + + `docs/quantmind_brain_boundary.md`). +- Landed in a **new private** `AdairBear/quant-mind` repo (origin left pointing + at upstream `LLMQuant/quant-mind`; `fork` remote added). +- Verified: direct stdio MCP call enumerates all 7 tools and `qm_query` returns + grounded, cited answers; corpus live (33 items incl. Databento + futures-microstructure articles). Live-gateway pickup pending an operator + restart (see `quantmind_brain_boundary.md` in hermes-agent for the open item). From 4baec3dbe6caed52bfe0e9d83bded445ef5c8f8a Mon Sep 17 00:00:00 2001 From: Thomas Adair Date: Thu, 25 Jun 2026 17:39:38 -0700 Subject: [PATCH 3/4] feat(grpo): GRPO-suitability tagging per Autodata 2606.25996 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `grpo_suitability: high|medium|low` to every corpus entry at ingest time, implementing the weak-vs-strong discrimination-gap framework from Kulikov et al. (FAIR at Meta, arXiv:2606.25996). V1 is a pure deterministic heuristic (no live model calls): - long + arxiv source + code present → high - short + news/unknown source + no code → low - everything else → medium Changes: - qm_mcp/grpo_suitability.py: GrpoSuitabilityScorer with score_entry(), length_band, domain_band, code_present helpers; V2 solver-gap hooks documented as TODOs - qm_mcp/ingest.py: score computed in _persist() and persisted to both items/.json and ingestion_log.jsonl; backward-compatible (existing entries not touched) - qm_mcp/test_grpo_suitability.py: 22 pytest cases covering heuristic correctness, domain-band edge cases, backward compat, idempotency - docs/grpo_suitability.md: framework reference, V1 rule table, V2 plan Co-Authored-By: Claude Sonnet 4.6 --- docs/grpo_suitability.md | 130 +++++++++++++++++ qm_mcp/grpo_suitability.py | 115 +++++++++++++++ qm_mcp/ingest.py | 11 ++ qm_mcp/test_grpo_suitability.py | 245 ++++++++++++++++++++++++++++++++ 4 files changed, 501 insertions(+) create mode 100644 docs/grpo_suitability.md create mode 100644 qm_mcp/grpo_suitability.py create mode 100644 qm_mcp/test_grpo_suitability.py diff --git a/docs/grpo_suitability.md b/docs/grpo_suitability.md new file mode 100644 index 0000000..2587da8 --- /dev/null +++ b/docs/grpo_suitability.md @@ -0,0 +1,130 @@ +# GRPO Suitability Tagging + +## Overview + +Each QuantMind corpus entry is tagged with a `grpo_suitability` field +(`"high"`, `"medium"`, or `"low"`) at ingest time. The tag scores how +useful the entry would be as training or evaluation data for a +Generalized Reward Policy Optimization (GRPO) loop, specifically whether +the entry's content sits in the **learnable zone** — questions it encodes +can be answered by a strong solver but not a weak one. + +## Theoretical Basis + +The framework is grounded in **Autodata** (Kulikov, Whitehouse, Wu, Nie +et al., FAIR at Meta, arXiv:2606.25996, 2026). Section 2b defines the +acceptance criterion for a GRPO-useful training example: + +> strong solver avg ≥ 0.65, weak solver avg < 0.50, gap ≥ 20pp + +Content that meets this criterion sits in the "learnable zone": too hard +for a weak model to recall from surface features, but consistently +solvable by a capable model using deep reasoning. Content outside this +zone either provides no discrimination signal (both solvers fail — too +hard) or no learning signal (both solvers succeed — too easy). + +Autodata's empirical result on legal reasoning tasks: 4.8% high-suitability +entries with naive CoT generation → 52% high-suitability entries after the +agentic loop. The gap shows how much corpus quality varies and why tagging +matters before training or evaluation. + +## V1 Heuristic (No Live Model Calls) + +V1 uses a deterministic heuristic on fields already present in the corpus +entry. No network calls, no LLM inference. The heuristic uses three signals +as proxies for discrimination potential: + +| Signal | Proxy for | +|---|---| +| `length_band` | Document depth (more content → richer reasoning surface) | +| `domain_band` | Source authority (peer-reviewed → higher reasoning demand) | +| `code_present` | Technical depth (math/code → non-trivial query surface) | + +### Length Band + +| Band | Threshold | +|---|---| +| `short` | `markdown_chars` < 5 000 | +| `medium` | 5 000 ≤ `markdown_chars` < 20 000 | +| `long` | `markdown_chars` ≥ 20 000 | + +### Domain Band + +| Band | Source types / URL patterns | +|---|---| +| `arxiv` | `source_type == "arxiv"` or `source_type == "local"` or URL contains `arxiv.org` | +| `ssrn` | URL contains `ssrn.com` | +| `substack` | URL contains `substack.com` | +| `news` | All other URLs, `source_type == "text"`, unrecognized sources | + +### Code Present + +`True` if the entry's markdown contains a fenced code block (` ``` `), +a math block (`$$`), or inline code of ≥ 4 characters. + +### V1 Decision Rule + +``` +long + arxiv + code_present → "high" +short + news + not code_present → "low" +everything else → "medium" +``` + +The rule is conservative: only the clearest signals on both ends are +tagged high or low. Uncertain cases default to medium. + +## Backward Compatibility + +Existing corpus entries that pre-date this feature simply lack the +`grpo_suitability` key. The scorer operates on any dict and returns a +score regardless of which optional fields are present — it will not raise +on a partial or legacy record. Downstream consumers should treat a missing +key as `null` (unscored), not as `"low"`. + +## Schema Impact + +### `~/.quantmind/corpus/items/.json` + +```jsonc +{ + // ... existing fields unchanged ... + "grpo_suitability": "high" // "high" | "medium" | "low" +} +``` + +### `~/.quantmind/corpus/ingestion_log.jsonl` + +```jsonc +{ + "id": "...", + "title": "...", + "source_type": "arxiv", + "source": "2606.25996", + "ingested_at": "...", + "grpo_suitability": "high", + "event": "research.ingest" +} +``` + +## V2 Plan — Actual Solver Gap + +When QuantMind has the LLM substrate to run two queries per entry, replace +the heuristic with a real discrimination measurement: + +1. **Weak query** — surface-recall question: *"What method did this paper + propose?"* Run via a cheap model (Haiku / small LLAMA). +2. **Strong query** — application question: *"Where does this method break + down in a non-stationary regime?"* Run via a capable model (Sonnet / + Opus). +3. **Gap** = strong score − weak score. +4. Tag `high` if `gap ≥ 0.20` AND `strong ≥ 0.65`; `low` if `gap < 0.05` + AND `strong < 0.50`; `medium` otherwise. + +The scorer class (`GrpoSuitabilityScorer`) already carries documented TODO +hooks in `qm_mcp/grpo_suitability.py` marking where these steps plug in. + +## Usage + +The tag is computed automatically at ingest time. To query by suitability, +filter the corpus store items by the `grpo_suitability` field. Priority +for Conductor and Strategy Lab use cases: surface `"high"` entries first. diff --git a/qm_mcp/grpo_suitability.py b/qm_mcp/grpo_suitability.py new file mode 100644 index 0000000..7b2c154 --- /dev/null +++ b/qm_mcp/grpo_suitability.py @@ -0,0 +1,115 @@ +"""GRPO-suitability scoring for QuantMind corpus entries. + +Implements the weak-vs-strong discrimination-gap framework from Autodata +(Kulikov et al., arXiv:2606.25996, FAIR at Meta, 2026). In v1 the scorer +is a pure heuristic — no live model calls — with documented hooks for the +v2 solver-gap measurement once QuantMind has the LLM substrate for it. + +Scoring bands +───────────── + high — long document + authoritative source + technical content + (strong proxy for "learnable zone" per Autodata §2b) + medium — everything else + low — short document + low-authority source + no code + (strong proxy for "too easy / too sparse to discriminate") +""" + +from __future__ import annotations + +import re +from typing import Any + +# Thresholds for the length proxy. +_SHORT_CHARS = 5_000 +_LONG_CHARS = 20_000 + +# Source-type → domain-band mapping. +_DOMAIN_BAND_BY_SOURCE_TYPE: dict[str, str] = { + "arxiv": "arxiv", + "local": "arxiv", # local PDFs are almost always papers +} + +# URL substring → domain band (checked in priority order). +_URL_DOMAIN_PATTERNS: list[tuple[str, str]] = [ + ("arxiv.org", "arxiv"), + ("ssrn.com", "ssrn"), + ("papers.ssrn.com", "ssrn"), + ("substack.com", "substack"), +] + +# Regex for fenced code blocks and inline code of ≥4 chars. +_CODE_PATTERN = re.compile(r"```|\$\$|`[^`]{4,}`") + + +class GrpoSuitabilityScorer: + """Deterministic v1 GRPO-suitability scorer. + + Operates entirely on fields already present in a corpus entry dict. + No network calls, no LLM inference — pure heuristic. + + V2 plan (when QuantMind has the LLM substrate): + ──────────────────────────────────────────────── + # TODO(v2): replace heuristic with Autodata-style solver gap. + # Steps: + # 1. _weak_query(entry) → float (surface-recall query via cheap model) + # 2. _strong_query(entry) → float (application query via frontier model) + # 3. gap = _strong_query - _weak_query + # 4. if gap >= 0.20 and _strong_query >= 0.65: return "high" + # elif gap >= 0.10: return "medium" + # else: return "low" + # Acceptance criterion per Autodata Table 1: + # strong avg ≥ 0.65, weak avg < 0.50, gap ≥ 20pp → "high" + """ + + # ── public API ──────────────────────────────────────────────────────── + + def score_entry(self, entry: dict[str, Any]) -> str: + """Return 'high', 'medium', or 'low' for *entry*. + + The entry dict must at minimum contain 'source_type'. All other + fields (markdown, markdown_chars, source) are read with .get() so + the scorer is safe on partial records and legacy entries. + """ + lb = self._length_band(entry) + db = self._domain_band(entry) + cp = self._code_present(entry) + + # V1 deterministic rule — see module docstring. + if lb == "long" and db == "arxiv" and cp: + return "high" + if lb == "short" and db == "news" and not cp: + return "low" + return "medium" + + # ── private helpers ─────────────────────────────────────────────────── + + def _length_band(self, entry: dict[str, Any]) -> str: + """Classify entry length as 'short', 'medium', or 'long'.""" + chars = entry.get("markdown_chars") + if chars is None: + markdown = entry.get("markdown") or entry.get("full_context") or "" + chars = len(markdown) + if chars < _SHORT_CHARS: + return "short" + if chars >= _LONG_CHARS: + return "long" + return "medium" + + def _domain_band(self, entry: dict[str, Any]) -> str: + """Classify source authority as 'arxiv', 'ssrn', 'substack', or 'news'.""" + source_type = entry.get("source_type") or "" + if source_type in _DOMAIN_BAND_BY_SOURCE_TYPE: + return _DOMAIN_BAND_BY_SOURCE_TYPE[source_type] + + source = (entry.get("source") or "").lower() + for substring, band in _URL_DOMAIN_PATTERNS: + if substring in source: + return band + + # text ingests and unknown URL sources default to "news" (lowest authority). + return "news" + + def _code_present(self, entry: dict[str, Any]) -> bool: + """Return True if the entry's markdown contains code or math blocks.""" + markdown = entry.get("markdown") or entry.get("full_context") or "" + return bool(_CODE_PATTERN.search(markdown)) diff --git a/qm_mcp/ingest.py b/qm_mcp/ingest.py index 334a7de..67f636a 100644 --- a/qm_mcp/ingest.py +++ b/qm_mcp/ingest.py @@ -27,6 +27,7 @@ load_secrets, ) from qm_mcp.embed import embed_text, summarize_markdown +from qm_mcp.grpo_suitability import GrpoSuitabilityScorer from qm_mcp.store import CorpusStore, make_id from quantmind.preprocess.fetch import ( Fetched, @@ -37,6 +38,7 @@ from quantmind.preprocess.format import html_to_markdown, pdf_to_markdown _INGEST_LOG = "ingestion_log.jsonl" +_grpo_scorer = GrpoSuitabilityScorer() # Cap the markdown we persist per item (text only, lives outside git). _MARKDOWN_STORE_LIMIT = 400_000 @@ -71,6 +73,7 @@ def _append_ingestion_log(record: dict[str, Any]) -> None: "source_type": record.get("source_type"), "source": record.get("source"), "ingested_at": record.get("ingested_at"), + "grpo_suitability": record.get("grpo_suitability"), "event": "research.ingest", } with (corpus_dir() / _INGEST_LOG).open("a", encoding="utf-8") as fh: @@ -149,6 +152,14 @@ async def _persist( "markdown": markdown[:_MARKDOWN_STORE_LIMIT], "markdown_chars": len(markdown), "ingested_at": datetime.now(timezone.utc).isoformat(), + "grpo_suitability": _grpo_scorer.score_entry( + { + "source_type": source_type, + "source": source, + "markdown": markdown[:_MARKDOWN_STORE_LIMIT], + "markdown_chars": len(markdown), + } + ), } vector = await asyncio.to_thread(embed_text, embed_blob) diff --git a/qm_mcp/test_grpo_suitability.py b/qm_mcp/test_grpo_suitability.py new file mode 100644 index 0000000..82af840 --- /dev/null +++ b/qm_mcp/test_grpo_suitability.py @@ -0,0 +1,245 @@ +"""Tests for GrpoSuitabilityScorer (qm_mcp.grpo_suitability).""" + +from __future__ import annotations + +import pytest + +from qm_mcp.grpo_suitability import GrpoSuitabilityScorer + +SCORER = GrpoSuitabilityScorer() + +# ── fixtures ────────────────────────────────────────────────────────────────── + +CODE_BLOCK = "```python\nx = 1\n```" +SHORT_TEXT = "Short text." +LONG_TEXT = "word " * 5_000 # 25 000 chars + + +def _make_entry( + source_type: str = "arxiv", + source: str = "2606.25996", + markdown: str | None = None, + markdown_chars: int | None = None, +) -> dict: + entry: dict = {"source_type": source_type, "source": source} + if markdown is not None: + entry["markdown"] = markdown + if markdown_chars is not None: + entry["markdown_chars"] = markdown_chars + return entry + + +# ── heuristic correctness ───────────────────────────────────────────────────── + + +class TestHighSuitability: + def test_long_arxiv_with_code_is_high(self) -> None: + entry = _make_entry( + source_type="arxiv", + markdown=LONG_TEXT + "\n" + CODE_BLOCK, + markdown_chars=len(LONG_TEXT) + len(CODE_BLOCK), + ) + assert SCORER.score_entry(entry) == "high" + + def test_long_local_pdf_with_code_is_high(self) -> None: + # local source_type maps to "arxiv" domain band (academic PDFs) + entry = _make_entry( + source_type="local", + source="/home/user/paper.pdf", + markdown=LONG_TEXT + "\n" + CODE_BLOCK, + markdown_chars=len(LONG_TEXT) + len(CODE_BLOCK), + ) + assert SCORER.score_entry(entry) == "high" + + def test_long_arxiv_without_code_is_medium(self) -> None: + entry = _make_entry( + source_type="arxiv", + markdown=LONG_TEXT, + markdown_chars=len(LONG_TEXT), + ) + assert SCORER.score_entry(entry) == "medium" + + +class TestLowSuitability: + def test_short_news_no_code_is_low(self) -> None: + entry = _make_entry( + source_type="url", + source="https://reuters.com/article/xyz", + markdown=SHORT_TEXT, + markdown_chars=len(SHORT_TEXT), + ) + assert SCORER.score_entry(entry) == "low" + + def test_short_text_ingest_no_code_is_low(self) -> None: + entry = _make_entry( + source_type="text", + source="text:abcdef123456", + markdown=SHORT_TEXT, + markdown_chars=len(SHORT_TEXT), + ) + assert SCORER.score_entry(entry) == "low" + + def test_short_news_with_code_is_not_low(self) -> None: + # code_present breaks the low rule → bumps to medium + entry = _make_entry( + source_type="url", + source="https://reuters.com/article/xyz", + markdown=SHORT_TEXT + "\n" + CODE_BLOCK, + markdown_chars=len(SHORT_TEXT) + len(CODE_BLOCK), + ) + assert SCORER.score_entry(entry) != "low" + + +class TestMediumSuitability: + def test_medium_length_arxiv_with_code_is_medium(self) -> None: + medium_text = "word " * 2_500 # 12 500 chars + entry = _make_entry( + source_type="arxiv", + markdown=medium_text + CODE_BLOCK, + markdown_chars=len(medium_text) + len(CODE_BLOCK), + ) + assert SCORER.score_entry(entry) == "medium" + + def test_ssrn_url_is_medium_not_high(self) -> None: + entry = _make_entry( + source_type="url", + source="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=1234", + markdown=LONG_TEXT + CODE_BLOCK, + markdown_chars=len(LONG_TEXT) + len(CODE_BLOCK), + ) + # ssrn domain band is not "arxiv" → can't be high + assert SCORER.score_entry(entry) == "medium" + + def test_substack_url_is_medium(self) -> None: + entry = _make_entry( + source_type="url", + source="https://example.substack.com/p/post", + markdown=LONG_TEXT + CODE_BLOCK, + markdown_chars=len(LONG_TEXT) + len(CODE_BLOCK), + ) + assert SCORER.score_entry(entry) == "medium" + + +# ── domain band edge cases ──────────────────────────────────────────────────── + + +class TestDomainBand: + def test_arxiv_url_resolves_to_arxiv_band(self) -> None: + entry = _make_entry( + source_type="url", + source="https://arxiv.org/abs/2606.25996", + markdown=LONG_TEXT + CODE_BLOCK, + markdown_chars=len(LONG_TEXT) + len(CODE_BLOCK), + ) + assert SCORER.score_entry(entry) == "high" + + def test_unknown_url_defaults_to_news_band(self) -> None: + entry = _make_entry( + source_type="url", + source="https://somesite.example.com/article", + markdown=SHORT_TEXT, + markdown_chars=len(SHORT_TEXT), + ) + assert SCORER.score_entry(entry) == "low" + + +# ── code-presence detection ─────────────────────────────────────────────────── + + +class TestCodePresent: + def test_fenced_code_block_detected(self) -> None: + entry = _make_entry( + source_type="arxiv", markdown=LONG_TEXT + "\n```\nx\n```" + ) + assert SCORER._code_present(entry) is True + + def test_math_block_detected(self) -> None: + entry = _make_entry( + source_type="arxiv", markdown=LONG_TEXT + "\n$$E=mc^2$$" + ) + assert SCORER._code_present(entry) is True + + def test_no_code_block(self) -> None: + entry = _make_entry(source_type="arxiv", markdown=LONG_TEXT) + assert SCORER._code_present(entry) is False + + def test_short_inline_code_ignored(self) -> None: + # inline code < 4 chars does not count + entry = _make_entry(source_type="arxiv", markdown=LONG_TEXT + " `x` ") + assert SCORER._code_present(entry) is False + + +# ── backward compatibility ──────────────────────────────────────────────────── + + +class TestBackwardCompatibility: + def test_entry_without_markdown_chars_field(self) -> None: + """Entries without markdown_chars fall back to len(markdown).""" + entry = { + "source_type": "arxiv", + "source": "1234.5678", + "markdown": LONG_TEXT, + } + result = SCORER.score_entry(entry) + assert result in {"high", "medium", "low"} + + def test_minimal_entry_only_source_type(self) -> None: + """score_entry must not raise on a minimal entry with no markdown.""" + entry = {"source_type": "arxiv"} + result = SCORER.score_entry(entry) + assert result in {"high", "medium", "low"} + + def test_entry_without_grpo_field_does_not_raise(self) -> None: + """Existing corpus items that lack grpo_suitability are not broken.""" + entry = { + "id": "abc123", + "source_type": "arxiv", + "source": "2606.25996", + "title": "Some Paper", + # no grpo_suitability key — backward-compatible + } + result = SCORER.score_entry(entry) + assert result in {"high", "medium", "low"} + + def test_full_context_fallback_when_no_markdown(self) -> None: + """Scorer falls back to full_context if markdown is absent.""" + entry = { + "source_type": "arxiv", + "full_context": LONG_TEXT + CODE_BLOCK, + "markdown_chars": len(LONG_TEXT) + len(CODE_BLOCK), + } + assert SCORER.score_entry(entry) == "high" + + +# ── idempotency ─────────────────────────────────────────────────────────────── + + +class TestIdempotency: + @pytest.mark.parametrize( + "entry", + [ + _make_entry( + source_type="arxiv", + markdown=LONG_TEXT + CODE_BLOCK, + markdown_chars=len(LONG_TEXT) + len(CODE_BLOCK), + ), + _make_entry( + source_type="url", + source="https://reuters.com/x", + markdown=SHORT_TEXT, + markdown_chars=len(SHORT_TEXT), + ), + _make_entry( + source_type="url", + source="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=9", + markdown=LONG_TEXT, + markdown_chars=len(LONG_TEXT), + ), + ], + ) + def test_same_entry_scored_twice_gives_same_result( + self, entry: dict + ) -> None: + first = SCORER.score_entry(entry) + second = SCORER.score_entry(entry) + assert first == second From 4b86b8db2bbb1bfa530082410857d647cce75c9d Mon Sep 17 00:00:00 2001 From: Thomas Adair Date: Thu, 25 Jun 2026 17:45:36 -0700 Subject: [PATCH 4/4] chore(ci): move cov-fail-under=75 from addopts to verify.sh Keeps the coverage floor enforced by CI (scripts/verify.sh) while allowing sub-package test suites (e.g. qm_mcp/) to run standalone without a false failure when quantmind code is not exercised. Co-Authored-By: Claude Sonnet 4.6 --- pyproject.toml | 4 +++- scripts/verify.sh | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 50d6fe9..1ffbe98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -199,9 +199,11 @@ testpaths = ["tests"] addopts = [ "--cov=quantmind", "--cov-report=term-missing", - "--cov-fail-under=75", "-ra", ] +# Coverage floor is enforced by scripts/verify.sh (--cov-fail-under=75). +# Keeping it out of addopts lets sub-package test suites (e.g. qm_mcp/) run +# standalone without a false failure when quantmind code is not exercised. asyncio_mode = "auto" [tool.coverage.run] diff --git a/scripts/verify.sh b/scripts/verify.sh index a7b5a7a..44ac854 100755 --- a/scripts/verify.sh +++ b/scripts/verify.sh @@ -34,7 +34,7 @@ echo "==> [4/5] lint-imports" lint-imports echo "==> [5/5] pytest --cov" -pytest +pytest --cov-fail-under=75 echo echo "[OK] verify loop passed"