diff --git a/CHANGELOG.md b/CHANGELOG.md index abc341c09..6a4794330 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Fixes + +- Fixed index corruption that could happen when the same project was opened through two different path spellings — a symlinked checkout, or upper/lowercase variants of one path on a case-insensitive drive (Windows NTFS, or a WSL `/mnt` drive). CodeGraph now recognizes these as the same project and shares a single database connection instead of opening a second one that could corrupt the index. (#1057) + ## [1.1.6] - 2026-06-30 diff --git a/__tests__/root-identity.test.ts b/__tests__/root-identity.test.ts new file mode 100644 index 000000000..7152fe714 --- /dev/null +++ b/__tests__/root-identity.test.ts @@ -0,0 +1,76 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { canonicalRootKey, findNearestCodeGraphRoot } from '../src/directory'; + +/** + * Regression coverage for #1057: the MCP server keyed its open-DB connection + * cache by the resolved-root PATH STRING, so two spellings of one physical repo + * — a symlinked checkout, or a case-variant on a case-insensitive mount (NTFS, + * WSL DrvFs `/mnt/c`) — each opened a SEPARATE SQLite connection to the same + * `.codegraph/codegraph.db` and corrupted the index. + * + * `canonicalRootKey` keys on filesystem identity (dev:ino), which is identical + * for every spelling, so the cache dedupes them onto one connection. The + * symlink case below is the deterministic, filesystem-agnostic proxy for the + * case-insensitive-mount scenario (both produce two path strings for one inode); + * the differing path strings are what defeated the pre-fix cache key, which was + * the un-canonicalized path string. 
+ */ +describe('index root identity (#1057)', () => { + let tmp: string; + + beforeEach(() => { + tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-rootid-')); + }); + afterEach(() => { + fs.rmSync(tmp, { recursive: true, force: true }); + }); + + function makeProject(name: string): string { + const proj = path.join(tmp, name); + fs.mkdirSync(path.join(proj, '.codegraph'), { recursive: true }); + fs.writeFileSync(path.join(proj, '.codegraph', 'codegraph.db'), 'x'); + return proj; + } + + it('gives one identity key to a directory and a symlink that points at it', () => { + const real = makeProject('proj'); + const link = path.join(tmp, 'projLink'); + fs.symlinkSync(real, link); + + // Two distinct path strings for one physical directory... + expect(path.resolve(real)).not.toBe(path.resolve(link)); + // ...but ONE filesystem identity, so the connection cache dedupes them. + expect(canonicalRootKey(link)).toBe(canonicalRootKey(real)); + }); + + it('maps both spellings of a resolved root to one cache identity', () => { + const real = makeProject('proj'); + const link = path.join(tmp, 'projLink'); + fs.symlinkSync(real, link); + + // findNearestCodeGraphRoot resolves each spelling to its own (cased) string, + const fromReal = findNearestCodeGraphRoot(real); + const fromLink = findNearestCodeGraphRoot(link); + expect(fromReal).not.toBeNull(); + expect(fromLink).not.toBeNull(); + + // ...but the connection cache keys on identity, so both converge — which is + // what stops the second SQLite connection that pre-fix corrupted the index. 
+ expect(canonicalRootKey(fromLink!)).toBe(canonicalRootKey(fromReal!)); + }); + + it('keeps distinct projects on distinct identity keys', () => { + const a = makeProject('a'); + const b = makeProject('b'); + expect(canonicalRootKey(a)).not.toBe(canonicalRootKey(b)); + }); + + it('falls back to a stable string key when the root cannot be stat-ed', () => { + const gone = path.join(tmp, 'does-not-exist'); + // No throw, and deterministic for a given input. + expect(canonicalRootKey(gone)).toBe(canonicalRootKey(gone)); + }); +}); diff --git a/src/directory.ts b/src/directory.ts index da5b6e9cb..a98d788d6 100644 --- a/src/directory.ts +++ b/src/directory.ts @@ -155,6 +155,40 @@ export function unsafeIndexRootReason(projectRoot: string): string | null { return null; } +/** + * Resolve `dir` to its `realpathSync` form (symlinks + `.`/`..` collapsed), + * falling back to the input on failure (e.g. a path that vanished mid-call). + * Used as the stat-failure fallback of {@link canonicalRootKey}. + */ +function canonicalizeRoot(dir: string): string { + try { + return fs.realpathSync(dir); + } catch { + return dir; + } +} + +/** + * A stable filesystem-IDENTITY key for an index root: `"<dev>:<ino>"`. Unlike a + * path string — even a realpath'd one — this is identical for EVERY spelling of + * the same physical directory, including a case-variant on a case-insensitive + * mount (Windows NTFS, or WSL's DrvFs `/mnt/c`) where `realpathSync` preserves + * the caller's casing and so cannot dedupe. The MCP server keys its open-DB + * connection cache by this so two spellings of one repo share ONE SQLite + * connection instead of opening a second that corrupts the shared + * `.codegraph/codegraph.db` (#1057, same second-connection mechanism as #238). + * Falls back to the realpath'd path when the directory can't be stat'd (e.g. it + * vanished mid-call), so the key stays usable and stable-enough. 
+ */ +export function canonicalRootKey(root: string): string { + try { + const s = fs.statSync(root, { bigint: true }); /* bigint stats: dev/ino values above Number.MAX_SAFE_INTEGER would lose precision as Number, letting two distinct directories round to the same key */ + return `${s.dev}:${s.ino}`; + } catch { + return canonicalizeRoot(root); + } +} + export function findNearestCodeGraphRoot(startPath: string): string | null { let current = path.resolve(startPath); const root = path.parse(current).root; diff --git a/src/mcp/tools.ts b/src/mcp/tools.ts index ad7c02612..505b27d84 100644 --- a/src/mcp/tools.ts +++ b/src/mcp/tools.ts @@ -6,7 +6,7 @@ import type CodeGraph from '../index'; import type { QueryPool } from './query-pool'; -import { findNearestCodeGraphRoot } from '../directory'; +import { findNearestCodeGraphRoot, canonicalRootKey } from '../directory'; // Lazy-load the heavy CodeGraph chain off the MCP startup path — see the same // helper in engine.ts. ToolHandler must load to answer tools/list (static // schemas), but it must NOT drag in sqlite/query layers before the daemon binds; @@ -1052,6 +1052,16 @@ export class ToolHandler { ); } + // Identity-key the open connection by FILESYSTEM identity (dev:ino), not the + // path string. Two spellings of one repo — a symlinked checkout, or a + // case-variant on a case-insensitive mount (NTFS, or WSL DrvFs `/mnt/c`) — + // must share ONE connection; a second connection to the same + // `.codegraph/codegraph.db` corrupts the index (#1057, same second-connection + // mechanism as #238 below). realpath alone can't dedupe case-variants (it + // preserves the caller's casing), so we key on the inode, which is identical + // for every spelling. + const rootKey = canonicalRootKey(resolvedRoot); + // If the path resolves to the default project, reuse the already-open // default instance rather than opening a SECOND connection to the same DB. // A duplicate connection serializes reads against the watcher's auto-sync @@ -1059,18 +1069,19 @@ export class ToolHandler { // support) that surfaces as intermittent // "database is locked" on concurrent tool calls. See issue #238. 
The // default instance is owned/closed by the server, so it's never cached. - if (this.cg && this.cg.getProjectRoot() === resolvedRoot) { + if (this.cg && canonicalRootKey(this.cg.getProjectRoot()) === rootKey) { return this.freshen(this.cg); } - // Cache the open DB connection by RESOLVED ROOT only — never by the input - // path. One key per instance means closeAll() closes each exactly once, and - // a changed resolution maps to a different entry instead of a stale hit. - const cached = this.projectCache.get(resolvedRoot); + // Cache the open DB connection by ROOT IDENTITY only — never by the input + // path. One key per physical index means closeAll() closes each exactly + // once, and a changed resolution maps to a different entry instead of a + // stale hit. + const cached = this.projectCache.get(rootKey); if (cached) return this.freshen(cached); const cg = loadCodeGraph().openSync(resolvedRoot); - this.projectCache.set(resolvedRoot, cg); + this.projectCache.set(rootKey, cg); return cg; }