From dc487914a11b648533aaddd6243aab80df929781 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 15 Apr 2026 12:17:45 +0100 Subject: [PATCH 01/17] WIP shard sqlite cache --- mypy/build.py | 5 +++- mypy/metastore.py | 72 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 4e9480d8d3ef..f2a22f498fb9 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1818,8 +1818,11 @@ def exclude_from_backups(target_dir: str) -> None: def create_metastore(options: Options, parallel_worker: bool) -> MetadataStore: """Create the appropriate metadata store.""" if options.sqlite_cache: + num_shards = max(options.num_workers, 1) mds: MetadataStore = SqliteMetadataStore( - _cache_dir_prefix(options), set_journal_mode=not parallel_worker + _cache_dir_prefix(options), + set_journal_mode=not parallel_worker, + num_shards=num_shards, ) else: mds = FilesystemMetadataStore(_cache_dir_prefix(options)) diff --git a/mypy/metastore.py b/mypy/metastore.py index 23ca8e921a33..af4caadac9aa 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -11,6 +11,7 @@ from __future__ import annotations import binascii +import hashlib import os import time from abc import abstractmethod @@ -168,24 +169,54 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection: return db +def _stable_hash(s: str) -> int: + """A deterministic hash, consistent across processes (unlike built-in hash()).""" + return int.from_bytes(hashlib.md5(s.encode("utf-8")).digest()[:4], "little") + + class SqliteMetadataStore(MetadataStore): - def __init__(self, cache_dir_prefix: str, set_journal_mode: bool = False) -> None: + def __init__( + self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1 + ) -> None: # We check startswith instead of equality because the version # will have already been appended by the time the cache dir is # passed here. - self.db = None + self.dbs: list[sqlite3.Connection] = [] + self.num_shards = num_shards if cache_dir_prefix.startswith(os.devnull): return os.makedirs(cache_dir_prefix, exist_ok=True) - self.db = connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode) + if num_shards <= 1: + self.dbs.append( + connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode) + ) + else: + for i in range(num_shards): + self.dbs.append( + connect_db( + os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode + ) + ) + # Track which shards have been written to since last commit. + self.dirty_shards: set[int] = set() + + def _db_for(self, name: str) -> sqlite3.Connection: + if not self.dbs: + raise FileNotFoundError() + if self.num_shards <= 1: + return self.dbs[0] + return self.dbs[_stable_hash(name) % self.num_shards] + + def _shard_index(self, name: str) -> int: + if self.num_shards <= 1: + return 0 + return _stable_hash(name) % self.num_shards def _query(self, name: str, field: str) -> Any: # Raises FileNotFound for consistency with the file system version - if not self.db: - raise FileNotFoundError() - - cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,)) + db = self._db_for(name) + cur = db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,)) results = cur.fetchall() if not results: raise FileNotFoundError() @@ -205,39 +236,40 @@ def read(self, name: str) -> bytes: def write(self, name: str, data: bytes, mtime: float | None = None) -> bool: import sqlite3 - if not self.db: + if not self.dbs: return False try: if mtime is None: mtime = time.time() - self.db.execute( + db = self._db_for(name) + db.execute( "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)", (name, mtime, data), ) + self.dirty_shards.add(self._shard_index(name)) except sqlite3.OperationalError: return False return True def remove(self, name: str) -> None: - if not self.db: - raise FileNotFoundError() - - self.db.execute("DELETE FROM files2 WHERE path = ?", (name,)) + db = self._db_for(name) + db.execute("DELETE FROM files2 WHERE path = ?", (name,)) + self.dirty_shards.add(self._shard_index(name)) def commit(self) -> None: - if self.db: - self.db.commit() + for i in self.dirty_shards: + self.dbs[i].commit() + self.dirty_shards.clear() def list_all(self) -> Iterable[str]: - if self.db: - for row in self.db.execute("SELECT path FROM files2"): + for db in self.dbs: + for row in db.execute("SELECT path FROM files2"): yield row[0] def close(self) -> None: - if self.db: - db = self.db - self.db = None + for db in self.dbs: db.close() + self.dbs.clear() def __del__(self) -> None: self.close() From 93bbeb9b06efa1ea5a534d6c195fb39d671e08a9 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 15 Apr 2026 12:52:54 +0100 Subject: [PATCH 02/17] Fix missing commit --- mypy/build.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mypy/build.py b/mypy/build.py index f2a22f498fb9..405e3e76617f 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -4427,6 +4427,10 @@ def find_stale_sccs( def process_graph(graph: Graph, manager: BuildManager) -> None: """Process everything in dependency order.""" + if manager.workers: + # Commit any cache writes from graph loading before workers try to read them. + manager.commit() + # Broadcast graph to workers before computing SCCs to save a bit of time. # TODO: check if we can optimize by sending only part of the graph needed for given SCC. # For example only send modules in the SCC and their dependencies. From 494c8adbe59688bb9610b88a198a0ec726f5fdfe Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 15 Apr 2026 13:09:53 +0100 Subject: [PATCH 03/17] WIP use autocommit --- mypy/metastore.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index af4caadac9aa..421fb960a151 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -155,10 +155,17 @@ def close(self) -> None: """ -def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection: +def connect_db(db_file: str, set_journal_mode: bool, autocommit: bool = False) -> sqlite3.Connection: import sqlite3.dbapi2 - db = sqlite3.dbapi2.connect(db_file, check_same_thread=False) + db = sqlite3.dbapi2.connect( + db_file, + check_same_thread=False, + # With autocommit, each statement is its own transaction, so the write lock + # is held only for microseconds per INSERT. This avoids long lock holds that + # block other workers writing to the same shard. + isolation_level=None if autocommit else "DEFERRED", + ) # This is a bit unfortunate (as we may get corrupt cache after e.g. Ctrl + C), # but without this flag, commits are *very* slow, especially when using HDDs, # see https://www.sqlite.org/faq.html#q19 for details. @@ -187,6 +194,9 @@ def __init__( return os.makedirs(cache_dir_prefix, exist_ok=True) + # Use autocommit when sharded so each INSERT is its own transaction, + # minimizing lock hold time across workers. + autocommit = num_shards > 1 if num_shards <= 1: self.dbs.append( connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode) @@ -195,7 +205,9 @@ def __init__( for i in range(num_shards): self.dbs.append( connect_db( - os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode + os_path_join(cache_dir_prefix, f"cache.{i}.db"), + set_journal_mode, + autocommit=autocommit, ) ) # Track which shards have been written to since last commit. From fbcea0e0e3f7e9839451d1d9d322cfba6a850257 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 15 Apr 2026 13:19:14 +0100 Subject: [PATCH 04/17] Strip extension --- mypy/metastore.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index 421fb960a151..abc30f0fa6ae 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -181,6 +181,25 @@ def _stable_hash(s: str) -> int: return int.from_bytes(hashlib.md5(s.encode("utf-8")).digest()[:4], "little") +def _cache_stem(name: str) -> str: + """Extract the canonical module stem from a cache file path. + + All cache files for a module share a common prefix (the stem): + foo/bar/baz.meta.ff, foo/bar/baz.data.ff, foo/bar/baz.meta_ex.ff, etc. + For packages: foo/bar/__init__.meta.ff -> foo/bar/__init__ + + Global files like @deps.meta.json -> @deps + """ + # Split at first '.' in the basename to get the stem. + # E.g. "foo/bar/baz.meta.ff" -> "foo/bar/baz" + # "foo/bar/__init__.data.ff" -> "foo/bar/__init__" + # "@deps.meta.json" -> "@deps" + dot = name.find(".") + if dot == -1: + return name + return name[:dot] + + class SqliteMetadataStore(MetadataStore): def __init__( self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1 @@ -218,12 +237,12 @@ def _db_for(self, name: str) -> sqlite3.Connection: raise FileNotFoundError() if self.num_shards <= 1: return self.dbs[0] - return self.dbs[_stable_hash(name) % self.num_shards] + return self.dbs[_stable_hash(_cache_stem(name)) % self.num_shards] def _shard_index(self, name: str) -> int: if self.num_shards <= 1: return 0 - return _stable_hash(name) % self.num_shards + return _stable_hash(_cache_stem(name)) % self.num_shards def _query(self, name: str, field: str) -> Any: # Raises FileNotFound for consistency with the file system version From c176f29dd63a5c3c4cd26c9054580cd6185e8727 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 15 Apr 2026 13:40:42 +0100 Subject: [PATCH 05/17] One commit per module (instead of per insertion, or per SCC) --- mypy/build.py | 18 +++++++++++++++++- mypy/metastore.py | 30 +++++++++++++++++------------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 405e3e76617f..cbda9ae92051 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1273,6 +1273,10 @@ def commit(self) -> None: self.metastore.commit() self.add_stats(cache_commit_time=time.time() - t0) + def commit_module(self, meta_file: str) -> None: + """Commit cache writes for a single module (identified by its meta file path).""" + self.metastore.commit_path(meta_file) + def verbosity(self) -> int: return self.options.verbosity @@ -4682,6 +4686,8 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: t4 = time.time() # Flush errors, and write cache in two phases: first data files, then meta files. + # The two-phase structure is needed because meta.dep_hashes references interface_hash + # values from other modules in the SCC, which are updated by write_cache(). meta_tuples = {} errors_by_id = {} for id in stale: @@ -4692,7 +4698,11 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: ) manager.flush_errors(manager.errors.simplify_path(graph[id].xpath), formatted, False) errors_by_id[id] = errors - meta_tuples[id] = graph[id].write_cache() + meta_tuple = graph[id].write_cache() + meta_tuples[id] = meta_tuple + # Commit data file write immediately to avoid holding shard locks across modules. + if meta_tuple is not None: + manager.commit_module(meta_tuple[1]) for id in stale: meta_tuple = meta_tuples[id] if meta_tuple is None: @@ -4716,6 +4726,7 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: error_lines=errors_by_id.get(id, []), ) write_cache_meta_ex(meta_file, meta_ex, manager) + manager.commit_module(meta_file) manager.done_sccs.add(ascc.id) manager.add_stats( load_missing_time=t1 - t0, @@ -4768,6 +4779,9 @@ def process_stale_scc_interface( for id in stale: meta_tuple = graph[id].write_cache() meta_tuples[id] = meta_tuple + # Commit data file write immediately to avoid holding shard locks across modules. + if meta_tuple is not None: + manager.commit_module(meta_tuple[1]) for id in stale: meta_tuple = meta_tuples[id] if meta_tuple is None: @@ -4780,6 +4794,7 @@ def process_stale_scc_interface( if state.priorities.get(dep) != PRI_INDIRECT ] write_cache_meta(meta, manager, meta_file) + manager.commit_module(meta_file) scc_result.append((id, ModuleResult(graph[id].interface_hash.hex(), []), meta_file)) manager.done_sccs.add(ascc.id) manager.add_stats( @@ -4859,6 +4874,7 @@ def process_stale_scc_implementation( # If there are no errors, only write the cache, don't send anything back # to the caller (as a micro-optimization). write_cache_meta_ex(meta_file, meta_ex, manager) + manager.commit_module(meta_file) manager.add_stats(type_check_time_implementation=time.time() - t0) return scc_result diff --git a/mypy/metastore.py b/mypy/metastore.py index abc30f0fa6ae..f9e526da0d11 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -66,6 +66,14 @@ def commit(self) -> None: called. """ + def commit_path(self, name: str) -> None: + """Commit changes related to a specific cache path. + + For sharded stores, this commits only the shard containing the path. + Default implementation commits everything. + """ + self.commit() + @abstractmethod def list_all(self) -> Iterable[str]: ... @@ -155,17 +163,10 @@ def close(self) -> None: """ -def connect_db(db_file: str, set_journal_mode: bool, autocommit: bool = False) -> sqlite3.Connection: +def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection: import sqlite3.dbapi2 - db = sqlite3.dbapi2.connect( - db_file, - check_same_thread=False, - # With autocommit, each statement is its own transaction, so the write lock - # is held only for microseconds per INSERT. This avoids long lock holds that - # block other workers writing to the same shard. - isolation_level=None if autocommit else "DEFERRED", - ) + db = sqlite3.dbapi2.connect(db_file, check_same_thread=False) # This is a bit unfortunate (as we may get corrupt cache after e.g. Ctrl + C), # but without this flag, commits are *very* slow, especially when using HDDs, # see https://www.sqlite.org/faq.html#q19 for details. @@ -213,9 +214,6 @@ def __init__( return os.makedirs(cache_dir_prefix, exist_ok=True) - # Use autocommit when sharded so each INSERT is its own transaction, - # minimizing lock hold time across workers. - autocommit = num_shards > 1 if num_shards <= 1: self.dbs.append( connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode) @@ -226,7 +224,6 @@ def __init__( connect_db( os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode, - autocommit=autocommit, ) ) # Track which shards have been written to since last commit. @@ -292,6 +289,13 @@ def commit(self) -> None: self.dbs[i].commit() self.dirty_shards.clear() + def commit_path(self, name: str) -> None: + with record_time("sqlite.commit_path"): + i = self._shard_index(name) + if i in self.dirty_shards: + self.dbs[i].commit() + self.dirty_shards.discard(i) + def list_all(self) -> Iterable[str]: for db in self.dbs: for row in db.execute("SELECT path FROM files2"): From 0bbb7ed988991fadddcc1ffe2cc3a348359843c0 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 16 Apr 2026 08:39:28 +0100 Subject: [PATCH 06/17] Fix sharding issue --- mypy/test/test_diff_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mypy/test/test_diff_cache.py b/mypy/test/test_diff_cache.py index dac10e9e40eb..e2de71b47bae 100644 --- a/mypy/test/test_diff_cache.py +++ b/mypy/test/test_diff_cache.py @@ -152,8 +152,8 @@ def test_diff_cache_produces_valid_json(self) -> None: def read_all(cache_dir: str) -> dict[str, bytes]: store = SqliteMetadataStore(cache_dir) result = {name: store.read(name) for name in store.list_all()} - assert store.db is not None - store.db.close() + for db in store.dbs: + db.close() return result before = read_all(patched_ver) From 13ba5622db241ec4ce2d35e18a6f7ed9f1ba0442 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Thu, 16 Apr 2026 10:22:12 +0100 Subject: [PATCH 07/17] Use a mypyc optimized hashing algorithm --- mypy/metastore.py | 49 +++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index f9e526da0d11..fbff38a3d55b 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -11,13 +11,14 @@ from __future__ import annotations import binascii -import hashlib import os import time from abc import abstractmethod from collections.abc import Iterable from typing import TYPE_CHECKING, Any +from mypy_extensions import i64 + from mypy.util import os_path_join if TYPE_CHECKING: @@ -177,28 +178,30 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection: return db -def _stable_hash(s: str) -> int: - """A deterministic hash, consistent across processes (unlike built-in hash()).""" - return int.from_bytes(hashlib.md5(s.encode("utf-8")).digest()[:4], "little") - - -def _cache_stem(name: str) -> str: - """Extract the canonical module stem from a cache file path. - - All cache files for a module share a common prefix (the stem): - foo/bar/baz.meta.ff, foo/bar/baz.data.ff, foo/bar/baz.meta_ex.ff, etc. - For packages: foo/bar/__init__.meta.ff -> foo/bar/__init__ +def hash_path_stem(s: str) -> int: + """Hash the stem of a cache file path (everything before the first dot in the basename). - Global files like @deps.meta.json -> @deps + This is a combined stem-extraction + hash function optimized for mypyc compilation. + Uses only integer arithmetic, avoiding intermediate string allocations. """ - # Split at first '.' in the basename to get the stem. - # E.g. "foo/bar/baz.meta.ff" -> "foo/bar/baz" - # "foo/bar/__init__.data.ff" -> "foo/bar/__init__" - # "@deps.meta.json" -> "@deps" - dot = name.find(".") - if dot == -1: - return name - return name[:dot] + # First find end of stem (scanning backwards, stop at first dot after last separator) + i = len(s) - 1 + end: i64 = i + while i >= 0: + c: i64 = ord(s[i]) + if c == ord("/") or c == ord("\\"): + break + if c == ord("."): + end = i + i -= 1 + # Calculate hash + hv: i64 = 123 + i = end + while i >= 0: + c = i64(ord(s[i])) + hv = (hv * 33) ^ c + i -= 1 + return (hv ^ (hv >> 16) ^ (hv >> 32) ^ (hv >> 48)) & 0xFFFFFF class SqliteMetadataStore(MetadataStore): @@ -234,12 +237,12 @@ def _db_for(self, name: str) -> sqlite3.Connection: raise FileNotFoundError() if self.num_shards <= 1: return self.dbs[0] - return self.dbs[_stable_hash(_cache_stem(name)) % self.num_shards] + return self.dbs[hash_path_stem(name) % self.num_shards] def _shard_index(self, name: str) -> int: if self.num_shards <= 1: return 0 - return _stable_hash(_cache_stem(name)) % self.num_shards + return hash_path_stem(name) % self.num_shards def _query(self, name: str, field: str) -> Any: # Raises FileNotFound for consistency with the file system version From 224d5bbfe202b5527eeea8bbeb34f0efd74568f2 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Fri, 17 Apr 2026 13:13:48 +0100 Subject: [PATCH 08/17] Improve hash used for sharding --- mypy/metastore.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index fbff38a3d55b..b851ed24f099 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -201,7 +201,14 @@ def hash_path_stem(s: str) -> int: c = i64(ord(s[i])) hv = (hv * 33) ^ c i -= 1 - return (hv ^ (hv >> 16) ^ (hv >> 32) ^ (hv >> 48)) & 0xFFFFFF + # Murmur3 finalizer for better bit avalanche (improves shard uniformity) + hv = (hv ^ (hv >> 32)) & 0xFFFFFFFF + hv ^= hv >> 16 + hv = (hv * 0x85EBCA6B) & 0xFFFFFFFF + hv ^= hv >> 13 + hv = (hv * 0xC2B2AE35) & 0xFFFFFFFF + hv ^= hv >> 16 + return int(hv) class SqliteMetadataStore(MetadataStore): From 1e45e93e3a7071bf52f3c773dc7f3f111b8a5209 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:17:36 +0100 Subject: [PATCH 09/17] Default to 16 shards and make shard count configurable (hidden option) --- mypy/build.py | 3 +-- mypy/main.py | 7 +++++++ mypy/options.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index cbda9ae92051..b69554bb5758 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1822,11 +1822,10 @@ def exclude_from_backups(target_dir: str) -> None: def create_metastore(options: Options, parallel_worker: bool) -> MetadataStore: """Create the appropriate metadata store.""" if options.sqlite_cache: - num_shards = max(options.num_workers, 1) mds: MetadataStore = SqliteMetadataStore( _cache_dir_prefix(options), set_journal_mode=not parallel_worker, - num_shards=num_shards, + num_shards=options.sqlite_num_shards, ) else: mds = FilesystemMetadataStore(_cache_dir_prefix(options)) diff --git a/mypy/main.py b/mypy/main.py index e90ee961fc70..caf9acd584cd 100644 --- a/mypy/main.py +++ b/mypy/main.py @@ -1077,6 +1077,13 @@ def add_invertible_flag( help="Use a sqlite database to store the cache", group=incremental_group, ) + incremental_group.add_argument( + "--sqlite-num-shards", + type=int, + default=16, + dest="sqlite_num_shards", + help=argparse.SUPPRESS, + ) incremental_group.add_argument( "--cache-fine-grained", action="store_true", diff --git a/mypy/options.py b/mypy/options.py index 81fd88345a43..1e794db0b1d5 100644 --- a/mypy/options.py +++ b/mypy/options.py @@ -302,6 +302,7 @@ def __init__(self) -> None: self.incremental = True self.cache_dir = defaults.CACHE_DIR self.sqlite_cache = True + self.sqlite_num_shards = 16 self.fixed_format_cache = True self.debug_cache = False self.skip_version_check = False From a2aa9e8906fe526bdda9ac716fbb63fe9b57e1ca Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:21:39 +0100 Subject: [PATCH 10/17] Refactor a bit --- mypy/metastore.py | 42 ++---------------------------------------- mypy/util.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index b851ed24f099..2e62c111a406 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -17,9 +17,7 @@ from collections.abc import Iterable from typing import TYPE_CHECKING, Any -from mypy_extensions import i64 - -from mypy.util import os_path_join +from mypy.util import hash_path_stem, os_path_join if TYPE_CHECKING: # We avoid importing sqlite3 unless we are using it so we can mostly work @@ -178,39 +176,6 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection: return db -def hash_path_stem(s: str) -> int: - """Hash the stem of a cache file path (everything before the first dot in the basename). - - This is a combined stem-extraction + hash function optimized for mypyc compilation. - Uses only integer arithmetic, avoiding intermediate string allocations. - """ - # First find end of stem (scanning backwards, stop at first dot after last separator) - i = len(s) - 1 - end: i64 = i - while i >= 0: - c: i64 = ord(s[i]) - if c == ord("/") or c == ord("\\"): - break - if c == ord("."): - end = i - i -= 1 - # Calculate hash - hv: i64 = 123 - i = end - while i >= 0: - c = i64(ord(s[i])) - hv = (hv * 33) ^ c - i -= 1 - # Murmur3 finalizer for better bit avalanche (improves shard uniformity) - hv = (hv ^ (hv >> 32)) & 0xFFFFFFFF - hv ^= hv >> 16 - hv = (hv * 0x85EBCA6B) & 0xFFFFFFFF - hv ^= hv >> 13 - hv = (hv * 0xC2B2AE35) & 0xFFFFFFFF - hv ^= hv >> 16 - return int(hv) - - class SqliteMetadataStore(MetadataStore): def __init__( self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1 @@ -231,10 +196,7 @@ def __init__( else: for i in range(num_shards): self.dbs.append( - connect_db( - os_path_join(cache_dir_prefix, f"cache.{i}.db"), - set_journal_mode, - ) + connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode) ) # Track which shards have been written to since last commit. self.dirty_shards: set[int] = set() diff --git a/mypy/util.py b/mypy/util.py index 5c51881452ab..a48a7ea824da 100644 --- a/mypy/util.py +++ b/mypy/util.py @@ -14,6 +14,8 @@ from importlib import resources as importlib_resources from typing import IO, Any, Final, Literal, TypeVar +from mypy_extensions import i64 + orjson: Any try: import orjson # type: ignore[import-not-found, no-redef, unused-ignore] @@ -1009,3 +1011,36 @@ def get_available_threads() -> int: available_threads = cpu_count _AVAILABLE_THREADS = available_threads return available_threads + + +def hash_path_stem(s: str) -> int: + """Hash the stem of a cache file path (everything before the first dot in the basename). + + This is a combined stem-extraction + hash function optimized for mypyc compilation. + Uses only integer arithmetic, avoiding intermediate string allocations. + """ + # First find end of stem (scanning backwards, stop at first dot after last separator) + i = len(s) - 1 + end: i64 = i + while i >= 0: + c: i64 = ord(s[i]) + if c == ord("/") or c == ord("\\"): + break + if c == ord("."): + end = i + i -= 1 + # Calculate hash + hv: i64 = 123 + i = end + while i >= 0: + c = i64(ord(s[i])) + hv = (hv * 33) ^ c + i -= 1 + # Murmur3 finalizer for better bit avalanche (improves shard uniformity) + hv = (hv ^ (hv >> 32)) & 0xFFFFFFFF + hv ^= hv >> 16 + hv = (hv * 0x85EBCA6B) & 0xFFFFFFFF + hv ^= hv >> 13 + hv = (hv * 0xC2B2AE35) & 0xFFFFFFFF + hv ^= hv >> 16 + return int(hv) From 76602c31c9c2c70934cbc01fe379d99ebbd2a3e8 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:22:21 +0100 Subject: [PATCH 11/17] Fix --- mypy/metastore.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index 2e62c111a406..f3d860a12be4 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -262,11 +262,10 @@ def commit(self) -> None: self.dirty_shards.clear() def commit_path(self, name: str) -> None: - with record_time("sqlite.commit_path"): - i = self._shard_index(name) - if i in self.dirty_shards: - self.dbs[i].commit() - self.dirty_shards.discard(i) + i = self._shard_index(name) + if i in self.dirty_shards: + self.dbs[i].commit() + self.dirty_shards.discard(i) def list_all(self) -> Iterable[str]: for db in self.dbs: From 4c40915117f30e75b16c466b42e301ad49ccc894 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:24:50 +0100 Subject: [PATCH 12/17] Fix --- mypy/metastore.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index f3d860a12be4..d0e5c3748317 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -185,6 +185,7 @@ def __init__( # passed here. self.dbs: list[sqlite3.Connection] = [] self.num_shards = num_shards + self.dirty_shards: set[int] = set() if cache_dir_prefix.startswith(os.devnull): return @@ -198,8 +199,6 @@ def __init__( self.dbs.append( connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode) ) - # Track which shards have been written to since last commit. - self.dirty_shards: set[int] = set() def _db_for(self, name: str) -> sqlite3.Connection: if not self.dbs: From 3d76f07bf143f115abc9a0ff870412fe2f122008 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:42:57 +0100 Subject: [PATCH 13/17] Polish --- mypy/metastore.py | 12 +++++------- mypy/test/test_diff_cache.py | 3 +-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/mypy/metastore.py b/mypy/metastore.py index d0e5c3748317..c48ac78bdcb7 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -200,18 +200,16 @@ def __init__( connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode) ) - def _db_for(self, name: str) -> sqlite3.Connection: - if not self.dbs: - raise FileNotFoundError() - if self.num_shards <= 1: - return self.dbs[0] - return self.dbs[hash_path_stem(name) % self.num_shards] - def _shard_index(self, name: str) -> int: if self.num_shards <= 1: return 0 return hash_path_stem(name) % self.num_shards + def _db_for(self, name: str) -> sqlite3.Connection: + if not self.dbs: + raise FileNotFoundError() + return self.dbs[self._shard_index(name)] + def _query(self, name: str, field: str) -> Any: # Raises FileNotFound for consistency with the file system version db = self._db_for(name) diff --git a/mypy/test/test_diff_cache.py b/mypy/test/test_diff_cache.py index e2de71b47bae..3f67a798236e 100644 --- a/mypy/test/test_diff_cache.py +++ b/mypy/test/test_diff_cache.py @@ -152,8 +152,7 @@ def test_diff_cache_produces_valid_json(self) -> None: def read_all(cache_dir: str) -> dict[str, bytes]: store = SqliteMetadataStore(cache_dir) result = {name: store.read(name) for name in store.list_all()} - for db in store.dbs: - db.close() + store.close() return result before = read_all(patched_ver) From 171d8ac50a6035a13714f8f0228e608a62c50ea0 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:52:31 +0100 Subject: [PATCH 14/17] Use final constant for shard count --- mypy/defaults.py | 1 + mypy/main.py | 2 +- mypy/options.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mypy/defaults.py b/mypy/defaults.py index 1b0c7f12374a..d8197e4db00d 100644 --- a/mypy/defaults.py +++ b/mypy/defaults.py @@ -14,6 +14,7 @@ PYTHON3_VERSION_MIN: Final = (3, 10) # Keep in sync with supported target versions CACHE_DIR: Final = ".mypy_cache" +SQLITE_NUM_SHARDS: Final = 16 CONFIG_NAMES: Final = ["mypy.ini", ".mypy.ini"] SHARED_CONFIG_NAMES: Final = ["pyproject.toml", "setup.cfg"] diff --git a/mypy/main.py b/mypy/main.py index caf9acd584cd..99dfa1cfead6 100644 --- a/mypy/main.py +++ b/mypy/main.py @@ -1080,7 +1080,7 @@ def add_invertible_flag( incremental_group.add_argument( "--sqlite-num-shards", type=int, - default=16, + default=defaults.SQLITE_NUM_SHARDS, dest="sqlite_num_shards", help=argparse.SUPPRESS, ) diff --git a/mypy/options.py b/mypy/options.py index 1e794db0b1d5..c8498c2081b4 100644 --- a/mypy/options.py +++ b/mypy/options.py @@ -302,7 +302,7 @@ def __init__(self) -> None: self.incremental = True self.cache_dir = defaults.CACHE_DIR self.sqlite_cache = True - self.sqlite_num_shards = 16 + self.sqlite_num_shards = defaults.SQLITE_NUM_SHARDS self.fixed_format_cache = True self.debug_cache = False self.skip_version_check = False From 0ce0136bdf955ece29a622f04f88cbe071390be9 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:54:10 +0100 Subject: [PATCH 15/17] Update cache diff scripts --- misc/apply-cache-diff.py | 16 +++++++++++----- misc/diff-cache.py | 12 ++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/misc/apply-cache-diff.py b/misc/apply-cache-diff.py index 50de48796ebe..309184b34017 100644 --- a/misc/apply-cache-diff.py +++ b/misc/apply-cache-diff.py @@ -19,19 +19,22 @@ from librt.internal import ReadBuffer from mypy.cache import CacheMeta +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore from mypy.util import json_dumps, json_loads -def make_cache(input_dir: str, sqlite: bool) -> MetadataStore: +def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore: if sqlite: - return SqliteMetadataStore(input_dir) + return SqliteMetadataStore(input_dir, num_shards=num_shards) else: return FilesystemMetadataStore(input_dir) -def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None: - cache = make_cache(cache_dir, sqlite) +def apply_diff( + cache_dir: str, diff_file: str, sqlite: bool = False, num_shards: int = SQLITE_NUM_SHARDS +) -> None: + cache = make_cache(cache_dir, sqlite, num_shards=num_shards) with open(diff_file, "rb") as f: diff = json_loads(f.read()) @@ -63,11 +66,14 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None: def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache") + parser.add_argument( + "--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS + ) parser.add_argument("cache_dir", help="Directory for the cache") parser.add_argument("diff", help="Cache diff file") args = parser.parse_args() - apply_diff(args.cache_dir, args.diff, args.sqlite) + apply_diff(args.cache_dir, args.diff, args.sqlite, num_shards=args.num_shards) if __name__ == "__main__": diff --git a/misc/diff-cache.py b/misc/diff-cache.py index 07a2f416a270..a82abf7382cd 100644 --- a/misc/diff-cache.py +++ b/misc/diff-cache.py @@ -19,13 +19,14 @@ from librt.internal import ReadBuffer, WriteBuffer from mypy.cache import CacheMeta, CacheMetaEx +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore from mypy.util import json_dumps, json_loads -def make_cache(input_dir: str, sqlite: bool) -> MetadataStore: +def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore: if sqlite: - return SqliteMetadataStore(input_dir) + return SqliteMetadataStore(input_dir, num_shards=num_shards) else: return FilesystemMetadataStore(input_dir) @@ -154,13 +155,16 @@ def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity") parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache") + parser.add_argument( + "--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS + ) parser.add_argument("input_dir1", help="Input directory for the original cache") parser.add_argument("input_dir2", help="Input directory for the target cache") parser.add_argument("output", help="Output file with the diff from original cache") args = parser.parse_args() - cache1 = make_cache(args.input_dir1, args.sqlite) - cache2 = make_cache(args.input_dir2, args.sqlite) + cache1 = make_cache(args.input_dir1, args.sqlite, num_shards=args.num_shards) + cache2 = make_cache(args.input_dir2, args.sqlite, num_shards=args.num_shards) type_misses: dict[str, int] = defaultdict(int) type_hits: dict[str, int] = defaultdict(int) From 81170c0e4b3e7f16359355258c0454924e16c034 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 15:57:10 +0100 Subject: [PATCH 16/17] Fix test --- mypy/test/test_diff_cache.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mypy/test/test_diff_cache.py b/mypy/test/test_diff_cache.py index 3f67a798236e..5c632c1153c6 100644 --- a/mypy/test/test_diff_cache.py +++ b/mypy/test/test_diff_cache.py @@ -11,6 +11,7 @@ import time import unittest +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.test.config import PREFIX _MISC_DIR = os.path.join(PREFIX, "misc") @@ -150,7 +151,7 @@ def test_diff_cache_produces_valid_json(self) -> None: from mypy.metastore import SqliteMetadataStore def read_all(cache_dir: str) -> dict[str, bytes]: - store = SqliteMetadataStore(cache_dir) + store = SqliteMetadataStore(cache_dir, num_shards=SQLITE_NUM_SHARDS) result = {name: store.read(name) for name in store.list_all()} store.close() return result From 65f2cb95d69d8d0cddeedad6410fb81e5098ad74 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 17:44:08 +0100 Subject: [PATCH 17/17] Update convert-cache.py --- misc/convert-cache.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/misc/convert-cache.py b/misc/convert-cache.py index 966befeffb68..214a9a82d3c3 100755 --- a/misc/convert-cache.py +++ b/misc/convert-cache.py @@ -15,6 +15,7 @@ import argparse +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore @@ -26,6 +27,13 @@ def main() -> None: default=False, help="Convert to a sqlite cache (default: convert from)", ) + parser.add_argument( + "--num-shards", + type=int, + default=SQLITE_NUM_SHARDS, + dest="num_shards", + help=argparse.SUPPRESS, + ) parser.add_argument( "--output_dir", action="store", @@ -37,17 +45,23 @@ def main() -> None: input_dir = args.input_dir output_dir = args.output_dir or input_dir + num_shards = args.num_shards assert os.path.isdir(output_dir), f"{output_dir} is not a directory" if args.to_sqlite: input: MetadataStore = FilesystemMetadataStore(input_dir) - output: MetadataStore = SqliteMetadataStore(output_dir) + output: MetadataStore = SqliteMetadataStore(output_dir, num_shards=num_shards) else: - fnam = os.path.join(input_dir, "cache.db") - msg = f"{fnam} does not exist" - if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)): - msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)" - assert os.path.isfile(fnam), msg - input, output = SqliteMetadataStore(input_dir), FilesystemMetadataStore(output_dir) + if num_shards <= 1: + db_files = [os.path.join(input_dir, "cache.db")] + else: + db_files = [os.path.join(input_dir, f"cache.{i}.db") for i in range(num_shards)] + for fnam in db_files: + msg = f"{fnam} does not exist" + if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)): + msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)" + assert os.path.isfile(fnam), msg + input = SqliteMetadataStore(input_dir, num_shards=num_shards) + output = FilesystemMetadataStore(output_dir) for s in input.list_all(): if s.endswith((".json", ".ff")):