diff --git a/misc/apply-cache-diff.py b/misc/apply-cache-diff.py index 50de48796ebe..309184b34017 100644 --- a/misc/apply-cache-diff.py +++ b/misc/apply-cache-diff.py @@ -19,19 +19,22 @@ from librt.internal import ReadBuffer from mypy.cache import CacheMeta +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore from mypy.util import json_dumps, json_loads -def make_cache(input_dir: str, sqlite: bool) -> MetadataStore: +def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore: if sqlite: - return SqliteMetadataStore(input_dir) + return SqliteMetadataStore(input_dir, num_shards=num_shards) else: return FilesystemMetadataStore(input_dir) -def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None: - cache = make_cache(cache_dir, sqlite) +def apply_diff( + cache_dir: str, diff_file: str, sqlite: bool = False, num_shards: int = SQLITE_NUM_SHARDS +) -> None: + cache = make_cache(cache_dir, sqlite, num_shards=num_shards) with open(diff_file, "rb") as f: diff = json_loads(f.read()) @@ -63,11 +66,14 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None: def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache") + parser.add_argument( + "--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS + ) parser.add_argument("cache_dir", help="Directory for the cache") parser.add_argument("diff", help="Cache diff file") args = parser.parse_args() - apply_diff(args.cache_dir, args.diff, args.sqlite) + apply_diff(args.cache_dir, args.diff, args.sqlite, num_shards=args.num_shards) if __name__ == "__main__": diff --git a/misc/convert-cache.py b/misc/convert-cache.py index 966befeffb68..214a9a82d3c3 100755 --- a/misc/convert-cache.py +++ b/misc/convert-cache.py @@ -15,6 +15,7 @@ import argparse +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore @@ -26,6 +27,13 @@ def main() -> None: default=False, help="Convert to a sqlite cache (default: convert from)", ) + parser.add_argument( + "--num-shards", + type=int, + default=SQLITE_NUM_SHARDS, + dest="num_shards", + help=argparse.SUPPRESS, + ) parser.add_argument( "--output_dir", action="store", @@ -37,17 +45,23 @@ def main() -> None: input_dir = args.input_dir output_dir = args.output_dir or input_dir + num_shards = args.num_shards assert os.path.isdir(output_dir), f"{output_dir} is not a directory" if args.to_sqlite: input: MetadataStore = FilesystemMetadataStore(input_dir) - output: MetadataStore = SqliteMetadataStore(output_dir) + output: MetadataStore = SqliteMetadataStore(output_dir, num_shards=num_shards) else: - fnam = os.path.join(input_dir, "cache.db") - msg = f"{fnam} does not exist" - if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)): - msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)" - assert os.path.isfile(fnam), msg - input, output = SqliteMetadataStore(input_dir), FilesystemMetadataStore(output_dir) + if num_shards <= 1: + db_files = [os.path.join(input_dir, "cache.db")] + else: + db_files = [os.path.join(input_dir, f"cache.{i}.db") for i in range(num_shards)] + for fnam in db_files: + msg = f"{fnam} does not exist" + if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)): + msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)" + assert os.path.isfile(fnam), msg + input = SqliteMetadataStore(input_dir, num_shards=num_shards) + output = FilesystemMetadataStore(output_dir) for s in input.list_all(): if s.endswith((".json", ".ff")): diff --git a/misc/diff-cache.py b/misc/diff-cache.py index 07a2f416a270..a82abf7382cd 100644 --- a/misc/diff-cache.py +++ b/misc/diff-cache.py @@ -19,13 +19,14 @@ from librt.internal import ReadBuffer, WriteBuffer from mypy.cache import CacheMeta, CacheMetaEx +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore from mypy.util import json_dumps, json_loads -def make_cache(input_dir: str, sqlite: bool) -> MetadataStore: +def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore: if sqlite: - return SqliteMetadataStore(input_dir) + return SqliteMetadataStore(input_dir, num_shards=num_shards) else: return FilesystemMetadataStore(input_dir) @@ -154,13 +155,16 @@ def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity") parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache") + parser.add_argument( + "--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS + ) parser.add_argument("input_dir1", help="Input directory for the original cache") parser.add_argument("input_dir2", help="Input directory for the target cache") parser.add_argument("output", help="Output file with the diff from original cache") args = parser.parse_args() - cache1 = make_cache(args.input_dir1, args.sqlite) - cache2 = make_cache(args.input_dir2, args.sqlite) + cache1 = make_cache(args.input_dir1, args.sqlite, num_shards=args.num_shards) + cache2 = make_cache(args.input_dir2, args.sqlite, num_shards=args.num_shards) type_misses: dict[str, int] = defaultdict(int) type_hits: dict[str, int] = defaultdict(int) diff --git a/mypy/build.py b/mypy/build.py index 4e9480d8d3ef..b69554bb5758 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1273,6 +1273,10 @@ def commit(self) -> None: self.metastore.commit() self.add_stats(cache_commit_time=time.time() - t0) + def commit_module(self, meta_file: str) -> None: + """Commit cache writes for a single module (identified by its meta file path).""" + self.metastore.commit_path(meta_file) + def verbosity(self) -> int: return self.options.verbosity @@ -1819,7 +1823,9 @@ def create_metastore(options: Options, parallel_worker: bool) -> MetadataStore: """Create the appropriate metadata store.""" if options.sqlite_cache: mds: MetadataStore = SqliteMetadataStore( - _cache_dir_prefix(options), set_journal_mode=not parallel_worker + _cache_dir_prefix(options), + set_journal_mode=not parallel_worker, + num_shards=options.sqlite_num_shards, ) else: mds = FilesystemMetadataStore(_cache_dir_prefix(options)) @@ -4424,6 +4430,10 @@ def find_stale_sccs( def process_graph(graph: Graph, manager: BuildManager) -> None: """Process everything in dependency order.""" + if manager.workers: + # Commit any cache writes from graph loading before workers try to read them. + manager.commit() + # Broadcast graph to workers before computing SCCs to save a bit of time. # TODO: check if we can optimize by sending only part of the graph needed for given SCC. # For example only send modules in the SCC and their dependencies. @@ -4675,6 +4685,8 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: t4 = time.time() # Flush errors, and write cache in two phases: first data files, then meta files. + # The two-phase structure is needed because meta.dep_hashes references interface_hash + # values from other modules in the SCC, which are updated by write_cache(). meta_tuples = {} errors_by_id = {} for id in stale: @@ -4685,7 +4697,11 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: ) manager.flush_errors(manager.errors.simplify_path(graph[id].xpath), formatted, False) errors_by_id[id] = errors - meta_tuples[id] = graph[id].write_cache() + meta_tuple = graph[id].write_cache() + meta_tuples[id] = meta_tuple + # Commit data file write immediately to avoid holding shard locks across modules. + if meta_tuple is not None: + manager.commit_module(meta_tuple[1]) for id in stale: meta_tuple = meta_tuples[id] if meta_tuple is None: @@ -4709,6 +4725,7 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None: error_lines=errors_by_id.get(id, []), ) write_cache_meta_ex(meta_file, meta_ex, manager) + manager.commit_module(meta_file) manager.done_sccs.add(ascc.id) manager.add_stats( load_missing_time=t1 - t0, @@ -4761,6 +4778,9 @@ def process_stale_scc_interface( for id in stale: meta_tuple = graph[id].write_cache() meta_tuples[id] = meta_tuple + # Commit data file write immediately to avoid holding shard locks across modules. + if meta_tuple is not None: + manager.commit_module(meta_tuple[1]) for id in stale: meta_tuple = meta_tuples[id] if meta_tuple is None: @@ -4773,6 +4793,7 @@ def process_stale_scc_interface( if state.priorities.get(dep) != PRI_INDIRECT ] write_cache_meta(meta, manager, meta_file) + manager.commit_module(meta_file) scc_result.append((id, ModuleResult(graph[id].interface_hash.hex(), []), meta_file)) manager.done_sccs.add(ascc.id) manager.add_stats( @@ -4852,6 +4873,7 @@ def process_stale_scc_implementation( # If there are no errors, only write the cache, don't send anything back # to the caller (as a micro-optimization). write_cache_meta_ex(meta_file, meta_ex, manager) + manager.commit_module(meta_file) manager.add_stats(type_check_time_implementation=time.time() - t0) return scc_result diff --git a/mypy/defaults.py b/mypy/defaults.py index 1b0c7f12374a..d8197e4db00d 100644 --- a/mypy/defaults.py +++ b/mypy/defaults.py @@ -14,6 +14,7 @@ PYTHON3_VERSION_MIN: Final = (3, 10) # Keep in sync with supported target versions CACHE_DIR: Final = ".mypy_cache" +SQLITE_NUM_SHARDS: Final = 16 CONFIG_NAMES: Final = ["mypy.ini", ".mypy.ini"] SHARED_CONFIG_NAMES: Final = ["pyproject.toml", "setup.cfg"] diff --git a/mypy/main.py b/mypy/main.py index e90ee961fc70..99dfa1cfead6 100644 --- a/mypy/main.py +++ b/mypy/main.py @@ -1077,6 +1077,13 @@ def add_invertible_flag( help="Use a sqlite database to store the cache", group=incremental_group, ) + incremental_group.add_argument( + "--sqlite-num-shards", + type=int, + default=defaults.SQLITE_NUM_SHARDS, + dest="sqlite_num_shards", + help=argparse.SUPPRESS, + ) incremental_group.add_argument( "--cache-fine-grained", action="store_true", diff --git a/mypy/metastore.py b/mypy/metastore.py index 23ca8e921a33..c48ac78bdcb7 100644 --- a/mypy/metastore.py +++ b/mypy/metastore.py @@ -17,7 +17,7 @@ from collections.abc import Iterable from typing import TYPE_CHECKING, Any -from mypy.util import os_path_join +from mypy.util import hash_path_stem, os_path_join if TYPE_CHECKING: # We avoid importing sqlite3 unless we are using it so we can mostly work @@ -65,6 +65,14 @@ def commit(self) -> None: called. """ + def commit_path(self, name: str) -> None: + """Commit changes related to a specific cache path. + + For sharded stores, this commits only the shard containing the path. + Default implementation commits everything. + """ + self.commit() + @abstractmethod def list_all(self) -> Iterable[str]: ... @@ -169,23 +177,43 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection: class SqliteMetadataStore(MetadataStore): - def __init__(self, cache_dir_prefix: str, set_journal_mode: bool = False) -> None: + def __init__( + self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1 + ) -> None: # We check startswith instead of equality because the version # will have already been appended by the time the cache dir is # passed here. - self.db = None + self.dbs: list[sqlite3.Connection] = [] + self.num_shards = num_shards + self.dirty_shards: set[int] = set() if cache_dir_prefix.startswith(os.devnull): return os.makedirs(cache_dir_prefix, exist_ok=True) - self.db = connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode) + if num_shards <= 1: + self.dbs.append( + connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode) + ) + else: + for i in range(num_shards): + self.dbs.append( + connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode) + ) + + def _shard_index(self, name: str) -> int: + if self.num_shards <= 1: + return 0 + return hash_path_stem(name) % self.num_shards + + def _db_for(self, name: str) -> sqlite3.Connection: + if not self.dbs: + raise FileNotFoundError() + return self.dbs[self._shard_index(name)] def _query(self, name: str, field: str) -> Any: # Raises FileNotFound for consistency with the file system version - if not self.db: - raise FileNotFoundError() - - cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,)) + db = self._db_for(name) + cur = db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,)) results = cur.fetchall() if not results: raise FileNotFoundError() @@ -205,39 +233,46 @@ def read(self, name: str) -> bytes: def write(self, name: str, data: bytes, mtime: float | None = None) -> bool: import sqlite3 - if not self.db: + if not self.dbs: return False try: if mtime is None: mtime = time.time() - self.db.execute( + db = self._db_for(name) + db.execute( "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)", (name, mtime, data), ) + self.dirty_shards.add(self._shard_index(name)) except sqlite3.OperationalError: return False return True def remove(self, name: str) -> None: - if not self.db: - raise FileNotFoundError() - - self.db.execute("DELETE FROM files2 WHERE path = ?", (name,)) + db = self._db_for(name) + db.execute("DELETE FROM files2 WHERE path = ?", (name,)) + self.dirty_shards.add(self._shard_index(name)) def commit(self) -> None: - if self.db: - self.db.commit() + for i in self.dirty_shards: + self.dbs[i].commit() + self.dirty_shards.clear() + + def commit_path(self, name: str) -> None: + i = self._shard_index(name) + if i in self.dirty_shards: + self.dbs[i].commit() + self.dirty_shards.discard(i) def list_all(self) -> Iterable[str]: - if self.db: - for row in self.db.execute("SELECT path FROM files2"): + for db in self.dbs: + for row in db.execute("SELECT path FROM files2"): yield row[0] def close(self) -> None: - if self.db: - db = self.db - self.db = None + for db in self.dbs: db.close() + self.dbs.clear() def __del__(self) -> None: self.close() diff --git a/mypy/options.py b/mypy/options.py index 81fd88345a43..c8498c2081b4 100644 --- a/mypy/options.py +++ b/mypy/options.py @@ -302,6 +302,7 @@ def __init__(self) -> None: self.incremental = True self.cache_dir = defaults.CACHE_DIR self.sqlite_cache = True + self.sqlite_num_shards = defaults.SQLITE_NUM_SHARDS self.fixed_format_cache = True self.debug_cache = False self.skip_version_check = False diff --git a/mypy/test/test_diff_cache.py b/mypy/test/test_diff_cache.py index dac10e9e40eb..5c632c1153c6 100644 --- a/mypy/test/test_diff_cache.py +++ b/mypy/test/test_diff_cache.py @@ -11,6 +11,7 @@ import time import unittest +from mypy.defaults import SQLITE_NUM_SHARDS from mypy.test.config import PREFIX _MISC_DIR = os.path.join(PREFIX, "misc") @@ -150,10 +151,9 @@ def test_diff_cache_produces_valid_json(self) -> None: from mypy.metastore import SqliteMetadataStore def read_all(cache_dir: str) -> dict[str, bytes]: - store = SqliteMetadataStore(cache_dir) + store = SqliteMetadataStore(cache_dir, num_shards=SQLITE_NUM_SHARDS) result = {name: store.read(name) for name in store.list_all()} - assert store.db is not None - store.db.close() + store.close() return result before = read_all(patched_ver) diff --git a/mypy/util.py b/mypy/util.py index 5c51881452ab..a48a7ea824da 100644 --- a/mypy/util.py +++ b/mypy/util.py @@ -14,6 +14,8 @@ from importlib import resources as importlib_resources from typing import IO, Any, Final, Literal, TypeVar +from mypy_extensions import i64 + orjson: Any try: import orjson # type: ignore[import-not-found, no-redef, unused-ignore] @@ -1009,3 +1011,36 @@ def get_available_threads() -> int: available_threads = cpu_count _AVAILABLE_THREADS = available_threads return available_threads + + +def hash_path_stem(s: str) -> int: + """Hash the stem of a cache file path (everything before the first dot in the basename). + + This is a combined stem-extraction + hash function optimized for mypyc compilation. + Uses only integer arithmetic, avoiding intermediate string allocations. + """ + # First find end of stem (scanning backwards, stop at first dot after last separator) + i = len(s) - 1 + end: i64 = i + while i >= 0: + c: i64 = ord(s[i]) + if c == ord("/") or c == ord("\\"): + break + if c == ord("."): + end = i + i -= 1 + # Calculate hash + hv: i64 = 123 + i = end + while i >= 0: + c = i64(ord(s[i])) + hv = (hv * 33) ^ c + i -= 1 + # Murmur3 finalizer for better bit avalanche (improves shard uniformity) + hv = (hv ^ (hv >> 32)) & 0xFFFFFFFF + hv ^= hv >> 16 + hv = (hv * 0x85EBCA6B) & 0xFFFFFFFF + hv ^= hv >> 13 + hv = (hv * 0xC2B2AE35) & 0xFFFFFFFF + hv ^= hv >> 16 + return int(hv)