Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions misc/apply-cache-diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@
from librt.internal import ReadBuffer

from mypy.cache import CacheMeta
from mypy.defaults import SQLITE_NUM_SHARDS
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
from mypy.util import json_dumps, json_loads


def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore:
if sqlite:
return SqliteMetadataStore(input_dir)
return SqliteMetadataStore(input_dir, num_shards=num_shards)
else:
return FilesystemMetadataStore(input_dir)


def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
cache = make_cache(cache_dir, sqlite)
def apply_diff(
cache_dir: str, diff_file: str, sqlite: bool = False, num_shards: int = SQLITE_NUM_SHARDS
) -> None:
cache = make_cache(cache_dir, sqlite, num_shards=num_shards)
with open(diff_file, "rb") as f:
diff = json_loads(f.read())

Expand Down Expand Up @@ -63,11 +66,14 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache")
parser.add_argument(
"--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS
)
parser.add_argument("cache_dir", help="Directory for the cache")
parser.add_argument("diff", help="Cache diff file")
args = parser.parse_args()

apply_diff(args.cache_dir, args.diff, args.sqlite)
apply_diff(args.cache_dir, args.diff, args.sqlite, num_shards=args.num_shards)


if __name__ == "__main__":
Expand Down
28 changes: 21 additions & 7 deletions misc/convert-cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import argparse

from mypy.defaults import SQLITE_NUM_SHARDS
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore


Expand All @@ -26,6 +27,13 @@ def main() -> None:
default=False,
help="Convert to a sqlite cache (default: convert from)",
)
parser.add_argument(
"--num-shards",
type=int,
default=SQLITE_NUM_SHARDS,
dest="num_shards",
help=argparse.SUPPRESS,
)
parser.add_argument(
"--output_dir",
action="store",
Expand All @@ -37,17 +45,23 @@ def main() -> None:

input_dir = args.input_dir
output_dir = args.output_dir or input_dir
num_shards = args.num_shards
assert os.path.isdir(output_dir), f"{output_dir} is not a directory"
if args.to_sqlite:
input: MetadataStore = FilesystemMetadataStore(input_dir)
output: MetadataStore = SqliteMetadataStore(output_dir)
output: MetadataStore = SqliteMetadataStore(output_dir, num_shards=num_shards)
else:
fnam = os.path.join(input_dir, "cache.db")
msg = f"{fnam} does not exist"
if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)):
msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)"
assert os.path.isfile(fnam), msg
input, output = SqliteMetadataStore(input_dir), FilesystemMetadataStore(output_dir)
if num_shards <= 1:
db_files = [os.path.join(input_dir, "cache.db")]
else:
db_files = [os.path.join(input_dir, f"cache.{i}.db") for i in range(num_shards)]
for fnam in db_files:
msg = f"{fnam} does not exist"
if not re.match(r"[0-9]+\.[0-9]+$", os.path.basename(input_dir)):
msg += f" (are you missing Python version at the end, e.g. {input_dir}/3.11)"
assert os.path.isfile(fnam), msg
input = SqliteMetadataStore(input_dir, num_shards=num_shards)
output = FilesystemMetadataStore(output_dir)

for s in input.list_all():
if s.endswith((".json", ".ff")):
Expand Down
12 changes: 8 additions & 4 deletions misc/diff-cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
from librt.internal import ReadBuffer, WriteBuffer

from mypy.cache import CacheMeta, CacheMetaEx
from mypy.defaults import SQLITE_NUM_SHARDS
from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
from mypy.util import json_dumps, json_loads


def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
def make_cache(input_dir: str, sqlite: bool, num_shards: int = SQLITE_NUM_SHARDS) -> MetadataStore:
if sqlite:
return SqliteMetadataStore(input_dir)
return SqliteMetadataStore(input_dir, num_shards=num_shards)
else:
return FilesystemMetadataStore(input_dir)

Expand Down Expand Up @@ -154,13 +155,16 @@ def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--verbose", action="store_true", default=False, help="Increase verbosity")
parser.add_argument("--sqlite", action="store_true", default=False, help="Use a sqlite cache")
parser.add_argument(
"--num-shards", type=int, default=SQLITE_NUM_SHARDS, help=argparse.SUPPRESS
)
parser.add_argument("input_dir1", help="Input directory for the original cache")
parser.add_argument("input_dir2", help="Input directory for the target cache")
parser.add_argument("output", help="Output file with the diff from original cache")
args = parser.parse_args()

cache1 = make_cache(args.input_dir1, args.sqlite)
cache2 = make_cache(args.input_dir2, args.sqlite)
cache1 = make_cache(args.input_dir1, args.sqlite, num_shards=args.num_shards)
cache2 = make_cache(args.input_dir2, args.sqlite, num_shards=args.num_shards)

type_misses: dict[str, int] = defaultdict(int)
type_hits: dict[str, int] = defaultdict(int)
Expand Down
26 changes: 24 additions & 2 deletions mypy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -1273,6 +1273,10 @@ def commit(self) -> None:
self.metastore.commit()
self.add_stats(cache_commit_time=time.time() - t0)

def commit_module(self, meta_file: str) -> None:
    """Commit cache writes for a single module (identified by its meta file path).

    Delegates to MetadataStore.commit_path(); for a sharded sqlite store this
    commits only the shard containing meta_file, for other stores it commits
    everything pending.
    """
    self.metastore.commit_path(meta_file)

def verbosity(self) -> int:
return self.options.verbosity

Expand Down Expand Up @@ -1819,7 +1823,9 @@ def create_metastore(options: Options, parallel_worker: bool) -> MetadataStore:
"""Create the appropriate metadata store."""
if options.sqlite_cache:
mds: MetadataStore = SqliteMetadataStore(
_cache_dir_prefix(options), set_journal_mode=not parallel_worker
_cache_dir_prefix(options),
set_journal_mode=not parallel_worker,
num_shards=options.sqlite_num_shards,
)
else:
mds = FilesystemMetadataStore(_cache_dir_prefix(options))
Expand Down Expand Up @@ -4424,6 +4430,10 @@ def find_stale_sccs(

def process_graph(graph: Graph, manager: BuildManager) -> None:
"""Process everything in dependency order."""
if manager.workers:
# Commit any cache writes from graph loading before workers try to read them.
manager.commit()

# Broadcast graph to workers before computing SCCs to save a bit of time.
# TODO: check if we can optimize by sending only part of the graph needed for given SCC.
# For example only send modules in the SCC and their dependencies.
Expand Down Expand Up @@ -4675,6 +4685,8 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:

t4 = time.time()
# Flush errors, and write cache in two phases: first data files, then meta files.
# The two-phase structure is needed because meta.dep_hashes references interface_hash
# values from other modules in the SCC, which are updated by write_cache().
meta_tuples = {}
errors_by_id = {}
for id in stale:
Expand All @@ -4685,7 +4697,11 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
)
manager.flush_errors(manager.errors.simplify_path(graph[id].xpath), formatted, False)
errors_by_id[id] = errors
meta_tuples[id] = graph[id].write_cache()
meta_tuple = graph[id].write_cache()
meta_tuples[id] = meta_tuple
# Commit data file write immediately to avoid holding shard locks across modules.
if meta_tuple is not None:
manager.commit_module(meta_tuple[1])
for id in stale:
meta_tuple = meta_tuples[id]
if meta_tuple is None:
Expand All @@ -4709,6 +4725,7 @@ def process_stale_scc(graph: Graph, ascc: SCC, manager: BuildManager) -> None:
error_lines=errors_by_id.get(id, []),
)
write_cache_meta_ex(meta_file, meta_ex, manager)
manager.commit_module(meta_file)
manager.done_sccs.add(ascc.id)
manager.add_stats(
load_missing_time=t1 - t0,
Expand Down Expand Up @@ -4761,6 +4778,9 @@ def process_stale_scc_interface(
for id in stale:
meta_tuple = graph[id].write_cache()
meta_tuples[id] = meta_tuple
# Commit data file write immediately to avoid holding shard locks across modules.
if meta_tuple is not None:
manager.commit_module(meta_tuple[1])
for id in stale:
meta_tuple = meta_tuples[id]
if meta_tuple is None:
Expand All @@ -4773,6 +4793,7 @@ def process_stale_scc_interface(
if state.priorities.get(dep) != PRI_INDIRECT
]
write_cache_meta(meta, manager, meta_file)
manager.commit_module(meta_file)
scc_result.append((id, ModuleResult(graph[id].interface_hash.hex(), []), meta_file))
manager.done_sccs.add(ascc.id)
manager.add_stats(
Expand Down Expand Up @@ -4852,6 +4873,7 @@ def process_stale_scc_implementation(
# If there are no errors, only write the cache, don't send anything back
# to the caller (as a micro-optimization).
write_cache_meta_ex(meta_file, meta_ex, manager)
manager.commit_module(meta_file)
Copy link
Copy Markdown
Member

@ilevkivskyi ilevkivskyi Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we write after literally every file, then I guess the .commit() calls in worker.py are no-op, right? Do you think it is safer to keep them?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to commit after every file to avoid transactions that span multiple shards. Committing multiple times per file could be redundant, though.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to commit after every file to avoid transactions that span multiple shards

TBH I am not sure why exactly it is a problem. We don't create any kind of shared client-side transaction (apart from those that may be created by individual connections under the hood).

Anyway, I am thinking that if we need to commit after each write, then we should simply use isolation_level=None and delete all the commit() calls altogether. Because what we are doing now is literally re-implementing isolation_level=None. IIUC with this setting each statement becomes its own transaction, if I read the docs correctly https://docs.python.org/3/library/sqlite3.html#sqlite3.Connection.isolation_level

@hauntsaninja Please correct me if I am wrong.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(To be clear, it is fine to do this in a separate PR)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That matches my understanding


manager.add_stats(type_check_time_implementation=time.time() - t0)
return scc_result
Expand Down
1 change: 1 addition & 0 deletions mypy/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
PYTHON3_VERSION_MIN: Final = (3, 10) # Keep in sync with supported target versions

CACHE_DIR: Final = ".mypy_cache"
SQLITE_NUM_SHARDS: Final = 16

CONFIG_NAMES: Final = ["mypy.ini", ".mypy.ini"]
SHARED_CONFIG_NAMES: Final = ["pyproject.toml", "setup.cfg"]
Expand Down
7 changes: 7 additions & 0 deletions mypy/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,13 @@ def add_invertible_flag(
help="Use a sqlite database to store the cache",
group=incremental_group,
)
incremental_group.add_argument(
"--sqlite-num-shards",
type=int,
default=defaults.SQLITE_NUM_SHARDS,
dest="sqlite_num_shards",
help=argparse.SUPPRESS,
)
incremental_group.add_argument(
"--cache-fine-grained",
action="store_true",
Expand Down
77 changes: 56 additions & 21 deletions mypy/metastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any

from mypy.util import os_path_join
from mypy.util import hash_path_stem, os_path_join

if TYPE_CHECKING:
# We avoid importing sqlite3 unless we are using it so we can mostly work
Expand Down Expand Up @@ -65,6 +65,14 @@ def commit(self) -> None:
called.
"""

def commit_path(self, name: str) -> None:
    """Commit changes related to a specific cache path.

    For sharded stores, this commits only the shard containing the path
    (see SqliteMetadataStore.commit_path, which overrides this).
    Default implementation commits everything.
    """
    self.commit()

@abstractmethod
def list_all(self) -> Iterable[str]: ...

Expand Down Expand Up @@ -169,23 +177,43 @@ def connect_db(db_file: str, set_journal_mode: bool) -> sqlite3.Connection:


class SqliteMetadataStore(MetadataStore):
def __init__(self, cache_dir_prefix: str, set_journal_mode: bool = False) -> None:
def __init__(
self, cache_dir_prefix: str, set_journal_mode: bool = False, num_shards: int = 1
) -> None:
# We check startswith instead of equality because the version
# will have already been appended by the time the cache dir is
# passed here.
self.db = None
self.dbs: list[sqlite3.Connection] = []
self.num_shards = num_shards
self.dirty_shards: set[int] = set()
if cache_dir_prefix.startswith(os.devnull):
return

os.makedirs(cache_dir_prefix, exist_ok=True)
self.db = connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode)
if num_shards <= 1:
self.dbs.append(
connect_db(os_path_join(cache_dir_prefix, "cache.db"), set_journal_mode)
)
else:
for i in range(num_shards):
self.dbs.append(
connect_db(os_path_join(cache_dir_prefix, f"cache.{i}.db"), set_journal_mode)
)

def _shard_index(self, name: str) -> int:
    """Return the index of the shard database that stores cache entry `name`.

    With a single shard (or an unsharded store) everything maps to shard 0;
    otherwise the shard is chosen by hashing the path stem, so the same path
    always maps to the same shard.
    """
    if self.num_shards <= 1:
        return 0
    return hash_path_stem(name) % self.num_shards

def _db_for(self, name: str) -> sqlite3.Connection:
    """Return the sqlite connection for the shard that owns `name`.

    Raises FileNotFoundError if the store has no open databases, for
    consistency with the file system version of the store.
    """
    if not self.dbs:
        raise FileNotFoundError()
    return self.dbs[self._shard_index(name)]

def _query(self, name: str, field: str) -> Any:
# Raises FileNotFound for consistency with the file system version
if not self.db:
raise FileNotFoundError()

cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
db = self._db_for(name)
cur = db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
results = cur.fetchall()
if not results:
raise FileNotFoundError()
Expand All @@ -205,39 +233,46 @@ def read(self, name: str) -> bytes:
def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
import sqlite3

if not self.db:
if not self.dbs:
return False
try:
if mtime is None:
mtime = time.time()
self.db.execute(
db = self._db_for(name)
db.execute(
"INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
(name, mtime, data),
)
self.dirty_shards.add(self._shard_index(name))
except sqlite3.OperationalError:
return False
return True

def remove(self, name: str) -> None:
if not self.db:
raise FileNotFoundError()

self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))
db = self._db_for(name)
db.execute("DELETE FROM files2 WHERE path = ?", (name,))
self.dirty_shards.add(self._shard_index(name))

def commit(self) -> None:
if self.db:
self.db.commit()
for i in self.dirty_shards:
self.dbs[i].commit()
self.dirty_shards.clear()

def commit_path(self, name: str) -> None:
    """Commit only the shard containing `name`, if it has pending writes.

    Committing a single shard avoids holding write locks on the other
    shards; shards with no uncommitted writes are left untouched.
    """
    i = self._shard_index(name)
    # Skip the commit entirely when this shard has no pending writes.
    if i in self.dirty_shards:
        self.dbs[i].commit()
    self.dirty_shards.discard(i)

def list_all(self) -> Iterable[str]:
if self.db:
for row in self.db.execute("SELECT path FROM files2"):
for db in self.dbs:
for row in db.execute("SELECT path FROM files2"):
yield row[0]

def close(self) -> None:
if self.db:
db = self.db
self.db = None
for db in self.dbs:
db.close()
self.dbs.clear()

def __del__(self) -> None:
self.close()
1 change: 1 addition & 0 deletions mypy/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ def __init__(self) -> None:
self.incremental = True
self.cache_dir = defaults.CACHE_DIR
self.sqlite_cache = True
self.sqlite_num_shards = defaults.SQLITE_NUM_SHARDS
self.fixed_format_cache = True
self.debug_cache = False
self.skip_version_check = False
Expand Down
Loading
Loading