From 3472f39096453b4beca35d9d18e0f5fd0aa43439 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 15 Apr 2026 17:35:18 +0100 Subject: [PATCH 01/11] Avoid reading the source file sequentially in parallel parsing The file is now usually only read in the Rust extension. This improves parallel scaling, as `get_source()` was a sequential bottleneck. I measured ~5% improvement to parallel type checking times in some cases on macOS (though it was a bit noisy). --- mypy/build.py | 24 ++++++++++++++++++++---- mypy/nativeparse.py | 26 +++++++++++++++++++++----- mypy/nodes.py | 28 +++++++++++++++++++++++++++- mypy/test/test_nativeparse.py | 2 +- 4 files changed, 69 insertions(+), 11 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 4e9480d8d3ef..bc00cc309987 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1052,15 +1052,12 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor: for state in parallel_states: state.needs_parse = False - # New parser reads source from file directly, we do this only for - # the side effect of parsing inline mypy configurations. - state.get_source() if state.id not in self.ast_cache: self.log(f"Parsing {state.xpath} ({state.id})") ignore_errors = state.ignore_all or state.options.ignore_errors if ignore_errors: self.errors.ignored_files.add(state.xpath) - futures.append(executor.submit(state.parse_file_inner, state.source or "")) + futures.append(executor.submit(state.parse_file_inner, "")) parallel_parsed_states.append(state) parallel_parsed_states_set.add(state) else: @@ -1093,6 +1090,25 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S for state in parallel_states: assert state.tree is not None if state in parallel_parsed_states_set: + # Extract source_hash and mypy_comments from raw_data produced by + # the native parser, avoiding a separate sequential get_source() call. + raw_data = state.tree.raw_data + if raw_data is not None: + state.source_hash = raw_data.source_hash + if raw_data.mypy_comments: + changes, config_errors = parse_mypy_comments( + raw_data.mypy_comments, state.options + ) + state.options = state.options.apply_changes(changes) + self.errors.set_file(state.xpath, state.id, state.options) + for lineno, error in config_errors: + self.error(lineno, error) + state.check_for_invalid_options() + else: + # Fallback for non-native parser path (shouldn't normally happen + # in the parallel path, but be safe). + state.get_source() + state.size_hint = os.path.getsize(state.xpath) state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) state.semantic_analysis_pass1() self.ast_cache[state.id] = (state.tree, state.early_errors) diff --git a/mypy/nativeparse.py b/mypy/nativeparse.py index fd90d85fa355..68dea3f44d00 100644 --- a/mypy/nativeparse.py +++ b/mypy/nativeparse.py @@ -210,13 +210,27 @@ def native_parse( node.path = filename return node, [], [] - b, errors, ignores, import_bytes, is_partial_package, uses_template_strings = ( - parse_to_binary_ast(filename, options, skip_function_bodies) - ) + ( + b, + errors, + ignores, + import_bytes, + is_partial_package, + uses_template_strings, + source_hash, + mypy_comments, + ) = parse_to_binary_ast(filename, options, skip_function_bodies) node = MypyFile([], []) node.path = filename node.raw_data = FileRawData( - b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings + b, + import_bytes, + errors, + dict(ignores), + is_partial_package, + uses_template_strings, + source_hash, + mypy_comments, ) return node, errors, ignores @@ -243,7 +257,7 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]: def parse_to_binary_ast( filename: str, options: Options, skip_function_bodies: bool = False -) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool]: +) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]: # This is a horrible hack to work around a mypyc bug where imported # module may be not ready in a thread sometimes. t0 = time.time() @@ -267,6 +281,8 @@ def parse_to_binary_ast( import_bytes, ast_data["is_partial_package"], ast_data["uses_template_strings"], + ast_data["source_hash"], + ast_data["mypy_comments"], ) diff --git a/mypy/nodes.py b/mypy/nodes.py index 4f43ec7eaaa4..3dafffa5570d 100644 --- a/mypy/nodes.py +++ b/mypy/nodes.py @@ -360,6 +360,8 @@ class FileRawData: "ignored_lines", "is_partial_stub_package", "uses_template_strings", + "source_hash", + "mypy_comments", ) defs: bytes @@ -368,6 +370,8 @@ class FileRawData: ignored_lines: dict[int, list[str]] is_partial_stub_package: bool uses_template_strings: bool + source_hash: str + mypy_comments: list[tuple[int, str]] def __init__( self, @@ -377,6 +381,8 @@ def __init__( ignored_lines: dict[int, list[str]], is_partial_stub_package: bool, uses_template_strings: bool, + source_hash: str = "", + mypy_comments: list[tuple[int, str]] | None = None, ) -> None: self.defs = defs self.imports = imports @@ -384,6 +390,8 @@ def __init__( self.ignored_lines = ignored_lines self.is_partial_stub_package = is_partial_stub_package self.uses_template_strings = uses_template_strings + self.source_hash = source_hash + self.mypy_comments = mypy_comments if mypy_comments is not None else [] def write(self, data: WriteBuffer) -> None: write_bytes(data, self.defs) @@ -399,6 +407,12 @@ def write(self, data: WriteBuffer) -> None: write_str_list(data, codes) write_bool(data, self.is_partial_stub_package) write_bool(data, self.uses_template_strings) + write_str(data, self.source_hash) + write_tag(data, LIST_GEN) + write_int_bare(data, len(self.mypy_comments)) + for line, text in self.mypy_comments: + write_int(data, line) + write_str(data, text) @classmethod def read(cls, data: ReadBuffer) -> FileRawData: @@ -408,8 +422,20 @@ def read(cls, data: ReadBuffer) -> FileRawData: raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))] assert read_tag(data) == DICT_INT_GEN ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))} + is_partial_stub_package = read_bool(data) + uses_template_strings = read_bool(data) + source_hash = read_str(data) + assert read_tag(data) == LIST_GEN + mypy_comments = [(read_int(data), read_str(data)) for _ in range(read_int_bare(data))] return FileRawData( - defs, imports, raw_errors, ignored_lines, read_bool(data), read_bool(data) + defs, + imports, + raw_errors, + ignored_lines, + is_partial_stub_package, + uses_template_strings, + source_hash, + mypy_comments, ) diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index f9a18ea992c2..b50da5f5d02c 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -251,7 +251,7 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> ] with temp_source("print('hello')") as fnam: - b, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) + b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) assert list(b) == ( [LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR] + [nodes.NAME_EXPR, LITERAL_STR] From 98ead81eee1253c2adab89ce0ee84d3da5cb44af Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 14:08:07 +0100 Subject: [PATCH 02/11] Don't serialize/deserialize --- mypy/nodes.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/mypy/nodes.py b/mypy/nodes.py index 3dafffa5570d..808d16060771 100644 --- a/mypy/nodes.py +++ b/mypy/nodes.py @@ -407,12 +407,6 @@ def write(self, data: WriteBuffer) -> None: write_str_list(data, codes) write_bool(data, self.is_partial_stub_package) write_bool(data, self.uses_template_strings) - write_str(data, self.source_hash) - write_tag(data, LIST_GEN) - write_int_bare(data, len(self.mypy_comments)) - for line, text in self.mypy_comments: - write_int(data, line) - write_str(data, text) @classmethod def read(cls, data: ReadBuffer) -> FileRawData: @@ -422,20 +416,13 @@ def read(cls, data: ReadBuffer) -> FileRawData: raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))] assert read_tag(data) == DICT_INT_GEN ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))} - is_partial_stub_package = read_bool(data) - uses_template_strings = read_bool(data) - source_hash = read_str(data) - assert read_tag(data) == LIST_GEN - mypy_comments = [(read_int(data), read_str(data)) for _ in range(read_int_bare(data))] return FileRawData( defs, imports, raw_errors, ignored_lines, - is_partial_stub_package, - uses_template_strings, - source_hash, - mypy_comments, + read_bool(data), + read_bool(data), ) From 2877475bdfb39e2d91b88ff50a85cf790b684226 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 14:22:51 +0100 Subject: [PATCH 03/11] Fix cached reads --- mypy/build.py | 14 +++++++++----- mypy/nodes.py | 7 +------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index bc00cc309987..72c507b4e4e5 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -953,7 +953,7 @@ def __init__( # until all the files have been added. This means that a # new file can be processed O(n**2) times. This cache # avoids most of this redundant work. - self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo]]] = {} + self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo], str | None]] = {} # Number of times we used GC optimization hack for fresh SCCs. self.gc_freeze_cycles = 0 # Mapping from SCC id to corresponding SCC instance. This is populated @@ -1062,7 +1062,9 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S parallel_parsed_states_set.add(state) else: self.log(f"Using cached AST for {state.xpath} ({state.id})") - state.tree, state.early_errors = self.ast_cache[state.id] + state.tree, state.early_errors, source_hash = self.ast_cache[state.id] + if state.source_hash is None: + state.source_hash = source_hash # Parse sequential before waiting on parallel. for state in sequential_states: @@ -1111,7 +1113,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S state.size_hint = os.path.getsize(state.xpath) state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) state.semantic_analysis_pass1() - self.ast_cache[state.id] = (state.tree, state.early_errors) + self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) self.modules[state.id] = state.tree state.check_blockers() state.setup_errors() @@ -3165,14 +3167,16 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = else: # Reuse a cached AST manager.log(f"Using cached AST for {self.xpath} ({self.id})") - self.tree, self.early_errors = manager.ast_cache[self.id] + self.tree, self.early_errors, source_hash = manager.ast_cache[self.id] + if self.source_hash is None: + self.source_hash = source_hash assert self.tree is not None if not temporary: manager.modules[self.id] = self.tree self.check_blockers() - manager.ast_cache[self.id] = (self.tree, self.early_errors) + manager.ast_cache[self.id] = (self.tree, self.early_errors, self.source_hash) self.setup_errors() def setup_errors(self) -> None: diff --git a/mypy/nodes.py b/mypy/nodes.py index 808d16060771..61d7d419199f 100644 --- a/mypy/nodes.py +++ b/mypy/nodes.py @@ -417,12 +417,7 @@ def read(cls, data: ReadBuffer) -> FileRawData: assert read_tag(data) == DICT_INT_GEN ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))} return FileRawData( - defs, - imports, - raw_errors, - ignored_lines, - read_bool(data), - read_bool(data), + defs, imports, raw_errors, ignored_lines, read_bool(data), read_bool(data) ) From 4b84a57de72647b70ab4de08b547687881be807b Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 14:39:18 +0100 Subject: [PATCH 04/11] Process inline config earlier --- mypy/build.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 72c507b4e4e5..25d7c1a082b3 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1076,11 +1076,25 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S # New parser returns serialized trees that need to be de-serialized. with state.wrap_context(): assert state.tree is not None - if state.tree.raw_data: + raw_data = state.tree.raw_data + if raw_data is not None: + # Apply inline mypy config before deserialization, since + # some options (e.g. implicit_optional) affect how the + # AST is built during deserialization. + state.source_hash = raw_data.source_hash + if raw_data.mypy_comments: + changes, config_errors = parse_mypy_comments( + raw_data.mypy_comments, state.options + ) + state.options = state.options.apply_changes(changes) + self.errors.set_file(state.xpath, state.id, state.options) + for lineno, error in config_errors: + self.error(lineno, error) + state.check_for_invalid_options() state.tree = load_from_raw( state.xpath, state.id, - state.tree.raw_data, + raw_data, self.errors, state.options, imports_only=bool(self.workers), @@ -1092,21 +1106,11 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S for state in parallel_states: assert state.tree is not None if state in parallel_parsed_states_set: - # Extract source_hash and mypy_comments from raw_data produced by - # the native parser, avoiding a separate sequential get_source() call. - raw_data = state.tree.raw_data - if raw_data is not None: - state.source_hash = raw_data.source_hash - if raw_data.mypy_comments: - changes, config_errors = parse_mypy_comments( - raw_data.mypy_comments, state.options - ) - state.options = state.options.apply_changes(changes) - self.errors.set_file(state.xpath, state.id, state.options) - for lineno, error in config_errors: - self.error(lineno, error) - state.check_for_invalid_options() - else: + if state.tree.raw_data is not None: + # source_hash was already extracted above, but raw_data + # may have been preserved for workers (imports_only=True). + pass + elif state.source_hash is None: # Fallback for non-native parser path (shouldn't normally happen # in the parallel path, but be safe). state.get_source() From bc0d231f1d0711e3d0a5049e4f67a5420cb5f09d Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 14:43:22 +0100 Subject: [PATCH 05/11] Add inline config test case --- test-data/unit/check-optional.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test-data/unit/check-optional.test b/test-data/unit/check-optional.test index fe8ce9ede9d2..5e3d6e0e58ab 100644 --- a/test-data/unit/check-optional.test +++ b/test-data/unit/check-optional.test @@ -1356,3 +1356,9 @@ def f(x: object) -> None: with C(): pass [builtins fixtures/tuple.pyi] + +[case testInferOptionalFromDefaultNoneInlineConfig] +# mypy: implicit-optional +def f(x: int = None) -> None: + reveal_type(x) # N: Revealed type is "builtins.int | None" +f(None) From 68def94fba2f574a9aeca08eef4d3f3a761ed462 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 16:19:47 +0100 Subject: [PATCH 06/11] Revert "Don't serialize/deserialize" This reverts commit 98ead81eee1253c2adab89ce0ee84d3da5cb44af. --- mypy/nodes.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mypy/nodes.py b/mypy/nodes.py index 61d7d419199f..3dafffa5570d 100644 --- a/mypy/nodes.py +++ b/mypy/nodes.py @@ -407,6 +407,12 @@ def write(self, data: WriteBuffer) -> None: write_str_list(data, codes) write_bool(data, self.is_partial_stub_package) write_bool(data, self.uses_template_strings) + write_str(data, self.source_hash) + write_tag(data, LIST_GEN) + write_int_bare(data, len(self.mypy_comments)) + for line, text in self.mypy_comments: + write_int(data, line) + write_str(data, text) @classmethod def read(cls, data: ReadBuffer) -> FileRawData: @@ -416,8 +422,20 @@ def read(cls, data: ReadBuffer) -> FileRawData: raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))] assert read_tag(data) == DICT_INT_GEN ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))} + is_partial_stub_package = read_bool(data) + uses_template_strings = read_bool(data) + source_hash = read_str(data) + assert read_tag(data) == LIST_GEN + mypy_comments = [(read_int(data), read_str(data)) for _ in range(read_int_bare(data))] return FileRawData( - defs, imports, raw_errors, ignored_lines, read_bool(data), read_bool(data) + defs, + imports, + raw_errors, + ignored_lines, + is_partial_stub_package, + uses_template_strings, + source_hash, + mypy_comments, ) From 6941a1acff494347635b66e28a841a1ff0bf48d3 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 16:26:43 +0100 Subject: [PATCH 07/11] Address feedback --- mypy/build.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 25d7c1a082b3..4b9534ccfcd6 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1063,8 +1063,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S else: self.log(f"Using cached AST for {state.xpath} ({state.id})") state.tree, state.early_errors, source_hash = self.ast_cache[state.id] - if state.source_hash is None: - state.source_hash = source_hash + state.source_hash = source_hash # Parse sequential before waiting on parallel. for state in sequential_states: @@ -3172,8 +3171,7 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = # Reuse a cached AST manager.log(f"Using cached AST for {self.xpath} ({self.id})") self.tree, self.early_errors, source_hash = manager.ast_cache[self.id] - if self.source_hash is None: - self.source_hash = source_hash + self.source_hash = source_hash assert self.tree is not None if not temporary: From b0cac68b4b62946dbc1d597d81456551acd856b8 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 16:36:07 +0100 Subject: [PATCH 08/11] Apply inline config more consistently --- mypy/build.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 4b9534ccfcd6..3ae7ed834e6d 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1081,15 +1081,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S # some options (e.g. implicit_optional) affect how the # AST is built during deserialization. state.source_hash = raw_data.source_hash - if raw_data.mypy_comments: - changes, config_errors = parse_mypy_comments( - raw_data.mypy_comments, state.options - ) - state.options = state.options.apply_changes(changes) - self.errors.set_file(state.xpath, state.id, state.options) - for lineno, error in config_errors: - self.error(lineno, error) - state.check_for_invalid_options() + state.apply_inline_configuration(raw_data.mypy_comments) state.tree = load_from_raw( state.xpath, state.id, @@ -3111,7 +3103,6 @@ def get_source(self) -> str: self.source_hash = compute_hash(source) self.parse_inline_configuration(source) - self.check_for_invalid_options() self.size_hint = len(source) self.time_spent_us += time_spent_us(t0) @@ -3136,7 +3127,10 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = # The file was already parsed. return - source = self.get_source() + if raw_data is None: + source = self.get_source() + else: + source = "" manager = self.manager # Can we reuse a previously parsed AST? This avoids redundant work in daemon. if self.id not in manager.ast_cache: @@ -3146,6 +3140,12 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None = self.manager.errors.ignored_files.add(self.xpath) with self.wrap_context(): manager.errors.set_file(self.xpath, self.id, options=self.options) + if raw_data is not None: + # Apply inline mypy config before deserialization, since + # some options (e.g. implicit_optional) affect how the + # AST is built during deserialization. + self.source_hash = raw_data.source_hash + self.apply_inline_configuration(raw_data.mypy_comments) self.parse_file_inner(source, raw_data) assert self.tree is not None # New parser returns serialized trees that need to be de-serialized. @@ -3191,12 +3191,17 @@ def setup_errors(self) -> None: def parse_inline_configuration(self, source: str) -> None: """Check for inline mypy: options directive and parse them.""" flags = get_mypy_comments(source) + self.apply_inline_configuration(flags) + + def apply_inline_configuration(self, flags: list[tuple[int, str]] | None) -> None: + """Apply inline mypy configuration comments and check for invalid options.""" if flags: changes, config_errors = parse_mypy_comments(flags, self.options) self.options = self.options.apply_changes(changes) self.manager.errors.set_file(self.xpath, self.id, self.options) for lineno, error in config_errors: self.manager.error(lineno, error) + self.check_for_invalid_options() def check_for_invalid_options(self) -> None: if self.options.mypyc and not self.options.strict_bytes: From 5d8d5446267a0b83e2201cd0786919aece7bf7b0 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 16:43:02 +0100 Subject: [PATCH 09/11] Minor tweak --- mypy/build.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 3ae7ed834e6d..b67c590f351d 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1078,8 +1078,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S raw_data = state.tree.raw_data if raw_data is not None: # Apply inline mypy config before deserialization, since - # some options (e.g. implicit_optional) affect how the - # AST is built during deserialization. + # some options (e.g. implicit_optional) affect deserialization state.source_hash = raw_data.source_hash state.apply_inline_configuration(raw_data.mypy_comments) state.tree = load_from_raw( From bc0fd4935f4ff2147c1fe9e102e4cb06feb410c9 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 16:57:12 +0100 Subject: [PATCH 10/11] Update comment --- mypy/build.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index b67c590f351d..8db265d598c0 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1101,8 +1101,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S # may have been preserved for workers (imports_only=True). pass elif state.source_hash is None: - # Fallback for non-native parser path (shouldn't normally happen - # in the parallel path, but be safe). + # At least namespace packages may not have source. state.get_source() state.size_hint = os.path.getsize(state.xpath) state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) From 106811691da3c39f1253cc31f5d21303b3638a73 Mon Sep 17 00:00:00 2001 From: Jukka Lehtosalo Date: Wed, 22 Apr 2026 17:19:58 +0100 Subject: [PATCH 11/11] Refactor based on feedback --- mypy/build.py | 98 ++++++++++++++++++++++++++++++--------------------- mypy/parse.py | 6 +++- 2 files changed, 63 insertions(+), 41 deletions(-) diff --git a/mypy/build.py b/mypy/build.py index 8db265d598c0..d79f98f7489e 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1039,11 +1039,66 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S as an optimization to parallelize only those parts of the code that can be parallelized efficiently. """ + parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( + sequential_states, parallel_states + ) + + for state in parallel_parsed_states: + # New parser returns serialized ASTs. Deserialize full trees only if not using + # parallel workers. + with state.wrap_context(): + assert state.tree is not None + raw_data = state.tree.raw_data + if raw_data is not None: + # Apply inline mypy config before deserialization, since + # some options (e.g. implicit_optional) affect deserialization + state.source_hash = raw_data.source_hash + state.apply_inline_configuration(raw_data.mypy_comments) + state.tree = load_from_raw( + state.xpath, + state.id, + raw_data, + self.errors, + state.options, + imports_only=bool(self.workers), + ) + if self.errors.is_blockers(): + self.log("Bailing due to parse errors") + self.errors.raise_error() + + for state in parallel_states: + assert state.tree is not None + if state in parallel_parsed_states_set: + if state.tree.raw_data is not None: + # source_hash was already extracted above, but raw_data + # may have been preserved for workers (imports_only=True). + pass + elif state.source_hash is None: + # At least namespace packages may not have source. + state.get_source() + state.size_hint = os.path.getsize(state.xpath) + state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) + state.semantic_analysis_pass1() + self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) + self.modules[state.id] = state.tree + state.check_blockers() + state.setup_errors() + + def parse_files_threaded_raw( + self, sequential_states: list[State], parallel_states: list[State] + ) -> tuple[list[State], set[State]]: + """Parse files using a thread pool. + + Also parse sequential states while waiting for the parallel results. + Trees from the new parser are left in raw (serialized) form. + + Return (list, set) of states that were actually parsed (not cached). + """ futures = [] # Use both list and a set to have more predictable order of errors, # while also not sacrificing performance. - parallel_parsed_states = [] - parallel_parsed_states_set = set() + parallel_parsed_states: list[State] = [] + parallel_parsed_states_set: set[State] = set() # Use at least --num-workers if specified by user. available_threads = max(get_available_threads(), self.options.num_workers) # Overhead from trying to parallelize (small) blocking portion of @@ -1071,45 +1126,8 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S for fut in wait(futures).done: fut.result() - for state in parallel_parsed_states: - # New parser returns serialized trees that need to be de-serialized. - with state.wrap_context(): - assert state.tree is not None - raw_data = state.tree.raw_data - if raw_data is not None: - # Apply inline mypy config before deserialization, since - # some options (e.g. implicit_optional) affect deserialization - state.source_hash = raw_data.source_hash - state.apply_inline_configuration(raw_data.mypy_comments) - state.tree = load_from_raw( - state.xpath, - state.id, - raw_data, - self.errors, - state.options, - imports_only=bool(self.workers), - ) - if self.errors.is_blockers(): - self.log("Bailing due to parse errors") - self.errors.raise_error() - for state in parallel_states: - assert state.tree is not None - if state in parallel_parsed_states_set: - if state.tree.raw_data is not None: - # source_hash was already extracted above, but raw_data - # may have been preserved for workers (imports_only=True). - pass - elif state.source_hash is None: - # At least namespace packages may not have source. - state.get_source() - state.size_hint = os.path.getsize(state.xpath) - state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) - state.semantic_analysis_pass1() - self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) - self.modules[state.id] = state.tree - state.check_blockers() - state.setup_errors() + return parallel_parsed_states, parallel_parsed_states_set def post_parse_all(self, states: list[State]) -> None: for state in states: diff --git a/mypy/parse.py b/mypy/parse.py index d2626737b8c4..b0901a3a2455 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -64,7 +64,11 @@ def load_from_raw( options: Options, imports_only: bool = False, ) -> MypyFile: - """Load AST from parsed binary data and report stored errors.""" + """Load AST from parsed binary data and report stored errors. + + If imports_only is true, only deserialize imports and return a mostly + empty AST. + """ from mypy.nativeparse import State, deserialize_imports, read_statements state = State(options)