Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 80 additions & 37 deletions mypy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,7 @@ def __init__(
# until all the files have been added. This means that a
# new file can be processed O(n**2) times. This cache
# avoids most of this redundant work.
self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo]]] = {}
self.ast_cache: dict[str, tuple[MypyFile, list[ErrorInfo], str | None]] = {}
# Number of times we used GC optimization hack for fresh SCCs.
self.gc_freeze_cycles = 0
# Mapping from SCC id to corresponding SCC instance. This is populated
Expand Down Expand Up @@ -1039,11 +1039,66 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
as an optimization to parallelize only those parts of the code that can be
parallelized efficiently.
"""
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
sequential_states, parallel_states
)

for state in parallel_parsed_states:
# New parser returns serialized ASTs. Deserialize full trees only if not using
# parallel workers.
with state.wrap_context():
assert state.tree is not None
raw_data = state.tree.raw_data
if raw_data is not None:
# Apply inline mypy config before deserialization, since
# some options (e.g. implicit_optional) affect deserialization
state.source_hash = raw_data.source_hash
state.apply_inline_configuration(raw_data.mypy_comments)
state.tree = load_from_raw(
state.xpath,
state.id,
raw_data,
self.errors,
state.options,
imports_only=bool(self.workers),
)
if self.errors.is_blockers():
self.log("Bailing due to parse errors")
self.errors.raise_error()

for state in parallel_states:
assert state.tree is not None
if state in parallel_parsed_states_set:
if state.tree.raw_data is not None:
# source_hash was already extracted above, but raw_data
# may have been preserved for workers (imports_only=True).
pass
elif state.source_hash is None:
# At least namespace packages may not have source.
state.get_source()
state.size_hint = os.path.getsize(state.xpath)
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
state.semantic_analysis_pass1()
self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash)
self.modules[state.id] = state.tree
state.check_blockers()
state.setup_errors()

def parse_files_threaded_raw(
self, sequential_states: list[State], parallel_states: list[State]
) -> tuple[list[State], set[State]]:
"""Parse files using a thread pool.

Also parse sequential states while waiting for the parallel results.
Trees from the new parser are left in raw (serialized) form.

Return (list, set) of states that were actually parsed (not cached).
"""
futures = []
# Use both list and a set to have more predictable order of errors,
# while also not sacrificing performance.
parallel_parsed_states = []
parallel_parsed_states_set = set()
parallel_parsed_states: list[State] = []
parallel_parsed_states_set: set[State] = set()
# Use at least --num-workers if specified by user.
available_threads = max(get_available_threads(), self.options.num_workers)
# Overhead from trying to parallelize (small) blocking portion of
Expand All @@ -1052,53 +1107,27 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
for state in parallel_states:
state.needs_parse = False
# New parser reads source from file directly, we do this only for
# the side effect of parsing inline mypy configurations.
state.get_source()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you need to (conditionally) remove the same call in State.parse_file(), otherwise the worker will call it when loading the tree (look for state.parse_file(raw_data=raw_data) in worker.py).

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated.

if state.id not in self.ast_cache:
self.log(f"Parsing {state.xpath} ({state.id})")
ignore_errors = state.ignore_all or state.options.ignore_errors
if ignore_errors:
self.errors.ignored_files.add(state.xpath)
futures.append(executor.submit(state.parse_file_inner, state.source or ""))
futures.append(executor.submit(state.parse_file_inner, ""))
parallel_parsed_states.append(state)
parallel_parsed_states_set.add(state)
else:
self.log(f"Using cached AST for {state.xpath} ({state.id})")
state.tree, state.early_errors = self.ast_cache[state.id]
state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
state.source_hash = source_hash

# Parse sequential before waiting on parallel.
for state in sequential_states:
state.parse_file()

for fut in wait(futures).done:
fut.result()
for state in parallel_parsed_states:
# New parser returns serialized trees that need to be de-serialized.
with state.wrap_context():
assert state.tree is not None
if state.tree.raw_data:
state.tree = load_from_raw(
state.xpath,
state.id,
state.tree.raw_data,
self.errors,
state.options,
imports_only=bool(self.workers),
)
if self.errors.is_blockers():
self.log("Bailing due to parse errors")
self.errors.raise_error()

for state in parallel_states:
assert state.tree is not None
if state in parallel_parsed_states_set:
state.early_errors = list(self.errors.error_info_map.get(state.xpath, []))
state.semantic_analysis_pass1()
self.ast_cache[state.id] = (state.tree, state.early_errors)
self.modules[state.id] = state.tree
state.check_blockers()
state.setup_errors()
return parallel_parsed_states, parallel_parsed_states_set

def post_parse_all(self, states: list[State]) -> None:
for state in states:
Expand Down Expand Up @@ -3090,7 +3119,6 @@ def get_source(self) -> str:
self.source_hash = compute_hash(source)

self.parse_inline_configuration(source)
self.check_for_invalid_options()

self.size_hint = len(source)
self.time_spent_us += time_spent_us(t0)
Expand All @@ -3115,7 +3143,10 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
# The file was already parsed.
return

source = self.get_source()
if raw_data is None:
source = self.get_source()
else:
source = ""
manager = self.manager
# Can we reuse a previously parsed AST? This avoids redundant work in daemon.
if self.id not in manager.ast_cache:
Expand All @@ -3125,6 +3156,12 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
self.manager.errors.ignored_files.add(self.xpath)
with self.wrap_context():
manager.errors.set_file(self.xpath, self.id, options=self.options)
if raw_data is not None:
# Apply inline mypy config before deserialization, since
# some options (e.g. implicit_optional) affect how the
# AST is built during deserialization.
self.source_hash = raw_data.source_hash
self.apply_inline_configuration(raw_data.mypy_comments)
self.parse_file_inner(source, raw_data)
assert self.tree is not None
# New parser returns serialized trees that need to be de-serialized.
Expand All @@ -3149,14 +3186,15 @@ def parse_file(self, *, temporary: bool = False, raw_data: FileRawData | None =
else:
# Reuse a cached AST
manager.log(f"Using cached AST for {self.xpath} ({self.id})")
self.tree, self.early_errors = manager.ast_cache[self.id]
self.tree, self.early_errors, source_hash = manager.ast_cache[self.id]
self.source_hash = source_hash

assert self.tree is not None
if not temporary:
manager.modules[self.id] = self.tree
self.check_blockers()

manager.ast_cache[self.id] = (self.tree, self.early_errors)
manager.ast_cache[self.id] = (self.tree, self.early_errors, self.source_hash)
self.setup_errors()

def setup_errors(self) -> None:
Expand All @@ -3169,12 +3207,17 @@ def setup_errors(self) -> None:
def parse_inline_configuration(self, source: str) -> None:
    """Scan *source* for inline "# mypy: ..." option comments and apply them."""
    # Delegate the actual option application so callers that already have the
    # extracted comments (e.g. from raw parse data) can share the same path.
    self.apply_inline_configuration(get_mypy_comments(source))

def apply_inline_configuration(self, flags: list[tuple[int, str]] | None) -> None:
    """Apply inline mypy configuration comments and check for invalid options.

    Each entry in *flags* is a (line number, option text) pair as produced
    by get_mypy_comments(); a falsy value means there is nothing to apply.
    """
    if flags:
        option_changes, config_errors = parse_mypy_comments(flags, self.options)
        self.options = self.options.apply_changes(option_changes)
        # Re-register the file first so any errors below are reported
        # under the updated per-file options.
        self.manager.errors.set_file(self.xpath, self.id, self.options)
        for line_number, message in config_errors:
            self.manager.error(line_number, message)
    self.check_for_invalid_options()

def check_for_invalid_options(self) -> None:
if self.options.mypyc and not self.options.strict_bytes:
Expand Down
26 changes: 21 additions & 5 deletions mypy/nativeparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,27 @@ def native_parse(
node.path = filename
return node, [], []

b, errors, ignores, import_bytes, is_partial_package, uses_template_strings = (
parse_to_binary_ast(filename, options, skip_function_bodies)
)
(
b,
errors,
ignores,
import_bytes,
is_partial_package,
uses_template_strings,
source_hash,
mypy_comments,
) = parse_to_binary_ast(filename, options, skip_function_bodies)
node = MypyFile([], [])
node.path = filename
node.raw_data = FileRawData(
b, import_bytes, errors, dict(ignores), is_partial_package, uses_template_strings
b,
import_bytes,
errors,
dict(ignores),
is_partial_package,
uses_template_strings,
source_hash,
mypy_comments,
)
return node, errors, ignores

Expand All @@ -243,7 +257,7 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:

def parse_to_binary_ast(
filename: str, options: Options, skip_function_bodies: bool = False
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool]:
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
# This is a horrible hack to work around a mypyc bug where imported
# module may be not ready in a thread sometimes.
t0 = time.time()
Expand All @@ -267,6 +281,8 @@ def parse_to_binary_ast(
import_bytes,
ast_data["is_partial_package"],
ast_data["uses_template_strings"],
ast_data["source_hash"],
ast_data["mypy_comments"],
)


Expand Down
28 changes: 27 additions & 1 deletion mypy/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,8 @@ class FileRawData:
"ignored_lines",
"is_partial_stub_package",
"uses_template_strings",
"source_hash",
"mypy_comments",
)

defs: bytes
Expand All @@ -368,6 +370,8 @@ class FileRawData:
ignored_lines: dict[int, list[str]]
is_partial_stub_package: bool
uses_template_strings: bool
source_hash: str
mypy_comments: list[tuple[int, str]]

def __init__(
self,
Expand All @@ -377,13 +381,17 @@ def __init__(
ignored_lines: dict[int, list[str]],
is_partial_stub_package: bool,
uses_template_strings: bool,
source_hash: str = "",
mypy_comments: list[tuple[int, str]] | None = None,
) -> None:
self.defs = defs
self.imports = imports
self.raw_errors = raw_errors
self.ignored_lines = ignored_lines
self.is_partial_stub_package = is_partial_stub_package
self.uses_template_strings = uses_template_strings
self.source_hash = source_hash
self.mypy_comments = mypy_comments if mypy_comments is not None else []
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these two (or at least the second one) need to be sent to the worker, i.e. you will need to handle them in write() and read(). The worker needs to know the full options, since we don't send options over the socket for each module (it is a big object). I guess tests pass now, because the worker still calls get_source().

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added serialization back (I had it removed since I thought it was not needed).


def write(self, data: WriteBuffer) -> None:
write_bytes(data, self.defs)
Expand All @@ -399,6 +407,12 @@ def write(self, data: WriteBuffer) -> None:
write_str_list(data, codes)
write_bool(data, self.is_partial_stub_package)
write_bool(data, self.uses_template_strings)
write_str(data, self.source_hash)
write_tag(data, LIST_GEN)
write_int_bare(data, len(self.mypy_comments))
for line, text in self.mypy_comments:
write_int(data, line)
write_str(data, text)

@classmethod
def read(cls, data: ReadBuffer) -> FileRawData:
Expand All @@ -408,8 +422,20 @@ def read(cls, data: ReadBuffer) -> FileRawData:
raw_errors = [read_parse_error(data) for _ in range(read_int_bare(data))]
assert read_tag(data) == DICT_INT_GEN
ignored_lines = {read_int(data): read_str_list(data) for _ in range(read_int_bare(data))}
is_partial_stub_package = read_bool(data)
uses_template_strings = read_bool(data)
source_hash = read_str(data)
assert read_tag(data) == LIST_GEN
mypy_comments = [(read_int(data), read_str(data)) for _ in range(read_int_bare(data))]
return FileRawData(
defs, imports, raw_errors, ignored_lines, read_bool(data), read_bool(data)
defs,
imports,
raw_errors,
ignored_lines,
is_partial_stub_package,
uses_template_strings,
source_hash,
mypy_comments,
)


Expand Down
6 changes: 5 additions & 1 deletion mypy/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,11 @@ def load_from_raw(
options: Options,
imports_only: bool = False,
) -> MypyFile:
"""Load AST from parsed binary data and report stored errors."""
"""Load AST from parsed binary data and report stored errors.

If imports_only is true, only deserialize imports and return a mostly
empty AST.
"""
from mypy.nativeparse import State, deserialize_imports, read_statements

state = State(options)
Expand Down
2 changes: 1 addition & 1 deletion mypy/test/test_nativeparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) ->
]

with temp_source("print('hello')") as fnam:
b, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
assert list(b) == (
[LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR]
+ [nodes.NAME_EXPR, LITERAL_STR]
Expand Down
6 changes: 6 additions & 0 deletions test-data/unit/check-optional.test
Original file line number Diff line number Diff line change
Expand Up @@ -1356,3 +1356,9 @@ def f(x: object) -> None:
with C():
pass
[builtins fixtures/tuple.pyi]

[case testInferOptionalFromDefaultNoneInlineConfig]
# mypy: implicit-optional
def f(x: int = None) -> None:
reveal_type(x) # N: Revealed type is "builtins.int | None"
f(None)
Loading