Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion misc/dump-ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No
options.python_version = python_version
with open(fname, "rb") as f:
s = f.read()
tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True)
tree = parse(s, fname, None, errors=Errors(options), options=options)
if not quiet:
print(tree)

Expand Down
62 changes: 30 additions & 32 deletions mypy/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@
from mypy.modules_state import modules_state
from mypy.nodes import Expression
from mypy.options import Options
from mypy.parse import load_from_raw, parse
from mypy.parse import load_from_raw, parse, parse_native
from mypy.plugin import ChainedPlugin, Plugin, ReportConfigContext
from mypy.plugins.default import DefaultPlugin
from mypy.renaming import LimitedVariableRenameVisitor, VariableRenameVisitor
Expand Down Expand Up @@ -1024,35 +1024,27 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None:
self.post_parse_all(states)
return

sequential_states = []
parallel_states = []
for state in states:
if state.tree is not None:
# The file was already parsed.
continue
if not self.fscache.exists(state.xpath, real_only=True):
# New parser only supports parsing on-disk files.
sequential_states.append(state)
state.needs_parse = False
continue
parallel_states.append(state)
if len(parallel_states) > 1:
self.parse_parallel(sequential_states, parallel_states)
else:
# Avoid using executor when there is no parallelism.
for state in states:
state.parse_file()
self.parse_parallel(parallel_states)
if post_parse:
self.post_parse_all(states)

def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None:
def parse_parallel(self, parallel_states: list[State]) -> None:
"""Perform parallel parsing of states.

Note: this duplicates a bit of logic from State.parse_file(). This is done
as an optimization to parallelize only those parts of the code that can be
parallelized efficiently.
"""
parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw(
sequential_states, parallel_states
parallel_states
)

for state in parallel_parsed_states:
Expand Down Expand Up @@ -1097,12 +1089,9 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S
state.check_blockers()
state.setup_errors()

def parse_files_threaded_raw(
self, sequential_states: list[State], parallel_states: list[State]
) -> tuple[list[State], set[State]]:
"""Parse files using a thread pool.
def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]:
"""Parse files in parallel using a thread pool.

Also parse sequential states while waiting for the parallel results.
Trees from the new parser are left in raw (serialized) form.

Return (list, set) of states that were actually parsed (not cached).
Expand All @@ -1118,25 +1107,23 @@ def parse_files_threaded_raw(
# parse_file_inner() results in no visible improvement with more than 8 threads.
# TODO: reuse thread pool and/or batch small files in single submit() call.
with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor:
for state in parallel_states:
for state in states:
state.needs_parse = False
if state.id not in self.ast_cache:
self.log(f"Parsing {state.xpath} ({state.id})")
ignore_errors = state.ignore_all or state.options.ignore_errors
if ignore_errors:
self.errors.ignored_files.add(state.xpath)
futures.append(executor.submit(state.parse_file_inner, ""))
futures.append(
executor.submit(state.parse_file_inner, state.source, parallel=True)
)
parallel_parsed_states.append(state)
parallel_parsed_states_set.add(state)
else:
self.log(f"Using cached AST for {state.xpath} ({state.id})")
state.tree, state.early_errors, source_hash = self.ast_cache[state.id]
state.source_hash = source_hash

# Parse sequential before waiting on parallel.
for state in sequential_states:
state.parse_file()

for fut in wait(futures).done:
fut.result()

Expand Down Expand Up @@ -1279,21 +1266,32 @@ def parse_file(
self,
id: str,
path: str,
source: str,
source: str | None,
options: Options,
raw_data: FileRawData | None = None,
parallel: bool = False,
) -> MypyFile:
"""Parse the source of a file with the given name.

Raise CompileError if there is a parse error.
"""
file_exists = self.fscache.exists(path, real_only=True)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can:

  • Remove the other two call sites of fscache.exists() in this file (and update the relevant code).
  • Remove the real_only parameter and related logic from fscache. IIRC it is only needed for the native parser.

t0 = time.time()
if raw_data:
# If possible, deserialize from known binary data instead of parsing from scratch.
tree = load_from_raw(path, id, raw_data, self.errors, options)
else:
tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists)
if source is not None:
tree = parse(source, path, id, self.errors, options=options)
else:
assert parallel
if not os.path.exists(path):
build_error(
"Cannot read file '{}': {}".format(
path.replace(os.getcwd() + os.sep, ""),
os.strerror(2), # `errno.ENOENT`
)
)
tree = parse_native(source, path, id, self.errors, options=options)
tree._fullname = id
if self.stats_enabled:
with self.stats_lock:
Expand Down Expand Up @@ -3192,10 +3190,12 @@ def get_source(self) -> str:
self.time_spent_us += time_spent_us(t0)
return source

def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None:
def parse_file_inner(
self, source: str | None, raw_data: FileRawData | None = None, parallel: bool = False
) -> None:
t0 = time_ref()
self.tree = self.manager.parse_file(
self.id, self.xpath, source, options=self.options, raw_data=raw_data
self.id, self.xpath, source, self.options, raw_data, parallel
)
self.time_spent_us += time_spent_us(t0)

Expand Down Expand Up @@ -3319,9 +3319,7 @@ def semantic_analysis_pass1(self) -> None:
#
# TODO: This should not be considered as a semantic analysis
# pass -- it's an independent pass.
if not options.native_parser or not self.manager.fscache.exists(
self.xpath, real_only=True
):
if not options.native_parser:
analyzer = SemanticAnalyzerPreAnalysis()
with self.wrap_context():
analyzer.visit_file(self.tree, self.xpath, self.id, options)
Expand Down
1 change: 0 additions & 1 deletion mypy/checkstrformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,6 @@ def apply_field_accessors(
module=None,
options=self.chk.options,
errors=temp_errors,
file_exists=False,
eager=True,
)
if temp_errors.is_errors():
Expand Down
5 changes: 1 addition & 4 deletions mypy/fscache.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,13 +253,10 @@ def isdir(self, path: str) -> bool:
return False
return stat.S_ISDIR(st.st_mode)

def exists(self, path: str, real_only: bool = False) -> bool:
def exists(self, path: str) -> bool:
st = self.stat_or_none(path)
if st is None:
return False
if real_only:
dirname = os.path.dirname(path)
return dirname not in self.fake_package_cache
return True

def read(self, path: str) -> bytes:
Expand Down
13 changes: 10 additions & 3 deletions mypy/nativeparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,10 @@ def add_error(


def native_parse(
filename: str, options: Options, skip_function_bodies: bool = False
filename: str,
options: Options,
source: str | bytes | None = None,
skip_function_bodies: bool = False,
) -> tuple[MypyFile, list[ParseError], TypeIgnores]:
"""Parse a Python file using the native Rust-based parser.

Expand Down Expand Up @@ -211,7 +214,7 @@ def native_parse(
uses_template_strings,
source_hash,
mypy_comments,
) = parse_to_binary_ast(filename, options, skip_function_bodies)
) = parse_to_binary_ast(filename, options, source, skip_function_bodies)
node = MypyFile([], [])
node.path = filename
node.raw_data = FileRawData(
Expand Down Expand Up @@ -248,7 +251,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]:


def parse_to_binary_ast(
filename: str, options: Options, skip_function_bodies: bool = False
filename: str,
options: Options,
source: str | bytes | None = None,
skip_function_bodies: bool = False,
) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]:
# This is a horrible hack to work around a mypyc bug where imported
# module may be not ready in a thread sometimes.
Expand All @@ -259,6 +265,7 @@ def parse_to_binary_ast(
raise ImportError("Cannot import ast_serialize")
ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse(
filename,
source,
skip_function_bodies=skip_function_bodies,
python_version=options.python_version,
platform=options.platform,
Expand Down
46 changes: 26 additions & 20 deletions mypy/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ def parse(
module: str | None,
errors: Errors,
options: Options,
file_exists: bool,
eager: bool = False,
) -> MypyFile:
"""Parse a source file, without doing any semantic analysis.
Expand All @@ -29,25 +28,7 @@ def parse(
the parse errors, use eager=True.
"""
if options.native_parser:
# Native parser only works with actual files on disk
# Fall back to fastparse for in-memory source or non-existent files
if file_exists:
import mypy.nativeparse

ignore_errors = options.ignore_errors or fnam in errors.ignored_files
# If errors are ignored, we can drop many function bodies to speed up type checking.
strip_function_bodies = ignore_errors and not options.preserve_asts
tree, _, _ = mypy.nativeparse.native_parse(
fnam, options, skip_function_bodies=strip_function_bodies
)
# Set is_stub based on file extension
tree.is_stub = fnam.endswith(".pyi")
# Note: tree.imports is populated directly by load_from_raw() with deserialized
# import metadata, so we don't need to collect imports via AST traversal
if eager and tree.raw_data is not None:
tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
return tree
# Fall through to fastparse for non-existent files
return parse_native(source, fnam, module, errors, options, eager)

if options.transform_source is not None:
source = options.transform_source(source)
Expand Down Expand Up @@ -102,6 +83,31 @@ def load_from_raw(
return tree


def parse_native(
source: str | bytes | None,
fnam: str,
module: str | None,
errors: Errors,
options: Options,
eager: bool = False,
) -> MypyFile:
import mypy.nativeparse

ignore_errors = options.ignore_errors or fnam in errors.ignored_files
# If errors are ignored, we can drop many function bodies to speed up type checking.
strip_function_bodies = ignore_errors and not options.preserve_asts
tree, _, _ = mypy.nativeparse.native_parse(
fnam, options, source, skip_function_bodies=strip_function_bodies
)
# Set is_stub based on file extension
tree.is_stub = fnam.endswith(".pyi")
# Note: tree.imports is populated directly by load_from_raw() with deserialized
# import metadata, so we don't need to collect imports via AST traversal
if eager and tree.raw_data is not None:
tree = load_from_raw(fnam, module, tree.raw_data, errors, options)
return tree


def report_parse_error(error: ParseError, errors: Errors) -> None:
message = error["message"]
# Standardize error message by capitalizing the first word
Expand Down
8 changes: 1 addition & 7 deletions mypy/stubgen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1745,13 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None:
source = mypy.util.decode_python_encoding(data)
errors = Errors(mypy_options)
mod.ast = mypy.parse.parse(
source,
fnam=mod.path,
module=mod.module,
errors=errors,
options=mypy_options,
file_exists=True,
eager=True,
source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True
)
mod.ast._fullname = mod.module
if errors.is_blockers():
Expand Down
30 changes: 24 additions & 6 deletions mypy/test/test_nativeparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_parser(testcase: DataDrivenTestCase) -> None:

try:
with temp_source(source) as fnam:
node, errors, type_ignores = native_parse(fnam, options, skip_function_bodies)
node, errors, type_ignores = native_parse(fnam, options, None, skip_function_bodies)
errors += load_tree(node, options)
node.path = "main"
a = node.str_with_options(options).split("\n")
Expand Down Expand Up @@ -234,7 +234,7 @@ def format_reachable_imports(node: MypyFile) -> list[str]:

@unittest.skipUnless(has_nativeparse, "nativeparse not available")
class TestNativeParserBinaryFormat(unittest.TestCase):
def test_trivial_binary_data(self) -> None:
def _assert_trivial_binary_data(self, b: bytes, /) -> None:
# A quick sanity check to ensure the serialized data looks as expected. Only covers
# a few AST nodes.

Expand All @@ -250,9 +250,9 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) ->
int_enc(end_column - start_column),
]

with temp_source("print('hello')") as fnam:
b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
assert list(b) == (
self.assertEqual(
list(b),
(
[LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR]
+ [nodes.NAME_EXPR, LITERAL_STR]
+ [int_enc(5)]
Expand All @@ -269,7 +269,25 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) ->
+ [LIST_GEN, 22, LITERAL_NONE]
+ locs(1, 0, 1, 14)
+ [END_TAG, END_TAG]
)
),
)

def test_trivial_binary_data_from_file(self) -> None:
with temp_source("print('hello')") as fnam:
b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options())
self._assert_trivial_binary_data(b)

def test_trivial_binary_data_from_string_source(self) -> None:
b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), "print('hello')")
self._assert_trivial_binary_data(b)

def test_trivial_binary_data_from_bytes_source(self) -> None:
b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), b"print('hello')")
self._assert_trivial_binary_data(b)

def test_invalid_bytes_raises(self) -> None:
with self.assertRaises(UnicodeDecodeError):
parse_to_binary_ast("", Options(), b"\xff")


@contextlib.contextmanager
Expand Down
2 changes: 0 additions & 2 deletions mypy/test/testparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def test_parser(testcase: DataDrivenTestCase) -> None:
module="__main__",
errors=errors,
options=options,
file_exists=False,
eager=True,
)
if errors.is_errors():
Expand Down Expand Up @@ -108,7 +107,6 @@ def test_parse_error(testcase: DataDrivenTestCase) -> None:
"__main__",
errors=errors,
options=options,
file_exists=False,
eager=True,
)
if errors.is_errors():
Expand Down
1 change: 1 addition & 0 deletions test-data/unit/cmdline.test
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,7 @@ import d

[case testPackageRootMultipleParallel]
# cmd: mypy --package-root=a/ --package-root=./ a/b/c.py d.py main.py --num-workers=2
[file a/b/__init__.py]
[file a/b/c.py]
[file d.py]
[file main.py]
Expand Down
Loading