diff --git a/misc/dump-ast.py b/misc/dump-ast.py index 68ea8bc0dc61..7fdf905bae0b 100755 --- a/misc/dump-ast.py +++ b/misc/dump-ast.py @@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No options.python_version = python_version with open(fname, "rb") as f: s = f.read() - tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True) + tree = parse(s, fname, None, errors=Errors(options), options=options) if not quiet: print(tree) diff --git a/mypy/build.py b/mypy/build.py index 8d5db0bab8df..8bea6f645a14 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -1024,85 +1024,77 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: self.post_parse_all(states) return - sequential_states = [] parallel_states = [] for state in states: + if not self.fscache.exists(state.xpath): + build_error( + "Cannot read file '{}': {}".format( + state.xpath.replace(os.getcwd() + os.sep, ""), + os.strerror(2), # `errno.ENOENT` + ) + ) if state.tree is not None: # The file was already parsed. - continue - if not self.fscache.exists(state.xpath, real_only=True): - # New parser only supports parsing on-disk files. - sequential_states.append(state) + state.needs_parse = False continue parallel_states.append(state) + if len(parallel_states) > 1: - self.parse_parallel(sequential_states, parallel_states) - else: - # Avoid using executor when there is no parallelism. - for state in states: - state.parse_file() - if post_parse: - self.post_parse_all(states) + # This duplicates a bit of logic from State.parse_file(). This is done to + # optimize handling of states parsed in parallel. - def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None: - """Perform parallel parsing of states. + parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( + parallel_states + ) - Note: this duplicates a bit of logic from State.parse_file(). This is done - as an optimization to parallelize only those parts of the code that can be - parallelized efficiently. - """ - parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( - sequential_states, parallel_states - ) + for state in parallel_parsed_states: + # New parser only returns serialized ASTs + with state.wrap_context(): + assert state.tree is not None + raw_data = state.tree.raw_data + if raw_data is not None: + state.source_hash = raw_data.source_hash + state.apply_inline_configuration(raw_data.mypy_comments) + state.tree = load_from_raw( + state.xpath, + state.id, + raw_data, + self.errors, + state.options, + imports_only=bool(self.workers), + ) + if self.errors.is_blockers(): + self.log("Bailing due to parse errors") + self.errors.raise_error() - for state in parallel_parsed_states: - # New parser returns serialized ASTs. Deserialize full trees only if not using - # parallel workers. - with state.wrap_context(): + for state in parallel_states: assert state.tree is not None - raw_data = state.tree.raw_data - if raw_data is not None: - # Apply inline mypy config before deserialization, since - # some options (e.g. implicit_optional) affect deserialization - state.source_hash = raw_data.source_hash - state.apply_inline_configuration(raw_data.mypy_comments) - state.tree = load_from_raw( - state.xpath, - state.id, - raw_data, - self.errors, - state.options, - imports_only=bool(self.workers), - ) - if self.errors.is_blockers(): - self.log("Bailing due to parse errors") - self.errors.raise_error() - - for state in parallel_states: - assert state.tree is not None - if state in parallel_parsed_states_set: + if state in parallel_parsed_states_set: + if state.tree.raw_data is not None: + # source_hash was already extracted above, but raw_data + # may have been preserved for workers (imports_only=True). + pass + elif state.source_hash is None: + # At least namespace packages may not have source. + state.get_source() + state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) + state.semantic_analysis_pass1() + self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) + self.modules[state.id] = state.tree if state.tree.raw_data is not None: - # source_hash was already extracted above, but raw_data - # may have been preserved for workers (imports_only=True). - pass - elif state.source_hash is None: - # At least namespace packages may not have source. - state.get_source() - state.early_errors = list(self.errors.error_info_map.get(state.xpath, [])) - state.semantic_analysis_pass1() - self.ast_cache[state.id] = (state.tree, state.early_errors, state.source_hash) - self.modules[state.id] = state.tree - if state.tree.raw_data is not None: - state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT - state.check_blockers() - state.setup_errors() - - def parse_files_threaded_raw( - self, sequential_states: list[State], parallel_states: list[State] - ) -> tuple[list[State], set[State]]: - """Parse files using a thread pool. - - Also parse sequential states while waiting for the parallel results. + state.size_hint = len(state.tree.raw_data.defs) + MIN_SIZE_HINT + state.check_blockers() + state.setup_errors() + elif len(parallel_states) == 1: + # Avoid using executor when there is no parallelism. + parallel_states[0].parse_file() + + if post_parse: + self.post_parse_all(states) + + def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]: + """Parse files in parallel using a thread pool. + Trees from the new parser are left in raw (serialized) form. Return (list, set) of states that were actually parsed (not cached). @@ -1118,14 +1110,14 @@ def parse_files_threaded_raw( # parse_file_inner() results in no visible improvement with more than 8 threads. # TODO: reuse thread pool and/or batch small files in single submit() call. with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor: - for state in parallel_states: + for state in states: state.needs_parse = False if state.id not in self.ast_cache: self.log(f"Parsing {state.xpath} ({state.id})") ignore_errors = state.ignore_all or state.options.ignore_errors if ignore_errors: self.errors.ignored_files.add(state.xpath) - futures.append(executor.submit(state.parse_file_inner, "")) + futures.append(executor.submit(state.parse_file_inner, state.source)) parallel_parsed_states.append(state) parallel_parsed_states_set.add(state) else: @@ -1133,10 +1125,6 @@ def parse_files_threaded_raw( state.tree, state.early_errors, source_hash = self.ast_cache[state.id] state.source_hash = source_hash - # Parse sequential before waiting on parallel. - for state in sequential_states: - state.parse_file() - for fut in wait(futures).done: fut.result() @@ -1279,7 +1267,7 @@ def parse_file( self, id: str, path: str, - source: str, + source: str | None, options: Options, raw_data: FileRawData | None = None, ) -> MypyFile: @@ -1287,13 +1275,19 @@ def parse_file( Raise CompileError if there is a parse error. """ - file_exists = self.fscache.exists(path, real_only=True) t0 = time.time() if raw_data: # If possible, deserialize from known binary data instead of parsing from scratch. tree = load_from_raw(path, id, raw_data, self.errors, options) else: - tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists) + # Handle fake `__init__.py` files due to `--package-root` + if ( + (source is None) + and (os.path.dirname(path) in self.fscache.fake_package_cache) + and (os.path.basename(path) == "__init__.py") + ): + source = "" + tree = parse(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: with self.stats_lock: @@ -3179,7 +3173,7 @@ def get_source(self) -> str: else: err = f"{self.path}: error: Cannot decode file: {str(decodeerr)}" raise CompileError([err], module_with_blocker=self.id) from decodeerr - elif self.path and self.manager.fscache.isdir(self.path): + elif self.path and manager.fscache.isdir(self.path): source = "" self.source_hash = "" else: @@ -3192,7 +3186,7 @@ def get_source(self) -> str: self.time_spent_us += time_spent_us(t0) return source - def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None: + def parse_file_inner(self, source: str | None, raw_data: FileRawData | None = None) -> None: t0 = time_ref() self.tree = self.manager.parse_file( self.id, self.xpath, source, options=self.options, raw_data=raw_data @@ -3319,9 +3313,7 @@ def semantic_analysis_pass1(self) -> None: # # TODO: This should not be considered as a semantic analysis # pass -- it's an independent pass. - if not options.native_parser or not self.manager.fscache.exists( - self.xpath, real_only=True - ): + if not options.native_parser: analyzer = SemanticAnalyzerPreAnalysis() with self.wrap_context(): analyzer.visit_file(self.tree, self.xpath, self.id, options) diff --git a/mypy/checkstrformat.py b/mypy/checkstrformat.py index e96af007e29c..aba49d71b77e 100644 --- a/mypy/checkstrformat.py +++ b/mypy/checkstrformat.py @@ -587,7 +587,6 @@ def apply_field_accessors( module=None, options=self.chk.options, errors=temp_errors, - file_exists=False, eager=True, ) if temp_errors.is_errors(): diff --git a/mypy/fscache.py b/mypy/fscache.py index 75041633eb90..63fe5368a2a9 100644 --- a/mypy/fscache.py +++ b/mypy/fscache.py @@ -253,13 +253,10 @@ def isdir(self, path: str) -> bool: return False return stat.S_ISDIR(st.st_mode) - def exists(self, path: str, real_only: bool = False) -> bool: + def exists(self, path: str) -> bool: st = self.stat_or_none(path) if st is None: return False - if real_only: - dirname = os.path.dirname(path) - return dirname not in self.fake_package_cache return True def read(self, path: str) -> bytes: diff --git a/mypy/nativeparse.py b/mypy/nativeparse.py index d048e9bce65e..414426580fa7 100644 --- a/mypy/nativeparse.py +++ b/mypy/nativeparse.py @@ -182,7 +182,10 @@ def add_error( def native_parse( - filename: str, options: Options, skip_function_bodies: bool = False + filename: str, + options: Options, + source: str | bytes | None = None, + skip_function_bodies: bool = False, ) -> tuple[MypyFile, list[ParseError], TypeIgnores]: """Parse a Python file using the native Rust-based parser. @@ -211,7 +214,7 @@ def native_parse( uses_template_strings, source_hash, mypy_comments, - ) = parse_to_binary_ast(filename, options, skip_function_bodies) + ) = parse_to_binary_ast(filename, options, source, skip_function_bodies) node = MypyFile([], []) node.path = filename node.raw_data = FileRawData( @@ -248,7 +251,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]: def parse_to_binary_ast( - filename: str, options: Options, skip_function_bodies: bool = False + filename: str, + options: Options, + source: str | bytes | None = None, + skip_function_bodies: bool = False, ) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]: # This is a horrible hack to work around a mypyc bug where imported # module may be not ready in a thread sometimes. @@ -259,6 +265,7 @@ def parse_to_binary_ast( raise ImportError("Cannot import ast_serialize") ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse( filename, + source, skip_function_bodies=skip_function_bodies, python_version=options.python_version, platform=options.platform, diff --git a/mypy/parse.py b/mypy/parse.py index b0901a3a2455..a8fb5542a704 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -12,12 +12,11 @@ def parse( - source: str | bytes, + source: str | bytes | None, fnam: str, module: str | None, errors: Errors, options: Options, - file_exists: bool, eager: bool = False, ) -> MypyFile: """Parse a source file, without doing any semantic analysis. @@ -27,28 +26,29 @@ def parse( New parser returns empty tree with serialized data. To get the full tree and the parse errors, use eager=True. + + `source` must not be `None` if the old parser is used. The new parser will read and + parse contents from path `fnam` if `source` is `None`. """ if options.native_parser: - # Native parser only works with actual files on disk - # Fall back to fastparse for in-memory source or non-existent files - if file_exists: - import mypy.nativeparse - - ignore_errors = options.ignore_errors or fnam in errors.ignored_files - # If errors are ignored, we can drop many function bodies to speed up type checking. - strip_function_bodies = ignore_errors and not options.preserve_asts - tree, _, _ = mypy.nativeparse.native_parse( - fnam, options, skip_function_bodies=strip_function_bodies - ) - # Set is_stub based on file extension - tree.is_stub = fnam.endswith(".pyi") - # Note: tree.imports is populated directly by load_from_raw() with deserialized - # import metadata, so we don't need to collect imports via AST traversal - if eager and tree.raw_data is not None: - tree = load_from_raw(fnam, module, tree.raw_data, errors, options) - return tree - # Fall through to fastparse for non-existent files - + import mypy.nativeparse + + ignore_errors = options.ignore_errors or fnam in errors.ignored_files + # If errors are ignored, we can drop many function bodies to speed up type checking. + strip_function_bodies = ignore_errors and not options.preserve_asts + tree, _, _ = mypy.nativeparse.native_parse( + fnam, options, source, skip_function_bodies=strip_function_bodies + ) + # Set is_stub based on file extension + tree.is_stub = fnam.endswith(".pyi") + # Note: tree.imports is populated directly by load_from_raw() with deserialized + # import metadata, so we don't need to collect imports via AST traversal + if eager and tree.raw_data is not None: + tree = load_from_raw(fnam, module, tree.raw_data, errors, options) + return tree + + if source is None: + raise ValueError("Source cannot be `None` when using the old parser") if options.transform_source is not None: source = options.transform_source(source) import mypy.fastparse diff --git a/mypy/stubgen.py b/mypy/stubgen.py index 9c682ba4b820..9b0089b6aec0 100755 --- a/mypy/stubgen.py +++ b/mypy/stubgen.py @@ -1745,13 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None: source = mypy.util.decode_python_encoding(data) errors = Errors(mypy_options) mod.ast = mypy.parse.parse( - source, - fnam=mod.path, - module=mod.module, - errors=errors, - options=mypy_options, - file_exists=True, - eager=True, + source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True ) mod.ast._fullname = mod.module if errors.is_blockers(): diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index b50da5f5d02c..e0a0da29166b 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -98,7 +98,7 @@ def test_parser(testcase: DataDrivenTestCase) -> None: try: with temp_source(source) as fnam: - node, errors, type_ignores = native_parse(fnam, options, skip_function_bodies) + node, errors, type_ignores = native_parse(fnam, options, None, skip_function_bodies) errors += load_tree(node, options) node.path = "main" a = node.str_with_options(options).split("\n") @@ -234,7 +234,7 @@ def format_reachable_imports(node: MypyFile) -> list[str]: @unittest.skipUnless(has_nativeparse, "nativeparse not available") class TestNativeParserBinaryFormat(unittest.TestCase): - def test_trivial_binary_data(self) -> None: + def _assert_trivial_binary_data(self, b: bytes, /) -> None: # A quick sanity check to ensure the serialized data looks as expected. Only covers # a few AST nodes. @@ -250,9 +250,9 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> int_enc(end_column - start_column), ] - with temp_source("print('hello')") as fnam: - b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) - assert list(b) == ( + self.assertEqual( + list(b), + ( [LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR] + [nodes.NAME_EXPR, LITERAL_STR] + [int_enc(5)] @@ -269,7 +269,25 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> + [LIST_GEN, 22, LITERAL_NONE] + locs(1, 0, 1, 14) + [END_TAG, END_TAG] - ) + ), + ) + + def test_trivial_binary_data_from_file(self) -> None: + with temp_source("print('hello')") as fnam: + b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) + self._assert_trivial_binary_data(b) + + def test_trivial_binary_data_from_string_source(self) -> None: + b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), "print('hello')") + self._assert_trivial_binary_data(b) + + def test_trivial_binary_data_from_bytes_source(self) -> None: + b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), b"print('hello')") + self._assert_trivial_binary_data(b) + + def test_invalid_bytes_raises(self) -> None: + with self.assertRaises(UnicodeDecodeError): + parse_to_binary_ast("", Options(), b"\xff") @contextlib.contextmanager diff --git a/mypy/test/testparse.py b/mypy/test/testparse.py index 6d00f5b5710f..8f4de5bc7412 100644 --- a/mypy/test/testparse.py +++ b/mypy/test/testparse.py @@ -66,7 +66,6 @@ def test_parser(testcase: DataDrivenTestCase) -> None: module="__main__", errors=errors, options=options, - file_exists=False, eager=True, ) if errors.is_errors(): @@ -108,7 +107,6 @@ def test_parse_error(testcase: DataDrivenTestCase) -> None: "__main__", errors=errors, options=options, - file_exists=False, eager=True, ) if errors.is_errors():