diff --git a/misc/dump-ast.py b/misc/dump-ast.py index 68ea8bc0dc61..7fdf905bae0b 100755 --- a/misc/dump-ast.py +++ b/misc/dump-ast.py @@ -19,7 +19,7 @@ def dump(fname: str, python_version: tuple[int, int], quiet: bool = False) -> No options.python_version = python_version with open(fname, "rb") as f: s = f.read() - tree = parse(s, fname, None, errors=Errors(options), options=options, file_exists=True) + tree = parse(s, fname, None, errors=Errors(options), options=options) if not quiet: print(tree) diff --git a/mypy/build.py b/mypy/build.py index 8d5db0bab8df..be2ab5d9d709 100644 --- a/mypy/build.py +++ b/mypy/build.py @@ -163,7 +163,7 @@ from mypy.modules_state import modules_state from mypy.nodes import Expression from mypy.options import Options -from mypy.parse import load_from_raw, parse +from mypy.parse import load_from_raw, parse, parse_native from mypy.plugin import ChainedPlugin, Plugin, ReportConfigContext from mypy.plugins.default import DefaultPlugin from mypy.renaming import LimitedVariableRenameVisitor, VariableRenameVisitor @@ -1024,27 +1024,19 @@ def parse_all(self, states: list[State], post_parse: bool = True) -> None: self.post_parse_all(states) return - sequential_states = [] parallel_states = [] for state in states: if state.tree is not None: # The file was already parsed. - continue - if not self.fscache.exists(state.xpath, real_only=True): - # New parser only supports parsing on-disk files. - sequential_states.append(state) + state.needs_parse = False continue parallel_states.append(state) if len(parallel_states) > 1: - self.parse_parallel(sequential_states, parallel_states) - else: - # Avoid using executor when there is no parallelism. - for state in states: - state.parse_file() + self.parse_parallel(parallel_states) if post_parse: self.post_parse_all(states) - def parse_parallel(self, sequential_states: list[State], parallel_states: list[State]) -> None: + def parse_parallel(self, parallel_states: list[State]) -> None: """Perform parallel parsing of states. Note: this duplicates a bit of logic from State.parse_file(). This is done @@ -1052,7 +1044,7 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S parallelized efficiently. """ parallel_parsed_states, parallel_parsed_states_set = self.parse_files_threaded_raw( - sequential_states, parallel_states + parallel_states ) for state in parallel_parsed_states: @@ -1097,12 +1089,9 @@ def parse_parallel(self, sequential_states: list[State], parallel_states: list[S state.check_blockers() state.setup_errors() - def parse_files_threaded_raw( - self, sequential_states: list[State], parallel_states: list[State] - ) -> tuple[list[State], set[State]]: - """Parse files using a thread pool. + def parse_files_threaded_raw(self, states: list[State]) -> tuple[list[State], set[State]]: + """Parse files in parallel using a thread pool. - Also parse sequential states while waiting for the parallel results. Trees from the new parser are left in raw (serialized) form. Return (list, set) of states that were actually parsed (not cached). @@ -1118,14 +1107,16 @@ def parse_files_threaded_raw( # parse_file_inner() results in no visible improvement with more than 8 threads. # TODO: reuse thread pool and/or batch small files in single submit() call. with ThreadPoolExecutor(max_workers=min(available_threads, 8)) as executor: - for state in parallel_states: + for state in states: state.needs_parse = False if state.id not in self.ast_cache: self.log(f"Parsing {state.xpath} ({state.id})") ignore_errors = state.ignore_all or state.options.ignore_errors if ignore_errors: self.errors.ignored_files.add(state.xpath) - futures.append(executor.submit(state.parse_file_inner, "")) + futures.append( + executor.submit(state.parse_file_inner, state.source, parallel=True) + ) parallel_parsed_states.append(state) parallel_parsed_states_set.add(state) else: @@ -1133,10 +1124,6 @@ def parse_files_threaded_raw( state.tree, state.early_errors, source_hash = self.ast_cache[state.id] state.source_hash = source_hash - # Parse sequential before waiting on parallel. - for state in sequential_states: - state.parse_file() - for fut in wait(futures).done: fut.result() @@ -1279,21 +1266,32 @@ def parse_file( self, id: str, path: str, - source: str, + source: str | None, options: Options, raw_data: FileRawData | None = None, + parallel: bool = False, ) -> MypyFile: """Parse the source of a file with the given name. Raise CompileError if there is a parse error. """ - file_exists = self.fscache.exists(path, real_only=True) t0 = time.time() if raw_data: # If possible, deserialize from known binary data instead of parsing from scratch. tree = load_from_raw(path, id, raw_data, self.errors, options) else: - tree = parse(source, path, id, self.errors, options=options, file_exists=file_exists) + if source is not None: + tree = parse(source, path, id, self.errors, options=options) + else: + assert parallel + if not os.path.exists(path): + build_error( + "Cannot read file '{}': {}".format( + path.replace(os.getcwd() + os.sep, ""), + os.strerror(2), # `errno.ENOENT` + ) + ) + tree = parse_native(source, path, id, self.errors, options=options) tree._fullname = id if self.stats_enabled: with self.stats_lock: @@ -3192,10 +3190,12 @@ def get_source(self) -> str: self.time_spent_us += time_spent_us(t0) return source - def parse_file_inner(self, source: str, raw_data: FileRawData | None = None) -> None: + def parse_file_inner( + self, source: str | None, raw_data: FileRawData | None = None, parallel: bool = False + ) -> None: t0 = time_ref() self.tree = self.manager.parse_file( - self.id, self.xpath, source, options=self.options, raw_data=raw_data + self.id, self.xpath, source, self.options, raw_data, parallel ) self.time_spent_us += time_spent_us(t0) @@ -3319,9 +3319,7 @@ def semantic_analysis_pass1(self) -> None: # # TODO: This should not be considered as a semantic analysis # pass -- it's an independent pass. - if not options.native_parser or not self.manager.fscache.exists( - self.xpath, real_only=True - ): + if not options.native_parser: analyzer = SemanticAnalyzerPreAnalysis() with self.wrap_context(): analyzer.visit_file(self.tree, self.xpath, self.id, options) diff --git a/mypy/checkstrformat.py b/mypy/checkstrformat.py index e96af007e29c..aba49d71b77e 100644 --- a/mypy/checkstrformat.py +++ b/mypy/checkstrformat.py @@ -587,7 +587,6 @@ def apply_field_accessors( module=None, options=self.chk.options, errors=temp_errors, - file_exists=False, eager=True, ) if temp_errors.is_errors(): diff --git a/mypy/fscache.py b/mypy/fscache.py index 75041633eb90..63fe5368a2a9 100644 --- a/mypy/fscache.py +++ b/mypy/fscache.py @@ -253,13 +253,10 @@ def isdir(self, path: str) -> bool: return False return stat.S_ISDIR(st.st_mode) - def exists(self, path: str, real_only: bool = False) -> bool: + def exists(self, path: str) -> bool: st = self.stat_or_none(path) if st is None: return False - if real_only: - dirname = os.path.dirname(path) - return dirname not in self.fake_package_cache return True def read(self, path: str) -> bytes: diff --git a/mypy/nativeparse.py b/mypy/nativeparse.py index d048e9bce65e..414426580fa7 100644 --- a/mypy/nativeparse.py +++ b/mypy/nativeparse.py @@ -182,7 +182,10 @@ def add_error( def native_parse( - filename: str, options: Options, skip_function_bodies: bool = False + filename: str, + options: Options, + source: str | bytes | None = None, + skip_function_bodies: bool = False, ) -> tuple[MypyFile, list[ParseError], TypeIgnores]: """Parse a Python file using the native Rust-based parser. @@ -211,7 +214,7 @@ def native_parse( uses_template_strings, source_hash, mypy_comments, - ) = parse_to_binary_ast(filename, options, skip_function_bodies) + ) = parse_to_binary_ast(filename, options, source, skip_function_bodies) node = MypyFile([], []) node.path = filename node.raw_data = FileRawData( @@ -248,7 +251,10 @@ def read_statements(state: State, data: ReadBuffer, n: int) -> list[Statement]: def parse_to_binary_ast( - filename: str, options: Options, skip_function_bodies: bool = False + filename: str, + options: Options, + source: str | bytes | None = None, + skip_function_bodies: bool = False, ) -> tuple[bytes, list[ParseError], TypeIgnores, bytes, bool, bool, str, list[tuple[int, str]]]: # This is a horrible hack to work around a mypyc bug where imported # module may be not ready in a thread sometimes. @@ -259,6 +265,7 @@ def parse_to_binary_ast( raise ImportError("Cannot import ast_serialize") ast_bytes, errors, ignores, import_bytes, ast_data = ast_serialize.parse( filename, + source, skip_function_bodies=skip_function_bodies, python_version=options.python_version, platform=options.platform, diff --git a/mypy/parse.py b/mypy/parse.py index b0901a3a2455..515dd57257b9 100644 --- a/mypy/parse.py +++ b/mypy/parse.py @@ -17,7 +17,6 @@ def parse( module: str | None, errors: Errors, options: Options, - file_exists: bool, eager: bool = False, ) -> MypyFile: """Parse a source file, without doing any semantic analysis. @@ -29,25 +28,7 @@ def parse( the parse errors, use eager=True. """ if options.native_parser: - # Native parser only works with actual files on disk - # Fall back to fastparse for in-memory source or non-existent files - if file_exists: - import mypy.nativeparse - - ignore_errors = options.ignore_errors or fnam in errors.ignored_files - # If errors are ignored, we can drop many function bodies to speed up type checking. - strip_function_bodies = ignore_errors and not options.preserve_asts - tree, _, _ = mypy.nativeparse.native_parse( - fnam, options, skip_function_bodies=strip_function_bodies - ) - # Set is_stub based on file extension - tree.is_stub = fnam.endswith(".pyi") - # Note: tree.imports is populated directly by load_from_raw() with deserialized - # import metadata, so we don't need to collect imports via AST traversal - if eager and tree.raw_data is not None: - tree = load_from_raw(fnam, module, tree.raw_data, errors, options) - return tree - # Fall through to fastparse for non-existent files + return parse_native(source, fnam, module, errors, options, eager) if options.transform_source is not None: source = options.transform_source(source) @@ -102,6 +83,31 @@ def load_from_raw( return tree +def parse_native( + source: str | bytes | None, + fnam: str, + module: str | None, + errors: Errors, + options: Options, + eager: bool = False, +) -> MypyFile: + import mypy.nativeparse + + ignore_errors = options.ignore_errors or fnam in errors.ignored_files + # If errors are ignored, we can drop many function bodies to speed up type checking. + strip_function_bodies = ignore_errors and not options.preserve_asts + tree, _, _ = mypy.nativeparse.native_parse( + fnam, options, source, skip_function_bodies=strip_function_bodies + ) + # Set is_stub based on file extension + tree.is_stub = fnam.endswith(".pyi") + # Note: tree.imports is populated directly by load_from_raw() with deserialized + # import metadata, so we don't need to collect imports via AST traversal + if eager and tree.raw_data is not None: + tree = load_from_raw(fnam, module, tree.raw_data, errors, options) + return tree + + def report_parse_error(error: ParseError, errors: Errors) -> None: message = error["message"] # Standardize error message by capitalizing the first word diff --git a/mypy/stubgen.py b/mypy/stubgen.py index 9c682ba4b820..9b0089b6aec0 100755 --- a/mypy/stubgen.py +++ b/mypy/stubgen.py @@ -1745,13 +1745,7 @@ def parse_source_file(mod: StubSource, mypy_options: MypyOptions) -> None: source = mypy.util.decode_python_encoding(data) errors = Errors(mypy_options) mod.ast = mypy.parse.parse( - source, - fnam=mod.path, - module=mod.module, - errors=errors, - options=mypy_options, - file_exists=True, - eager=True, + source, fnam=mod.path, module=mod.module, errors=errors, options=mypy_options, eager=True ) mod.ast._fullname = mod.module if errors.is_blockers(): diff --git a/mypy/test/test_nativeparse.py b/mypy/test/test_nativeparse.py index b50da5f5d02c..e0a0da29166b 100644 --- a/mypy/test/test_nativeparse.py +++ b/mypy/test/test_nativeparse.py @@ -98,7 +98,7 @@ def test_parser(testcase: DataDrivenTestCase) -> None: try: with temp_source(source) as fnam: - node, errors, type_ignores = native_parse(fnam, options, skip_function_bodies) + node, errors, type_ignores = native_parse(fnam, options, None, skip_function_bodies) errors += load_tree(node, options) node.path = "main" a = node.str_with_options(options).split("\n") @@ -234,7 +234,7 @@ def format_reachable_imports(node: MypyFile) -> list[str]: @unittest.skipUnless(has_nativeparse, "nativeparse not available") class TestNativeParserBinaryFormat(unittest.TestCase): - def test_trivial_binary_data(self) -> None: + def _assert_trivial_binary_data(self, b: bytes, /) -> None: # A quick sanity check to ensure the serialized data looks as expected. Only covers # a few AST nodes. @@ -250,9 +250,9 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> int_enc(end_column - start_column), ] - with temp_source("print('hello')") as fnam: - b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) - assert list(b) == ( + self.assertEqual( + list(b), + ( [LITERAL_INT, 22, nodes.EXPR_STMT, nodes.CALL_EXPR] + [nodes.NAME_EXPR, LITERAL_STR] + [int_enc(5)] @@ -269,7 +269,25 @@ def locs(start_line: int, start_column: int, end_line: int, end_column: int) -> + [LIST_GEN, 22, LITERAL_NONE] + locs(1, 0, 1, 14) + [END_TAG, END_TAG] - ) + ), + ) + + def test_trivial_binary_data_from_file(self) -> None: + with temp_source("print('hello')") as fnam: + b, _, _, _, _, _, _, _ = parse_to_binary_ast(fnam, Options()) + self._assert_trivial_binary_data(b) + + def test_trivial_binary_data_from_string_source(self) -> None: + b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), "print('hello')") + self._assert_trivial_binary_data(b) + + def test_trivial_binary_data_from_bytes_source(self) -> None: + b, _, _, _, _, _, _, _ = parse_to_binary_ast("", Options(), b"print('hello')") + self._assert_trivial_binary_data(b) + + def test_invalid_bytes_raises(self) -> None: + with self.assertRaises(UnicodeDecodeError): + parse_to_binary_ast("", Options(), b"\xff") @contextlib.contextmanager diff --git a/mypy/test/testparse.py b/mypy/test/testparse.py index 6d00f5b5710f..8f4de5bc7412 100644 --- a/mypy/test/testparse.py +++ b/mypy/test/testparse.py @@ -66,7 +66,6 @@ def test_parser(testcase: DataDrivenTestCase) -> None: module="__main__", errors=errors, options=options, - file_exists=False, eager=True, ) if errors.is_errors(): @@ -108,7 +107,6 @@ def test_parse_error(testcase: DataDrivenTestCase) -> None: "__main__", errors=errors, options=options, - file_exists=False, eager=True, ) if errors.is_errors(): diff --git a/test-data/unit/cmdline.test b/test-data/unit/cmdline.test index cfba7a81e928..04719ea81eac 100644 --- a/test-data/unit/cmdline.test +++ b/test-data/unit/cmdline.test @@ -593,6 +593,7 @@ import d [case testPackageRootMultipleParallel] # cmd: mypy --package-root=a/ --package-root=./ a/b/c.py d.py main.py --num-workers=2 +[file a/b/__init__.py] [file a/b/c.py] [file d.py] [file main.py]