Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 131 additions & 16 deletions mypyc/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,22 +566,114 @@ def construct_groups(
return groups


# Single regex that captures both `#include "foo"` and `#include <foo>`. The
# alternation lets us tell the two forms apart: the quoted-form match populates
# group 1 and the angle-form match populates group 2. The C preprocessor
# applies different search rules to each kind (see `resolve_cfile_deps`), so we
# carry the kind through resolution rather than collapsing them up front.
#
# Whitespace between `include` and the opening delimiter is optional in C
# (`#include<foo.h>` and `#include"foo.h"` are both valid), hence `\s*` rather
# than `\s+` after the keyword.
_INCLUDE_RE = re.compile(r'#\s*include\s*(?:"([^"]+)"|<([^>]+)>)')


def _extract_includes(contents: str) -> list[tuple[bool, str]]:
    """Return each `#include` directive's (is_angled, name) from `contents`.

    is_angled=False for `#include "foo"`, True for `#include <foo>`.
    Directives are returned in source order and duplicates are kept.
    """
    # `findall` yields one (quoted, angled) pair per directive; exactly one of
    # the two groups is non-empty because both use `+`, so an empty `quoted`
    # means the angle-bracket alternative matched.
    return [
        (not quoted, quoted or angled)
        for quoted, angled in _INCLUDE_RE.findall(contents)
    ]


def get_header_deps(cfiles: list[tuple[str, str]]) -> list[tuple[bool, str]]:
    """Find all the headers directly included by a group of cfiles.

    Returns a sorted, deduplicated list of `(is_angled, header_name)` pairs.
    Callers that only need the names can ignore the bool, but it is needed by
    `resolve_cfile_deps` to apply the correct preprocessor search order.

    We do this by just regexping the source, which is a bit simpler than
    properly plumbing the data through. Only direct includes are reported
    here; transitive header-to-header includes are picked up by
    `resolve_cfile_deps`, which reads the headers from disk.

    Arguments:
        cfiles: A list of (file name, file contents) pairs.
    """
    headers: set[tuple[bool, str]] = set()
    for _, contents in cfiles:
        headers.update(_extract_includes(contents))

    # Tuples sort with quoted-form (False) entries ahead of angled (True) ones.
    return sorted(headers)


def resolve_cfile_deps(
    cfile_dir: str, direct_includes: list[tuple[bool, str]], target_dir: str
) -> set[str]:
    """Resolve a .c file's `#include` directives to on-disk paths, walking
    transitively through resolved headers.

    The C preprocessor resolves `#include "foo"` against the includer's
    directory first, then via -I, while `#include <foo>` only uses -I. We
    mirror that: quoted includes are searched in (includer_dir, target_dir)
    order, and angled includes are searched in target_dir only. Anything that
    does not resolve to a regular file under those directories is dropped,
    which is how headers that live elsewhere (e.g. `<Python.h>`) are filtered
    out of the result.

    Only regular files count as a match. A directory that happens to share
    the include's name is skipped and the search moves on to the next base,
    so it can neither shadow the real header nor end up in the returned set.

    The walk is transitive: each resolved `.h` file is opened and scanned for
    its own `#include` directives, which are then resolved relative to that
    header's directory. Without this, headers reached only through another
    header would be missing from the dependency set, and edits to them would
    not trigger a recompile of the including .c file.

    Arguments:
        cfile_dir: Directory containing the .c file; the first search base
            for its quoted-form includes.
        direct_includes: `(is_angled, header_name)` pairs for the .c file's
            own `#include` directives, as produced by `get_header_deps`.
        target_dir: The include root searched for both include forms.

    Returns a set of normalized paths suitable for use as an
    Extension.depends list.
    """
    resolved: set[str] = set()
    # Worklist of (search_dir, is_angled, header_name). search_dir is the
    # includer's directory: for the initial cfile it is the cfile's dir, for
    # a transitively-included header it is that header's dir. It is only
    # consulted for quoted-form includes.
    worklist: list[tuple[str, bool, str]] = [
        (cfile_dir, is_angled, dep) for is_angled, dep in direct_includes
    ]
    while worklist:
        search_dir, is_angled, dep = worklist.pop()
        # Quoted form: includer's dir first, then -I (target_dir).
        # Angled form: -I only (skips the includer's dir).
        search_bases = (target_dir,) if is_angled else (search_dir, target_dir)
        for base in search_bases:
            candidate = os.path.normpath(os.path.join(base, dep))
            # isfile (not exists): a same-named directory is not a header, so
            # keep looking in the next search base instead of stopping here.
            if not os.path.isfile(candidate):
                continue
            if candidate not in resolved:
                resolved.add(candidate)
                # Recurse only into headers; other included files (e.g. a
                # `#include "init.c"`) are recorded but not scanned.
                if candidate.endswith(".h"):
                    try:
                        with open(candidate, encoding="utf-8", errors="replace") as f:
                            header_contents = f.read()
                    except OSError:
                        # Best effort: an unreadable header is still a
                        # dependency, we just cannot follow its includes.
                        header_contents = ""
                    sub_dir = os.path.dirname(candidate)
                    for sub_angled, sub in _extract_includes(header_contents):
                        worklist.append((sub_dir, sub_angled, sub))
            # First regular-file hit wins, whether or not it was already
            # known; later search bases are not consulted.
            break
    return resolved


def mypyc_build(
paths: list[str],
compiler_options: CompilerOptions,
Expand Down Expand Up @@ -630,11 +722,17 @@ def mypyc_build(
for (path, dirs, internal) in skip_cgen_input[1]
]

# Write out the generated C and collect the files for each group
# Write out the generated C and collect the files for each group.
# Should this be here??
group_cfilenames: list[tuple[list[str], list[str]]] = []
# Per-group list of (full_cfile_path, raw_include_targets). Resolution is
# deferred until every group has written its files: a header in one group
# may include a header generated by another group, so resolving immediately
# would miss cross-group deps for groups processed first.
pending: list[list[tuple[str, list[tuple[bool, str]]]]] = []
for cfiles in group_cfiles:
cfilenames = []
per_cfile_deps: list[tuple[str, list[tuple[bool, str]]]] = []
for cfile, ctext in cfiles:
cfile = os.path.join(compiler_options.target_dir, cfile)
# Empty contents marks a file the previous run already wrote
Expand All @@ -643,16 +741,33 @@ def mypyc_build(
write_file(cfile, ctext)
if os.path.splitext(cfile)[1] == ".c":
cfilenames.append(cfile)

# The header regex matches both quote styles, so the result can
# include system headers like `<Python.h>` that don't live under
# target_dir. Joining those produces non-existent paths which
# would force a full rebuild on every run via Extension.depends.
candidate_deps = (
os.path.join(compiler_options.target_dir, dep) for dep in get_header_deps(cfiles)
)
deps = [d for d in candidate_deps if os.path.exists(d)]
group_cfilenames.append((cfilenames, deps))
# For fully-cached groups ctext is empty; read the on-disk .c so
# the dep resolver can walk its transitive header chain and populate
# Extension.depends — otherwise cross-group export-table header
# changes (e.g. a new class shifting struct offsets) won't trigger
# a recompile of this cached consumer's .o.
if not ctext and os.path.exists(cfile):
try:
with open(cfile, encoding="utf-8") as _f:
ctext = _f.read()
except OSError:
pass
per_cfile_deps.append((cfile, get_header_deps([(cfile, ctext)])))
pending.append(per_cfile_deps)
group_cfilenames.append((cfilenames, []))

# Second pass: resolve each .c file's deps transitively now that every
# group's headers are on disk. See resolve_cfile_deps for the rules.
for i, per_cfile in enumerate(pending):
deps_set: set[str] = set()
for cfile_full, dep_names in per_cfile:
deps_set.update(
resolve_cfile_deps(
os.path.dirname(cfile_full), dep_names, compiler_options.target_dir
)
)
cfilenames, _ = group_cfilenames[i]
group_cfilenames[i] = (cfilenames, sorted(deps_set))

return groups, group_cfilenames, source_deps

Expand Down
Loading
Loading