# Model/tests/test_lambda_packaging.py
# (file-viewer metadata: 267 lines, 10 KiB, Python)

"""Static packaging linter for Lambda container images.
Every Lambda here ships as a Docker image that copies a *subset* of the repo
(``COPY utils/ utils/``, ``COPY backend/ backend/``, ...) and then runs a
handler via ``CMD ["<module>.handler"]``. If the handler's import graph reaches
a top-level package the Dockerfile forgot to ``COPY``, the function dies at cold
start with ``Runtime.ImportModuleError: No module named '<pkg>'`` — but only
*inside the image*. In the dev/test tree every package is present, so a plain
``import`` test can't see the gap. This is exactly how ``No module named
'domain'`` reached a deployed address2UPRN.
The RIE smoke tests (.github/workflows/_smoke_test_lambda.yml) catch this too,
but only by building the full image (minutes) and only for hand-listed services.
This test catches the same class of bug in milliseconds, locally, for *every*
handler Dockerfile — by statically computing each handler's import-time module
graph and asserting every repo file it reaches is copied into the image.
Scope: import-time (module-level) imports only — the ones that run at Lambda
init, which is what ImportModuleError is about. Imports inside function bodies
and under ``if TYPE_CHECKING:`` are deliberately ignored. Third-party / stdlib
imports are out of scope (that's requirements.txt's job, covered by the RIE
smoke test actually installing and importing).
"""
from __future__ import annotations
import ast
import json
import re
from pathlib import Path
from typing import Optional
import pytest
# Parent of the directory that contains this test module; every path in this
# file (imports, Dockerfiles, COPY sources) is resolved relative to it.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Dockerfiles that are not Lambda handlers (test harness, dev containers).
_SKIP_DOCKERFILES = {"Dockerfile.test", "Dockerfile.test.dockerignore"}
# A Dockerfile is ignored by discovery when any component of its
# repo-relative path is one of these names.
_SKIP_PARTS = {".git", "node_modules", ".devcontainer"}
def _toplevel_names() -> set[str]:
    """Return the names importable from the repo root.

    Every non-hidden directory directly under ``REPO_ROOT`` contributes its
    name and every ``*.py`` file contributes its stem. Handler imports are
    absolute (``domain.x``, ``backend.y``), so this is the namespace their
    first component is matched against.
    """
    discovered: set[str] = set()
    for entry in REPO_ROOT.iterdir():
        hidden_or_cache = entry.name.startswith(".") or entry.name == "__pycache__"
        if hidden_or_cache:
            continue
        if entry.is_dir():
            discovered.add(entry.name)
            continue
        if entry.suffix == ".py":
            discovered.add(entry.stem)
    return discovered
# Computed once at import time; used to tell repo-local imports apart from
# stdlib / third-party ones by their first dotted component.
_TOP = _toplevel_names()
def _is_type_checking(test: ast.expr) -> bool:
if isinstance(test, ast.Name):
return test.id == "TYPE_CHECKING"
if isinstance(test, ast.Attribute):
return test.attr == "TYPE_CHECKING"
return False
def _import_time_imports(path: Path) -> list[str]:
"""Absolute module names imported when ``path`` is imported (i.e. at Lambda
init). Descends into module-level if/try/with and class bodies, but not into
function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed)."""
try:
tree = ast.parse(path.read_text(encoding="utf-8"), str(path))
except (SyntaxError, UnicodeDecodeError):
return []
out: list[str] = []
def visit(stmts: list[ast.stmt]) -> None:
for node in stmts:
if isinstance(node, ast.Import):
out.extend(alias.name for alias in node.names)
elif isinstance(node, ast.ImportFrom):
if not node.level and node.module: # absolute imports only
out.append(node.module)
elif isinstance(node, ast.If):
if _is_type_checking(node.test):
continue
visit(node.body)
visit(node.orelse)
elif isinstance(node, ast.Try):
visit(node.body)
visit(node.orelse)
visit(node.finalbody)
for handler in node.handlers:
visit(handler.body)
elif isinstance(node, ast.With):
visit(node.body)
elif isinstance(node, ast.ClassDef):
visit(node.body)
# FunctionDef / AsyncFunctionDef bodies are intentionally skipped.
visit(tree.body)
return out
def _module_to_file(module: str) -> Optional[Path]:
    """Find the repo source file behind a dotted module name.

    ``foo.bar`` maps to ``foo/bar.py`` if that file exists, otherwise to
    ``foo/bar/__init__.py`` if that exists, otherwise ``None``.
    """
    stem = REPO_ROOT.joinpath(*module.split("."))
    # Order matters: the plain module file is preferred over the package.
    candidates = (stem.with_suffix(".py"), stem / "__init__.py")
    return next((found for found in candidates if found.is_file()), None)
def _import_closure(start: Path) -> dict[Path, Optional[Path]]:
    """Compute every repo file reachable from ``start`` at import time.

    Returns a mapping from each reached file to the file whose import first
    pulled it in (``None`` for ``start`` itself), so that failure messages can
    say who is to blame. Traversal is depth-first using an explicit stack.
    """
    first_importer: dict[Path, Optional[Path]] = {}
    pending: list[tuple[Path, Optional[Path]]] = [(start, None)]
    while pending:
        current, blamed = pending.pop()
        if current in first_importer:
            continue
        first_importer[current] = blamed
        for dotted in _import_time_imports(current):
            top_level = dotted.partition(".")[0]
            if top_level not in _TOP:
                # stdlib / third-party — not our concern here
                continue
            resolved = _module_to_file(dotted)
            if resolved is None or resolved in first_importer:
                continue
            pending.append((resolved, current))
    return first_importer
def _norm(path_token: str) -> str:
return path_token.lstrip("./").rstrip("/")
def _parse_handler_spec(dockerfile_text: str) -> Optional[str]:
"""The ``<module>.handler`` string from the ``CMD`` line, or None if this
isn't a Lambda handler image."""
match = re.search(r"^CMD\s+(\[.*\]|.+)$", dockerfile_text, re.MULTILINE)
if not match:
return None
raw = match.group(1).strip()
try:
parsed = json.loads(raw)
spec = parsed[0] if isinstance(parsed, list) and parsed else raw
except json.JSONDecodeError:
spec = raw.strip('"')
return spec if isinstance(spec, str) and spec.endswith(".handler") else None
def _parse_copies(dockerfile_text: str) -> list[tuple[list[str], str]]:
"""``COPY`` instructions as (sources, dest), dropping ``--flag`` tokens."""
copies: list[tuple[list[str], str]] = []
for match in re.finditer(r"^COPY\s+(.+)$", dockerfile_text, re.MULTILINE):
tokens = [t for t in match.group(1).split() if not t.startswith("--")]
if len(tokens) < 2:
continue
*sources, dest = tokens
copies.append((sources, dest))
return copies
def _resolve_handler_file(
    spec: str, copies: list[tuple[list[str], str]]
) -> Optional[Path]:
    """Locate the repo source file that backs a ``CMD`` handler spec.

    The spec is split at its last dot into a module path and a function name.
    Two layouts are recognised:

    * in-place: the dotted module path exists as a file under the repo root
      (``backend.foo.handler`` -> ``backend/foo.py``);
    * root-placed: a ``COPY`` puts a repo file at the image root under the
      module's name, either by naming the destination file
      (``COPY <src> /var/task/main.py``) or by copying a same-named file into
      a directory destination (``COPY <src>/main.py .``).

    Returns ``None`` when neither layout matches.
    """
    module_path, _entrypoint = spec.rsplit(".", 1)

    in_place = REPO_ROOT / (module_path.replace(".", "/") + ".py")
    if in_place.is_file():
        return in_place

    # Root-placed module: find the COPY whose destination basename matches.
    leaf = module_path.split("/")[-1]
    wanted = leaf + ".py"
    for sources, dest in copies:
        dest_norm = _norm(dest)
        names_the_file = Path(dest_norm).name == wanted
        lands_in_dir = dest.endswith("/") or dest_norm in ("", leaf)
        for src in sources:
            candidate = REPO_ROOT / _norm(src)
            if not candidate.is_file():
                continue
            if names_the_file:
                return candidate
            if lands_in_dir and candidate.name == wanted:
                return candidate
    return None
def _is_copied(rel_path: str, copies: list[tuple[list[str], str]]) -> bool:
    """Report whether some ``COPY`` source covers a repo-relative file path.

    A source covers the path when, after normalisation, it is empty (the whole
    build context), equal to the path, or a directory prefix of it. COPY
    destinations are not consulted.
    """
    target = _norm(rel_path)
    return any(
        root in ("", target) or target.startswith(root + "/")
        for sources, _dest in copies
        for root in map(_norm, sources)
    )
def _discover_handler_dockerfiles() -> list[Path]:
    """Find every Dockerfile under the repo root that runs a Lambda handler.

    Any path whose name contains ``Dockerfile`` is considered, except names
    listed in ``_SKIP_DOCKERFILES`` and paths passing through a directory in
    ``_SKIP_PARTS``. A candidate is kept when ``_parse_handler_spec`` finds a
    handler ``CMD`` in it.

    Unreadable candidates are skipped rather than raised. That covers
    directories and permission errors (``OSError``) as well as binary files
    that merely match the glob, such as editor swap files
    (``UnicodeDecodeError``). This function runs at import time, so an
    exception here would abort collection of the whole test module.

    Returns:
        The matching Dockerfile paths, sorted.
    """
    found: list[Path] = []
    for path in REPO_ROOT.rglob("*Dockerfile*"):
        if path.name in _SKIP_DOCKERFILES:
            continue
        if any(part in _SKIP_PARTS for part in path.relative_to(REPO_ROOT).parts):
            continue
        try:
            text = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        if _parse_handler_spec(text):
            found.append(path)
    return sorted(found)
# Discovered once at import (collection) time so the list can be passed to
# ``pytest.mark.parametrize``.
_HANDLER_DOCKERFILES = _discover_handler_dockerfiles()
def test_handler_dockerfiles_discovered() -> None:
    """Fail loudly if discovery found no handler Dockerfiles.

    An empty discovery result (for example after Dockerfiles are renamed)
    would leave the parametrized packaging check with nothing to run, so it
    would pass vacuously.
    """
    discovered_any = bool(_HANDLER_DOCKERFILES)
    assert discovered_any, "no Lambda handler Dockerfiles found under repo root"
@pytest.mark.parametrize(
    "dockerfile",
    _HANDLER_DOCKERFILES,
    ids=[str(p.relative_to(REPO_ROOT)) for p in _HANDLER_DOCKERFILES],
)
def test_lambda_image_copies_full_import_closure(dockerfile: Path) -> None:
    """Every repo file the handler imports at init must be COPYed into the image.

    For the given handler Dockerfile this resolves the ``CMD`` handler to its
    source file, computes the import-time closure of that file, and fails with
    a list of every reached file (and the file that imported it) that no
    ``COPY`` source covers.
    """
    text = dockerfile.read_text(encoding="utf-8")
    spec = _parse_handler_spec(text)
    assert spec is not None  # discovery guaranteed this
    copies = _parse_copies(text)
    handler_file = _resolve_handler_file(spec, copies)
    assert handler_file is not None, (
        f"{dockerfile.relative_to(REPO_ROOT)}: could not locate the source file "
        f"for CMD handler {spec!r}. Update _resolve_handler_file if this is a new "
        f"handler layout."
    )
    missing: list[str] = []
    for reached, importer in _import_closure(handler_file).items():
        # Dockerfile paths always use forward slashes, so compare in POSIX
        # form; ``str()`` would yield backslashes on Windows and never match.
        rel = reached.relative_to(REPO_ROOT).as_posix()
        if not _is_copied(rel, copies):
            blame = (
                str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)"
            )
            missing.append(f"  - {rel}\n      imported by {blame}")
    assert not missing, (
        f"{dockerfile.relative_to(REPO_ROOT)} runs `{spec}` but does not COPY "
        f"{len(missing)} file(s) it imports at init. The Lambda will fail at cold "
        f"start with Runtime.ImportModuleError. Add the missing top-level "
        f"package(s) as `COPY <pkg>/ <pkg>/`:\n" + "\n".join(sorted(missing))
    )