From 4e02eb7c77f1be1c38456fa02f4683f446a68fbb Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 2 Jun 2026 15:03:20 +0000 Subject: [PATCH] more tests to ensure we don't deploy something that is brokern --- applications/ara_first_run/Dockerfile | 3 + backend/address2UPRN/handler/Dockerfile | 3 + backend/docker/engine.Dockerfile | 3 + backend/ecmk_fetcher/handler/Dockerfile | 6 + backend/pashub_fetcher/handler/Dockerfile | 6 + tests/test_lambda_packaging.py | 267 ++++++++++++++++++++++ 6 files changed, 288 insertions(+) create mode 100644 tests/test_lambda_packaging.py diff --git a/applications/ara_first_run/Dockerfile b/applications/ara_first_run/Dockerfile index 2d3f6515..894f7ee1 100644 --- a/applications/ara_first_run/Dockerfile +++ b/applications/ara_first_run/Dockerfile @@ -25,6 +25,9 @@ COPY infrastructure/ infrastructure/ COPY orchestration/ orchestration/ COPY repositories/ repositories/ COPY utilities/ utilities/ +# domain.property.site_notes -> datatypes.epc.domain.epc_property_data; without +# this the lambda fails at init with "No module named 'datatypes'". +COPY datatypes/ datatypes/ COPY applications/ applications/ # Place the handler at the Lambda task root so the runtime can resolve diff --git a/backend/address2UPRN/handler/Dockerfile b/backend/address2UPRN/handler/Dockerfile index ffecf54b..017f260f 100644 --- a/backend/address2UPRN/handler/Dockerfile +++ b/backend/address2UPRN/handler/Dockerfile @@ -35,6 +35,9 @@ COPY datatypes/ datatypes/ # main.py imports infrastructure.epc_client.epc_client_service (EpcClientService); # without this the lambda fails at init with "No module named 'infrastructure'". COPY infrastructure/ infrastructure/ +# EpcClientService -> datatypes.epc.domain.mapper -> domain.sap10_calculator; +# without this the lambda fails at init with "No module named 'domain'". +COPY domain/ domain/ # Copy the handler COPY backend/address2UPRN/main.py . diff --git a/backend/docker/engine.Dockerfile b/backend/docker/engine.Dockerfile index 07f2d859..58ccd481 100644 --- a/backend/docker/engine.Dockerfile +++ b/backend/docker/engine.Dockerfile @@ -40,6 +40,9 @@ COPY --from=build-image /usr/local/lib/python3.11/site-packages/ /usr/local/lib/ COPY ./backend/ ./backend COPY ./recommendations/ ./recommendations COPY ./utils/ ./utils/ +# engine.py -> backend.apis.GoogleSolarApi -> infrastructure.solar; without this +# the lambda fails at init with "No module named 'infrastructure'". +COPY ./infrastructure/ ./infrastructure/ COPY ./etl/epc/ ./etl/epc/ COPY ./etl/epc_clean/ ./etl/epc_clean/ COPY ./etl/bill_savings/ ./etl/bill_savings/ diff --git a/backend/ecmk_fetcher/handler/Dockerfile b/backend/ecmk_fetcher/handler/Dockerfile index fa2126fd..aebcd7aa 100644 --- a/backend/ecmk_fetcher/handler/Dockerfile +++ b/backend/ecmk_fetcher/handler/Dockerfile @@ -13,6 +13,12 @@ RUN pip install --no-cache-dir -r requirements.txt COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ +# handler -> ecmk_service -> documents_parser.parser -> datatypes.epc.domain.mapper +# -> domain.sap10_calculator, and -> db_writer -> epc_property model +# -> infrastructure.postgres. Without these the lambda fails at init with +# "No module named 'domain'" / "'infrastructure'". +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ # Local lambda entrypoint # ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] diff --git a/backend/pashub_fetcher/handler/Dockerfile b/backend/pashub_fetcher/handler/Dockerfile index f8c2008c..575b8565 100644 --- a/backend/pashub_fetcher/handler/Dockerfile +++ b/backend/pashub_fetcher/handler/Dockerfile @@ -10,6 +10,12 @@ WORKDIR /var/task COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ +# handler -> pashub_service -> documents_parser.parser -> datatypes.epc.domain.mapper +# -> domain.sap10_calculator, and -> db_writer -> epc_property model +# -> infrastructure.postgres. Without these the lambda fails at init with +# "No module named 'domain'" / "'infrastructure'". +COPY domain/ domain/ +COPY infrastructure/ infrastructure/ COPY backend/pashub_fetcher/handler/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt diff --git a/tests/test_lambda_packaging.py b/tests/test_lambda_packaging.py new file mode 100644 index 00000000..39990338 --- /dev/null +++ b/tests/test_lambda_packaging.py @@ -0,0 +1,267 @@ +"""Static packaging linter for Lambda container images. + +Every Lambda here ships as a Docker image that copies a *subset* of the repo +(``COPY utils/ utils/``, ``COPY backend/ backend/``, ...) and then runs a +handler via ``CMD [".handler"]``. If the handler's import graph reaches +a top-level package the Dockerfile forgot to ``COPY``, the function dies at cold +start with ``Runtime.ImportModuleError: No module named ''`` — but only +*inside the image*. In the dev/test tree every package is present, so a plain +``import`` test can't see the gap. This is exactly how ``No module named +'domain'`` reached a deployed address2UPRN. + +The RIE smoke tests (.github/workflows/_smoke_test_lambda.yml) catch this too, +but only by building the full image (minutes) and only for hand-listed services. +This test catches the same class of bug in milliseconds, locally, for *every* +handler Dockerfile — by statically computing each handler's import-time module +graph and asserting every repo file it reaches is copied into the image. + +Scope: import-time (module-level) imports only — the ones that run at Lambda +init, which is what ImportModuleError is about. Imports inside function bodies +and under ``if TYPE_CHECKING:`` are deliberately ignored. Third-party / stdlib +imports are out of scope (that's requirements.txt's job, covered by the RIE +smoke test actually installing and importing). +""" + +from __future__ import annotations + +import ast +import json +import re +from pathlib import Path +from typing import Optional + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# Dockerfiles that are not Lambda handlers (test harness, dev containers). +_SKIP_DOCKERFILES = {"Dockerfile.test", "Dockerfile.test.dockerignore"} +_SKIP_PARTS = {".git", "node_modules", ".devcontainer"} + + +def _toplevel_names() -> set[str]: + """Top-level repo packages/modules — the namespace handler imports resolve + against (imports are absolute: ``domain.x``, ``backend.y``).""" + names: set[str] = set() + for p in REPO_ROOT.iterdir(): + if p.name.startswith(".") or p.name == "__pycache__": + continue + if p.is_dir(): + names.add(p.name) + elif p.suffix == ".py": + names.add(p.stem) + return names + + +_TOP = _toplevel_names() + + +def _is_type_checking(test: ast.expr) -> bool: + if isinstance(test, ast.Name): + return test.id == "TYPE_CHECKING" + if isinstance(test, ast.Attribute): + return test.attr == "TYPE_CHECKING" + return False + + +def _import_time_imports(path: Path) -> list[str]: + """Absolute module names imported when ``path`` is imported (i.e. at Lambda + init). Descends into module-level if/try/with and class bodies, but not into + function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed).""" + try: + tree = ast.parse(path.read_text(encoding="utf-8"), str(path)) + except (SyntaxError, UnicodeDecodeError): + return [] + out: list[str] = [] + + def visit(stmts: list[ast.stmt]) -> None: + for node in stmts: + if isinstance(node, ast.Import): + out.extend(alias.name for alias in node.names) + elif isinstance(node, ast.ImportFrom): + if not node.level and node.module: # absolute imports only + out.append(node.module) + elif isinstance(node, ast.If): + if _is_type_checking(node.test): + continue + visit(node.body) + visit(node.orelse) + elif isinstance(node, ast.Try): + visit(node.body) + visit(node.orelse) + visit(node.finalbody) + for handler in node.handlers: + visit(handler.body) + elif isinstance(node, ast.With): + visit(node.body) + elif isinstance(node, ast.ClassDef): + visit(node.body) + # FunctionDef / AsyncFunctionDef bodies are intentionally skipped. + + visit(tree.body) + return out + + +def _module_to_file(module: str) -> Optional[Path]: + """Resolve a dotted module to its repo source file (``foo.bar`` -> + ``foo/bar.py`` or ``foo/bar/__init__.py``).""" + base = REPO_ROOT.joinpath(*module.split(".")) + py = base.with_suffix(".py") + if py.is_file(): + return py + init = base / "__init__.py" + if init.is_file(): + return init + return None + + +def _import_closure(start: Path) -> dict[Path, Optional[Path]]: + """Repo files reachable from ``start`` via import-time imports, mapped to the + first file that imported each (for blame in failure messages).""" + reached: dict[Path, Optional[Path]] = {} + stack: list[tuple[Path, Optional[Path]]] = [(start, None)] + while stack: + path, importer = stack.pop() + if path in reached: + continue + reached[path] = importer + for module in _import_time_imports(path): + if module.split(".")[0] not in _TOP: + continue # stdlib / third-party — not our concern here + target = _module_to_file(module) + if target is not None and target not in reached: + stack.append((target, path)) + return reached + + +def _norm(path_token: str) -> str: + return path_token.lstrip("./").rstrip("/") + + +def _parse_handler_spec(dockerfile_text: str) -> Optional[str]: + """The ``.handler`` string from the ``CMD`` line, or None if this + isn't a Lambda handler image.""" + match = re.search(r"^CMD\s+(\[.*\]|.+)$", dockerfile_text, re.MULTILINE) + if not match: + return None + raw = match.group(1).strip() + try: + parsed = json.loads(raw) + spec = parsed[0] if isinstance(parsed, list) and parsed else raw + except json.JSONDecodeError: + spec = raw.strip('"') + return spec if isinstance(spec, str) and spec.endswith(".handler") else None + + +def _parse_copies(dockerfile_text: str) -> list[tuple[list[str], str]]: + """``COPY`` instructions as (sources, dest), dropping ``--flag`` tokens.""" + copies: list[tuple[list[str], str]] = [] + for match in re.finditer(r"^COPY\s+(.+)$", dockerfile_text, re.MULTILINE): + tokens = [t for t in match.group(1).split() if not t.startswith("--")] + if len(tokens) < 2: + continue + *sources, dest = tokens + copies.append((sources, dest)) + return copies + + +def _resolve_handler_file( + spec: str, copies: list[tuple[list[str], str]] +) -> Optional[Path]: + """Map a handler spec to its repo source file. + + Handles both in-place layouts (``backend.foo.handler`` -> ``backend/foo/ + handler.py``, present via ``COPY backend/ backend/``) and root-placed + handlers (``main.handler`` where a ``COPY /var/task/main.py`` or + ``COPY /main.py .`` puts the file at the image root).""" + module_path, _func = spec.rsplit(".", 1) + + direct = REPO_ROOT / (module_path.replace(".", "/") + ".py") + if direct.is_file(): + return direct + + # Root-placed module: find the COPY whose destination basename matches. + wanted = module_path.split("/")[-1] + ".py" + for sources, dest in copies: + dest_norm = _norm(dest) + dest_is_named_file = Path(dest_norm).name == wanted + dest_is_dir = dest_norm in ("", module_path.split("/")[-1]) or dest.endswith("/") + for src in sources: + src_path = REPO_ROOT / _norm(src) + if not src_path.is_file(): + continue + if dest_is_named_file or (dest_is_dir and src_path.name == wanted): + return src_path + return None + + +def _is_copied(rel_path: str, copies: list[tuple[list[str], str]]) -> bool: + """Whether a repo-relative file path lands in the image via some COPY.""" + rel_path = _norm(rel_path) + for sources, _dest in copies: + for src in sources: + src_norm = _norm(src) + if src_norm == "" or src_norm == rel_path or rel_path.startswith(src_norm + "/"): + return True + return False + + +def _discover_handler_dockerfiles() -> list[Path]: + found: list[Path] = [] + for path in REPO_ROOT.rglob("*Dockerfile*"): + if path.name in _SKIP_DOCKERFILES: + continue + if any(part in _SKIP_PARTS for part in path.relative_to(REPO_ROOT).parts): + continue + try: + text = path.read_text(encoding="utf-8") + except OSError: + continue + if _parse_handler_spec(text): + found.append(path) + return sorted(found) + + +_HANDLER_DOCKERFILES = _discover_handler_dockerfiles() + + +def test_handler_dockerfiles_discovered() -> None: + """Guard against the discovery silently finding nothing (e.g. a refactor + that renames Dockerfiles), which would make every check below vacuous.""" + assert _HANDLER_DOCKERFILES, "no Lambda handler Dockerfiles found under repo root" + + +@pytest.mark.parametrize( + "dockerfile", + _HANDLER_DOCKERFILES, + ids=[str(p.relative_to(REPO_ROOT)) for p in _HANDLER_DOCKERFILES], +) +def test_lambda_image_copies_full_import_closure(dockerfile: Path) -> None: + """Every repo file the handler imports at init must be COPYed into the image.""" + text = dockerfile.read_text(encoding="utf-8") + spec = _parse_handler_spec(text) + assert spec is not None # discovery guaranteed this + copies = _parse_copies(text) + + handler_file = _resolve_handler_file(spec, copies) + assert handler_file is not None, ( + f"{dockerfile.relative_to(REPO_ROOT)}: could not locate the source file " + f"for CMD handler {spec!r}. Update _resolve_handler_file if this is a new " + f"handler layout." + ) + + missing: list[str] = [] + for reached, importer in _import_closure(handler_file).items(): + rel = str(reached.relative_to(REPO_ROOT)) + if not _is_copied(rel, copies): + blame = ( + str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)" + ) + missing.append(f" - {rel}\n imported by {blame}") + + assert not missing, ( + f"{dockerfile.relative_to(REPO_ROOT)} runs `{spec}` but does not COPY " + f"{len(missing)} file(s) it imports at init. The Lambda will fail at cold " + f"start with Runtime.ImportModuleError. Add the missing top-level " + f"package(s) as `COPY / /`:\n" + "\n".join(sorted(missing)) + )