diff --git a/.devcontainer/backend/docker-compose.yml b/.devcontainer/backend/docker-compose.yml index a9350c81..52d01621 100644 --- a/.devcontainer/backend/docker-compose.yml +++ b/.devcontainer/backend/docker-compose.yml @@ -2,7 +2,7 @@ version: '3.8' # Unique Compose project name so this repo's devcontainer doesn't collide with # other model-* clones (which all live in .devcontainer/backend/ and would # otherwise default to the same project name "backend", clobbering each other). -name: model-backend +name: landlord-backend services: model-backend: diff --git a/etl/hubspot/scripts/scraper/handler/Dockerfile b/etl/hubspot/scripts/scraper/handler/Dockerfile index 012da376..fc4fb051 100644 --- a/etl/hubspot/scripts/scraper/handler/Dockerfile +++ b/etl/hubspot/scripts/scraper/handler/Dockerfile @@ -1,5 +1,7 @@ -FROM public.ecr.aws/lambda/python:3.10 -# FROM python:3.11.10-bullseye +# 3.11: domain/modelling/measure_type.py (pulled in transitively via +# backend.app.db.models -> infrastructure.postgres.modelling -> domain) uses +# enum.StrEnum, which only exists in Python 3.11+. +FROM public.ecr.aws/lambda/python:3.11 # Set working directory (Lambda task root) WORKDIR /var/task @@ -17,6 +19,11 @@ RUN pip install --no-cache-dir -r requirements.txt COPY backend/ backend/ COPY utils/ utils/ COPY datatypes/ datatypes/ +# main -> backend.app.db.models.{epc_property,recommendations} -> +# infrastructure.postgres.{epc_property_table,modelling} -> domain.modelling. +# Without these the lambda fails at init with "No module named 'infrastructure'". +COPY infrastructure/ infrastructure/ +COPY domain/ domain/ COPY etl/hubspot etl/hubspot # Copy the handler diff --git a/tests/test_lambda_packaging.py b/tests/test_lambda_packaging.py index 39990338..5d862ae5 100644 --- a/tests/test_lambda_packaging.py +++ b/tests/test_lambda_packaging.py @@ -64,23 +64,51 @@ def _is_type_checking(test: ast.expr) -> bool: return False +def _file_package_parts(path: Path) -> list[str]: + """The components of ``__package__`` Python assigns when importing ``path``. + + For a regular module ``a/b/c.py`` and for a package ``a/b/__init__.py`` alike + this is the containing directory (``["a", "b"]``) — i.e. the anchor that + ``from . import x`` resolves against.""" + return list(path.relative_to(REPO_ROOT).parts)[:-1] + + def _import_time_imports(path: Path) -> list[str]: """Absolute module names imported when ``path`` is imported (i.e. at Lambda init). Descends into module-level if/try/with and class bodies, but not into - function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed).""" + function bodies (lazy) or ``if TYPE_CHECKING:`` blocks (never executed). + + Relative imports (``from .x import y``) are resolved to their absolute name + against ``path``'s package — the codebase re-exports through package + ``__init__.py`` files this way, so dropping them would hide real init-time + dependencies (e.g. ``functions/__init__.py`` -> ``from .portfolio_functions + import *`` -> ... -> ``infrastructure``).""" try: tree = ast.parse(path.read_text(encoding="utf-8"), str(path)) except (SyntaxError, UnicodeDecodeError): return [] + pkg_parts = _file_package_parts(path) out: list[str] = [] + def _relative_base(level: int) -> list[str]: + # level 1 anchors on the package itself; each extra level climbs one up. + keep = len(pkg_parts) - (level - 1) + return pkg_parts[:keep] if keep > 0 else [] + def visit(stmts: list[ast.stmt]) -> None: for node in stmts: if isinstance(node, ast.Import): out.extend(alias.name for alias in node.names) elif isinstance(node, ast.ImportFrom): - if not node.level and node.module: # absolute imports only - out.append(node.module) + if not node.level: # absolute import + if node.module: + out.append(node.module) + else: # relative import — resolve against this file's package + base = _relative_base(node.level) + if node.module: # from .pkg.mod import name + out.append(".".join(base + node.module.split("."))) + else: # from . import a, b -> base.a, base.b (submodules) + out.extend(".".join(base + [alias.name]) for alias in node.names) elif isinstance(node, ast.If): if _is_type_checking(node.test): continue @@ -102,17 +130,27 @@ def _import_time_imports(path: Path) -> list[str]: return out -def _module_to_file(module: str) -> Optional[Path]: - """Resolve a dotted module to its repo source file (``foo.bar`` -> - ``foo/bar.py`` or ``foo/bar/__init__.py``).""" - base = REPO_ROOT.joinpath(*module.split(".")) - py = base.with_suffix(".py") - if py.is_file(): - return py - init = base / "__init__.py" - if init.is_file(): - return init - return None +def _module_files(module: str) -> list[Path]: + """Every repo file executed when ``module`` is imported: the module's own + file *plus* each ancestor package's ``__init__.py``. + + Importing ``a.b.c`` runs ``a/__init__.py``, ``a/b/__init__.py`` and + ``a/b/c.py`` (or ``a/b/c/__init__.py``) in turn — so an ``__init__.py`` part + way down the path can pull in a whole subtree (and the package it lives in + must be COPYed). ``_module_to_file`` resolves only the leaf, which is why the + closure used to stop short of those intermediate packages.""" + parts = module.split(".") + files: list[Path] = [] + for depth in range(1, len(parts) + 1): + base = REPO_ROOT.joinpath(*parts[:depth]) + init = base / "__init__.py" + if init.is_file(): + files.append(init) + if depth == len(parts): # the leaf may be a plain module file + leaf = base.with_suffix(".py") + if leaf.is_file(): + files.append(leaf) + return files def _import_closure(start: Path) -> dict[Path, Optional[Path]]: @@ -128,9 +166,9 @@ def _import_closure(start: Path) -> dict[Path, Optional[Path]]: for module in _import_time_imports(path): if module.split(".")[0] not in _TOP: continue # stdlib / third-party — not our concern here - target = _module_to_file(module) - if target is not None and target not in reached: - stack.append((target, path)) + for target in _module_files(module): + if target not in reached: + stack.append((target, path)) return reached @@ -206,6 +244,21 @@ def _is_copied(rel_path: str, copies: list[tuple[list[str], str]]) -> bool: return False +def _package_dir_present(pkg_rel: str, copies: list[tuple[list[str], str]]) -> bool: + """Whether the image will contain ``pkg_rel`` as a directory because some + COPY brings in a file beneath it. Used to excuse an un-copied package + ``__init__.py``: in Python 3 a directory present without its ``__init__.py`` + imports fine as a *namespace package*, so the missing ``__init__`` is not a + cold-start ``ModuleNotFoundError`` (only a wholly-absent package is).""" + pkg_rel = _norm(pkg_rel) + for sources, _dest in copies: + for src in sources: + src_norm = _norm(src) + if src_norm == pkg_rel or src_norm.startswith(pkg_rel + "/"): + return True + return False + + def _discover_handler_dockerfiles() -> list[Path]: found: list[Path] = [] for path in REPO_ROOT.rglob("*Dockerfile*"): @@ -253,11 +306,21 @@ def test_lambda_image_copies_full_import_closure(dockerfile: Path) -> None: missing: list[str] = [] for reached, importer in _import_closure(handler_file).items(): rel = str(reached.relative_to(REPO_ROOT)) - if not _is_copied(rel, copies): - blame = ( - str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)" - ) - missing.append(f" - {rel}\n imported by {blame}") + if _is_copied(rel, copies): + continue + # An un-copied package __init__.py is non-fatal when its directory still + # exists in the image (some other file under it is copied): Python falls + # back to a namespace package. We still traverse such __init__ files for + # their imports above; we just don't demand they be copied. A wholly + # absent package (no file under it copied) is a real ModuleNotFoundError. + if reached.name == "__init__.py" and _package_dir_present( + str(reached.parent.relative_to(REPO_ROOT)), copies + ): + continue + blame = ( + str(importer.relative_to(REPO_ROOT)) if importer else "(handler entrypoint)" + ) + missing.append(f" - {rel}\n imported by {blame}") assert not missing, ( f"{dockerfile.relative_to(REPO_ROOT)} runs `{spec}` but does not COPY "