From 3683d9141f694fe3f51aa1e8ea1b8cb61be493c7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 14 Apr 2026 15:04:10 +0100 Subject: [PATCH 001/106] protecting pushes to dev --- .github/workflows/protect_releases.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .github/workflows/protect_releases.yml diff --git a/.github/workflows/protect_releases.yml b/.github/workflows/protect_releases.yml new file mode 100644 index 00000000..cbd08e2f --- /dev/null +++ b/.github/workflows/protect_releases.yml @@ -0,0 +1,17 @@ +name: Restrict PR source + +on: + pull_request: + branches: + - dev + +jobs: + check-source-branch: + runs-on: ubuntu-latest + steps: + - name: Fail if PR is not from main + run: | + if [[ "${{ github.head_ref }}" != "main" ]]; then + echo "Only PRs from main are allowed into dev" + exit 1 + fi \ No newline at end of file From 6d4942c0136f58721920c4ab9f5756cc72d71428 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 15:03:07 +0000 Subject: [PATCH 002/106] adding to dev container to create shared network on start up --- .devcontainer/backend/devcontainer.json | 1 + Makefile | 10 +++++++++- README.md | 21 +++++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index a9b7352a..ee37224f 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -4,6 +4,7 @@ "service": "model-backend", "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", + "initializeCommand": "docker network create shared-dev 2>/dev/null || true", "postStartCommand": "bash .devcontainer/backend/post-install.sh", "mounts": [ "source=${localEnv:HOME},target=/workspaces/home,type=bind", diff --git a/Makefile b/Makefile index 00942acd..255e2abf 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ PYTHON = python -.PHONY: setup test lint typecheck check clean +.PHONY: 
setup test lint typecheck check clean network-setup dev-setup # Install dev dependencies + tox setup: @@ -28,3 +28,11 @@ check: lint typecheck test # Clean up tox environments clean: rm -rf .tox + +# Create shared Docker network required by dev container (idempotent) +network-setup: + docker network create shared-dev 2>/dev/null || true + +# First-time dev environment setup +dev-setup: network-setup + @echo "Dev environment ready. Open the repo in VS Code and select 'Reopen in Container'." diff --git a/README.md b/README.md index b470e12c..0f88328a 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,27 @@ The different folders in this repository relate to services that can be used independently, or can be imported and used as part of a larger application +# Getting Started + +## Prerequisites + +- [Docker Desktop](https://www.docker.com/products/docker-desktop/) +- [VS Code](https://code.visualstudio.com/) with the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) + +## Dev Container Setup + +This repo uses a Docker Compose-based dev container. The `model-backend` service joins a `shared-dev` Docker network so it can communicate with other local services (e.g. a frontend container) running on your machine. + +**VS Code users:** The `initializeCommand` in `devcontainer.json` creates the `shared-dev` network automatically before the container starts. No manual step required — just open the repo and select **Reopen in Container**. + +**Non-VS Code / CI workflows:** Run the following once before starting the container: + +```commandline +make dev-setup +``` + +This is idempotent and safe to re-run if the network already exists. 
+ # Folders ### backend/ From f8d785411bd9862e56dff2baea906ed78a6263af Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 18:54:41 +0000 Subject: [PATCH 003/106] setting up new claude dev instructions --- .devcontainer/backend/Dockerfile | 10 ++++-- .../backend/install-claude-skills.sh | 15 +++++++++ CLAUDE.md | 31 +++++++++++++++++++ UBIQUITOUS_LANGUAGE.md | 9 ++++++ 4 files changed, 63 insertions(+), 2 deletions(-) create mode 100755 .devcontainer/backend/install-claude-skills.sh create mode 100644 UBIQUITOUS_LANGUAGE.md diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index a92d37f6..983670e4 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -86,10 +86,16 @@ USER ${USER} # Bootstrap LazyVim starter config RUN git clone https://github.com/LazyVim/starter /home/${USER}/.config/nvim \ && rm -rf /home/${USER}/.config/nvim/.git -# Install Claude +# Install Claude + plugins RUN curl -fsSL https://claude.ai/install.sh | bash \ && export PATH="/home/${USER}/.local/bin:${PATH}" \ && claude plugin marketplace add JuliusBrussee/caveman \ - && claude plugin install caveman@caveman + && claude plugin install caveman@caveman \ + && claude plugin marketplace add mattpocock/skills \ + && claude plugin install skills@grill-me \ + && claude plugin install skills@to-prd \ + && claude plugin install skills@ubiquitous-language \ + && claude plugin install skills@tdd \ + && claude plugin install skills@improve-codebase-architecture ENV PATH="/home/vscode/.local/bin:${PATH}" USER root diff --git a/.devcontainer/backend/install-claude-skills.sh b/.devcontainer/backend/install-claude-skills.sh new file mode 100755 index 00000000..71727e4d --- /dev/null +++ b/.devcontainer/backend/install-claude-skills.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Run this in an existing container to install the mattpocock skills +# without rebuilding the image. 
New containers get them automatically via Dockerfile. +set -euo pipefail + +echo "Installing Claude Code skills (mattpocock/skills)..." + +claude plugin marketplace add mattpocock/skills +claude plugin install skills@grill-me +claude plugin install skills@to-prd +claude plugin install skills@ubiquitous-language +claude plugin install skills@tdd +claude plugin install skills@improve-codebase-architecture + +echo "Done. Available: /grill-me /to-prd /ubiquitous-language /tdd /improve-codebase-architecture" diff --git a/CLAUDE.md b/CLAUDE.md index de2917f2..263679ff 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -28,3 +28,34 @@ You MUST read the overview resource to understand the complete workflow. The inf +## Available Skills + +Five Claude Code skills are installed in this repo's dev container. Each maps to a phase of the feature lifecycle. + +| Skill | Invoke | When to use | +|-------|--------|-------------| +| **grill-me** | `/grill-me` | Before implementing — stress-tests a design through sequential questioning | +| **to-prd** | `/to-prd` | After a planning conversation — formalises context into a GitHub issue PRD | +| **ubiquitous-language** | `/ubiquitous-language` | When domain terms are drifting or ambiguous — builds/updates `UBIQUITOUS_LANGUAGE.md` | +| **tdd** | `/tdd` | During implementation — enforces vertical-slice TDD (one test → one impl → repeat) | +| **improve-codebase-architecture** | `/improve-codebase-architecture` | During refactoring — surfaces shallow modules and proposes deepening opportunities | + +### Typical session chains + +**Feature planning:** +`/grill-me` → `/to-prd` → `/ubiquitous-language` + +**Implementation:** +`/tdd` (+ `/grill-me` if a design fork appears mid-session) + +**Refactoring:** +`/improve-codebase-architecture` → `/grill-me` → `/tdd` → `/ubiquitous-language` + +### First time setting up? + +New containers install all skills automatically via the Dockerfile. 
If you're in an existing container, run: + +```bash +bash .devcontainer/backend/install-claude-skills.sh +``` + diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md new file mode 100644 index 00000000..3f2c3fe3 --- /dev/null +++ b/UBIQUITOUS_LANGUAGE.md @@ -0,0 +1,9 @@ +# Ubiquitous Language + +Domain terminology glossary for this project. Generated and maintained by the `/ubiquitous-language` Claude Code skill. + +Invoke `/ubiquitous-language` in any session to extract new terms from the conversation, flag ambiguities, and update this file with canonical definitions. + +--- + + From 9ce1928b1e7cabc375263677319336745f61c094 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 20:28:43 +0000 Subject: [PATCH 004/106] added new skills to repo and reduced size of dev container (Sorry Jun-te) --- .devcontainer/backend/Dockerfile | 19 +++++++++---------- .devcontainer/backend/devcontainer.json | 4 ---- .../backend/install-claude-skills.sh | 11 +++++------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index 983670e4..ebe405a0 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -10,7 +10,7 @@ ARG DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y --no-install-recommends \ sudo jq vim curl git ca-certificates wget \ build-essential pkg-config automake autoconf libtool \ - ripgrep fd-find make unzip \ + ripgrep fd-find make unzip bash-completion \ && rm -rf /var/lib/apt/lists/* # Neovim latest (LazyVim needs >=0.9) @@ -65,8 +65,8 @@ RUN echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ tee /etc/apt/sources.list.d/hashicorp.list RUN apt update -RUN apt-get install terraform -RUN terraform -install-autocomplete +RUN apt-get install -y terraform +RUN terraform -install-autocomplete || true # Install postgres RUN apt 
install -y wget gnupg2 lsb-release @@ -86,16 +86,15 @@ USER ${USER} # Bootstrap LazyVim starter config RUN git clone https://github.com/LazyVim/starter /home/${USER}/.config/nvim \ && rm -rf /home/${USER}/.config/nvim/.git -# Install Claude + plugins +# Install Claude + plugins + skills RUN curl -fsSL https://claude.ai/install.sh | bash \ && export PATH="/home/${USER}/.local/bin:${PATH}" \ && claude plugin marketplace add JuliusBrussee/caveman \ && claude plugin install caveman@caveman \ - && claude plugin marketplace add mattpocock/skills \ - && claude plugin install skills@grill-me \ - && claude plugin install skills@to-prd \ - && claude plugin install skills@ubiquitous-language \ - && claude plugin install skills@tdd \ - && claude plugin install skills@improve-codebase-architecture + && npx skills@latest add --global --yes mattpocock/skills/grill-me \ + && npx skills@latest add --global --yes mattpocock/skills/to-prd \ + && npx skills@latest add --global --yes mattpocock/skills/ubiquitous-language \ + && npx skills@latest add --global --yes mattpocock/skills/tdd \ + && npx skills@latest add --global --yes mattpocock/skills/improve-codebase-architecture ENV PATH="/home/vscode/.local/bin:${PATH}" USER root diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index ee37224f..54e45095 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -17,7 +17,6 @@ "ms-toolsai.jupyter", "mechatroner.rainbow-csv", "ms-toolsai.datawrangler", - "lindacong.vscode-book-reader", "4ops.terraform", "fabiospampinato.vscode-todo-plus", "jgclark.vscode-todo-highlight", @@ -26,9 +25,6 @@ "ms-python.black-formatter", "waderyan.gitblame", "GrapeCity.gc-excelviewer", - "jakobhoeg.vscode-pokemon", - "github.vscode-github-actions", - "me-dutour-mathieu.vscode-github-actions", "anthropic.claude-code", "eamodio.gitlens" ], diff --git a/.devcontainer/backend/install-claude-skills.sh 
b/.devcontainer/backend/install-claude-skills.sh index 71727e4d..a54f69e0 100755 --- a/.devcontainer/backend/install-claude-skills.sh +++ b/.devcontainer/backend/install-claude-skills.sh @@ -5,11 +5,10 @@ set -euo pipefail echo "Installing Claude Code skills (mattpocock/skills)..." -claude plugin marketplace add mattpocock/skills -claude plugin install skills@grill-me -claude plugin install skills@to-prd -claude plugin install skills@ubiquitous-language -claude plugin install skills@tdd -claude plugin install skills@improve-codebase-architecture +npx skills@latest add --global --yes mattpocock/skills/grill-me +npx skills@latest add --global --yes mattpocock/skills/to-prd +npx skills@latest add --global --yes mattpocock/skills/ubiquitous-language +npx skills@latest add --global --yes mattpocock/skills/tdd +npx skills@latest add --global --yes mattpocock/skills/improve-codebase-architecture echo "Done. Available: /grill-me /to-prd /ubiquitous-language /tdd /improve-codebase-architecture" From 3ed25030d44edf2f01e37637bd4f02110285c55a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 22:17:38 +0000 Subject: [PATCH 005/106] added new api call for new epc api --- backend/address2UPRN/main.py | 128 ++------------------------ backend/utils/addressMatch.py | 46 +++++++++ datatypes/epc/domain/mapper.py | 22 +++++ datatypes/epc/schema/tests/helpers.py | 78 +--------------- pytest.ini | 2 +- 5 files changed, 80 insertions(+), 196 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 28ad344f..bd562bc7 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -1,8 +1,6 @@ from typing import Optional -from epc_api.client import EpcClient import os -from urllib.parse import urlencode import pandas as pd from utils.logger import setup_logger import json @@ -16,7 +14,7 @@ from utils.s3 import ( ) from datetime import datetime -from backend.utils.addressMatch import AddressMatch +from 
backend.utils.addressMatch import AddressMatch, get_uprn_candidates, df_has_single_uprn, score_addresses logger = setup_logger() @@ -29,122 +27,14 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") -def score_addresses( - df: pd.DataFrame, - user_address: str, - column: str = "address", -) -> pd.Series: - if column not in df.columns: - raise ValueError(f"Missing column: {column}") - - return df[column].apply(lambda x: AddressMatch.score(user_address, x)) - - -def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): - """ - Recursively fetch EPC data by postcode. - If results hit the size limit, retry with double size up to max_attempts. - """ - client = EpcClient(auth_token=EPC_AUTH_TOKEN) - - url = os.path.join(client.domestic.host, "search") - - if size: - url += "?" + urlencode({"size": size}) - - search_resp = client.domestic.call( - url=url, - method="get", - params={"postcode": postcode}, - ) - if not search_resp or "rows" not in search_resp: - return pd.DataFrame() - - results_df = pd.DataFrame(search_resp["rows"], columns=search_resp["column-names"]) - - row_count = len(results_df) - - # If we hit the size limit, there *may* be more results - if row_count == size: - print( - f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. " - f"Attempt {attempt}/{max_attempts}." - ) - - if attempt < max_attempts: - print(f"🔁 Retrying with size={size * 2}") - return get_epc_data_with_postcode( - postcode=postcode, - size=size * 2, - attempt=attempt + 1, - max_attempts=max_attempts, - ) - else: - print( - "🚨 Max attempts reached. Results may be truncated. " - "(Please do a manual review by the tech team.)" - ) - - return results_df - - -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: - """ - Returns True if all non-null UPRNs in df match the given uprn. - Returns False otherwise. 
- """ - - if column not in df.columns: - return False - - # Drop nulls and normalise to string - uprns = df[column].dropna().astype(str).str.strip().unique() - - # No valid UPRNs to compare - if len(uprns) == 0: - return False - - # Exactly one unique UPRN and it matches - return len(uprns) == 1 and uprns[0] == str(uprn) - - -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - - Returns a DataFrame sorted by descending lexiscore. - DOES NOT choose or return a UPRN. - """ - - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - - user_norm = AddressMatch.normalise_address(user_address) - - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - - # Normalise UPRN to string - out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) - - # Rank: 1 = best match - out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) - - return out.sort_values( - ["lexirank", "lexiscore"], - ascending=[True, False], - ) +def get_epc_data_with_postcode(postcode: str) -> pd.DataFrame: + from backend.epc_client.client import EpcClientService + service = EpcClientService(auth_token=EPC_AUTH_TOKEN) + results = service.search_by_postcode(postcode) + return pd.DataFrame([ + {"address": r.address_line_1, "uprn": r.uprn} + for r in results + ]) def get_uprn_with_epc_df( diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 411bb07c..12c1ac53 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -2,6 +2,7 @@ import re from typing import Any, Optional from difflib import SequenceMatcher import requests +import 
pandas as pd class AddressMatch: @@ -199,3 +200,48 @@ class AddressMatch: 0.65 * token_score + 0.35 * char_score, 4, ) + + +def score_addresses( + df: pd.DataFrame, + user_address: str, + column: str = "address", +) -> pd.Series: + if column not in df.columns: + raise ValueError(f"Missing column: {column}") + return df[column].apply(lambda x: AddressMatch.score(user_address, x)) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + Returns a DataFrame sorted by descending lexiscore. + """ + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + user_norm = AddressMatch.normalise_address(user_address) + out["lexiscore"] = out[address_column].apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) + return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False]) + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """Returns True if all non-null UPRNs in df match the given uprn.""" + if column not in df.columns: + return False + uprns = df[column].dropna().astype(str).str.strip().unique() + if len(uprns) == 0: + return False + return len(uprns) == 1 and uprns[0] == str(uprn) diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 1afade5c..7ef74340 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1447,6 +1447,28 @@ class EpcPropertyDataMapper: ) -> List[EnergyElement]: return [EpcPropertyDataMapper._map_energy_element(e) for e in elements] + @staticmethod + 
def from_api_response(data: dict) -> "EpcPropertyData": + """ + Dispatch to the correct schema mapper based on schema_type. + Supports RdSAP-Schema-21.0.0 and RdSAP-Schema-21.0.1 only. + Raises ValueError for unsupported schemas — add cases here as needed. + """ + from datatypes.epc.schema.helpers import from_dict + + schema = data.get("schema_type", "") + if schema == "RdSAP-Schema-21.0.1": + from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1 + return EpcPropertyDataMapper.from_rdsap_schema_21_0_1( + from_dict(RdSapSchema21_0_1, data) + ) + if schema == "RdSAP-Schema-21.0.0": + from datatypes.epc.schema.rdsap_schema_21_0_0 import RdSapSchema21_0_0 + return EpcPropertyDataMapper.from_rdsap_schema_21_0_0( + from_dict(RdSapSchema21_0_0, data) + ) + raise ValueError(f"Unsupported EPC schema: {schema!r}") + # --------------------------------------------------------------------------- # Private helpers diff --git a/datatypes/epc/schema/tests/helpers.py b/datatypes/epc/schema/tests/helpers.py index 22f132d2..06338c0a 100644 --- a/datatypes/epc/schema/tests/helpers.py +++ b/datatypes/epc/schema/tests/helpers.py @@ -1,77 +1,3 @@ -import dataclasses -import typing -from datetime import date -from typing import Any, Dict, Type, TypeVar +from datatypes.epc.schema.helpers import from_dict -T = TypeVar("T") - - -def from_dict(cls: Type[T], data: Dict[str, Any]) -> T: - """ - Recursively convert a plain dict (e.g. from json.loads) into the given - dataclass type, using the field type hints to convert nested structures. - - Handles: - - Nested dataclasses - - List[SomeDataclass] - - Optional[X] / Union[X, None] - - Union[DataclassType, primitive] (e.g. Union[Measurement, int]) - - Primitive pass-through for Union[str, int] etc. 
- """ - return _from_dict_impl(cls, data) # type: ignore[return-value] - - -def _from_dict_impl(cls: Any, data: Any) -> Any: - hints = typing.get_type_hints(cls) - kwargs: Dict[str, Any] = {} - - for field in dataclasses.fields(cls): # type: ignore[arg-type] - has_default = ( - field.default is not dataclasses.MISSING - or field.default_factory is not dataclasses.MISSING # type: ignore[misc] - ) - if field.name not in data: - if has_default: - continue - raise ValueError(f"{cls.__name__}: missing required field '{field.name}'") - - kwargs[field.name] = _coerce(data[field.name], hints[field.name]) - - return cls(**kwargs) - - -def _coerce(value: Any, hint: Any) -> Any: - if value is None: - return None - - origin = typing.get_origin(hint) - args = typing.get_args(hint) - - # Union (includes Optional[X] which is Union[X, None]) - if origin is typing.Union: - if value is None: - return None - non_none_args = [a for a in args if a is not type(None)] - if len(non_none_args) == 1: - # Optional[X] — recurse so List[X] and nested dataclasses are handled - return _coerce(value, non_none_args[0]) - # Multi-type Union (e.g. Union[Measurement, int]): try dataclasses first - for arg in non_none_args: - if dataclasses.is_dataclass(arg) and isinstance(value, dict): - return _from_dict_impl(arg, value) - # All remaining args are primitives — return value as-is - return value - - # List[X] - if origin is list: - item_hint = args[0] - return [_coerce(item, item_hint) for item in value] - - # Plain dataclass - if dataclasses.is_dataclass(hint) and isinstance(value, dict): - return _from_dict_impl(hint, value) - - if hint is date and isinstance(value, str): - return date.fromisoformat(value) - - return value +__all__ = ["from_dict"] diff --git a/pytest.ini b/pytest.ini index 33231c61..1ddc8747 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath = . 
log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/documents_parser/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/documents_parser/tests backend/epc_client/tests markers = integration: mark a test as an integration test From fa0c77af782e661a8254d5882e8cb27708faf617 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 22:24:26 +0000 Subject: [PATCH 006/106] updated ubiqutous language --- UBIQUITOUS_LANGUAGE.md | 71 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/UBIQUITOUS_LANGUAGE.md b/UBIQUITOUS_LANGUAGE.md index 3f2c3fe3..1765cbc8 100644 --- a/UBIQUITOUS_LANGUAGE.md +++ b/UBIQUITOUS_LANGUAGE.md @@ -6,4 +6,73 @@ Invoke `/ubiquitous-language` in any session to extract new terms from the conve --- - +## Energy Performance Certificates + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **EPC** | An Energy Performance Certificate — a government-issued document rating a dwelling's energy efficiency from A (best) to G (worst). 
| "energy certificate", "energy report" | +| **Certificate Number** | The unique identifier assigned to an EPC by the government registry. | "cert number", "EPC ID" | +| **Registration Date** | The date an EPC was lodged with the government register; used to identify the most recent certificate for a property. | "assessment date", "submission date" | +| **EPC Band** | A single letter A–G representing a property's current or potential energy efficiency rating. | "energy rating", "EPC grade", "EPC score" | +| **Schema Type** | The versioned RdSAP or SAP schema that describes the structure of a certificate's raw data (e.g. `RdSAP-Schema-21.0.1`). | "schema version", "EPC format" | +| **Domestic Certificate** | An EPC issued for a residential dwelling, as opposed to a commercial one. | "residential EPC", "home EPC" | + +## Properties and Addresses + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **UPRN** | Unique Property Reference Number — the government-issued permanent identifier for a physical address in the UK. | "property ID", "address ID", "code" | +| **Postcode** | A UK postal code used to group nearby addresses; the primary search key for finding EPC records. | "zip code", "postal code" | +| **User Address** | A free-text address string provided by a user or imported from a customer dataset, before any normalisation or matching. | "user input", "raw address", "user_inputed_address" | +| **Dwelling** | A single residential unit that can hold an EPC — a house, flat, or maisonette. | "property", "unit", "home" | + +## Address Matching + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **Lexiscore** | A similarity score in [0, 1] between a user address and a candidate EPC address; combines token overlap and character-level similarity. | "score", "match score", "similarity" | +| **Lexirank** | Dense rank of candidates sorted by lexiscore descending; rank 1 = best match. 
| "rank", "position" | +| **UPRN Candidate** | An EPC search result that is a plausible match for a given user address, before scoring decides the winner. | "match candidate", "result" | +| **Score Threshold** | The minimum lexiscore (currently 0.6) below which no match is returned even if a candidate exists. | "minimum score", "cutoff" | +| **Ambiguous Match** | A matching outcome where two or more candidates share lexirank 1, making it impossible to select a unique winner. | "tie", "draw", "duplicate" | +| **Best Match** | The single UPRN candidate with lexirank 1 that meets or exceeds the score threshold. | "winner", "top result" | + +## API and Integration + +| Term | Definition | Aliases to avoid | +|------|------------|------------------| +| **EPC Search Result** | A lightweight record returned by the government domestic search endpoint — contains address lines, postcode, UPRN, band, and certificate number but not the full certificate data. | "search row", "EPC row", "result" | +| **EPC Property Data** | The fully mapped domain object produced after fetching and parsing a complete EPC certificate. | "EPC data", "certificate data", "parsed EPC" | +| **Old EPC API** | The retired government API (`epc.opendatacommunities.org`) using HTTP Basic auth; decommissioned May 2026. | "legacy API" | +| **New EPC API** | The replacement government API (`api.get-energy-performance-data.communities.gov.uk`) using Bearer token auth. | "new API", "current API" | +| **Bearer Token** | The auth credential required by the new EPC API; stored in the `EPC_AUTH_TOKEN` environment variable. | "API key", "auth token", "secret" | + +## Relationships + +- An **EPC** belongs to exactly one **Dwelling** and has one **Certificate Number**. +- A **Dwelling** may have multiple **EPCs** across time; the one with the most recent **Registration Date** is the current one. +- A **UPRN** identifies a **Dwelling** permanently; it does not change when the property changes owner. 
+- An **EPC Search Result** is a summary; it points to a full **EPC** via its **Certificate Number**. +- **Address Matching** uses a **User Address** and **Postcode** to find a **UPRN** by scoring **UPRN Candidates** from an EPC search. +- A **Lexirank** of 1 with no **Ambiguous Match** and a **Lexiscore** ≥ the **Score Threshold** produces a **Best Match**. + +## Example dialogue + +> **Dev:** "We have a user address and postcode. How do we find the UPRN?" + +> **Domain expert:** "Search the **New EPC API** by **Postcode** — you get back a list of **EPC Search Results** for that area. Each one has an address and a **UPRN**. Score each against the **User Address** using the **Lexiscore**. If the top **UPRN Candidate** scores above the **Score Threshold** and there's no **Ambiguous Match**, that's your **Best Match**." + +> **Dev:** "What if two results share the same address line 1?" + +> **Domain expert:** "That's an **Ambiguous Match** — two candidates at **Lexirank** 1. Fall back to scoring on the full address using all address lines joined together. If that still ties, return nothing." + +> **Dev:** "Once we have the best match, do we use the UPRN or fetch the full EPC?" + +> **Domain expert:** "Depends on what you need. The **EPC Search Result** gives you the **EPC Band** and **Certificate Number**. If you need energy efficiency detail, use the **Certificate Number** to fetch the full **EPC Property Data**." + +## Flagged ambiguities + +- **"address"** appears as both the raw **User Address** (free-text from customer data) and a structured field on an **EPC Search Result** (normalised address lines). Always qualify: "user address" vs "EPC address" or "address line 1". +- **"score"** is used for the `AddressMatch.score()` function output, the `lexiscore` DataFrame column, and informally in conversation. Prefer **Lexiscore** in domain discussions; reserve "score" for method-level code comments. 
+- **"user_inputed_address"** in `backend/address2UPRN/main.py` is a misspelling and a synonym for **User Address** — the canonical term. New code should use `user_address`. +- **"EPC"** is overloaded as both the document (an Energy Performance Certificate) and the rating band letter. Use **EPC** for the document and **EPC Band** for the letter. From d338be867b0938580f4c4c90ab9e0b52245dec97 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 22:41:57 +0000 Subject: [PATCH 007/106] added missing files --- backend/epc_client/__init__.py | 3 + backend/epc_client/_retry.py | 23 ++ backend/epc_client/client.py | 175 ++++++++++++++ backend/epc_client/exceptions.py | 10 + backend/epc_client/requirements.txt | 1 + backend/epc_client/tests/__init__.py | 0 backend/epc_client/tests/conftest.py | 48 ++++ backend/epc_client/tests/test_client.py | 224 ++++++++++++++++++ .../tests/test_mapper_dispatcher.py | 31 +++ datatypes/epc/schema/helpers.py | 77 ++++++ 10 files changed, 592 insertions(+) create mode 100644 backend/epc_client/__init__.py create mode 100644 backend/epc_client/_retry.py create mode 100644 backend/epc_client/client.py create mode 100644 backend/epc_client/exceptions.py create mode 100644 backend/epc_client/requirements.txt create mode 100644 backend/epc_client/tests/__init__.py create mode 100644 backend/epc_client/tests/conftest.py create mode 100644 backend/epc_client/tests/test_client.py create mode 100644 backend/epc_client/tests/test_mapper_dispatcher.py create mode 100644 datatypes/epc/schema/helpers.py diff --git a/backend/epc_client/__init__.py b/backend/epc_client/__init__.py new file mode 100644 index 00000000..720594f7 --- /dev/null +++ b/backend/epc_client/__init__.py @@ -0,0 +1,3 @@ +from backend.epc_client.client import EpcClientService, EpcSearchResult + +__all__ = ["EpcClientService", "EpcSearchResult"] diff --git a/backend/epc_client/_retry.py b/backend/epc_client/_retry.py new file mode 100644 index 00000000..e290e95b 
--- /dev/null +++ b/backend/epc_client/_retry.py @@ -0,0 +1,23 @@ +import time +from typing import Callable, TypeVar + +from backend.epc_client.exceptions import EpcRateLimitError + +T = TypeVar("T") + + +def call_with_retry( + fn: Callable[[], T], + max_retries: int = 5, + backoff_base: float = 1.0, + backoff_multiplier: float = 2.0, +) -> T: + last_exc: EpcRateLimitError | None = None + for attempt in range(max_retries + 1): + try: + return fn() + except EpcRateLimitError as exc: + last_exc = exc + if attempt < max_retries: + time.sleep(backoff_base * (backoff_multiplier ** attempt)) + raise last_exc # type: ignore[misc] diff --git a/backend/epc_client/client.py b/backend/epc_client/client.py new file mode 100644 index 00000000..33f25ef5 --- /dev/null +++ b/backend/epc_client/client.py @@ -0,0 +1,175 @@ +# Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml +from __future__ import annotations + +from dataclasses import dataclass +from typing import Callable, Optional + +import httpx +import pandas as pd + +from backend.epc_client.exceptions import EpcApiError, EpcNotFoundError, EpcRateLimitError +from backend.epc_client._retry import call_with_retry +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.domain.mapper import EpcPropertyDataMapper + + +@dataclass +class EpcSearchResult: + certificate_number: str + address_line_1: str + address_line_2: Optional[str] + address_line_3: Optional[str] + address_line_4: Optional[str] + postcode: str + post_town: str + uprn: Optional[int] + current_energy_efficiency_band: str + registration_date: str + + def full_address(self) -> str: + parts = [ + self.address_line_1, + self.address_line_2, + self.address_line_3, + self.address_line_4, + ] + return ", ".join(p for p in parts if p) + + +class EpcClientService: + BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" + _MIN_MATCH_SCORE = 0.6 + + def __init__(self, auth_token: str) -> 
None: + self._headers = { + "Authorization": f"Bearer {auth_token}", + "Accept": "application/json", + } + + def get_by_certificate_number(self, cert_num: str) -> EpcPropertyData: + raw = call_with_retry(lambda: self._fetch_certificate(cert_num)) + return EpcPropertyDataMapper.from_api_response(raw) + + def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: + results = call_with_retry(lambda: self._search(uprn=uprn)) + if not results: + return None + latest = max(results, key=lambda r: r.registration_date) + return self.get_by_certificate_number(latest.certificate_number) + + def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: + return call_with_retry(lambda: self._search(postcode=postcode)) + + def find_best_match(self, postcode: str, address: str) -> Optional[EpcPropertyData]: + from backend.utils.addressMatch import get_uprn_candidates + + candidates = self.search_by_postcode(postcode) + if not candidates: + return None + + # Round 1: score on addressLine1 only + cert_num = self._pick_best_cert(candidates, address, use_full_address=False, fn=get_uprn_candidates) + if cert_num: + return self._safe_get(cert_num) + + # Round 2: score on all address lines joined + cert_num = self._pick_best_cert(candidates, address, use_full_address=True, fn=get_uprn_candidates) + if cert_num: + return self._safe_get(cert_num) + + return None + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _fetch_certificate(self, cert_num: str) -> dict: + resp = httpx.get( + f"{self.BASE_URL}/api/certificate", + params={"certificate_number": cert_num}, + headers=self._headers, + ) + if resp.status_code == 404: + raise EpcNotFoundError(cert_num) + if resp.status_code == 429: + raise EpcRateLimitError("Rate limited by EPC API") + if not resp.is_success: + raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") + return resp.json()["data"] + 
+ def _search( + self, + postcode: Optional[str] = None, + uprn: Optional[int] = None, + ) -> list[EpcSearchResult]: + params: dict[str, str | int] = {} + if postcode: + params["postcode"] = postcode + if uprn is not None: + params["uprn"] = uprn + + resp = httpx.get( + f"{self.BASE_URL}/api/domestic/search", + params=params, + headers=self._headers, + ) + if resp.status_code == 404: + return [] + if resp.status_code == 429: + raise EpcRateLimitError("Rate limited by EPC API") + if not resp.is_success: + raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") + + rows = resp.json().get("data", []) + return [self._parse_search_result(r) for r in rows] + + @staticmethod + def _parse_search_result(row: dict) -> EpcSearchResult: + return EpcSearchResult( + certificate_number=row["certificateNumber"], + address_line_1=row["addressLine1"], + address_line_2=row.get("addressLine2"), + address_line_3=row.get("addressLine3"), + address_line_4=row.get("addressLine4"), + postcode=row["postcode"], + post_town=row["postTown"], + uprn=row.get("uprn"), + current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], + registration_date=row["registrationDate"], + ) + + def _pick_best_cert( + self, + candidates: list[EpcSearchResult], + user_address: str, + use_full_address: bool, + fn: Callable[..., pd.DataFrame], + ) -> Optional[str]: + df = pd.DataFrame([ + { + "address": r.full_address() if use_full_address else r.address_line_1, + "uprn": str(r.uprn) if r.uprn is not None else "", + "certificate_number": r.certificate_number, + } + for r in candidates + ]) + + scored = fn(df, user_address=user_address) + if scored.empty: + return None + + best_score = scored.iloc[0]["lexiscore"] + if best_score < self._MIN_MATCH_SCORE: + return None + + top = scored[scored["lexirank"] == 1] + if len(top) != 1: + return None + + return str(top.iloc[0]["certificate_number"]) + + def _safe_get(self, cert_num: str) -> Optional[EpcPropertyData]: + try: + return 
self.get_by_certificate_number(cert_num) + except EpcNotFoundError: + return None diff --git a/backend/epc_client/exceptions.py b/backend/epc_client/exceptions.py new file mode 100644 index 00000000..49f1542a --- /dev/null +++ b/backend/epc_client/exceptions.py @@ -0,0 +1,10 @@ +class EpcApiError(Exception): + """Base for all EPC client errors.""" + + +class EpcNotFoundError(EpcApiError): + """Raised when the API returns 404.""" + + +class EpcRateLimitError(EpcApiError): + """Raised when the API returns 429 and all retries are exhausted.""" diff --git a/backend/epc_client/requirements.txt b/backend/epc_client/requirements.txt new file mode 100644 index 00000000..aa69c38b --- /dev/null +++ b/backend/epc_client/requirements.txt @@ -0,0 +1 @@ +httpx>=0.27.0 diff --git a/backend/epc_client/tests/__init__.py b/backend/epc_client/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/epc_client/tests/conftest.py b/backend/epc_client/tests/conftest.py new file mode 100644 index 00000000..2ed444af --- /dev/null +++ b/backend/epc_client/tests/conftest.py @@ -0,0 +1,48 @@ +import json +import pathlib +import pytest + +from backend.epc_client.client import EpcClientService + +SAMPLES_DIR = pathlib.Path("backend/epc_api/json_samples") + + +@pytest.fixture +def rdsap_21_0_0_cert(): + return json.loads((SAMPLES_DIR / "RdSAP-Schema-21.0.0/epc.json").read_text()) + + +@pytest.fixture +def rdsap_21_0_1_cert(): + return json.loads((SAMPLES_DIR / "RdSAP-Schema-21.0.1/epc.json").read_text()) + + +@pytest.fixture +def epc_service(): + return EpcClientService(auth_token="test-token") + + +def make_search_row( + cert_num="CERT-001", + address_line_1="1 Test Street", + postcode="SW1A 1AA", + post_town="London", + uprn=100023336956, + band="D", + registration_date="2024-01-01", + address_line_2=None, + address_line_3=None, + address_line_4=None, +): + return { + "certificateNumber": cert_num, + "addressLine1": address_line_1, + "addressLine2": address_line_2, 
+ "addressLine3": address_line_3, + "addressLine4": address_line_4, + "postcode": postcode, + "postTown": post_town, + "uprn": uprn, + "currentEnergyEfficiencyBand": band, + "registrationDate": registration_date, + } diff --git a/backend/epc_client/tests/test_client.py b/backend/epc_client/tests/test_client.py new file mode 100644 index 00000000..51dd2a12 --- /dev/null +++ b/backend/epc_client/tests/test_client.py @@ -0,0 +1,224 @@ +from unittest.mock import MagicMock, patch, call +import pytest + +from backend.epc_client.client import EpcClientService, EpcSearchResult +from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from backend.epc_client.tests.conftest import make_search_row + + +def _mock_response(status_code=200, json_data=None): + resp = MagicMock() + resp.status_code = status_code + resp.is_success = 200 <= status_code < 300 + resp.json.return_value = json_data or {} + resp.text = str(json_data) + return resp + + +# --------------------------------------------------------------------------- +# Test 1: get_by_certificate_number happy path +# --------------------------------------------------------------------------- + +def test_get_by_certificate_number_returns_epc_property_data(epc_service, rdsap_21_0_1_cert): + cert_response = {"data": rdsap_21_0_1_cert} + with patch("httpx.get", return_value=_mock_response(200, cert_response)): + result = epc_service.get_by_certificate_number("CERT-001") + + assert isinstance(result, EpcPropertyData) + + +# --------------------------------------------------------------------------- +# Test 2: get_by_certificate_number 404 → EpcNotFoundError +# --------------------------------------------------------------------------- + +def test_get_by_certificate_number_404_raises_not_found(epc_service): + with patch("httpx.get", return_value=_mock_response(404)): + with pytest.raises(EpcNotFoundError): + 
epc_service.get_by_certificate_number("BAD-CERT") + + +# --------------------------------------------------------------------------- +# Test 3: 429 retried, succeeds on 3rd attempt +# --------------------------------------------------------------------------- + +def test_get_by_certificate_number_retries_on_429_and_succeeds(epc_service, rdsap_21_0_1_cert): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429), + _mock_response(429), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch("time.sleep"): + result = epc_service.get_by_certificate_number("CERT-001") + + assert isinstance(result, EpcPropertyData) + + +# --------------------------------------------------------------------------- +# Test 4: get_by_uprn empty search → None +# --------------------------------------------------------------------------- + +def test_get_by_uprn_returns_none_when_no_results(epc_service): + with patch("httpx.get", return_value=_mock_response(200, {"data": []})): + result = epc_service.get_by_uprn(100023336956) + + assert result is None + + +# --------------------------------------------------------------------------- +# Test 5: get_by_uprn multiple results → fetches latest by registration_date +# --------------------------------------------------------------------------- + +def test_get_by_uprn_picks_most_recent_certificate(epc_service, rdsap_21_0_1_cert): + search_rows = [ + make_search_row(cert_num="CERT-OLD", registration_date="2022-01-01"), + make_search_row(cert_num="CERT-NEW", registration_date="2024-06-01"), + make_search_row(cert_num="CERT-MID", registration_date="2023-03-15"), + ] + cert_response = {"data": rdsap_21_0_1_cert} + + def fake_get(url, params=None, **kwargs): + if "search" in url: + return _mock_response(200, {"data": search_rows}) + return _mock_response(200, cert_response) + + with patch("httpx.get", side_effect=fake_get) as mock_get: + result = epc_service.get_by_uprn(100023336956) + 
+ assert isinstance(result, EpcPropertyData) + # Second call must be for the most recent cert + cert_call = mock_get.call_args_list[1] + assert cert_call.kwargs["params"]["certificate_number"] == "CERT-NEW" + + +# --------------------------------------------------------------------------- +# Test 6: search_by_postcode returns list[EpcSearchResult] +# --------------------------------------------------------------------------- + +def test_search_by_postcode_returns_results(epc_service): + rows = [ + make_search_row(cert_num="CERT-A", address_line_1="1 High Street"), + make_search_row(cert_num="CERT-B", address_line_1="2 High Street"), + ] + with patch("httpx.get", return_value=_mock_response(200, {"data": rows})): + results = epc_service.search_by_postcode("SW1A 1AA") + + assert len(results) == 2 + assert all(isinstance(r, EpcSearchResult) for r in results) + assert results[0].certificate_number == "CERT-A" + assert results[1].address_line_1 == "2 High Street" + + +# --------------------------------------------------------------------------- +# Test 7: search_by_postcode 404 → empty list +# --------------------------------------------------------------------------- + +def test_search_by_postcode_404_returns_empty_list(epc_service): + with patch("httpx.get", return_value=_mock_response(404)): + results = epc_service.search_by_postcode("ZZ9 9ZZ") + + assert results == [] + + +# --------------------------------------------------------------------------- +# Tests 8-10: find_best_match +# --------------------------------------------------------------------------- + +def _make_scored_df(rows, scores, ranks): + import pandas as pd + df = pd.DataFrame(rows) + df["lexiscore"] = scores + df["lexirank"] = ranks + return df.sort_values("lexirank") + + +def test_find_best_match_round1_clear_winner(epc_service, rdsap_21_0_1_cert): + search_rows = [ + make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"), + make_search_row(cert_num="CERT-LOSE", address_line_1="99 
Nowhere Lane"), + ] + cert_response = {"data": rdsap_21_0_1_cert} + + df_rows = [ + {"address": "1 High Street", "uprn": "100023336956", "certificate_number": "CERT-WIN"}, + {"address": "99 Nowhere Lane", "uprn": "100023336956", "certificate_number": "CERT-LOSE"}, + ] + scored = _make_scored_df(df_rows, [0.9, 0.1], [1, 2]) + + def fake_get(url, params=None, **kwargs): + if "search" in url: + return _mock_response(200, {"data": search_rows}) + return _mock_response(200, cert_response) + + with patch("httpx.get", side_effect=fake_get), \ + patch("backend.utils.addressMatch.get_uprn_candidates", return_value=scored): + result = epc_service.find_best_match("SW1A 1AA", "1 High Street") + + assert isinstance(result, EpcPropertyData) + + +def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_0_1_cert): + search_rows = [ + make_search_row( + cert_num="CERT-A", address_line_1="1 High Street", + address_line_2="Ground Floor", + ), + make_search_row( + cert_num="CERT-B", address_line_1="1 High Street", + address_line_2="First Floor", + ), + ] + cert_response = {"data": rdsap_21_0_1_cert} + + # Round 1: both score equally — ambiguous (two rank-1s) + ambiguous = _make_scored_df( + [ + {"address": "1 High Street", "uprn": "111", "certificate_number": "CERT-A"}, + {"address": "1 High Street", "uprn": "222", "certificate_number": "CERT-B"}, + ], + [0.9, 0.9], + [1, 1], + ) + # Round 2: CERT-A wins on full address + resolved = _make_scored_df( + [ + {"address": "1 High Street, Ground Floor", "uprn": "111", "certificate_number": "CERT-A"}, + {"address": "1 High Street, First Floor", "uprn": "222", "certificate_number": "CERT-B"}, + ], + [0.85, 0.4], + [1, 2], + ) + + call_count = {"n": 0} + + def fake_candidates(df, user_address, **kwargs): + call_count["n"] += 1 + return ambiguous if call_count["n"] == 1 else resolved + + def fake_get(url, params=None, **kwargs): + if "search" in url: + return _mock_response(200, {"data": search_rows}) + return 
_mock_response(200, cert_response) + + with patch("httpx.get", side_effect=fake_get), \ + patch("backend.utils.addressMatch.get_uprn_candidates", side_effect=fake_candidates): + result = epc_service.find_best_match("SW1A 1AA", "1 High Street Ground Floor") + + assert isinstance(result, EpcPropertyData) + + +def test_find_best_match_returns_none_when_no_good_match(epc_service): + search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")] + + low_score = _make_scored_df( + [{"address": "99 Nowhere Lane", "uprn": "111", "certificate_number": "CERT-X"}], + [0.1], + [1], + ) + + with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})), \ + patch("backend.utils.addressMatch.get_uprn_candidates", return_value=low_score): + result = epc_service.find_best_match("SW1A 1AA", "1 Completely Different Road") + + assert result is None diff --git a/backend/epc_client/tests/test_mapper_dispatcher.py b/backend/epc_client/tests/test_mapper_dispatcher.py new file mode 100644 index 00000000..efb9c4ec --- /dev/null +++ b/backend/epc_client/tests/test_mapper_dispatcher.py @@ -0,0 +1,31 @@ +import pytest + +from datatypes.epc.domain.mapper import EpcPropertyDataMapper +from datatypes.epc.domain.epc_property_data import EpcPropertyData + + +# --------------------------------------------------------------------------- +# Test 1: from_api_response with RdSAP-Schema-21.0.0 fixture → EpcPropertyData +# --------------------------------------------------------------------------- + +def test_from_api_response_rdsap_21_0_0(rdsap_21_0_0_cert): + result = EpcPropertyDataMapper.from_api_response(rdsap_21_0_0_cert) + assert isinstance(result, EpcPropertyData) + + +# --------------------------------------------------------------------------- +# Test 2: from_api_response with RdSAP-Schema-21.0.1 fixture → EpcPropertyData +# --------------------------------------------------------------------------- + +def 
test_from_api_response_rdsap_21_0_1(rdsap_21_0_1_cert): + result = EpcPropertyDataMapper.from_api_response(rdsap_21_0_1_cert) + assert isinstance(result, EpcPropertyData) + + +# --------------------------------------------------------------------------- +# Test 3: unknown schema_type → ValueError +# --------------------------------------------------------------------------- + +def test_from_api_response_unknown_schema_raises(): + with pytest.raises(ValueError, match="Unsupported EPC schema"): + EpcPropertyDataMapper.from_api_response({"schema_type": "RdSAP-Schema-99.0.0"}) diff --git a/datatypes/epc/schema/helpers.py b/datatypes/epc/schema/helpers.py new file mode 100644 index 00000000..22f132d2 --- /dev/null +++ b/datatypes/epc/schema/helpers.py @@ -0,0 +1,77 @@ +import dataclasses +import typing +from datetime import date +from typing import Any, Dict, Type, TypeVar + +T = TypeVar("T") + + +def from_dict(cls: Type[T], data: Dict[str, Any]) -> T: + """ + Recursively convert a plain dict (e.g. from json.loads) into the given + dataclass type, using the field type hints to convert nested structures. + + Handles: + - Nested dataclasses + - List[SomeDataclass] + - Optional[X] / Union[X, None] + - Union[DataclassType, primitive] (e.g. Union[Measurement, int]) + - Primitive pass-through for Union[str, int] etc. 
+ """ + return _from_dict_impl(cls, data) # type: ignore[return-value] + + +def _from_dict_impl(cls: Any, data: Any) -> Any: + hints = typing.get_type_hints(cls) + kwargs: Dict[str, Any] = {} + + for field in dataclasses.fields(cls): # type: ignore[arg-type] + has_default = ( + field.default is not dataclasses.MISSING + or field.default_factory is not dataclasses.MISSING # type: ignore[misc] + ) + if field.name not in data: + if has_default: + continue + raise ValueError(f"{cls.__name__}: missing required field '{field.name}'") + + kwargs[field.name] = _coerce(data[field.name], hints[field.name]) + + return cls(**kwargs) + + +def _coerce(value: Any, hint: Any) -> Any: + if value is None: + return None + + origin = typing.get_origin(hint) + args = typing.get_args(hint) + + # Union (includes Optional[X] which is Union[X, None]) + if origin is typing.Union: + if value is None: + return None + non_none_args = [a for a in args if a is not type(None)] + if len(non_none_args) == 1: + # Optional[X] — recurse so List[X] and nested dataclasses are handled + return _coerce(value, non_none_args[0]) + # Multi-type Union (e.g. 
Union[Measurement, int]): try dataclasses first + for arg in non_none_args: + if dataclasses.is_dataclass(arg) and isinstance(value, dict): + return _from_dict_impl(arg, value) + # All remaining args are primitives — return value as-is + return value + + # List[X] + if origin is list: + item_hint = args[0] + return [_coerce(item, item_hint) for item in value] + + # Plain dataclass + if dataclasses.is_dataclass(hint) and isinstance(value, dict): + return _from_dict_impl(hint, value) + + if hint is date and isinstance(value, str): + return date.fromisoformat(value) + + return value From 0d3189beee875487296100a2d3c72ec7c446ee70 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 22:55:35 +0000 Subject: [PATCH 008/106] added httpx dependency --- backend/engine/requirements.txt | 4 +++- backend/epc_client/requirements.txt | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/engine/requirements.txt b/backend/engine/requirements.txt index 5cca1211..41d07a1a 100644 --- a/backend/engine/requirements.txt +++ b/backend/engine/requirements.txt @@ -23,4 +23,6 @@ pyarrow==17.0.0 fastparquet==2024.5.0 aiohttp==3.10.10 # find my epc -beautifulsoup4 \ No newline at end of file +beautifulsoup4 +# HTTP client (epc_client module) +httpx==0.28.1 \ No newline at end of file diff --git a/backend/epc_client/requirements.txt b/backend/epc_client/requirements.txt index aa69c38b..cee32373 100644 --- a/backend/epc_client/requirements.txt +++ b/backend/epc_client/requirements.txt @@ -1 +1 @@ -httpx>=0.27.0 +httpx==0.28.1 From 077f14764b36bf51cf6a8954d10a31dc2412d58f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 25 Apr 2026 23:03:11 +0000 Subject: [PATCH 009/106] updated test for 101 columns with new fields on property_details_epc --- backend/export/tests/test_export.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/export/tests/test_export.py b/backend/export/tests/test_export.py index 
b00d1744..f13ef374 100644 --- a/backend/export/tests/test_export.py +++ b/backend/export/tests/test_export.py @@ -284,8 +284,8 @@ def test_default_export_integration(db_session): assert df.shape == ( 10, - 100, - ), "Expected dataframe shape to be (10, 100), got {}".format(df.shape) + 101, + ), "Expected dataframe shape to be (10, 101), got {}".format(df.shape) def test_solar_with_battery_example(db_session): From 09558629731f6e8f2281dd593324f257bd0b7586 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 27 Apr 2026 11:32:44 +0000 Subject: [PATCH 010/106] working on integrating new EPC api into address2UPRN --- .github/workflows/deploy_fastapi_backend.yml | 5 ++++ .github/workflows/unit_tests.yml | 1 + .gitignore | 3 ++ backend/.env.example | 3 +- backend/address2UPRN/main.py | 29 +++++++++++--------- backend/address2UPRN/tests/test_csv.py | 1 - backend/app/config.py | 1 + conftest.py | 10 ++++--- 8 files changed, 34 insertions(+), 19 deletions(-) diff --git a/.github/workflows/deploy_fastapi_backend.yml b/.github/workflows/deploy_fastapi_backend.yml index 5ad4d6ac..cb861d31 100644 --- a/.github/workflows/deploy_fastapi_backend.yml +++ b/.github/workflows/deploy_fastapi_backend.yml @@ -51,6 +51,10 @@ jobs: id: set_auth_token run: echo "::set-output name=auth_token::${{ secrets[format('{0}_EPC_AUTH_TOKEN', github.ref_name)] }}" + - name: Set Open EPC API token + id: set_open_epc_token + run: echo "::set-output name=open_epc_token::${{ secrets[format('{0}_OPEN_EPC_API_TOKEN', github.ref_name)] }}" + # Store port, name and host in github secrets - name: Set DB credentials id: set_db_credentials @@ -127,6 +131,7 @@ jobs: GOOGLE_SOLAR_API_KEY: ${{ steps.set_api_secrets.outputs.google_solar_api_key }} DOMAIN_NAME: ${{ steps.set_domain.outputs.domain }} EPC_AUTH_TOKEN: ${{ steps.set_auth_token.outputs.auth_token }} + OPEN_EPC_API_TOKEN: ${{ steps.set_open_epc_token.outputs.open_epc_token }} DB_HOST: ${{ steps.set_db_credentials.outputs.db_host }} DB_PORT: ${{ 
steps.set_db_credentials.outputs.db_port }} DB_NAME: ${{ steps.set_db_credentials.outputs.db_name }} diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 436428f9..e1f4fb48 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -49,6 +49,7 @@ jobs: docker run --rm \ --network host \ -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ + -e OPEN_EPC_API_TOKEN=${{ secrets.DEV_OPEN_EPC_API_TOKEN }} \ -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ -e DB_HOST=localhost \ -e DB_NAME=test \ diff --git a/.gitignore b/.gitignore index d6d23313..888d527a 100644 --- a/.gitignore +++ b/.gitignore @@ -292,3 +292,6 @@ pyrightconfig.json # playwright output */pashub_fetcher/videos/* backlog/* + +# Local Claude config files +.claude/* \ No newline at end of file diff --git a/backend/.env.example b/backend/.env.example index 352192d0..04611719 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -1,4 +1,5 @@ API_KEY = example-api-key ENVIRONMENT = local SECRET_KEY = YOUR_SECRET_KEY -ALGORITHM = HS256 \ No newline at end of file +ALGORITHM = HS256 +OPEN_EPC_API_TOKEN = your_token_here \ No newline at end of file diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index bd562bc7..98f8c65b 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -14,27 +14,30 @@ from utils.s3 import ( ) from datetime import datetime -from backend.utils.addressMatch import AddressMatch, get_uprn_candidates, df_has_single_uprn, score_addresses +from backend.utils.addressMatch import ( + AddressMatch, + get_uprn_candidates, + df_has_single_uprn, + score_addresses, +) logger = setup_logger() -EPC_AUTH_TOKEN = os.getenv( - "EPC_AUTH_TOKEN", -) +OPEN_EPC_API_TOKEN = os.getenv("OPEN_EPC_API_TOKEN") -if EPC_AUTH_TOKEN is None: - raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +if OPEN_EPC_API_TOKEN is None: + raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env") def 
get_epc_data_with_postcode(postcode: str) -> pd.DataFrame: from backend.epc_client.client import EpcClientService - service = EpcClientService(auth_token=EPC_AUTH_TOKEN) + + service = EpcClientService(auth_token=OPEN_EPC_API_TOKEN) results = service.search_by_postcode(postcode) - return pd.DataFrame([ - {"address": r.address_line_1, "uprn": r.uprn} - for r in results - ]) + return pd.DataFrame( + [{"address": r.address_line_1, "uprn": r.uprn} for r in results] + ) def get_uprn_with_epc_df( @@ -58,8 +61,8 @@ def get_uprn_with_epc_df( best_score = scored_df.iloc[0]["lexiscore"] # # Return None if score is below threshold - # if best_score < 0.7: - # return None + if best_score < 0.7: + return None # All rank-1 rows (possible draw) top_rank_df = scored_df[scored_df["lexirank"] == 1] diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py index 70e7a9f9..a8f0b1b4 100644 --- a/backend/address2UPRN/tests/test_csv.py +++ b/backend/address2UPRN/tests/test_csv.py @@ -31,7 +31,6 @@ def test_uprn_resolution_matches_manual( postcode: str, expected_uprn: str, ): - from utils.logger import setup_logger uprn = get_uprn(user_input, postcode) if uprn: diff --git a/backend/app/config.py b/backend/app/config.py index 70a6b50c..44826d24 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -44,6 +44,7 @@ class Settings(BaseSettings): # Third parties EPC_AUTH_TOKEN: str = "changeme" + OPEN_EPC_API_TOKEN: str = "changeme" GOOGLE_SOLAR_API_KEY: str = "changeme" # Database settings diff --git a/conftest.py b/conftest.py index 2ea20ebb..0689853b 100644 --- a/conftest.py +++ b/conftest.py @@ -1,11 +1,9 @@ import os +from pathlib import Path from backend.app.config import get_settings -import os from dotenv import load_dotenv -import os -# Load .env in conftest.py directory for local development -load_dotenv() +load_dotenv(Path(__file__).resolve().parent / "backend" / ".env") DEFAULT_ENV = { "API_KEY": "test", @@ -18,6 +16,10 @@ DEFAULT_ENV = 
{ "EPC_AUTH_TOKEN", "test", ), # overridden in GitHub Actions + "OPEN_EPC_API_TOKEN": os.getenv( + "OPEN_EPC_API_TOKEN", + "test", + ), # overridden in GitHub Actions "GOOGLE_SOLAR_API_KEY": "test", "DB_HOST": "localhost", "DB_USERNAME": "test", From 1af6bc674831f28dc62cd865f048f09b74a6dc90 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 27 Apr 2026 12:15:30 +0000 Subject: [PATCH 011/106] creating lodgment dates data, using old EPC api, to verify test failures --- .../tests/populate_lodgement_dates.py | 81 ++ backend/address2UPRN/tests/test_csv.py | 46 +- .../tests/test_lodgement_dates.json | 1230 +++++++++++++++++ 3 files changed, 1348 insertions(+), 9 deletions(-) create mode 100644 backend/address2UPRN/tests/populate_lodgement_dates.py create mode 100644 backend/address2UPRN/tests/test_lodgement_dates.json diff --git a/backend/address2UPRN/tests/populate_lodgement_dates.py b/backend/address2UPRN/tests/populate_lodgement_dates.py new file mode 100644 index 00000000..0726596b --- /dev/null +++ b/backend/address2UPRN/tests/populate_lodgement_dates.py @@ -0,0 +1,81 @@ +import csv +import json +import os +from pathlib import Path +from urllib.parse import urlencode + +import pandas as pd +from epc_api.client import EpcClient + +FIXTURE_PATH = Path(__file__).parent / "test_data.csv" +SIDECAR_PATH = Path(__file__).parent / "test_lodgement_dates.json" + + +def fetch_postcode_records(client: EpcClient, postcode: str) -> pd.DataFrame: + url = os.path.join(client.domestic.host, "search") + url += "?" 
+ urlencode({"size": 500}) + resp = client.domestic.call(url=url, method="get", params={"postcode": postcode}) + if not resp or "rows" not in resp: + return pd.DataFrame() + return pd.DataFrame(resp["rows"], columns=resp["column-names"]) + + +def main(): + auth_token = os.getenv("EPC_AUTH_TOKEN") + if not auth_token: + raise RuntimeError("EPC_AUTH_TOKEN not set") + + client = EpcClient(auth_token=auth_token) + + sidecar = {} + if SIDECAR_PATH.exists(): + sidecar = json.loads(SIDECAR_PATH.read_text()) + + with open(FIXTURE_PATH, newline="", encoding="utf-8") as f: + rows = list(csv.DictReader(f)) + + by_postcode: dict[str, list[dict]] = {} + for row in rows: + if row["Manual UPRN Code"] == "None": + continue + by_postcode.setdefault(row["Postcode"], []).append(row) + + for postcode, postcode_rows in by_postcode.items(): + print(f"Fetching {postcode} ({len(postcode_rows)} rows)...") + try: + epc_df = fetch_postcode_records(client, postcode) + except Exception as e: + print(f" ERROR: {e}") + continue + + if epc_df.empty: + print(f" No results from old API for {postcode}") + continue + + epc_df["uprn"] = epc_df["uprn"].astype(str).str.replace(r"\.0$", "", regex=True) + + for row in postcode_rows: + key = f"{row['User Input']}|{row['Postcode']}" + if key in sidecar: + continue + + expected_uprn = str(row["Manual UPRN Code"]).strip() + match = epc_df[epc_df["uprn"] == expected_uprn] + + if match.empty: + print(f" WARN: UPRN {expected_uprn} not found in old API for {postcode}") + sidecar[key] = {"lodgement_date": None, "found_in_old_api": False} + else: + lodgement_date = match.iloc[0].get("lodgement-date") + sidecar[key] = { + "lodgement_date": str(lodgement_date) if lodgement_date else None, + "found_in_old_api": True, + } + print(f" {row['User Input']}: {lodgement_date}") + + SIDECAR_PATH.write_text(json.dumps(sidecar, indent=2)) + print(f"\nWritten to {SIDECAR_PATH}") + + +if __name__ == "__main__": + main() diff --git a/backend/address2UPRN/tests/test_csv.py 
b/backend/address2UPRN/tests/test_csv.py index a8f0b1b4..d8f54c39 100644 --- a/backend/address2UPRN/tests/test_csv.py +++ b/backend/address2UPRN/tests/test_csv.py @@ -1,25 +1,54 @@ # tests/test_address_to_uprn_csv.py import csv +import json import pytest +from datetime import date from pathlib import Path from backend.address2UPRN.main import get_uprn FIXTURE_PATH = Path(__file__).parent / "test_data.csv" +SIDECAR_PATH = Path(__file__).parent / "test_lodgement_dates.json" +NEW_API_CUTOFF = date(2012, 1, 1) + + +def _load_sidecar() -> dict: + if SIDECAR_PATH.exists(): + return json.loads(SIDECAR_PATH.read_text()) + return {} def load_test_cases(): + sidecar = _load_sidecar() with open(FIXTURE_PATH, newline="", encoding="utf-8") as f: reader = csv.DictReader(f) - return [ - pytest.param( - row["User Input"], - row["Postcode"], - row["Manual UPRN Code"], - id=f'{row["User Input"]} [{row["Postcode"]}]', + cases = [] + for row in reader: + key = f"{row['User Input']}|{row['Postcode']}" + entry = sidecar.get(key, {}) + lodgement_date = entry.get("lodgement_date") + + marks = [] + if lodgement_date: + parsed = date.fromisoformat(lodgement_date[:10]) + if parsed < NEW_API_CUTOFF: + marks.append( + pytest.mark.xfail( + reason=f"EPC lodged {lodgement_date} — predates new API coverage (Jan 2012)", + strict=False, + ) + ) + + cases.append( + pytest.param( + row["User Input"], + row["Postcode"], + row["Manual UPRN Code"], + id=f'{row["User Input"]} [{row["Postcode"]}]', + marks=marks, + ) ) - for row in reader - ] + return cases @pytest.mark.parametrize( @@ -31,7 +60,6 @@ def test_uprn_resolution_matches_manual( postcode: str, expected_uprn: str, ): - uprn = get_uprn(user_input, postcode) if uprn: assert uprn == expected_uprn diff --git a/backend/address2UPRN/tests/test_lodgement_dates.json b/backend/address2UPRN/tests/test_lodgement_dates.json new file mode 100644 index 00000000..c58be704 --- /dev/null +++ b/backend/address2UPRN/tests/test_lodgement_dates.json @@ -0,0 +1,1230 
@@ +{ + "47 The Fairway|OX16 0RR": { + "lodgement_date": "2010-03-16", + "found_in_old_api": true + }, + "11 REGENT COURT|SL1 3LG": { + "lodgement_date": "2022-05-04", + "found_in_old_api": true + }, + "3/137a Windmill Road|TW8 9NH": { + "lodgement_date": "2025-01-30", + "found_in_old_api": true + }, + "Flat 33|SW18 4BE": { + "lodgement_date": "2022-04-27", + "found_in_old_api": true + }, + "FLAT 1 Brendon Grove|N2 8JE": { + "lodgement_date": "2011-02-17", + "found_in_old_api": true + }, + "Flat 15|KT8 2NE": { + "lodgement_date": "2018-03-26", + "found_in_old_api": true + }, + "FLAT 5 Stonehill Road|W4 3AH": { + "lodgement_date": "2025-09-22", + "found_in_old_api": true + }, + "Flat 10|W4 3AH": { + "lodgement_date": "2023-06-15", + "found_in_old_api": true + }, + "Flat 11|W4 3AH": { + "lodgement_date": "2023-10-19", + "found_in_old_api": true + }, + "Flat 12, Forbes House|W4 3AH": { + "lodgement_date": "2023-10-04", + "found_in_old_api": true + }, + "Flat 13|W4 3AH": { + "lodgement_date": "2012-05-14", + "found_in_old_api": true + }, + "Flat 14|W4 3AH": { + "lodgement_date": "2022-10-15", + "found_in_old_api": true + }, + "Flat 15|W4 3AH": { + "lodgement_date": "2009-08-25", + "found_in_old_api": true + }, + "Flat 16|W4 3AH": { + "lodgement_date": "2012-05-23", + "found_in_old_api": true + }, + "Flat 17|W4 3AH": { + "lodgement_date": "2023-08-31", + "found_in_old_api": true + }, + "Flat 19|W4 3AH": { + "lodgement_date": "2025-07-16", + "found_in_old_api": true + }, + "Flat 20|W4 3AH": { + "lodgement_date": "2024-10-27", + "found_in_old_api": true + }, + "Flat 21|W4 3AH": { + "lodgement_date": "2023-08-08", + "found_in_old_api": true + }, + "Flat 22|W4 3AH": { + "lodgement_date": "2022-10-15", + "found_in_old_api": true + }, + "Flat 23|W4 3AH": { + "lodgement_date": "2022-10-15", + "found_in_old_api": true + }, + "Flat 24|W4 3AH": { + "lodgement_date": "2024-01-12", + "found_in_old_api": true + }, + "10 Douglas Court|SL7 1UQ": { + "lodgement_date": "2018-10-25", + 
"found_in_old_api": true + }, + "1 Windmill Road|HP17 8JA": { + "lodgement_date": "2009-08-25", + "found_in_old_api": true + }, + "31 Denewood|HP13 7LH": { + "lodgement_date": "2009-03-23", + "found_in_old_api": true + }, + "10, Greenways Drive|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "Flat 11|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "12, Greenways Drive|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "Flat 13|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "Flat 14|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "Flat 15|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "Flat 16|TW4 5DD": { + "lodgement_date": "2025-02-26", + "found_in_old_api": true + }, + "Flat 17|TW4 5DD": { + "lodgement_date": "2012-11-29", + "found_in_old_api": true + }, + "Flat 18|TW4 5DD": { + "lodgement_date": "2022-12-28", + "found_in_old_api": true + }, + "FLAT 1 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 2 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 3 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 4 Goodstone Court|HA1 4FL": { + "lodgement_date": "2022-12-14", + "found_in_old_api": true + }, + "FLAT 5 Goodstone Court|HA1 4FL": { + "lodgement_date": "2016-10-04", + "found_in_old_api": true + }, + "FLAT 6 Goodstone Court|HA1 4FL": { + "lodgement_date": "2024-06-05", + "found_in_old_api": true + }, + "FLAT 7 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 8 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 9 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 10 
Goodstone Court|HA1 4FL": { + "lodgement_date": "2023-09-21", + "found_in_old_api": true + }, + "FLAT 11 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 12 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 13 Goodstone Court|HA1 4FL": { + "lodgement_date": "2022-12-13", + "found_in_old_api": true + }, + "FLAT 14 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 15 Goodstone Court|HA1 4FL": { + "lodgement_date": "2024-02-09", + "found_in_old_api": true + }, + "FLAT 16 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 17 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 18 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 19 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 20 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 21 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 22 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 23 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 24 Goodstone Court|HA1 4FL": { + "lodgement_date": "2024-10-24", + "found_in_old_api": true + }, + "FLAT 25 Goodstone Court|HA1 4FL": { + "lodgement_date": "2020-01-18", + "found_in_old_api": true + }, + "FLAT 26 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 27 Goodstone Court|HA1 4FL": { + "lodgement_date": "2022-11-04", + "found_in_old_api": true + }, + "FLAT 28 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 29 
Goodstone Court|HA1 4FL": { + "lodgement_date": "2023-10-13", + "found_in_old_api": true + }, + "FLAT 30 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 31 Goodstone Court|HA1 4FL": { + "lodgement_date": "2023-04-19", + "found_in_old_api": true + }, + "FLAT 32 Goodstone Court|HA1 4FL": { + "lodgement_date": "2025-11-18", + "found_in_old_api": true + }, + "FLAT 33 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 34 Goodstone Court|HA1 4FL": { + "lodgement_date": "2022-09-19", + "found_in_old_api": true + }, + "FLAT 35 Goodstone Court|HA1 4FL": { + "lodgement_date": "2021-10-13", + "found_in_old_api": true + }, + "FLAT 36 Goodstone Court|HA1 4FL": { + "lodgement_date": "2022-10-12", + "found_in_old_api": true + }, + "FLAT 37 Goodstone Court|HA1 4FL": { + "lodgement_date": "2024-08-26", + "found_in_old_api": true + }, + "FLAT 38 Goodstone Court|HA1 4FL": { + "lodgement_date": "2023-05-26", + "found_in_old_api": true + }, + "FLAT 39 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 40 Goodstone Court|HA1 4FL": { + "lodgement_date": "2023-10-05", + "found_in_old_api": true + }, + "FLAT 41 Goodstone Court|HA1 4FL": { + "lodgement_date": "2025-11-24", + "found_in_old_api": true + }, + "FLAT 42 Goodstone Court|HA1 4FL": { + "lodgement_date": "2012-11-06", + "found_in_old_api": true + }, + "FLAT 43 Goodstone Court|HA1 4FL": { + "lodgement_date": "2025-07-08", + "found_in_old_api": true + }, + "30c, Bosanquet Close|UB8 3PE": { + "lodgement_date": "2019-05-27", + "found_in_old_api": true + }, + "30e, Bosanquet Close|UB8 3PE": { + "lodgement_date": "2024-07-30", + "found_in_old_api": true + }, + "13 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2025-07-05", + "found_in_old_api": true + }, + "14 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2012-07-18", + "found_in_old_api": 
true + }, + "15 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2012-06-11", + "found_in_old_api": true + }, + "16 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2022-07-01", + "found_in_old_api": true + }, + "17 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2025-01-07", + "found_in_old_api": true + }, + "18 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2012-07-18", + "found_in_old_api": true + }, + "19 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2025-03-22", + "found_in_old_api": true + }, + "20 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2022-08-15", + "found_in_old_api": true + }, + "21 Stubwick Court, Old Saw Mill Place|HP6 6FF": { + "lodgement_date": "2012-07-18", + "found_in_old_api": true + }, + "90a Murray Road|W5 4DA": { + "lodgement_date": "2013-12-12", + "found_in_old_api": true + }, + "Flat 1, 6 Wolverton Gardens|W5 3LJ": { + "lodgement_date": "2017-10-13", + "found_in_old_api": true + }, + "1, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "10, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "20, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "2, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "3, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "4, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "5, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "6, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "7, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "8, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + 
"found_in_old_api": true + }, + "9, Monsted House|UB1 1FG": { + "lodgement_date": "2019-02-08", + "found_in_old_api": true + }, + "1 Cullis House, 1, Accolade Avenue|UB1 1FH": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "2 Cullis House, 1, Accolade Avenue|UB1 1FH": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "3 Cullis House, 1, Accolade Avenue|UB1 1FH": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "4 Cullis House, 1, Accolade Avenue|UB1 1FH": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "5 Cullis House, 1, Accolade Avenue|UB1 1FH": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "6 Cullis House, 1, Accolade Avenue|UB1 1FH": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "1 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-10", + "found_in_old_api": true + }, + "2 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-10", + "found_in_old_api": true + }, + "3 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "4 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "5 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "6 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "7 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "8 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "9 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "10 Genteel House Samara Drive|UB1 1FJ": { + "lodgement_date": "2019-05-13", + "found_in_old_api": true + }, + "Flat 1 Ash Tree House, 2, Thompson Avenue|SE5 0TE": { + 
"lodgement_date": "2018-09-05", + "found_in_old_api": true + }, + "Flat 3 ASH TREE HOUSE|SE5 0TE": { + "lodgement_date": "2018-09-05", + "found_in_old_api": true + }, + "Flat 5 ASH TREE HOUSE|SE5 0TE": { + "lodgement_date": "2019-09-12", + "found_in_old_api": true + }, + "Flat 8 ASH TREE HOUSE|SE5 0TE": { + "lodgement_date": "2011-10-26", + "found_in_old_api": true + }, + "Flat 12 ASH TREE HOUSE|SE5 0TE": { + "lodgement_date": "2018-09-05", + "found_in_old_api": true + }, + "FLAT 1 599 HARROW ROAD|W10 4RA": { + "lodgement_date": "2017-01-12", + "found_in_old_api": true + }, + "FLAT 2 599 HARROW ROAD|W10 4RA": { + "lodgement_date": "2020-07-28", + "found_in_old_api": true + }, + "FLAT 5 599 HARROW ROAD|W10 4RA": { + "lodgement_date": "2017-01-12", + "found_in_old_api": true + }, + "Flat 1, Ohio Building|SE13 7RX": { + "lodgement_date": "2023-08-15", + "found_in_old_api": true + }, + "Flat 2, Ohio Building|SE13 7RX": { + "lodgement_date": "2017-06-09", + "found_in_old_api": true + }, + "Apartment 1 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2017-01-05", + "found_in_old_api": true + }, + "Apartment 2 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2014-01-22", + "found_in_old_api": true + }, + "Apartment 3 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 4 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2017-01-05", + "found_in_old_api": true + }, + "Apartment 5 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 6 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 7 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2022-10-24", + "found_in_old_api": true + }, + "Apartment 8 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 9 Block B, 105, Benwell Road|N7 7BW": 
{ + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 10 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 11 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2011-08-17", + "found_in_old_api": true + }, + "Apartment 12 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 13 Block B, 105, Benwell Road|N7 7BW": { + "lodgement_date": "2009-02-25", + "found_in_old_api": true + }, + "Apartment 1 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2023-07-19", + "found_in_old_api": true + }, + "Apartment 2 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2022-10-20", + "found_in_old_api": true + }, + "Apartment 3 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 4 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 5 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 6 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2012-11-08", + "found_in_old_api": true + }, + "Apartment 7 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2015-08-30", + "found_in_old_api": true + }, + "Apartment 8 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2020-08-02", + "found_in_old_api": true + }, + "Apartment 9 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2021-10-12", + "found_in_old_api": true + }, + "Apartment 10 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 11 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 12 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2022-02-22", + "found_in_old_api": true + }, + "Apartment 13 Block D, 32, 
Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 14 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 15 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 16 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2009-05-15", + "found_in_old_api": true + }, + "Apartment 17Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2019-01-22", + "found_in_old_api": true + }, + "Apartment 18 Block D, 32, Hornsey Road|N7 7AT": { + "lodgement_date": "2013-06-03", + "found_in_old_api": true + }, + "FLAT B 158 LEAHURST ROAD|SE13 5NL": { + "lodgement_date": "2014-01-24", + "found_in_old_api": true + }, + "2 COLLEGE HOUSE|CM7 1JS": { + "lodgement_date": "2017-01-12", + "found_in_old_api": true + }, + "3 COLLEGE HOUSE|CM7 1JS": { + "lodgement_date": "2017-01-12", + "found_in_old_api": true + }, + "2 Anita Street|M4 5DU": { + "lodgement_date": "2019-10-18", + "found_in_old_api": true + }, + "5 Anita Street|M4 5DU": { + "lodgement_date": "2012-12-21", + "found_in_old_api": true + }, + "6 Anita Street|M4 5DU": { + "lodgement_date": "2021-02-16", + "found_in_old_api": true + }, + "10 Anita Street|M4 5DU": { + "lodgement_date": "2021-07-01", + "found_in_old_api": true + }, + "12 Anita Street|M4 5DU": { + "lodgement_date": "2025-08-08", + "found_in_old_api": true + }, + "26 Anita Street|M4 5DU": { + "lodgement_date": "2010-06-25", + "found_in_old_api": true + }, + "33 Anita Street|M4 5DU": { + "lodgement_date": "2017-03-10", + "found_in_old_api": true + }, + "35 Anita Street|M4 5DU": { + "lodgement_date": "2015-11-18", + "found_in_old_api": true + }, + "36 Anita Street|M4 5DU": { + "lodgement_date": "2013-09-12", + "found_in_old_api": true + }, + "23 George Leigh Street|M4 5DR": { + "lodgement_date": "2025-03-11", + "found_in_old_api": true + }, + "35 George Leigh Street|M4 5DR": { + 
"lodgement_date": "2024-05-29", + "found_in_old_api": true + }, + "39 George Leigh Street|M4 5DR": { + "lodgement_date": "2024-05-28", + "found_in_old_api": true + }, + "51 George Leigh Street|M4 5DR": { + "lodgement_date": "2022-02-03", + "found_in_old_api": true + }, + "1a, Victoria Square|M4 5DX": { + "lodgement_date": "2016-01-08", + "found_in_old_api": true + }, + "4a, Victoria Square|M4 5DX": { + "lodgement_date": "2012-09-19", + "found_in_old_api": true + }, + "5a Victoria Square|M4 5DX": { + "lodgement_date": "2012-06-25", + "found_in_old_api": true + }, + " 6a Victoria Square|M4 5DX": { + "lodgement_date": "2023-02-13", + "found_in_old_api": true + }, + "7a Victoria Square|M4 5DX": { + "lodgement_date": "2017-03-15", + "found_in_old_api": true + }, + "8a Victoria Square|M4 5DX": { + "lodgement_date": "2019-11-25", + "found_in_old_api": true + }, + "9a Victoria Square|M4 5DX": { + "lodgement_date": "2026-02-24", + "found_in_old_api": true + }, + "10a Victoria Square|M4 5DX": { + "lodgement_date": "2013-10-16", + "found_in_old_api": true + }, + "11a Victoria Square|M4 5DX": { + "lodgement_date": "2015-11-06", + "found_in_old_api": true + }, + "12a Victoria Square|M4 5DX": { + "lodgement_date": "2022-11-08", + "found_in_old_api": true + }, + "13a Victoria Square|M4 5DX": { + "lodgement_date": "2025-04-27", + "found_in_old_api": true + }, + "14a Victoria Square|M4 5DX": { + "lodgement_date": "2010-11-09", + "found_in_old_api": true + }, + "15a Victoria Square|M4 5DX": { + "lodgement_date": "2012-03-26", + "found_in_old_api": true + }, + "16a Victoria Square|M4 5DX": { + "lodgement_date": "2009-05-28", + "found_in_old_api": true + }, + "17a Victoria Square|M4 5DX": { + "lodgement_date": "2012-12-20", + "found_in_old_api": true + }, + "18a Victoria Square|M4 5DX": { + "lodgement_date": "2022-07-21", + "found_in_old_api": true + }, + "19a Victoria Square|M4 5DX": { + "lodgement_date": "2009-08-18", + "found_in_old_api": true + }, + "20a Victoria Square|M4 5DX": { 
+ "lodgement_date": "2014-05-27", + "found_in_old_api": true + }, + "21a Victoria Square|M4 5DY": { + "lodgement_date": "2010-04-08", + "found_in_old_api": true + }, + "23a Victoria Square|M4 5DY": { + "lodgement_date": "2016-04-05", + "found_in_old_api": true + }, + "24a Victoria Square|M4 5DY": { + "lodgement_date": "2022-03-23", + "found_in_old_api": true + }, + "25a Victoria Square|M4 5DY": { + "lodgement_date": "2024-10-13", + "found_in_old_api": true + }, + "26a Victoria Square|M4 5DY": { + "lodgement_date": "2024-03-25", + "found_in_old_api": true + }, + "27a Victoria Square|M4 5DY": { + "lodgement_date": "2009-10-05", + "found_in_old_api": true + }, + "29a Victoria Square|M4 5DY": { + "lodgement_date": "2024-05-27", + "found_in_old_api": true + }, + "30a Victoria Square|M4 5DY": { + "lodgement_date": "2011-09-07", + "found_in_old_api": true + }, + "31a Victoria Square|M4 5DY": { + "lodgement_date": "2010-12-09", + "found_in_old_api": true + }, + "32a Victoria Square|M4 5DY": { + "lodgement_date": "2021-02-17", + "found_in_old_api": true + }, + "33a Victoria Square|M4 5DY": { + "lodgement_date": "2011-04-05", + "found_in_old_api": true + }, + "34a Victoria Square|M4 5DY": { + "lodgement_date": "2021-08-13", + "found_in_old_api": true + }, + "36a Victoria Square|M4 5DY": { + "lodgement_date": "2011-04-05", + "found_in_old_api": true + }, + "37a Victoria Square|M4 5DY": { + "lodgement_date": "2018-07-02", + "found_in_old_api": true + }, + "38a Victoria Square|M4 5DY": { + "lodgement_date": "2010-02-02", + "found_in_old_api": true + }, + "39a Victoria Square|M4 5DY": { + "lodgement_date": "2018-01-04", + "found_in_old_api": true + }, + "41a Victoria Square|M4 5DY": { + "lodgement_date": "2011-05-23", + "found_in_old_api": true + }, + "42a Victoria Square|M4 5DY": { + "lodgement_date": "2010-10-14", + "found_in_old_api": true + }, + "43a Victoria Square|M4 5DY": { + "lodgement_date": "2018-10-11", + "found_in_old_api": true + }, + "44a Victoria Square|M4 5DY": { 
+ "lodgement_date": "2010-06-08", + "found_in_old_api": true + }, + "45a Victoria Square|M4 5DY": { + "lodgement_date": "2023-03-08", + "found_in_old_api": true + }, + "46a Victoria Square|M4 5DY": { + "lodgement_date": "2010-12-09", + "found_in_old_api": true + }, + "47a Victoria Square|M4 5DY": { + "lodgement_date": "2010-02-09", + "found_in_old_api": true + }, + "48a Victoria Square|M4 5DY": { + "lodgement_date": "2011-04-12", + "found_in_old_api": true + }, + "49a Victoria Square|M4 5DY": { + "lodgement_date": "2010-11-09", + "found_in_old_api": true + }, + "50a Victoria Square|M4 5DY": { + "lodgement_date": "2025-09-06", + "found_in_old_api": true + }, + "51a Victoria Square|M4 5DY": { + "lodgement_date": "2009-10-05", + "found_in_old_api": true + }, + "52a Victoria Square|M4 5DY": { + "lodgement_date": "2010-12-17", + "found_in_old_api": true + }, + "53a Victoria Square|M4 5DY": { + "lodgement_date": "2022-11-10", + "found_in_old_api": true + }, + "54a Victoria Square|M4 5DY": { + "lodgement_date": "2021-01-08", + "found_in_old_api": true + }, + "55a Victoria Square|M4 5DY": { + "lodgement_date": "2009-08-18", + "found_in_old_api": true + }, + "56a Victoria Square|M4 5DZ": { + "lodgement_date": "2019-03-15", + "found_in_old_api": true + }, + "58a Victoria Square|M4 5DZ": { + "lodgement_date": "2018-11-14", + "found_in_old_api": true + }, + "59a Victoria Square|M4 5DZ": { + "lodgement_date": "2013-11-26", + "found_in_old_api": true + }, + "60a Victoria Square|M4 5DZ": { + "lodgement_date": "2024-06-12", + "found_in_old_api": true + }, + "61a Victoria Square|M4 5DZ": { + "lodgement_date": "2024-08-05", + "found_in_old_api": true + }, + "62a Victoria Square|M4 5DZ": { + "lodgement_date": "2013-05-24", + "found_in_old_api": true + }, + "64a Victoria Square|M4 5DZ": { + "lodgement_date": "2021-07-29", + "found_in_old_api": true + }, + "65a Victoria Square|M4 5DZ": { + "lodgement_date": "2011-08-26", + "found_in_old_api": true + }, + "68a Victoria Square|M4 5DZ": { 
+ "lodgement_date": "2022-03-29", + "found_in_old_api": true + }, + "69a Victoria Square|M4 5DZ": { + "lodgement_date": "2011-01-19", + "found_in_old_api": true + }, + "70a Victoria Square|M4 5DZ": { + "lodgement_date": "2011-07-27", + "found_in_old_api": true + }, + "71a Victoria Square|M4 5DZ": { + "lodgement_date": "2016-11-22", + "found_in_old_api": true + }, + "72a Victoria Square|M4 5DZ": { + "lodgement_date": "2019-01-07", + "found_in_old_api": true + }, + "73a Victoria Square|M4 5DZ": { + "lodgement_date": "2014-07-25", + "found_in_old_api": true + }, + "75a Victoria Square|M4 5DZ": { + "lodgement_date": "2016-01-20", + "found_in_old_api": true + }, + "76a Victoria Square|M4 5DZ": { + "lodgement_date": "2018-01-26", + "found_in_old_api": true + }, + "78a Victoria Square|M4 5DZ": { + "lodgement_date": "2011-06-02", + "found_in_old_api": true + }, + "79a Victoria Square|M4 5DZ": { + "lodgement_date": "2022-01-26", + "found_in_old_api": true + }, + "80a Victoria Square|M4 5DZ": { + "lodgement_date": "2018-11-05", + "found_in_old_api": true + }, + "81a Victoria Square|M4 5DZ": { + "lodgement_date": "2017-03-05", + "found_in_old_api": true + }, + "83a Victoria Square|M4 5DZ": { + "lodgement_date": "2012-05-01", + "found_in_old_api": true + }, + "85a Victoria Square|M4 5DZ": { + "lodgement_date": "2009-10-21", + "found_in_old_api": true + }, + "86a Victoria Square|M4 5DZ": { + "lodgement_date": "2024-05-29", + "found_in_old_api": true + }, + "87a Victoria Square|M4 5DZ": { + "lodgement_date": "2025-07-13", + "found_in_old_api": true + }, + "89a Victoria Square|M4 5DZ": { + "lodgement_date": "2016-05-12", + "found_in_old_api": true + }, + "90a Victoria Square|M4 5DZ": { + "lodgement_date": "2012-05-09", + "found_in_old_api": true + }, + "91a Victoria Square|M4 5DZ": { + "lodgement_date": "2025-04-30", + "found_in_old_api": true + }, + "92a Victoria Square|M4 5DZ": { + "lodgement_date": "2021-07-29", + "found_in_old_api": true + }, + "93a Victoria Square|M4 5EA": { 
+ "lodgement_date": "2013-02-26", + "found_in_old_api": true + }, + "95a Victoria Square|M4 5EA": { + "lodgement_date": "2020-09-06", + "found_in_old_api": true + }, + "96a Victoria Square|M4 5EA": { + "lodgement_date": "2022-06-30", + "found_in_old_api": true + }, + "97a Victoria Square|M4 5EA": { + "lodgement_date": "2016-09-05", + "found_in_old_api": true + }, + "98a Victoria Square|M4 5EA": { + "lodgement_date": "2019-12-19", + "found_in_old_api": true + }, + "99a Victoria Square|M4 5EA": { + "lodgement_date": "2009-03-05", + "found_in_old_api": true + }, + "100a Victoria Square|M4 5EA": { + "lodgement_date": "2011-03-31", + "found_in_old_api": true + }, + "103a Victoria Square|M4 5EA": { + "lodgement_date": "2009-03-05", + "found_in_old_api": true + }, + "104a Victoria Square|M4 5EA": { + "lodgement_date": "2010-01-21", + "found_in_old_api": true + }, + "106a Victoria Square|M4 5EA": { + "lodgement_date": "2015-12-10", + "found_in_old_api": true + }, + "107a Victoria Square|M4 5EA": { + "lodgement_date": "2013-07-01", + "found_in_old_api": true + }, + "108a Victoria Square|M4 5EA": { + "lodgement_date": "2023-03-01", + "found_in_old_api": true + }, + "109a Victoria Square|M4 5EA": { + "lodgement_date": "2010-03-24", + "found_in_old_api": true + }, + "110a Victoria Square|M4 5EA": { + "lodgement_date": "2019-02-25", + "found_in_old_api": true + }, + "111a Victoria Square|M4 5EA": { + "lodgement_date": "2010-02-01", + "found_in_old_api": true + }, + "113a Victoria Square|M4 5EA": { + "lodgement_date": "2012-11-21", + "found_in_old_api": true + }, + "114a Victoria Square|M4 5EA": { + "lodgement_date": "2013-12-06", + "found_in_old_api": true + }, + "115a Victoria Square|M4 5EA": { + "lodgement_date": "2022-08-25", + "found_in_old_api": true + }, + "116a Victoria Square|M4 5EA": { + "lodgement_date": "2011-02-25", + "found_in_old_api": true + }, + "119a Victoria Square|M4 5EA": { + "lodgement_date": "2024-04-12", + "found_in_old_api": true + }, + "120a Victoria 
Square|M4 5EA": { + "lodgement_date": "2011-04-04", + "found_in_old_api": true + }, + "121a Victoria Square|M4 5EA": { + "lodgement_date": "2010-11-09", + "found_in_old_api": true + }, + "122a Victoria Square|M4 5EA": { + "lodgement_date": "2012-05-01", + "found_in_old_api": true + }, + "123a Victoria Square|M4 5EA": { + "lodgement_date": "2022-01-12", + "found_in_old_api": true + }, + "125a Victoria Square|M4 5EA": { + "lodgement_date": "2023-11-22", + "found_in_old_api": true + }, + "126a Victoria Square|M4 5EA": { + "lodgement_date": "2010-08-24", + "found_in_old_api": true + }, + "127a Victoria Square|M4 5EA": { + "lodgement_date": "2020-03-01", + "found_in_old_api": true + }, + "128a Victoria Square|M4 5EA": { + "lodgement_date": "2015-02-04", + "found_in_old_api": true + }, + "129a Victoria Square|M4 5EA": { + "lodgement_date": "2010-07-07", + "found_in_old_api": true + }, + "130a Victoria Square|M4 5FA": { + "lodgement_date": "2026-02-11", + "found_in_old_api": true + }, + "131a Victoria Square|M4 5FA": { + "lodgement_date": "2025-05-29", + "found_in_old_api": true + }, + "132a Victoria Square|M4 5FA": { + "lodgement_date": "2019-12-24", + "found_in_old_api": true + }, + "134a Victoria Square|M4 5FA": { + "lodgement_date": "2011-08-18", + "found_in_old_api": true + }, + "135a Victoria Square|M4 5FA": { + "lodgement_date": "2019-09-05", + "found_in_old_api": true + }, + "136a Victoria Square|M4 5FA": { + "lodgement_date": "2025-02-14", + "found_in_old_api": true + }, + "137a Victoria Square|M4 5FA": { + "lodgement_date": "2024-07-17", + "found_in_old_api": true + }, + "138a Victoria Square|M4 5FA": { + "lodgement_date": "2023-10-11", + "found_in_old_api": true + }, + "139a Victoria Square|M4 5FA": { + "lodgement_date": "2021-06-22", + "found_in_old_api": true + }, + "140a Victoria Square|M4 5FA": { + "lodgement_date": "2020-06-15", + "found_in_old_api": true + }, + "141a Victoria Square|M4 5FA": { + "lodgement_date": "2025-12-22", + "found_in_old_api": true + 
}, + "142a Victoria Square|M4 5FA": { + "lodgement_date": "2025-12-22", + "found_in_old_api": true + }, + "143a Victoria Square|M4 5FA": { + "lodgement_date": "2023-01-18", + "found_in_old_api": true + }, + "144a Victoria Square|M4 5FA": { + "lodgement_date": "2011-04-04", + "found_in_old_api": true + }, + "146a Victoria Square|M4 5FA": { + "lodgement_date": "2022-09-21", + "found_in_old_api": true + }, + "147a Victoria Square|M4 5FA": { + "lodgement_date": "2011-05-04", + "found_in_old_api": true + }, + "148a Victoria Square|M4 5FA": { + "lodgement_date": "2014-11-18", + "found_in_old_api": true + }, + "149a Victoria Square|M4 5FA": { + "lodgement_date": "2009-12-14", + "found_in_old_api": true + }, + "150a Victoria Square|M4 5FA": { + "lodgement_date": "2009-12-14", + "found_in_old_api": true + }, + "152a Victoria Square|M4 5FA": { + "lodgement_date": "2017-06-23", + "found_in_old_api": true + }, + "154a Victoria Square|M4 5FA": { + "lodgement_date": "2025-04-29", + "found_in_old_api": true + }, + "156a Victoria Square|M4 5FA": { + "lodgement_date": "2011-04-05", + "found_in_old_api": true + }, + "157a Victoria Square|M4 5FA": { + "lodgement_date": "2023-09-11", + "found_in_old_api": true + }, + "158a Victoria Square|M4 5FA": { + "lodgement_date": "2021-12-07", + "found_in_old_api": true + }, + "160a Victoria Square|M4 5FA": { + "lodgement_date": "2011-02-04", + "found_in_old_api": true + }, + "163a Victoria Square|M4 5FA": { + "lodgement_date": "2010-02-02", + "found_in_old_api": true + }, + "164a Victoria Square|M4 5FA": { + "lodgement_date": "2020-10-19", + "found_in_old_api": true + }, + "165a Victoria Square|M4 5FA": { + "lodgement_date": "2019-12-13", + "found_in_old_api": true + } +} \ No newline at end of file From 8f2885474bb8c0959a23e4c60398a60f07c5987e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Apr 2026 11:53:52 +0000 Subject: [PATCH 012/106] fixing address2uprn tests --- backend/address2UPRN/tests/test_data.csv | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv index ee23813b..aaeee66d 100644 --- a/backend/address2UPRN/tests/test_data.csv +++ b/backend/address2UPRN/tests/test_data.csv @@ -117,14 +117,14 @@ FLAT 43 Goodstone Court,HA1 4FL,10070269095 10 Genteel House Samara Drive,UB1 1FJ,12189844 1 ASH TREE HOUSE,SE5 0TE,None "Flat 1 Ash Tree House, 2, Thompson Avenue",SE5 0TE,10009803979 -3 ASH TREE HOUSE,SE5 0TE,None +3 ASH TREE HOUSE,SE5 0TE,10009803981 Flat 3 ASH TREE HOUSE,SE5 0TE,10009803981 -5 ASH TREE HOUSE,SE5 0TE,None +5 ASH TREE HOUSE,SE5 0TE,10009803983 Flat 5 ASH TREE HOUSE,SE5 0TE,10009803983 Flat 8 ASH TREE HOUSE,SE5 0TE,10009803986 8 ASH TREE HOUSE,SE5 0TE,None Flat 12 ASH TREE HOUSE,SE5 0TE,10009803990 -12 ASH TREE HOUSE,SE5 0TE,None +12 ASH TREE HOUSE,SE5 0TE,10009803990 FLAT 1 599 HARROW ROAD,W10 4RA,217113930 FLAT 2 599 HARROW ROAD,W10 4RA,217113931 FLAT 3 599 HARROW ROAD,W10 4RA,None From 8ec6eecc4d28157db264258c1555a6ae464129ff Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Apr 2026 12:00:19 +0000 Subject: [PATCH 013/106] reverting manually tweaked tests --- backend/address2UPRN/main.py | 3 ++- backend/address2UPRN/tests/test_data.csv | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 98f8c65b..fad5c64e 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -78,7 +78,8 @@ def get_uprn_with_epc_df( # Safe to return the agreed UPRN found_uprn = top_rank_df.iloc[0]["uprn"] - if found_uprn == "": + # Handling numeric missingness in new api + if found_uprn in ["", "nan"]: return None if verbose: diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv index aaeee66d..ee23813b 100644 --- a/backend/address2UPRN/tests/test_data.csv +++ b/backend/address2UPRN/tests/test_data.csv @@ -117,14 +117,14 @@ 
FLAT 43 Goodstone Court,HA1 4FL,10070269095 10 Genteel House Samara Drive,UB1 1FJ,12189844 1 ASH TREE HOUSE,SE5 0TE,None "Flat 1 Ash Tree House, 2, Thompson Avenue",SE5 0TE,10009803979 -3 ASH TREE HOUSE,SE5 0TE,10009803981 +3 ASH TREE HOUSE,SE5 0TE,None Flat 3 ASH TREE HOUSE,SE5 0TE,10009803981 -5 ASH TREE HOUSE,SE5 0TE,10009803983 +5 ASH TREE HOUSE,SE5 0TE,None Flat 5 ASH TREE HOUSE,SE5 0TE,10009803983 Flat 8 ASH TREE HOUSE,SE5 0TE,10009803986 8 ASH TREE HOUSE,SE5 0TE,None Flat 12 ASH TREE HOUSE,SE5 0TE,10009803990 -12 ASH TREE HOUSE,SE5 0TE,10009803990 +12 ASH TREE HOUSE,SE5 0TE,None FLAT 1 599 HARROW ROAD,W10 4RA,217113930 FLAT 2 599 HARROW ROAD,W10 4RA,217113931 FLAT 3 599 HARROW ROAD,W10 4RA,None From 821a0a08f7508a72c1d71fb8cfc46963d3f60b39 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Apr 2026 12:02:34 +0000 Subject: [PATCH 014/106] addressing feedback on from_api_response --- datatypes/epc/domain/mapper.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index 7ef74340..d5212fe5 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1,5 +1,5 @@ from datetime import date -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence, Union, Dict, Any from datatypes.epc.domain.epc_property_data import ( EnergyElement, @@ -1448,7 +1448,7 @@ class EpcPropertyDataMapper: return [EpcPropertyDataMapper._map_energy_element(e) for e in elements] @staticmethod - def from_api_response(data: dict) -> "EpcPropertyData": + def from_api_response(data: Dict[str, Any]) -> "EpcPropertyData": """ Dispatch to the correct schema mapper based on schema_type. Supports RdSAP-Schema-21.0.0 and RdSAP-Schema-21.0.1 only. 
@@ -1459,11 +1459,13 @@ class EpcPropertyDataMapper: schema = data.get("schema_type", "") if schema == "RdSAP-Schema-21.0.1": from datatypes.epc.schema.rdsap_schema_21_0_1 import RdSapSchema21_0_1 + return EpcPropertyDataMapper.from_rdsap_schema_21_0_1( from_dict(RdSapSchema21_0_1, data) ) if schema == "RdSAP-Schema-21.0.0": from datatypes.epc.schema.rdsap_schema_21_0_0 import RdSapSchema21_0_0 + return EpcPropertyDataMapper.from_rdsap_schema_21_0_0( from_dict(RdSapSchema21_0_0, data) ) @@ -1596,7 +1598,11 @@ def _map_sap_heating( fuel_type = ( _raw_fuel if _raw_fuel - else ("Electricity" if main.system_type.lower() in _ELECTRIC_SYSTEM_TYPES else _raw_fuel) + else ( + "Electricity" + if main.system_type.lower() in _ELECTRIC_SYSTEM_TYPES + else _raw_fuel + ) ) return SapHeating( @@ -1618,7 +1624,11 @@ def _map_sap_heating( secondary_fuel_type=secondary_fuel_type, secondary_heating_type=heating.secondary_heating.secondary_system, shower_outlets=shower_outlets, - cylinder_size=heating.water_heating.cylinder_size if heating.water_heating.cylinder_size != "No Cylinder" else None, + cylinder_size=( + heating.water_heating.cylinder_size + if heating.water_heating.cylinder_size != "No Cylinder" + else None + ), cylinder_insulation_type=heating.water_heating.insulation_type, cylinder_insulation_thickness_mm=heating.water_heating.insulation_thickness_mm, immersion_heating_type=heating.water_heating.immersion_type, From 001e9ce88235f1231d9c87eea2136a44daf04b91 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Apr 2026 12:03:39 +0000 Subject: [PATCH 015/106] remove inline import --- datatypes/epc/domain/mapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py index d5212fe5..cc960f87 100644 --- a/datatypes/epc/domain/mapper.py +++ b/datatypes/epc/domain/mapper.py @@ -1,5 +1,6 @@ from datetime import date from typing import List, Optional, Sequence, Union, Dict, Any +from 
datatypes.epc.schema.helpers import from_dict from datatypes.epc.domain.epc_property_data import ( EnergyElement, @@ -1454,7 +1455,6 @@ class EpcPropertyDataMapper: Supports RdSAP-Schema-21.0.0 and RdSAP-Schema-21.0.1 only. Raises ValueError for unsupported schemas — add cases here as needed. """ - from datatypes.epc.schema.helpers import from_dict schema = data.get("schema_type", "") if schema == "RdSAP-Schema-21.0.1": From cadf8836d13a3249bb591fb0abc626d86f8ac9a3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Apr 2026 12:04:46 +0000 Subject: [PATCH 016/106] making full_address property --- backend/epc_client/client.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/backend/epc_client/client.py b/backend/epc_client/client.py index 33f25ef5..0e3b48fc 100644 --- a/backend/epc_client/client.py +++ b/backend/epc_client/client.py @@ -7,7 +7,11 @@ from typing import Callable, Optional import httpx import pandas as pd -from backend.epc_client.exceptions import EpcApiError, EpcNotFoundError, EpcRateLimitError +from backend.epc_client.exceptions import ( + EpcApiError, + EpcNotFoundError, + EpcRateLimitError, +) from backend.epc_client._retry import call_with_retry from datatypes.epc.domain.epc_property_data import EpcPropertyData from datatypes.epc.domain.mapper import EpcPropertyDataMapper @@ -26,6 +30,7 @@ class EpcSearchResult: current_energy_efficiency_band: str registration_date: str + @property def full_address(self) -> str: parts = [ self.address_line_1, @@ -68,12 +73,16 @@ class EpcClientService: return None # Round 1: score on addressLine1 only - cert_num = self._pick_best_cert(candidates, address, use_full_address=False, fn=get_uprn_candidates) + cert_num = self._pick_best_cert( + candidates, address, use_full_address=False, fn=get_uprn_candidates + ) if cert_num: return self._safe_get(cert_num) # Round 2: score on all address lines joined - cert_num = self._pick_best_cert(candidates, 
address, use_full_address=True, fn=get_uprn_candidates) + cert_num = self._pick_best_cert( + candidates, address, use_full_address=True, fn=get_uprn_candidates + ) if cert_num: return self._safe_get(cert_num) @@ -145,14 +154,18 @@ class EpcClientService: use_full_address: bool, fn: Callable[..., pd.DataFrame], ) -> Optional[str]: - df = pd.DataFrame([ - { - "address": r.full_address() if use_full_address else r.address_line_1, - "uprn": str(r.uprn) if r.uprn is not None else "", - "certificate_number": r.certificate_number, - } - for r in candidates - ]) + df = pd.DataFrame( + [ + { + "address": ( + r.full_address() if use_full_address else r.address_line_1 + ), + "uprn": str(r.uprn) if r.uprn is not None else "", + "certificate_number": r.certificate_number, + } + for r in candidates + ] + ) scored = fn(df, user_address=user_address) if scored.empty: From a1b207ba558e391c4c37ca65dd8d7bf5432d76d9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 28 Apr 2026 13:46:09 +0000 Subject: [PATCH 017/106] bolstering testing --- CLAUDE.md | 6 ++ backend/app/requirements/requirements.txt | 7 +- backend/epc_client/__init__.py | 4 +- backend/epc_client/client.py | 97 +---------------------- backend/epc_client/requirements.txt | 1 - backend/epc_client/tests/test_client.py | 78 +++++------------- backend/tests/test_address_match.py | 60 ++++++++++++++ backend/utils/addressMatch.py | 9 ++- backend/utils/epc_address_match.py | 67 ++++++++++++++++ datatypes/epc/search/__init__.py | 3 + datatypes/epc/search/epc_search_result.py | 28 +++++++ pyproject.toml | 2 - 12 files changed, 201 insertions(+), 161 deletions(-) delete mode 100644 backend/epc_client/requirements.txt create mode 100644 backend/tests/test_address_match.py create mode 100644 backend/utils/epc_address_match.py create mode 100644 datatypes/epc/search/__init__.py create mode 100644 datatypes/epc/search/epc_search_result.py diff --git a/CLAUDE.md b/CLAUDE.md index 263679ff..23d465a7 100644 --- a/CLAUDE.md 
+++ b/CLAUDE.md @@ -59,3 +59,9 @@ New containers install all skills automatically via the Dockerfile. If you're in bash .devcontainer/backend/install-claude-skills.sh ``` +## Type Safety + +All new code must pass `pyright` with zero errors under `typeCheckingMode = strict`. +Annotate all function return types. Use `dict[str, Any]` for untyped external API +payloads — never bare `dict`. Add `pandas-stubs` when introducing pandas to a module. + diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt index 9fdbfe4c..80907a79 100644 --- a/backend/app/requirements/requirements.txt +++ b/backend/app/requirements/requirements.txt @@ -13,4 +13,9 @@ boto3==1.35.44 openpyxl==3.1.5 # Basic pytz -sqlmodel \ No newline at end of file +sqlmodel +# HTTP client +httpx==0.28.1 +# Data +pandas +pandas-stubs \ No newline at end of file diff --git a/backend/epc_client/__init__.py b/backend/epc_client/__init__.py index 720594f7..ab46a266 100644 --- a/backend/epc_client/__init__.py +++ b/backend/epc_client/__init__.py @@ -1,3 +1,3 @@ -from backend.epc_client.client import EpcClientService, EpcSearchResult +from backend.epc_client.client import EpcClientService -__all__ = ["EpcClientService", "EpcSearchResult"] +__all__ = ["EpcClientService"] diff --git a/backend/epc_client/client.py b/backend/epc_client/client.py index 0e3b48fc..d00a164f 100644 --- a/backend/epc_client/client.py +++ b/backend/epc_client/client.py @@ -1,11 +1,9 @@ # Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml from __future__ import annotations -from dataclasses import dataclass -from typing import Callable, Optional +from typing import Any, Optional import httpx -import pandas as pd from backend.epc_client.exceptions import ( EpcApiError, @@ -15,35 +13,11 @@ from backend.epc_client.exceptions import ( from backend.epc_client._retry import call_with_retry from datatypes.epc.domain.epc_property_data import EpcPropertyData from 
datatypes.epc.domain.mapper import EpcPropertyDataMapper - - -@dataclass -class EpcSearchResult: - certificate_number: str - address_line_1: str - address_line_2: Optional[str] - address_line_3: Optional[str] - address_line_4: Optional[str] - postcode: str - post_town: str - uprn: Optional[int] - current_energy_efficiency_band: str - registration_date: str - - @property - def full_address(self) -> str: - parts = [ - self.address_line_1, - self.address_line_2, - self.address_line_3, - self.address_line_4, - ] - return ", ".join(p for p in parts if p) +from datatypes.epc.search import EpcSearchResult class EpcClientService: BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" - _MIN_MATCH_SCORE = 0.6 def __init__(self, auth_token: str) -> None: self._headers = { @@ -65,34 +39,11 @@ class EpcClientService: def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: return call_with_retry(lambda: self._search(postcode=postcode)) - def find_best_match(self, postcode: str, address: str) -> Optional[EpcPropertyData]: - from backend.utils.addressMatch import get_uprn_candidates - - candidates = self.search_by_postcode(postcode) - if not candidates: - return None - - # Round 1: score on addressLine1 only - cert_num = self._pick_best_cert( - candidates, address, use_full_address=False, fn=get_uprn_candidates - ) - if cert_num: - return self._safe_get(cert_num) - - # Round 2: score on all address lines joined - cert_num = self._pick_best_cert( - candidates, address, use_full_address=True, fn=get_uprn_candidates - ) - if cert_num: - return self._safe_get(cert_num) - - return None - # ------------------------------------------------------------------ # Private helpers # ------------------------------------------------------------------ - def _fetch_certificate(self, cert_num: str) -> dict: + def _fetch_certificate(self, cert_num: str) -> dict[str, Any]: resp = httpx.get( f"{self.BASE_URL}/api/certificate", params={"certificate_number": cert_num}, @@ 
-133,7 +84,7 @@ class EpcClientService: return [self._parse_search_result(r) for r in rows] @staticmethod - def _parse_search_result(row: dict) -> EpcSearchResult: + def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult: return EpcSearchResult( certificate_number=row["certificateNumber"], address_line_1=row["addressLine1"], @@ -146,43 +97,3 @@ class EpcClientService: current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], registration_date=row["registrationDate"], ) - - def _pick_best_cert( - self, - candidates: list[EpcSearchResult], - user_address: str, - use_full_address: bool, - fn: Callable[..., pd.DataFrame], - ) -> Optional[str]: - df = pd.DataFrame( - [ - { - "address": ( - r.full_address() if use_full_address else r.address_line_1 - ), - "uprn": str(r.uprn) if r.uprn is not None else "", - "certificate_number": r.certificate_number, - } - for r in candidates - ] - ) - - scored = fn(df, user_address=user_address) - if scored.empty: - return None - - best_score = scored.iloc[0]["lexiscore"] - if best_score < self._MIN_MATCH_SCORE: - return None - - top = scored[scored["lexirank"] == 1] - if len(top) != 1: - return None - - return str(top.iloc[0]["certificate_number"]) - - def _safe_get(self, cert_num: str) -> Optional[EpcPropertyData]: - try: - return self.get_by_certificate_number(cert_num) - except EpcNotFoundError: - return None diff --git a/backend/epc_client/requirements.txt b/backend/epc_client/requirements.txt deleted file mode 100644 index cee32373..00000000 --- a/backend/epc_client/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -httpx==0.28.1 diff --git a/backend/epc_client/tests/test_client.py b/backend/epc_client/tests/test_client.py index 51dd2a12..7933f21d 100644 --- a/backend/epc_client/tests/test_client.py +++ b/backend/epc_client/tests/test_client.py @@ -1,7 +1,9 @@ from unittest.mock import MagicMock, patch, call import pytest -from backend.epc_client.client import EpcClientService, EpcSearchResult +from 
backend.epc_client.client import EpcClientService +from backend.utils.epc_address_match import find_best_epc_match +from datatypes.epc.search import EpcSearchResult from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError from datatypes.epc.domain.epc_property_data import EpcPropertyData from backend.epc_client.tests.conftest import make_search_row @@ -122,88 +124,51 @@ def test_search_by_postcode_404_returns_empty_list(epc_service): # --------------------------------------------------------------------------- -# Tests 8-10: find_best_match +# Tests 8-10: find_best_epc_match — real scoring, only HTTP mocked # --------------------------------------------------------------------------- -def _make_scored_df(rows, scores, ranks): - import pandas as pd - df = pd.DataFrame(rows) - df["lexiscore"] = scores - df["lexirank"] = ranks - return df.sort_values("lexirank") - - -def test_find_best_match_round1_clear_winner(epc_service, rdsap_21_0_1_cert): +def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_cert): search_rows = [ make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"), make_search_row(cert_num="CERT-LOSE", address_line_1="99 Nowhere Lane"), ] cert_response = {"data": rdsap_21_0_1_cert} - df_rows = [ - {"address": "1 High Street", "uprn": "100023336956", "certificate_number": "CERT-WIN"}, - {"address": "99 Nowhere Lane", "uprn": "100023336956", "certificate_number": "CERT-LOSE"}, - ] - scored = _make_scored_df(df_rows, [0.9, 0.1], [1, 2]) - def fake_get(url, params=None, **kwargs): if "search" in url: return _mock_response(200, {"data": search_rows}) return _mock_response(200, cert_response) - with patch("httpx.get", side_effect=fake_get), \ - patch("backend.utils.addressMatch.get_uprn_candidates", return_value=scored): - result = epc_service.find_best_match("SW1A 1AA", "1 High Street") + with patch("httpx.get", side_effect=fake_get): + result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High 
Street") assert isinstance(result, EpcPropertyData) -def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_0_1_cert): +def test_find_best_match_resolves_on_second_pass_using_full_address(epc_service, rdsap_21_0_1_cert): + # Both candidates share address_line_1 — round 1 is ambiguous. + # Round 2 scores against full_address and picks the correct floor. search_rows = [ make_search_row( - cert_num="CERT-A", address_line_1="1 High Street", + cert_num="CERT-A", + address_line_1="1 High Street", address_line_2="Ground Floor", ), make_search_row( - cert_num="CERT-B", address_line_1="1 High Street", + cert_num="CERT-B", + address_line_1="1 High Street", address_line_2="First Floor", ), ] cert_response = {"data": rdsap_21_0_1_cert} - # Round 1: both score equally — ambiguous (two rank-1s) - ambiguous = _make_scored_df( - [ - {"address": "1 High Street", "uprn": "111", "certificate_number": "CERT-A"}, - {"address": "1 High Street", "uprn": "222", "certificate_number": "CERT-B"}, - ], - [0.9, 0.9], - [1, 1], - ) - # Round 2: CERT-A wins on full address - resolved = _make_scored_df( - [ - {"address": "1 High Street, Ground Floor", "uprn": "111", "certificate_number": "CERT-A"}, - {"address": "1 High Street, First Floor", "uprn": "222", "certificate_number": "CERT-B"}, - ], - [0.85, 0.4], - [1, 2], - ) - - call_count = {"n": 0} - - def fake_candidates(df, user_address, **kwargs): - call_count["n"] += 1 - return ambiguous if call_count["n"] == 1 else resolved - def fake_get(url, params=None, **kwargs): if "search" in url: return _mock_response(200, {"data": search_rows}) return _mock_response(200, cert_response) - with patch("httpx.get", side_effect=fake_get), \ - patch("backend.utils.addressMatch.get_uprn_candidates", side_effect=fake_candidates): - result = epc_service.find_best_match("SW1A 1AA", "1 High Street Ground Floor") + with patch("httpx.get", side_effect=fake_get): + result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street Ground 
Floor") assert isinstance(result, EpcPropertyData) @@ -211,14 +176,7 @@ def test_find_best_match_round1_ambiguous_round2_resolves(epc_service, rdsap_21_ def test_find_best_match_returns_none_when_no_good_match(epc_service): search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")] - low_score = _make_scored_df( - [{"address": "99 Nowhere Lane", "uprn": "111", "certificate_number": "CERT-X"}], - [0.1], - [1], - ) - - with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})), \ - patch("backend.utils.addressMatch.get_uprn_candidates", return_value=low_score): - result = epc_service.find_best_match("SW1A 1AA", "1 Completely Different Road") + with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})): + result = find_best_epc_match(epc_service, "SW1A 1AA", "1 Completely Different Road") assert result is None diff --git a/backend/tests/test_address_match.py b/backend/tests/test_address_match.py new file mode 100644 index 00000000..f6a564df --- /dev/null +++ b/backend/tests/test_address_match.py @@ -0,0 +1,60 @@ +from backend.utils.addressMatch import AddressMatch + + +class TestNormaliseAddress: + def test_lowercases_input(self): + assert AddressMatch.normalise_address("1 HIGH STREET") == "1 high street" + + def test_expands_road_abbreviation(self): + assert AddressMatch.normalise_address("1 Moreton Rd") == "1 moreton road" + + def test_expands_avenue_abbreviation(self): + assert AddressMatch.normalise_address("2 Park Ave") == "2 park avenue" + + def test_removes_punctuation_keeps_slash(self): + result = AddressMatch.normalise_address("Flat 1/A, Some Road") + assert "," not in result + assert "/" in result + + def test_splits_digit_letter_suffix(self): + assert "42 a" in AddressMatch.normalise_address("42a Some Road") + + def test_empty_string_returns_empty(self): + assert AddressMatch.normalise_address("") == "" + + def test_removes_no_prefix(self): + result = AddressMatch.normalise_address("No 5 
High Street") + assert "no" not in result.split() + assert "5" in result + + +class TestScore: + def test_identical_address_scores_one(self): + assert AddressMatch.score("1 High Street", "1 High Street") == 1.0 + + def test_case_insensitive(self): + assert AddressMatch.score("1 HIGH STREET", "1 high street") == 1.0 + + def test_street_type_synonym_scores_one(self): + # "Rd" expands to "road" during normalisation — should be identical + assert AddressMatch.score("1 High Rd", "1 High Road") == 1.0 + + def test_different_building_numbers_score_zero(self): + assert AddressMatch.score("1 High Street", "2 High Street") == 0.0 + + def test_disjoint_number_sets_score_zero(self): + assert AddressMatch.score("1 High Street", "99 Nowhere Lane") == 0.0 + + def test_user_address_has_number_but_epc_does_not_scores_zero(self): + assert AddressMatch.score("1 High Street", "High Street") == 0.0 + + def test_partial_address_scores_above_threshold(self): + # Extra token in user address ("London") — same building number, high overlap + score = AddressMatch.score("1 High Street London", "1 High Street") + assert 0.6 <= score < 1.0 + + def test_flat_number_mismatch_scores_zero(self): + # User has two numbers but no "flat" token; EPC has different flat number + # Triggers the order-sensitive flat guard + score = AddressMatch.score("3 42 High Street", "Flat 7 42 High Street") + assert score == 0.0 diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 12c1ac53..a0c6ebdf 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -1,8 +1,13 @@ +from __future__ import annotations + import re -from typing import Any, Optional from difflib import SequenceMatcher +from typing import TYPE_CHECKING, Any, Optional + import requests -import pandas as pd + +if TYPE_CHECKING: + import pandas as pd class AddressMatch: diff --git a/backend/utils/epc_address_match.py b/backend/utils/epc_address_match.py new file mode 100644 index 00000000..f73d6d1d --- 
/dev/null +++ b/backend/utils/epc_address_match.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +from backend.utils.addressMatch import AddressMatch +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.search import EpcSearchResult + +if TYPE_CHECKING: + from backend.epc_client.client import EpcClientService + +_MIN_MATCH_SCORE = 0.6 + + +def find_best_epc_match( + service: EpcClientService, + postcode: str, + address: str, +) -> Optional[EpcPropertyData]: + candidates = service.search_by_postcode(postcode) + if not candidates: + return None + + cert_num = _pick_best_cert(candidates, address, use_full_address=False) + if cert_num: + return _safe_get(service, cert_num) + + cert_num = _pick_best_cert(candidates, address, use_full_address=True) + if cert_num: + return _safe_get(service, cert_num) + + return None + + +def _pick_best_cert( + candidates: list[EpcSearchResult], + user_address: str, + use_full_address: bool, +) -> Optional[str]: + scored: list[tuple[float, str]] = [ + ( + AddressMatch.score( + user_address, + r.full_address if use_full_address else r.address_line_1, + ), + r.certificate_number, + ) + for r in candidates + ] + if not scored: + return None + best_score = max(s for s, _ in scored) + if best_score < _MIN_MATCH_SCORE: + return None + top = [cert for s, cert in scored if s == best_score] + if len(top) != 1: + return None + return top[0] + + +def _safe_get(service: EpcClientService, cert_num: str) -> Optional[EpcPropertyData]: + from backend.epc_client.exceptions import EpcNotFoundError + + try: + return service.get_by_certificate_number(cert_num) + except EpcNotFoundError: + return None diff --git a/datatypes/epc/search/__init__.py b/datatypes/epc/search/__init__.py new file mode 100644 index 00000000..3e08a56e --- /dev/null +++ b/datatypes/epc/search/__init__.py @@ -0,0 +1,3 @@ +from datatypes.epc.search.epc_search_result import EpcSearchResult + +__all__ 
= ["EpcSearchResult"] diff --git a/datatypes/epc/search/epc_search_result.py b/datatypes/epc/search/epc_search_result.py new file mode 100644 index 00000000..b6f47caf --- /dev/null +++ b/datatypes/epc/search/epc_search_result.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class EpcSearchResult: + certificate_number: str + address_line_1: str + address_line_2: Optional[str] + address_line_3: Optional[str] + address_line_4: Optional[str] + postcode: str + post_town: str + uprn: Optional[int] + current_energy_efficiency_band: str + registration_date: str + + @property + def full_address(self) -> str: + parts = [ + self.address_line_1, + self.address_line_2, + self.address_line_3, + self.address_line_4, + ] + return ", ".join(p for p in parts if p) diff --git a/pyproject.toml b/pyproject.toml index 72ec3f0c..49108861 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1 @@ [tool.pyright] -reportUnknownMemberType = false -reportUnknownVariableType = false \ No newline at end of file From 87afac86315bae7ee19b8f029bcce4ab872add1a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 30 Apr 2026 09:58:24 +0100 Subject: [PATCH 018/106] minor exporting data --- backend/export/property_scenarios/main.py | 6 +++--- sfr/principal_pitch/2_export_data.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/backend/export/property_scenarios/main.py b/backend/export/property_scenarios/main.py index 64627e01..100e34e8 100644 --- a/backend/export/property_scenarios/main.py +++ b/backend/export/property_scenarios/main.py @@ -83,7 +83,7 @@ def process_export( else: scenario_recs = recommendations_df[ recommendations_df["scenario_id"] == group_key - ] + ] if scenario_recs.empty: logger.info( @@ -140,8 +140,8 @@ def handler( body_dict = { "task_id": "test", "subtask_id": "test", - "portfolio_id": 682, - "scenario_ids": [1210], + "portfolio_id": 632, + 
"scenario_ids": [1144], "default_plans_only": False, } :param event: Lambda event containing export request details diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index b275086d..9fdff9f6 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -26,13 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func -PORTFOLIO_ID = 711 -SCENARIOS = [1233] +PORTFOLIO_ID = 632 +SCENARIOS = [1144] scenario_names = { - 1233: "Reach EPC C", + 1144: "EPC C", } -project_name = "Novus" +project_name = "Calico" def get_data(portfolio_id, scenario_ids): @@ -230,7 +230,7 @@ for scenario_id in SCENARIOS: # Get recs for this scenario recommended_measures_df = recommendations_df[ recommendations_df["scenario_id"] == scenario_id - ][["property_id", "measure_type", "estimated_cost", "default"]] + ][["property_id", "measure_type", "estimated_cost", "default"]] recommended_measures_df = recommended_measures_df[ recommended_measures_df["default"] ] @@ -238,7 +238,7 @@ for scenario_id in SCENARIOS: post_install_sap = recommendations_df[ recommendations_df["scenario_id"] == scenario_id - ][["property_id", "default", "sap_points"]] + ][["property_id", "default", "sap_points"]] post_install_sap = post_install_sap[post_install_sap["default"]] # Sum up the sap points by property id post_install_sap = ( From 24ec68bb9f1d5f6e5ca8eb748b87cf64145ac7df Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 6 May 2026 07:55:37 +0000 Subject: [PATCH 019/106] save progress for historical epc procress --- backend/etl/etl_opendatacommunities/README.md | 14 ++ backend/etl/etl_opendatacommunities/main.py | 144 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 backend/etl/etl_opendatacommunities/README.md create mode 100644 backend/etl/etl_opendatacommunities/main.py diff --git a/backend/etl/etl_opendatacommunities/README.md 
b/backend/etl/etl_opendatacommunities/README.md new file mode 100644 index 00000000..bf16ba89 --- /dev/null +++ b/backend/etl/etl_opendatacommunities/README.md @@ -0,0 +1,14 @@ +This website https://epc.opendatacommunities.org/ closed down on 30th May 2026 + +So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/epc_opendatacommunities/master_backup/ ) + +This script assumes the following: + +1) You downloaded the master copy, uncompressed it, and pointed the script at its path so we can read the csv + + +The script function is: + +1) reads the csv for all data, separating each iteration by postcode +2) compresses the csv and saves it in the location +3) only gets the postcode data, compresses and uploads to s3 -> location s3://retrofit-data-dev/epc_opendatacommunities/{postcode}/data.csv.gz (compressed data) \ No newline at end of file diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py new file mode 100644 index 00000000..30b4045a --- /dev/null +++ b/backend/etl/etl_opendatacommunities/main.py @@ -0,0 +1,144 @@ +from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait +from io import BytesIO +from pathlib import Path +from typing import Any + +import boto3 +import pandas as pd +from botocore.config import Config +from tqdm import tqdm + +from utils.logger import setup_logger + +logger = setup_logger() + +SRC_ROOT = Path("/workspaces/home/epc_data") +TMP_ROOT = Path("/tmp/epc_postcodes") +S3_BUCKET = "retrofit-data-dev" +S3_PREFIX = "epc_opendatacommunities" + +REC_COLS = { + "IMPROVEMENT_ITEM", + "IMPROVEMENT_SUMMARY_TEXT", + "IMPROVEMENT_DESCR_TEXT", + "IMPROVEMENT_ID", + "IMPROVEMENT_ID_TEXT", + "INDICATIVE_COST", +} + +# This script assumes you have downloaded the zip, unzipped it, and are running it locally + + +def sanitise(pc: pd.Series) -> pd.Series: + return pc.astype("string").str.upper().str.replace(" ", "", regex=False) + + +def shard_la(la_dir: Path) -> None: + certs = pd.read_csv(la_dir / "certificates.csv",
low_memory=False) + recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) + merged = certs.merge(recs, on="LMK_KEY", how="left") + + merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) + before = len(merged) + merged = merged.dropna(subset=["POSTCODE_CLEAN"]) + merged = merged[merged["POSTCODE_CLEAN"] != ""] + dropped = before - len(merged) + if dropped: + logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") + + for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + out = TMP_ROOT / f"{pc}.csv" + group.drop(columns=["POSTCODE_CLEAN"]).to_csv( + out, mode="a", header=not out.exists(), index=False + ) + + +def list_existing_keys(s3: Any) -> set[str]: + existing: set[str] = set() + paginator = s3.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=S3_BUCKET, Prefix=f"{S3_PREFIX}/") + for page in tqdm(pages, desc="list s3"): + for obj in page.get("Contents", []): + existing.add(obj["Key"]) + logger.info(f"Found {len(existing)} existing objects under {S3_PREFIX}/") + return existing + + +def upload_postcode(path: Path, s3: Any) -> None: + df = pd.read_csv(path, low_memory=False).drop_duplicates() + + cert_cols = [c for c in df.columns if c not in REC_COLS] + cert_only = df[cert_cols].drop_duplicates() + dupes = cert_only["LMK_KEY"].value_counts() + bad = dupes[dupes > 1] + if not bad.empty: + raise ValueError( + f"Postcode {path.stem}: LMK_KEY appears with conflicting cert data: " + f"{bad.index.tolist()[:5]}" + ) + + buf = BytesIO() + df.to_csv(buf, index=False, compression="gzip") + s3.put_object( + Bucket=S3_BUCKET, + Key=f"{S3_PREFIX}/{path.stem}/data.csv.gz", + Body=buf.getvalue(), + ContentType="text/csv", + ContentEncoding="gzip", + ) + + +def main(): + TMP_ROOT.mkdir(parents=True, exist_ok=True) + la_dirs = sorted( + p for p in SRC_ROOT.iterdir() if p.is_dir() and p.name.startswith("domestic-") + ) + logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") + + # for la in tqdm(la_dirs, 
desc="shard"): + # shard_la(la) + + s3 = boto3.client( + "s3", + config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + ) + pc_files = sorted(TMP_ROOT.glob("*.csv")) + logger.info(f"Found {len(pc_files)} local shards") + + existing = list_existing_keys(s3) + todo = [p for p in pc_files if f"{S3_PREFIX}/{p.stem}/data.csv.gz" not in existing] + skipped = len(pc_files) - len(todo) + logger.info( + f"Uploading {len(todo)} shards (skipping {skipped} already in S3) -> " + f"s3://{S3_BUCKET}/{S3_PREFIX}/" + ) + + workers = 256 + todo_iter = iter(todo) + inflight: dict[Any, Path] = {} + pbar = tqdm(total=len(todo), desc="upload") + with ThreadPoolExecutor(max_workers=workers) as pool: + for _ in range(workers * 2): + pc = next(todo_iter, None) + if pc is None: + break + inflight[pool.submit(upload_postcode, pc, s3)] = pc + + while inflight: + done, _ = wait(inflight.keys(), return_when=FIRST_COMPLETED) + for fut in done: + pc = inflight.pop(fut) + try: + fut.result() + except Exception as e: + logger.error(f"{pc.name}: {e}") + raise + pbar.update(1) + nxt = next(todo_iter, None) + if nxt is not None: + inflight[pool.submit(upload_postcode, nxt, s3)] = nxt + pbar.close() + + +if __name__ == "__main__": + main() From 4f45eeb3e9bf854b5f246916d04b99d1eb45020b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 15:55:40 +0000 Subject: [PATCH 020/106] save --- asset_list/AssetList.py | 323 ++++++++++++++++++------------------ asset_list/app.py | 30 +++- backend/app/config.py | 1 + backend/app/local/router.py | 17 +- backend/app/main.py | 16 +- recommendations/Costs.py | 229 +++++++++++++++---------- 6 files changed, 345 insertions(+), 271 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index dede3162..573c4f7c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -31,17 +31,19 @@ from recommendations.recommendation_utils import ( from etl.epc_clean.epc_attributes.RoofAttributes import 
RoofAttributes from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes -from dotenv import load_dotenv +# from dotenv import load_dotenv logger = setup_logger() -load_dotenv(dotenv_path="../backend/.env") +# load_dotenv(dotenv_path="../backend/.env") # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") class DataRemapper: - def __init__(self, standard_values, standard_map=None, max_tokens=1000): + def __init__( + self, standard_values, standard_map=None, max_tokens=1000, api_key=None + ): """ Initialize the remapper with standard values and a predefined mapping. @@ -75,7 +77,8 @@ class DataRemapper: "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, } - self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + print(f"DATA REMAPPER api key is {api_key}") + self.openai_client = OpenAI(api_key=api_key) @staticmethod def clean_string(text): @@ -136,12 +139,20 @@ class DataRemapper: raise ValueError("Input tokens exceed the maximum limit.") logger.info("Calling OpenAI API for standardization...") - response = self.openai_client.chat.completions.create( - model=self.ai_model, - messages=[{"role": "user", "content": prompt}], - max_tokens=self.max_tokens, - temperature=0.1, - ) + + try: + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + except Exception as e: + print(f"[debug] OpenAI call failed. 
type={type(e).__name__}") + print(f"[debug] status={getattr(e, 'status_code', None)}") + print(f"[debug] body={getattr(e, 'response', None) and e.response.text}") + print(f"[debug] model={self.ai_model}") + raise output_text = response.choices[0].message.content.strip() output_tokens = self.count_tokens(output_text) # Count output tokens @@ -504,6 +515,7 @@ class AssetList: landlord_block_reference=None, phase=False, header=0, + openai_api_key=None, ): self.local_filepath = local_filepath self.sheet_name = sheet_name @@ -529,6 +541,7 @@ class AssetList: self.ecosurv = None self.ecosurv_no_match = pd.DataFrame() self.geographical_areas = pd.DataFrame() + self.openai_api_key = openai_api_key # When this is True, we intend to break the programme into multiple phases. We may need to review # how this is structured in the future, as depending on how we get future data, we may need to @@ -1107,6 +1120,7 @@ class AssetList: remapper = DataRemapper( standard_values=config["standard_values"], standard_map=config["standard_map"], + api_key=self.openai_api_key, ) remap_dictionary = remapper.standardize_list( values_to_remap=values_to_remap.tolist() @@ -1296,8 +1310,8 @@ class AssetList: self.standardised_asset_list[ self.ATTRIBUTE_HAS_SOLAR ] = self.standardised_asset_list[ - self.FIND_EPC_DATA_NAMES["Solar photovoltaics"] - ] | ~self.standardised_asset_list[ + self.FIND_EPC_DATA_NAMES["Solar photovoltaics"] + ] | ~self.standardised_asset_list[ self.EPC_API_DATA_NAMES["photo-supply"] ].isin( ["0.0", 0, None, "", np.nan] @@ -1315,7 +1329,7 @@ class AssetList: property_type=( str(x[self.STANDARD_PROPERTY_TYPE]).title() if str(x[self.STANDARD_PROPERTY_TYPE]).title() - in accepted_epc_property_types + in accepted_epc_property_types else ( x[self.EPC_API_DATA_NAMES["property-type"]] if not pd.isnull( @@ -1373,9 +1387,9 @@ class AssetList: self.standardised_asset_list.apply( lambda x: estimate_perimeter( floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] - / 
x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] - / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], ), axis=1, ) @@ -1460,7 +1474,7 @@ class AssetList: year_lower_bound = ( 2007 if x[self.EPC_API_DATA_NAMES["construction-age-band"]] - == "England and Wales: 2007 onwards" + == "England and Wales: 2007 onwards" else 2012 ) @@ -1515,7 +1529,7 @@ class AssetList: age_band_matches = ( "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] - == int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) + == int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) else "EPC Age Band is different from Year Built" ) @@ -1545,7 +1559,7 @@ class AssetList: age_band_matches = ( "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) - and (x[self.STANDARD_YEAR_BUILT] <= float(upper_date)) + and (x[self.STANDARD_YEAR_BUILT] <= float(upper_date)) else ( "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) @@ -1717,22 +1731,22 @@ class AssetList: if self.non_intrusives_present: if self.new_format_non_insturives_present_v2: non_intrusives_wall_filter = ( - self.standardised_asset_list["non-intrusives: Construction"] - == "CAVITY" - ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( + self.standardised_asset_list["non-intrusives: Construction"] + == "CAVITY" + ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EMPTY", "PARTIAL", "EMPTY CAVITY"] ) else: non_intrusives_wall_filter = ( - self.standardised_asset_list["non-intrusives: Construction"] - == "CAVITY" - ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( + self.standardised_asset_list["non-intrusives: Construction"] + == "CAVITY" + ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EMPTY", "PARTIAL"] ) elif self.old_format_non_intrusives_present: 
non_intrusives_wall_filter = self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().isin( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().isin( [ "empty cavity", "partial fill", @@ -1742,18 +1756,18 @@ class AssetList: "empty cav", ] ) | ( - ( - self.standardised_asset_list["non-intrusives: WFT Findings"] - .str.lower() - .str.strip() - .str.contains("empty cavity|partial fill") - & ~self.standardised_asset_list["non-intrusives: WFT Findings"] - .astype(str) - .str.lower() - .str.strip() - .str.contains("major access issues") - ) - ) + ( + self.standardised_asset_list["non-intrusives: WFT Findings"] + .str.lower() + .str.strip() + .str.contains("empty cavity|partial fill") + & ~self.standardised_asset_list["non-intrusives: WFT Findings"] + .astype(str) + .str.lower() + .str.strip() + .str.contains("major access issues") + ) + ) else: # We set the filter to False, as we have no non-intrusives non_intrusives_wall_filter = False @@ -1765,12 +1779,12 @@ class AssetList: ) else: year_built_filter = ( - self.standardised_asset_list[self.STANDARD_YEAR_BUILT] - <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) | ( - self.standardised_asset_list["epc_year_upper_bound"] - <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) + self.standardised_asset_list[self.STANDARD_YEAR_BUILT] + <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) | ( + self.standardised_asset_list["epc_year_upper_bound"] + <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) # Criteria: # The property isn't a bedsit @@ -1811,8 +1825,8 @@ class AssetList: ] = ( ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity_has_solar" - ] + "non_intrusive_indicates_empty_cavity_has_solar" + ] & ( ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin( ["bedsit"] @@ -1888,8 +1902,8 @@ class AssetList: .str.lower() .isin(self.EPC_NO_WALL_INSULATION_DESCRIPTIONS) | 
self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( - ["uninsulated cavity"] - ) + ["uninsulated cavity"] + ) ) ###################################################### @@ -1926,8 +1940,8 @@ class AssetList: extraction_wall_filter = ( extraction_wall_filter & ~self.standardised_asset_list[ - "non-intrusives: Eligibility (Red/Yellow/Green)" - ].isin(["RED"]) + "non-intrusives: Eligibility (Red/Yellow/Green)" + ].isin(["RED"]) ) self.standardised_asset_list[ @@ -2023,26 +2037,26 @@ class AssetList: self.standardised_asset_list[ "solar_epc_data_indicates_correct_heating_system" ] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .str.contains( - "air source heat pump|ground source heat pump|boiler and radiators, electric" - ) - ) | ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .str.contains("electric storage heaters") - & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheatcont-description"] - ] - == "Controls for high heat retention storage heaters" - ) + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains( + "air source heat pump|ground source heat pump|boiler and radiators, electric" ) + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains("electric storage heaters") + & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheatcont-description"] + ] + == "Controls for high heat retention storage heaters" + ) + ) # If the landlord has given us the heating system, we default to that on heating upgrades. 
Because of the # poor heating in place, if the EPC indicates that this property had a low efficiency heating system but the @@ -2050,25 +2064,25 @@ class AssetList: self.standardised_asset_list[ "solar_epc_data_indicates_requires_heating_upgrade" ] = ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains("electric storage heaters|room heaters") + & ( self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] + self.EPC_API_DATA_NAMES["mainheatcont-description"] ] - .str.lower() - .str.contains("electric storage heaters|room heaters") - & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheatcont-description"] - ] - != "Controls for high heat retention storage heaters" - ) - ) & ( - ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["district heating", "communal heating", "communal gas boiler"] - ) - & ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] - .astype(str) - .str.contains("gas ") + != "Controls for high heat retention storage heaters" ) + ) & ( + ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["district heating", "communal heating", "communal gas boiler"] + ) + & ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] + .astype(str) + .str.contains("gas ") + ) # Basic check - both of the previous two shouldn't be true simultaneously if ( @@ -2148,8 +2162,8 @@ class AssetList: self.standardised_asset_list[ "solar_non_intrusives_walls_insulated" ] = self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().isin( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().isin( [ "retro drilled", "retro filled", @@ -2158,8 +2172,8 @@ class AssetList: "retro drilled and filled", ] ) | self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().str.contains( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().str.contains( "retro drilled" 
) else: @@ -2176,19 +2190,14 @@ class AssetList: ) self.standardised_asset_list["solar_epc_walls_insulated"] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES[ - "walls-description"]] - .str.lower() - .str.contains("|".join( - self.EPC_INSULATED_WALLS_SUBSTRINGS)) - ) | ( - self.standardised_asset_list[ - "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull( - x) else False - ) - ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]] + .str.lower() + .str.contains("|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)) + ) | ( + self.standardised_asset_list["walls_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) roof_data = [] for desc in self.standardised_asset_list[ @@ -2230,20 +2239,20 @@ class AssetList: self.standardised_asset_list[ "solar_epc_loft_needs_topup" ] = self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS - ].apply( + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS + ].apply( lambda x: int(x) < 200 if str(x).isdigit() else False ) | ( - ( - self.standardised_asset_list["is_loft"] - | self.standardised_asset_list["is_pitched"] - ) - & ( - self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS - ].isin(["below average", "none"]) - ) + ( + self.standardised_asset_list["is_loft"] + | self.standardised_asset_list["is_pitched"] ) + & ( + self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS + ].isin(["below average", "none"]) + ) + ) self.standardised_asset_list["epc_has_floor_recommendation"] = ( self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) @@ -2252,16 +2261,15 @@ class AssetList: # Check if the boiler is electric # We check if it contains both the terms boiler & electric self.standardised_asset_list["has_electric_boiler"] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .isin(["boiler and radiators, electric"]) - ) | ( - 
self.standardised_asset_list[ - self.STANDARD_HEATING_SYSTEM] - == "electric boiler" - ) + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .isin(["boiler and radiators, electric"]) + ) | ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] + == "electric boiler" + ) #################################### # Check solar eligibility @@ -2399,11 +2407,11 @@ class AssetList: empty_cavity_map = { "non_intrusive_indicates_empty_cavity": self.EMPTY_CAVITY_NON_INTRUSIVE - + ": ", + + ": ", "non_intrusive_indicates_empty_cavity_has_solar": f"{self.EMPTY_CAVITY_NON_INTRUSIVE} - property " - "already has solar: ", + "already has solar: ", "non_intrusive_indicates_empty_cavity_no_year_filter": f"{self.EMPTY_CAVITY_NON_INTRUSIVE}, " - f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", + f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", } for variable, description in empty_cavity_map.items(): self.standardised_asset_list["cavity_reason"] = np.where( @@ -2419,8 +2427,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: WFT Findings"] .str.lower() @@ -2445,8 +2453,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & self.standardised_asset_list[ "non_intrusive_indicates_cavity_extraction" ] @@ -2461,8 +2469,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: Insulated"] == "RETRO DRILLED" @@ -2478,8 +2486,8 @@ class AssetList: ( 
self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: Insulated"] == "FILLED AT BUILD" @@ -2495,8 +2503,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), f"{self.EPC_EMPTY}: " + self.standardised_asset_list["SAP Category"], @@ -2640,7 +2648,7 @@ class AssetList: identified_work = self.standardised_asset_list[ ~pd.isnull(self.standardised_asset_list["cavity_reason"]) | ~pd.isnull(self.standardised_asset_list["solar_reason"]) - ][self.DOMNA_PROPERTY_ID].values + ][self.DOMNA_PROPERTY_ID].values if self.DOMNA_PROPERTY_ID in self.outcomes.columns: self.outcomes_for_output = self.outcomes[ @@ -2675,12 +2683,12 @@ class AssetList: blocks_of_flats = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ] + ] non_blocks_of_flats = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] + ] # Produce some aggregate figures self.work_type_figures = { @@ -2723,7 +2731,7 @@ class AssetList: blocks = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ].copy() + ].copy() if blocks.empty: return @@ -2860,7 +2868,7 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] + ] self.standardised_asset_list = pd.concat( [self.standardised_asset_list, expanded_blocks], ignore_index=True @@ -2940,7 +2948,7 @@ class AssetList: # find any block refs with more than 50% emptires viable_empty_blocks = self.block_analysis_df[ 
self.block_analysis_df["Percentage of Empties"] >= 0.50 - ] + ] if not viable_empty_blocks.empty: project_code_lookup = viable_empty_blocks[["Block Reference"]].copy() @@ -3179,7 +3187,7 @@ class AssetList: contact_details = pd.read_excel(local_filepath, sheet_name=sheet_name)[ [self.contact_detail_fields["landlord_property_id"]] + details_colnames - ] + ] contact_details = contact_details[ ~pd.isnull( contact_details[self.contact_detail_fields["landlord_property_id"]] @@ -3572,13 +3580,10 @@ class AssetList: "Non-Intrusives: Date Checked ": date_of_inspections, "Non-Intrusives: Wall Type ": non_intrusives_construction, "Non-intrusives: Insulation ": non_intrusives_insulated, - "Non-intrusives: Insulation Material ": - non_intrusives_insulation_material, - "Non-Intrusives: CIGA Check Required ": - non_intrusives_ciga_check_required, + "Non-intrusives: Insulation Material ": non_intrusives_insulation_material, + "Non-Intrusives: CIGA Check Required ": non_intrusives_ciga_check_required, "Non-Intrusives: PV Access Issues ": non_intrusives_pv_access, - "Non-Intrusives: Roof Orientation ": - non_intrusives_roof_orientation, + "Non-Intrusives: Roof Orientation ": non_intrusives_roof_orientation, "Non-Intrusives: Surveyor Notes ": non_intrusives_surveyor_notes, "Non-Intrusives: Surveyor Name ": non_intrusives_surveyor_name, "CIGA: Date Requested ": None, # TODO: Don't have this for the moment @@ -3755,8 +3760,8 @@ class AssetList: # We compare address line 1 to full address if any( df[self.STANDARD_FULL_ADDRESS] - .str.lower() - .str.contains(row["Address Line 1"].lower(), na=False) + .str.lower() + .str.contains(row["Address Line 1"].lower(), na=False) ): df = df[ df[self.STANDARD_FULL_ADDRESS] @@ -3996,7 +4001,7 @@ class AssetList: matched = matched[ matched["houseno"].astype(str) == house_no_to_match - ] + ] if matched.shape[0] == 1: lookup_i.append( { @@ -4021,7 +4026,7 @@ class AssetList: )[0] matched = matched[ matched[self.STANDARD_FULL_ADDRESS] == best_match - ] 
+ ] lookup_i.append( { "row_id": x["row_id"], @@ -4332,7 +4337,7 @@ class AssetList: df = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == row[master_id_colnames[idx]] - ] + ] if df.shape[0] == 1: matched.append( { @@ -4438,7 +4443,7 @@ class AssetList: )[1] ) > 90 - ] + ] if df.shape[0] == 0: unmatched.append(row["row_id"]) @@ -4446,8 +4451,8 @@ class AssetList: if any( df[self.STANDARD_FULL_ADDRESS] - .str.lower() - .str.contains( + .str.lower() + .str.contains( " ".join( [row[house_no_col], row["Street / Block Name"]] ).lower() @@ -4474,7 +4479,7 @@ class AssetList: row[property_type_col].split(" ")[-1].lower() ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats") - ] + ] if df.shape[0] != 1: # We have multiple matches - it's likely because the landlord has a duplicate diff --git a/asset_list/app.py b/asset_list/app.py index 49ec48a0..7413c7cb 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,6 +21,11 @@ EPC_AUTH_TOKEN = os.getenv( OPENAI_API_KEY = os.getenv( "OPENAI_API_KEY", ) +print( + f"[debug] OPENAI_API_KEY loaded: " + f"{OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:] if OPENAI_API_KEY else 'NONE'} " + f"(len={len(OPENAI_API_KEY) if OPENAI_API_KEY else 0})" +) def extract_address1( @@ -74,23 +79,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "2026-04-22T08_22_00.779745_61049fd3.xlsx" - sheet_name = "in" - postcode_column = "postcode_clean" - address1_column = "address2uprn_address" + data_filename = "input.xlsx" + sheet_name = "Handovers" + postcode_column = "POSTCODE" + address1_column = "Full Addres" address1_method = None - fulladdress_column = "address2uprn_address" + fulladdress_column = "Full Addres" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "address2uprn_uprn" - landlord_property_type = "Property Type" # Good to include if landlord gave - landlord_built_form = "Built Form" # Good to 
include if landlord gave + landlord_os_uprn = "domna_found_uprn" + landlord_property_type = "PROPERTY TYPE" # Good to include if landlord gave + landlord_built_form = "Type Description" # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "UPRN" + landlord_property_id = "PROP REF" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -131,6 +136,7 @@ def app(): landlord_sap=landlord_sap, landlord_block_reference=landlord_block_reference, phase=phase, + openai_api_key=OPENAI_API_KEY, ) asset_list.init_standardise() @@ -462,3 +468,9 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) + + + + +for key,value in dict.items(): + lsakjfldsa \ No newline at end of file diff --git a/backend/app/config.py b/backend/app/config.py index 70a6b50c..e72eb693 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -77,6 +77,7 @@ class Settings(BaseSettings): OSMOSIS_ACD_SHAREPOINT_ID: Optional[str] = None PRIVATE_PAY_SHAREPOINT_ID: Optional[str] = None SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID: Optional[str] = None + OPENAI_API_KEY: Optional[str] = None # Pas Hub PASHUB_EMAIL: Optional[str] = None diff --git a/backend/app/local/router.py b/backend/app/local/router.py index 0977be04..ea04dc49 100644 --- a/backend/app/local/router.py +++ b/backend/app/local/router.py @@ -2,8 +2,8 @@ from fastapi import APIRouter, HTTPException, status from jose import jwt, jwe import json import datetime -from app.config import get_settings -from app.dependencies import get_derived_encryption_key +from backend.app.config import get_settings +from backend.app.dependencies import get_derived_encryption_key router = APIRouter( prefix="/local", @@ -27,7 +27,12 @@ def create_dummy_token(secret: str) -> str: "dbId": "known_id", } - token = jwe.encrypt(json.dumps(claims), 
get_derived_encryption_key(secret), algorithm="dir", encryption="A256GCM") + token = jwe.encrypt( + json.dumps(claims), + get_derived_encryption_key(secret), + algorithm="dir", + encryption="A256GCM", + ) return token @@ -40,6 +45,8 @@ async def dummy_token(): async def dummy_token(): settings = get_settings() if settings.ENVIRONMENT != "local": - raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, - detail="Dummy token can only be generated in local environment") + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Dummy token can only be generated in local environment", + ) return {"dummy_token": create_dummy_token(settings.SECRET_KEY)} diff --git a/backend/app/main.py b/backend/app/main.py index c9733c18..55dfef7d 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -30,10 +30,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE logger.error(f"Validation Errors: {exc.errors()}") return JSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content=jsonable_encoder({ - "detail": exc.errors(), - "body": exc.body - }), + content=jsonable_encoder({"detail": exc.errors(), "body": exc.body}), ) @@ -63,7 +60,8 @@ app.include_router(tasks_router.router, prefix="/v1") app.include_router(bulk_uploads_router.router, prefix="/v1") if get_settings().ENVIRONMENT == "local": - from app.local import router as local_router + from backend.app.local import router as local_router + app.include_router(local_router.router) handler = Mangum(app) @@ -98,10 +96,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE logger.error(f"Validation Errors: {exc.errors()}") return JSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content=jsonable_encoder({ - "detail": exc.errors(), - "body": exc.body - }), + content=jsonable_encoder({"detail": exc.errors(), "body": exc.body}), ) @@ -130,7 +125,8 @@ app.include_router(whlg_router.router, prefix="/v1") 
app.include_router(bulk_uploads_router.router, prefix="/v1") if get_settings().ENVIRONMENT == "local": - from app.local import router as local_router + from backend.app.local import router as local_router + app.include_router(local_router.router) handler = Mangum(app) diff --git a/recommendations/Costs.py b/recommendations/Costs.py index bd8f160a..fc72d4d8 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -21,28 +21,28 @@ regional_labour_variations = [ {"Region": "Yorkshire and the Humber", "Adjustment_Factor": 0.86}, {"Region": "Wales", "Adjustment_Factor": 0.88}, {"Region": "Scotland", "Adjustment_Factor": 0.88}, - {"Region": "Northern Ireland", "Adjustment_Factor": 0.76} + {"Region": "Northern Ireland", "Adjustment_Factor": 0.76}, ] # Installers are now working with 435 watt panels PANEL_SIZE = 0.435 INSTALLER_SOLAR_COSTS = [ - {'n_panels': 4, 'array_kwp': 4 * PANEL_SIZE, 'cost': 4089.25, 'installer': 'CEG'}, - {'n_panels': 5, 'array_kwp': 5 * PANEL_SIZE, 'cost': 4242.48, 'installer': 'CEG'}, - {'n_panels': 6, 'array_kwp': 6 * PANEL_SIZE, 'cost': 4395.71, 'installer': 'CEG'}, - {'n_panels': 7, 'array_kwp': 7 * PANEL_SIZE, 'cost': 4548.94, 'installer': 'CEG'}, - {'n_panels': 8, 'array_kwp': 8 * PANEL_SIZE, 'cost': 4702.17, 'installer': 'CEG'}, - {'n_panels': 9, 'array_kwp': 9 * PANEL_SIZE, 'cost': 4855.41, 'installer': 'CEG'}, - {'n_panels': 10, 'array_kwp': 10 * PANEL_SIZE, 'cost': 5010.95, 'installer': 'CEG'}, - {'n_panels': 11, 'array_kwp': 11 * PANEL_SIZE, 'cost': 5166.49, 'installer': 'CEG'}, - {'n_panels': 12, 'array_kwp': 12 * PANEL_SIZE, 'cost': 5322.04, 'installer': 'CEG'}, - {'n_panels': 13, 'array_kwp': 13 * PANEL_SIZE, 'cost': 5657.6, 'installer': 'CEG'}, - {'n_panels': 14, 'array_kwp': 14 * PANEL_SIZE, 'cost': 5993.16, 'installer': 'CEG'}, - {'n_panels': 15, 'array_kwp': 15 * PANEL_SIZE, 'cost': 6328.71, 'installer': 'CEG'}, - {'n_panels': 16, 'array_kwp': 16 * PANEL_SIZE, 'cost': 6483.33, 'installer': 'CEG'}, - {'n_panels': 17, 
'array_kwp': 17 * PANEL_SIZE, 'cost': 6637.95, 'installer': 'CEG'}, - {'n_panels': 18, 'array_kwp': 18 * PANEL_SIZE, 'cost': 6792.57, 'installer': 'CEG'} + {"n_panels": 4, "array_kwp": 4 * PANEL_SIZE, "cost": 4089.25, "installer": "CEG"}, + {"n_panels": 5, "array_kwp": 5 * PANEL_SIZE, "cost": 4242.48, "installer": "CEG"}, + {"n_panels": 6, "array_kwp": 6 * PANEL_SIZE, "cost": 4395.71, "installer": "CEG"}, + {"n_panels": 7, "array_kwp": 7 * PANEL_SIZE, "cost": 4548.94, "installer": "CEG"}, + {"n_panels": 8, "array_kwp": 8 * PANEL_SIZE, "cost": 4702.17, "installer": "CEG"}, + {"n_panels": 9, "array_kwp": 9 * PANEL_SIZE, "cost": 4855.41, "installer": "CEG"}, + {"n_panels": 10, "array_kwp": 10 * PANEL_SIZE, "cost": 5010.95, "installer": "CEG"}, + {"n_panels": 11, "array_kwp": 11 * PANEL_SIZE, "cost": 5166.49, "installer": "CEG"}, + {"n_panels": 12, "array_kwp": 12 * PANEL_SIZE, "cost": 5322.04, "installer": "CEG"}, + {"n_panels": 13, "array_kwp": 13 * PANEL_SIZE, "cost": 5657.6, "installer": "CEG"}, + {"n_panels": 14, "array_kwp": 14 * PANEL_SIZE, "cost": 5993.16, "installer": "CEG"}, + {"n_panels": 15, "array_kwp": 15 * PANEL_SIZE, "cost": 6328.71, "installer": "CEG"}, + {"n_panels": 16, "array_kwp": 16 * PANEL_SIZE, "cost": 6483.33, "installer": "CEG"}, + {"n_panels": 17, "array_kwp": 17 * PANEL_SIZE, "cost": 6637.95, "installer": "CEG"}, + {"n_panels": 18, "array_kwp": 18 * PANEL_SIZE, "cost": 6792.57, "installer": "CEG"}, ] # These are costs we received from CRG, for pricing up air source heat pumps @@ -80,7 +80,12 @@ INSTALLER_SOLAR_PV_INVERTER_COST = 7500 INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST = 500 # Just a rough guess to labour costs INSTALLER_SOLAR_BATTERY_COSTS = [ - {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 3769.89, 'installer': 'JJC'}, + { + "capacity_kwh": 5, + "description": "Battery Add on", + "cost": 3769.89, + "installer": "JJC", + }, # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, # 
{'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} @@ -102,10 +107,14 @@ TTZC_SMART_THERMOSTAT_LABOUR_HOURS = 2 TTZC_ELECTRICIAN_HOURLY_RATE = 45 # Based on cost of a Nest temperature sensor TTZC_ROOM_TEMPERATURE_SENSOR_COST = 50 -TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS = 0.17 # (Assume ~ 10 mins install per sensor) +TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS = ( + 0.17 # (Assume ~ 10 mins install per sensor) +) # Basedon an average cost of smart radiator values TTZC_SMART_RADIATOR_VALUES = 50 -TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS = 0.37 # (Assume ~ 15-30 mins install per valve) +TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS = ( + 0.37 # (Assume ~ 15-30 mins install per valve) +) # boiler prices based on # This is the cost of a firs time central heating install from The Warm Front rate card @@ -169,7 +178,7 @@ class Costs: "heater_removal": 0.1, "sealing_open_fireplace": 0.1, "mechanical_ventilation": 0.26, - "sloping_ceiling_insulation": 0.26 # Similar to IWI so using the same contingency + "sloping_ceiling_insulation": 0.26, # Similar to IWI so using the same contingency } # Preliminaries are a percentage of the total cost of the work and covers the cost of site-specific costs @@ -195,36 +204,46 @@ class Costs: :param property_instance: Instance of a Property class containing relevant details like wall area. 
""" - if not hasattr(property_instance, 'insulation_wall_area'): - raise ValueError("Property instance must have an 'insulation_wall_area' attribute") + if not hasattr(property_instance, "insulation_wall_area"): + raise ValueError( + "Property instance must have an 'insulation_wall_area' attribute" + ) self.property = property_instance self.regional_labour_variations = regional_labour_variations self.region = county_to_region_map.get(self.property.epc_record.county, None) if self.region is None: # Try and grab using the local-authority-label - self.region = county_to_region_map.get(self.property.epc_record.local_authority_label, None) + self.region = county_to_region_map.get( + self.property.epc_record.local_authority_label, None + ) if self.region is None: # Try and get the region after converting the keys to lower - self.region = { - k.lower(): v for k, v in county_to_region_map.items() - }.get(self.property.epc_record.local_authority_label.lower(), None) + if self.property.epc_record.local_authority_label is not None: + self.region = { + k.lower(): v for k, v in county_to_region_map.items() + }.get(self.property.epc_record.local_authority_label.lower(), None) if self.region is None: - logger.warning("No region found for county %s, defaulting to South East England", - self.property.epc_record.county) + logger.warning( + "No region found for county %s, defaulting to South East England", + self.property.epc_record.county, + ) self.region = "South East England" self.labour_adjustment_factor = [ - x["Adjustment_Factor"] for x in self.regional_labour_variations if - x["Region"] == self.region + x["Adjustment_Factor"] + for x in self.regional_labour_variations + if x["Region"] == self.region ][0] if not self.labour_adjustment_factor: raise ValueError("Labour adjustment factor not found") - def cavity_wall_insulation(self, wall_area, material, is_extraction_and_refill=False): + def cavity_wall_insulation( + self, wall_area, material, is_extraction_and_refill=False + ): 
""" Calculates the total cost for cavity wall insulation based on material and labor costs, including contingency, preliminaries, profit, and VAT. @@ -318,7 +337,8 @@ class Costs: return { "total": total_cost, - "contingency": self.CONTINGENCIES["suspended_floor_insulation"] * total_cost, + "contingency": self.CONTINGENCIES["suspended_floor_insulation"] + * total_cost, "contingency_rate": self.CONTINGENCIES["suspended_floor_insulation"], "labour_hours": labour_hours, "labour_days": labour_days, @@ -370,8 +390,7 @@ class Costs: # - Apply sub-linear scaling for realism # - Enforce a minimum duration so estimates are not unrealistically low labour_days = max( - min_days, - base_days * (insulation_floor_area / base_area) ** labour_exponent + min_days, base_days * (insulation_floor_area / base_area) ** labour_exponent ) return labour_days @@ -388,7 +407,9 @@ class Costs: total_cost = material["total_cost"] * insulation_floor_area daily_labour_rate = 300 # Based on checkatrade - labour_days = self._estimate_number_of_days_for_solid_floor(insulation_floor_area) + labour_days = self._estimate_number_of_days_for_solid_floor( + insulation_floor_area + ) labour_cost = labour_days * daily_labour_rate total_cost = total_cost + labour_cost @@ -404,7 +425,6 @@ class Costs: } def low_energy_lighting(self, number_of_lights, material): - """ Calculates the total cost for low energy lighting based on material and labor costs, including contingency, preliminaries, profit, and VAT. 
@@ -419,7 +439,7 @@ class Costs: total_cost = material["total_cost"] * number_of_lights labour_hours = 1 - labour_days = (labour_hours / 8) + labour_days = labour_hours / 8 return { "total": total_cost, @@ -450,26 +470,22 @@ class Costs: } @classmethod - def solar_pv( - cls, - solar_product, - scaffolding_options, - n_floors - ): - - """ - - """ + def solar_pv(cls, solar_product, scaffolding_options, n_floors): + """ """ system_cost = solar_product["total_cost"] if not solar_product["includes_scaffolding"]: # We base this on the number of floors - scaffolding = [x["total_cost"] for x in scaffolding_options if x["size"] == n_floors] + scaffolding = [ + x["total_cost"] for x in scaffolding_options if x["size"] == n_floors + ] if not scaffolding: # If we have no options, handle this if n_floors <= 3: - raise ValueError("No scaffolding options available for 3 or fewer floors") + raise ValueError( + "No scaffolding options available for 3 or fewer floors" + ) # We take the largest scaffolding option available scaffolding_cost = max([x["total_cost"] for x in scaffolding_options]) else: @@ -523,9 +539,9 @@ class Costs: We base the estimates for the cost of electric room heaters on the cost per room as estimated by the following article: https://www.bestelectricradiators.co.uk/blog/cost-to-install-a-new-heating-system-uk/ - + :param number_heated_rooms: int, number of rooms to be heated - :return: + :return: """ total_cost = 500 * number_heated_rooms @@ -547,11 +563,11 @@ class Costs: } def high_heat_electric_storage_heaters( - self, number_heated_rooms: int, + self, + number_heated_rooms: int, needs_cylinder: bool, - product: dict | None = None + product: dict | None = None, ): - """ We base the estimates for the cost of electric storage heaters on the cost per room as estimated by the energy saving trust @@ -578,8 +594,11 @@ class Costs: return { "total": total_cost, - "contingency": total_cost * self.CONTINGENCIES["high_heat_retention_storage_heaters"], - 
"contingency_rate": self.CONTINGENCIES["high_heat_retention_storage_heaters"], + "contingency": total_cost + * self.CONTINGENCIES["high_heat_retention_storage_heaters"], + "contingency_rate": self.CONTINGENCIES[ + "high_heat_retention_storage_heaters" + ], "subtotal": subtotal_before_vat, "vat": vat, "labour_hours": labour_hours, @@ -690,14 +709,14 @@ class Costs: # The product costs are inclusive of VAT product_costs = ( - TTZC_SMART_THERMOSTAT_COST + - TTZC_ROOM_TEMPERATURE_SENSOR_COST * number_heated_rooms + - TTZC_SMART_RADIATOR_VALUES * number_heated_rooms + TTZC_SMART_THERMOSTAT_COST + + TTZC_ROOM_TEMPERATURE_SENSOR_COST * number_heated_rooms + + TTZC_SMART_RADIATOR_VALUES * number_heated_rooms ) labour_hours = ( - TTZC_SMART_THERMOSTAT_LABOUR_HOURS + - TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS * number_heated_rooms + - TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS * number_heated_rooms + TTZC_SMART_THERMOSTAT_LABOUR_HOURS + + TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS * number_heated_rooms + + TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS * number_heated_rooms ) labour_costs = TTZC_ELECTRICIAN_HOURLY_RATE * labour_hours # Add continency and preliminaries to the labour to account for the complexity of the job @@ -722,7 +741,9 @@ class Costs: "labour_days": labour_days, } - def programmer_trvs_bypass(self, number_heated_rooms, has_programmer, has_trvs, has_bypass): + def programmer_trvs_bypass( + self, number_heated_rooms, has_programmer, has_trvs, has_bypass + ): total_cost = 0 labour_hours = 0 @@ -779,7 +800,9 @@ class Costs: } @staticmethod - def _estimate_n_radiators(number_habitable_rooms, total_floor_area, property_type, built_form): + def _estimate_n_radiators( + number_habitable_rooms, total_floor_area, property_type, built_form + ): # Base number of radiators: one per habitable room base_radiators = number_habitable_rooms @@ -787,34 +810,49 @@ class Costs: additional_radiators = 3 # Initial assumption # Adjust additional radiators based on property type - if 
property_type == 'Flat': - additional_radiators -= 1 # Flats may need fewer radiators due to less exposure - elif property_type in ['House', 'Bungalow', 'Maisonette']: + if property_type == "Flat": + additional_radiators -= ( + 1 # Flats may need fewer radiators due to less exposure + ) + elif property_type in ["House", "Bungalow", "Maisonette"]: # Multiple floors in Maisonette may require additional heating points - additional_radiators += 2 # Houses and bungalows might need more due to greater exposure + additional_radiators += ( + 2 # Houses and bungalows might need more due to greater exposure + ) else: raise Exception("Invalid property type") # Adjust total radiator needs based on built form form_factor = { - 'Enclosed Mid-Terrace': 0.9, - 'Mid-Terrace': 0.95, - 'Enclosed End-Terrace': 0.95, - 'Semi-Detached': 1.05, - 'Detached': 1.25, - 'End-Terrace': 1.05 + "Enclosed Mid-Terrace": 0.9, + "Mid-Terrace": 0.95, + "Enclosed End-Terrace": 0.95, + "Semi-Detached": 1.05, + "Detached": 1.25, + "End-Terrace": 1.05, } # Calculate total heating power needed and number of radiators based on standard output total_heating_power_required = total_floor_area * 80 # Watts per square meter radiator_output = 1000 # Average wattage per radiator - total_radiators_based_on_power = (total_heating_power_required / radiator_output) * form_factor[built_form] + total_radiators_based_on_power = ( + total_heating_power_required / radiator_output + ) * form_factor[built_form] # Final estimation taking the higher of calculated needs or base room count - estimated_radiators = max(total_radiators_based_on_power, base_radiators + additional_radiators) + estimated_radiators = max( + total_radiators_based_on_power, base_radiators + additional_radiators + ) return round(estimated_radiators) - def boiler(self, exising_room_heaters, system_change, n_heated_rooms, n_rooms, is_electric=False): + def boiler( + self, + exising_room_heaters, + system_change, + n_heated_rooms, + n_rooms, + 
is_electric=False, + ): """ Based on a basic estimate of median value £2600 to install a low carbon combi boiler First time central heating vosts can als be found here: @@ -859,12 +897,14 @@ class Costs: number_habitable_rooms=n_rooms, total_floor_area=self.property.floor_area, property_type=self.property.epc_record.property_type, - built_form=self.property.epc_record.built_form + built_form=self.property.epc_record.built_form, ) additionals_labour_cost = labour_rate * self.labour_adjustment_factor radiator_cost = DOUBLE_RADIATOR_COST * n_radiators - system_change_cost = radiator_cost + FLUE_COST + PIPEWORK_COST + additionals_labour_cost + system_change_cost = ( + radiator_cost + FLUE_COST + PIPEWORK_COST + additionals_labour_cost + ) system_change_cost_before_vat = system_change_cost / (1 + self.VAT_RATE) system_change_vat = system_change_cost - system_change_cost_before_vat # We add an extra labour day for the system change @@ -897,14 +937,18 @@ class Costs: else: return 250 - def air_source_heat_pump(self, ashp_size: float, number_heated_rooms: int, total_floor_area: float) -> dict: + def air_source_heat_pump( + self, ashp_size: float, number_heated_rooms: int, total_floor_area: float + ) -> dict: """ We produce a cost estimation for an air source heat pump, based on costs we have received from installers. """ system_cost = ( - (ASHP_SMALL_SYSTEM_COST if ashp_size <= 8.5 else ASHP_LARGE_SYSTEM_COST) + ASHP_SECURITY + ASHP_WALL_BRACKET + (ASHP_SMALL_SYSTEM_COST if ashp_size <= 8.5 else ASHP_LARGE_SYSTEM_COST) + + ASHP_SECURITY + + ASHP_WALL_BRACKET ) available_n_rads = [x["n_radiators"] for x in ASHP_DISTRIBUTION_SYSTEM_COSTS] @@ -940,7 +984,9 @@ class Costs: } @staticmethod - def _estimate_number_of_days_for_sloping_ceiling(insulation_roof_area: float) -> float: + def _estimate_number_of_days_for_sloping_ceiling( + insulation_roof_area: float, + ) -> float: """ Estimate labour days required to insulate an existing sloping ceiling. 
@@ -965,14 +1011,15 @@ class Costs: min_days = 2 labour_days = max( - min_days, - base_days * (insulation_roof_area / base_area) ** labour_exponent + min_days, base_days * (insulation_roof_area / base_area) ** labour_exponent ) return labour_days @classmethod - def sloping_ceiling_insulation(cls, insulation_roof_area: float) -> Mapping[str, float]: + def sloping_ceiling_insulation( + cls, insulation_roof_area: float + ) -> Mapping[str, float]: """ This costing for this is based on Checkatrade desktop research, since we are yet to receive installer quotes. :param insulation_roof_area: Area of the sloping ceiling to be insulated @@ -985,14 +1032,20 @@ class Costs: # https://www.checkatrade.com/blog/cost-guides/vaulted-ceiling-cost/ # https://www.thegreenage.co.uk/can-i-insulate-my-sloping-ceiling/ # These assumptions last updated 21/02/2026 - insulation_cost_per_m2 = 52 # The actual install process is quite similar to IWI + insulation_cost_per_m2 = ( + 52 # The actual install process is quite similar to IWI + ) labour_rate = 250 # per day contingency_rate = cls.CONTINGENCIES["sloping_ceiling_insulation"] - labour_days = cls._estimate_number_of_days_for_sloping_ceiling(insulation_roof_area) + labour_days = cls._estimate_number_of_days_for_sloping_ceiling( + insulation_roof_area + ) labour_hours = labour_days * 8 - total = (insulation_cost_per_m2 * insulation_roof_area) + (labour_rate * labour_days) + total = (insulation_cost_per_m2 * insulation_roof_area) + ( + labour_rate * labour_days + ) # Assume VAT included in the total => total is 120% of subtotal vat = total - (total / 1.2) From 02fb3afbe41d479742ad4053f296c99e807f22ae Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:04:01 +0000 Subject: [PATCH 021/106] defined histrocial epc data shapre from csv --- datatypes/epc/schema/historic_epc.py | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 datatypes/epc/schema/historic_epc.py diff --git 
a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py new file mode 100644 index 00000000..e158ac1f --- /dev/null +++ b/datatypes/epc/schema/historic_epc.py @@ -0,0 +1,97 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + lmk_key: str + address1: str + address2: str + address3: str + postcode: str + building_reference_number: str + current_energy_rating: str + potential_energy_rating: str + current_energy_efficiency: str + potential_energy_efficiency: str + property_type: str + built_form: str + inspection_date: str + local_authority: str + constituency: str + county: str + lodgement_date: str + transaction_type: str + environment_impact_current: str + environment_impact_potential: str + energy_consumption_current: str + energy_consumption_potential: str + co2_emissions_current: str + co2_emiss_curr_per_floor_area: str + co2_emissions_potential: str + lighting_cost_current: str + lighting_cost_potential: str + heating_cost_current: str + heating_cost_potential: str + hot_water_cost_current: str + hot_water_cost_potential: str + total_floor_area: str + energy_tariff: str + mains_gas_flag: str + floor_level: str + flat_top_storey: str + flat_storey_count: str + main_heating_controls: str + multi_glaze_proportion: str + glazed_type: str + glazed_area: str + extension_count: str + number_habitable_rooms: str + number_heated_rooms: str + low_energy_lighting: str + number_open_fireplaces: str + hotwater_description: str + hot_water_energy_eff: str + hot_water_env_eff: str + floor_description: str + floor_energy_eff: str + floor_env_eff: str + windows_description: str + windows_energy_eff: str + windows_env_eff: str + walls_description: str + walls_energy_eff: str + walls_env_eff: str + secondheat_description: str + sheating_energy_eff: str + sheating_env_eff: str + roof_description: str + roof_energy_eff: str + roof_env_eff: str + mainheat_description: str + mainheat_energy_eff: str + mainheat_env_eff: str + 
mainheatcont_description: str + mainheatc_energy_eff: str + mainheatc_env_eff: str + lighting_description: str + lighting_energy_eff: str + lighting_env_eff: str + main_fuel: str + wind_turbine_count: str + heat_loss_corridor: str + unheated_corridor_length: str + floor_height: str + photo_supply: str + solar_water_heating_flag: str + mechanical_ventilation: str + address: str + local_authority_label: str + constituency_label: str + posttown: str + construction_age_band: str + lodgement_datetime: str + tenure: str + fixed_lighting_outlets_count: str + low_energy_fixed_light_count: str + uprn: str + uprn_source: str From 90a4d83243604a4830dc0f6a7752a67f928c571f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:04:27 +0000 Subject: [PATCH 022/106] added init files to make it a python module --- backend/etl/__init__.py | 0 backend/etl/etl_opendatacommunities/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 backend/etl/__init__.py create mode 100644 backend/etl/etl_opendatacommunities/__init__.py diff --git a/backend/etl/__init__.py b/backend/etl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/etl/etl_opendatacommunities/__init__.py b/backend/etl/etl_opendatacommunities/__init__.py new file mode 100644 index 00000000..e69de29b From 74b7b87de6384419687bdb1bc57468ea3ee4959e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:22:41 +0000 Subject: [PATCH 023/106] =?UTF-8?q?load=20historic=20epc=20from=20csv=20?= =?UTF-8?q?=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datatypes/epc/loaders/__init__.py | 0 datatypes/epc/loaders/historic_epc.py | 5 ++ datatypes/epc/schema/historic_epc.py | 7 +++ .../schema/tests/test_historic_epc_loading.py | 55 +++++++++++++++++++ 4 files changed, 67 insertions(+) create mode 100644 datatypes/epc/loaders/__init__.py create mode 100644 datatypes/epc/loaders/historic_epc.py create mode 
100644 datatypes/epc/schema/tests/test_historic_epc_loading.py diff --git a/datatypes/epc/loaders/__init__.py b/datatypes/epc/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py new file mode 100644 index 00000000..8555a706 --- /dev/null +++ b/datatypes/epc/loaders/historic_epc.py @@ -0,0 +1,5 @@ +from datatypes.epc.schema.historic_epc import HistoricEpc + + +def read_historic_epc_csv(path: str) -> list[HistoricEpc]: + raise NotImplementedError("read_historic_epc_csv not implemented yet") diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index e158ac1f..9ebe4b09 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -95,3 +95,10 @@ class HistoricEpc: low_energy_fixed_light_count: str uprn: str uprn_source: str + report_type: str + improvement_item: str + improvement_summary_text: str + improvement_descr_text: str + improvement_id: str + improvement_id_text: str + indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py new file mode 100644 index 00000000..d5d5ea22 --- /dev/null +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -0,0 +1,55 @@ +import os + +import pytest + +from datatypes.epc.loaders.historic_epc import read_historic_epc_csv +from datatypes.epc.schema.historic_epc import HistoricEpc + +FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") + + +class TestHistoricEpcLoading: + + @pytest.fixture + def epc(self) -> HistoricEpc: + rows = read_historic_epc_csv(os.path.join(FIXTURES, "historic_epc.csv")) + return rows[0] + + def test_returns_historic_epc_instance(self, epc: HistoricEpc) -> None: + assert isinstance(epc, HistoricEpc) + + def test_lmk_key(self, epc: HistoricEpc) -> None: + assert epc.lmk_key == 
"9292c3bf26a8876ce59274401ea73e3de5bd0b3e52a507c2162a46e57db8ea2f" + + def test_address1(self, epc: HistoricEpc) -> None: + assert epc.address1 == "47 GORDON ROAD" + + def test_postcode(self, epc: HistoricEpc) -> None: + assert epc.postcode == "AB33 8AL" + + def test_current_energy_rating(self, epc: HistoricEpc) -> None: + assert epc.current_energy_rating == "E" + + def test_property_type(self, epc: HistoricEpc) -> None: + assert epc.property_type == "House" + + def test_built_form(self, epc: HistoricEpc) -> None: + assert epc.built_form == "Semi-Detached" + + def test_inspection_date(self, epc: HistoricEpc) -> None: + assert epc.inspection_date == "2021-04-11" + + def test_uprn(self, epc: HistoricEpc) -> None: + assert epc.uprn == "151020766.0" + + def test_uprn_source(self, epc: HistoricEpc) -> None: + assert epc.uprn_source == "Energy Assessor" + + def test_report_type(self, epc: HistoricEpc) -> None: + assert epc.report_type == "100" + + def test_improvement_summary_text(self, epc: HistoricEpc) -> None: + assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" + + def test_indicative_cost(self, epc: HistoricEpc) -> None: + assert epc.indicative_cost == "£100 - £350" From 32bf1cc98de5bb73c753e772e81d0571595dc8fa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:26:29 +0000 Subject: [PATCH 024/106] =?UTF-8?q?load=20historic=20epc=20from=20csv=20?= =?UTF-8?q?=F0=9F=9F=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datatypes/epc/loaders/historic_epc.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py index 8555a706..7b563315 100644 --- a/datatypes/epc/loaders/historic_epc.py +++ b/datatypes/epc/loaders/historic_epc.py @@ -1,5 +1,18 @@ +import csv + from datatypes.epc.schema.historic_epc import HistoricEpc +def _normalise(value: str | None) -> str: + if value is None: + 
return "" + return value.replace("\xa0", " ") + + def read_historic_epc_csv(path: str) -> list[HistoricEpc]: - raise NotImplementedError("read_historic_epc_csv not implemented yet") + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [ + HistoricEpc(**{k.lower(): _normalise(v) for k, v in row.items()}) + for row in reader + ] From a39c3a0772566e28fc00a6c6ee2507a1d8b12b27 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:03:35 +0000 Subject: [PATCH 025/106] added added historic epc data class with shape --- backend/etl/etl_opendatacommunities/main.py | 37 +++++++------------ datatypes/epc/domain/historic_epc.py | 10 +++++ datatypes/epc/schema/historic_epc.py | 6 --- .../schema/tests/test_historic_epc_loading.py | 6 --- sfr/principal_pitch/2_export_data.py | 8 ++-- 5 files changed, 27 insertions(+), 40 deletions(-) create mode 100644 datatypes/epc/domain/historic_epc.py diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py index 30b4045a..2bd41005 100644 --- a/backend/etl/etl_opendatacommunities/main.py +++ b/backend/etl/etl_opendatacommunities/main.py @@ -15,16 +15,7 @@ logger = setup_logger() SRC_ROOT = Path("/workspaces/home/epc_data") TMP_ROOT = Path("/tmp/epc_postcodes") S3_BUCKET = "retrofit-data-dev" -S3_PREFIX = "epc_opendatacommunities" - -REC_COLS = { - "IMPROVEMENT_ITEM", - "IMPROVEMENT_SUMMARY_TEXT", - "IMPROVEMENT_DESCR_TEXT", - "IMPROVEMENT_ID", - "IMPROVEMENT_ID_TEXT", - "INDICATIVE_COST", -} +S3_PREFIX = "historical_epc" # This scripts assume you downloading the zip, unzip it, and running it locally @@ -35,18 +26,16 @@ def sanitise(pc: pd.Series) -> pd.Series: def shard_la(la_dir: Path) -> None: certs = pd.read_csv(la_dir / "certificates.csv", low_memory=False) - recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) - merged = certs.merge(recs, on="LMK_KEY", how="left") - merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) - 
before = len(merged) - merged = merged.dropna(subset=["POSTCODE_CLEAN"]) - merged = merged[merged["POSTCODE_CLEAN"] != ""] - dropped = before - len(merged) + certs["POSTCODE_CLEAN"] = sanitise(certs["POSTCODE"]) + before = len(certs) + certs = certs.dropna(subset=["POSTCODE_CLEAN"]) + certs = certs[certs["POSTCODE_CLEAN"] != ""] + dropped = before - len(certs) if dropped: logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") - for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + for pc, group in certs.groupby("POSTCODE_CLEAN", sort=False): out = TMP_ROOT / f"{pc}.csv" group.drop(columns=["POSTCODE_CLEAN"]).to_csv( out, mode="a", header=not out.exists(), index=False @@ -67,9 +56,7 @@ def list_existing_keys(s3: Any) -> set[str]: def upload_postcode(path: Path, s3: Any) -> None: df = pd.read_csv(path, low_memory=False).drop_duplicates() - cert_cols = [c for c in df.columns if c not in REC_COLS] - cert_only = df[cert_cols].drop_duplicates() - dupes = cert_only["LMK_KEY"].value_counts() + dupes = df["LMK_KEY"].value_counts() bad = dupes[dupes > 1] if not bad.empty: raise ValueError( @@ -95,12 +82,14 @@ def main(): ) logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") - # for la in tqdm(la_dirs, desc="shard"): - # shard_la(la) + for la in tqdm(la_dirs, desc="shard"): + shard_la(la) s3 = boto3.client( "s3", - config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + config=Config( + max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"} + ), ) pc_files = sorted(TMP_ROOT.glob("*.csv")) logger.info(f"Found {len(pc_files)} local shards") diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py new file mode 100644 index 00000000..230c6327 --- /dev/null +++ b/datatypes/epc/domain/historic_epc.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + address1: str + address2: str + address3: str + postcode: str 
+ uprn: str diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index 9ebe4b09..f64ab8c4 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -96,9 +96,3 @@ class HistoricEpc: uprn: str uprn_source: str report_type: str - improvement_item: str - improvement_summary_text: str - improvement_descr_text: str - improvement_id: str - improvement_id_text: str - indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index d5d5ea22..2170a8a6 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -47,9 +47,3 @@ class TestHistoricEpcLoading: def test_report_type(self, epc: HistoricEpc) -> None: assert epc.report_type == "100" - - def test_improvement_summary_text(self, epc: HistoricEpc) -> None: - assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" - - def test_indicative_cost(self, epc: HistoricEpc) -> None: - assert epc.indicative_cost == "£100 - £350" diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index b275086d..5e3ce5d5 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -26,13 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func -PORTFOLIO_ID = 711 -SCENARIOS = [1233] +PORTFOLIO_ID = 632 +SCENARIOS = [1144] scenario_names = { - 1233: "Reach EPC C", + 1144: "EPC C", } -project_name = "Novus" +project_name = "Calico Refresh" def get_data(portfolio_id, scenario_ids): From 7a49f5df20e61836ee19fc19e77abd024fc35880 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:19:03 +0000 Subject: [PATCH 026/106] save plan temporary while i incorporate skills to claude --- datatypes/epc/domain/plan.md | 161 
+++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 datatypes/epc/domain/plan.md diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md new file mode 100644 index 00000000..45cc495b --- /dev/null +++ b/datatypes/epc/domain/plan.md @@ -0,0 +1,161 @@ +# Historic EPC address-match service + +## Context + +ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc//data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. + +Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. + +## Approach + +Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. + +### 1. Add gzip-CSV S3 reader + +In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): + +```python +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + buf = read_io_from_s3(bucket_name, file_key) + return pd.read_csv(buf, compression="gzip", low_memory=False) +``` + +Reuses existing `read_io_from_s3` (line 105). Caller catches `botocore.exceptions.ClientError` for missing-key handling. + +### 2. 
Append matcher to domain module + +In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: + +```python +from typing import Optional +import pandas as pd +from botocore.exceptions import ClientError + +from backend.utils.addressMatch import AddressMatch +from utils.s3 import read_csv_gz_from_s3 + + +@dataclass +class HistoricEpcMatches: + """Scored historic EPC rows for a single postcode.""" + user_address: str + postcode: str # sanitised + df: pd.DataFrame # has lexiscore + lexirank, sorted best-first + + def top(self) -> Optional[pd.Series]: + return None if self.df.empty else self.df.iloc[0] + + def top_n(self, k: int) -> pd.DataFrame: + return self.df.head(k) + + def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: + if self.df.empty: + return None + top_rank = self.df["lexirank"].min() + uprns = ( + self.df.loc[self.df["lexirank"] == top_rank, uprn_column] + .dropna().astype(str).str.replace(r"\.0$", "", regex=True) + .unique() + ) + return uprns[0] if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + if not postcode: + raise ValueError("postcode must be non-empty") + return postcode.upper().replace(" ", "") + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + bucket: str = "retrofit-data-dev", + prefix: str = "historical_epc", + address_column: str = "ADDRESS", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + key = f"{prefix}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + if address_column not in df.columns: + raise ValueError( + f"Missing address column {address_column!r} in {key}" + ) + + user_norm 
= AddressMatch.normalise_address(user_address) + df = df.copy() + df["lexiscore"] = df[address_column].fillna("").apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + df["lexirank"] = ( + df["lexiscore"].rank(method="dense", ascending=False).astype(int) + ) + df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) + + return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) +``` + +### Reuse notes +- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. +- Score column copy uses `.fillna("")` to defend against NaN in `ADDRESS`. +- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). + +### 3. Tests + +New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). + +Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). + +Cases: +1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty raises. +2. Returned df has `lexiscore` + `lexirank` columns, row count preserved. +3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. +4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). +5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. +6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. +7. `top()` / `top_n(k)` shape checks. 
+ +## Critical files +- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher +- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` +- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new + +## Out of scope +- Lambda handler / SQS wiring (deferred — lib only) +- Threshold logic (caller decides via wrapper helpers) +- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) +- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern + +## Verification +``` +cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v +``` + +Sample real-S3 call (needs AWS creds): +```python +from datatypes.epc.domain.historic_epc import match_addresses_for_postcode +m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") +print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) +print(m.unambiguous_uprn()) +``` + +## Sequencing +1. Add `read_csv_gz_from_s3` to `utils/s3.py`. +2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. +3. Add tests. + +Steps 2 & 3 depend on 1. No `__init__.py` re-exports needed. 
From 676022a4c054d7301b1d7542b582fadbb63705b4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 12:53:37 +0000 Subject: [PATCH 027/106] =?UTF-8?q?Fix=20coordination/design=20field=20nam?= =?UTF-8?q?es=20and=20add=20MagicPlan=20trigger=20to=20HubspotDealDiffer?= =?UTF-8?q?=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 6 ++ etl/hubspot/tests/test_hubspot_deal_differ.py | 76 ++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 9e7069fc..a53df4f7 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -194,6 +194,12 @@ class HubspotDealDiffer: and new_status != old_deal.design_status ) + @staticmethod + def check_for_magicplan_trigger( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + raise NotImplementedError + @staticmethod def _lodgement_completed( new_deal: Dict[str, str], old_deal: HubspotDealData diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 0523c982..273a82a0 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -109,7 +109,7 @@ def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_tru new_deal = make_new_deal( deal_id, pashub_link="www.google.co.uk", - coordination_status=coordination_status, + **{"coordination_status__stage_1_": coordination_status}, ) assert ( @@ -156,7 +156,7 @@ def test_pashub_trigger__design_completed_and_pashub_link_set__returns_true() -> new_deal = make_new_deal( deal_id, pashub_link="www.google.co.uk", - design_status="uploaded", + retrofit_design_status="uploaded", ) assert ( @@ -177,7 +177,7 @@ def test_pashub_trigger__design_completed_and_pashub_link_not_set__returns_false new_deal = make_new_deal( deal_id, - 
design_status="uploaded", + retrofit_design_status="uploaded", ) assert ( @@ -270,6 +270,76 @@ def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_ ) +# ========================== +# MAGICPLAN TRIGGER TESTS +# ========================== + + +def test_magicplan_trigger__transitions_to_coordination_complete__returns_true() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal(id=deal_id, coordination_status="in progress") + new_deal = make_new_deal( + deal_id, + **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, + ) + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is True + + +def test_magicplan_trigger__already_in_coordination_complete_unrelated_change__returns_false() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal( + id=deal_id, + coordination_status="(v1) ioe/mtp complete", + outcome="pending", + ) + new_deal = make_new_deal( + deal_id, + **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, + outcome="won", + ) + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is False + + +def test_magicplan_trigger__transitions_to_non_complete_coordination_status__returns_false() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal(id=deal_id, coordination_status="in progress") + new_deal = make_new_deal( + deal_id, + **{"coordination_status__stage_1_": "design submitted"}, + ) + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is False + + # ======================= # DB UPDATE TRIGGER TESTS # ======================= From 69faa530a4c5a4dce34a00919baf965e2f59943a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 12:55:23 +0000 Subject: [PATCH 028/106] 
=?UTF-8?q?Fix=20coordination/design=20field=20nam?= =?UTF-8?q?es=20and=20add=20MagicPlan=20trigger=20to=20HubspotDealDiffer?= =?UTF-8?q?=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index a53df4f7..5435a46d 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -178,7 +178,7 @@ class HubspotDealDiffer: def _coordination_completed( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - new_status: str = new_deal.get("coordination_status") or "" + new_status: str = new_deal.get("coordination_status__stage_1_") or "" return ( new_status != "" and new_status.lower() in HubspotDealDiffer.COORDINATION_COMPLETE @@ -187,7 +187,7 @@ class HubspotDealDiffer: @staticmethod def _design_completed(new_deal: Dict[str, str], old_deal: HubspotDealData) -> bool: - new_status: str = new_deal.get("design_status") or "" + new_status: str = new_deal.get("retrofit_design_status") or "" return ( new_status != "" and new_status.lower() == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE @@ -198,7 +198,12 @@ class HubspotDealDiffer: def check_for_magicplan_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - raise NotImplementedError + new_status = (new_deal.get("coordination_status__stage_1_") or "").lower() + old_status = (old_deal.coordination_status or "").lower() + return ( + new_status in HubspotDealDiffer.COORDINATION_COMPLETE + and old_status not in HubspotDealDiffer.COORDINATION_COMPLETE + ) @staticmethod def _lodgement_completed( From 489b0ba30eb730139916901da7b585627428c3e9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:05:38 +0000 Subject: [PATCH 029/106] =?UTF-8?q?Add=20MagicPlan=20SQS=20trigger=20to=20?= 
=?UTF-8?q?HubSpot=20orchestrator=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/config.py | 1 + .../hubspot_trigger_orchestrator/__init__.py | 0 .../tests/__init__.py | 0 .../tests/test_orchestrator.py | 148 ++++++++++++++++++ etl/hubspot/scripts/scraper/main.py | 21 +++ 5 files changed, 170 insertions(+) create mode 100644 backend/hubspot_trigger_orchestrator/__init__.py create mode 100644 backend/hubspot_trigger_orchestrator/tests/__init__.py create mode 100644 backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py diff --git a/backend/app/config.py b/backend/app/config.py index e939d6e4..21f12902 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -39,6 +39,7 @@ class Settings(BaseSettings): ENGINE_SQS_URL: str = "changeme" CATEGORISATION_SQS_URL: str = "changeme" PASHUB_TO_ARA_SQS_URL: str = "changeme" + MAGICPLAN_SQS_URL: str = "changeme" POSTCODE_SPLITTER_SQS_URL: str = "changeme" COMBINER_SQS_URL: str = "changeme" diff --git a/backend/hubspot_trigger_orchestrator/__init__.py b/backend/hubspot_trigger_orchestrator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/hubspot_trigger_orchestrator/tests/__init__.py b/backend/hubspot_trigger_orchestrator/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py b/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py new file mode 100644 index 00000000..6d18c4b4 --- /dev/null +++ b/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py @@ -0,0 +1,148 @@ +import json +import uuid +from typing import Any, Dict, Optional +from unittest.mock import MagicMock, patch + +import pytest + +from backend.app.db.models.hubspot_deal_data import HubspotDealData +from etl.hubspot.scripts.scraper.main import handler + +COORDINATION_COMPLETE = "(v1) ioe/mtp complete" +DEAL_NAME = "123 Main Street" +UPRN = 
"12345678" +DEAL_ID = "999" +MAGICPLAN_QUEUE_URL = "https://sqs.eu-west-2.amazonaws.com/123/magic-plan-dev" + + +def make_hubspot_deal( + coordination_status: Optional[str] = None, **kwargs: Any +) -> Dict[str, Any]: + deal: Dict[str, Any] = { + "hs_object_id": DEAL_ID, + "dealname": DEAL_NAME, + "pashub_link": None, + **kwargs, + } + if coordination_status is not None: + deal["coordination_status__stage_1_"] = coordination_status + return deal + + +def make_db_deal(coordination_status: Optional[str] = None, **kwargs: Any) -> HubspotDealData: + return HubspotDealData( + id=uuid.uuid4(), + deal_id=DEAL_ID, + coordination_status=coordination_status, + **kwargs, + ) + + +def run_handler( + hubspot_deal: Dict[str, Any], + db_deal: Optional[HubspotDealData], + listing: Optional[dict], +) -> MagicMock: + mock_sqs = MagicMock() + mock_sqs.send_message.return_value = {"MessageId": "test-id"} + + with ( + patch("etl.hubspot.scripts.scraper.main.HubspotDataToDb") as mock_db_cls, + patch("etl.hubspot.scripts.scraper.main.HubspotClient") as mock_hs_cls, + patch("etl.hubspot.scripts.scraper.main.boto3") as mock_boto3, + patch("etl.hubspot.scripts.scraper.main.get_settings") as mock_settings, + ): + mock_db_cls.return_value.find_deal_with_deal_id.return_value = db_deal + mock_db_cls.return_value.upsert_deal.return_value = None + mock_hs_cls.return_value.get_deal_and_company_and_listing.return_value = ( + hubspot_deal, + None, + listing, + ) + mock_boto3.client.return_value = mock_sqs + mock_settings.return_value.MAGICPLAN_SQS_URL = MAGICPLAN_QUEUE_URL + mock_settings.return_value.PASHUB_TO_ARA_SQS_URL = "https://sqs.test/pashub" + + handler.__wrapped__({"hubspot_deal_id": DEAL_ID}, "") + + return mock_sqs + + +# ======================= +# NEW DEAL PATH +# ======================= + + +def test_new_deal_in_coordination_complete__sends_sqs_message() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + listing = {"national_uprn": UPRN} 
+ + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=listing) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=MAGICPLAN_QUEUE_URL, + MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + ) + + +def test_new_deal_not_in_coordination_complete__no_sqs_message() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(coordination_status="in progress") + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() + + +def test_new_deal_with_no_listing__uprn_is_none_in_message() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=MAGICPLAN_QUEUE_URL, + MessageBody=json.dumps({"address": DEAL_NAME, "uprn": None}), + ) + + +# ======================= +# EXISTING DEAL PATH +# ======================= + + +def test_existing_deal_transitions_to_coordination_complete__sends_sqs_message() -> None: + # Arrange + db_deal = make_db_deal(coordination_status="in progress") + hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + listing = {"national_uprn": UPRN} + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=listing) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=MAGICPLAN_QUEUE_URL, + MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + ) + + +def test_existing_deal_already_in_coordination_complete_unrelated_change__no_sqs_message() -> None: + # Arrange + db_deal = make_db_deal(coordination_status=COORDINATION_COMPLETE, dealname="Old Name") + hubspot_deal = make_hubspot_deal( + coordination_status=COORDINATION_COMPLETE, dealname="New Name" + ) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, 
listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 3ed208a2..cd76e26f 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -56,6 +56,13 @@ def handler(body: dict[str, Any], context: Any) -> None: f"Triggering Pas Hub file fetcher for HubSpot deal ID {hubspot_deal_id}" ) _trigger_pashub_fetcher(sqs_client, hubspot_deal_id, hubspot_deal) + + coordination_status = (hubspot_deal.get("coordination_status__stage_1_") or "").lower() + if coordination_status in HubspotDealDiffer.COORDINATION_COMPLETE: + logger.info( + f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" + ) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) else: # Deal already in db, check whether anything has changed logger.info( @@ -97,9 +104,23 @@ def handler(body: dict[str, Any], context: Any) -> None: f"Not Triggering PasHub file fetcher for HubSpot deal ID {hubspot_deal_id}" ) + if HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=hubspot_deal, old_deal=db_deal + ): + logger.info( + f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" + ) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) + print("done") +def _trigger_magicplan_fetcher( + sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]] +) -> None: + raise NotImplementedError + + def _trigger_pashub_fetcher( sqs_client: Any, deal_id: str, hubspot_deal: Dict[str, str] ) -> None: From a1a445f6f270f62b148e7d3f79eace348213e893 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:06:43 +0000 Subject: [PATCH 030/106] =?UTF-8?q?Add=20MagicPlan=20SQS=20trigger=20to=20?= =?UTF-8?q?HubSpot=20orchestrator=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/scripts/scraper/main.py | 12 +++++++++++- 1 file changed, 11 
insertions(+), 1 deletion(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index cd76e26f..a39e8b37 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -118,7 +118,17 @@ def handler(body: dict[str, Any], context: Any) -> None: def _trigger_magicplan_fetcher( sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]] ) -> None: - raise NotImplementedError + message_body = { + "address": hubspot_deal.get("dealname"), + "uprn": listing.get("national_uprn") if listing else None, + } + response = sqs_client.send_message( + QueueUrl=get_settings().MAGICPLAN_SQS_URL, + MessageBody=json.dumps(message_body), + ) + logger.info( + f"Sent message to MagicPlan queue. MessageId: {response['MessageId']}" + ) def _trigger_pashub_fetcher( From ed68a10127b132d883b927655c5f0ce2cdc41815 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:07:36 +0000 Subject: [PATCH 031/106] magic plan client terraform --- .../terraform/lambda/magic_plan/main.tf | 41 +++++++++++++++++++ .../terraform/lambda/magic_plan/outputs.tf | 9 ++++ .../terraform/lambda/magic_plan/provider.tf | 16 ++++++++ 3 files changed, 66 insertions(+) create mode 100644 infrastructure/terraform/lambda/magic_plan/main.tf create mode 100644 infrastructure/terraform/lambda/magic_plan/outputs.tf create mode 100644 infrastructure/terraform/lambda/magic_plan/provider.tf diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/infrastructure/terraform/lambda/magic_plan/main.tf new file mode 100644 index 00000000..56adac1b --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/main.tf @@ -0,0 +1,41 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = 
"${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "magic_plan" + stage = var.stage + + image_uri = local.image_uri + + maximum_concurrency = var.maximum_concurrency + reserved_concurrent_executions = var.reserved_concurrent_executions + batch_size = var.batch_size + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + MAGICPLAN_CUSTOMER_ID = var.magicplan_customer_id + MAGICPLAN_API_KEY = var.magicplan_api_key + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + DB_HOST = var.db_host + DB_NAME = var.db_name + DB_PORT = var.db_port + } +} diff --git a/infrastructure/terraform/lambda/magic_plan/outputs.tf b/infrastructure/terraform/lambda/magic_plan/outputs.tf new file mode 100644 index 00000000..2082933f --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/outputs.tf @@ -0,0 +1,9 @@ +output "magic_plan_queue_url" { + value = module.lambda.queue_url + description = "URL of the MagicPlan SQS queue" +} + +output "magic_plan_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the MagicPlan SQS queue" +} diff --git a/infrastructure/terraform/lambda/magic_plan/provider.tf b/infrastructure/terraform/lambda/magic_plan/provider.tf new file mode 100644 index 00000000..9e7020ac --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "magic-plan-hubspot-trigger-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} From fd77fa51fdad5f1ea5dd3a6859f3051dd0665d84 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:07:53 +0000 Subject: [PATCH 
032/106] magic plan client terraform --- .../terraform/lambda/magic_plan/variables.tf | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 infrastructure/terraform/lambda/magic_plan/variables.tf diff --git a/infrastructure/terraform/lambda/magic_plan/variables.tf b/infrastructure/terraform/lambda/magic_plan/variables.tf new file mode 100644 index 00000000..03f88e75 --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/variables.tf @@ -0,0 +1,68 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = null + description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." 
+} + +variable "reserved_concurrent_executions" { + type = number + default = 1 +} + +variable "batch_size" { + type = number + default = 1 +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} + +variable "magicplan_customer_id" { + type = string + sensitive = true +} + +variable "magicplan_api_key" { + type = string + sensitive = true +} + +variable "db_host" { + type = string + sensitive = true +} + +variable "db_name" { + type = string + sensitive = true +} + +variable "db_port" { + type = string + sensitive = true +} From feaa1ea68093f74087952597be691ee5a387fc8a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:12:13 +0000 Subject: [PATCH 033/106] Add MagicPlan Lambda Dockerfile, CI/CD jobs, and SQS IAM wiring in hubspot_deal_etl --- .github/workflows/deploy_terraform.yml | 40 ++++++++++++++++++- backend/magic_plan/handler/Dockerfile | 26 ++++++++++++ backend/magic_plan/handler/requirements.txt | 7 ++++ .../terraform/lambda/hubspot_deal_etl/main.tf | 26 +++++++++++- 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 backend/magic_plan/handler/Dockerfile create mode 100644 backend/magic_plan/handler/requirements.txt diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 398232c6..e0343974 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -537,11 +537,49 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ + # Build MagicPlan Lambda image + # ============================================================ + magic_plan_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: 
backend/magic_plan/handler/Dockerfile + build_context: . + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + # ============================================================ + # Deploy MagicPlan Lambda + # ============================================================ + magic_plan_lambda: + needs: [magic_plan_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: magic_plan + lambda_path: infrastructure/terraform/lambda/magic_plan + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.magic_plan_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }} + TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }} + TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }} + TF_VAR_magicplan_customer_id: ${{ secrets.MAGICPLAN_CUSTOMER_ID }} + TF_VAR_magicplan_api_key: ${{ secrets.MAGICPLAN_API_KEY }} + # ============================================================ # Deploy Hubspot ETL Lambda # ============================================================ hubspot_etl_lambda: - needs: [hubspot_etl_image, determine_stage, pashub_to_ara_lambda] + needs: [hubspot_etl_image, determine_stage, pashub_to_ara_lambda, magic_plan_lambda] uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: hubspot-etl-to-ara diff --git a/backend/magic_plan/handler/Dockerfile b/backend/magic_plan/handler/Dockerfile new file mode 100644 index 00000000..7c83ebe6 --- /dev/null +++ b/backend/magic_plan/handler/Dockerfile @@ -0,0 +1,26 @@ +FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy + +# Install 
AWS Lambda RIE +ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie +RUN chmod +x /usr/local/bin/aws-lambda-rie + +# Set working directory (Lambda task root) +WORKDIR /var/task + +COPY backend/magic_plan/handler/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +# Local lambda entrypoint +# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] + +# AWS lambda entrypoint +ENTRYPOINT ["python", "-m", "awslambdaric"] + +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["backend.magic_plan.handler.handler"] diff --git a/backend/magic_plan/handler/requirements.txt b/backend/magic_plan/handler/requirements.txt new file mode 100644 index 00000000..cfacf455 --- /dev/null +++ b/backend/magic_plan/handler/requirements.txt @@ -0,0 +1,7 @@ +awslambdaric +requests +sqlalchemy==2.0.36 +sqlmodel +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 +boto3==1.35.44 diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index 48dd6b78..800dc3b6 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -12,7 +12,16 @@ data "terraform_remote_state" "pashub_to_ara" { config = { bucket = "pashub-to-ara-terraform-state" key = "env:/${var.stage}/terraform.tfstate" - region = "eu-west-2" + region = "eu-west-2" + } +} + +data "terraform_remote_state" "magic_plan" { + backend = "s3" + config = { + bucket = "magic-plan-hubspot-trigger-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" } } @@ -49,6 +58,7 @@ module "hubspot_deal_etl" { HUBSPOT_API_KEY = var.hubspot_api_key PASHUB_TO_ARA_SQS_URL = data.terraform_remote_state.pashub_to_ara.outputs.pashub_to_ara_queue_url + 
MAGICPLAN_SQS_URL = data.terraform_remote_state.magic_plan.outputs.magic_plan_queue_url } } @@ -76,4 +86,18 @@ module "hubspot_deal_etl_sqs_policy" { resource "aws_iam_role_policy_attachment" "hubspot_deal_etl_sqs_send" { role = module.hubspot_deal_etl.role_name policy_arn = module.hubspot_deal_etl_sqs_policy.policy_arn +} + +module "hubspot_deal_etl_magicplan_sqs_policy" { + source = "../../modules/general_iam_policy" + + policy_name = "hubspot-deal-etl-magicplan-sqs-send-${var.stage}" + policy_description = "Allow HubSpot ETL Lambda to send messages to MagicPlan queue" + actions = ["sqs:SendMessage"] + resources = [data.terraform_remote_state.magic_plan.outputs.magic_plan_queue_arn] +} + +resource "aws_iam_role_policy_attachment" "hubspot_deal_etl_magicplan_sqs_send" { + role = module.hubspot_deal_etl.role_name + policy_arn = module.hubspot_deal_etl_magicplan_sqs_policy.policy_arn } \ No newline at end of file From e30e06cb6e61f24668bba039e0fb17c36f4a9307 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:24:04 +0000 Subject: [PATCH 034/106] simplify dockerfile as playwright not used --- backend/magic_plan/handler/Dockerfile | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/backend/magic_plan/handler/Dockerfile b/backend/magic_plan/handler/Dockerfile index 7c83ebe6..ffd85c02 100644 --- a/backend/magic_plan/handler/Dockerfile +++ b/backend/magic_plan/handler/Dockerfile @@ -1,10 +1,5 @@ -FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy +FROM public.ecr.aws/lambda/python:3.11 -# Install AWS Lambda RIE -ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie -RUN chmod +x /usr/local/bin/aws-lambda-rie - -# Set working directory (Lambda task root) WORKDIR /var/task COPY backend/magic_plan/handler/requirements.txt . 
@@ -14,13 +9,4 @@ COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ -# Local lambda entrypoint -# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] - -# AWS lambda entrypoint -ENTRYPOINT ["python", "-m", "awslambdaric"] - -# ----------------------------- -# Lambda handler -# ----------------------------- CMD ["backend.magic_plan.handler.handler"] From 74b3a7f297b90535f32208189c6d9a87c24f806a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:42:57 +0000 Subject: [PATCH 035/106] =?UTF-8?q?Add=20hubspot=5Fdeal=5Fid=20required=20?= =?UTF-8?q?field=20to=20MagicPlanTriggerRequest=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../magic_plan/tests/test_magic_plan_trigger_request.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/magic_plan/tests/test_magic_plan_trigger_request.py b/backend/magic_plan/tests/test_magic_plan_trigger_request.py index 46a20a37..131ea93b 100644 --- a/backend/magic_plan/tests/test_magic_plan_trigger_request.py +++ b/backend/magic_plan/tests/test_magic_plan_trigger_request.py @@ -38,3 +38,11 @@ def test_extra_fields_ignored() -> None: req = MagicPlanTriggerRequest.model_validate(payload) # Assert assert req.address == "123 High St London SW1A 1AA" + + +def test_missing_hubspot_deal_id_raises() -> None: + # Arrange + payload = {"address": "123 High St London SW1A 1AA"} + # Act / Assert + with pytest.raises(ValidationError): + MagicPlanTriggerRequest.model_validate(payload) From 4a9cabe1979a1f0e12ef97f13dc634d4928b4fef Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:45:10 +0000 Subject: [PATCH 036/106] =?UTF-8?q?Add=20hubspot=5Fdeal=5Fid=20required=20?= =?UTF-8?q?field=20to=20MagicPlanTriggerRequest=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- 
backend/magic_plan/handler.py | 2 +- backend/magic_plan/magic_plan_trigger_request.py | 1 + .../magic_plan/tests/test_magic_plan_trigger_request.py | 9 +++++---- etl/hubspot/scripts/scraper/main.py | 7 ++++--- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index a592cc6a..22933e13 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -28,7 +28,7 @@ if __name__ == "__main__": event = { "Records": [ { - "body": '{"address": "2 Laburnum Way Bromley BR2 8BZ"}', + "body": '{"address": "2 Laburnum Way Bromley BR2 8BZ", "hubspot_deal_id": "local-test-deal"}', "messageId": "local-test", } ] diff --git a/backend/magic_plan/magic_plan_trigger_request.py b/backend/magic_plan/magic_plan_trigger_request.py index bb0151e4..e93c055c 100644 --- a/backend/magic_plan/magic_plan_trigger_request.py +++ b/backend/magic_plan/magic_plan_trigger_request.py @@ -7,4 +7,5 @@ class MagicPlanTriggerRequest(BaseModel): model_config = ConfigDict(extra="ignore") address: str + hubspot_deal_id: str uprn: Optional[str] = None diff --git a/backend/magic_plan/tests/test_magic_plan_trigger_request.py b/backend/magic_plan/tests/test_magic_plan_trigger_request.py index 131ea93b..9fb2754a 100644 --- a/backend/magic_plan/tests/test_magic_plan_trigger_request.py +++ b/backend/magic_plan/tests/test_magic_plan_trigger_request.py @@ -6,17 +6,18 @@ from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerReques def test_valid_payload_with_address_only() -> None: # Arrange - payload = {"address": "123 High St London SW1A 1AA"} + payload = {"address": "123 High St London SW1A 1AA", "hubspot_deal_id": "123456789"} # Act req = MagicPlanTriggerRequest.model_validate(payload) # Assert assert req.address == "123 High St London SW1A 1AA" + assert req.hubspot_deal_id == "123456789" assert req.uprn is None def test_valid_payload_with_uprn() -> None: # Arrange - payload = {"address": "123 High St 
London SW1A 1AA", "uprn": "100023336956"} + payload = {"address": "123 High St London SW1A 1AA", "hubspot_deal_id": "123456789", "uprn": "100023336956"} # Act req = MagicPlanTriggerRequest.model_validate(payload) # Assert @@ -25,7 +26,7 @@ def test_valid_payload_with_uprn() -> None: def test_missing_address_raises() -> None: # Arrange - payload = {"uprn": "100023336956"} + payload = {"hubspot_deal_id": "123456789", "uprn": "100023336956"} # Act / Assert with pytest.raises(ValidationError): MagicPlanTriggerRequest.model_validate(payload) @@ -33,7 +34,7 @@ def test_missing_address_raises() -> None: def test_extra_fields_ignored() -> None: # Arrange - payload = {"address": "123 High St London SW1A 1AA", "unknown_field": "whatever"} + payload = {"address": "123 High St London SW1A 1AA", "hubspot_deal_id": "123456789", "unknown_field": "whatever"} # Act req = MagicPlanTriggerRequest.model_validate(payload) # Assert diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index a39e8b37..32007cd4 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -62,7 +62,7 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" ) - _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing, hubspot_deal_id) else: # Deal already in db, check whether anything has changed logger.info( @@ -110,16 +110,17 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" ) - _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing, hubspot_deal_id) print("done") def _trigger_magicplan_fetcher( - sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]] + sqs_client: Any, hubspot_deal: Dict[str, str], 
listing: Optional[dict[str, str]], hubspot_deal_id: str ) -> None: message_body = { "address": hubspot_deal.get("dealname"), + "hubspot_deal_id": hubspot_deal_id, "uprn": listing.get("national_uprn") if listing else None, } response = sqs_client.send_message( From c3aae8fd51373a9b38cbdb023275bbd12172519c Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:08:56 +0000 Subject: [PATCH 037/106] =?UTF-8?q?Expose=20get=5Fplan=5Fraw=20method=20on?= =?UTF-8?q?=20MagicPlanClient=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 3 ++ .../tests/test_magic_plan_client.py | 52 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 60f70fb1..172190fd 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -22,3 +22,6 @@ class MagicPlanClient: ) r.raise_for_status() return MagicPlanPlan.model_validate(r.json()["data"]) + + def get_plan_raw(self, plan_id: str) -> bytes: + raise NotImplementedError diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index 1be1448f..c96b9cdf 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -172,3 +172,55 @@ def test_get_plan_propagates_http_error( # Act / Assert with pytest.raises(requests.HTTPError): client.get_plan("some-id") + + +# --- get_plan_raw --- + + +def test_get_plan_raw_returns_bytes( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.content = b'{"data": "raw"}' + plan_id = "a7285ed1-878d-47eb-8aa6-85ef9e187516" + # Act + result = client.get_plan_raw(plan_id) + # Assert + assert isinstance(result, bytes) + + +def test_get_plan_raw_calls_correct_url( + client: MagicPlanClient, 
mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.content = b"{}" + plan_id = "a7285ed1-878d-47eb-8aa6-85ef9e187516" + # Act + client.get_plan_raw(plan_id) + # Assert + mock_session.get.assert_called_once_with( + f"{BASE_URL}/plans/{plan_id}", params={"key": API_KEY} + ) + + +def test_get_plan_raw_calls_raise_for_status( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.content = b"{}" + # Act + client.get_plan_raw("a7285ed1-878d-47eb-8aa6-85ef9e187516") + # Assert + mock_session.get.return_value.raise_for_status.assert_called_once() + + +def test_get_plan_raw_propagates_http_error( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.raise_for_status.side_effect = requests.HTTPError( + "500" + ) + # Act / Assert + with pytest.raises(requests.HTTPError): + client.get_plan_raw("some-id") From f6c17be70a3d85e0638cea58f3edcdaf530c0aee Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:09:33 +0000 Subject: [PATCH 038/106] =?UTF-8?q?Expose=20get=5Fplan=5Fraw=20method=20on?= =?UTF-8?q?=20MagicPlanClient=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 172190fd..06905e6a 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -17,11 +17,14 @@ class MagicPlanClient: return PlansListResponse.model_validate(r.json()["data"]) def get_plan(self, plan_id: str) -> MagicPlanPlan: + return MagicPlanPlan.model_validate(self._fetch_plan(plan_id).json()["data"]) + + def get_plan_raw(self, plan_id: str) -> bytes: + return self._fetch_plan(plan_id).content + + def _fetch_plan(self, plan_id: str) -> requests.Response: r = 
self._session.get( f"{_BASE_URL}/plans/{plan_id}", params={"key": self._api_key} ) r.raise_for_status() - return MagicPlanPlan.model_validate(r.json()["data"]) - - def get_plan_raw(self, plan_id: str) -> bytes: - raise NotImplementedError + return r From 7c9cb5b161b4e910d95c96c6633362000c4b9e18 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:14:42 +0000 Subject: [PATCH 039/106] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20?= =?UTF-8?q?JSON=20to=20S3=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/db/models/uploaded_file.py | 2 + backend/magic_plan/handler.py | 2 +- backend/magic_plan/magic_plan_service.py | 13 +++- .../tests/test_magic_plan_service.py | 66 +++++++++++++++++-- 4 files changed, 72 insertions(+), 11 deletions(-) diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index a516a1df..c629f574 100644 --- a/backend/app/db/models/uploaded_file.py +++ b/backend/app/db/models/uploaded_file.py @@ -17,6 +17,7 @@ class FileTypeEnum(enum.Enum): ECMK_SITE_NOTE = "ecmk_site_note" ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note" ECMK_SURVEY_XML = "ecmk_survey_xml" + MAGIC_PLAN_JSON = "magic_plan_json" class FileSourceEnum(enum.Enum): @@ -24,6 +25,7 @@ class FileSourceEnum(enum.Enum): SHAREPOINT = "sharepoint" HUBSPOT = "hubspot" ECMK = "ecmk" + MAGIC_PLAN = "magic_plan" class UploadedFile(Base): diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index 22933e13..45de8554 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -19,7 +19,7 @@ def handler(body: dict[str, Any], context: Any) -> str: customer_id=settings.MAGICPLAN_CUSTOMER_ID, api_key=settings.MAGICPLAN_API_KEY, ) - plan: Plan = MagicPlanService(client).run(payload.address, payload.uprn) + plan: Plan = MagicPlanService(client, s3_bucket="retrofit-energy-assessments-dev").run(payload) logger.info("Saved MagicPlan plan 
uid=%s", plan.uid) return plan.uid diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 91b3cd13..6be6486c 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,3 +1,4 @@ +import gzip from typing import Optional from datatypes.magicplan.api.response import ( @@ -12,23 +13,29 @@ from backend.app.db.connection import db_session from backend.app.db.functions.magic_plan_functions import save_plan from backend.magic_plan.address_matcher import find_matching_plan from backend.magic_plan.magic_plan_client import MagicPlanClient +from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest from utils.logger import setup_logger +from utils.s3 import save_data_to_s3 logger = setup_logger() class MagicPlanService: - def __init__(self, client: MagicPlanClient) -> None: + def __init__(self, client: MagicPlanClient, s3_bucket: str) -> None: self._client = client + self._s3_bucket = s3_bucket + + def run(self, request: MagicPlanTriggerRequest) -> Plan: + address = request.address + uprn = request.uprn - def run(self, address: str, uprn: Optional[str] = None) -> Plan: if uprn is not None: logger.info("MagicPlanService.run uprn=%s", uprn) plans_response: PlansListResponse = self._client.get_plans() matched: Optional[PlanSummary] = find_matching_plan( plans_response.plans, address - ) # TODO: use address2UPRN instead? 
or create AddressMatch domain class + ) if matched is None: raise ValueError(f"No MagicPlan found for address: {address!r}") diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 8e433b87..87e20506 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch import pytest @@ -10,9 +10,11 @@ from datatypes.magicplan.domain.models import Plan from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_service import MagicPlanService +from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest FIXTURE_DIR = Path(__file__).parents[2] / "magic_plan" PLAN_ID = "a7285ed1-878d-47eb-8aa6-85ef9e187516" +S3_BUCKET = "test-bucket" @pytest.fixture(scope="module") @@ -45,7 +47,17 @@ def mock_client() -> MagicMock: def _make_service(mock_client: MagicMock) -> MagicPlanService: - return MagicPlanService(client=mock_client) + return MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + + +def _make_request( + address: str = "2 Laburnum Way Bromley BR2 8BZ", + hubspot_deal_id: str = "deal-123", + uprn: str | None = None, +) -> MagicPlanTriggerRequest: + return MagicPlanTriggerRequest( + address=address, hubspot_deal_id=hubspot_deal_id, uprn=uprn + ) # --- no match --- @@ -57,7 +69,7 @@ def test_run_raises_when_no_plan_found(mock_client: MagicMock) -> None: service = _make_service(mock_client) # Act / Assert with pytest.raises(ValueError, match="No MagicPlan found"): - service.run("99 Nowhere Road London SW1A 1AA") + service.run(_make_request(address="99 Nowhere Road London SW1A 1AA")) # --- match found --- @@ -78,8 +90,10 @@ def test_run_fetches_plan_with_matched_id( return_value=plan_summary, ), 
patch("backend.magic_plan.magic_plan_service.save_plan"), patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - service.run("2 Laburnum Way Bromley BR2 8BZ") + service.run(_make_request()) # Assert mock_client.get_plan.assert_called_once_with(plan_summary.id) @@ -99,8 +113,10 @@ def test_run_returns_mapped_plan( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - result = service.run("2 Laburnum Way Bromley BR2 8BZ") + result = service.run(_make_request()) # Assert assert isinstance(result, Plan) assert result.uid == PLAN_ID @@ -120,8 +136,10 @@ def test_run_calls_save_plan_with_mapped_plan( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan") as mock_save, patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - service.run("2 Laburnum Way Bromley BR2 8BZ") + service.run(_make_request()) # Assert — save_plan called with a Plan whose uid matches call_args = mock_save.call_args saved_plan: Plan = call_args[0][1] @@ -142,5 +160,39 @@ def test_run_accepts_uprn_without_error( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - service.run("2 Laburnum Way Bromley BR2 8BZ", uprn="100023336956") + service.run(_make_request(uprn="100023336956")) + + +# --- S3 upload --- + + +def test_run_uploads_to_s3_with_uprn_key( + mock_client: MagicMock, + api_magic_plan: MagicPlanPlan, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plan.return_value = api_magic_plan + 
mock_client.get_plan_raw.return_value = b'{"raw": "data"}' + request = _make_request(uprn="100023336956") + service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( + "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ) as mock_s3: + # Act + service.run(request) + # Assert + mock_s3.assert_called_once_with( + ANY, + S3_BUCKET, + f"documents/uprn/100023336956/magic_plan_{plan_summary.id}.json.gz", + ) From 14a064fdefd1d6a9362cf3a4c078bbfc6da5b316 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:16:41 +0000 Subject: [PATCH 040/106] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20?= =?UTF-8?q?JSON=20to=20S3=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 8 ++++++++ backend/magic_plan/tests/test_magic_plan_service.py | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 6be6486c..bb68fa42 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -41,8 +41,16 @@ class MagicPlanService: raise ValueError(f"No MagicPlan found for address: {address!r}") magic_plan: MagicPlanPlan = self._client.get_plan(matched.id) + raw_bytes: bytes = self._client.get_plan_raw(matched.id) plan: Plan = map_plan(magic_plan) + compressed = gzip.compress(raw_bytes) + if uprn is not None: + s3_key = f"documents/uprn/{uprn}/magic_plan_{matched.id}.json.gz" + else: + s3_key = f"documents/hubspot_deal_id/{request.hubspot_deal_id}/magic_plan_{matched.id}.json.gz" + save_data_to_s3(compressed, self._s3_bucket, s3_key) + with db_session() as session: save_plan(session, plan) 
diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 87e20506..65c19b7a 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -43,7 +43,9 @@ def plan_summary() -> PlanSummary: @pytest.fixture() def mock_client() -> MagicMock: - return MagicMock(spec=MagicPlanClient) + client = MagicMock(spec=MagicPlanClient) + client.get_plan_raw.return_value = b"{}" + return client def _make_service(mock_client: MagicMock) -> MagicPlanService: From 03e8750c1ac233b408820eb56b4976b7b5a97d0b Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:17:51 +0000 Subject: [PATCH 041/106] =?UTF-8?q?Upload=20MagicPlan=20JSON=20to=20S3=20u?= =?UTF-8?q?sing=20hubspot=5Fdeal=5Fid=20key=20when=20UPRN=20absent=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_magic_plan_service.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 65c19b7a..70099e91 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -198,3 +198,31 @@ def test_run_uploads_to_s3_with_uprn_key( S3_BUCKET, f"documents/uprn/100023336956/magic_plan_{plan_summary.id}.json.gz", ) + + +def test_run_uploads_to_s3_with_deal_id_key_when_uprn_absent( + mock_client: MagicMock, + api_magic_plan: MagicPlanPlan, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plan.return_value = api_magic_plan + request = _make_request(hubspot_deal_id="deal-456", uprn=None) + service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + 
return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( + "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ) as mock_s3: + # Act + service.run(request) + # Assert + mock_s3.assert_called_once_with( + ANY, + S3_BUCKET, + f"documents/hubspot_deal_id/deal-456/magic_plan_{plan_summary.id}.json.gz", + ) From 8ac77ce8b967eb87884443ad4e40e5e2d8c6fa29 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:19:49 +0000 Subject: [PATCH 042/106] =?UTF-8?q?Persist=20UploadedFile=20record=20for?= =?UTF-8?q?=20each=20MagicPlan=20S3=20upload=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 5 +++ .../tests/test_magic_plan_service.py | 45 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index bb68fa42..0aeed686 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -11,6 +11,11 @@ from datatypes.magicplan.domain.models import Plan from backend.app.db.connection import db_session from backend.app.db.functions.magic_plan_functions import save_plan +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) from backend.magic_plan.address_matcher import find_matching_plan from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 70099e91..b7580546 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -8,6 +8,11 @@ from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary 
from datatypes.magicplan.domain.mapper import map_plan from datatypes.magicplan.domain.models import Plan +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_service import MagicPlanService from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest @@ -226,3 +231,43 @@ def test_run_uploads_to_s3_with_deal_id_key_when_uprn_absent( S3_BUCKET, f"documents/hubspot_deal_id/deal-456/magic_plan_{plan_summary.id}.json.gz", ) + + +# --- UploadedFile record --- + + +def test_run_creates_uploaded_file_record( + mock_client: MagicMock, + api_magic_plan: MagicPlanPlan, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plan.return_value = api_magic_plan + request = _make_request(hubspot_deal_id="deal-789", uprn="100023336956") + service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + mock_session = MagicMock() + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( + "backend.magic_plan.magic_plan_service.db_session" + ) as mock_db, patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ): + mock_db.return_value.__enter__.return_value = mock_session + # Act + service.run(request) + # Assert + added_objects = [call.args[0] for call in mock_session.add.call_args_list] + uploaded_file = next( + (obj for obj in added_objects if isinstance(obj, UploadedFile)), None + ) + assert uploaded_file is not None + assert uploaded_file.file_source == FileSourceEnum.MAGIC_PLAN.value + assert uploaded_file.file_type == FileTypeEnum.MAGIC_PLAN_JSON.value + assert uploaded_file.s3_file_bucket == S3_BUCKET + assert uploaded_file.s3_file_key == 
f"documents/uprn/100023336956/magic_plan_{plan_summary.id}.json.gz" + assert uploaded_file.s3_upload_timestamp is not None + assert uploaded_file.uprn == 100023336956 + assert uploaded_file.hubspot_deal_id == "deal-789" From 337474e7733110e633601c5a7eaed26d6bbe4241 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:20:49 +0000 Subject: [PATCH 043/106] =?UTF-8?q?Persist=20UploadedFile=20record=20for?= =?UTF-8?q?=20each=20MagicPlan=20S3=20upload=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 0aeed686..7860fec9 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,4 +1,5 @@ import gzip +from datetime import datetime, timezone from typing import Optional from datatypes.magicplan.api.response import ( @@ -58,5 +59,16 @@ class MagicPlanService: with db_session() as session: save_plan(session, plan) + session.add( + UploadedFile( + s3_file_bucket=self._s3_bucket, + s3_file_key=s3_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn is not None else None, + hubspot_deal_id=request.hubspot_deal_id, + file_source=FileSourceEnum.MAGIC_PLAN.value, + file_type=FileTypeEnum.MAGIC_PLAN_JSON.value, + ) + ) return plan From e1972e4349d86db57eca6ad5230bcc900a10935a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:23:32 +0000 Subject: [PATCH 044/106] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20?= =?UTF-8?q?JSON=20to=20S3=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 47 +++++++++++++++--------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/backend/magic_plan/magic_plan_service.py 
b/backend/magic_plan/magic_plan_service.py index 7860fec9..9ac17e55 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -50,25 +50,38 @@ class MagicPlanService: raw_bytes: bytes = self._client.get_plan_raw(matched.id) plan: Plan = map_plan(magic_plan) - compressed = gzip.compress(raw_bytes) - if uprn is not None: - s3_key = f"documents/uprn/{uprn}/magic_plan_{matched.id}.json.gz" - else: - s3_key = f"documents/hubspot_deal_id/{request.hubspot_deal_id}/magic_plan_{matched.id}.json.gz" - save_data_to_s3(compressed, self._s3_bucket, s3_key) + uploaded_file = self._upload_raw_plan_json( + plan_id=matched.id, + raw_bytes=raw_bytes, + uprn=uprn, + hubspot_deal_id=request.hubspot_deal_id, + ) with db_session() as session: save_plan(session, plan) - session.add( - UploadedFile( - s3_file_bucket=self._s3_bucket, - s3_file_key=s3_key, - s3_upload_timestamp=datetime.now(timezone.utc), - uprn=int(uprn) if uprn is not None else None, - hubspot_deal_id=request.hubspot_deal_id, - file_source=FileSourceEnum.MAGIC_PLAN.value, - file_type=FileTypeEnum.MAGIC_PLAN_JSON.value, - ) - ) + session.add(uploaded_file) return plan + + def _upload_raw_plan_json( + self, + plan_id: str, + raw_bytes: bytes, + uprn: Optional[str], + hubspot_deal_id: str, + ) -> UploadedFile: + compressed = gzip.compress(raw_bytes) + if uprn is not None: + s3_key = f"documents/uprn/{uprn}/magic_plan_{plan_id}.json.gz" + else: + s3_key = f"documents/hubspot_deal_id/{hubspot_deal_id}/magic_plan_{plan_id}.json.gz" + save_data_to_s3(compressed, self._s3_bucket, s3_key) + return UploadedFile( + s3_file_bucket=self._s3_bucket, + s3_file_key=s3_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn is not None else None, + hubspot_deal_id=hubspot_deal_id, + file_source=FileSourceEnum.MAGIC_PLAN.value, + file_type=FileTypeEnum.MAGIC_PLAN_JSON.value, + ) From 9f62e3c31abe61a9f67faf38e3bbc77730a1b64d Mon Sep 17 00:00:00 2001 From: Daniel Roth 
Date: Fri, 8 May 2026 14:30:59 +0000 Subject: [PATCH 045/106] typehint --- backend/magic_plan/magic_plan_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 9ac17e55..6ed25c0c 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -50,7 +50,7 @@ class MagicPlanService: raw_bytes: bytes = self._client.get_plan_raw(matched.id) plan: Plan = map_plan(magic_plan) - uploaded_file = self._upload_raw_plan_json( + uploaded_file: UploadedFile = self._upload_raw_plan_json( plan_id=matched.id, raw_bytes=raw_bytes, uprn=uprn, From c9c43f178c51ae061dce767f1062981a3fa8acf3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 14:48:15 +0000 Subject: [PATCH 046/106] demo generated for use in address2uprn --- backend/address2UPRN/main.py | 76 +----- backend/address2UPRN/scoring.py | 57 +++++ datatypes/epc/domain/historic_epc.py | 88 +++++++ datatypes/epc/domain/historic_epc_matching.py | 114 +++++++++ datatypes/epc/domain/plan.md | 161 ------------ .../tests/test_historic_epc_matching.py | 239 ++++++++++++++++++ datatypes/epc/loaders/historic_epc.py | 2 +- datatypes/epc/schema/historic_epc.py | 98 ------- .../schema/tests/test_historic_epc_loading.py | 2 +- scripts/historic_epc_demo.py | 47 ++++ utils/s3.py | 15 ++ 11 files changed, 570 insertions(+), 329 deletions(-) create mode 100644 backend/address2UPRN/scoring.py create mode 100644 datatypes/epc/domain/historic_epc_matching.py delete mode 100644 datatypes/epc/domain/plan.md create mode 100644 datatypes/epc/domain/tests/test_historic_epc_matching.py delete mode 100644 datatypes/epc/schema/historic_epc.py create mode 100644 scripts/historic_epc_demo.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 28ad344f..b83c7a58 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -17,16 +17,12 @@ from utils.s3 
import ( from datetime import datetime from backend.utils.addressMatch import AddressMatch - -logger = setup_logger() - - -EPC_AUTH_TOKEN = os.getenv( - "EPC_AUTH_TOKEN", +from backend.address2UPRN.scoring import ( # noqa: F401 (re-exported) + df_has_single_uprn, + get_uprn_candidates, ) -if EPC_AUTH_TOKEN is None: - raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +logger = setup_logger() def score_addresses( @@ -45,7 +41,10 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): Recursively fetch EPC data by postcode. If results hit the size limit, retry with double size up to max_attempts. """ - client = EpcClient(auth_token=EPC_AUTH_TOKEN) + auth_token = os.getenv("EPC_AUTH_TOKEN") + if auth_token is None: + raise RuntimeError("EPC_AUTH_TOKEN not defined in env") + client = EpcClient(auth_token=auth_token) url = os.path.join(client.domestic.host, "search") @@ -88,65 +87,6 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): return results_df -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: - """ - Returns True if all non-null UPRNs in df match the given uprn. - Returns False otherwise. - """ - - if column not in df.columns: - return False - - # Drop nulls and normalise to string - uprns = df[column].dropna().astype(str).str.strip().unique() - - # No valid UPRNs to compare - if len(uprns) == 0: - return False - - # Exactly one unique UPRN and it matches - return len(uprns) == 1 and uprns[0] == str(uprn) - - -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - - Returns a DataFrame sorted by descending lexiscore. - DOES NOT choose or return a UPRN. 
- """ - - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - - user_norm = AddressMatch.normalise_address(user_address) - - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - - # Normalise UPRN to string - out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) - - # Rank: 1 = best match - out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) - - return out.sort_values( - ["lexirank", "lexiscore"], - ascending=[True, False], - ) - - def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py new file mode 100644 index 00000000..d31b9aea --- /dev/null +++ b/backend/address2UPRN/scoring.py @@ -0,0 +1,57 @@ +import pandas as pd + +from backend.utils.addressMatch import AddressMatch + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """ + Returns True if all non-null UPRNs in df match the given uprn. + Returns False otherwise. + """ + + if column not in df.columns: + return False + + uprns = df[column].dropna().astype(str).str.strip().unique() + + if len(uprns) == 0: + return False + + return len(uprns) == 1 and uprns[0] == str(uprn) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + + Returns a DataFrame sorted by descending lexiscore. + DOES NOT choose or return a UPRN. 
+ """ + + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + + user_norm = AddressMatch.normalise_address(user_address) + + out["lexiscore"] = out[address_column].apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) + + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) + + return out.sort_values( + ["lexirank", "lexiscore"], + ascending=[True, False], + ) diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py index 230c6327..f64ab8c4 100644 --- a/datatypes/epc/domain/historic_epc.py +++ b/datatypes/epc/domain/historic_epc.py @@ -3,8 +3,96 @@ from dataclasses import dataclass @dataclass class HistoricEpc: + lmk_key: str address1: str address2: str address3: str postcode: str + building_reference_number: str + current_energy_rating: str + potential_energy_rating: str + current_energy_efficiency: str + potential_energy_efficiency: str + property_type: str + built_form: str + inspection_date: str + local_authority: str + constituency: str + county: str + lodgement_date: str + transaction_type: str + environment_impact_current: str + environment_impact_potential: str + energy_consumption_current: str + energy_consumption_potential: str + co2_emissions_current: str + co2_emiss_curr_per_floor_area: str + co2_emissions_potential: str + lighting_cost_current: str + lighting_cost_potential: str + heating_cost_current: str + heating_cost_potential: str + hot_water_cost_current: str + hot_water_cost_potential: str + total_floor_area: str + energy_tariff: str + mains_gas_flag: str + floor_level: str + flat_top_storey: str + flat_storey_count: str + main_heating_controls: str + multi_glaze_proportion: str + glazed_type: str + glazed_area: str + extension_count: 
str + number_habitable_rooms: str + number_heated_rooms: str + low_energy_lighting: str + number_open_fireplaces: str + hotwater_description: str + hot_water_energy_eff: str + hot_water_env_eff: str + floor_description: str + floor_energy_eff: str + floor_env_eff: str + windows_description: str + windows_energy_eff: str + windows_env_eff: str + walls_description: str + walls_energy_eff: str + walls_env_eff: str + secondheat_description: str + sheating_energy_eff: str + sheating_env_eff: str + roof_description: str + roof_energy_eff: str + roof_env_eff: str + mainheat_description: str + mainheat_energy_eff: str + mainheat_env_eff: str + mainheatcont_description: str + mainheatc_energy_eff: str + mainheatc_env_eff: str + lighting_description: str + lighting_energy_eff: str + lighting_env_eff: str + main_fuel: str + wind_turbine_count: str + heat_loss_corridor: str + unheated_corridor_length: str + floor_height: str + photo_supply: str + solar_water_heating_flag: str + mechanical_ventilation: str + address: str + local_authority_label: str + constituency_label: str + posttown: str + construction_age_band: str + lodgement_datetime: str + tenure: str + fixed_lighting_outlets_count: str + low_energy_fixed_light_count: str uprn: str + uprn_source: str + report_type: str diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py new file mode 100644 index 00000000..53f602ae --- /dev/null +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -0,0 +1,114 @@ +from dataclasses import dataclass +from typing import Any, Optional + +import pandas as pd +from botocore.exceptions import ClientError + +from backend.address2UPRN.scoring import get_uprn_candidates +from backend.utils.addressMatch import AddressMatch +from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 + +DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" + +_EXTRA_COLS = {"lexiscore", "lexirank"} + + 
+def _cell_to_str(v: Any) -> str: + if v is None or (isinstance(v, float) and pd.isna(v)): + return "" + s = str(v).replace("\xa0", " ") + # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". + # Treat that as missing so unambiguous_uprn truthiness checks work. + if s.lower() == "nan": + return "" + return s + + +def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: + kwargs = { + col.lower(): _cell_to_str(val) + for col, val in row.items() + if col.lower() not in _EXTRA_COLS + } + return HistoricEpc(**kwargs) + + +@dataclass(frozen=True) +class ScoredHistoricEpc: + record: HistoricEpc + lexiscore: float + lexirank: int + + +@dataclass +class HistoricEpcMatches: + user_address: str + postcode: str + matches: list[ScoredHistoricEpc] + + def top(self) -> Optional[ScoredHistoricEpc]: + return self.matches[0] if self.matches else None + + def top_n(self, k: int) -> list[ScoredHistoricEpc]: + return self.matches[:k] + + def unambiguous_uprn(self) -> Optional[str]: + top = self.top() + if top is None or top.lexiscore <= 0: + return None + rank1 = [m for m in self.matches if m.lexirank == top.lexirank] + uprns = {m.record.uprn for m in rank1 if m.record.uprn} + return next(iter(uprns)) if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + cleaned = (postcode or "").upper().replace(" ", "") + if not cleaned: + raise ValueError("postcode must contain non-whitespace characters") + if not AddressMatch.is_valid_postcode(cleaned): + raise ValueError(f"postcode {cleaned!r} is not a valid UK postcode") + return cleaned + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + s3_root: str = DEFAULT_S3_ROOT, + address_column: str = "ADDRESS", + uprn_column: str = "UPRN", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + bucket, root_prefix = parse_s3_uri(s3_root) + key = 
f"{root_prefix.rstrip('/')}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + scored = get_uprn_candidates( + df, + user_address=user_address, + address_column=address_column, + uprn_column=uprn_column, + ) + + matches = [ + ScoredHistoricEpc( + record=_row_to_historic_epc(row), + lexiscore=float(row["lexiscore"]), + lexirank=int(row["lexirank"]), + ) + for _, row in scored.iterrows() + ] + + return HistoricEpcMatches(user_address=user_address, postcode=pc, matches=matches) diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md deleted file mode 100644 index 45cc495b..00000000 --- a/datatypes/epc/domain/plan.md +++ /dev/null @@ -1,161 +0,0 @@ -# Historic EPC address-match service - -## Context - -ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc//data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. - -Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. - -## Approach - -Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. - -### 1. 
Add gzip-CSV S3 reader - -In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): - -```python -def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: - if not file_key.endswith(".csv.gz"): - raise ValueError("file_key must end with .csv.gz") - buf = read_io_from_s3(bucket_name, file_key) - return pd.read_csv(buf, compression="gzip", low_memory=False) -``` - -Reuses existing `read_io_from_s3` (line 105). Caller catches `botocore.exceptions.ClientError` for missing-key handling. - -### 2. Append matcher to domain module - -In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: - -```python -from typing import Optional -import pandas as pd -from botocore.exceptions import ClientError - -from backend.utils.addressMatch import AddressMatch -from utils.s3 import read_csv_gz_from_s3 - - -@dataclass -class HistoricEpcMatches: - """Scored historic EPC rows for a single postcode.""" - user_address: str - postcode: str # sanitised - df: pd.DataFrame # has lexiscore + lexirank, sorted best-first - - def top(self) -> Optional[pd.Series]: - return None if self.df.empty else self.df.iloc[0] - - def top_n(self, k: int) -> pd.DataFrame: - return self.df.head(k) - - def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: - if self.df.empty: - return None - top_rank = self.df["lexirank"].min() - uprns = ( - self.df.loc[self.df["lexirank"] == top_rank, uprn_column] - .dropna().astype(str).str.replace(r"\.0$", "", regex=True) - .unique() - ) - return uprns[0] if len(uprns) == 1 else None - - -def _sanitise_postcode(postcode: str) -> str: - if not postcode: - raise ValueError("postcode must be non-empty") - return postcode.upper().replace(" ", "") - - -def match_addresses_for_postcode( - user_address: str, - postcode: str, - *, - bucket: str = "retrofit-data-dev", - prefix: str = "historical_epc", - address_column: str = "ADDRESS", -) -> 
HistoricEpcMatches: - if not user_address: - raise ValueError("user_address must be non-empty") - - pc = _sanitise_postcode(postcode) - key = f"{prefix}/{pc}/data.csv.gz" - - try: - df = read_csv_gz_from_s3(bucket, key) - except ClientError as e: - if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): - raise FileNotFoundError( - f"No historic EPC data at s3://{bucket}/{key}" - ) from e - raise - - if address_column not in df.columns: - raise ValueError( - f"Missing address column {address_column!r} in {key}" - ) - - user_norm = AddressMatch.normalise_address(user_address) - df = df.copy() - df["lexiscore"] = df[address_column].fillna("").apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - df["lexirank"] = ( - df["lexiscore"].rank(method="dense", ascending=False).astype(int) - ) - df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) - - return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) -``` - -### Reuse notes -- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. -- Score column copy uses `.fillna("")` to defend against NaN in `ADDRESS`. -- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). - -### 3. Tests - -New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). - -Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). - -Cases: -1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty raises. -2. 
Returned df has `lexiscore` + `lexirank` columns, row count preserved. -3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. -4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). -5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. -6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. -7. `top()` / `top_n(k)` shape checks. - -## Critical files -- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher -- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` -- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new - -## Out of scope -- Lambda handler / SQS wiring (deferred — lib only) -- Threshold logic (caller decides via wrapper helpers) -- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) -- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern - -## Verification -``` -cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v -``` - -Sample real-S3 call (needs AWS creds): -```python -from datatypes.epc.domain.historic_epc import match_addresses_for_postcode -m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") -print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) -print(m.unambiguous_uprn()) -``` - -## Sequencing -1. Add `read_csv_gz_from_s3` to `utils/s3.py`. -2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. -3. Add tests. - -Steps 2 & 3 depend on 1. No `__init__.py` re-exports needed. 
diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py new file mode 100644 index 00000000..c23846e1 --- /dev/null +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -0,0 +1,239 @@ +from unittest.mock import patch + +import numpy as np +import pandas as pd +import pytest +from botocore.exceptions import ClientError + +from datatypes.epc.domain import historic_epc_matching as matcher_mod +from datatypes.epc.domain.historic_epc_matching import ( + HistoricEpcMatches, + ScoredHistoricEpc, + _sanitise_postcode, + match_addresses_for_postcode, +) + + +# Columns required by the HistoricEpc dataclass (lower-cased CSV columns). +# The matcher only reads ADDRESS + UPRN to score; everything else is filled +# with "" but must be present for HistoricEpc(**kwargs) to construct. +_FULL_COLUMN_FIELDS = [ + "LMK_KEY", "ADDRESS1", "ADDRESS2", "ADDRESS3", "POSTCODE", + "BUILDING_REFERENCE_NUMBER", "CURRENT_ENERGY_RATING", "POTENTIAL_ENERGY_RATING", + "CURRENT_ENERGY_EFFICIENCY", "POTENTIAL_ENERGY_EFFICIENCY", "PROPERTY_TYPE", + "BUILT_FORM", "INSPECTION_DATE", "LOCAL_AUTHORITY", "CONSTITUENCY", "COUNTY", + "LODGEMENT_DATE", "TRANSACTION_TYPE", "ENVIRONMENT_IMPACT_CURRENT", + "ENVIRONMENT_IMPACT_POTENTIAL", "ENERGY_CONSUMPTION_CURRENT", + "ENERGY_CONSUMPTION_POTENTIAL", "CO2_EMISSIONS_CURRENT", + "CO2_EMISS_CURR_PER_FLOOR_AREA", "CO2_EMISSIONS_POTENTIAL", + "LIGHTING_COST_CURRENT", "LIGHTING_COST_POTENTIAL", "HEATING_COST_CURRENT", + "HEATING_COST_POTENTIAL", "HOT_WATER_COST_CURRENT", "HOT_WATER_COST_POTENTIAL", + "TOTAL_FLOOR_AREA", "ENERGY_TARIFF", "MAINS_GAS_FLAG", "FLOOR_LEVEL", + "FLAT_TOP_STOREY", "FLAT_STOREY_COUNT", "MAIN_HEATING_CONTROLS", + "MULTI_GLAZE_PROPORTION", "GLAZED_TYPE", "GLAZED_AREA", "EXTENSION_COUNT", + "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", "HOTWATER_DESCRIPTION", "HOT_WATER_ENERGY_EFF", + "HOT_WATER_ENV_EFF", 
"FLOOR_DESCRIPTION", "FLOOR_ENERGY_EFF", "FLOOR_ENV_EFF", + "WINDOWS_DESCRIPTION", "WINDOWS_ENERGY_EFF", "WINDOWS_ENV_EFF", + "WALLS_DESCRIPTION", "WALLS_ENERGY_EFF", "WALLS_ENV_EFF", + "SECONDHEAT_DESCRIPTION", "SHEATING_ENERGY_EFF", "SHEATING_ENV_EFF", + "ROOF_DESCRIPTION", "ROOF_ENERGY_EFF", "ROOF_ENV_EFF", "MAINHEAT_DESCRIPTION", + "MAINHEAT_ENERGY_EFF", "MAINHEAT_ENV_EFF", "MAINHEATCONT_DESCRIPTION", + "MAINHEATC_ENERGY_EFF", "MAINHEATC_ENV_EFF", "LIGHTING_DESCRIPTION", + "LIGHTING_ENERGY_EFF", "LIGHTING_ENV_EFF", "MAIN_FUEL", "WIND_TURBINE_COUNT", + "HEAT_LOSS_CORRIDOR", "UNHEATED_CORRIDOR_LENGTH", "FLOOR_HEIGHT", + "PHOTO_SUPPLY", "SOLAR_WATER_HEATING_FLAG", "MECHANICAL_VENTILATION", + "ADDRESS", "LOCAL_AUTHORITY_LABEL", "CONSTITUENCY_LABEL", "POSTTOWN", + "CONSTRUCTION_AGE_BAND", "LODGEMENT_DATETIME", "TENURE", + "FIXED_LIGHTING_OUTLETS_COUNT", "LOW_ENERGY_FIXED_LIGHT_COUNT", "UPRN", + "UPRN_SOURCE", "REPORT_TYPE", +] + + +def _row(address: str, uprn) -> dict: + row = {col: "" for col in _FULL_COLUMN_FIELDS} + row["ADDRESS"] = address + row["UPRN"] = uprn + return row + + +def _build_df(rows: list[dict]) -> pd.DataFrame: + return pd.DataFrame(rows, columns=_FULL_COLUMN_FIELDS) + + +@pytest.fixture +def patch_postcode_valid(): + with patch.object(matcher_mod.AddressMatch, "is_valid_postcode", return_value=True) as m: + yield m + + +@pytest.fixture +def patch_read(): + with patch.object(matcher_mod, "read_csv_gz_from_s3") as m: + yield m + + +# ---------- _sanitise_postcode ---------- + + +class TestSanitisePostcode: + + def test_uppercases_and_strips_spaces(self, patch_postcode_valid): + assert _sanitise_postcode("ab33 8al") == "AB338AL" + + def test_empty_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="non-whitespace"): + _sanitise_postcode("") + + def test_whitespace_only_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="non-whitespace"): + _sanitise_postcode(" ") + + def 
test_invalid_postcode_raises(self): + with patch.object( + matcher_mod.AddressMatch, "is_valid_postcode", return_value=False + ): + with pytest.raises(ValueError, match="not a valid UK postcode"): + _sanitise_postcode("NONSENSE") + + +# ---------- match_addresses_for_postcode ---------- + + +class TestMatchAddressesForPostcode: + + def test_preserves_row_count_including_zero_score_rows( + self, patch_read, patch_postcode_valid + ): + # Disjoint number sets => hard zero. Still kept in matches. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("999 SOMEWHERE ELSE", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert isinstance(result, HistoricEpcMatches) + assert len(result.matches) == 2 + + def test_top_has_lexirank_one_and_lexiscore_monotone( + self, patch_read, patch_postcode_valid + ): + patch_read.return_value = _build_df([ + _row("48 GORDON ROAD", "200"), # near miss + _row("47 GORDON ROAD", "100"), # exact (after normalisation) + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.top().lexirank == 1 + scores = [m.lexiscore for m in result.matches] + assert scores == sorted(scores, reverse=True) + + def test_s3_key_built_from_default_root(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([_row("47 GORDON ROAD", "100")]) + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + patch_read.assert_called_once_with( + "retrofit-data-dev", "historical_epc/AB338AL/data.csv.gz" + ) + + def test_s3_key_respects_custom_root_with_trailing_slash( + self, patch_read, patch_postcode_valid + ): + patch_read.return_value = _build_df([_row("47 GORDON ROAD", "100")]) + match_addresses_for_postcode( + "47 Gordon Road", + "AB33 8AL", + s3_root="s3://my-bucket/some/prefix/", + ) + patch_read.assert_called_once_with( + "my-bucket", "some/prefix/AB338AL/data.csv.gz" + ) + + def test_no_such_key_translates_to_filenotfound( + self, patch_read, 
patch_postcode_valid + ): + patch_read.side_effect = ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "missing"}}, "GetObject" + ) + with pytest.raises(FileNotFoundError): + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + + def test_other_client_error_propagates(self, patch_read, patch_postcode_valid): + patch_read.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "nope"}}, "GetObject" + ) + with pytest.raises(ClientError): + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + + def test_empty_user_address_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="user_address"): + match_addresses_for_postcode("", "AB33 8AL") + + +# ---------- unambiguous_uprn ---------- + + +class TestUnambiguousUprn: + + def test_exact_match_returns_uprn(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.unambiguous_uprn() == "100" + + def test_ambiguous_tie_returns_none(self, patch_read, patch_postcode_valid): + # Two duplicate addresses with different UPRNs share rank-1. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("47 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.unambiguous_uprn() is None + + def test_all_zero_score_returns_none_even_when_uprn_unique( + self, patch_read, patch_postcode_valid + ): + # User address has building number 47; no row has 47 -> all hard-zero. 
+ patch_read.return_value = _build_df([ + _row("999 ELSEWHERE", "100"), + _row("888 ELSEWHERE", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert all(m.lexiscore == 0.0 for m in result.matches) + assert result.unambiguous_uprn() is None + + def test_nan_uprn_becomes_empty_string_not_nan( + self, patch_read, patch_postcode_valid + ): + # Use a real NaN in the UPRN cell. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", np.nan), + _row("48 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + top = result.top() + # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), + # so unambiguous_uprn's truthiness check correctly drops the row. + assert top.record.uprn == "" + + +# ---------- top / top_n ---------- + + +class TestTopHelpers: + + def test_top_n_returns_first_k(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + _row("49 GORDON ROAD", "300"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + top2 = result.top_n(2) + assert len(top2) == 2 + assert all(isinstance(m, ScoredHistoricEpc) for m in top2) + + def test_top_on_empty_matches_returns_none(self): + empty = HistoricEpcMatches(user_address="x", postcode="AB338AL", matches=[]) + assert empty.top() is None + assert empty.top_n(5) == [] + assert empty.unambiguous_uprn() is None diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py index 7b563315..a4757d23 100644 --- a/datatypes/epc/loaders/historic_epc.py +++ b/datatypes/epc/loaders/historic_epc.py @@ -1,6 +1,6 @@ import csv -from datatypes.epc.schema.historic_epc import HistoricEpc +from datatypes.epc.domain.historic_epc import HistoricEpc def _normalise(value: str | None) -> str: diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py deleted file mode 
100644 index f64ab8c4..00000000 --- a/datatypes/epc/schema/historic_epc.py +++ /dev/null @@ -1,98 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class HistoricEpc: - lmk_key: str - address1: str - address2: str - address3: str - postcode: str - building_reference_number: str - current_energy_rating: str - potential_energy_rating: str - current_energy_efficiency: str - potential_energy_efficiency: str - property_type: str - built_form: str - inspection_date: str - local_authority: str - constituency: str - county: str - lodgement_date: str - transaction_type: str - environment_impact_current: str - environment_impact_potential: str - energy_consumption_current: str - energy_consumption_potential: str - co2_emissions_current: str - co2_emiss_curr_per_floor_area: str - co2_emissions_potential: str - lighting_cost_current: str - lighting_cost_potential: str - heating_cost_current: str - heating_cost_potential: str - hot_water_cost_current: str - hot_water_cost_potential: str - total_floor_area: str - energy_tariff: str - mains_gas_flag: str - floor_level: str - flat_top_storey: str - flat_storey_count: str - main_heating_controls: str - multi_glaze_proportion: str - glazed_type: str - glazed_area: str - extension_count: str - number_habitable_rooms: str - number_heated_rooms: str - low_energy_lighting: str - number_open_fireplaces: str - hotwater_description: str - hot_water_energy_eff: str - hot_water_env_eff: str - floor_description: str - floor_energy_eff: str - floor_env_eff: str - windows_description: str - windows_energy_eff: str - windows_env_eff: str - walls_description: str - walls_energy_eff: str - walls_env_eff: str - secondheat_description: str - sheating_energy_eff: str - sheating_env_eff: str - roof_description: str - roof_energy_eff: str - roof_env_eff: str - mainheat_description: str - mainheat_energy_eff: str - mainheat_env_eff: str - mainheatcont_description: str - mainheatc_energy_eff: str - mainheatc_env_eff: str - lighting_description: 
str - lighting_energy_eff: str - lighting_env_eff: str - main_fuel: str - wind_turbine_count: str - heat_loss_corridor: str - unheated_corridor_length: str - floor_height: str - photo_supply: str - solar_water_heating_flag: str - mechanical_ventilation: str - address: str - local_authority_label: str - constituency_label: str - posttown: str - construction_age_band: str - lodgement_datetime: str - tenure: str - fixed_lighting_outlets_count: str - low_energy_fixed_light_count: str - uprn: str - uprn_source: str - report_type: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index 2170a8a6..a42f383e 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -3,7 +3,7 @@ import os import pytest from datatypes.epc.loaders.historic_epc import read_historic_epc_csv -from datatypes.epc.schema.historic_epc import HistoricEpc +from datatypes.epc.domain.historic_epc import HistoricEpc FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") diff --git a/scripts/historic_epc_demo.py b/scripts/historic_epc_demo.py new file mode 100644 index 00000000..b47c3a3c --- /dev/null +++ b/scripts/historic_epc_demo.py @@ -0,0 +1,47 @@ +"""Demo: look up historic EPC records for an address + postcode. + +Reads the gzipped CSV at + s3://retrofit-data-dev/historical_epc//data.csv.gz +scores rows against the user-provided address, and prints the top matches. 
+ +Usage: + python -m scripts.historic_epc_demo "47 Gordon Road" "AB33 8AL" + python -m scripts.historic_epc_demo # uses defaults below +""" + +import sys + +from datatypes.epc.domain.historic_epc_matching import match_addresses_for_postcode + + +def main(user_address: str, postcode: str) -> None: + print(f"Looking up: {user_address!r} @ {postcode!r}\n") + + result = match_addresses_for_postcode(user_address, postcode) + + print(f"Found {len(result.matches)} candidate row(s).\n") + + print("Top 3 matches:") + for m in result.top_n(3): + print( + f" rank={m.lexirank} score={m.lexiscore:.3f} " + f"uprn={m.record.uprn or '(none)':<14} {m.record.address}" + ) + + print() + uprn = result.unambiguous_uprn() + if uprn: + print(f"Unambiguous UPRN: {uprn}") + else: + print("No unambiguous UPRN (zero-score, tie, or empty result).") + + +if __name__ == "__main__": + args = sys.argv[1:] + if len(args) == 2: + main(args[0], args[1]) + elif len(args) == 0: + main("47 Gordon Road", "AB33 8AL") + else: + print(__doc__) + sys.exit(2) diff --git a/utils/s3.py b/utils/s3.py index 930e2e15..a28f074e 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -167,6 +167,21 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key): return df +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + """ + Read a gzipped CSV from S3 into a pandas DataFrame. + + :param bucket_name: Name of the S3 bucket. + :param file_key: Key of the file (must end in .csv.gz). + :return: A pandas DataFrame. + """ + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + + buffer = read_io_from_s3(bucket_name=bucket_name, file_key=file_key) + return pd.read_csv(buffer, compression="gzip", low_memory=False) + + def save_csv_to_s3(dataframe, bucket_name, file_name): """ Save a Pandas DataFrame to a CSV file in an S3 bucket. 
From 7ef5dc49223676cc833e7deead3e8dd2339c178e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 15:06:41 +0000 Subject: [PATCH 047/106] update csv --- backend/etl/etl_opendatacommunities/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md index bf16ba89..65441caf 100644 --- a/backend/etl/etl_opendatacommunities/README.md +++ b/backend/etl/etl_opendatacommunities/README.md @@ -1,6 +1,6 @@ This website https://epc.opendatacommunities.org/ has closed down on 30th May 2026 -So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/epc_opendatacommunities/master_backup/ ) +So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/histroical_epc/0_master_backup/ ) This scripts assumes the following: @@ -11,4 +11,4 @@ The script funciton is: 1) reads csv for all data, seperate each iteration by postcode 2) compresses the csv and save it in the location -2) only gets the postcode data, compresses and uploads to s3 -> location s3://retrofit-data-dev/epc_opendatacommunities//compressed data \ No newline at end of file +3) location s3://retrofit-data-dev/epc_opendatacommunities//compressed data.csv \ No newline at end of file From ce2b61d60b10b5608f390a267ad2fe2d92b7d164 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 15:07:09 +0000 Subject: [PATCH 048/106] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20?= =?UTF-8?q?JSON=20to=20S3=20-=20only=20make=20one=20API=20call=20?= =?UTF-8?q?=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 5 ++++- backend/magic_plan/tests/test_magic_plan_service.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 6ed25c0c..fb0a7610 100644 --- 
a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,4 +1,5 @@ import gzip +import json from datetime import datetime, timezone from typing import Optional @@ -46,8 +47,10 @@ class MagicPlanService: if matched is None: raise ValueError(f"No MagicPlan found for address: {address!r}") - magic_plan: MagicPlanPlan = self._client.get_plan(matched.id) raw_bytes: bytes = self._client.get_plan_raw(matched.id) + magic_plan: MagicPlanPlan = MagicPlanPlan.model_validate( + json.loads(raw_bytes)["data"] + ) plan: Plan = map_plan(magic_plan) uploaded_file: UploadedFile = self._upload_raw_plan_json( diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index b7580546..f6954824 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -49,7 +49,9 @@ def plan_summary() -> PlanSummary: @pytest.fixture() def mock_client() -> MagicMock: client = MagicMock(spec=MagicPlanClient) - client.get_plan_raw.return_value = b"{}" + client.get_plan_raw.return_value = ( + FIXTURE_DIR / "magicplan_api_plan_response_example.json" + ).read_bytes() return client @@ -102,7 +104,7 @@ def test_run_fetches_plan_with_matched_id( ): service.run(_make_request()) # Assert - mock_client.get_plan.assert_called_once_with(plan_summary.id) + mock_client.get_plan_raw.assert_called_once_with(plan_summary.id) def test_run_returns_mapped_plan( @@ -183,8 +185,6 @@ def test_run_uploads_to_s3_with_uprn_key( ) -> None: # Arrange mock_client.get_plans.return_value.plans = [plan_summary] - mock_client.get_plan.return_value = api_magic_plan - mock_client.get_plan_raw.return_value = b'{"raw": "data"}' request = _make_request(uprn="100023336956") service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) with patch( From 1243690d100a0d73b6a91100e02ed289f160b87a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 15:41:12 +0000 
Subject: [PATCH 049/106] give handler permission to write to s3 bucket --- backend/magic_plan/handler.py | 1 + infrastructure/terraform/lambda/magic_plan/main.tf | 5 +++++ infrastructure/terraform/shared/main.tf | 14 ++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index 45de8554..f2c03b90 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -19,6 +19,7 @@ def handler(body: dict[str, Any], context: Any) -> str: customer_id=settings.MAGICPLAN_CUSTOMER_ID, api_key=settings.MAGICPLAN_API_KEY, ) + # TODO: read s3_bucket from env var so staging/prod use the correct bucket plan: Plan = MagicPlanService(client, s3_bucket="retrofit-energy-assessments-dev").run(payload) logger.info("Saved MagicPlan plan uid=%s", plan.uid) return plan.uid diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/infrastructure/terraform/lambda/magic_plan/main.tf index 56adac1b..e2017b42 100644 --- a/infrastructure/terraform/lambda/magic_plan/main.tf +++ b/infrastructure/terraform/lambda/magic_plan/main.tf @@ -15,6 +15,11 @@ locals { db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) } +resource "aws_iam_role_policy_attachment" "magic_plan_s3_write" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.magic_plan_s3_write_arn +} + module "lambda" { source = "../../modules/lambda_with_sqs" diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 050ebdc2..e32ce395 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -745,4 +745,18 @@ module "magic_plan_client_registry" { source = "../modules/container_registry" name = "magic-plan" stage = var.stage +} + +module "magic_plan_s3_write" { + source = "../modules/s3_iam_policy" + + policy_name = "MagicPlanWriteS3" + policy_description = "Allow MagicPlan Lambda to 
write to retrofit energy assessments bucket" + bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] + actions = ["s3:PutObject", "s3:AbortMultipartUpload"] + resource_paths = ["/*"] +} + +output "magic_plan_s3_write_arn" { + value = module.magic_plan_s3_write.policy_arn } \ No newline at end of file From aadf73ed87db510efc17a1579122d7bc5e65ca15 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 15:44:14 +0000 Subject: [PATCH 050/106] combine s3 write policies into one and apply to pashub and magicplan lambdas --- .../terraform/lambda/magic_plan/main.tf | 2 +- .../terraform/lambda/pashub_to_ara/main.tf | 2 +- infrastructure/terraform/shared/main.tf | 23 ++++--------------- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/infrastructure/terraform/lambda/magic_plan/main.tf index e2017b42..48fc3867 100644 --- a/infrastructure/terraform/lambda/magic_plan/main.tf +++ b/infrastructure/terraform/lambda/magic_plan/main.tf @@ -17,7 +17,7 @@ locals { resource "aws_iam_role_policy_attachment" "magic_plan_s3_write" { role = module.lambda.role_name - policy_arn = data.terraform_remote_state.shared.outputs.magic_plan_s3_write_arn + policy_arn = data.terraform_remote_state.shared.outputs.energy_assessments_s3_write_arn } module "lambda" { diff --git a/infrastructure/terraform/lambda/pashub_to_ara/main.tf b/infrastructure/terraform/lambda/pashub_to_ara/main.tf index 1a457617..902d7845 100644 --- a/infrastructure/terraform/lambda/pashub_to_ara/main.tf +++ b/infrastructure/terraform/lambda/pashub_to_ara/main.tf @@ -54,5 +54,5 @@ module "lambda" { resource "aws_iam_role_policy_attachment" "pashub_to_ara_s3_write" { role = module.lambda.role_name - policy_arn = data.terraform_remote_state.shared.outputs.pashub_to_ara_s3_write_arn + policy_arn = data.terraform_remote_state.shared.outputs.energy_assessments_s3_write_arn } diff --git a/infrastructure/terraform/shared/main.tf 
b/infrastructure/terraform/shared/main.tf index e32ce395..2c3200de 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -568,18 +568,18 @@ module "pashub_to_ara_registry" { stage = var.stage } -module "pashub_to_ara_s3_write" { +module "energy_assessments_s3_write" { source = "../modules/s3_iam_policy" - policy_name = "PashubToAraWriteS3" - policy_description = "Allow PasHub to ARA Lambda to write to retrofit energy assessments bucket" + policy_name = "EnergyAssessmentsWriteS3" + policy_description = "Allow lambdas to write to retrofit energy assessments bucket" bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] actions = ["s3:PutObject", "s3:AbortMultipartUpload"] resource_paths = ["/*"] } -output "pashub_to_ara_s3_write_arn" { - value = module.pashub_to_ara_s3_write.policy_arn +output "energy_assessments_s3_write_arn" { + value = module.energy_assessments_s3_write.policy_arn } ################################################ @@ -747,16 +747,3 @@ module "magic_plan_client_registry" { stage = var.stage } -module "magic_plan_s3_write" { - source = "../modules/s3_iam_policy" - - policy_name = "MagicPlanWriteS3" - policy_description = "Allow MagicPlan Lambda to write to retrofit energy assessments bucket" - bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] - actions = ["s3:PutObject", "s3:AbortMultipartUpload"] - resource_paths = ["/*"] -} - -output "magic_plan_s3_write_arn" { - value = module.magic_plan_s3_write.policy_arn -} \ No newline at end of file From fb758b76bf2dcecbed486b569b9fa5e345a85ddc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:37:44 +0000 Subject: [PATCH 051/106] changed to utils --- datatypes/epc/domain/historic_epc_matching.py | 16 +++------------- .../domain/tests/test_historic_epc_matching.py | 2 +- utils/pandas_utils.py | 14 ++++++++++++++ utils/s3.py | 2 -- 4 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 
utils/pandas_utils.py diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 53f602ae..2eb590e8 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import pandas as pd from botocore.exceptions import ClientError @@ -7,6 +7,7 @@ from botocore.exceptions import ClientError from backend.address2UPRN.scoring import get_uprn_candidates from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.pandas_utils import pandas_cell_to_str from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" @@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _cell_to_str(v: Any) -> str: - if v is None or (isinstance(v, float) and pd.isna(v)): - return "" - s = str(v).replace("\xa0", " ") - # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". - # Treat that as missing so unambiguous_uprn truthiness checks work. 
- if s.lower() == "nan": - return "" - return s - - def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: kwargs = { - col.lower(): _cell_to_str(val) + col.lower(): pandas_cell_to_str(val) for col, val in row.items() if col.lower() not in _EXTRA_COLS } diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py index c23846e1..1c3ee6d4 100644 --- a/datatypes/epc/domain/tests/test_historic_epc_matching.py +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -211,7 +211,7 @@ class TestUnambiguousUprn: ]) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") top = result.top() - # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), + # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), # so unambiguous_uprn's truthiness check correctly drops the row. assert top.record.uprn == "" diff --git a/utils/pandas_utils.py b/utils/pandas_utils.py new file mode 100644 index 00000000..b32cde10 --- /dev/null +++ b/utils/pandas_utils.py @@ -0,0 +1,14 @@ +from typing import Any + +import pandas as pd + + +def pandas_cell_to_str(v: Any) -> str: + if v is None or (isinstance(v, float) and pd.isna(v)): + return "" + s = str(v).replace("\xa0", " ") + # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". + # Treat that as missing so unambiguous_uprn truthiness checks work. 
+ if s.lower() == "nan": + return "" + return s diff --git a/utils/s3.py b/utils/s3.py index a28f074e..13d272e7 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -6,8 +6,6 @@ from io import BytesIO, StringIO from urllib.parse import unquote from utils.logger import setup_logger from botocore.exceptions import NoCredentialsError, PartialCredentialsError -from typing import Any - logger = setup_logger() From dccb35c2bc05f613bb767958b2a0ea7e517433e8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:44:55 +0000 Subject: [PATCH 052/106] fixed s3 location --- backend/etl/etl_opendatacommunities/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md index 65441caf..728ac468 100644 --- a/backend/etl/etl_opendatacommunities/README.md +++ b/backend/etl/etl_opendatacommunities/README.md @@ -11,4 +11,4 @@ The script funciton is: 1) reads csv for all data, seperate each iteration by postcode 2) compresses the csv and save it in the location -3) location s3://retrofit-data-dev/epc_opendatacommunities//compressed data.csv \ No newline at end of file +3) location s3://retrofit-data-dev/historical_epc//compressed data.csv \ No newline at end of file From bf91722f30699124af4b36c23efcb108d09eeab8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:45:26 +0000 Subject: [PATCH 053/106] renamed a function to be self commenting --- datatypes/epc/domain/historic_epc_matching.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 2eb590e8..95ca9d9f 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -15,7 +15,7 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: 
+def _map_historic_epc_pandas_row_to_domain(row: pd.Series) -> HistoricEpc: kwargs = { col.lower(): pandas_cell_to_str(val) for col, val in row.items() @@ -94,7 +94,7 @@ def match_addresses_for_postcode( matches = [ ScoredHistoricEpc( - record=_row_to_historic_epc(row), + record=_map_historic_epc_pandas_row_to_domain(row), lexiscore=float(row["lexiscore"]), lexirank=int(row["lexirank"]), ) From 2049553176ee88e1fefa2062f300d036e58206a1 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 11 May 2026 09:25:41 +0000 Subject: [PATCH 054/106] =?UTF-8?q?Trigger=20MagicPlan=20on=20outcome=20"s?= =?UTF-8?q?urveyed"=20transition=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 7 +-- etl/hubspot/tests/test_hubspot_deal_differ.py | 49 ++++++++++--------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 5435a46d..ba3dc27a 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -198,12 +198,7 @@ class HubspotDealDiffer: def check_for_magicplan_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - new_status = (new_deal.get("coordination_status__stage_1_") or "").lower() - old_status = (old_deal.coordination_status or "").lower() - return ( - new_status in HubspotDealDiffer.COORDINATION_COMPLETE - and old_status not in HubspotDealDiffer.COORDINATION_COMPLETE - ) + raise NotImplementedError @staticmethod def _lodgement_completed( diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 273a82a0..94952424 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -275,15 +275,12 @@ def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_ # ========================== -def 
test_magicplan_trigger__transitions_to_coordination_complete__returns_true() -> None: +def test_magicplan_trigger__outcome_transitions_to_surveyed__returns_true() -> None: deal_id = uuid.uuid4() # Arrange - old_deal = make_old_deal(id=deal_id, coordination_status="in progress") - new_deal = make_new_deal( - deal_id, - **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, - ) + old_deal = make_old_deal(id=deal_id, outcome="assessed") + new_deal = make_new_deal(deal_id, outcome="surveyed") # Act result = HubspotDealDiffer.check_for_magicplan_trigger( @@ -295,20 +292,12 @@ def test_magicplan_trigger__transitions_to_coordination_complete__returns_true() assert result is True -def test_magicplan_trigger__already_in_coordination_complete_unrelated_change__returns_false() -> None: +def test_magicplan_trigger__outcome_already_surveyed__returns_false() -> None: deal_id = uuid.uuid4() # Arrange - old_deal = make_old_deal( - id=deal_id, - coordination_status="(v1) ioe/mtp complete", - outcome="pending", - ) - new_deal = make_new_deal( - deal_id, - **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, - outcome="won", - ) + old_deal = make_old_deal(id=deal_id, outcome="surveyed") + new_deal = make_new_deal(deal_id, outcome="surveyed") # Act result = HubspotDealDiffer.check_for_magicplan_trigger( @@ -320,15 +309,12 @@ def test_magicplan_trigger__already_in_coordination_complete_unrelated_change__r assert result is False -def test_magicplan_trigger__transitions_to_non_complete_coordination_status__returns_false() -> None: +def test_magicplan_trigger__outcome_transitions_to_non_surveyed__returns_false() -> None: deal_id = uuid.uuid4() # Arrange - old_deal = make_old_deal(id=deal_id, coordination_status="in progress") - new_deal = make_new_deal( - deal_id, - **{"coordination_status__stage_1_": "design submitted"}, - ) + old_deal = make_old_deal(id=deal_id, outcome="assessed") + new_deal = make_new_deal(deal_id, outcome="assessed") # Act result = 
HubspotDealDiffer.check_for_magicplan_trigger( @@ -340,6 +326,23 @@ def test_magicplan_trigger__transitions_to_non_complete_coordination_status__ret assert result is False +def test_magicplan_trigger__outcome_surveyed_uppercase__returns_true() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal(id=deal_id, outcome="assessed") + new_deal = make_new_deal(deal_id, outcome="SURVEYED") + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is True + + # ======================= # DB UPDATE TRIGGER TESTS # ======================= From c15ffdf2c01765b0baa3e1fb371afe8c54e462c4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 11 May 2026 09:26:20 +0000 Subject: [PATCH 055/106] =?UTF-8?q?Trigger=20MagicPlan=20on=20outcome=20"s?= =?UTF-8?q?urveyed"=20transition=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index ba3dc27a..724a3e68 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -198,7 +198,9 @@ class HubspotDealDiffer: def check_for_magicplan_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - raise NotImplementedError + new_outcome = (new_deal.get("outcome") or "").lower() + old_outcome = (old_deal.outcome or "").lower() + return new_outcome == "surveyed" and old_outcome != "surveyed" @staticmethod def _lodgement_completed( From 9aae5bf482a522e5b2fc4cb41174b7ef05ff1b07 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 15:20:17 +0000 Subject: [PATCH 056/106] added logic to deal with flats --- backend/address2UPRN/main.py | 92 ++++++++++++++++++++++++++++++----- backend/utils/addressMatch.py | 23 +++++++++ 2 files changed, 102 insertions(+), 13 
deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index fad5c64e..0938a53b 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -24,22 +24,53 @@ from backend.utils.addressMatch import ( logger = setup_logger() -OPEN_EPC_API_TOKEN = os.getenv("OPEN_EPC_API_TOKEN") - -if OPEN_EPC_API_TOKEN is None: - raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env") - - def get_epc_data_with_postcode(postcode: str) -> pd.DataFrame: from backend.epc_client.client import EpcClientService - service = EpcClientService(auth_token=OPEN_EPC_API_TOKEN) + token = os.getenv("OPEN_EPC_API_TOKEN") + if token is None: + raise RuntimeError("OPEN_EPC_API_TOKEN not defined in env") + + service = EpcClientService(auth_token=token) results = service.search_by_postcode(postcode) return pd.DataFrame( [{"address": r.address_line_1, "uprn": r.uprn} for r in results] ) +def get_uprn_from_historic_epc( + user_inputed_address: str, + postcode: str, +) -> Optional[tuple[str, str, float]]: + """Resolve a UPRN via historic EPC S3 data. + + Returns (uprn, address, lexiscore) when the historic dataset agrees on a + single rank-1 UPRN, None otherwise (missing postcode file, zero score, + or ambiguous top rank). The score gate is `unambiguous_uprn`'s own + (score > 0); the 0.7 heuristic used for the new-EPC source isn't applied + here because historic addresses use a more verbose format that + systematically depresses lexiscores. 
+ """ + from datatypes.epc.domain.historic_epc_matching import ( + match_addresses_for_postcode, + ) + + try: + result = match_addresses_for_postcode(user_inputed_address, postcode) + except FileNotFoundError: + return None + + uprn = result.unambiguous_uprn() + if not uprn or uprn in ("", "nan"): + return None + + top = result.top() + if top is None: + return None + + return (uprn, top.record.address, top.lexiscore) + + def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, @@ -95,20 +126,37 @@ def get_uprn( ): """ Return uprn (str) - Return False if failed to find a sensible matching epc - Return None when epc found but no UPRN + Return None when no sensible match is found in either EPC source. - This function fetches EPC data via API for a single postcode. - For processing multiple addresses in the same postcode, use get_uprn_with_epc_df instead. + Tries the new EPC API first; if that yields no confident match, falls + back to the historic EPC dataset on S3. + + For processing multiple addresses in the same postcode, use + get_uprn_with_epc_df instead. 
""" df = get_epc_data_with_postcode(postcode=postcode) - return get_uprn_with_epc_df( + result = get_uprn_with_epc_df( user_inputed_address=user_inputed_address, epc_df=df, - verbose=verbose, + verbose=True, ) + if not result: + result = get_uprn_from_historic_epc( + user_inputed_address=user_inputed_address, + postcode=postcode, + ) + if result: + logger.info( + f"Historic EPC matched {user_inputed_address} in {postcode}" + ) + + if not result: + return None + + return result if verbose else result[0] + def resolve_uprns_for_postcode_group( group_df: pd.DataFrame, @@ -379,6 +427,7 @@ def handler(event, context, local=False): ) continue + # Process each address in this postcode with the same EPC data for row in postcode_rows: try: @@ -404,6 +453,23 @@ def handler(event, context, local=False): verbose=True, ) + # Fallback to historic EPC if new EPC produced no match + if not result: + try: + result = get_uprn_from_historic_epc( + user_inputed_address=address2uprn_user_input, + postcode=postcode, + ) + except Exception as e: + logger.error( + f"Historic EPC lookup failed for {address2uprn_user_input} in {postcode}: {e}" + ) + result = None + if result: + logger.info( + f"Historic EPC matched {address2uprn_user_input} in {postcode}" + ) + # Parse result tuple if successful if result: uprn, found_address, score = result diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index a0c6ebdf..1435a629 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -178,6 +178,29 @@ class AddressMatch: tok in a_norm for tok in ("flat", "apt", "apartment", "unit") ) has_flat_token_epc = "flat" in b_norm + # Slash-format like "3/137a" is an implicit flat reference + # (flat 3 of 137a) even without a "flat" keyword. + has_implicit_flat_user = bool(re.search(r"\d+\s*/\s*\d+", a_norm)) + # If the user named a street, their leading number is a house number, + # not a flat number — so an EPC "Flat N, …" candidate is a wrong unit. 
+ # Without a street token (e.g. "2 College House"), the user may be + # implicitly naming a flat in a named building; don't apply the guard. + STREET_TYPE_TOKENS = { + "road", "street", "lane", "avenue", "close", "way", + "crescent", "court", "drive", "place", "terrace", "mews", + "gardens", "square", "grove", "park", "walk", "row", + "green", "hill", "rise", "parade", "broadway", + } + user_tokens = set(a_norm.split()) + has_street_type_user = bool(user_tokens & STREET_TYPE_TOKENS) + + if ( + has_flat_token_epc + and not has_flat_token_user + and not has_implicit_flat_user + and has_street_type_user + ): + return 0.0 if ( len(seq_a) == 2 From 197e9a0e009565b72ad461f1f49ec26cb13226da Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 15:21:16 +0000 Subject: [PATCH 057/106] added histroci_epc.csv --- datatypes/epc/schema/tests/fixtures/historic_epc.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 datatypes/epc/schema/tests/fixtures/historic_epc.csv diff --git a/datatypes/epc/schema/tests/fixtures/historic_epc.csv b/datatypes/epc/schema/tests/fixtures/historic_epc.csv new file mode 100644 index 00000000..b4c10739 --- /dev/null +++ b/datatypes/epc/schema/tests/fixtures/historic_epc.csv @@ -0,0 +1,2 @@ 
+LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,POTENTIAL_ENERGY_EFFICIENCY,PROPERTY_TYPE,BUILT_FORM,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,TRANSACTION_TYPE,ENVIRONMENT_IMPACT_CURRENT,ENVIRONMENT_IMPACT_POTENTIAL,ENERGY_CONSUMPTION_CURRENT,ENERGY_CONSUMPTION_POTENTIAL,CO2_EMISSIONS_CURRENT,CO2_EMISS_CURR_PER_FLOOR_AREA,CO2_EMISSIONS_POTENTIAL,LIGHTING_COST_CURRENT,LIGHTING_COST_POTENTIAL,HEATING_COST_CURRENT,HEATING_COST_POTENTIAL,HOT_WATER_COST_CURRENT,HOT_WATER_COST_POTENTIAL,TOTAL_FLOOR_AREA,ENERGY_TARIFF,MAINS_GAS_FLAG,FLOOR_LEVEL,FLAT_TOP_STOREY,FLAT_STOREY_COUNT,MAIN_HEATING_CONTROLS,MULTI_GLAZE_PROPORTION,GLAZED_TYPE,GLAZED_AREA,EXTENSION_COUNT,NUMBER_HABITABLE_ROOMS,NUMBER_HEATED_ROOMS,LOW_ENERGY_LIGHTING,NUMBER_OPEN_FIREPLACES,HOTWATER_DESCRIPTION,HOT_WATER_ENERGY_EFF,HOT_WATER_ENV_EFF,FLOOR_DESCRIPTION,FLOOR_ENERGY_EFF,FLOOR_ENV_EFF,WINDOWS_DESCRIPTION,WINDOWS_ENERGY_EFF,WINDOWS_ENV_EFF,WALLS_DESCRIPTION,WALLS_ENERGY_EFF,WALLS_ENV_EFF,SECONDHEAT_DESCRIPTION,SHEATING_ENERGY_EFF,SHEATING_ENV_EFF,ROOF_DESCRIPTION,ROOF_ENERGY_EFF,ROOF_ENV_EFF,MAINHEAT_DESCRIPTION,MAINHEAT_ENERGY_EFF,MAINHEAT_ENV_EFF,MAINHEATCONT_DESCRIPTION,MAINHEATC_ENERGY_EFF,MAINHEATC_ENV_EFF,LIGHTING_DESCRIPTION,LIGHTING_ENERGY_EFF,LIGHTING_ENV_EFF,MAIN_FUEL,WIND_TURBINE_COUNT,HEAT_LOSS_CORRIDOR,UNHEATED_CORRIDOR_LENGTH,FLOOR_HEIGHT,PHOTO_SUPPLY,SOLAR_WATER_HEATING_FLAG,MECHANICAL_VENTILATION,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,CONSTRUCTION_AGE_BAND,LODGEMENT_DATETIME,TENURE,FIXED_LIGHTING_OUTLETS_COUNT,LOW_ENERGY_FIXED_LIGHT_COUNT,UPRN,UPRN_SOURCE,REPORT_TYPE +9292c3bf26a8876ce59274401ea73e3de5bd0b3e52a507c2162a46e57db8ea2f,47 GORDON ROAD,ALFORD,,AB33 8AL,10001111325,E,B,42,87,House,Semi-Detached,2021-04-11,,Unknown,,2021-04-12,ECO assessment,49,69,450,299,5.5,76,3.6,69,77,1579,715,349,118,72.0,Single,N,,,,,100.0,"double glazing, unknown install 
date",Normal,0.0,3.0,3.0,86.0,0.0,"Electric immersion, standard tariff",Very Poor,Poor,"Solid, no insulation (assumed)",,,Fully double glazed,Average,Average,"Granite or whinstone, as built, partial insulation (assumed)",Average,Average,,,,"Pitched, 100 mm loft insulation",Average,Average,"Room heaters, electric",Very Poor,Poor,Appliance thermostats,Good,Good,Low energy lighting in 86% of fixed outlets,Very Good,Very Good,electricity (not community),0.0,,,2.4,0.0,N,natural,"47 GORDON ROAD, ALFORD",,,ALFORD,England and Wales: 1976-1982,2021-04-12 21:45:35,Rented (private),7.0,,151020766.0,Energy Assessor,100 \ No newline at end of file From 1934c889b0892ef521e38274797df967bd282bfb Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 16:23:03 +0000 Subject: [PATCH 058/106] refactored test to deal with flats better --- backend/address2UPRN/tests/test_data.csv | 5 +++-- backend/utils/addressMatch.py | 28 ++++++++++-------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv index ee23813b..408edc29 100644 --- a/backend/address2UPRN/tests/test_data.csv +++ b/backend/address2UPRN/tests/test_data.csv @@ -168,8 +168,8 @@ FLAT 8 599 HARROW ROAD,W10 4RA,None "Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383 24b Honley Road,SE6 2HZ,None FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 -2 COLLEGE HOUSE,CM7 1JS,100091449870 -3 COLLEGE HOUSE,CM7 1JS,100091449871 +2 COLLEGE HOUSE,CM7 1JS,None +3 COLLEGE HOUSE,CM7 1JS,None 1 Anita Street,M4 5DU,None 2 Anita Street,M4 5DU,77123061 5 Anita Street,M4 5DU,77123081 @@ -279,6 +279,7 @@ FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 80a Victoria Square,M4 5DZ,77211231 81a Victoria Square,M4 5DZ,77211232 82 Victoria Square,M4 5DZ,None +82a Victoria Square,M4 5DZ,77211233 83a Victoria Square,M4 5DZ,77211234 84a Victoria Square,M4 5DZ,None 85a Victoria Square,M4 5DZ,77211236 diff --git a/backend/utils/addressMatch.py 
b/backend/utils/addressMatch.py index 1435a629..ee9d1004 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -127,6 +127,7 @@ class AddressMatch: Assumes formats like: - '42 moreton road' - 'flat 3 42 moreton road' + - '82 a victoria square' (recombined to '82a') """ tokens = s.split() @@ -142,9 +143,15 @@ class AddressMatch: continue cleaned.append(t) - # first remaining number is building number - for t in cleaned: - if re.fullmatch(r"\d+[a-z]?", t): + # first remaining number is building number; recombine with a + # single-letter suffix when normalisation has split "82a" → "82 a" + for i, t in enumerate(cleaned): + if re.fullmatch(r"\d+[a-z]", t): + return t + if re.fullmatch(r"\d+", t): + nxt = cleaned[i + 1] if i + 1 < len(cleaned) else None + if nxt is not None and re.fullmatch(r"[a-z]", nxt): + return t + nxt return t return None @@ -181,24 +188,13 @@ class AddressMatch: # Slash-format like "3/137a" is an implicit flat reference # (flat 3 of 137a) even without a "flat" keyword. has_implicit_flat_user = bool(re.search(r"\d+\s*/\s*\d+", a_norm)) - # If the user named a street, their leading number is a house number, - # not a flat number — so an EPC "Flat N, …" candidate is a wrong unit. - # Without a street token (e.g. "2 College House"), the user may be - # implicitly naming a flat in a named building; don't apply the guard. - STREET_TYPE_TOKENS = { - "road", "street", "lane", "avenue", "close", "way", - "crescent", "court", "drive", "place", "terrace", "mews", - "gardens", "square", "grove", "park", "walk", "row", - "green", "hill", "rise", "parade", "broadway", - } - user_tokens = set(a_norm.split()) - has_street_type_user = bool(user_tokens & STREET_TYPE_TOKENS) + # EPC says it's a flat but user gave no flat indication + # (neither keyword nor slash-format). Unlikely to be the right unit. 
if ( has_flat_token_epc and not has_flat_token_user and not has_implicit_flat_user - and has_street_type_user ): return 0.0 From f0300eb8ff4749da93a49a259c88f448ab7dee08 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 08:57:24 +0000 Subject: [PATCH 059/106] =?UTF-8?q?Replace=20new-deal=20MagicPlan=20trigge?= =?UTF-8?q?r=20to=20use=20outcome=3D=3D"surveyed"=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 16 ++--- .../hubspot/tests/test_scraper_handler.py | 65 +++++++++---------- pytest.ini | 2 +- 3 files changed, 39 insertions(+), 44 deletions(-) rename backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py => etl/hubspot/tests/test_scraper_handler.py (61%) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 724a3e68..da0072c1 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -162,6 +162,14 @@ class HubspotDealDiffer: return False + @staticmethod + def check_for_magicplan_trigger( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + new_outcome = (new_deal.get("outcome") or "").lower() + old_outcome = (old_deal.outcome or "").lower() + return new_outcome == "surveyed" and old_outcome != "surveyed" + @staticmethod def _has_valid_pashub_link(new_pashub_link: str) -> bool: return bool(new_pashub_link) @@ -194,14 +202,6 @@ class HubspotDealDiffer: and new_status != old_deal.design_status ) - @staticmethod - def check_for_magicplan_trigger( - new_deal: Dict[str, str], old_deal: HubspotDealData - ) -> bool: - new_outcome = (new_deal.get("outcome") or "").lower() - old_outcome = (old_deal.outcome or "").lower() - return new_outcome == "surveyed" and old_outcome != "surveyed" - @staticmethod def _lodgement_completed( new_deal: Dict[str, str], old_deal: HubspotDealData diff --git a/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py 
b/etl/hubspot/tests/test_scraper_handler.py similarity index 61% rename from backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py rename to etl/hubspot/tests/test_scraper_handler.py index 6d18c4b4..e2f80d07 100644 --- a/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py +++ b/etl/hubspot/tests/test_scraper_handler.py @@ -3,37 +3,28 @@ import uuid from typing import Any, Dict, Optional from unittest.mock import MagicMock, patch -import pytest - from backend.app.db.models.hubspot_deal_data import HubspotDealData from etl.hubspot.scripts.scraper.main import handler -COORDINATION_COMPLETE = "(v1) ioe/mtp complete" DEAL_NAME = "123 Main Street" UPRN = "12345678" DEAL_ID = "999" MAGICPLAN_QUEUE_URL = "https://sqs.eu-west-2.amazonaws.com/123/magic-plan-dev" -def make_hubspot_deal( - coordination_status: Optional[str] = None, **kwargs: Any -) -> Dict[str, Any]: - deal: Dict[str, Any] = { +def make_hubspot_deal(**kwargs: Any) -> Dict[str, Any]: + return { "hs_object_id": DEAL_ID, "dealname": DEAL_NAME, "pashub_link": None, **kwargs, } - if coordination_status is not None: - deal["coordination_status__stage_1_"] = coordination_status - return deal -def make_db_deal(coordination_status: Optional[str] = None, **kwargs: Any) -> HubspotDealData: +def make_db_deal(**kwargs: Any) -> HubspotDealData: return HubspotDealData( id=uuid.uuid4(), deal_id=DEAL_ID, - coordination_status=coordination_status, **kwargs, ) @@ -68,14 +59,14 @@ def run_handler( return mock_sqs -# ======================= -# NEW DEAL PATH -# ======================= +# ==================================== +# NEW DEAL PATH - MagicPlan trigger +# ==================================== -def test_new_deal_in_coordination_complete__sends_sqs_message() -> None: +def test_new_deal__outcome_is_surveyed__triggers_magicplan() -> None: # Arrange - hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + hubspot_deal = make_hubspot_deal(outcome="surveyed") listing = {"national_uprn": 
UPRN} # Act @@ -84,13 +75,15 @@ def test_new_deal_in_coordination_complete__sends_sqs_message() -> None: # Assert mock_sqs.send_message.assert_called_once_with( QueueUrl=MAGICPLAN_QUEUE_URL, - MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + MessageBody=json.dumps( + {"address": DEAL_NAME, "hubspot_deal_id": DEAL_ID, "uprn": UPRN} + ), ) -def test_new_deal_not_in_coordination_complete__no_sqs_message() -> None: +def test_new_deal__outcome_is_not_surveyed__does_not_trigger_magicplan() -> None: # Arrange - hubspot_deal = make_hubspot_deal(coordination_status="in progress") + hubspot_deal = make_hubspot_deal(outcome="assessed") # Act mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) @@ -99,9 +92,9 @@ def test_new_deal_not_in_coordination_complete__no_sqs_message() -> None: mock_sqs.send_message.assert_not_called() -def test_new_deal_with_no_listing__uprn_is_none_in_message() -> None: +def test_new_deal__outcome_is_surveyed__no_listing__magicplan_message_uprn_is_null() -> None: # Arrange - hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + hubspot_deal = make_hubspot_deal(outcome="surveyed") # Act mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) @@ -109,19 +102,21 @@ def test_new_deal_with_no_listing__uprn_is_none_in_message() -> None: # Assert mock_sqs.send_message.assert_called_once_with( QueueUrl=MAGICPLAN_QUEUE_URL, - MessageBody=json.dumps({"address": DEAL_NAME, "uprn": None}), + MessageBody=json.dumps( + {"address": DEAL_NAME, "hubspot_deal_id": DEAL_ID, "uprn": None} + ), ) -# ======================= -# EXISTING DEAL PATH -# ======================= +# ========================================== +# EXISTING DEAL PATH - MagicPlan trigger +# ========================================== -def test_existing_deal_transitions_to_coordination_complete__sends_sqs_message() -> None: +def test_existing_deal__outcome_transitions_to_surveyed__triggers_magicplan() -> None: # Arrange 
- db_deal = make_db_deal(coordination_status="in progress") - hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + db_deal = make_db_deal(outcome="assessed") + hubspot_deal = make_hubspot_deal(outcome="surveyed") listing = {"national_uprn": UPRN} # Act @@ -130,16 +125,16 @@ def test_existing_deal_transitions_to_coordination_complete__sends_sqs_message() # Assert mock_sqs.send_message.assert_called_once_with( QueueUrl=MAGICPLAN_QUEUE_URL, - MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + MessageBody=json.dumps( + {"address": DEAL_NAME, "hubspot_deal_id": DEAL_ID, "uprn": UPRN} + ), ) -def test_existing_deal_already_in_coordination_complete_unrelated_change__no_sqs_message() -> None: +def test_existing_deal__outcome_already_surveyed__unrelated_change__does_not_trigger_magicplan() -> None: # Arrange - db_deal = make_db_deal(coordination_status=COORDINATION_COMPLETE, dealname="Old Name") - hubspot_deal = make_hubspot_deal( - coordination_status=COORDINATION_COMPLETE, dealname="New Name" - ) + db_deal = make_db_deal(outcome="surveyed", dealname="Old Name") + hubspot_deal = make_hubspot_deal(outcome="surveyed", dealname="New Name") # Act mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) diff --git a/pytest.ini b/pytest.ini index 398c5b71..e2a4a25d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath = . 
log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests markers = integration: mark a test as an integration test From 9386846044e087b38a15bbfae3dd306f376cd205 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 08:58:15 +0000 Subject: [PATCH 060/106] =?UTF-8?q?Replace=20new-deal=20MagicPlan=20trigge?= =?UTF-8?q?r=20to=20use=20outcome=3D=3D"surveyed"=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/scripts/scraper/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 32007cd4..86844352 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -57,8 
+57,7 @@ def handler(body: dict[str, Any], context: Any) -> None: ) _trigger_pashub_fetcher(sqs_client, hubspot_deal_id, hubspot_deal) - coordination_status = (hubspot_deal.get("coordination_status__stage_1_") or "").lower() - if coordination_status in HubspotDealDiffer.COORDINATION_COMPLETE: + if (hubspot_deal.get("outcome") or "").lower() == "surveyed": logger.info( f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" ) From 9501146ec815b2f2998017acb646571c922ba277 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 08:59:32 +0000 Subject: [PATCH 061/106] =?UTF-8?q?Replace=20new-deal=20MagicPlan=20trigge?= =?UTF-8?q?r=20to=20use=20outcome=3D=3D"surveyed"=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/hubspot_trigger_orchestrator/__init__.py | 0 backend/hubspot_trigger_orchestrator/tests/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 backend/hubspot_trigger_orchestrator/__init__.py delete mode 100644 backend/hubspot_trigger_orchestrator/tests/__init__.py diff --git a/backend/hubspot_trigger_orchestrator/__init__.py b/backend/hubspot_trigger_orchestrator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/hubspot_trigger_orchestrator/tests/__init__.py b/backend/hubspot_trigger_orchestrator/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 From 71aadfe78d1237cca0721707d97b7fc01e6bb6c3 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 09:31:47 +0000 Subject: [PATCH 062/106] add pashub functions to orchestrator tests, and rename existing magicplan ones --- etl/hubspot/tests/test_scraper_handler.py | 96 +++++++++++++++++++++-- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/etl/hubspot/tests/test_scraper_handler.py b/etl/hubspot/tests/test_scraper_handler.py index e2f80d07..4810d171 100644 --- a/etl/hubspot/tests/test_scraper_handler.py +++ 
b/etl/hubspot/tests/test_scraper_handler.py @@ -9,7 +9,9 @@ from etl.hubspot.scripts.scraper.main import handler DEAL_NAME = "123 Main Street" UPRN = "12345678" DEAL_ID = "999" +PASHUB_LINK = "https://pashub.example.com/deal/999" MAGICPLAN_QUEUE_URL = "https://sqs.eu-west-2.amazonaws.com/123/magic-plan-dev" +PASHUB_QUEUE_URL = "https://sqs.test/pashub" def make_hubspot_deal(**kwargs: Any) -> Dict[str, Any]: @@ -52,7 +54,7 @@ def run_handler( ) mock_boto3.client.return_value = mock_sqs mock_settings.return_value.MAGICPLAN_SQS_URL = MAGICPLAN_QUEUE_URL - mock_settings.return_value.PASHUB_TO_ARA_SQS_URL = "https://sqs.test/pashub" + mock_settings.return_value.PASHUB_TO_ARA_SQS_URL = PASHUB_QUEUE_URL handler.__wrapped__({"hubspot_deal_id": DEAL_ID}, "") @@ -64,7 +66,7 @@ def run_handler( # ==================================== -def test_new_deal__outcome_is_surveyed__triggers_magicplan() -> None: +def test_new_deal_surveyed__sends_magicplan_sqs() -> None: # Arrange hubspot_deal = make_hubspot_deal(outcome="surveyed") listing = {"national_uprn": UPRN} @@ -81,7 +83,7 @@ def test_new_deal__outcome_is_surveyed__triggers_magicplan() -> None: ) -def test_new_deal__outcome_is_not_surveyed__does_not_trigger_magicplan() -> None: +def test_new_deal_not_surveyed__no_magicplan_sqs() -> None: # Arrange hubspot_deal = make_hubspot_deal(outcome="assessed") @@ -92,7 +94,7 @@ def test_new_deal__outcome_is_not_surveyed__does_not_trigger_magicplan() -> None mock_sqs.send_message.assert_not_called() -def test_new_deal__outcome_is_surveyed__no_listing__magicplan_message_uprn_is_null() -> None: +def test_new_deal_surveyed_no_listing__magicplan_sqs_uprn_is_null() -> None: # Arrange hubspot_deal = make_hubspot_deal(outcome="surveyed") @@ -113,7 +115,7 @@ def test_new_deal__outcome_is_surveyed__no_listing__magicplan_message_uprn_is_nu # ========================================== -def test_existing_deal__outcome_transitions_to_surveyed__triggers_magicplan() -> None: +def 
test_existing_deal_surveyed_transition__sends_magicplan_sqs() -> None: # Arrange db_deal = make_db_deal(outcome="assessed") hubspot_deal = make_hubspot_deal(outcome="surveyed") @@ -131,7 +133,7 @@ def test_existing_deal__outcome_transitions_to_surveyed__triggers_magicplan() -> ) -def test_existing_deal__outcome_already_surveyed__unrelated_change__does_not_trigger_magicplan() -> None: +def test_existing_deal_already_surveyed__no_magicplan_sqs() -> None: # Arrange db_deal = make_db_deal(outcome="surveyed", dealname="Old Name") hubspot_deal = make_hubspot_deal(outcome="surveyed", dealname="New Name") @@ -141,3 +143,85 @@ def test_existing_deal__outcome_already_surveyed__unrelated_change__does_not_tri # Assert mock_sqs.send_message.assert_not_called() + + +# ==================================== +# NEW DEAL PATH - PasHub trigger +# ==================================== + + +def test_new_deal_with_pashub_link__sends_pashub_sqs() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(pashub_link=PASHUB_LINK) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=PASHUB_QUEUE_URL, + MessageBody=json.dumps( + { + "pashub_link": PASHUB_LINK, + "address": None, + "hubspot_deal_id": DEAL_ID, + "sharepoint_link": None, + "uprn": None, + "landlord_property_id": None, + "deal_stage": None, + } + ), + ) + + +def test_new_deal_no_pashub_link__no_pashub_sqs() -> None: + # Arrange + hubspot_deal = make_hubspot_deal() + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() + + +# ========================================== +# EXISTING DEAL PATH - PasHub trigger +# ========================================== + + +def test_existing_deal_pashub_link_added__sends_pashub_sqs() -> None: + # Arrange + db_deal = make_db_deal(pashub_link=None) + hubspot_deal = 
make_hubspot_deal(pashub_link=PASHUB_LINK) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=PASHUB_QUEUE_URL, + MessageBody=json.dumps( + { + "pashub_link": PASHUB_LINK, + "address": None, + "hubspot_deal_id": DEAL_ID, + "sharepoint_link": None, + "uprn": None, + "landlord_property_id": None, + "deal_stage": None, + } + ), + ) + + +def test_existing_deal_pashub_link_unchanged__no_pashub_sqs() -> None: + # Arrange + db_deal = make_db_deal(pashub_link=PASHUB_LINK, dealname="Old Name") + hubspot_deal = make_hubspot_deal(pashub_link=PASHUB_LINK, dealname="New Name") + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() From 5edae06a659a3b6853df56ff8e68b5c2d186e4c9 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 09:37:23 +0000 Subject: [PATCH 063/106] added imports at the top of the file instead of function --- backend/address2UPRN/main.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 0938a53b..a7378fbe 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -20,12 +20,15 @@ from backend.utils.addressMatch import ( df_has_single_uprn, score_addresses, ) +from datatypes.epc.domain.historic_epc_matching import ( + match_addresses_for_postcode, +) +from backend.epc_client.client import EpcClientService logger = setup_logger() def get_epc_data_with_postcode(postcode: str) -> pd.DataFrame: - from backend.epc_client.client import EpcClientService token = os.getenv("OPEN_EPC_API_TOKEN") if token is None: @@ -51,9 +54,6 @@ def get_uprn_from_historic_epc( here because historic addresses use a more verbose format that systematically depresses lexiscores. 
""" - from datatypes.epc.domain.historic_epc_matching import ( - match_addresses_for_postcode, - ) try: result = match_addresses_for_postcode(user_inputed_address, postcode) @@ -148,9 +148,7 @@ def get_uprn( postcode=postcode, ) if result: - logger.info( - f"Historic EPC matched {user_inputed_address} in {postcode}" - ) + logger.info(f"Historic EPC matched {user_inputed_address} in {postcode}") if not result: return None @@ -427,7 +425,6 @@ def handler(event, context, local=False): ) continue - # Process each address in this postcode with the same EPC data for row in postcode_rows: try: From c22528299ce1ba240263a3315537734dc0e456fd Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 09:40:12 +0000 Subject: [PATCH 064/106] added type hinting to uprn --- backend/address2UPRN/main.py | 2 +- scripts/historic_epc_demo.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index a7378fbe..6b684cef 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -60,7 +60,7 @@ def get_uprn_from_historic_epc( except FileNotFoundError: return None - uprn = result.unambiguous_uprn() + uprn: Optional[str] = result.unambiguous_uprn() if not uprn or uprn in ("", "nan"): return None diff --git a/scripts/historic_epc_demo.py b/scripts/historic_epc_demo.py index b47c3a3c..31e1ee28 100644 --- a/scripts/historic_epc_demo.py +++ b/scripts/historic_epc_demo.py @@ -12,6 +12,7 @@ Usage: import sys from datatypes.epc.domain.historic_epc_matching import match_addresses_for_postcode +from typing import Optional def main(user_address: str, postcode: str) -> None: @@ -29,7 +30,7 @@ def main(user_address: str, postcode: str) -> None: ) print() - uprn = result.unambiguous_uprn() + uprn: Optional[str] = result.unambiguous_uprn() if uprn: print(f"Unambiguous UPRN: {uprn}") else: From b72d5fbf42f530eb439d5e8804a9fb270a035f53 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 
09:43:40 +0000 Subject: [PATCH 065/106] fix nitpick --- backend/address2UPRN/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 6b684cef..e49088f4 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -61,7 +61,7 @@ def get_uprn_from_historic_epc( return None uprn: Optional[str] = result.unambiguous_uprn() - if not uprn or uprn in ("", "nan"): + if not uprn or uprn == "nan": return None top = result.top() From e06ead55d0226ec216969fa749de861ece1f4ce8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 09:48:21 +0000 Subject: [PATCH 066/106] add more type hint --- backend/address2UPRN/main.py | 3 +- .../tests/test_historic_epc_matching.py | 211 ++++++++++++------ 2 files changed, 150 insertions(+), 64 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index e49088f4..642733a7 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -24,6 +24,7 @@ from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) from backend.epc_client.client import EpcClientService +from datatypes.epc.domain.historic_epc_matching import ScoredHistoricEpc logger = setup_logger() @@ -64,7 +65,7 @@ def get_uprn_from_historic_epc( if not uprn or uprn == "nan": return None - top = result.top() + top: Optional[ScoredHistoricEpc] = result.top() if top is None: return None diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py index 1c3ee6d4..ce86e5c0 100644 --- a/datatypes/epc/domain/tests/test_historic_epc_matching.py +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -1,3 +1,4 @@ +from typing import Optional from unittest.mock import patch import numpy as np @@ -13,40 +14,103 @@ from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) - # Columns required by the 
HistoricEpc dataclass (lower-cased CSV columns). # The matcher only reads ADDRESS + UPRN to score; everything else is filled # with "" but must be present for HistoricEpc(**kwargs) to construct. _FULL_COLUMN_FIELDS = [ - "LMK_KEY", "ADDRESS1", "ADDRESS2", "ADDRESS3", "POSTCODE", - "BUILDING_REFERENCE_NUMBER", "CURRENT_ENERGY_RATING", "POTENTIAL_ENERGY_RATING", - "CURRENT_ENERGY_EFFICIENCY", "POTENTIAL_ENERGY_EFFICIENCY", "PROPERTY_TYPE", - "BUILT_FORM", "INSPECTION_DATE", "LOCAL_AUTHORITY", "CONSTITUENCY", "COUNTY", - "LODGEMENT_DATE", "TRANSACTION_TYPE", "ENVIRONMENT_IMPACT_CURRENT", - "ENVIRONMENT_IMPACT_POTENTIAL", "ENERGY_CONSUMPTION_CURRENT", - "ENERGY_CONSUMPTION_POTENTIAL", "CO2_EMISSIONS_CURRENT", - "CO2_EMISS_CURR_PER_FLOOR_AREA", "CO2_EMISSIONS_POTENTIAL", - "LIGHTING_COST_CURRENT", "LIGHTING_COST_POTENTIAL", "HEATING_COST_CURRENT", - "HEATING_COST_POTENTIAL", "HOT_WATER_COST_CURRENT", "HOT_WATER_COST_POTENTIAL", - "TOTAL_FLOOR_AREA", "ENERGY_TARIFF", "MAINS_GAS_FLAG", "FLOOR_LEVEL", - "FLAT_TOP_STOREY", "FLAT_STOREY_COUNT", "MAIN_HEATING_CONTROLS", - "MULTI_GLAZE_PROPORTION", "GLAZED_TYPE", "GLAZED_AREA", "EXTENSION_COUNT", - "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "LOW_ENERGY_LIGHTING", - "NUMBER_OPEN_FIREPLACES", "HOTWATER_DESCRIPTION", "HOT_WATER_ENERGY_EFF", - "HOT_WATER_ENV_EFF", "FLOOR_DESCRIPTION", "FLOOR_ENERGY_EFF", "FLOOR_ENV_EFF", - "WINDOWS_DESCRIPTION", "WINDOWS_ENERGY_EFF", "WINDOWS_ENV_EFF", - "WALLS_DESCRIPTION", "WALLS_ENERGY_EFF", "WALLS_ENV_EFF", - "SECONDHEAT_DESCRIPTION", "SHEATING_ENERGY_EFF", "SHEATING_ENV_EFF", - "ROOF_DESCRIPTION", "ROOF_ENERGY_EFF", "ROOF_ENV_EFF", "MAINHEAT_DESCRIPTION", - "MAINHEAT_ENERGY_EFF", "MAINHEAT_ENV_EFF", "MAINHEATCONT_DESCRIPTION", - "MAINHEATC_ENERGY_EFF", "MAINHEATC_ENV_EFF", "LIGHTING_DESCRIPTION", - "LIGHTING_ENERGY_EFF", "LIGHTING_ENV_EFF", "MAIN_FUEL", "WIND_TURBINE_COUNT", - "HEAT_LOSS_CORRIDOR", "UNHEATED_CORRIDOR_LENGTH", "FLOOR_HEIGHT", - "PHOTO_SUPPLY", "SOLAR_WATER_HEATING_FLAG", 
"MECHANICAL_VENTILATION", - "ADDRESS", "LOCAL_AUTHORITY_LABEL", "CONSTITUENCY_LABEL", "POSTTOWN", - "CONSTRUCTION_AGE_BAND", "LODGEMENT_DATETIME", "TENURE", - "FIXED_LIGHTING_OUTLETS_COUNT", "LOW_ENERGY_FIXED_LIGHT_COUNT", "UPRN", - "UPRN_SOURCE", "REPORT_TYPE", + "LMK_KEY", + "ADDRESS1", + "ADDRESS2", + "ADDRESS3", + "POSTCODE", + "BUILDING_REFERENCE_NUMBER", + "CURRENT_ENERGY_RATING", + "POTENTIAL_ENERGY_RATING", + "CURRENT_ENERGY_EFFICIENCY", + "POTENTIAL_ENERGY_EFFICIENCY", + "PROPERTY_TYPE", + "BUILT_FORM", + "INSPECTION_DATE", + "LOCAL_AUTHORITY", + "CONSTITUENCY", + "COUNTY", + "LODGEMENT_DATE", + "TRANSACTION_TYPE", + "ENVIRONMENT_IMPACT_CURRENT", + "ENVIRONMENT_IMPACT_POTENTIAL", + "ENERGY_CONSUMPTION_CURRENT", + "ENERGY_CONSUMPTION_POTENTIAL", + "CO2_EMISSIONS_CURRENT", + "CO2_EMISS_CURR_PER_FLOOR_AREA", + "CO2_EMISSIONS_POTENTIAL", + "LIGHTING_COST_CURRENT", + "LIGHTING_COST_POTENTIAL", + "HEATING_COST_CURRENT", + "HEATING_COST_POTENTIAL", + "HOT_WATER_COST_CURRENT", + "HOT_WATER_COST_POTENTIAL", + "TOTAL_FLOOR_AREA", + "ENERGY_TARIFF", + "MAINS_GAS_FLAG", + "FLOOR_LEVEL", + "FLAT_TOP_STOREY", + "FLAT_STOREY_COUNT", + "MAIN_HEATING_CONTROLS", + "MULTI_GLAZE_PROPORTION", + "GLAZED_TYPE", + "GLAZED_AREA", + "EXTENSION_COUNT", + "NUMBER_HABITABLE_ROOMS", + "NUMBER_HEATED_ROOMS", + "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", + "HOTWATER_DESCRIPTION", + "HOT_WATER_ENERGY_EFF", + "HOT_WATER_ENV_EFF", + "FLOOR_DESCRIPTION", + "FLOOR_ENERGY_EFF", + "FLOOR_ENV_EFF", + "WINDOWS_DESCRIPTION", + "WINDOWS_ENERGY_EFF", + "WINDOWS_ENV_EFF", + "WALLS_DESCRIPTION", + "WALLS_ENERGY_EFF", + "WALLS_ENV_EFF", + "SECONDHEAT_DESCRIPTION", + "SHEATING_ENERGY_EFF", + "SHEATING_ENV_EFF", + "ROOF_DESCRIPTION", + "ROOF_ENERGY_EFF", + "ROOF_ENV_EFF", + "MAINHEAT_DESCRIPTION", + "MAINHEAT_ENERGY_EFF", + "MAINHEAT_ENV_EFF", + "MAINHEATCONT_DESCRIPTION", + "MAINHEATC_ENERGY_EFF", + "MAINHEATC_ENV_EFF", + "LIGHTING_DESCRIPTION", + "LIGHTING_ENERGY_EFF", + "LIGHTING_ENV_EFF", + 
"MAIN_FUEL", + "WIND_TURBINE_COUNT", + "HEAT_LOSS_CORRIDOR", + "UNHEATED_CORRIDOR_LENGTH", + "FLOOR_HEIGHT", + "PHOTO_SUPPLY", + "SOLAR_WATER_HEATING_FLAG", + "MECHANICAL_VENTILATION", + "ADDRESS", + "LOCAL_AUTHORITY_LABEL", + "CONSTITUENCY_LABEL", + "POSTTOWN", + "CONSTRUCTION_AGE_BAND", + "LODGEMENT_DATETIME", + "TENURE", + "FIXED_LIGHTING_OUTLETS_COUNT", + "LOW_ENERGY_FIXED_LIGHT_COUNT", + "UPRN", + "UPRN_SOURCE", + "REPORT_TYPE", ] @@ -63,7 +127,9 @@ def _build_df(rows: list[dict]) -> pd.DataFrame: @pytest.fixture def patch_postcode_valid(): - with patch.object(matcher_mod.AddressMatch, "is_valid_postcode", return_value=True) as m: + with patch.object( + matcher_mod.AddressMatch, "is_valid_postcode", return_value=True + ) as m: yield m @@ -106,10 +172,12 @@ class TestMatchAddressesForPostcode: self, patch_read, patch_postcode_valid ): # Disjoint number sets => hard zero. Still kept in matches. - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("999 SOMEWHERE ELSE", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("999 SOMEWHERE ELSE", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert isinstance(result, HistoricEpcMatches) assert len(result.matches) == 2 @@ -117,10 +185,12 @@ class TestMatchAddressesForPostcode: def test_top_has_lexirank_one_and_lexiscore_monotone( self, patch_read, patch_postcode_valid ): - patch_read.return_value = _build_df([ - _row("48 GORDON ROAD", "200"), # near miss - _row("47 GORDON ROAD", "100"), # exact (after normalisation) - ]) + patch_read.return_value = _build_df( + [ + _row("48 GORDON ROAD", "200"), # near miss + _row("47 GORDON ROAD", "100"), # exact (after normalisation) + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert result.top().lexirank == 1 scores = [m.lexiscore for m in result.matches] @@ -173,19 +243,23 @@ class TestMatchAddressesForPostcode: class TestUnambiguousUprn: def 
test_exact_match_returns_uprn(self, patch_read, patch_postcode_valid): - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("48 GORDON ROAD", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert result.unambiguous_uprn() == "100" def test_ambiguous_tie_returns_none(self, patch_read, patch_postcode_valid): # Two duplicate addresses with different UPRNs share rank-1. - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("47 GORDON ROAD", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("47 GORDON ROAD", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert result.unambiguous_uprn() is None @@ -193,10 +267,12 @@ class TestUnambiguousUprn: self, patch_read, patch_postcode_valid ): # User address has building number 47; no row has 47 -> all hard-zero. - patch_read.return_value = _build_df([ - _row("999 ELSEWHERE", "100"), - _row("888 ELSEWHERE", "200"), - ]) + patch_read.return_value = _build_df( + [ + _row("999 ELSEWHERE", "100"), + _row("888 ELSEWHERE", "200"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") assert all(m.lexiscore == 0.0 for m in result.matches) assert result.unambiguous_uprn() is None @@ -205,15 +281,22 @@ class TestUnambiguousUprn: self, patch_read, patch_postcode_valid ): # Use a real NaN in the UPRN cell. 
- patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", np.nan), - _row("48 GORDON ROAD", "200"), - ]) - result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") - top = result.top() + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", np.nan), + _row("48 GORDON ROAD", "200"), + ] + ) + result: HistoricEpcMatches = match_addresses_for_postcode( + "47 Gordon Road", "AB33 8AL" + ) + top: Optional[ScoredHistoricEpc] = result.top() # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), # so unambiguous_uprn's truthiness check correctly drops the row. - assert top.record.uprn == "" + if top: + assert top.record.uprn == "" + else: + pytest.fail("should have an epc score, no results found :(") # ---------- top / top_n ---------- @@ -222,11 +305,13 @@ class TestUnambiguousUprn: class TestTopHelpers: def test_top_n_returns_first_k(self, patch_read, patch_postcode_valid): - patch_read.return_value = _build_df([ - _row("47 GORDON ROAD", "100"), - _row("48 GORDON ROAD", "200"), - _row("49 GORDON ROAD", "300"), - ]) + patch_read.return_value = _build_df( + [ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + _row("49 GORDON ROAD", "300"), + ] + ) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") top2 = result.top_n(2) assert len(top2) == 2 From 2c5c8337cc907e419277c0ef8e95f6eedb8c99ab Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 10:01:25 +0000 Subject: [PATCH 067/106] added more type hints --- backend/address2UPRN/main.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 642733a7..8832e157 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -68,15 +68,14 @@ def get_uprn_from_historic_epc( top: Optional[ScoredHistoricEpc] = result.top() if top is None: return None - - return (uprn, top.record.address, top.lexiscore) + return uprn, 
top.record.address, top.lexiscore def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, verbose: bool = False, -): +) -> Optional[str | tuple[str, str, float]]: """ Return uprn (str) using a pre-fetched EPC dataframe. This avoids calling the API multiple times for the same postcode. @@ -137,7 +136,7 @@ def get_uprn( """ df = get_epc_data_with_postcode(postcode=postcode) - result = get_uprn_with_epc_df( + result: Optional[] = get_uprn_with_epc_df( user_inputed_address=user_inputed_address, epc_df=df, verbose=True, @@ -445,7 +444,7 @@ def handler(event, context, local=False): continue # Get UPRN using the pre-fetched EPC data with all return options - result = get_uprn_with_epc_df( + result: Optional[tuple[str, str, float]] = get_uprn_with_epc_df( user_inputed_address=address2uprn_user_input, epc_df=epc_df, verbose=True, @@ -562,3 +561,4 @@ def handler(event, context, local=False): # Don't add results to return messages as its too verbose # capture the exepection as e, into s3, to find the logs go to s3 # Upload results to s3 as well as csv + From 8635e2a1aaf2072d4fc09e7fe7bc0de8984b71ea Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 10:08:00 +0000 Subject: [PATCH 068/106] change file name of epc client service --- backend/address2UPRN/main.py | 2 +- backend/epc_client/__init__.py | 2 +- backend/epc_client/client.py | 99 ------------------------- backend/epc_client/tests/conftest.py | 2 +- backend/epc_client/tests/test_client.py | 30 ++++++-- backend/utils/epc_address_match.py | 2 +- 6 files changed, 28 insertions(+), 109 deletions(-) delete mode 100644 backend/epc_client/client.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 8832e157..7e0baeaa 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -23,7 +23,7 @@ from backend.utils.addressMatch import ( from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) -from 
backend.epc_client.client import EpcClientService +from backend.epc_client.epc_client_service import EpcClientService from datatypes.epc.domain.historic_epc_matching import ScoredHistoricEpc logger = setup_logger() diff --git a/backend/epc_client/__init__.py b/backend/epc_client/__init__.py index ab46a266..84062592 100644 --- a/backend/epc_client/__init__.py +++ b/backend/epc_client/__init__.py @@ -1,3 +1,3 @@ -from backend.epc_client.client import EpcClientService +from backend.epc_client.epc_client_service import EpcClientService __all__ = ["EpcClientService"] diff --git a/backend/epc_client/client.py b/backend/epc_client/client.py deleted file mode 100644 index d00a164f..00000000 --- a/backend/epc_client/client.py +++ /dev/null @@ -1,99 +0,0 @@ -# Spec: https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml -from __future__ import annotations - -from typing import Any, Optional - -import httpx - -from backend.epc_client.exceptions import ( - EpcApiError, - EpcNotFoundError, - EpcRateLimitError, -) -from backend.epc_client._retry import call_with_retry -from datatypes.epc.domain.epc_property_data import EpcPropertyData -from datatypes.epc.domain.mapper import EpcPropertyDataMapper -from datatypes.epc.search import EpcSearchResult - - -class EpcClientService: - BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" - - def __init__(self, auth_token: str) -> None: - self._headers = { - "Authorization": f"Bearer {auth_token}", - "Accept": "application/json", - } - - def get_by_certificate_number(self, cert_num: str) -> EpcPropertyData: - raw = call_with_retry(lambda: self._fetch_certificate(cert_num)) - return EpcPropertyDataMapper.from_api_response(raw) - - def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: - results = call_with_retry(lambda: self._search(uprn=uprn)) - if not results: - return None - latest = max(results, key=lambda r: r.registration_date) - return 
self.get_by_certificate_number(latest.certificate_number) - - def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: - return call_with_retry(lambda: self._search(postcode=postcode)) - - # ------------------------------------------------------------------ - # Private helpers - # ------------------------------------------------------------------ - - def _fetch_certificate(self, cert_num: str) -> dict[str, Any]: - resp = httpx.get( - f"{self.BASE_URL}/api/certificate", - params={"certificate_number": cert_num}, - headers=self._headers, - ) - if resp.status_code == 404: - raise EpcNotFoundError(cert_num) - if resp.status_code == 429: - raise EpcRateLimitError("Rate limited by EPC API") - if not resp.is_success: - raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") - return resp.json()["data"] - - def _search( - self, - postcode: Optional[str] = None, - uprn: Optional[int] = None, - ) -> list[EpcSearchResult]: - params: dict[str, str | int] = {} - if postcode: - params["postcode"] = postcode - if uprn is not None: - params["uprn"] = uprn - - resp = httpx.get( - f"{self.BASE_URL}/api/domestic/search", - params=params, - headers=self._headers, - ) - if resp.status_code == 404: - return [] - if resp.status_code == 429: - raise EpcRateLimitError("Rate limited by EPC API") - if not resp.is_success: - raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") - - rows = resp.json().get("data", []) - return [self._parse_search_result(r) for r in rows] - - @staticmethod - def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult: - return EpcSearchResult( - certificate_number=row["certificateNumber"], - address_line_1=row["addressLine1"], - address_line_2=row.get("addressLine2"), - address_line_3=row.get("addressLine3"), - address_line_4=row.get("addressLine4"), - postcode=row["postcode"], - post_town=row["postTown"], - uprn=row.get("uprn"), - current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], - 
registration_date=row["registrationDate"], - ) diff --git a/backend/epc_client/tests/conftest.py b/backend/epc_client/tests/conftest.py index 2ed444af..2dab138e 100644 --- a/backend/epc_client/tests/conftest.py +++ b/backend/epc_client/tests/conftest.py @@ -2,7 +2,7 @@ import json import pathlib import pytest -from backend.epc_client.client import EpcClientService +from backend.epc_client.epc_client_service import EpcClientService SAMPLES_DIR = pathlib.Path("backend/epc_api/json_samples") diff --git a/backend/epc_client/tests/test_client.py b/backend/epc_client/tests/test_client.py index 7933f21d..849b4a25 100644 --- a/backend/epc_client/tests/test_client.py +++ b/backend/epc_client/tests/test_client.py @@ -1,7 +1,7 @@ from unittest.mock import MagicMock, patch, call import pytest -from backend.epc_client.client import EpcClientService +from backend.epc_client.epc_client_service import EpcClientService from backend.utils.epc_address_match import find_best_epc_match from datatypes.epc.search import EpcSearchResult from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError @@ -22,7 +22,10 @@ def _mock_response(status_code=200, json_data=None): # Test 1: get_by_certificate_number happy path # --------------------------------------------------------------------------- -def test_get_by_certificate_number_returns_epc_property_data(epc_service, rdsap_21_0_1_cert): + +def test_get_by_certificate_number_returns_epc_property_data( + epc_service, rdsap_21_0_1_cert +): cert_response = {"data": rdsap_21_0_1_cert} with patch("httpx.get", return_value=_mock_response(200, cert_response)): result = epc_service.get_by_certificate_number("CERT-001") @@ -34,6 +37,7 @@ def test_get_by_certificate_number_returns_epc_property_data(epc_service, rdsap_ # Test 2: get_by_certificate_number 404 → EpcNotFoundError # --------------------------------------------------------------------------- + def test_get_by_certificate_number_404_raises_not_found(epc_service): with 
patch("httpx.get", return_value=_mock_response(404)): with pytest.raises(EpcNotFoundError): @@ -44,7 +48,10 @@ def test_get_by_certificate_number_404_raises_not_found(epc_service): # Test 3: 429 retried, succeeds on 3rd attempt # --------------------------------------------------------------------------- -def test_get_by_certificate_number_retries_on_429_and_succeeds(epc_service, rdsap_21_0_1_cert): + +def test_get_by_certificate_number_retries_on_429_and_succeeds( + epc_service, rdsap_21_0_1_cert +): cert_response = {"data": rdsap_21_0_1_cert} responses = [ _mock_response(429), @@ -61,6 +68,7 @@ def test_get_by_certificate_number_retries_on_429_and_succeeds(epc_service, rdsa # Test 4: get_by_uprn empty search → None # --------------------------------------------------------------------------- + def test_get_by_uprn_returns_none_when_no_results(epc_service): with patch("httpx.get", return_value=_mock_response(200, {"data": []})): result = epc_service.get_by_uprn(100023336956) @@ -72,6 +80,7 @@ def test_get_by_uprn_returns_none_when_no_results(epc_service): # Test 5: get_by_uprn multiple results → fetches latest by registration_date # --------------------------------------------------------------------------- + def test_get_by_uprn_picks_most_recent_certificate(epc_service, rdsap_21_0_1_cert): search_rows = [ make_search_row(cert_num="CERT-OLD", registration_date="2022-01-01"), @@ -98,6 +107,7 @@ def test_get_by_uprn_picks_most_recent_certificate(epc_service, rdsap_21_0_1_cer # Test 6: search_by_postcode returns list[EpcSearchResult] # --------------------------------------------------------------------------- + def test_search_by_postcode_returns_results(epc_service): rows = [ make_search_row(cert_num="CERT-A", address_line_1="1 High Street"), @@ -116,6 +126,7 @@ def test_search_by_postcode_returns_results(epc_service): # Test 7: search_by_postcode 404 → empty list # --------------------------------------------------------------------------- + def 
test_search_by_postcode_404_returns_empty_list(epc_service): with patch("httpx.get", return_value=_mock_response(404)): results = epc_service.search_by_postcode("ZZ9 9ZZ") @@ -127,6 +138,7 @@ def test_search_by_postcode_404_returns_empty_list(epc_service): # Tests 8-10: find_best_epc_match — real scoring, only HTTP mocked # --------------------------------------------------------------------------- + def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_cert): search_rows = [ make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"), @@ -145,7 +157,9 @@ def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_ce assert isinstance(result, EpcPropertyData) -def test_find_best_match_resolves_on_second_pass_using_full_address(epc_service, rdsap_21_0_1_cert): +def test_find_best_match_resolves_on_second_pass_using_full_address( + epc_service, rdsap_21_0_1_cert +): # Both candidates share address_line_1 — round 1 is ambiguous. # Round 2 scores against full_address and picks the correct floor. 
search_rows = [ @@ -168,7 +182,9 @@ def test_find_best_match_resolves_on_second_pass_using_full_address(epc_service, return _mock_response(200, cert_response) with patch("httpx.get", side_effect=fake_get): - result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street Ground Floor") + result = find_best_epc_match( + epc_service, "SW1A 1AA", "1 High Street Ground Floor" + ) assert isinstance(result, EpcPropertyData) @@ -177,6 +193,8 @@ def test_find_best_match_returns_none_when_no_good_match(epc_service): search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")] with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})): - result = find_best_epc_match(epc_service, "SW1A 1AA", "1 Completely Different Road") + result = find_best_epc_match( + epc_service, "SW1A 1AA", "1 Completely Different Road" + ) assert result is None diff --git a/backend/utils/epc_address_match.py b/backend/utils/epc_address_match.py index f73d6d1d..0df56eca 100644 --- a/backend/utils/epc_address_match.py +++ b/backend/utils/epc_address_match.py @@ -7,7 +7,7 @@ from datatypes.epc.domain.epc_property_data import EpcPropertyData from datatypes.epc.search import EpcSearchResult if TYPE_CHECKING: - from backend.epc_client.client import EpcClientService + from backend.epc_client.epc_client_service import EpcClientService _MIN_MATCH_SCORE = 0.6 From f52fe001cc4b8077ffb8bb16affa3ed0d960482c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 10:14:16 +0000 Subject: [PATCH 069/106] renamed file --- backend/epc_client/epc_client_service.py | 99 ++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 backend/epc_client/epc_client_service.py diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py new file mode 100644 index 00000000..d00a164f --- /dev/null +++ b/backend/epc_client/epc_client_service.py @@ -0,0 +1,99 @@ +# Spec: 
https://raw.githubusercontent.com/communitiesuk/epb-data-warehouse/main/api/api.yml +from __future__ import annotations + +from typing import Any, Optional + +import httpx + +from backend.epc_client.exceptions import ( + EpcApiError, + EpcNotFoundError, + EpcRateLimitError, +) +from backend.epc_client._retry import call_with_retry +from datatypes.epc.domain.epc_property_data import EpcPropertyData +from datatypes.epc.domain.mapper import EpcPropertyDataMapper +from datatypes.epc.search import EpcSearchResult + + +class EpcClientService: + BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" + + def __init__(self, auth_token: str) -> None: + self._headers = { + "Authorization": f"Bearer {auth_token}", + "Accept": "application/json", + } + + def get_by_certificate_number(self, cert_num: str) -> EpcPropertyData: + raw = call_with_retry(lambda: self._fetch_certificate(cert_num)) + return EpcPropertyDataMapper.from_api_response(raw) + + def get_by_uprn(self, uprn: int) -> Optional[EpcPropertyData]: + results = call_with_retry(lambda: self._search(uprn=uprn)) + if not results: + return None + latest = max(results, key=lambda r: r.registration_date) + return self.get_by_certificate_number(latest.certificate_number) + + def search_by_postcode(self, postcode: str) -> list[EpcSearchResult]: + return call_with_retry(lambda: self._search(postcode=postcode)) + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _fetch_certificate(self, cert_num: str) -> dict[str, Any]: + resp = httpx.get( + f"{self.BASE_URL}/api/certificate", + params={"certificate_number": cert_num}, + headers=self._headers, + ) + if resp.status_code == 404: + raise EpcNotFoundError(cert_num) + if resp.status_code == 429: + raise EpcRateLimitError("Rate limited by EPC API") + if not resp.is_success: + raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") + 
return resp.json()["data"] + + def _search( + self, + postcode: Optional[str] = None, + uprn: Optional[int] = None, + ) -> list[EpcSearchResult]: + params: dict[str, str | int] = {} + if postcode: + params["postcode"] = postcode + if uprn is not None: + params["uprn"] = uprn + + resp = httpx.get( + f"{self.BASE_URL}/api/domestic/search", + params=params, + headers=self._headers, + ) + if resp.status_code == 404: + return [] + if resp.status_code == 429: + raise EpcRateLimitError("Rate limited by EPC API") + if not resp.is_success: + raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") + + rows = resp.json().get("data", []) + return [self._parse_search_result(r) for r in rows] + + @staticmethod + def _parse_search_result(row: dict[str, Any]) -> EpcSearchResult: + return EpcSearchResult( + certificate_number=row["certificateNumber"], + address_line_1=row["addressLine1"], + address_line_2=row.get("addressLine2"), + address_line_3=row.get("addressLine3"), + address_line_4=row.get("addressLine4"), + postcode=row["postcode"], + post_town=row["postTown"], + uprn=row.get("uprn"), + current_energy_efficiency_band=row["currentEnergyEfficiencyBand"], + registration_date=row["registrationDate"], + ) From b364df89ad9bba94479554aa1c48b75aabb4c811 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 10:31:54 +0000 Subject: [PATCH 070/106] forgot to add tuple typing --- backend/address2UPRN/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 7e0baeaa..7ac5a54e 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -136,7 +136,7 @@ def get_uprn( """ df = get_epc_data_with_postcode(postcode=postcode) - result: Optional[] = get_uprn_with_epc_df( + result: Optional[tuple[str, str, float]] = get_uprn_with_epc_df( user_inputed_address=user_inputed_address, epc_df=df, verbose=True, @@ -561,4 +561,3 @@ def handler(event, context, local=False): # 
Don't add results to return messages as its too verbose # capture the exepection as e, into s3, to find the logs go to s3 # Upload results to s3 as well as csv - From bec5c4f3c3bb4f2fc63167c8115188c3cd5a1c62 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 10:51:27 +0000 Subject: [PATCH 071/106] one place to have df_has_single_uprn --- CLAUDE.md | 1 + backend/address2UPRN/main.py | 3 +-- backend/utils/addressMatch.py | 10 ---------- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 23d465a7..f88a59d5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -62,6 +62,7 @@ bash .devcontainer/backend/install-claude-skills.sh ## Type Safety All new code must pass `pyright` with zero errors under `typeCheckingMode = strict`. +Use Optional over | None Annotate all function return types. Use `dict[str, Any]` for untyped external API payloads — never bare `dict`. Add `pandas-stubs` when introducing pandas to a module. diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 7ac5a54e..b2cb4d98 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -17,9 +17,8 @@ from datetime import datetime from backend.utils.addressMatch import ( AddressMatch, get_uprn_candidates, - df_has_single_uprn, - score_addresses, ) +from backend.address2UPRN.scoring import df_has_single_uprn from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index ee9d1004..7618e9ac 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -259,13 +259,3 @@ def get_uprn_candidates( out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False]) - - -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = 
"uprn") -> bool: - """Returns True if all non-null UPRNs in df match the given uprn.""" - if column not in df.columns: - return False - uprns = df[column].dropna().astype(str).str.strip().unique() - if len(uprns) == 0: - return False - return len(uprns) == 1 and uprns[0] == str(uprn) From 35fea20fc7e2bbdc51f1da2a3218105e518c9b38 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 10:54:45 +0000 Subject: [PATCH 072/106] changed function name --- backend/address2UPRN/main.py | 6 +++--- backend/address2UPRN/scoring.py | 13 ++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index b2cb4d98..6006fec1 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -18,7 +18,7 @@ from backend.utils.addressMatch import ( AddressMatch, get_uprn_candidates, ) -from backend.address2UPRN.scoring import df_has_single_uprn +from backend.address2UPRN.scoring import all_uprns_match from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) @@ -98,7 +98,7 @@ def get_uprn_with_epc_df( top_rank_df = scored_df[scored_df["lexirank"] == 1] # If rank-1 rows do not agree on a single UPRN → ambiguous - if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]): + if not all_uprns_match(top_rank_df, target_uprn=top_rank_df.iloc[0]["uprn"]): return None address = top_rank_df["address"].values[0] @@ -207,7 +207,7 @@ def resolve_uprns_for_postcode_group( top_rank_df = scored_df[scored_df["lexirank"] == 1] - if not df_has_single_uprn(top_rank_df, top_rank_df.iloc[0]["uprn"]): + if not all_uprns_match(top_rank_df, top_rank_df.iloc[0]["uprn"]): results.append( { "found_uprn": None, diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py index d31b9aea..bfda2e71 100644 --- a/backend/address2UPRN/scoring.py +++ b/backend/address2UPRN/scoring.py @@ -3,12 +3,11 @@ import pandas as pd from backend.utils.addressMatch import 
AddressMatch -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: - """ - Returns True if all non-null UPRNs in df match the given uprn. - Returns False otherwise. - """ - +def all_uprns_match( + df: pd.DataFrame, + target_uprn: str, + column: str = "uprn", +) -> bool: if column not in df.columns: return False @@ -17,7 +16,7 @@ def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> boo if len(uprns) == 0: return False - return len(uprns) == 1 and uprns[0] == str(uprn) + return len(uprns) == 1 and uprns[0] == str(target_uprn) def get_uprn_candidates( From f5bbd2efb3921fb8a419f478bf1a7e4aeb2e7c49 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 11:23:50 +0000 Subject: [PATCH 073/106] add missing tf_vars to deploy_lambda workflow --- .github/workflows/_deploy_lambda.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/_deploy_lambda.yml b/.github/workflows/_deploy_lambda.yml index 3a407c5a..1cc7d462 100644 --- a/.github/workflows/_deploy_lambda.yml +++ b/.github/workflows/_deploy_lambda.yml @@ -82,6 +82,12 @@ on: required: false TF_VAR_hubspot_api_key: required: false + + TF_VAR_magicplan_customer_id: + required: false + + TF_VAR_magicplan_api_key: + required: false jobs: deploy: runs-on: ubuntu-latest @@ -149,6 +155,8 @@ jobs: TF_VAR_pashub_email: ${{ secrets.TF_VAR_pashub_email }} TF_VAR_pashub_password: ${{ secrets.TF_VAR_pashub_password }} TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} + TF_VAR_magicplan_customer_id: ${{ secrets.TF_VAR_magicplan_customer_id }} + TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} run: | ECR_REPO_URL_VAR="" if [[ -n "${{ inputs.ecr_repo }}" ]]; then @@ -195,6 +203,8 @@ jobs: TF_VAR_pashub_email: ${{ secrets.TF_VAR_pashub_email }} TF_VAR_pashub_password: ${{ secrets.TF_VAR_pashub_password }} TF_VAR_hubspot_api_key: ${{ secrets.TF_VAR_hubspot_api_key }} + TF_VAR_magicplan_customer_id: ${{ 
secrets.TF_VAR_magicplan_customer_id }} + TF_VAR_magicplan_api_key: ${{ secrets.TF_VAR_magicplan_api_key }} run: | EXTRA_VARS="" if [[ -n "${{ inputs.ecr_repo }}" ]]; then From ec7acabaf8215a022a5f1bc25c44bb298346a8c7 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 11:48:39 +0000 Subject: [PATCH 074/106] reinstate deleted policy so it can be unattached from entities --- infrastructure/terraform/shared/main.tf | 26 ++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 2c3200de..0a9e87f6 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -280,6 +280,21 @@ output "retrofit_energy_assessments_bucket_name" { description = "Name of the retrofit energy assessments bucket" } +module "energy_assessments_s3_write" { + source = "../modules/s3_iam_policy" + + policy_name = "EnergyAssessmentsWriteS3" + policy_description = "Allow lambdas to write to retrofit energy assessments bucket" + bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] + actions = ["s3:PutObject", "s3:AbortMultipartUpload"] + resource_paths = ["/*"] +} + +output "energy_assessments_s3_write_arn" { + value = module.energy_assessments_s3_write.policy_arn +} + + # Set up the route53 record for the API module "route53" { @@ -568,18 +583,19 @@ module "pashub_to_ara_registry" { stage = var.stage } -module "energy_assessments_s3_write" { +#### TEMP - need to unattach from entities before this can be delete #### +module "pashub_to_ara_s3_write" { source = "../modules/s3_iam_policy" - policy_name = "EnergyAssessmentsWriteS3" - policy_description = "Allow lambdas to write to retrofit energy assessments bucket" + policy_name = "PashubToAraWriteS3" + policy_description = "Allow PasHub to ARA Lambda to write to retrofit energy assessments bucket" bucket_arns = 
["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] actions = ["s3:PutObject", "s3:AbortMultipartUpload"] resource_paths = ["/*"] } -output "energy_assessments_s3_write_arn" { - value = module.energy_assessments_s3_write.policy_arn +output "pashub_to_ara_s3_write_arn" { + value = module.pashub_to_ara_s3_write.policy_arn } ################################################ From 18ea95b67d3e15c41dbe57fa4228bd21a762719b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 12:34:17 +0000 Subject: [PATCH 075/106] added env variables for boto --- .../backend/install-claude-skills.sh | 14 ------ .github/workflows/unit_tests.yml | 3 ++ backend/address2UPRN/tests/test_csv.py | 47 ++++--------------- 3 files changed, 13 insertions(+), 51 deletions(-) delete mode 100755 .devcontainer/backend/install-claude-skills.sh diff --git a/.devcontainer/backend/install-claude-skills.sh b/.devcontainer/backend/install-claude-skills.sh deleted file mode 100755 index a54f69e0..00000000 --- a/.devcontainer/backend/install-claude-skills.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash -# Run this in an existing container to install the mattpocock skills -# without rebuilding the image. New containers get them automatically via Dockerfile. -set -euo pipefail - -echo "Installing Claude Code skills (mattpocock/skills)..." - -npx skills@latest add --global --yes mattpocock/skills/grill-me -npx skills@latest add --global --yes mattpocock/skills/to-prd -npx skills@latest add --global --yes mattpocock/skills/ubiquitous-language -npx skills@latest add --global --yes mattpocock/skills/tdd -npx skills@latest add --global --yes mattpocock/skills/improve-codebase-architecture - -echo "Done. 
Available: /grill-me /to-prd /ubiquitous-language /tdd /improve-codebase-architecture" diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index e1f4fb48..fa4fdf2a 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -51,6 +51,9 @@ jobs: -e EPC_AUTH_TOKEN=${{ secrets.DEV_EPC_AUTH_TOKEN }} \ -e OPEN_EPC_API_TOKEN=${{ secrets.DEV_OPEN_EPC_API_TOKEN }} \ -e HUBSPOT_API_KEY=${{ secrets.HUBSPOT_API_KEY }} \ + -e AWS_ACCESS_KEY_ID=${{ secrets.DEV_AWS_ACCESS_KEY_ID }} \ + -e AWS_SECRET_ACCESS_KEY=${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} \ + -e AWS_DEFAULT_REGION=${{ secrets.DEV_AWS_REGION }} \ -e DB_HOST=localhost \ -e DB_NAME=test \ -e DB_USERNAME=test \ diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py index d8f54c39..70e7a9f9 100644 --- a/backend/address2UPRN/tests/test_csv.py +++ b/backend/address2UPRN/tests/test_csv.py @@ -1,54 +1,25 @@ # tests/test_address_to_uprn_csv.py import csv -import json import pytest -from datetime import date from pathlib import Path from backend.address2UPRN.main import get_uprn FIXTURE_PATH = Path(__file__).parent / "test_data.csv" -SIDECAR_PATH = Path(__file__).parent / "test_lodgement_dates.json" -NEW_API_CUTOFF = date(2012, 1, 1) - - -def _load_sidecar() -> dict: - if SIDECAR_PATH.exists(): - return json.loads(SIDECAR_PATH.read_text()) - return {} def load_test_cases(): - sidecar = _load_sidecar() with open(FIXTURE_PATH, newline="", encoding="utf-8") as f: reader = csv.DictReader(f) - cases = [] - for row in reader: - key = f"{row['User Input']}|{row['Postcode']}" - entry = sidecar.get(key, {}) - lodgement_date = entry.get("lodgement_date") - - marks = [] - if lodgement_date: - parsed = date.fromisoformat(lodgement_date[:10]) - if parsed < NEW_API_CUTOFF: - marks.append( - pytest.mark.xfail( - reason=f"EPC lodged {lodgement_date} — predates new API coverage (Jan 2012)", - strict=False, - ) - ) - - cases.append( - pytest.param( - 
row["User Input"], - row["Postcode"], - row["Manual UPRN Code"], - id=f'{row["User Input"]} [{row["Postcode"]}]', - marks=marks, - ) + return [ + pytest.param( + row["User Input"], + row["Postcode"], + row["Manual UPRN Code"], + id=f'{row["User Input"]} [{row["Postcode"]}]', ) - return cases + for row in reader + ] @pytest.mark.parametrize( @@ -60,6 +31,8 @@ def test_uprn_resolution_matches_manual( postcode: str, expected_uprn: str, ): + from utils.logger import setup_logger + uprn = get_uprn(user_input, postcode) if uprn: assert uprn == expected_uprn From 5cd21d85224ab84a84e2ebbecbcd7a117c15dbc3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 12:55:50 +0000 Subject: [PATCH 076/106] get rid of khalim's json --- .../tests/test_lodgement_dates.json | 1230 ----------------- 1 file changed, 1230 deletions(-) delete mode 100644 backend/address2UPRN/tests/test_lodgement_dates.json diff --git a/backend/address2UPRN/tests/test_lodgement_dates.json b/backend/address2UPRN/tests/test_lodgement_dates.json deleted file mode 100644 index c58be704..00000000 --- a/backend/address2UPRN/tests/test_lodgement_dates.json +++ /dev/null @@ -1,1230 +0,0 @@ -{ - "47 The Fairway|OX16 0RR": { - "lodgement_date": "2010-03-16", - "found_in_old_api": true - }, - "11 REGENT COURT|SL1 3LG": { - "lodgement_date": "2022-05-04", - "found_in_old_api": true - }, - "3/137a Windmill Road|TW8 9NH": { - "lodgement_date": "2025-01-30", - "found_in_old_api": true - }, - "Flat 33|SW18 4BE": { - "lodgement_date": "2022-04-27", - "found_in_old_api": true - }, - "FLAT 1 Brendon Grove|N2 8JE": { - "lodgement_date": "2011-02-17", - "found_in_old_api": true - }, - "Flat 15|KT8 2NE": { - "lodgement_date": "2018-03-26", - "found_in_old_api": true - }, - "FLAT 5 Stonehill Road|W4 3AH": { - "lodgement_date": "2025-09-22", - "found_in_old_api": true - }, - "Flat 10|W4 3AH": { - "lodgement_date": "2023-06-15", - "found_in_old_api": true - }, - "Flat 11|W4 3AH": { - "lodgement_date": "2023-10-19", - 
"found_in_old_api": true - }, - "Flat 12, Forbes House|W4 3AH": { - "lodgement_date": "2023-10-04", - "found_in_old_api": true - }, - "Flat 13|W4 3AH": { - "lodgement_date": "2012-05-14", - "found_in_old_api": true - }, - "Flat 14|W4 3AH": { - "lodgement_date": "2022-10-15", - "found_in_old_api": true - }, - "Flat 15|W4 3AH": { - "lodgement_date": "2009-08-25", - "found_in_old_api": true - }, - "Flat 16|W4 3AH": { - "lodgement_date": "2012-05-23", - "found_in_old_api": true - }, - "Flat 17|W4 3AH": { - "lodgement_date": "2023-08-31", - "found_in_old_api": true - }, - "Flat 19|W4 3AH": { - "lodgement_date": "2025-07-16", - "found_in_old_api": true - }, - "Flat 20|W4 3AH": { - "lodgement_date": "2024-10-27", - "found_in_old_api": true - }, - "Flat 21|W4 3AH": { - "lodgement_date": "2023-08-08", - "found_in_old_api": true - }, - "Flat 22|W4 3AH": { - "lodgement_date": "2022-10-15", - "found_in_old_api": true - }, - "Flat 23|W4 3AH": { - "lodgement_date": "2022-10-15", - "found_in_old_api": true - }, - "Flat 24|W4 3AH": { - "lodgement_date": "2024-01-12", - "found_in_old_api": true - }, - "10 Douglas Court|SL7 1UQ": { - "lodgement_date": "2018-10-25", - "found_in_old_api": true - }, - "1 Windmill Road|HP17 8JA": { - "lodgement_date": "2009-08-25", - "found_in_old_api": true - }, - "31 Denewood|HP13 7LH": { - "lodgement_date": "2009-03-23", - "found_in_old_api": true - }, - "10, Greenways Drive|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "Flat 11|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "12, Greenways Drive|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "Flat 13|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "Flat 14|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "Flat 15|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "Flat 16|TW4 5DD": { - "lodgement_date": 
"2025-02-26", - "found_in_old_api": true - }, - "Flat 17|TW4 5DD": { - "lodgement_date": "2012-11-29", - "found_in_old_api": true - }, - "Flat 18|TW4 5DD": { - "lodgement_date": "2022-12-28", - "found_in_old_api": true - }, - "FLAT 1 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 2 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 3 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 4 Goodstone Court|HA1 4FL": { - "lodgement_date": "2022-12-14", - "found_in_old_api": true - }, - "FLAT 5 Goodstone Court|HA1 4FL": { - "lodgement_date": "2016-10-04", - "found_in_old_api": true - }, - "FLAT 6 Goodstone Court|HA1 4FL": { - "lodgement_date": "2024-06-05", - "found_in_old_api": true - }, - "FLAT 7 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 8 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 9 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 10 Goodstone Court|HA1 4FL": { - "lodgement_date": "2023-09-21", - "found_in_old_api": true - }, - "FLAT 11 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 12 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 13 Goodstone Court|HA1 4FL": { - "lodgement_date": "2022-12-13", - "found_in_old_api": true - }, - "FLAT 14 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 15 Goodstone Court|HA1 4FL": { - "lodgement_date": "2024-02-09", - "found_in_old_api": true - }, - "FLAT 16 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 17 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, 
- "FLAT 18 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 19 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 20 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 21 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 22 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 23 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 24 Goodstone Court|HA1 4FL": { - "lodgement_date": "2024-10-24", - "found_in_old_api": true - }, - "FLAT 25 Goodstone Court|HA1 4FL": { - "lodgement_date": "2020-01-18", - "found_in_old_api": true - }, - "FLAT 26 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 27 Goodstone Court|HA1 4FL": { - "lodgement_date": "2022-11-04", - "found_in_old_api": true - }, - "FLAT 28 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 29 Goodstone Court|HA1 4FL": { - "lodgement_date": "2023-10-13", - "found_in_old_api": true - }, - "FLAT 30 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 31 Goodstone Court|HA1 4FL": { - "lodgement_date": "2023-04-19", - "found_in_old_api": true - }, - "FLAT 32 Goodstone Court|HA1 4FL": { - "lodgement_date": "2025-11-18", - "found_in_old_api": true - }, - "FLAT 33 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 34 Goodstone Court|HA1 4FL": { - "lodgement_date": "2022-09-19", - "found_in_old_api": true - }, - "FLAT 35 Goodstone Court|HA1 4FL": { - "lodgement_date": "2021-10-13", - "found_in_old_api": true - }, - "FLAT 36 Goodstone Court|HA1 4FL": { - "lodgement_date": "2022-10-12", - "found_in_old_api": true - }, - 
"FLAT 37 Goodstone Court|HA1 4FL": { - "lodgement_date": "2024-08-26", - "found_in_old_api": true - }, - "FLAT 38 Goodstone Court|HA1 4FL": { - "lodgement_date": "2023-05-26", - "found_in_old_api": true - }, - "FLAT 39 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 40 Goodstone Court|HA1 4FL": { - "lodgement_date": "2023-10-05", - "found_in_old_api": true - }, - "FLAT 41 Goodstone Court|HA1 4FL": { - "lodgement_date": "2025-11-24", - "found_in_old_api": true - }, - "FLAT 42 Goodstone Court|HA1 4FL": { - "lodgement_date": "2012-11-06", - "found_in_old_api": true - }, - "FLAT 43 Goodstone Court|HA1 4FL": { - "lodgement_date": "2025-07-08", - "found_in_old_api": true - }, - "30c, Bosanquet Close|UB8 3PE": { - "lodgement_date": "2019-05-27", - "found_in_old_api": true - }, - "30e, Bosanquet Close|UB8 3PE": { - "lodgement_date": "2024-07-30", - "found_in_old_api": true - }, - "13 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2025-07-05", - "found_in_old_api": true - }, - "14 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2012-07-18", - "found_in_old_api": true - }, - "15 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2012-06-11", - "found_in_old_api": true - }, - "16 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2022-07-01", - "found_in_old_api": true - }, - "17 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2025-01-07", - "found_in_old_api": true - }, - "18 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2012-07-18", - "found_in_old_api": true - }, - "19 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2025-03-22", - "found_in_old_api": true - }, - "20 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2022-08-15", - "found_in_old_api": true - }, - "21 Stubwick Court, Old Saw Mill Place|HP6 6FF": { - "lodgement_date": "2012-07-18", - "found_in_old_api": 
true - }, - "90a Murray Road|W5 4DA": { - "lodgement_date": "2013-12-12", - "found_in_old_api": true - }, - "Flat 1, 6 Wolverton Gardens|W5 3LJ": { - "lodgement_date": "2017-10-13", - "found_in_old_api": true - }, - "1, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "10, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "20, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "2, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "3, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "4, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "5, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "6, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "7, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "8, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "9, Monsted House|UB1 1FG": { - "lodgement_date": "2019-02-08", - "found_in_old_api": true - }, - "1 Cullis House, 1, Accolade Avenue|UB1 1FH": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "2 Cullis House, 1, Accolade Avenue|UB1 1FH": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "3 Cullis House, 1, Accolade Avenue|UB1 1FH": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "4 Cullis House, 1, Accolade Avenue|UB1 1FH": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "5 Cullis House, 1, Accolade Avenue|UB1 1FH": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "6 Cullis House, 1, Accolade Avenue|UB1 1FH": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "1 
Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-10", - "found_in_old_api": true - }, - "2 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-10", - "found_in_old_api": true - }, - "3 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "4 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "5 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "6 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "7 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "8 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "9 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "10 Genteel House Samara Drive|UB1 1FJ": { - "lodgement_date": "2019-05-13", - "found_in_old_api": true - }, - "Flat 1 Ash Tree House, 2, Thompson Avenue|SE5 0TE": { - "lodgement_date": "2018-09-05", - "found_in_old_api": true - }, - "Flat 3 ASH TREE HOUSE|SE5 0TE": { - "lodgement_date": "2018-09-05", - "found_in_old_api": true - }, - "Flat 5 ASH TREE HOUSE|SE5 0TE": { - "lodgement_date": "2019-09-12", - "found_in_old_api": true - }, - "Flat 8 ASH TREE HOUSE|SE5 0TE": { - "lodgement_date": "2011-10-26", - "found_in_old_api": true - }, - "Flat 12 ASH TREE HOUSE|SE5 0TE": { - "lodgement_date": "2018-09-05", - "found_in_old_api": true - }, - "FLAT 1 599 HARROW ROAD|W10 4RA": { - "lodgement_date": "2017-01-12", - "found_in_old_api": true - }, - "FLAT 2 599 HARROW ROAD|W10 4RA": { - "lodgement_date": "2020-07-28", - "found_in_old_api": true - }, - "FLAT 5 599 HARROW ROAD|W10 4RA": { - "lodgement_date": "2017-01-12", - "found_in_old_api": true - }, - "Flat 1, Ohio Building|SE13 7RX": { - 
"lodgement_date": "2023-08-15", - "found_in_old_api": true - }, - "Flat 2, Ohio Building|SE13 7RX": { - "lodgement_date": "2017-06-09", - "found_in_old_api": true - }, - "Apartment 1 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2017-01-05", - "found_in_old_api": true - }, - "Apartment 2 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2014-01-22", - "found_in_old_api": true - }, - "Apartment 3 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 4 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2017-01-05", - "found_in_old_api": true - }, - "Apartment 5 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 6 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 7 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2022-10-24", - "found_in_old_api": true - }, - "Apartment 8 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 9 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 10 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 11 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2011-08-17", - "found_in_old_api": true - }, - "Apartment 12 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 13 Block B, 105, Benwell Road|N7 7BW": { - "lodgement_date": "2009-02-25", - "found_in_old_api": true - }, - "Apartment 1 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2023-07-19", - "found_in_old_api": true - }, - "Apartment 2 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2022-10-20", - "found_in_old_api": true - }, - "Apartment 3 Block D, 32, Hornsey Road|N7 
7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 4 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 5 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 6 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2012-11-08", - "found_in_old_api": true - }, - "Apartment 7 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2015-08-30", - "found_in_old_api": true - }, - "Apartment 8 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2020-08-02", - "found_in_old_api": true - }, - "Apartment 9 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2021-10-12", - "found_in_old_api": true - }, - "Apartment 10 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 11 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 12 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2022-02-22", - "found_in_old_api": true - }, - "Apartment 13 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 14 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 15 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 16 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2009-05-15", - "found_in_old_api": true - }, - "Apartment 17Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2019-01-22", - "found_in_old_api": true - }, - "Apartment 18 Block D, 32, Hornsey Road|N7 7AT": { - "lodgement_date": "2013-06-03", - "found_in_old_api": true - }, - "FLAT B 158 LEAHURST ROAD|SE13 5NL": { - "lodgement_date": "2014-01-24", - "found_in_old_api": true - }, - "2 COLLEGE HOUSE|CM7 1JS": { - 
"lodgement_date": "2017-01-12", - "found_in_old_api": true - }, - "3 COLLEGE HOUSE|CM7 1JS": { - "lodgement_date": "2017-01-12", - "found_in_old_api": true - }, - "2 Anita Street|M4 5DU": { - "lodgement_date": "2019-10-18", - "found_in_old_api": true - }, - "5 Anita Street|M4 5DU": { - "lodgement_date": "2012-12-21", - "found_in_old_api": true - }, - "6 Anita Street|M4 5DU": { - "lodgement_date": "2021-02-16", - "found_in_old_api": true - }, - "10 Anita Street|M4 5DU": { - "lodgement_date": "2021-07-01", - "found_in_old_api": true - }, - "12 Anita Street|M4 5DU": { - "lodgement_date": "2025-08-08", - "found_in_old_api": true - }, - "26 Anita Street|M4 5DU": { - "lodgement_date": "2010-06-25", - "found_in_old_api": true - }, - "33 Anita Street|M4 5DU": { - "lodgement_date": "2017-03-10", - "found_in_old_api": true - }, - "35 Anita Street|M4 5DU": { - "lodgement_date": "2015-11-18", - "found_in_old_api": true - }, - "36 Anita Street|M4 5DU": { - "lodgement_date": "2013-09-12", - "found_in_old_api": true - }, - "23 George Leigh Street|M4 5DR": { - "lodgement_date": "2025-03-11", - "found_in_old_api": true - }, - "35 George Leigh Street|M4 5DR": { - "lodgement_date": "2024-05-29", - "found_in_old_api": true - }, - "39 George Leigh Street|M4 5DR": { - "lodgement_date": "2024-05-28", - "found_in_old_api": true - }, - "51 George Leigh Street|M4 5DR": { - "lodgement_date": "2022-02-03", - "found_in_old_api": true - }, - "1a, Victoria Square|M4 5DX": { - "lodgement_date": "2016-01-08", - "found_in_old_api": true - }, - "4a, Victoria Square|M4 5DX": { - "lodgement_date": "2012-09-19", - "found_in_old_api": true - }, - "5a Victoria Square|M4 5DX": { - "lodgement_date": "2012-06-25", - "found_in_old_api": true - }, - " 6a Victoria Square|M4 5DX": { - "lodgement_date": "2023-02-13", - "found_in_old_api": true - }, - "7a Victoria Square|M4 5DX": { - "lodgement_date": "2017-03-15", - "found_in_old_api": true - }, - "8a Victoria Square|M4 5DX": { - "lodgement_date": "2019-11-25", 
- "found_in_old_api": true - }, - "9a Victoria Square|M4 5DX": { - "lodgement_date": "2026-02-24", - "found_in_old_api": true - }, - "10a Victoria Square|M4 5DX": { - "lodgement_date": "2013-10-16", - "found_in_old_api": true - }, - "11a Victoria Square|M4 5DX": { - "lodgement_date": "2015-11-06", - "found_in_old_api": true - }, - "12a Victoria Square|M4 5DX": { - "lodgement_date": "2022-11-08", - "found_in_old_api": true - }, - "13a Victoria Square|M4 5DX": { - "lodgement_date": "2025-04-27", - "found_in_old_api": true - }, - "14a Victoria Square|M4 5DX": { - "lodgement_date": "2010-11-09", - "found_in_old_api": true - }, - "15a Victoria Square|M4 5DX": { - "lodgement_date": "2012-03-26", - "found_in_old_api": true - }, - "16a Victoria Square|M4 5DX": { - "lodgement_date": "2009-05-28", - "found_in_old_api": true - }, - "17a Victoria Square|M4 5DX": { - "lodgement_date": "2012-12-20", - "found_in_old_api": true - }, - "18a Victoria Square|M4 5DX": { - "lodgement_date": "2022-07-21", - "found_in_old_api": true - }, - "19a Victoria Square|M4 5DX": { - "lodgement_date": "2009-08-18", - "found_in_old_api": true - }, - "20a Victoria Square|M4 5DX": { - "lodgement_date": "2014-05-27", - "found_in_old_api": true - }, - "21a Victoria Square|M4 5DY": { - "lodgement_date": "2010-04-08", - "found_in_old_api": true - }, - "23a Victoria Square|M4 5DY": { - "lodgement_date": "2016-04-05", - "found_in_old_api": true - }, - "24a Victoria Square|M4 5DY": { - "lodgement_date": "2022-03-23", - "found_in_old_api": true - }, - "25a Victoria Square|M4 5DY": { - "lodgement_date": "2024-10-13", - "found_in_old_api": true - }, - "26a Victoria Square|M4 5DY": { - "lodgement_date": "2024-03-25", - "found_in_old_api": true - }, - "27a Victoria Square|M4 5DY": { - "lodgement_date": "2009-10-05", - "found_in_old_api": true - }, - "29a Victoria Square|M4 5DY": { - "lodgement_date": "2024-05-27", - "found_in_old_api": true - }, - "30a Victoria Square|M4 5DY": { - "lodgement_date": "2011-09-07", 
- "found_in_old_api": true - }, - "31a Victoria Square|M4 5DY": { - "lodgement_date": "2010-12-09", - "found_in_old_api": true - }, - "32a Victoria Square|M4 5DY": { - "lodgement_date": "2021-02-17", - "found_in_old_api": true - }, - "33a Victoria Square|M4 5DY": { - "lodgement_date": "2011-04-05", - "found_in_old_api": true - }, - "34a Victoria Square|M4 5DY": { - "lodgement_date": "2021-08-13", - "found_in_old_api": true - }, - "36a Victoria Square|M4 5DY": { - "lodgement_date": "2011-04-05", - "found_in_old_api": true - }, - "37a Victoria Square|M4 5DY": { - "lodgement_date": "2018-07-02", - "found_in_old_api": true - }, - "38a Victoria Square|M4 5DY": { - "lodgement_date": "2010-02-02", - "found_in_old_api": true - }, - "39a Victoria Square|M4 5DY": { - "lodgement_date": "2018-01-04", - "found_in_old_api": true - }, - "41a Victoria Square|M4 5DY": { - "lodgement_date": "2011-05-23", - "found_in_old_api": true - }, - "42a Victoria Square|M4 5DY": { - "lodgement_date": "2010-10-14", - "found_in_old_api": true - }, - "43a Victoria Square|M4 5DY": { - "lodgement_date": "2018-10-11", - "found_in_old_api": true - }, - "44a Victoria Square|M4 5DY": { - "lodgement_date": "2010-06-08", - "found_in_old_api": true - }, - "45a Victoria Square|M4 5DY": { - "lodgement_date": "2023-03-08", - "found_in_old_api": true - }, - "46a Victoria Square|M4 5DY": { - "lodgement_date": "2010-12-09", - "found_in_old_api": true - }, - "47a Victoria Square|M4 5DY": { - "lodgement_date": "2010-02-09", - "found_in_old_api": true - }, - "48a Victoria Square|M4 5DY": { - "lodgement_date": "2011-04-12", - "found_in_old_api": true - }, - "49a Victoria Square|M4 5DY": { - "lodgement_date": "2010-11-09", - "found_in_old_api": true - }, - "50a Victoria Square|M4 5DY": { - "lodgement_date": "2025-09-06", - "found_in_old_api": true - }, - "51a Victoria Square|M4 5DY": { - "lodgement_date": "2009-10-05", - "found_in_old_api": true - }, - "52a Victoria Square|M4 5DY": { - "lodgement_date": "2010-12-17", 
- "found_in_old_api": true - }, - "53a Victoria Square|M4 5DY": { - "lodgement_date": "2022-11-10", - "found_in_old_api": true - }, - "54a Victoria Square|M4 5DY": { - "lodgement_date": "2021-01-08", - "found_in_old_api": true - }, - "55a Victoria Square|M4 5DY": { - "lodgement_date": "2009-08-18", - "found_in_old_api": true - }, - "56a Victoria Square|M4 5DZ": { - "lodgement_date": "2019-03-15", - "found_in_old_api": true - }, - "58a Victoria Square|M4 5DZ": { - "lodgement_date": "2018-11-14", - "found_in_old_api": true - }, - "59a Victoria Square|M4 5DZ": { - "lodgement_date": "2013-11-26", - "found_in_old_api": true - }, - "60a Victoria Square|M4 5DZ": { - "lodgement_date": "2024-06-12", - "found_in_old_api": true - }, - "61a Victoria Square|M4 5DZ": { - "lodgement_date": "2024-08-05", - "found_in_old_api": true - }, - "62a Victoria Square|M4 5DZ": { - "lodgement_date": "2013-05-24", - "found_in_old_api": true - }, - "64a Victoria Square|M4 5DZ": { - "lodgement_date": "2021-07-29", - "found_in_old_api": true - }, - "65a Victoria Square|M4 5DZ": { - "lodgement_date": "2011-08-26", - "found_in_old_api": true - }, - "68a Victoria Square|M4 5DZ": { - "lodgement_date": "2022-03-29", - "found_in_old_api": true - }, - "69a Victoria Square|M4 5DZ": { - "lodgement_date": "2011-01-19", - "found_in_old_api": true - }, - "70a Victoria Square|M4 5DZ": { - "lodgement_date": "2011-07-27", - "found_in_old_api": true - }, - "71a Victoria Square|M4 5DZ": { - "lodgement_date": "2016-11-22", - "found_in_old_api": true - }, - "72a Victoria Square|M4 5DZ": { - "lodgement_date": "2019-01-07", - "found_in_old_api": true - }, - "73a Victoria Square|M4 5DZ": { - "lodgement_date": "2014-07-25", - "found_in_old_api": true - }, - "75a Victoria Square|M4 5DZ": { - "lodgement_date": "2016-01-20", - "found_in_old_api": true - }, - "76a Victoria Square|M4 5DZ": { - "lodgement_date": "2018-01-26", - "found_in_old_api": true - }, - "78a Victoria Square|M4 5DZ": { - "lodgement_date": "2011-06-02", 
- "found_in_old_api": true - }, - "79a Victoria Square|M4 5DZ": { - "lodgement_date": "2022-01-26", - "found_in_old_api": true - }, - "80a Victoria Square|M4 5DZ": { - "lodgement_date": "2018-11-05", - "found_in_old_api": true - }, - "81a Victoria Square|M4 5DZ": { - "lodgement_date": "2017-03-05", - "found_in_old_api": true - }, - "83a Victoria Square|M4 5DZ": { - "lodgement_date": "2012-05-01", - "found_in_old_api": true - }, - "85a Victoria Square|M4 5DZ": { - "lodgement_date": "2009-10-21", - "found_in_old_api": true - }, - "86a Victoria Square|M4 5DZ": { - "lodgement_date": "2024-05-29", - "found_in_old_api": true - }, - "87a Victoria Square|M4 5DZ": { - "lodgement_date": "2025-07-13", - "found_in_old_api": true - }, - "89a Victoria Square|M4 5DZ": { - "lodgement_date": "2016-05-12", - "found_in_old_api": true - }, - "90a Victoria Square|M4 5DZ": { - "lodgement_date": "2012-05-09", - "found_in_old_api": true - }, - "91a Victoria Square|M4 5DZ": { - "lodgement_date": "2025-04-30", - "found_in_old_api": true - }, - "92a Victoria Square|M4 5DZ": { - "lodgement_date": "2021-07-29", - "found_in_old_api": true - }, - "93a Victoria Square|M4 5EA": { - "lodgement_date": "2013-02-26", - "found_in_old_api": true - }, - "95a Victoria Square|M4 5EA": { - "lodgement_date": "2020-09-06", - "found_in_old_api": true - }, - "96a Victoria Square|M4 5EA": { - "lodgement_date": "2022-06-30", - "found_in_old_api": true - }, - "97a Victoria Square|M4 5EA": { - "lodgement_date": "2016-09-05", - "found_in_old_api": true - }, - "98a Victoria Square|M4 5EA": { - "lodgement_date": "2019-12-19", - "found_in_old_api": true - }, - "99a Victoria Square|M4 5EA": { - "lodgement_date": "2009-03-05", - "found_in_old_api": true - }, - "100a Victoria Square|M4 5EA": { - "lodgement_date": "2011-03-31", - "found_in_old_api": true - }, - "103a Victoria Square|M4 5EA": { - "lodgement_date": "2009-03-05", - "found_in_old_api": true - }, - "104a Victoria Square|M4 5EA": { - "lodgement_date": 
"2010-01-21", - "found_in_old_api": true - }, - "106a Victoria Square|M4 5EA": { - "lodgement_date": "2015-12-10", - "found_in_old_api": true - }, - "107a Victoria Square|M4 5EA": { - "lodgement_date": "2013-07-01", - "found_in_old_api": true - }, - "108a Victoria Square|M4 5EA": { - "lodgement_date": "2023-03-01", - "found_in_old_api": true - }, - "109a Victoria Square|M4 5EA": { - "lodgement_date": "2010-03-24", - "found_in_old_api": true - }, - "110a Victoria Square|M4 5EA": { - "lodgement_date": "2019-02-25", - "found_in_old_api": true - }, - "111a Victoria Square|M4 5EA": { - "lodgement_date": "2010-02-01", - "found_in_old_api": true - }, - "113a Victoria Square|M4 5EA": { - "lodgement_date": "2012-11-21", - "found_in_old_api": true - }, - "114a Victoria Square|M4 5EA": { - "lodgement_date": "2013-12-06", - "found_in_old_api": true - }, - "115a Victoria Square|M4 5EA": { - "lodgement_date": "2022-08-25", - "found_in_old_api": true - }, - "116a Victoria Square|M4 5EA": { - "lodgement_date": "2011-02-25", - "found_in_old_api": true - }, - "119a Victoria Square|M4 5EA": { - "lodgement_date": "2024-04-12", - "found_in_old_api": true - }, - "120a Victoria Square|M4 5EA": { - "lodgement_date": "2011-04-04", - "found_in_old_api": true - }, - "121a Victoria Square|M4 5EA": { - "lodgement_date": "2010-11-09", - "found_in_old_api": true - }, - "122a Victoria Square|M4 5EA": { - "lodgement_date": "2012-05-01", - "found_in_old_api": true - }, - "123a Victoria Square|M4 5EA": { - "lodgement_date": "2022-01-12", - "found_in_old_api": true - }, - "125a Victoria Square|M4 5EA": { - "lodgement_date": "2023-11-22", - "found_in_old_api": true - }, - "126a Victoria Square|M4 5EA": { - "lodgement_date": "2010-08-24", - "found_in_old_api": true - }, - "127a Victoria Square|M4 5EA": { - "lodgement_date": "2020-03-01", - "found_in_old_api": true - }, - "128a Victoria Square|M4 5EA": { - "lodgement_date": "2015-02-04", - "found_in_old_api": true - }, - "129a Victoria Square|M4 5EA": { 
- "lodgement_date": "2010-07-07", - "found_in_old_api": true - }, - "130a Victoria Square|M4 5FA": { - "lodgement_date": "2026-02-11", - "found_in_old_api": true - }, - "131a Victoria Square|M4 5FA": { - "lodgement_date": "2025-05-29", - "found_in_old_api": true - }, - "132a Victoria Square|M4 5FA": { - "lodgement_date": "2019-12-24", - "found_in_old_api": true - }, - "134a Victoria Square|M4 5FA": { - "lodgement_date": "2011-08-18", - "found_in_old_api": true - }, - "135a Victoria Square|M4 5FA": { - "lodgement_date": "2019-09-05", - "found_in_old_api": true - }, - "136a Victoria Square|M4 5FA": { - "lodgement_date": "2025-02-14", - "found_in_old_api": true - }, - "137a Victoria Square|M4 5FA": { - "lodgement_date": "2024-07-17", - "found_in_old_api": true - }, - "138a Victoria Square|M4 5FA": { - "lodgement_date": "2023-10-11", - "found_in_old_api": true - }, - "139a Victoria Square|M4 5FA": { - "lodgement_date": "2021-06-22", - "found_in_old_api": true - }, - "140a Victoria Square|M4 5FA": { - "lodgement_date": "2020-06-15", - "found_in_old_api": true - }, - "141a Victoria Square|M4 5FA": { - "lodgement_date": "2025-12-22", - "found_in_old_api": true - }, - "142a Victoria Square|M4 5FA": { - "lodgement_date": "2025-12-22", - "found_in_old_api": true - }, - "143a Victoria Square|M4 5FA": { - "lodgement_date": "2023-01-18", - "found_in_old_api": true - }, - "144a Victoria Square|M4 5FA": { - "lodgement_date": "2011-04-04", - "found_in_old_api": true - }, - "146a Victoria Square|M4 5FA": { - "lodgement_date": "2022-09-21", - "found_in_old_api": true - }, - "147a Victoria Square|M4 5FA": { - "lodgement_date": "2011-05-04", - "found_in_old_api": true - }, - "148a Victoria Square|M4 5FA": { - "lodgement_date": "2014-11-18", - "found_in_old_api": true - }, - "149a Victoria Square|M4 5FA": { - "lodgement_date": "2009-12-14", - "found_in_old_api": true - }, - "150a Victoria Square|M4 5FA": { - "lodgement_date": "2009-12-14", - "found_in_old_api": true - }, - "152a 
Victoria Square|M4 5FA": { - "lodgement_date": "2017-06-23", - "found_in_old_api": true - }, - "154a Victoria Square|M4 5FA": { - "lodgement_date": "2025-04-29", - "found_in_old_api": true - }, - "156a Victoria Square|M4 5FA": { - "lodgement_date": "2011-04-05", - "found_in_old_api": true - }, - "157a Victoria Square|M4 5FA": { - "lodgement_date": "2023-09-11", - "found_in_old_api": true - }, - "158a Victoria Square|M4 5FA": { - "lodgement_date": "2021-12-07", - "found_in_old_api": true - }, - "160a Victoria Square|M4 5FA": { - "lodgement_date": "2011-02-04", - "found_in_old_api": true - }, - "163a Victoria Square|M4 5FA": { - "lodgement_date": "2010-02-02", - "found_in_old_api": true - }, - "164a Victoria Square|M4 5FA": { - "lodgement_date": "2020-10-19", - "found_in_old_api": true - }, - "165a Victoria Square|M4 5FA": { - "lodgement_date": "2019-12-13", - "found_in_old_api": true - } -} \ No newline at end of file From a477561bbc0b7bef41bee12b96df1efa64365261 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 13:03:04 +0000 Subject: [PATCH 077/106] correct tfstate bucket name --- infrastructure/terraform/lambda/hubspot_deal_etl/main.tf | 2 +- infrastructure/terraform/lambda/magic_plan/provider.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index 800dc3b6..ffb5f6f5 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -19,7 +19,7 @@ data "terraform_remote_state" "pashub_to_ara" { data "terraform_remote_state" "magic_plan" { backend = "s3" config = { - bucket = "magic-plan-hubspot-trigger-terraform-state" + bucket = "magic-plan-client-terraform-state" key = "env:/${var.stage}/terraform.tfstate" region = "eu-west-2" } diff --git a/infrastructure/terraform/lambda/magic_plan/provider.tf 
b/infrastructure/terraform/lambda/magic_plan/provider.tf index 9e7020ac..a3dd6a7d 100644 --- a/infrastructure/terraform/lambda/magic_plan/provider.tf +++ b/infrastructure/terraform/lambda/magic_plan/provider.tf @@ -7,7 +7,7 @@ terraform { } backend "s3" { - bucket = "magic-plan-hubspot-trigger-terraform-state" + bucket = "magic-plan-client-terraform-state" key = "terraform.tfstate" region = "eu-west-2" } From 46ec68e5db29d891deef01e130d460f708ff108b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 13:41:59 +0000 Subject: [PATCH 078/106] save match building number --- backend/epc_client/epc_client_service.py | 6 ++--- backend/utils/addressMatch.py | 30 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py index d00a164f..777e8d14 100644 --- a/backend/epc_client/epc_client_service.py +++ b/backend/epc_client/epc_client_service.py @@ -40,8 +40,8 @@ class EpcClientService: return call_with_retry(lambda: self._search(postcode=postcode)) # ------------------------------------------------------------------ - # Private helpers - # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ def _fetch_certificate(self, cert_num: str) -> dict[str, Any]: resp = httpx.get( @@ -52,7 +52,7 @@ class EpcClientService: if resp.status_code == 404: raise EpcNotFoundError(cert_num) if resp.status_code == 429: - raise EpcRateLimitError("Rate limited by EPC API") + raise EpcRateLimitError("Rate limited by EPC API") if not resp.is_success: raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") return resp.json()["data"] diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py
index 7618e9ac..69be6f59 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -101,6 +101,16 @@ class AddressMatch: tokens.append(replacement) return " ".join(tokens) + @staticmethod + def _match_building_number(token: str, next_token: Optional[str]) -> Optional[str]: + if re.fullmatch(r"\d+[a-z]", token): + return token + if re.fullmatch(r"\d+", token): + if next_token is not None and re.fullmatch(r"[a-z]", next_token): + return token + next_token + return token + return None + @staticmethod def levenshtein(a: str, b: str) -> float: """ @@ -146,13 +156,9 @@ class AddressMatch: # first remaining number is building number; recombine with a # single-letter suffix when normalisation has split "82a" → "82 a" for i, t in enumerate(cleaned): - if re.fullmatch(r"\d+[a-z]", t): - return t - if re.fullmatch(r"\d+", t): - nxt = cleaned[i + 1] if i + 1 < len(cleaned) else None - if nxt is not None and re.fullmatch(r"[a-z]", nxt): - return t + nxt - return t + nxt = cleaned[i + 1] if i + 1 < len(cleaned) else None + if (match := AddressMatch._match_building_number(t, nxt)) is not None: + return match return None @@ -259,3 +265,13 @@ def get_uprn_candidates( out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False]) + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """Returns True if all non-null UPRNs in df match the given uprn.""" + if column not in df.columns: + return False + uprns = df[column].dropna().astype(str).str.strip().unique() + if len(uprns) == 0: + return False + return len(uprns) == 1 and uprns[0] == str(uprn) From b0e935d49710137006da287e1f5746f153faed7e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 13:43:12 +0000 Subject: [PATCH 079/106] make sensible naming for column for address column in 
df --- backend/utils/addressMatch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 69be6f59..3a7e7494 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -235,11 +235,11 @@ class AddressMatch: def score_addresses( df: pd.DataFrame, user_address: str, - column: str = "address", + address_column: str = "address", ) -> pd.Series: - if column not in df.columns: - raise ValueError(f"Missing column: {column}") - return df[column].apply(lambda x: AddressMatch.score(user_address, x)) + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + return df[address_column].apply(lambda x: AddressMatch.score(user_address, x)) def get_uprn_candidates( From 27f2ef5e8370cc356fa90a7f38abfb2dfcfd2e7d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 13:46:02 +0000 Subject: [PATCH 080/106] get rid of duplicate function and make better sensible variable name --- backend/address2UPRN/main.py | 11 +++----- backend/address2UPRN/scoring.py | 2 +- backend/utils/addressMatch.py | 25 ------------------- datatypes/epc/domain/historic_epc_matching.py | 4 +-- 4 files changed, 7 insertions(+), 35 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 6006fec1..9c19eca9 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -14,11 +14,8 @@ from utils.s3 import ( ) from datetime import datetime -from backend.utils.addressMatch import ( - AddressMatch, - get_uprn_candidates, -) -from backend.address2UPRN.scoring import all_uprns_match +from backend.utils.addressMatch import AddressMatch +from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) @@ -82,7 +79,7 @@ def get_uprn_with_epc_df( if epc_df.empty: return None - scored_df = get_uprn_candidates( 
+ scored_df = rank_by_address_similarity( epc_df, user_address=user_inputed_address, ) @@ -174,7 +171,7 @@ def resolve_uprns_for_postcode_group( for _, row in group_df.iterrows(): user_address = str(row[address_col]).strip() - scored_df = get_uprn_candidates( + scored_df = rank_by_address_similarity( epc_df, user_address=user_address, ) diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py index bfda2e71..2a681ad2 100644 --- a/backend/address2UPRN/scoring.py +++ b/backend/address2UPRN/scoring.py @@ -19,7 +19,7 @@ def all_uprns_match( return len(uprns) == 1 and uprns[0] == str(target_uprn) -def get_uprn_candidates( +def rank_by_address_similarity( df: pd.DataFrame, user_address: str, address_column: str = "address", diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index 3a7e7494..81896140 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -242,31 +242,6 @@ def score_addresses( return df[address_column].apply(lambda x: AddressMatch.score(user_address, x)) -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - Returns a DataFrame sorted by descending lexiscore. 
- """ - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - user_norm = AddressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) - out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) - return out.sort_values(["lexirank", "lexiscore"], ascending=[True, False]) - - def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: """Returns True if all non-null UPRNs in df match the given uprn.""" if column not in df.columns: diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 95ca9d9f..6ea2118b 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -4,7 +4,7 @@ from typing import Optional import pandas as pd from botocore.exceptions import ClientError -from backend.address2UPRN.scoring import get_uprn_candidates +from backend.address2UPRN.scoring import rank_by_address_similarity from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc from utils.pandas_utils import pandas_cell_to_str @@ -85,7 +85,7 @@ def match_addresses_for_postcode( ) from e raise - scored = get_uprn_candidates( + scored = rank_by_address_similarity( df, user_address=user_address, address_column=address_column, From a672c0dea0dbfb54a35d4d5deeefea7303c93193 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 13:51:46 +0000 Subject: [PATCH 081/106] add localhandler for testing and update requirements --- backend/magic_plan/handler/requirements.txt | 4 +++ .../local_handler/docker-compose.yml | 11 ++++++++ 
.../local_handler/invoke_local_lambda.py | 28 +++++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 backend/magic_plan/local_handler/docker-compose.yml create mode 100644 backend/magic_plan/local_handler/invoke_local_lambda.py diff --git a/backend/magic_plan/handler/requirements.txt b/backend/magic_plan/handler/requirements.txt index cfacf455..29123caa 100644 --- a/backend/magic_plan/handler/requirements.txt +++ b/backend/magic_plan/handler/requirements.txt @@ -5,3 +5,7 @@ sqlmodel psycopg2-binary==2.9.10 pydantic-settings==2.6.0 boto3==1.35.44 + +pytz==2024.2 +pandas==2.2.2 +numpy==2.1.2 diff --git a/backend/magic_plan/local_handler/docker-compose.yml b/backend/magic_plan/local_handler/docker-compose.yml new file mode 100644 index 00000000..5a42d259 --- /dev/null +++ b/backend/magic_plan/local_handler/docker-compose.yml @@ -0,0 +1,11 @@ +version: "3.9" + +services: + ecmk-fetcher-lambda: + build: + context: ../../../ + dockerfile: backend/magic_plan/handler/Dockerfile + ports: + - "9000:8080" + env_file: + - ../../../.env \ No newline at end of file diff --git a/backend/magic_plan/local_handler/invoke_local_lambda.py b/backend/magic_plan/local_handler/invoke_local_lambda.py new file mode 100644 index 00000000..7bb65806 --- /dev/null +++ b/backend/magic_plan/local_handler/invoke_local_lambda.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9000" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + # { + # "address": "2 Laburnum Way, Rombley, BR2 8BZ | Retrofit Assessment", + # "hubspot_deal_id": "500262906061", + # } + {"address": "33 Wallaby Way, Sydney", "hubspot_deal_id": "123456789"} + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) From da4f5f44c0ba871f05cf7da3cc2f070a5398b496 Mon Sep 17 
00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 13:58:16 +0000 Subject: [PATCH 082/106] =?UTF-8?q?Set=20API=20key=20as=20session=20header?= =?UTF-8?q?=20on=20MagicPlanClient=20construction=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/tests/test_magic_plan_client.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index c96b9cdf..cb2385b1 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -20,6 +20,7 @@ def _load_fixture(name: str) -> dict[str, Any]: def _make_client(mock_session: MagicMock) -> MagicPlanClient: + mock_session.headers = {} with patch( "backend.magic_plan.magic_plan_client.requests.Session", return_value=mock_session, @@ -44,7 +45,14 @@ def test_customer_header_set_on_session(mock_session: MagicMock) -> None: # Act _make_client(mock_session) # Assert - mock_session.headers.update.assert_called_once_with({"customer": CUSTOMER_ID}) + assert mock_session.headers["customer"] == CUSTOMER_ID + + +def test_api_key_header_set_on_session(mock_session: MagicMock) -> None: + # Act + _make_client(mock_session) + # Assert + assert mock_session.headers["key"] == API_KEY # --- get_plans --- From d59bf2d7cbf35299128099345a4334d1dc372c94 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 13:59:33 +0000 Subject: [PATCH 083/106] =?UTF-8?q?Set=20API=20key=20as=20session=20header?= =?UTF-8?q?=20on=20MagicPlanClient=20construction=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 06905e6a..4029c436 100644 --- 
a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -9,7 +9,7 @@ class MagicPlanClient: def __init__(self, customer_id: str, api_key: str) -> None: self._api_key = api_key self._session = requests.Session() - self._session.headers.update({"customer": customer_id}) + self._session.headers.update({"customer": customer_id, "key": api_key}) def get_plans(self) -> PlansListResponse: r = self._session.get(f"{_BASE_URL}/plans", params={"key": self._api_key}) From ffcff33dd4434efe98bd2ca04d683b726eb453ba Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:00:07 +0000 Subject: [PATCH 084/106] =?UTF-8?q?get=5Fplans()=20sends=20no=20API=20key?= =?UTF-8?q?=20query=20parameter=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/tests/test_magic_plan_client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index cb2385b1..8b0b3f71 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -70,9 +70,7 @@ def test_get_plans_calls_correct_url( # Act client.get_plans() # Assert - mock_session.get.assert_called_once_with( - f"{BASE_URL}/plans", params={"key": API_KEY} - ) + mock_session.get.assert_called_once_with(f"{BASE_URL}/plans") def test_get_plans_calls_raise_for_status( From 20b32bcda03679d7d0c28f82d53621e4fb092af2 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:01:35 +0000 Subject: [PATCH 085/106] =?UTF-8?q?get=5Fplans()=20sends=20no=20API=20key?= =?UTF-8?q?=20query=20parameter=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_client.py 
b/backend/magic_plan/magic_plan_client.py index 4029c436..bed3dc9c 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -12,7 +12,7 @@ class MagicPlanClient: self._session.headers.update({"customer": customer_id, "key": api_key}) def get_plans(self) -> PlansListResponse: - r = self._session.get(f"{_BASE_URL}/plans", params={"key": self._api_key}) + r = self._session.get(f"{_BASE_URL}/plans") r.raise_for_status() return PlansListResponse.model_validate(r.json()["data"]) From 7752039dbdb5244d12a974e2ce3a17728d25293d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:01:40 +0000 Subject: [PATCH 086/106] =?UTF-8?q?=5Ffetch=5Fplan()=20sends=20no=20API=20?= =?UTF-8?q?key=20query=20parameter=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/tests/test_magic_plan_client.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index 8b0b3f71..a0827bee 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -132,9 +132,7 @@ def test_get_plan_calls_correct_url( # Act client.get_plan(plan_id) # Assert - mock_session.get.assert_called_once_with( - f"{BASE_URL}/plans/{plan_id}", params={"key": API_KEY} - ) + mock_session.get.assert_called_once_with(f"{BASE_URL}/plans/{plan_id}") def test_get_plan_calls_raise_for_status( @@ -204,9 +202,7 @@ def test_get_plan_raw_calls_correct_url( # Act client.get_plan_raw(plan_id) # Assert - mock_session.get.assert_called_once_with( - f"{BASE_URL}/plans/{plan_id}", params={"key": API_KEY} - ) + mock_session.get.assert_called_once_with(f"{BASE_URL}/plans/{plan_id}") def test_get_plan_raw_calls_raise_for_status( From eb381a778c2dc8c2ff2341ff14a4376463e4b5d8 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 
May 2026 14:02:17 +0000 Subject: [PATCH 087/106] =?UTF-8?q?=5Ffetch=5Fplan()=20sends=20no=20API=20?= =?UTF-8?q?key=20query=20parameter=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index bed3dc9c..f9ae030f 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -23,8 +23,6 @@ class MagicPlanClient: return self._fetch_plan(plan_id).content def _fetch_plan(self, plan_id: str) -> requests.Response: - r = self._session.get( - f"{_BASE_URL}/plans/{plan_id}", params={"key": self._api_key} - ) + r = self._session.get(f"{_BASE_URL}/plans/{plan_id}") r.raise_for_status() return r From 3df726937e43c3c2758266144418740ae7636238 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:03:07 +0000 Subject: [PATCH 088/106] =?UTF-8?q?Remove=20unused=20=5Fapi=5Fkey=20instan?= =?UTF-8?q?ce=20variable=20now=20auth=20is=20fully=20header-based=20?= =?UTF-8?q?=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index f9ae030f..2880bf43 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -7,7 +7,6 @@ _BASE_URL = "https://cloud.magicplan.app/api/v2" class MagicPlanClient: def __init__(self, customer_id: str, api_key: str) -> None: - self._api_key = api_key self._session = requests.Session() self._session.headers.update({"customer": customer_id, "key": api_key}) From 8b27a5173bbbe1a348afbe901dd09b2ef6f8a349 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 14:05:30 +0000 Subject: [PATCH 089/106] 
fix typo for rate limit error --- backend/epc_client/epc_client_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py index 777e8d14..abb5b826 100644 --- a/backend/epc_client/epc_client_service.py +++ b/backend/epc_client/epc_client_service.py @@ -52,7 +52,7 @@ class EpcClientService: if resp.status_code == 404: raise EpcNotFoundError(cert_num) if resp.status_code == 429: - raise EpcRateLimpolars vs pandas code examplepolars vs pandas code exampleitError("Rate limited by EPC API") + raise EpcRateLimitError("Rate limited by EPC API") if not resp.is_success: raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") return resp.json()["data"] From 04df924146e336688bd4110111625b3738e3ac21 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:13:12 +0000 Subject: [PATCH 090/106] fix local invoker --- backend/magic_plan/handler.py | 5 +++-- backend/magic_plan/local_handler/invoke_local_lambda.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index f2c03b90..5fd90b7a 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -20,7 +20,9 @@ def handler(body: dict[str, Any], context: Any) -> str: api_key=settings.MAGICPLAN_API_KEY, ) # TODO: read s3_bucket from env var so staging/prod use the correct bucket - plan: Plan = MagicPlanService(client, s3_bucket="retrofit-energy-assessments-dev").run(payload) + plan: Plan = MagicPlanService( + client, s3_bucket="retrofit-energy-assessments-dev" + ).run(payload) logger.info("Saved MagicPlan plan uid=%s", plan.uid) return plan.uid @@ -30,7 +32,6 @@ if __name__ == "__main__": "Records": [ { "body": '{"address": "2 Laburnum Way Bromley BR2 8BZ", "hubspot_deal_id": "local-test-deal"}', - "messageId": "local-test", } ] } diff --git a/backend/magic_plan/local_handler/invoke_local_lambda.py 
b/backend/magic_plan/local_handler/invoke_local_lambda.py index 7bb65806..146951fe 100644 --- a/backend/magic_plan/local_handler/invoke_local_lambda.py +++ b/backend/magic_plan/local_handler/invoke_local_lambda.py @@ -10,13 +10,14 @@ LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" payload = { "Records": [ { + "messageId": "test-message-id", "body": json.dumps( # { # "address": "2 Laburnum Way, Rombley, BR2 8BZ | Retrofit Assessment", # "hubspot_deal_id": "500262906061", # } {"address": "33 Wallaby Way, Sydney", "hubspot_deal_id": "123456789"} - ) + ), } ] } From 75d03139341444ab5694793634f014c903fa6e3a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:14:37 +0000 Subject: [PATCH 091/106] fix broken magicplan handler tests --- backend/magic_plan/tests/test_handler.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/backend/magic_plan/tests/test_handler.py b/backend/magic_plan/tests/test_handler.py index 366f3ded..b0365f5b 100644 --- a/backend/magic_plan/tests/test_handler.py +++ b/backend/magic_plan/tests/test_handler.py @@ -54,7 +54,7 @@ def test_handler_raises_on_missing_address(mock_plan: MagicMock) -> None: def test_handler_constructs_client_from_settings(mock_service: MagicMock) -> None: # Arrange - body = {"address": ADDRESS} + body = {"address": ADDRESS, "hubspot_deal_id": "deal-123"} with patch("backend.magic_plan.handler.get_settings", return_value=_make_settings(customer_id="cust-xyz", api_key="key-xyz")), \ patch("backend.magic_plan.handler.MagicPlanClient") as MockClient, \ patch("backend.magic_plan.handler.MagicPlanService", return_value=mock_service): @@ -69,31 +69,37 @@ def test_handler_constructs_client_from_settings(mock_service: MagicMock) -> Non def test_handler_calls_service_run_with_address(mock_service: MagicMock) -> None: # Arrange - body = {"address": ADDRESS} + body = {"address": ADDRESS, "hubspot_deal_id": "deal-123"} with 
patch("backend.magic_plan.handler.get_settings", return_value=_make_settings()), \ patch("backend.magic_plan.handler.MagicPlanClient"), \ patch("backend.magic_plan.handler.MagicPlanService", return_value=mock_service): # Act _call_handler(body) # Assert - mock_service.run.assert_called_once_with(ADDRESS, None) + mock_service.run.assert_called_once() + request = mock_service.run.call_args.args[0] + assert request.address == ADDRESS + assert request.uprn is None def test_handler_passes_uprn_to_service(mock_service: MagicMock) -> None: # Arrange - body = {"address": ADDRESS, "uprn": "100023336956"} + body = {"address": ADDRESS, "uprn": "100023336956", "hubspot_deal_id": "deal-123"} with patch("backend.magic_plan.handler.get_settings", return_value=_make_settings()), \ patch("backend.magic_plan.handler.MagicPlanClient"), \ patch("backend.magic_plan.handler.MagicPlanService", return_value=mock_service): # Act _call_handler(body) # Assert - mock_service.run.assert_called_once_with(ADDRESS, "100023336956") + mock_service.run.assert_called_once() + request = mock_service.run.call_args.args[0] + assert request.address == ADDRESS + assert request.uprn == "100023336956" def test_handler_returns_plan_uid(mock_service: MagicMock) -> None: # Arrange - body = {"address": ADDRESS} + body = {"address": ADDRESS, "hubspot_deal_id": "deal-123"} with patch("backend.magic_plan.handler.get_settings", return_value=_make_settings()), \ patch("backend.magic_plan.handler.MagicPlanClient"), \ patch("backend.magic_plan.handler.MagicPlanService", return_value=mock_service): From beaf21fdcc4786dcf0b500dd37605e102eaa543f Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:32:37 +0000 Subject: [PATCH 092/106] correct magic plan url paths --- backend/magic_plan/magic_plan_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 2880bf43..34c40695 100644 --- 
a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -11,7 +11,7 @@ class MagicPlanClient: self._session.headers.update({"customer": customer_id, "key": api_key}) def get_plans(self) -> PlansListResponse: - r = self._session.get(f"{_BASE_URL}/plans") + r = self._session.get(f"{_BASE_URL}/workgroups/plans") r.raise_for_status() return PlansListResponse.model_validate(r.json()["data"]) @@ -22,6 +22,6 @@ class MagicPlanClient: return self._fetch_plan(plan_id).content def _fetch_plan(self, plan_id: str) -> requests.Response: - r = self._session.get(f"{_BASE_URL}/plans/{plan_id}") + r = self._session.get(f"{_BASE_URL}/plans/get/{plan_id}") r.raise_for_status() return r From 8727a78f8bb0719061cc075ba49dc264ea90f9cd Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:33:58 +0000 Subject: [PATCH 093/106] =?UTF-8?q?correct=20magic=20plan=20url=20paths=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/tests/test_magic_plan_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index a0827bee..27d4ebad 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -70,7 +70,7 @@ def test_get_plans_calls_correct_url( # Act client.get_plans() # Assert - mock_session.get.assert_called_once_with(f"{BASE_URL}/plans") + mock_session.get.assert_called_once_with(f"{BASE_URL}/workgroups/plans") def test_get_plans_calls_raise_for_status( @@ -132,7 +132,7 @@ def test_get_plan_calls_correct_url( # Act client.get_plan(plan_id) # Assert - mock_session.get.assert_called_once_with(f"{BASE_URL}/plans/{plan_id}") + mock_session.get.assert_called_once_with(f"{BASE_URL}/plans/get/{plan_id}") def test_get_plan_calls_raise_for_status( @@ -202,7 +202,7 @@ def 
test_get_plan_raw_calls_correct_url( # Act client.get_plan_raw(plan_id) # Assert - mock_session.get.assert_called_once_with(f"{BASE_URL}/plans/{plan_id}") + mock_session.get.assert_called_once_with(f"{BASE_URL}/plans/get/{plan_id}") def test_get_plan_raw_calls_raise_for_status( From 62acc3ce98cb717a4c65711d3028e526443bbdfc Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:45:09 +0000 Subject: [PATCH 094/106] =?UTF-8?q?Paginate=20get=5Fplans=20to=20return=20?= =?UTF-8?q?flat=20list[PlanSummary]=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 8 +++----- backend/magic_plan/tests/test_magic_plan_client.py | 13 ++++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 34c40695..ee52ffb0 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -1,6 +1,6 @@ import requests -from datatypes.magicplan.api.response import MagicPlanPlan, PlansListResponse +from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary, PlansListResponse _BASE_URL = "https://cloud.magicplan.app/api/v2" @@ -10,10 +10,8 @@ class MagicPlanClient: self._session = requests.Session() self._session.headers.update({"customer": customer_id, "key": api_key}) - def get_plans(self) -> PlansListResponse: - r = self._session.get(f"{_BASE_URL}/workgroups/plans") - r.raise_for_status() - return PlansListResponse.model_validate(r.json()["data"]) + def get_plans(self) -> list[PlanSummary]: + raise NotImplementedError def get_plan(self, plan_id: str) -> MagicPlanPlan: return MagicPlanPlan.model_validate(self._fetch_plan(plan_id).json()["data"]) diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index 27d4ebad..bf078517 100644 --- 
a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -7,7 +7,7 @@ import pytest import requests from backend.magic_plan.magic_plan_client import MagicPlanClient -from datatypes.magicplan.api.response import MagicPlanPlan, PlansListResponse +from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary FIXTURE_DIR = Path(__file__).parents[2] / "magic_plan" BASE_URL = "https://cloud.magicplan.app/api/v2" @@ -70,7 +70,9 @@ def test_get_plans_calls_correct_url( # Act client.get_plans() # Assert - mock_session.get.assert_called_once_with(f"{BASE_URL}/workgroups/plans") + mock_session.get.assert_called_once_with( + f"{BASE_URL}/workgroups/plans", params={"page": 1} + ) def test_get_plans_calls_raise_for_status( @@ -88,7 +90,7 @@ def test_get_plans_calls_raise_for_status( mock_session.get.return_value.raise_for_status.assert_called_once() -def test_get_plans_returns_plans_list_response( +def test_get_plans_returns_list_of_plan_summaries( client: MagicPlanClient, mock_session: MagicMock ) -> None: # Arrange @@ -100,8 +102,9 @@ def test_get_plans_returns_plans_list_response( # Act result = client.get_plans() # Assert - assert isinstance(result, PlansListResponse) - assert len(result.plans) == 1 + assert isinstance(result, list) + assert len(result) == 1 + assert isinstance(result[0], PlanSummary) def test_get_plans_propagates_http_error( From f83ddd05a8a6a8bace716e0e449c95bf040b1527 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:46:00 +0000 Subject: [PATCH 095/106] =?UTF-8?q?Paginate=20get=5Fplans=20to=20return=20?= =?UTF-8?q?flat=20list[PlanSummary]=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index ee52ffb0..de2fe4f6 100644 --- 
a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -11,7 +11,9 @@ class MagicPlanClient: self._session.headers.update({"customer": customer_id, "key": api_key}) def get_plans(self) -> list[PlanSummary]: - raise NotImplementedError + r = self._session.get(f"{_BASE_URL}/workgroups/plans", params={"page": 1}) + r.raise_for_status() + return PlansListResponse.model_validate(r.json()["data"]).plans def get_plan(self, plan_id: str) -> MagicPlanPlan: return MagicPlanPlan.model_validate(self._fetch_plan(plan_id).json()["data"]) From 6dfca082f8e9619c9a9c31dfd1c3524dc03be530 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:52:31 +0000 Subject: [PATCH 096/106] =?UTF-8?q?Fetch=20all=20pages=20in=20get=5Fplans?= =?UTF-8?q?=20pagination=20loop=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_magic_plan_client.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index bf078517..211a5d4d 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -119,6 +119,34 @@ def test_get_plans_propagates_http_error( client.get_plans() +def test_get_plans_multi_page_fetches_all_pages( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + page1_plan = _load_fixture("magicplan_api_plans_response_example.json")["data"][ + "plans" + ][0] + page2_plan = {**page1_plan, "id": "page-2-plan-id"} + page1_response = MagicMock() + page1_response.json.return_value = { + "data": {"paging": {"page": 1, "next_page": True, "count": 2}, "plans": [page1_plan]} + } + page2_response = MagicMock() + page2_response.json.return_value = { + "data": {"paging": {"page": 2, "next_page": False, "count": 2}, "plans": [page2_plan]} + } + mock_session.get.side_effect = 
[page1_response, page2_response] + # Act + result = client.get_plans() + # Assert + assert mock_session.get.call_count == 2 + mock_session.get.assert_any_call(f"{BASE_URL}/workgroups/plans", params={"page": 1}) + mock_session.get.assert_any_call(f"{BASE_URL}/workgroups/plans", params={"page": 2}) + assert len(result) == 2 + assert result[0].id == page1_plan["id"] + assert result[1].id == "page-2-plan-id" + + # --- get_plan --- From 0d324f99b29dfe1453d891f799512413e3776484 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 14:52:46 +0000 Subject: [PATCH 097/106] =?UTF-8?q?Fetch=20all=20pages=20in=20get=5Fplans?= =?UTF-8?q?=20pagination=20loop=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index de2fe4f6..bf50a6f8 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -11,9 +11,17 @@ class MagicPlanClient: self._session.headers.update({"customer": customer_id, "key": api_key}) def get_plans(self) -> list[PlanSummary]: - r = self._session.get(f"{_BASE_URL}/workgroups/plans", params={"page": 1}) - r.raise_for_status() - return PlansListResponse.model_validate(r.json()["data"]).plans + all_plans: list[PlanSummary] = [] + page = 1 + while True: + r = self._session.get(f"{_BASE_URL}/workgroups/plans", params={"page": page}) + r.raise_for_status() + response = PlansListResponse.model_validate(r.json()["data"]) + all_plans.extend(response.plans) + if not response.paging.next_page: + break + page += 1 + return all_plans def get_plan(self, plan_id: str) -> MagicPlanPlan: return MagicPlanPlan.model_validate(self._fetch_plan(plan_id).json()["data"]) From 5f77fbf4e45194a6fe18486e2cf199896333b0fa Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 
2026 14:54:14 +0000 Subject: [PATCH 098/106] =?UTF-8?q?Fetch=20all=20pages=20in=20get=5Fplans?= =?UTF-8?q?=20pagination=20loop=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 12 +++--------- .../magic_plan/tests/test_magic_plan_service.py | 14 +++++++------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index fb0a7610..22e19ddf 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -3,11 +3,7 @@ import json from datetime import datetime, timezone from typing import Optional -from datatypes.magicplan.api.response import ( - MagicPlanPlan, - PlanSummary, - PlansListResponse, -) +from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary from datatypes.magicplan.domain.mapper import map_plan from datatypes.magicplan.domain.models import Plan @@ -39,10 +35,8 @@ class MagicPlanService: if uprn is not None: logger.info("MagicPlanService.run uprn=%s", uprn) - plans_response: PlansListResponse = self._client.get_plans() - matched: Optional[PlanSummary] = find_matching_plan( - plans_response.plans, address - ) + plans: list[PlanSummary] = self._client.get_plans() + matched: Optional[PlanSummary] = find_matching_plan(plans, address) if matched is None: raise ValueError(f"No MagicPlan found for address: {address!r}") diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index f6954824..158cf4d6 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -91,7 +91,7 @@ def test_run_fetches_plan_with_matched_id( domain_plan: Plan, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plans.return_value = [plan_summary] 
mock_client.get_plan.return_value = api_magic_plan service = _make_service(mock_client) with patch( @@ -114,7 +114,7 @@ def test_run_returns_mapped_plan( domain_plan: Plan, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plans.return_value = [plan_summary] mock_client.get_plan.return_value = api_magic_plan service = _make_service(mock_client) with patch( @@ -137,7 +137,7 @@ def test_run_calls_save_plan_with_mapped_plan( plan_summary: PlanSummary, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plans.return_value = [plan_summary] mock_client.get_plan.return_value = api_magic_plan service = _make_service(mock_client) with patch( @@ -161,7 +161,7 @@ def test_run_accepts_uprn_without_error( plan_summary: PlanSummary, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plans.return_value = [plan_summary] mock_client.get_plan.return_value = api_magic_plan service = _make_service(mock_client) with patch( @@ -184,7 +184,7 @@ def test_run_uploads_to_s3_with_uprn_key( plan_summary: PlanSummary, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plans.return_value = [plan_summary] request = _make_request(uprn="100023336956") service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) with patch( @@ -211,7 +211,7 @@ def test_run_uploads_to_s3_with_deal_id_key_when_uprn_absent( plan_summary: PlanSummary, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plans.return_value = [plan_summary] mock_client.get_plan.return_value = api_magic_plan request = _make_request(hubspot_deal_id="deal-456", uprn=None) service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) @@ -242,7 +242,7 @@ def test_run_creates_uploaded_file_record( plan_summary: PlanSummary, ) -> None: # Arrange - mock_client.get_plans.return_value.plans = 
[plan_summary] + mock_client.get_plans.return_value = [plan_summary] mock_client.get_plan.return_value = api_magic_plan request = _make_request(hubspot_deal_id="deal-789", uprn="100023336956") service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) From dfc100f78b81d2ef213c3430ffc5467510781a9a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 16:02:01 +0000 Subject: [PATCH 099/106] rank address similiarity --- backend/address2UPRN/main.py | 6 +++--- backend/address2UPRN/scoring.py | 10 +++++----- datatypes/epc/domain/historic_epc_matching.py | 4 ++-- etl/hubspot/hubspotClient.py | 17 +++++++++++++---- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 9c19eca9..389816cc 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -15,7 +15,7 @@ from utils.s3 import ( from datetime import datetime from backend.utils.addressMatch import AddressMatch -from backend.address2UPRN.scoring import all_uprns_match, rank_by_address_similarity +from backend.address2UPRN.scoring import all_uprns_match, rank_address_similarity from datatypes.epc.domain.historic_epc_matching import ( match_addresses_for_postcode, ) @@ -79,7 +79,7 @@ def get_uprn_with_epc_df( if epc_df.empty: return None - scored_df = rank_by_address_similarity( + scored_df = rank_address_similarity( epc_df, user_address=user_inputed_address, ) @@ -171,7 +171,7 @@ def resolve_uprns_for_postcode_group( for _, row in group_df.iterrows(): user_address = str(row[address_col]).strip() - scored_df = rank_by_address_similarity( + scored_df = rank_address_similarity( epc_df, user_address=user_address, ) diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py index 2a681ad2..dcb86d49 100644 --- a/backend/address2UPRN/scoring.py +++ b/backend/address2UPRN/scoring.py @@ -19,8 +19,8 @@ def all_uprns_match( return len(uprns) == 1 and uprns[0] == str(target_uprn) -def 
rank_by_address_similarity( - df: pd.DataFrame, +def rank_address_similarity( + address_list_df: pd.DataFrame, user_address: str, address_column: str = "address", uprn_column: str = "uprn", @@ -32,13 +32,13 @@ def rank_by_address_similarity( DOES NOT choose or return a UPRN. """ - if address_column not in df.columns: + if address_column not in address_list_df.columns: raise ValueError(f"Missing column: {address_column}") - if uprn_column not in df.columns: + if uprn_column not in address_list_df.columns: raise ValueError(f"Missing column: {uprn_column}") - out = df.copy() + out = address_list_df.copy() user_norm = AddressMatch.normalise_address(user_address) diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 6ea2118b..86c44b59 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -4,7 +4,7 @@ from typing import Optional import pandas as pd from botocore.exceptions import ClientError -from backend.address2UPRN.scoring import rank_by_address_similarity +from backend.address2UPRN.scoring import rank_address_similarity from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc from utils.pandas_utils import pandas_cell_to_str @@ -85,7 +85,7 @@ def match_addresses_for_postcode( ) from e raise - scored = rank_by_address_similarity( + scored = rank_address_similarity( df, user_address=user_address, address_column=address_column, diff --git a/etl/hubspot/hubspotClient.py b/etl/hubspot/hubspotClient.py index 92a6c7e1..4c9cb1e6 100644 --- a/etl/hubspot/hubspotClient.py +++ b/etl/hubspot/hubspotClient.py @@ -1,6 +1,7 @@ import os import time from enum import Enum +from http import HTTPStatus from typing import Optional, cast, Callable, Any from hubspot.client import Client # type: ignore[reportMissingTypeStubs] @@ -86,19 +87,27 @@ class HubspotClient: def _call_with_retry(self, fn: Callable[[], Any], 
max_retries: int = 2) -> Any: """ - Call fn(), retrying up to max_retries times on 429 rate-limit errors. + Call fn(), retrying up to max_retries times on 429 rate-limit errors + or transient 5xx server errors. Waits the minimal amount: the remaining interval window reported by HubSpot headers. Falls back to the full interval (10s) if headers are absent. Note: each HubSpot sub-module (deals, companies, etc.) ships its own ApiException - class with no shared base beyond Exception, so we detect 429s via duck-typing. + class with no shared base beyond Exception, so we detect retryable statuses via duck-typing. """ + retryable_statuses = { + HTTPStatus.TOO_MANY_REQUESTS, + HTTPStatus.INTERNAL_SERVER_ERROR, + HTTPStatus.BAD_GATEWAY, + HTTPStatus.SERVICE_UNAVAILABLE, + HTTPStatus.GATEWAY_TIMEOUT, + } for attempt in range(max_retries + 1): try: return fn() except Exception as e: status = getattr(e, "status", None) - if status != 429 or attempt == max_retries: + if status not in retryable_statuses or attempt == max_retries: raise headers = getattr(e, "headers", None) or {} interval_ms = int( @@ -106,7 +115,7 @@ class HubspotClient: ) wait_s = interval_ms / 1000.0 self.logger.warning( - f"HubSpot 429 (attempt {attempt + 1}/{max_retries}), " + f"HubSpot {status} (attempt {attempt + 1}/{max_retries}), " f"waiting {wait_s:.1f}s before retry." 
) time.sleep(wait_s) From e458f0a2b718987bfc64635b690cb068293463dc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 16:24:11 +0000 Subject: [PATCH 100/106] task and sub tasks imrpvoed --- backend/app/db/models/tasks.py | 1 + backend/magic_plan/handler.py | 3 +- backend/pashub_fetcher/handler/handler.py | 3 +- backend/utils/subtasks.py | 131 +++++++++++----------- etl/hubspot/scripts/scraper/main.py | 3 +- 5 files changed, 72 insertions(+), 69 deletions(-) diff --git a/backend/app/db/models/tasks.py b/backend/app/db/models/tasks.py index e97a939f..db1b7c04 100644 --- a/backend/app/db/models/tasks.py +++ b/backend/app/db/models/tasks.py @@ -9,6 +9,7 @@ from sqlmodel import SQLModel, Field, Relationship class SourceEnum(enum.Enum): # TODO: move to domain? PORTFOLIO = "portfolio_id" + HUBSPOT_DEAL = "hubspot_deal_id" class Task(SQLModel, table=True): diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index 5fd90b7a..e7dc6484 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -5,13 +5,14 @@ from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_service import MagicPlanService from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest from datatypes.magicplan.domain.models import Plan +from backend.app.db.models.tasks import SourceEnum from backend.utils.subtasks import task_handler from utils.logger import setup_logger logger = setup_logger() -@task_handler() +@task_handler(task_source="magic_plan", source=SourceEnum.HUBSPOT_DEAL) def handler(body: dict[str, Any], context: Any) -> str: settings = get_settings() payload = MagicPlanTriggerRequest.model_validate(body) diff --git a/backend/pashub_fetcher/handler/handler.py b/backend/pashub_fetcher/handler/handler.py index 0d12b6bf..cd0c8113 100644 --- a/backend/pashub_fetcher/handler/handler.py +++ b/backend/pashub_fetcher/handler/handler.py @@ -5,6 +5,7 @@ from 
backend.pashub_fetcher.pashub_client import PashubClient, UnauthorizedError from backend.pashub_fetcher.pashub_service import PashubService from backend.pashub_fetcher.pashub_to_ara_trigger_request import PashubToAraTriggerRequest from backend.pashub_fetcher.token_getter import get_token_from_local_storage +from backend.app.db.models.tasks import SourceEnum from backend.utils.subtasks import task_handler from utils.logger import setup_logger from utils.sharepoint.domna_sharepoint_client import DomnaSharepointClient @@ -21,7 +22,7 @@ def get_pashub_client(email: str, password: str) -> PashubClient: return PashubClient(token=token) -@task_handler() +@task_handler(task_source="pashub_fetcher", source=SourceEnum.HUBSPOT_DEAL) def handler(body: Dict[str, Any], context: Any) -> List[str]: logger.info("Received message") diff --git a/backend/utils/subtasks.py b/backend/utils/subtasks.py index 6be3a742..36e67b78 100644 --- a/backend/utils/subtasks.py +++ b/backend/utils/subtasks.py @@ -1,75 +1,72 @@ -# decorators/subtask_handler.py - -from functools import wraps -from typing import Callable, Any -from uuid import UUID import json +import os +import time +from functools import wraps +from typing import Any, Callable, Optional, cast +from uuid import UUID from backend.app.db.functions.tasks.Tasks import SubTaskInterface, TasksInterface +from backend.app.db.models.tasks import SourceEnum +from backend.app.plan.utils import build_cloudwatch_log_url from utils.logger import setup_logger -def subtask_handler(): - """ - Decorator that wraps your existing handler and automatically: +def _try_build_cloud_logs_url(start_ms: int) -> Optional[str]: + # Returns None outside a Lambda runtime so local/non-Lambda runs don't crash. 
+ required = ("AWS_REGION", "AWS_LAMBDA_LOG_GROUP_NAME", "AWS_LAMBDA_LOG_STREAM_NAME") + if not all(k in os.environ for k in required): + return None + return build_cloudwatch_log_url(start_ms) - - Extracts task_id + sub_task_id from event - - Marks subtask as in progress - - Executes handler logic - - Marks subtask complete on success - - Marks failed on exception + +def subtask_handler() -> Callable[[Callable[..., Any]], Callable[..., Any]]: + """ + Decorator for Lambdas that operate on an already-existing SubTask. Extracts + task_id + sub_task_id from each record, records the CloudWatch logs URL, + marks the SubTask in progress, then complete on success / failed on raise. """ - def decorator(func: Callable[..., Any]): + def decorator(func: Callable[..., Any]) -> Callable[..., Any]: @wraps(func) - def wrapper(event: dict[str, Any], context: Any, *args, **kwargs): + def wrapper(event: dict[str, Any], context: Any, *args: Any, **kwargs: Any) -> None: + start_ms = int(time.time() * 1000) + cloud_logs_url = _try_build_cloud_logs_url(start_ms) records = event.get("Records", [event]) - interface = SubTaskInterface() for record in records: - - # ------------------------------- - # Parse body safely - # ------------------------------- - body = {} - - if isinstance(record.get("body"), str): + raw_body = record.get("body") + body: dict[str, Any] + if isinstance(raw_body, str): try: - body = json.loads(record["body"]) + body = json.loads(raw_body) except Exception: body = {} + elif isinstance(raw_body, dict): + body = cast(dict[str, Any], raw_body) else: - body = record.get("body", {}) or {} + body = {} task_id_raw = body.get("task_id") subtask_id_raw = body.get("sub_task_id") task_id = UUID(task_id_raw) if isinstance(task_id_raw, str) else None - subtask_id = ( - UUID(subtask_id_raw) if isinstance(subtask_id_raw, str) else None - ) + subtask_id = UUID(subtask_id_raw) if isinstance(subtask_id_raw, str) else None if not task_id or not subtask_id: raise RuntimeError("task_id 
or sub_task_id missing") - # ------------------------------- - # Mark in progress - # ------------------------------- interface.update_subtask_status( subtask_id=subtask_id, status="in progress", + cloud_logs_url=cloud_logs_url, ) try: - # Pass the parsed body into your function result = func(body, context, *args, **kwargs) - # ------------------------------- - # Success → mark complete - # ------------------------------- interface.update_subtask_status( subtask_id=subtask_id, status="complete", @@ -77,75 +74,79 @@ def subtask_handler(): ) except Exception as e: - - # ------------------------------- - # Failure → mark failed - # ------------------------------- interface.update_subtask_status( subtask_id=subtask_id, status="failed", outputs={"error": str(e)}, ) - raise - return None - return wrapper return decorator -def task_handler(): +def task_handler( + task_source: str, + source: SourceEnum, +) -> Callable[[Callable[..., Any]], Callable[..., Any]]: """ - Decorator that wraps a Lambda handler and automatically: - - - Parses body from the first SQS record (or uses the event dict directly) - - Creates a fresh Task + SubTask in the database - - Marks the subtask as in progress - - Executes the handler, passing the parsed body - - Marks complete on success, failed on exception (and re-raises) + Decorator for Lambdas that are themselves the entry point of a pipeline (no + router in front). For each record the decorator creates a fresh Task + + SubTask with the given task_source and source. source_id is read from + body[source.value] (silent None if absent) — see ADR-0001. Records the + CloudWatch logs URL, marks the SubTask in progress, then complete on + success / failed on raise. 
""" - def decorator(func: Callable[..., Any]): - - task_source = f"{func.__module__}.{func.__qualname__}" + def decorator(func: Callable[..., Any]) -> Callable[..., Any]: @wraps(func) - def wrapper(event: dict[str, Any], context: Any, *args, **kwargs): + def wrapper(event: dict[str, Any], context: Any, *args: Any, **kwargs: Any) -> Any: logger = setup_logger() + start_ms = int(time.time() * 1000) + cloud_logs_url = _try_build_cloud_logs_url(start_ms) - records = event.get("Records", [event]) # fallback for non-SQS - - results = [] - failures = [] + records = event.get("Records", [event]) + results: list[Any] = [] + failures: list[dict[str, Any]] = [] + interface = SubTaskInterface() for record in records: - # Parse body raw_body = record.get("body", record) - + body: dict[str, Any] if isinstance(raw_body, str): try: body = json.loads(raw_body) except Exception: body = {} + elif isinstance(raw_body, dict): + body = cast(dict[str, Any], raw_body) else: - body = raw_body or {} + body = {} + + raw_source_id = body.get(source.value) + source_id: Optional[str] = ( + str(raw_source_id) if raw_source_id is not None else None + ) - # Create task per message logger.info("Creating task for source: %s", task_source) task_id, subtask_id = TasksInterface.create_task( task_source=task_source, inputs=body, + source=source, + source_id=source_id, ) - logger.info("Created task_id=%s subtask_id=%s", task_id, subtask_id) + if subtask_id is None: + raise RuntimeError("create_task did not return a subtask_id") - interface = SubTaskInterface() + logger.info("Created task_id=%s subtask_id=%s", task_id, subtask_id) interface.update_subtask_status( subtask_id=subtask_id, status="in progress", + cloud_logs_url=cloud_logs_url, ) try: @@ -172,13 +173,11 @@ def task_handler(): if "Records" in event: failures.append({"itemIdentifier": record["messageId"]}) else: - # Handle non-SQS events raise if "Records" in event: return {"batchItemFailures": failures} - # Handle non-SQS events return results 
return wrapper diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 86844352..a7b640cf 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -9,6 +9,7 @@ from etl.hubspot.hubspot_deal_differ import HubspotDealDiffer from etl.hubspot.hubspot_trigger_orchestrator_trigger_request import ( HubspotTriggerOrchestratorTriggerRequest, ) +from backend.app.db.models.tasks import SourceEnum from backend.utils.subtasks import task_handler from backend.app.db.models.hubspot_deal_data import HubspotDealData from utils.logger import setup_logger @@ -16,7 +17,7 @@ from utils.logger import setup_logger logger = setup_logger() -@task_handler() +@task_handler(task_source="hubspot_scraper", source=SourceEnum.HUBSPOT_DEAL) def handler(body: dict[str, Any], context: Any) -> None: db_client = HubspotDataToDb() hubspot_client = HubspotClient() From 09dbfe2106a4787c2194a921b01bd489821abed2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 12 May 2026 17:03:16 +0000 Subject: [PATCH 101/106] fix dependency issue --- backend/app/plan/utils.py | 29 +--------------------- backend/categorisation/handler/handler.py | 2 +- backend/categorisation/processor.py | 3 ++- backend/engine/engine.py | 3 ++- backend/utils/cloudwatch.py | 30 +++++++++++++++++++++++ backend/utils/subtasks.py | 2 +- 6 files changed, 37 insertions(+), 32 deletions(-) create mode 100644 backend/utils/cloudwatch.py diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index e752f5e0..a27bdf90 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -1,5 +1,4 @@ import ast -import os from typing import Optional import msgpack from uuid import UUID @@ -8,6 +7,7 @@ from backend.addresses.Address import Address from backend.app.config import get_settings from backend.app.plan.data_classes import PropertyRequestData from backend.app.db.functions.tasks.Tasks import SubTaskInterface +from backend.utils.cloudwatch import 
build_cloudwatch_log_url from starlette.responses import Response from utils.logger import setup_logger @@ -241,33 +241,6 @@ def parse_eco_packages( return measures, mapped["target_sap"], mapped["plan_type"], already_installed -def build_cloudwatch_log_url(start_ms: Optional[int]) -> str: - """ - Build a CloudWatch Logs URL for the current Lambda invocation, - including timestamp window from start_ms to end_ms (epoch ms). - """ - logger.info("Building cloudwatch logs URL") - region = os.environ["AWS_REGION"] - logger.info("Building cloudwatch logs URL: Got AWS region") - log_group = os.environ["AWS_LAMBDA_LOG_GROUP_NAME"] - logger.info("Building cloudwatch logs URL: Got lambda log group name") - log_stream = os.environ["AWS_LAMBDA_LOG_STREAM_NAME"] - logger.info("Building cloudwatch logs URL: Got lambda log stream name") - - # CloudWatch console requires / encoded as $252F - encoded_group = log_group.replace("/", "$252F") - encoded_stream = log_stream.replace("/", "$252F") - - # Return the full URL with time range - return ( - f"https://console.aws.amazon.com/cloudwatch/home?" 
- f"region={region}" - f"#logsV2:log-groups/log-group/{encoded_group}" - f"/log-events/{encoded_stream}" - f"$3Fstart={start_ms}" - ) - - def handle_error( msg: str, exception: Exception, diff --git a/backend/categorisation/handler/handler.py b/backend/categorisation/handler/handler.py index a1f69ea6..04dc0c44 100644 --- a/backend/categorisation/handler/handler.py +++ b/backend/categorisation/handler/handler.py @@ -3,7 +3,7 @@ import time from typing import Any, Mapping from backend.app.db.functions.tasks.Tasks import SubTaskInterface -from backend.app.plan.utils import build_cloudwatch_log_url +from backend.utils.cloudwatch import build_cloudwatch_log_url from backend.categorisation.categorisation_trigger_request import ( CategorisationTriggerRequest, ) diff --git a/backend/categorisation/processor.py b/backend/categorisation/processor.py index 88bc121e..e589c016 100644 --- a/backend/categorisation/processor.py +++ b/backend/categorisation/processor.py @@ -15,7 +15,8 @@ from backend.app.db.functions.tasks.Tasks import SubTaskInterface from backend.app.db.models.recommendations import PlanModel, ScenarioModel from backend.app.domain.classes.plan import Plan from backend.app.domain.classes.scenario import Scenario -from backend.app.plan.utils import build_cloudwatch_log_url, handle_error +from backend.app.plan.utils import handle_error +from backend.utils.cloudwatch import build_cloudwatch_log_url from backend.categorisation.categorisation_trigger_request import ( CategorisationTriggerRequest, ) diff --git a/backend/engine/engine.py b/backend/engine/engine.py index 8b4ee821..c9e3972f 100644 --- a/backend/engine/engine.py +++ b/backend/engine/engine.py @@ -23,8 +23,9 @@ from backend.app.db.functions.tasks.Tasks import SubTaskInterface from backend.app.plan.schemas import PlanTriggerRequest from backend.app.plan.utils import ( - get_cleaned, patch_epc, extract_property_request_data, handle_error, build_cloudwatch_log_url + get_cleaned, patch_epc, 
extract_property_request_data, handle_error ) +from backend.utils.cloudwatch import build_cloudwatch_log_url from backend.app.utils import sap_to_epc import backend.app.assumptions as assumptions diff --git a/backend/utils/cloudwatch.py b/backend/utils/cloudwatch.py new file mode 100644 index 00000000..e5309da2 --- /dev/null +++ b/backend/utils/cloudwatch.py @@ -0,0 +1,30 @@ +import os +from typing import Optional + +from utils.logger import setup_logger + +logger = setup_logger() + + +def build_cloudwatch_log_url(start_ms: Optional[int]) -> str: + """ + Build a CloudWatch Logs URL for the current Lambda invocation, including a + timestamp window starting at start_ms. Requires AWS_REGION, + AWS_LAMBDA_LOG_GROUP_NAME, and AWS_LAMBDA_LOG_STREAM_NAME to be set in the + environment — i.e. only safe to call inside a Lambda runtime. + """ + logger.info("Building cloudwatch logs URL") + region = os.environ["AWS_REGION"] + log_group = os.environ["AWS_LAMBDA_LOG_GROUP_NAME"] + log_stream = os.environ["AWS_LAMBDA_LOG_STREAM_NAME"] + + encoded_group = log_group.replace("/", "$252F") + encoded_stream = log_stream.replace("/", "$252F") + + return ( + f"https://console.aws.amazon.com/cloudwatch/home?" 
+ f"region={region}" + f"#logsV2:log-groups/log-group/{encoded_group}" + f"/log-events/{encoded_stream}" + f"$3Fstart={start_ms}" + ) diff --git a/backend/utils/subtasks.py b/backend/utils/subtasks.py index 36e67b78..21ca24b1 100644 --- a/backend/utils/subtasks.py +++ b/backend/utils/subtasks.py @@ -7,7 +7,7 @@ from uuid import UUID from backend.app.db.functions.tasks.Tasks import SubTaskInterface, TasksInterface from backend.app.db.models.tasks import SourceEnum -from backend.app.plan.utils import build_cloudwatch_log_url +from backend.utils.cloudwatch import build_cloudwatch_log_url from utils.logger import setup_logger From 3fd7321337d26dfa80a5e3ad8bf039bf05faeeaa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 13 May 2026 08:18:43 +0000 Subject: [PATCH 102/106] remove comment --- backend/epc_client/epc_client_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py index abb5b826..b1ed2017 100644 --- a/backend/epc_client/epc_client_service.py +++ b/backend/epc_client/epc_client_service.py @@ -41,7 +41,7 @@ class EpcClientService: # ------------------------------------------------------------------ # Private helperEpcRateLimpolarss - # ----------------------EpcRateLimpolarsEpcRateLimpolarsEpcRateLimpolarsEpcRateLimpolarsEpcRateLimpolars-------------------------------------------- + # ------------------------------------------------------------------ def _fetch_certificate(self, cert_num: str) -> dict[str, Any]: resp = httpx.get( From 566c70077a9ffd7df5a6d08d918b4496713940eb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 13 May 2026 08:29:22 +0000 Subject: [PATCH 103/106] removing redundant code --- backend/epc_client/tests/test_client.py | 67 ------------------------- backend/utils/epc_address_match.py | 67 ------------------------- 2 files changed, 134 deletions(-) delete mode 100644 backend/utils/epc_address_match.py diff 
--git a/backend/epc_client/tests/test_client.py b/backend/epc_client/tests/test_client.py index 849b4a25..0e95a844 100644 --- a/backend/epc_client/tests/test_client.py +++ b/backend/epc_client/tests/test_client.py @@ -2,7 +2,6 @@ from unittest.mock import MagicMock, patch, call import pytest from backend.epc_client.epc_client_service import EpcClientService -from backend.utils.epc_address_match import find_best_epc_match from datatypes.epc.search import EpcSearchResult from backend.epc_client.exceptions import EpcNotFoundError, EpcRateLimitError from datatypes.epc.domain.epc_property_data import EpcPropertyData @@ -132,69 +131,3 @@ def test_search_by_postcode_404_returns_empty_list(epc_service): results = epc_service.search_by_postcode("ZZ9 9ZZ") assert results == [] - - -# --------------------------------------------------------------------------- -# Tests 8-10: find_best_epc_match — real scoring, only HTTP mocked -# --------------------------------------------------------------------------- - - -def test_find_best_match_clear_winner_on_first_pass(epc_service, rdsap_21_0_1_cert): - search_rows = [ - make_search_row(cert_num="CERT-WIN", address_line_1="1 High Street"), - make_search_row(cert_num="CERT-LOSE", address_line_1="99 Nowhere Lane"), - ] - cert_response = {"data": rdsap_21_0_1_cert} - - def fake_get(url, params=None, **kwargs): - if "search" in url: - return _mock_response(200, {"data": search_rows}) - return _mock_response(200, cert_response) - - with patch("httpx.get", side_effect=fake_get): - result = find_best_epc_match(epc_service, "SW1A 1AA", "1 High Street") - - assert isinstance(result, EpcPropertyData) - - -def test_find_best_match_resolves_on_second_pass_using_full_address( - epc_service, rdsap_21_0_1_cert -): - # Both candidates share address_line_1 — round 1 is ambiguous. - # Round 2 scores against full_address and picks the correct floor. 
- search_rows = [ - make_search_row( - cert_num="CERT-A", - address_line_1="1 High Street", - address_line_2="Ground Floor", - ), - make_search_row( - cert_num="CERT-B", - address_line_1="1 High Street", - address_line_2="First Floor", - ), - ] - cert_response = {"data": rdsap_21_0_1_cert} - - def fake_get(url, params=None, **kwargs): - if "search" in url: - return _mock_response(200, {"data": search_rows}) - return _mock_response(200, cert_response) - - with patch("httpx.get", side_effect=fake_get): - result = find_best_epc_match( - epc_service, "SW1A 1AA", "1 High Street Ground Floor" - ) - - assert isinstance(result, EpcPropertyData) - - -def test_find_best_match_returns_none_when_no_good_match(epc_service): - search_rows = [make_search_row(cert_num="CERT-X", address_line_1="99 Nowhere Lane")] - - with patch("httpx.get", return_value=_mock_response(200, {"data": search_rows})): - result = find_best_epc_match( - epc_service, "SW1A 1AA", "1 Completely Different Road" - ) - - assert result is None diff --git a/backend/utils/epc_address_match.py b/backend/utils/epc_address_match.py deleted file mode 100644 index 0df56eca..00000000 --- a/backend/utils/epc_address_match.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING, Optional - -from backend.utils.addressMatch import AddressMatch -from datatypes.epc.domain.epc_property_data import EpcPropertyData -from datatypes.epc.search import EpcSearchResult - -if TYPE_CHECKING: - from backend.epc_client.epc_client_service import EpcClientService - -_MIN_MATCH_SCORE = 0.6 - - -def find_best_epc_match( - service: EpcClientService, - postcode: str, - address: str, -) -> Optional[EpcPropertyData]: - candidates = service.search_by_postcode(postcode) - if not candidates: - return None - - cert_num = _pick_best_cert(candidates, address, use_full_address=False) - if cert_num: - return _safe_get(service, cert_num) - - cert_num = _pick_best_cert(candidates, address, 
use_full_address=True) - if cert_num: - return _safe_get(service, cert_num) - - return None - - -def _pick_best_cert( - candidates: list[EpcSearchResult], - user_address: str, - use_full_address: bool, -) -> Optional[str]: - scored: list[tuple[float, str]] = [ - ( - AddressMatch.score( - user_address, - r.full_address if use_full_address else r.address_line_1, - ), - r.certificate_number, - ) - for r in candidates - ] - if not scored: - return None - best_score = max(s for s, _ in scored) - if best_score < _MIN_MATCH_SCORE: - return None - top = [cert for s, cert in scored if s == best_score] - if len(top) != 1: - return None - return top[0] - - -def _safe_get(service: EpcClientService, cert_num: str) -> Optional[EpcPropertyData]: - from backend.epc_client.exceptions import EpcNotFoundError - - try: - return service.get_by_certificate_number(cert_num) - except EpcNotFoundError: - return None From c347865b9e056c6ea903a834ab2e695cf4c0ad72 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 13 May 2026 09:34:51 +0000 Subject: [PATCH 104/106] retry --- backend/epc_client/_retry.py | 7 ++++++- backend/epc_client/epc_client_service.py | 23 +++++++++++++++++++++-- backend/epc_client/exceptions.py | 7 +++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/backend/epc_client/_retry.py b/backend/epc_client/_retry.py index e290e95b..bbdd0cff 100644 --- a/backend/epc_client/_retry.py +++ b/backend/epc_client/_retry.py @@ -11,6 +11,7 @@ def call_with_retry( max_retries: int = 5, backoff_base: float = 1.0, backoff_multiplier: float = 2.0, + max_backoff: float = 60.0, ) -> T: last_exc: EpcRateLimitError | None = None for attempt in range(max_retries + 1): @@ -19,5 +20,9 @@ def call_with_retry( except EpcRateLimitError as exc: last_exc = exc if attempt < max_retries: - time.sleep(backoff_base * (backoff_multiplier ** attempt)) + if exc.retry_after is not None: + delay = exc.retry_after + else: + delay = backoff_base * (backoff_multiplier ** attempt) + 
time.sleep(min(delay, max_backoff)) raise last_exc # type: ignore[misc] diff --git a/backend/epc_client/epc_client_service.py b/backend/epc_client/epc_client_service.py index b1ed2017..86caeea3 100644 --- a/backend/epc_client/epc_client_service.py +++ b/backend/epc_client/epc_client_service.py @@ -18,6 +18,7 @@ from datatypes.epc.search import EpcSearchResult class EpcClientService: BASE_URL = "https://api.get-energy-performance-data.communities.gov.uk" + REQUEST_TIMEOUT = 10.0 def __init__(self, auth_token: str) -> None: self._headers = { @@ -25,6 +26,16 @@ class EpcClientService: "Accept": "application/json", } + @staticmethod + def _parse_retry_after(resp: httpx.Response) -> Optional[float]: + header = resp.headers.get("Retry-After") + if header is None: + return None + try: + return float(header) + except (TypeError, ValueError): + return None + def get_by_certificate_number(self, cert_num: str) -> EpcPropertyData: raw = call_with_retry(lambda: self._fetch_certificate(cert_num)) return EpcPropertyDataMapper.from_api_response(raw) @@ -48,11 +59,15 @@ class EpcClientService: f"{self.BASE_URL}/api/certificate", params={"certificate_number": cert_num}, headers=self._headers, + timeout=self.REQUEST_TIMEOUT, ) if resp.status_code == 404: raise EpcNotFoundError(cert_num) if resp.status_code == 429: - raise EpcRateLimitError("Rate limited by EPC API") + raise EpcRateLimitError( + "Rate limited by EPC API", + retry_after=self._parse_retry_after(resp), + ) if not resp.is_success: raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") return resp.json()["data"] @@ -72,11 +87,15 @@ class EpcClientService: f"{self.BASE_URL}/api/domestic/search", params=params, headers=self._headers, + timeout=self.REQUEST_TIMEOUT, ) if resp.status_code == 404: return [] if resp.status_code == 429: - raise EpcRateLimitError("Rate limited by EPC API") + raise EpcRateLimitError( + "Rate limited by EPC API", + retry_after=self._parse_retry_after(resp), + ) if not resp.is_success: 
raise EpcApiError(f"EPC API error {resp.status_code}: {resp.text}") diff --git a/backend/epc_client/exceptions.py b/backend/epc_client/exceptions.py index 49f1542a..fb7d96fa 100644 --- a/backend/epc_client/exceptions.py +++ b/backend/epc_client/exceptions.py @@ -1,3 +1,6 @@ +from typing import Optional + + class EpcApiError(Exception): """Base for all EPC client errors.""" @@ -8,3 +11,7 @@ class EpcNotFoundError(EpcApiError): class EpcRateLimitError(EpcApiError): """Raised when the API returns 429 and all retries are exhausted.""" + + def __init__(self, message: str, retry_after: Optional[float] = None) -> None: + super().__init__(message) + self.retry_after = retry_after From ff4ad07a2b719147a96530d6f1ad893230425831 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 13 May 2026 11:41:21 +0000 Subject: [PATCH 105/106] retry --- backend/epc_client/tests/test_client.py | 86 ++++++++++++++++++++++++- 1 file changed, 85 insertions(+), 1 deletion(-) diff --git a/backend/epc_client/tests/test_client.py b/backend/epc_client/tests/test_client.py index 0e95a844..70425a92 100644 --- a/backend/epc_client/tests/test_client.py +++ b/backend/epc_client/tests/test_client.py @@ -8,12 +8,13 @@ from datatypes.epc.domain.epc_property_data import EpcPropertyData from backend.epc_client.tests.conftest import make_search_row -def _mock_response(status_code=200, json_data=None): +def _mock_response(status_code=200, json_data=None, headers=None): resp = MagicMock() resp.status_code = status_code resp.is_success = 200 <= status_code < 300 resp.json.return_value = json_data or {} resp.text = str(json_data) + resp.headers = headers or {} return resp @@ -63,6 +64,89 @@ def test_get_by_certificate_number_retries_on_429_and_succeeds( assert isinstance(result, EpcPropertyData) +# --------------------------------------------------------------------------- +# Test 3b: 429 with Retry-After header → sleeps for that value +# 
--------------------------------------------------------------------------- + + +def test_429_retry_after_header_drives_sleep_duration( + epc_service, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429, headers={"Retry-After": "7"}), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch( + "backend.epc_client._retry.time.sleep" + ) as mock_sleep: + epc_service.get_by_certificate_number("CERT-001") + + mock_sleep.assert_called_once_with(7.0) + + +# --------------------------------------------------------------------------- +# Test 3c: 429 without Retry-After → falls back to exponential backoff +# --------------------------------------------------------------------------- + + +def test_429_without_retry_after_uses_exponential_backoff( + epc_service, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429), + _mock_response(429), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch( + "backend.epc_client._retry.time.sleep" + ) as mock_sleep: + epc_service.get_by_certificate_number("CERT-001") + + assert mock_sleep.call_args_list == [call(1.0), call(2.0)] + + +# --------------------------------------------------------------------------- +# Test 3d: malformed Retry-After header → falls back to exponential backoff +# --------------------------------------------------------------------------- + + +def test_429_malformed_retry_after_falls_back_to_backoff( + epc_service, rdsap_21_0_1_cert +): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429, headers={"Retry-After": "Wed, 21 Oct 2026 07:28:00 GMT"}), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch( + "backend.epc_client._retry.time.sleep" + ) as mock_sleep: + epc_service.get_by_certificate_number("CERT-001") + + mock_sleep.assert_called_once_with(1.0) + 
+ +# --------------------------------------------------------------------------- +# Test 3e: Retry-After capped by max_backoff to avoid hostile/buggy values +# --------------------------------------------------------------------------- + + +def test_429_retry_after_capped_by_max_backoff(epc_service, rdsap_21_0_1_cert): + cert_response = {"data": rdsap_21_0_1_cert} + responses = [ + _mock_response(429, headers={"Retry-After": "9999"}), + _mock_response(200, cert_response), + ] + with patch("httpx.get", side_effect=responses), patch( + "backend.epc_client._retry.time.sleep" + ) as mock_sleep: + epc_service.get_by_certificate_number("CERT-001") + + mock_sleep.assert_called_once_with(60.0) + + # --------------------------------------------------------------------------- # Test 4: get_by_uprn empty search → None # --------------------------------------------------------------------------- From 2fb6a99956822f48f2d5ec2654c692d55c26ee68 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 13 May 2026 14:02:36 +0000 Subject: [PATCH 106/106] throttle added --- backend/address2UPRN/tests/test_csv.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py index 70e7a9f9..73d94388 100644 --- a/backend/address2UPRN/tests/test_csv.py +++ b/backend/address2UPRN/tests/test_csv.py @@ -1,12 +1,24 @@ # tests/test_address_to_uprn_csv.py import csv +import time import pytest from pathlib import Path from backend.address2UPRN.main import get_uprn FIXTURE_PATH = Path(__file__).parent / "test_data.csv" +# Delay between live EPC API calls to stay under the (undocumented) rate limit. +# Each parametrized case fires at least one EPC request; without throttling, +# GitHub-hosted runners burst fast enough to hit 429s. 
+EPC_THROTTLE_SECONDS = 1.0 + + +@pytest.fixture(autouse=True) +def _throttle_epc_requests(): + yield + time.sleep(EPC_THROTTLE_SECONDS) + def load_test_cases(): with open(FIXTURE_PATH, newline="", encoding="utf-8") as f: