From 7a49f5df20e61836ee19fc19e77abd024fc35880 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:19:03 +0000 Subject: [PATCH] save plan temporary while i incorporate skills to claude --- datatypes/epc/domain/plan.md | 161 +++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 datatypes/epc/domain/plan.md diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md new file mode 100644 index 00000000..45cc495b --- /dev/null +++ b/datatypes/epc/domain/plan.md @@ -0,0 +1,161 @@ +# Historic EPC address-match service + +## Context + +ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc/<sanitised_postcode>/data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. + +Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. + +## Approach + +Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. + +### 1. Add gzip-CSV S3 reader + +In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): + +```python +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + buf = read_io_from_s3(bucket_name, file_key) + return pd.read_csv(buf, compression="gzip", low_memory=False) +``` + +Reuses existing `read_io_from_s3` (line 105). 
Caller catches `botocore.exceptions.ClientError` for missing-key handling. + +### 2. Append matcher to domain module + +In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: + +```python +from typing import Optional +import pandas as pd +from botocore.exceptions import ClientError + +from backend.utils.addressMatch import AddressMatch +from utils.s3 import read_csv_gz_from_s3 + + +@dataclass +class HistoricEpcMatches: + """Scored historic EPC rows for a single postcode.""" + user_address: str + postcode: str # sanitised + df: pd.DataFrame # has lexiscore + lexirank, sorted best-first + + def top(self) -> Optional[pd.Series]: + return None if self.df.empty else self.df.iloc[0] + + def top_n(self, k: int) -> pd.DataFrame: + return self.df.head(k) + + def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: + if self.df.empty: + return None + top_rank = self.df["lexirank"].min() + uprns = ( + self.df.loc[self.df["lexirank"] == top_rank, uprn_column] + .dropna().astype(str).str.replace(r"\.0$", "", regex=True) + .unique() + ) + return uprns[0] if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + if not postcode: + raise ValueError("postcode must be non-empty") + return postcode.upper().replace(" ", "") + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + bucket: str = "retrofit-data-dev", + prefix: str = "historical_epc", + address_column: str = "ADDRESS", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + key = f"{prefix}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + if address_column not in df.columns: + raise 
ValueError( + f"Missing address column {address_column!r} in {key}" + ) + + user_norm = AddressMatch.normalise_address(user_address) + df = df.copy() + df["lexiscore"] = df[address_column].fillna("").apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + df["lexirank"] = ( + df["lexiscore"].rank(method="dense", ascending=False).astype(int) + ) + df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) + + return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) +``` + +### Reuse notes +- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. +- The `lexiscore` computation applies `.fillna("")` to the address column before scoring, to defend against NaN in `ADDRESS`. +- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). + +### 3. Tests + +New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). + +Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). + +Cases: +1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty postcode raises `ValueError`. +2. Returned df has `lexiscore` + `lexirank` columns, row count preserved. +3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. +4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). +5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. +6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. +7. `top()` / `top_n(k)` shape checks. 
+ +## Critical files +- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher +- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` +- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new + +## Out of scope +- Lambda handler / SQS wiring (deferred — lib only) +- Threshold logic (caller decides via wrapper helpers) +- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) +- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern + +## Verification +```bash +cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v +``` + +Sample real-S3 call (needs AWS creds): +```python +from datatypes.epc.domain.historic_epc import match_addresses_for_postcode +m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") +print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) +print(m.unambiguous_uprn()) +``` + +## Sequencing +1. Add `read_csv_gz_from_s3` to `utils/s3.py`. +2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. +3. Add tests. + +Steps 2 & 3 depend on 1; step 3 also depends on 2, because the tests patch the name bound in `historic_epc.py`. No `__init__.py` re-exports needed.