From 24ec68bb9f1d5f6e5ca8eb748b87cf64145ac7df Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 6 May 2026 07:55:37 +0000 Subject: [PATCH 01/43] save progress for historical epc procress --- backend/etl/etl_opendatacommunities/README.md | 14 ++ backend/etl/etl_opendatacommunities/main.py | 144 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 backend/etl/etl_opendatacommunities/README.md create mode 100644 backend/etl/etl_opendatacommunities/main.py diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md new file mode 100644 index 00000000..bf16ba89 --- /dev/null +++ b/backend/etl/etl_opendatacommunities/README.md @@ -0,0 +1,14 @@ +This website https://epc.opendatacommunities.org/ has closed down on 30th May 2026 + +So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/epc_opendatacommunities/master_backup/ ) + +This scripts assumes the following: + +1) You downloaded the master copy, uncompressed it and set it to a path so we can read the csv + + +The script funciton is: + +1) reads csv for all data, seperate each iteration by postcode +2) compresses the csv and save it in the location +2) only gets the postcode data, compresses and uploads to s3 -> location s3://retrofit-data-dev/epc_opendatacommunities//compressed data \ No newline at end of file diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py new file mode 100644 index 00000000..30b4045a --- /dev/null +++ b/backend/etl/etl_opendatacommunities/main.py @@ -0,0 +1,144 @@ +from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait +from io import BytesIO +from pathlib import Path +from typing import Any + +import boto3 +import pandas as pd +from botocore.config import Config +from tqdm import tqdm + +from utils.logger import setup_logger + +logger = setup_logger() + +SRC_ROOT = Path("/workspaces/home/epc_data") +TMP_ROOT = Path("/tmp/epc_postcodes") +S3_BUCKET = "retrofit-data-dev" +S3_PREFIX = "epc_opendatacommunities" + +REC_COLS = { + "IMPROVEMENT_ITEM", + "IMPROVEMENT_SUMMARY_TEXT", + "IMPROVEMENT_DESCR_TEXT", + "IMPROVEMENT_ID", + "IMPROVEMENT_ID_TEXT", + "INDICATIVE_COST", +} + +# This scripts assume you downloading the zip, unzip it, and running it locally + + +def sanitise(pc: pd.Series) -> pd.Series: + return pc.astype("string").str.upper().str.replace(" ", "", regex=False) + + +def shard_la(la_dir: Path) -> None: + certs = pd.read_csv(la_dir / "certificates.csv", low_memory=False) + recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) + merged = certs.merge(recs, on="LMK_KEY", how="left") + + merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) + before = len(merged) + merged = merged.dropna(subset=["POSTCODE_CLEAN"]) + merged = merged[merged["POSTCODE_CLEAN"] != ""] + dropped = before - len(merged) + if dropped: + logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") + + for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + out = TMP_ROOT / f"{pc}.csv" + group.drop(columns=["POSTCODE_CLEAN"]).to_csv( + out, mode="a", header=not out.exists(), index=False + ) + + +def list_existing_keys(s3: Any) -> set[str]: + existing: set[str] = set() + paginator = s3.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=S3_BUCKET, Prefix=f"{S3_PREFIX}/") + for page in tqdm(pages, desc="list s3"): + for obj in page.get("Contents", []): + existing.add(obj["Key"]) + logger.info(f"Found {len(existing)} existing objects under {S3_PREFIX}/") + return existing + + +def upload_postcode(path: Path, s3: Any) -> None: + df = pd.read_csv(path, low_memory=False).drop_duplicates() + + cert_cols = [c for c in df.columns if c not in REC_COLS] + cert_only = df[cert_cols].drop_duplicates() + dupes = cert_only["LMK_KEY"].value_counts() + bad = dupes[dupes > 1] + if not bad.empty: + raise ValueError( + f"Postcode {path.stem}: LMK_KEY appears with conflicting cert data: " + f"{bad.index.tolist()[:5]}" + ) + + buf = BytesIO() + df.to_csv(buf, index=False, compression="gzip") + s3.put_object( + Bucket=S3_BUCKET, + Key=f"{S3_PREFIX}/{path.stem}/data.csv.gz", + Body=buf.getvalue(), + ContentType="text/csv", + ContentEncoding="gzip", + ) + + +def main(): + TMP_ROOT.mkdir(parents=True, exist_ok=True) + la_dirs = sorted( + p for p in SRC_ROOT.iterdir() if p.is_dir() and p.name.startswith("domestic-") + ) + logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") + + # for la in tqdm(la_dirs, desc="shard"): + # shard_la(la) + + s3 = boto3.client( + "s3", + config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + ) + pc_files = sorted(TMP_ROOT.glob("*.csv")) + logger.info(f"Found {len(pc_files)} local shards") + + existing = list_existing_keys(s3) + todo = [p for p in pc_files if f"{S3_PREFIX}/{p.stem}/data.csv.gz" not in existing] + skipped = len(pc_files) - len(todo) + logger.info( + f"Uploading {len(todo)} shards (skipping {skipped} already in S3) -> " + f"s3://{S3_BUCKET}/{S3_PREFIX}/" + ) + + workers = 256 + todo_iter = iter(todo) + inflight: dict[Any, Path] = {} + pbar = tqdm(total=len(todo), desc="upload") + with ThreadPoolExecutor(max_workers=workers) as pool: + for _ in range(workers * 2): + pc = next(todo_iter, None) + if pc is None: + break + inflight[pool.submit(upload_postcode, pc, s3)] = pc + + while inflight: + done, _ = wait(inflight.keys(), return_when=FIRST_COMPLETED) + for fut in done: + pc = inflight.pop(fut) + try: + fut.result() + except Exception as e: + logger.error(f"{pc.name}: {e}") + raise + pbar.update(1) + nxt = next(todo_iter, None) + if nxt is not None: + inflight[pool.submit(upload_postcode, nxt, s3)] = nxt + pbar.close() + + +if __name__ == "__main__": + main() From 4f45eeb3e9bf854b5f246916d04b99d1eb45020b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 15:55:40 +0000 Subject: [PATCH 02/43] save --- asset_list/AssetList.py | 323 ++++++++++++++++++------------------ asset_list/app.py | 30 +++- backend/app/config.py | 1 + backend/app/local/router.py | 17 +- backend/app/main.py | 16 +- recommendations/Costs.py | 229 +++++++++++++++---------- 6 files changed, 345 insertions(+), 271 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index dede3162..573c4f7c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -31,17 +31,19 @@ from recommendations.recommendation_utils import ( from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes -from dotenv import load_dotenv +# from dotenv import load_dotenv logger = setup_logger() -load_dotenv(dotenv_path="../backend/.env") +# load_dotenv(dotenv_path="../backend/.env") # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") class DataRemapper: - def __init__(self, standard_values, standard_map=None, max_tokens=1000): + def __init__( + self, standard_values, standard_map=None, max_tokens=1000, api_key=None + ): """ Initialize the remapper with standard values and a predefined mapping. @@ -75,7 +77,8 @@ class DataRemapper: "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, } - self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + print(f"DATA REMAPPER api key is {api_key}") + self.openai_client = OpenAI(api_key=api_key) @staticmethod def clean_string(text): @@ -136,12 +139,20 @@ class DataRemapper: raise ValueError("Input tokens exceed the maximum limit.") logger.info("Calling OpenAI API for standardization...") - response = self.openai_client.chat.completions.create( - model=self.ai_model, - messages=[{"role": "user", "content": prompt}], - max_tokens=self.max_tokens, - temperature=0.1, - ) + + try: + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + except Exception as e: + print(f"[debug] OpenAI call failed. type={type(e).__name__}") + print(f"[debug] status={getattr(e, 'status_code', None)}") + print(f"[debug] body={getattr(e, 'response', None) and e.response.text}") + print(f"[debug] model={self.ai_model}") + raise output_text = response.choices[0].message.content.strip() output_tokens = self.count_tokens(output_text) # Count output tokens @@ -504,6 +515,7 @@ class AssetList: landlord_block_reference=None, phase=False, header=0, + openai_api_key=None, ): self.local_filepath = local_filepath self.sheet_name = sheet_name @@ -529,6 +541,7 @@ class AssetList: self.ecosurv = None self.ecosurv_no_match = pd.DataFrame() self.geographical_areas = pd.DataFrame() + self.openai_api_key = openai_api_key # When this is True, we intend to break the programme into multiple phases. We may need to review # how this is structured in the future, as depending on how we get future data, we may need to @@ -1107,6 +1120,7 @@ class AssetList: remapper = DataRemapper( standard_values=config["standard_values"], standard_map=config["standard_map"], + api_key=self.openai_api_key, ) remap_dictionary = remapper.standardize_list( values_to_remap=values_to_remap.tolist() @@ -1296,8 +1310,8 @@ class AssetList: self.standardised_asset_list[ self.ATTRIBUTE_HAS_SOLAR ] = self.standardised_asset_list[ - self.FIND_EPC_DATA_NAMES["Solar photovoltaics"] - ] | ~self.standardised_asset_list[ + self.FIND_EPC_DATA_NAMES["Solar photovoltaics"] + ] | ~self.standardised_asset_list[ self.EPC_API_DATA_NAMES["photo-supply"] ].isin( ["0.0", 0, None, "", np.nan] @@ -1315,7 +1329,7 @@ class AssetList: property_type=( str(x[self.STANDARD_PROPERTY_TYPE]).title() if str(x[self.STANDARD_PROPERTY_TYPE]).title() - in accepted_epc_property_types + in accepted_epc_property_types else ( x[self.EPC_API_DATA_NAMES["property-type"]] if not pd.isnull( @@ -1373,9 +1387,9 @@ class AssetList: self.standardised_asset_list.apply( lambda x: estimate_perimeter( floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] - / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] - / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], ), axis=1, ) @@ -1460,7 +1474,7 @@ class AssetList: year_lower_bound = ( 2007 if x[self.EPC_API_DATA_NAMES["construction-age-band"]] - == "England and Wales: 2007 onwards" + == "England and Wales: 2007 onwards" else 2012 ) @@ -1515,7 +1529,7 @@ class AssetList: age_band_matches = ( "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] - == int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) + == int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) else "EPC Age Band is different from Year Built" ) @@ -1545,7 +1559,7 @@ class AssetList: age_band_matches = ( "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) - and (x[self.STANDARD_YEAR_BUILT] <= float(upper_date)) + and (x[self.STANDARD_YEAR_BUILT] <= float(upper_date)) else ( "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) @@ -1717,22 +1731,22 @@ class AssetList: if self.non_intrusives_present: if self.new_format_non_insturives_present_v2: non_intrusives_wall_filter = ( - self.standardised_asset_list["non-intrusives: Construction"] - == "CAVITY" - ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( + self.standardised_asset_list["non-intrusives: Construction"] + == "CAVITY" + ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EMPTY", "PARTIAL", "EMPTY CAVITY"] ) else: non_intrusives_wall_filter = ( - self.standardised_asset_list["non-intrusives: Construction"] - == "CAVITY" - ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( + self.standardised_asset_list["non-intrusives: Construction"] + == "CAVITY" + ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EMPTY", "PARTIAL"] ) elif self.old_format_non_intrusives_present: non_intrusives_wall_filter = self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().isin( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().isin( [ "empty cavity", "partial fill", @@ -1742,18 +1756,18 @@ class AssetList: "empty cav", ] ) | ( - ( - self.standardised_asset_list["non-intrusives: WFT Findings"] - .str.lower() - .str.strip() - .str.contains("empty cavity|partial fill") - & ~self.standardised_asset_list["non-intrusives: WFT Findings"] - .astype(str) - .str.lower() - .str.strip() - .str.contains("major access issues") - ) - ) + ( + self.standardised_asset_list["non-intrusives: WFT Findings"] + .str.lower() + .str.strip() + .str.contains("empty cavity|partial fill") + & ~self.standardised_asset_list["non-intrusives: WFT Findings"] + .astype(str) + .str.lower() + .str.strip() + .str.contains("major access issues") + ) + ) else: # We set the filter to False, as we have no non-intrusives non_intrusives_wall_filter = False @@ -1765,12 +1779,12 @@ class AssetList: ) else: year_built_filter = ( - self.standardised_asset_list[self.STANDARD_YEAR_BUILT] - <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) | ( - self.standardised_asset_list["epc_year_upper_bound"] - <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) + self.standardised_asset_list[self.STANDARD_YEAR_BUILT] + <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) | ( + self.standardised_asset_list["epc_year_upper_bound"] + <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) # Criteria: # The property isn't a bedsit @@ -1811,8 +1825,8 @@ class AssetList: ] = ( ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity_has_solar" - ] + "non_intrusive_indicates_empty_cavity_has_solar" + ] & ( ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin( ["bedsit"] @@ -1888,8 +1902,8 @@ class AssetList: .str.lower() .isin(self.EPC_NO_WALL_INSULATION_DESCRIPTIONS) | self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( - ["uninsulated cavity"] - ) + ["uninsulated cavity"] + ) ) ###################################################### @@ -1926,8 +1940,8 @@ class AssetList: extraction_wall_filter = ( extraction_wall_filter & ~self.standardised_asset_list[ - "non-intrusives: Eligibility (Red/Yellow/Green)" - ].isin(["RED"]) + "non-intrusives: Eligibility (Red/Yellow/Green)" + ].isin(["RED"]) ) self.standardised_asset_list[ @@ -2023,26 +2037,26 @@ class AssetList: self.standardised_asset_list[ "solar_epc_data_indicates_correct_heating_system" ] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .str.contains( - "air source heat pump|ground source heat pump|boiler and radiators, electric" - ) - ) | ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .str.contains("electric storage heaters") - & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheatcont-description"] - ] - == "Controls for high heat retention storage heaters" - ) + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains( + "air source heat pump|ground source heat pump|boiler and radiators, electric" ) + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains("electric storage heaters") + & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheatcont-description"] + ] + == "Controls for high heat retention storage heaters" + ) + ) # If the landlord has given us the heating system, we default to that on heating upgrades. Because of the # poor heating in place, if the EPC indicates that this property had a low efficiency heating system but the @@ -2050,25 +2064,25 @@ class AssetList: self.standardised_asset_list[ "solar_epc_data_indicates_requires_heating_upgrade" ] = ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains("electric storage heaters|room heaters") + & ( self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] + self.EPC_API_DATA_NAMES["mainheatcont-description"] ] - .str.lower() - .str.contains("electric storage heaters|room heaters") - & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheatcont-description"] - ] - != "Controls for high heat retention storage heaters" - ) - ) & ( - ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["district heating", "communal heating", "communal gas boiler"] - ) - & ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] - .astype(str) - .str.contains("gas ") + != "Controls for high heat retention storage heaters" ) + ) & ( + ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["district heating", "communal heating", "communal gas boiler"] + ) + & ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] + .astype(str) + .str.contains("gas ") + ) # Basic check - both of the previous two shouldn't be true simultaneously if ( @@ -2148,8 +2162,8 @@ class AssetList: self.standardised_asset_list[ "solar_non_intrusives_walls_insulated" ] = self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().isin( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().isin( [ "retro drilled", "retro filled", @@ -2158,8 +2172,8 @@ class AssetList: "retro drilled and filled", ] ) | self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().str.contains( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().str.contains( "retro drilled" ) else: @@ -2176,19 +2190,14 @@ class AssetList: ) self.standardised_asset_list["solar_epc_walls_insulated"] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES[ - "walls-description"]] - .str.lower() - .str.contains("|".join( - self.EPC_INSULATED_WALLS_SUBSTRINGS)) - ) | ( - self.standardised_asset_list[ - "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull( - x) else False - ) - ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]] + .str.lower() + .str.contains("|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)) + ) | ( + self.standardised_asset_list["walls_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) roof_data = [] for desc in self.standardised_asset_list[ @@ -2230,20 +2239,20 @@ class AssetList: self.standardised_asset_list[ "solar_epc_loft_needs_topup" ] = self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS - ].apply( + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS + ].apply( lambda x: int(x) < 200 if str(x).isdigit() else False ) | ( - ( - self.standardised_asset_list["is_loft"] - | self.standardised_asset_list["is_pitched"] - ) - & ( - self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS - ].isin(["below average", "none"]) - ) + ( + self.standardised_asset_list["is_loft"] + | self.standardised_asset_list["is_pitched"] ) + & ( + self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS + ].isin(["below average", "none"]) + ) + ) self.standardised_asset_list["epc_has_floor_recommendation"] = ( self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) @@ -2252,16 +2261,15 @@ class AssetList: # Check if the boiler is electric # We check if it contains both the terms boiler & electric self.standardised_asset_list["has_electric_boiler"] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .isin(["boiler and radiators, electric"]) - ) | ( - self.standardised_asset_list[ - self.STANDARD_HEATING_SYSTEM] - == "electric boiler" - ) + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .isin(["boiler and radiators, electric"]) + ) | ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] + == "electric boiler" + ) #################################### # Check solar eligibility @@ -2399,11 +2407,11 @@ class AssetList: empty_cavity_map = { "non_intrusive_indicates_empty_cavity": self.EMPTY_CAVITY_NON_INTRUSIVE - + ": ", + + ": ", "non_intrusive_indicates_empty_cavity_has_solar": f"{self.EMPTY_CAVITY_NON_INTRUSIVE} - property " - "already has solar: ", + "already has solar: ", "non_intrusive_indicates_empty_cavity_no_year_filter": f"{self.EMPTY_CAVITY_NON_INTRUSIVE}, " - f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", + f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", } for variable, description in empty_cavity_map.items(): self.standardised_asset_list["cavity_reason"] = np.where( @@ -2419,8 +2427,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: WFT Findings"] .str.lower() @@ -2445,8 +2453,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & self.standardised_asset_list[ "non_intrusive_indicates_cavity_extraction" ] @@ -2461,8 +2469,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: Insulated"] == "RETRO DRILLED" @@ -2478,8 +2486,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: Insulated"] == "FILLED AT BUILD" @@ -2495,8 +2503,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), f"{self.EPC_EMPTY}: " + self.standardised_asset_list["SAP Category"], @@ -2640,7 +2648,7 @@ class AssetList: identified_work = self.standardised_asset_list[ ~pd.isnull(self.standardised_asset_list["cavity_reason"]) | ~pd.isnull(self.standardised_asset_list["solar_reason"]) - ][self.DOMNA_PROPERTY_ID].values + ][self.DOMNA_PROPERTY_ID].values if self.DOMNA_PROPERTY_ID in self.outcomes.columns: self.outcomes_for_output = self.outcomes[ @@ -2675,12 +2683,12 @@ class AssetList: blocks_of_flats = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ] + ] non_blocks_of_flats = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] + ] # Produce some aggregate figures self.work_type_figures = { @@ -2723,7 +2731,7 @@ class AssetList: blocks = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ].copy() + ].copy() if blocks.empty: return @@ -2860,7 +2868,7 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] + ] self.standardised_asset_list = pd.concat( [self.standardised_asset_list, expanded_blocks], ignore_index=True @@ -2940,7 +2948,7 @@ class AssetList: # find any block refs with more than 50% emptires viable_empty_blocks = self.block_analysis_df[ self.block_analysis_df["Percentage of Empties"] >= 0.50 - ] + ] if not viable_empty_blocks.empty: project_code_lookup = viable_empty_blocks[["Block Reference"]].copy() @@ -3179,7 +3187,7 @@ class AssetList: contact_details = pd.read_excel(local_filepath, sheet_name=sheet_name)[ [self.contact_detail_fields["landlord_property_id"]] + details_colnames - ] + ] contact_details = contact_details[ ~pd.isnull( contact_details[self.contact_detail_fields["landlord_property_id"]] @@ -3572,13 +3580,10 @@ class AssetList: "Non-Intrusives: Date Checked ": date_of_inspections, "Non-Intrusives: Wall Type ": non_intrusives_construction, "Non-intrusives: Insulation ": non_intrusives_insulated, - "Non-intrusives: Insulation Material ": - non_intrusives_insulation_material, - "Non-Intrusives: CIGA Check Required ": - non_intrusives_ciga_check_required, + "Non-intrusives: Insulation Material ": non_intrusives_insulation_material, + "Non-Intrusives: CIGA Check Required ": non_intrusives_ciga_check_required, "Non-Intrusives: PV Access Issues ": non_intrusives_pv_access, - "Non-Intrusives: Roof Orientation ": - non_intrusives_roof_orientation, + "Non-Intrusives: Roof Orientation ": non_intrusives_roof_orientation, "Non-Intrusives: Surveyor Notes ": non_intrusives_surveyor_notes, "Non-Intrusives: Surveyor Name ": non_intrusives_surveyor_name, "CIGA: Date Requested ": None, # TODO: Don't have this for the moment @@ -3755,8 +3760,8 @@ class AssetList: # We compare address line 1 to full address if any( df[self.STANDARD_FULL_ADDRESS] - .str.lower() - .str.contains(row["Address Line 1"].lower(), na=False) + .str.lower() + .str.contains(row["Address Line 1"].lower(), na=False) ): df = df[ df[self.STANDARD_FULL_ADDRESS] @@ -3996,7 +4001,7 @@ class AssetList: matched = matched[ matched["houseno"].astype(str) == house_no_to_match - ] + ] if matched.shape[0] == 1: lookup_i.append( { @@ -4021,7 +4026,7 @@ class AssetList: )[0] matched = matched[ matched[self.STANDARD_FULL_ADDRESS] == best_match - ] + ] lookup_i.append( { "row_id": x["row_id"], @@ -4332,7 +4337,7 @@ class AssetList: df = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == row[master_id_colnames[idx]] - ] + ] if df.shape[0] == 1: matched.append( { @@ -4438,7 +4443,7 @@ class AssetList: )[1] ) > 90 - ] + ] if df.shape[0] == 0: unmatched.append(row["row_id"]) @@ -4446,8 +4451,8 @@ class AssetList: if any( df[self.STANDARD_FULL_ADDRESS] - .str.lower() - .str.contains( + .str.lower() + .str.contains( " ".join( [row[house_no_col], row["Street / Block Name"]] ).lower() @@ -4474,7 +4479,7 @@ class AssetList: row[property_type_col].split(" ")[-1].lower() ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats") - ] + ] if df.shape[0] != 1: # We have multiple matches - it's likely because the landlord has a duplicate diff --git a/asset_list/app.py b/asset_list/app.py index 49ec48a0..7413c7cb 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,6 +21,11 @@ EPC_AUTH_TOKEN = os.getenv( OPENAI_API_KEY = os.getenv( "OPENAI_API_KEY", ) +print( + f"[debug] OPENAI_API_KEY loaded: " + f"{OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:] if OPENAI_API_KEY else 'NONE'} " + f"(len={len(OPENAI_API_KEY) if OPENAI_API_KEY else 0})" +) def extract_address1( @@ -74,23 +79,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "2026-04-22T08_22_00.779745_61049fd3.xlsx" - sheet_name = "in" - postcode_column = "postcode_clean" - address1_column = "address2uprn_address" + data_filename = "input.xlsx" + sheet_name = "Handovers" + postcode_column = "POSTCODE" + address1_column = "Full Addres" address1_method = None - fulladdress_column = "address2uprn_address" + fulladdress_column = "Full Addres" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "address2uprn_uprn" - landlord_property_type = "Property Type" # Good to include if landlord gave - landlord_built_form = "Built Form" # Good to include if landlord gave + landlord_os_uprn = "domna_found_uprn" + landlord_property_type = "PROPERTY TYPE" # Good to include if landlord gave + landlord_built_form = "Type Description" # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "UPRN" + landlord_property_id = "PROP REF" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -131,6 +136,7 @@ def app(): landlord_sap=landlord_sap, landlord_block_reference=landlord_block_reference, phase=phase, + openai_api_key=OPENAI_API_KEY, ) asset_list.init_standardise() @@ -462,3 +468,9 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) + + + + +for key,value in dict.items(): + lsakjfldsa \ No newline at end of file diff --git a/backend/app/config.py b/backend/app/config.py index 70a6b50c..e72eb693 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -77,6 +77,7 @@ class Settings(BaseSettings): OSMOSIS_ACD_SHAREPOINT_ID: Optional[str] = None PRIVATE_PAY_SHAREPOINT_ID: Optional[str] = None SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID: Optional[str] = None + OPENAI_API_KEY: Optional[str] = None # Pas Hub PASHUB_EMAIL: Optional[str] = None diff --git a/backend/app/local/router.py b/backend/app/local/router.py index 0977be04..ea04dc49 100644 --- a/backend/app/local/router.py +++ b/backend/app/local/router.py @@ -2,8 +2,8 @@ from fastapi import APIRouter, HTTPException, status from jose import jwt, jwe import json import datetime -from app.config import get_settings -from app.dependencies import get_derived_encryption_key +from backend.app.config import get_settings +from backend.app.dependencies import get_derived_encryption_key router = APIRouter( prefix="/local", @@ -27,7 +27,12 @@ def create_dummy_token(secret: str) -> str: "dbId": "known_id", } - token = jwe.encrypt(json.dumps(claims), get_derived_encryption_key(secret), algorithm="dir", encryption="A256GCM") + token = jwe.encrypt( + json.dumps(claims), + get_derived_encryption_key(secret), + algorithm="dir", + encryption="A256GCM", + ) return token @@ -40,6 +45,8 @@ async def dummy_token(): async def dummy_token(): settings = get_settings() if settings.ENVIRONMENT != "local": - raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, - detail="Dummy token can only be generated in local environment") + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Dummy token can only be generated in local environment", + ) return {"dummy_token": create_dummy_token(settings.SECRET_KEY)} diff --git a/backend/app/main.py b/backend/app/main.py index c9733c18..55dfef7d 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -30,10 +30,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE logger.error(f"Validation Errors: {exc.errors()}") return JSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content=jsonable_encoder({ - "detail": exc.errors(), - "body": exc.body - }), + content=jsonable_encoder({"detail": exc.errors(), "body": exc.body}), ) @@ -63,7 +60,8 @@ app.include_router(tasks_router.router, prefix="/v1") app.include_router(bulk_uploads_router.router, prefix="/v1") if get_settings().ENVIRONMENT == "local": - from app.local import router as local_router + from backend.app.local import router as local_router + app.include_router(local_router.router) handler = Mangum(app) @@ -98,10 +96,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE logger.error(f"Validation Errors: {exc.errors()}") return JSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content=jsonable_encoder({ - "detail": exc.errors(), - "body": exc.body - }), + content=jsonable_encoder({"detail": exc.errors(), "body": exc.body}), ) @@ -130,7 +125,8 @@ app.include_router(whlg_router.router, prefix="/v1") app.include_router(bulk_uploads_router.router, prefix="/v1") if get_settings().ENVIRONMENT == "local": - from app.local import router as local_router + from backend.app.local import router as local_router + app.include_router(local_router.router) handler = Mangum(app) diff --git a/recommendations/Costs.py b/recommendations/Costs.py index bd8f160a..fc72d4d8 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -21,28 +21,28 @@ regional_labour_variations = [ {"Region": "Yorkshire and the Humber", "Adjustment_Factor": 0.86}, {"Region": "Wales", "Adjustment_Factor": 0.88}, {"Region": "Scotland", "Adjustment_Factor": 0.88}, - {"Region": "Northern Ireland", "Adjustment_Factor": 0.76} + {"Region": "Northern Ireland", "Adjustment_Factor": 0.76}, ] # Installers are now working with 435 watt panels PANEL_SIZE = 0.435 INSTALLER_SOLAR_COSTS = [ - {'n_panels': 4, 'array_kwp': 4 * PANEL_SIZE, 'cost': 4089.25, 'installer': 'CEG'}, - {'n_panels': 5, 'array_kwp': 5 * PANEL_SIZE, 'cost': 4242.48, 'installer': 'CEG'}, - {'n_panels': 6, 'array_kwp': 6 * PANEL_SIZE, 'cost': 4395.71, 'installer': 'CEG'}, - {'n_panels': 7, 'array_kwp': 7 * PANEL_SIZE, 'cost': 4548.94, 'installer': 'CEG'}, - {'n_panels': 8, 'array_kwp': 8 * PANEL_SIZE, 'cost': 4702.17, 'installer': 'CEG'}, - {'n_panels': 9, 'array_kwp': 9 * PANEL_SIZE, 'cost': 4855.41, 'installer': 'CEG'}, - {'n_panels': 10, 'array_kwp': 10 * PANEL_SIZE, 'cost': 5010.95, 'installer': 'CEG'}, - {'n_panels': 11, 'array_kwp': 11 * PANEL_SIZE, 'cost': 5166.49, 'installer': 'CEG'}, - {'n_panels': 12, 'array_kwp': 12 * PANEL_SIZE, 'cost': 5322.04, 'installer': 'CEG'}, - {'n_panels': 13, 'array_kwp': 13 * PANEL_SIZE, 'cost': 5657.6, 'installer': 'CEG'}, - {'n_panels': 14, 'array_kwp': 14 * PANEL_SIZE, 'cost': 5993.16, 'installer': 'CEG'}, - {'n_panels': 15, 'array_kwp': 15 * PANEL_SIZE, 'cost': 6328.71, 'installer': 'CEG'}, - {'n_panels': 16, 'array_kwp': 16 * PANEL_SIZE, 'cost': 6483.33, 'installer': 'CEG'}, - {'n_panels': 17, 'array_kwp': 17 * PANEL_SIZE, 'cost': 6637.95, 'installer': 'CEG'}, - {'n_panels': 18, 'array_kwp': 18 * PANEL_SIZE, 'cost': 6792.57, 'installer': 'CEG'} + {"n_panels": 4, "array_kwp": 4 * PANEL_SIZE, "cost": 4089.25, "installer": "CEG"}, + {"n_panels": 5, "array_kwp": 5 * PANEL_SIZE, "cost": 4242.48, "installer": "CEG"}, + {"n_panels": 6, "array_kwp": 6 * PANEL_SIZE, "cost": 4395.71, "installer": "CEG"}, + {"n_panels": 7, "array_kwp": 7 * PANEL_SIZE, "cost": 4548.94, "installer": "CEG"}, + {"n_panels": 8, "array_kwp": 8 * PANEL_SIZE, "cost": 4702.17, "installer": "CEG"}, + {"n_panels": 9, "array_kwp": 9 * PANEL_SIZE, "cost": 4855.41, "installer": "CEG"}, + {"n_panels": 10, "array_kwp": 10 * PANEL_SIZE, "cost": 5010.95, "installer": "CEG"}, + {"n_panels": 11, "array_kwp": 11 * PANEL_SIZE, "cost": 5166.49, "installer": "CEG"}, + {"n_panels": 12, "array_kwp": 12 * PANEL_SIZE, "cost": 5322.04, "installer": "CEG"}, + {"n_panels": 13, "array_kwp": 13 * PANEL_SIZE, "cost": 5657.6, "installer": "CEG"}, + {"n_panels": 14, "array_kwp": 14 * PANEL_SIZE, "cost": 5993.16, "installer": "CEG"}, + {"n_panels": 15, "array_kwp": 15 * PANEL_SIZE, "cost": 6328.71, "installer": "CEG"}, + {"n_panels": 16, "array_kwp": 16 * PANEL_SIZE, "cost": 6483.33, "installer": "CEG"}, + {"n_panels": 17, "array_kwp": 17 * PANEL_SIZE, "cost": 6637.95, "installer": "CEG"}, + {"n_panels": 18, "array_kwp": 18 * PANEL_SIZE, "cost": 6792.57, "installer": "CEG"}, ] # These are costs we received from CRG, for pricing up air source heat pumps @@ -80,7 +80,12 @@ INSTALLER_SOLAR_PV_INVERTER_COST = 7500 INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST = 500 # Just a rough guess to labour costs INSTALLER_SOLAR_BATTERY_COSTS = [ - {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 3769.89, 'installer': 'JJC'}, + { + "capacity_kwh": 5, + "description": "Battery Add on", + "cost": 3769.89, + "installer": "JJC", + }, # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, # {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} @@ -102,10 +107,14 @@ TTZC_SMART_THERMOSTAT_LABOUR_HOURS = 2 TTZC_ELECTRICIAN_HOURLY_RATE = 45 # Based on cost of a Nest temperature sensor TTZC_ROOM_TEMPERATURE_SENSOR_COST = 50 -TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS = 0.17 # (Assume ~ 10 mins install per sensor) +TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS = ( + 0.17 # (Assume ~ 10 mins install per sensor) +) # Basedon an average cost of smart radiator values TTZC_SMART_RADIATOR_VALUES = 50 -TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS = 0.37 # (Assume ~ 15-30 mins install per valve) +TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS = ( + 0.37 # (Assume ~ 15-30 mins install per valve) +) # boiler prices based on # This is the cost of a firs time central heating install from The Warm Front rate card @@ -169,7 +178,7 @@ class Costs: "heater_removal": 0.1, "sealing_open_fireplace": 0.1, "mechanical_ventilation": 0.26, - "sloping_ceiling_insulation": 0.26 # Similar to IWI so using the same contingency + "sloping_ceiling_insulation": 0.26, # Similar to IWI so using the same contingency } # Preliminaries are a percentage of the total cost of the work and covers the cost of site-specific costs @@ -195,36 +204,46 @@ class Costs: :param property_instance: Instance of a Property class containing relevant details like wall area. """ - if not hasattr(property_instance, 'insulation_wall_area'): - raise ValueError("Property instance must have an 'insulation_wall_area' attribute") + if not hasattr(property_instance, "insulation_wall_area"): + raise ValueError( + "Property instance must have an 'insulation_wall_area' attribute" + ) self.property = property_instance self.regional_labour_variations = regional_labour_variations self.region = county_to_region_map.get(self.property.epc_record.county, None) if self.region is None: # Try and grab using the local-authority-label - self.region = county_to_region_map.get(self.property.epc_record.local_authority_label, None) + self.region = county_to_region_map.get( + self.property.epc_record.local_authority_label, None + ) if self.region is None: # Try and get the region after converting the keys to lower - self.region = { - k.lower(): v for k, v in county_to_region_map.items() - }.get(self.property.epc_record.local_authority_label.lower(), None) + if self.property.epc_record.local_authority_label is not None: + self.region = { + k.lower(): v for k, v in county_to_region_map.items() + }.get(self.property.epc_record.local_authority_label.lower(), None) if self.region is None: - logger.warning("No region found for county %s, defaulting to South East England", - self.property.epc_record.county) + logger.warning( + "No region found for county %s, defaulting to South East England", + self.property.epc_record.county, + ) self.region = "South East England" self.labour_adjustment_factor = [ - x["Adjustment_Factor"] for x in self.regional_labour_variations if - x["Region"] == self.region + x["Adjustment_Factor"] + for x in self.regional_labour_variations + if x["Region"] == self.region ][0] if not self.labour_adjustment_factor: raise ValueError("Labour adjustment factor not found") - def cavity_wall_insulation(self, wall_area, material, is_extraction_and_refill=False): + def cavity_wall_insulation( + self, wall_area, material, is_extraction_and_refill=False + ): """ Calculates the total cost for cavity wall insulation based on material and labor costs, including contingency, preliminaries, profit, and VAT. @@ -318,7 +337,8 @@ class Costs: return { "total": total_cost, - "contingency": self.CONTINGENCIES["suspended_floor_insulation"] * total_cost, + "contingency": self.CONTINGENCIES["suspended_floor_insulation"] + * total_cost, "contingency_rate": self.CONTINGENCIES["suspended_floor_insulation"], "labour_hours": labour_hours, "labour_days": labour_days, @@ -370,8 +390,7 @@ class Costs: # - Apply sub-linear scaling for realism # - Enforce a minimum duration so estimates are not unrealistically low labour_days = max( - min_days, - base_days * (insulation_floor_area / base_area) ** labour_exponent + min_days, base_days * (insulation_floor_area / base_area) ** labour_exponent ) return labour_days @@ -388,7 +407,9 @@ class Costs: total_cost = material["total_cost"] * insulation_floor_area daily_labour_rate = 300 # Based on checkatrade - labour_days = self._estimate_number_of_days_for_solid_floor(insulation_floor_area) + labour_days = self._estimate_number_of_days_for_solid_floor( + insulation_floor_area + ) labour_cost = labour_days * daily_labour_rate total_cost = total_cost + labour_cost @@ -404,7 +425,6 @@ class Costs: } def low_energy_lighting(self, number_of_lights, material): - """ Calculates the total cost for low energy lighting based on material and labor costs, including contingency, preliminaries, profit, and VAT. @@ -419,7 +439,7 @@ class Costs: total_cost = material["total_cost"] * number_of_lights labour_hours = 1 - labour_days = (labour_hours / 8) + labour_days = labour_hours / 8 return { "total": total_cost, @@ -450,26 +470,22 @@ class Costs: } @classmethod - def solar_pv( - cls, - solar_product, - scaffolding_options, - n_floors - ): - - """ - - """ + def solar_pv(cls, solar_product, scaffolding_options, n_floors): + """ """ system_cost = solar_product["total_cost"] if not solar_product["includes_scaffolding"]: # We base this on the number of floors - scaffolding = [x["total_cost"] for x in scaffolding_options if x["size"] == n_floors] + scaffolding = [ + x["total_cost"] for x in scaffolding_options if x["size"] == n_floors + ] if not scaffolding: # If we have no options, handle this if n_floors <= 3: - raise ValueError("No scaffolding options available for 3 or fewer floors") + raise ValueError( + "No scaffolding options available for 3 or fewer floors" + ) # We take the largest scaffolding option available scaffolding_cost = max([x["total_cost"] for x in scaffolding_options]) else: @@ -523,9 +539,9 @@ class Costs: We base the estimates for the cost of electric room heaters on the cost per room as estimated by the following article: https://www.bestelectricradiators.co.uk/blog/cost-to-install-a-new-heating-system-uk/ - + :param number_heated_rooms: int, number of rooms to be heated - :return: + :return: """ total_cost = 500 * number_heated_rooms @@ -547,11 +563,11 @@ class Costs: } def high_heat_electric_storage_heaters( - self, number_heated_rooms: int, + self, + number_heated_rooms: int, needs_cylinder: bool, - product: dict | None = None + product: dict | None = None, ): - """ We base the estimates for the cost of electric storage heaters on the cost per room as estimated by the energy saving trust @@ -578,8 +594,11 @@ class Costs: return { "total": total_cost, - "contingency": total_cost * self.CONTINGENCIES["high_heat_retention_storage_heaters"], - "contingency_rate": self.CONTINGENCIES["high_heat_retention_storage_heaters"], + "contingency": total_cost + * self.CONTINGENCIES["high_heat_retention_storage_heaters"], + "contingency_rate": self.CONTINGENCIES[ + "high_heat_retention_storage_heaters" + ], "subtotal": subtotal_before_vat, "vat": vat, "labour_hours": labour_hours, @@ -690,14 +709,14 @@ class Costs: # The product costs are inclusive of VAT product_costs = ( - TTZC_SMART_THERMOSTAT_COST + - TTZC_ROOM_TEMPERATURE_SENSOR_COST * number_heated_rooms + - TTZC_SMART_RADIATOR_VALUES * number_heated_rooms + TTZC_SMART_THERMOSTAT_COST + + TTZC_ROOM_TEMPERATURE_SENSOR_COST * number_heated_rooms + + TTZC_SMART_RADIATOR_VALUES * number_heated_rooms ) labour_hours = ( - TTZC_SMART_THERMOSTAT_LABOUR_HOURS + - TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS * number_heated_rooms + - TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS * number_heated_rooms + TTZC_SMART_THERMOSTAT_LABOUR_HOURS + + TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS * number_heated_rooms + + TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS * number_heated_rooms ) labour_costs = TTZC_ELECTRICIAN_HOURLY_RATE * labour_hours # Add continency and preliminaries to the labour to account for the complexity of the job @@ -722,7 +741,9 @@ class Costs: "labour_days": labour_days, } - def programmer_trvs_bypass(self, number_heated_rooms, has_programmer, has_trvs, has_bypass): + def programmer_trvs_bypass( + self, number_heated_rooms, has_programmer, has_trvs, has_bypass + ): total_cost = 0 labour_hours = 0 @@ -779,7 +800,9 @@ class Costs: } @staticmethod - def _estimate_n_radiators(number_habitable_rooms, total_floor_area, property_type, built_form): + def _estimate_n_radiators( + number_habitable_rooms, total_floor_area, property_type, built_form + ): # Base number of radiators: one per habitable room base_radiators = number_habitable_rooms @@ -787,34 +810,49 @@ class Costs: additional_radiators = 3 # Initial assumption # Adjust additional radiators based on property type - if property_type == 'Flat': - additional_radiators -= 1 # Flats may need fewer radiators due to less exposure - elif property_type in ['House', 'Bungalow', 'Maisonette']: + if property_type == "Flat": + additional_radiators -= ( + 1 # Flats may need fewer radiators due to less exposure + ) + elif property_type in ["House", "Bungalow", "Maisonette"]: # Multiple floors in Maisonette may require additional heating points - additional_radiators += 2 # Houses and bungalows might need more due to greater exposure + additional_radiators += ( + 2 # Houses and bungalows might need more due to greater exposure + ) else: raise Exception("Invalid property type") # Adjust total radiator needs based on built form form_factor = { - 'Enclosed Mid-Terrace': 0.9, - 'Mid-Terrace': 0.95, - 'Enclosed End-Terrace': 0.95, - 'Semi-Detached': 1.05, - 'Detached': 1.25, - 'End-Terrace': 1.05 + "Enclosed Mid-Terrace": 0.9, + "Mid-Terrace": 0.95, + "Enclosed End-Terrace": 0.95, + "Semi-Detached": 1.05, + "Detached": 1.25, + "End-Terrace": 1.05, } # Calculate total heating power needed and number of radiators based on standard output total_heating_power_required = total_floor_area * 80 # Watts per square meter radiator_output = 1000 # Average wattage per radiator - total_radiators_based_on_power = (total_heating_power_required / radiator_output) * form_factor[built_form] + total_radiators_based_on_power = ( + total_heating_power_required / radiator_output + ) * form_factor[built_form] # Final estimation taking the higher of calculated needs or base room count - estimated_radiators = max(total_radiators_based_on_power, base_radiators + additional_radiators) + estimated_radiators = max( + total_radiators_based_on_power, base_radiators + additional_radiators + ) return round(estimated_radiators) - def boiler(self, exising_room_heaters, system_change, n_heated_rooms, n_rooms, is_electric=False): + def boiler( + self, + exising_room_heaters, + system_change, + n_heated_rooms, + n_rooms, + is_electric=False, + ): """ Based on a basic estimate of median value £2600 to install a low carbon combi boiler First time central heating vosts can als be found here: @@ -859,12 +897,14 @@ class Costs: number_habitable_rooms=n_rooms, total_floor_area=self.property.floor_area, property_type=self.property.epc_record.property_type, - built_form=self.property.epc_record.built_form + built_form=self.property.epc_record.built_form, ) additionals_labour_cost = labour_rate * self.labour_adjustment_factor radiator_cost = DOUBLE_RADIATOR_COST * n_radiators - system_change_cost = radiator_cost + FLUE_COST + PIPEWORK_COST + additionals_labour_cost + system_change_cost = ( + radiator_cost + FLUE_COST + PIPEWORK_COST + additionals_labour_cost + ) system_change_cost_before_vat = system_change_cost / (1 + self.VAT_RATE) system_change_vat = system_change_cost - system_change_cost_before_vat # We add an extra labour day for the system change @@ -897,14 +937,18 @@ class Costs: else: return 250 - def air_source_heat_pump(self, ashp_size: float, number_heated_rooms: int, total_floor_area: float) -> dict: + def air_source_heat_pump( + self, ashp_size: float, number_heated_rooms: int, total_floor_area: float + ) -> dict: """ We produce a cost estimation for an air source heat pump, based on costs we have received from installers. """ system_cost = ( - (ASHP_SMALL_SYSTEM_COST if ashp_size <= 8.5 else ASHP_LARGE_SYSTEM_COST) + ASHP_SECURITY + ASHP_WALL_BRACKET + (ASHP_SMALL_SYSTEM_COST if ashp_size <= 8.5 else ASHP_LARGE_SYSTEM_COST) + + ASHP_SECURITY + + ASHP_WALL_BRACKET ) available_n_rads = [x["n_radiators"] for x in ASHP_DISTRIBUTION_SYSTEM_COSTS] @@ -940,7 +984,9 @@ class Costs: } @staticmethod - def _estimate_number_of_days_for_sloping_ceiling(insulation_roof_area: float) -> float: + def _estimate_number_of_days_for_sloping_ceiling( + insulation_roof_area: float, + ) -> float: """ Estimate labour days required to insulate an existing sloping ceiling. @@ -965,14 +1011,15 @@ class Costs: min_days = 2 labour_days = max( - min_days, - base_days * (insulation_roof_area / base_area) ** labour_exponent + min_days, base_days * (insulation_roof_area / base_area) ** labour_exponent ) return labour_days @classmethod - def sloping_ceiling_insulation(cls, insulation_roof_area: float) -> Mapping[str, float]: + def sloping_ceiling_insulation( + cls, insulation_roof_area: float + ) -> Mapping[str, float]: """ This costing for this is based on Checkatrade desktop research, since we are yet to receive installer quotes. :param insulation_roof_area: Area of the sloping ceiling to be insulated @@ -985,14 +1032,20 @@ class Costs: # https://www.checkatrade.com/blog/cost-guides/vaulted-ceiling-cost/ # https://www.thegreenage.co.uk/can-i-insulate-my-sloping-ceiling/ # These assumptions last updated 21/02/2026 - insulation_cost_per_m2 = 52 # The actual install process is quite similar to IWI + insulation_cost_per_m2 = ( + 52 # The actual install process is quite similar to IWI + ) labour_rate = 250 # per day contingency_rate = cls.CONTINGENCIES["sloping_ceiling_insulation"] - labour_days = cls._estimate_number_of_days_for_sloping_ceiling(insulation_roof_area) + labour_days = cls._estimate_number_of_days_for_sloping_ceiling( + insulation_roof_area + ) labour_hours = labour_days * 8 - total = (insulation_cost_per_m2 * insulation_roof_area) + (labour_rate * labour_days) + total = (insulation_cost_per_m2 * insulation_roof_area) + ( + labour_rate * labour_days + ) # Assume VAT included in the total => total is 120% of subtotal vat = total - (total / 1.2) From 02fb3afbe41d479742ad4053f296c99e807f22ae Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:04:01 +0000 Subject: [PATCH 03/43] defined histrocial epc data shapre from csv --- datatypes/epc/schema/historic_epc.py | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 datatypes/epc/schema/historic_epc.py diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py new file mode 100644 index 00000000..e158ac1f --- /dev/null +++ b/datatypes/epc/schema/historic_epc.py @@ -0,0 +1,97 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + lmk_key: str + address1: str + address2: str + address3: str + postcode: str + building_reference_number: str + current_energy_rating: str + potential_energy_rating: str + current_energy_efficiency: str + potential_energy_efficiency: str + property_type: str + built_form: str + inspection_date: str + local_authority: str + constituency: str + county: str + lodgement_date: str + transaction_type: str + environment_impact_current: str + environment_impact_potential: str + energy_consumption_current: str + energy_consumption_potential: str + co2_emissions_current: str + co2_emiss_curr_per_floor_area: str + co2_emissions_potential: str + lighting_cost_current: str + lighting_cost_potential: str + heating_cost_current: str + heating_cost_potential: str + hot_water_cost_current: str + hot_water_cost_potential: str + total_floor_area: str + energy_tariff: str + mains_gas_flag: str + floor_level: str + flat_top_storey: str + flat_storey_count: str + main_heating_controls: str + multi_glaze_proportion: str + glazed_type: str + glazed_area: str + extension_count: str + number_habitable_rooms: str + number_heated_rooms: str + low_energy_lighting: str + number_open_fireplaces: str + hotwater_description: str + hot_water_energy_eff: str + hot_water_env_eff: str + floor_description: str + floor_energy_eff: str + floor_env_eff: str + windows_description: str + windows_energy_eff: str + windows_env_eff: str + walls_description: str + walls_energy_eff: str + walls_env_eff: str + secondheat_description: str + sheating_energy_eff: str + sheating_env_eff: str + roof_description: str + roof_energy_eff: str + roof_env_eff: str + mainheat_description: str + mainheat_energy_eff: str + mainheat_env_eff: str + mainheatcont_description: str + mainheatc_energy_eff: str + mainheatc_env_eff: str + lighting_description: str + lighting_energy_eff: str + lighting_env_eff: str + main_fuel: str + wind_turbine_count: str + heat_loss_corridor: str + unheated_corridor_length: str + floor_height: str + photo_supply: str + solar_water_heating_flag: str + mechanical_ventilation: str + address: str + local_authority_label: str + constituency_label: str + posttown: str + construction_age_band: str + lodgement_datetime: str + tenure: str + fixed_lighting_outlets_count: str + low_energy_fixed_light_count: str + uprn: str + uprn_source: str From 90a4d83243604a4830dc0f6a7752a67f928c571f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:04:27 +0000 Subject: [PATCH 04/43] added init files to make it a python module --- backend/etl/__init__.py | 0 backend/etl/etl_opendatacommunities/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 backend/etl/__init__.py create mode 100644 backend/etl/etl_opendatacommunities/__init__.py diff --git a/backend/etl/__init__.py b/backend/etl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/etl/etl_opendatacommunities/__init__.py b/backend/etl/etl_opendatacommunities/__init__.py new file mode 100644 index 00000000..e69de29b From 74b7b87de6384419687bdb1bc57468ea3ee4959e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:22:41 +0000 Subject: [PATCH 05/43] =?UTF-8?q?load=20historic=20epc=20from=20csv=20?= =?UTF-8?q?=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datatypes/epc/loaders/__init__.py | 0 datatypes/epc/loaders/historic_epc.py | 5 ++ datatypes/epc/schema/historic_epc.py | 7 +++ .../schema/tests/test_historic_epc_loading.py | 55 +++++++++++++++++++ 4 files changed, 67 insertions(+) create mode 100644 datatypes/epc/loaders/__init__.py create mode 100644 datatypes/epc/loaders/historic_epc.py create mode 100644 datatypes/epc/schema/tests/test_historic_epc_loading.py diff --git a/datatypes/epc/loaders/__init__.py b/datatypes/epc/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py new file mode 100644 index 00000000..8555a706 --- /dev/null +++ b/datatypes/epc/loaders/historic_epc.py @@ -0,0 +1,5 @@ +from datatypes.epc.schema.historic_epc import HistoricEpc + + +def read_historic_epc_csv(path: str) -> list[HistoricEpc]: + raise NotImplementedError("read_historic_epc_csv not implemented yet") diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index e158ac1f..9ebe4b09 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -95,3 +95,10 @@ class HistoricEpc: low_energy_fixed_light_count: str uprn: str uprn_source: str + report_type: str + improvement_item: str + improvement_summary_text: str + improvement_descr_text: str + improvement_id: str + improvement_id_text: str + indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py new file mode 100644 index 00000000..d5d5ea22 --- /dev/null +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -0,0 +1,55 @@ +import os + +import pytest + +from datatypes.epc.loaders.historic_epc import read_historic_epc_csv +from datatypes.epc.schema.historic_epc import HistoricEpc + +FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") + + +class TestHistoricEpcLoading: + + @pytest.fixture + def epc(self) -> HistoricEpc: + rows = read_historic_epc_csv(os.path.join(FIXTURES, "historic_epc.csv")) + return rows[0] + + def test_returns_historic_epc_instance(self, epc: HistoricEpc) -> None: + assert isinstance(epc, HistoricEpc) + + def test_lmk_key(self, epc: HistoricEpc) -> None: + assert epc.lmk_key == "9292c3bf26a8876ce59274401ea73e3de5bd0b3e52a507c2162a46e57db8ea2f" + + def test_address1(self, epc: HistoricEpc) -> None: + assert epc.address1 == "47 GORDON ROAD" + + def test_postcode(self, epc: HistoricEpc) -> None: + assert epc.postcode == "AB33 8AL" + + def test_current_energy_rating(self, epc: HistoricEpc) -> None: + assert epc.current_energy_rating == "E" + + def test_property_type(self, epc: HistoricEpc) -> None: + assert epc.property_type == "House" + + def test_built_form(self, epc: HistoricEpc) -> None: + assert epc.built_form == "Semi-Detached" + + def test_inspection_date(self, epc: HistoricEpc) -> None: + assert epc.inspection_date == "2021-04-11" + + def test_uprn(self, epc: HistoricEpc) -> None: + assert epc.uprn == "151020766.0" + + def test_uprn_source(self, epc: HistoricEpc) -> None: + assert epc.uprn_source == "Energy Assessor" + + def test_report_type(self, epc: HistoricEpc) -> None: + assert epc.report_type == "100" + + def test_improvement_summary_text(self, epc: HistoricEpc) -> None: + assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" + + def test_indicative_cost(self, epc: HistoricEpc) -> None: + assert epc.indicative_cost == "£100 - £350" From 32bf1cc98de5bb73c753e772e81d0571595dc8fa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:26:29 +0000 Subject: [PATCH 06/43] =?UTF-8?q?load=20historic=20epc=20from=20csv=20?= =?UTF-8?q?=F0=9F=9F=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datatypes/epc/loaders/historic_epc.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py index 8555a706..7b563315 100644 --- a/datatypes/epc/loaders/historic_epc.py +++ b/datatypes/epc/loaders/historic_epc.py @@ -1,5 +1,18 @@ +import csv + from datatypes.epc.schema.historic_epc import HistoricEpc +def _normalise(value: str | None) -> str: + if value is None: + return "" + return value.replace("\xa0", " ") + + def read_historic_epc_csv(path: str) -> list[HistoricEpc]: - raise NotImplementedError("read_historic_epc_csv not implemented yet") + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [ + HistoricEpc(**{k.lower(): _normalise(v) for k, v in row.items()}) + for row in reader + ] From a39c3a0772566e28fc00a6c6ee2507a1d8b12b27 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:03:35 +0000 Subject: [PATCH 07/43] added added historic epc data class with shape --- backend/etl/etl_opendatacommunities/main.py | 37 +++++++------------ datatypes/epc/domain/historic_epc.py | 10 +++++ datatypes/epc/schema/historic_epc.py | 6 --- .../schema/tests/test_historic_epc_loading.py | 6 --- sfr/principal_pitch/2_export_data.py | 8 ++-- 5 files changed, 27 insertions(+), 40 deletions(-) create mode 100644 datatypes/epc/domain/historic_epc.py diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py index 30b4045a..2bd41005 100644 --- a/backend/etl/etl_opendatacommunities/main.py +++ b/backend/etl/etl_opendatacommunities/main.py @@ -15,16 +15,7 @@ logger = setup_logger() SRC_ROOT = Path("/workspaces/home/epc_data") TMP_ROOT = Path("/tmp/epc_postcodes") S3_BUCKET = "retrofit-data-dev" -S3_PREFIX = "epc_opendatacommunities" - -REC_COLS = { - "IMPROVEMENT_ITEM", - "IMPROVEMENT_SUMMARY_TEXT", - "IMPROVEMENT_DESCR_TEXT", - "IMPROVEMENT_ID", - "IMPROVEMENT_ID_TEXT", - "INDICATIVE_COST", -} +S3_PREFIX = "historical_epc" # This scripts assume you downloading the zip, unzip it, and running it locally @@ -35,18 +26,16 @@ def sanitise(pc: pd.Series) -> pd.Series: def shard_la(la_dir: Path) -> None: certs = pd.read_csv(la_dir / "certificates.csv", low_memory=False) - recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) - merged = certs.merge(recs, on="LMK_KEY", how="left") - merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) - before = len(merged) - merged = merged.dropna(subset=["POSTCODE_CLEAN"]) - merged = merged[merged["POSTCODE_CLEAN"] != ""] - dropped = before - len(merged) + certs["POSTCODE_CLEAN"] = sanitise(certs["POSTCODE"]) + before = len(certs) + certs = certs.dropna(subset=["POSTCODE_CLEAN"]) + certs = certs[certs["POSTCODE_CLEAN"] != ""] + dropped = before - len(certs) if dropped: logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") - for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + for pc, group in certs.groupby("POSTCODE_CLEAN", sort=False): out = TMP_ROOT / f"{pc}.csv" group.drop(columns=["POSTCODE_CLEAN"]).to_csv( out, mode="a", header=not out.exists(), index=False @@ -67,9 +56,7 @@ def list_existing_keys(s3: Any) -> set[str]: def upload_postcode(path: Path, s3: Any) -> None: df = pd.read_csv(path, low_memory=False).drop_duplicates() - cert_cols = [c for c in df.columns if c not in REC_COLS] - cert_only = df[cert_cols].drop_duplicates() - dupes = cert_only["LMK_KEY"].value_counts() + dupes = df["LMK_KEY"].value_counts() bad = dupes[dupes > 1] if not bad.empty: raise ValueError( @@ -95,12 +82,14 @@ def main(): ) logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") - # for la in tqdm(la_dirs, desc="shard"): - # shard_la(la) + for la in tqdm(la_dirs, desc="shard"): + shard_la(la) s3 = boto3.client( "s3", - config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + config=Config( + max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"} + ), ) pc_files = sorted(TMP_ROOT.glob("*.csv")) logger.info(f"Found {len(pc_files)} local shards") diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py new file mode 100644 index 00000000..230c6327 --- /dev/null +++ b/datatypes/epc/domain/historic_epc.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + address1: str + address2: str + address3: str + postcode: str + uprn: str diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index 9ebe4b09..f64ab8c4 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -96,9 +96,3 @@ class HistoricEpc: uprn: str uprn_source: str report_type: str - improvement_item: str - improvement_summary_text: str - improvement_descr_text: str - improvement_id: str - improvement_id_text: str - indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index d5d5ea22..2170a8a6 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -47,9 +47,3 @@ class TestHistoricEpcLoading: def test_report_type(self, epc: HistoricEpc) -> None: assert epc.report_type == "100" - - def test_improvement_summary_text(self, epc: HistoricEpc) -> None: - assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" - - def test_indicative_cost(self, epc: HistoricEpc) -> None: - assert epc.indicative_cost == "£100 - £350" diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index b275086d..5e3ce5d5 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -26,13 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func -PORTFOLIO_ID = 711 -SCENARIOS = [1233] +PORTFOLIO_ID = 632 +SCENARIOS = [1144] scenario_names = { - 1233: "Reach EPC C", + 1144: "EPC C", } -project_name = "Novus" +project_name = "Calico Refresh" def get_data(portfolio_id, scenario_ids): From 7a49f5df20e61836ee19fc19e77abd024fc35880 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:19:03 +0000 Subject: [PATCH 08/43] save plan temporary while i incorporate skills to claude --- datatypes/epc/domain/plan.md | 161 +++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 datatypes/epc/domain/plan.md diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md new file mode 100644 index 00000000..45cc495b --- /dev/null +++ b/datatypes/epc/domain/plan.md @@ -0,0 +1,161 @@ +# Historic EPC address-match service + +## Context + +ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc//data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. + +Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. + +## Approach + +Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. + +### 1. Add gzip-CSV S3 reader + +In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): + +```python +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + buf = read_io_from_s3(bucket_name, file_key) + return pd.read_csv(buf, compression="gzip", low_memory=False) +``` + +Reuses existing `read_io_from_s3` (line 105). Caller catches `botocore.exceptions.ClientError` for missing-key handling. + +### 2. Append matcher to domain module + +In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: + +```python +from typing import Optional +import pandas as pd +from botocore.exceptions import ClientError + +from backend.utils.addressMatch import AddressMatch +from utils.s3 import read_csv_gz_from_s3 + + +@dataclass +class HistoricEpcMatches: + """Scored historic EPC rows for a single postcode.""" + user_address: str + postcode: str # sanitised + df: pd.DataFrame # has lexiscore + lexirank, sorted best-first + + def top(self) -> Optional[pd.Series]: + return None if self.df.empty else self.df.iloc[0] + + def top_n(self, k: int) -> pd.DataFrame: + return self.df.head(k) + + def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: + if self.df.empty: + return None + top_rank = self.df["lexirank"].min() + uprns = ( + self.df.loc[self.df["lexirank"] == top_rank, uprn_column] + .dropna().astype(str).str.replace(r"\.0$", "", regex=True) + .unique() + ) + return uprns[0] if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + if not postcode: + raise ValueError("postcode must be non-empty") + return postcode.upper().replace(" ", "") + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + bucket: str = "retrofit-data-dev", + prefix: str = "historical_epc", + address_column: str = "ADDRESS", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + key = f"{prefix}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + if address_column not in df.columns: + raise ValueError( + f"Missing address column {address_column!r} in {key}" + ) + + user_norm = AddressMatch.normalise_address(user_address) + df = df.copy() + df["lexiscore"] = df[address_column].fillna("").apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + df["lexirank"] = ( + df["lexiscore"].rank(method="dense", ascending=False).astype(int) + ) + df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) + + return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) +``` + +### Reuse notes +- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. +- Score column copy uses `.fillna("")` to defend against NaN in `ADDRESS`. +- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). + +### 3. Tests + +New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). + +Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). + +Cases: +1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty raises. +2. Returned df has `lexiscore` + `lexirank` columns, row count preserved. +3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. +4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). +5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. +6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. +7. `top()` / `top_n(k)` shape checks. + +## Critical files +- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher +- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` +- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new + +## Out of scope +- Lambda handler / SQS wiring (deferred — lib only) +- Threshold logic (caller decides via wrapper helpers) +- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) +- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern + +## Verification +``` +cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v +``` + +Sample real-S3 call (needs AWS creds): +```python +from datatypes.epc.domain.historic_epc import match_addresses_for_postcode +m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") +print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) +print(m.unambiguous_uprn()) +``` + +## Sequencing +1. Add `read_csv_gz_from_s3` to `utils/s3.py`. +2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. +3. Add tests. + +Steps 2 & 3 depend on 1. No `__init__.py` re-exports needed. From 676022a4c054d7301b1d7542b582fadbb63705b4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 12:53:37 +0000 Subject: [PATCH 09/43] =?UTF-8?q?Fix=20coordination/design=20field=20names?= =?UTF-8?q?=20and=20add=20MagicPlan=20trigger=20to=20HubspotDealDiffer=20?= =?UTF-8?q?=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 6 ++ etl/hubspot/tests/test_hubspot_deal_differ.py | 76 ++++++++++++++++++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 9e7069fc..a53df4f7 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -194,6 +194,12 @@ class HubspotDealDiffer: and new_status != old_deal.design_status ) + @staticmethod + def check_for_magicplan_trigger( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + raise NotImplementedError + @staticmethod def _lodgement_completed( new_deal: Dict[str, str], old_deal: HubspotDealData diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 0523c982..273a82a0 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -109,7 +109,7 @@ def test_pashub_trigger__coordination_completed_and_pashub_link_set__returns_tru new_deal = make_new_deal( deal_id, pashub_link="www.google.co.uk", - coordination_status=coordination_status, + **{"coordination_status__stage_1_": coordination_status}, ) assert ( @@ -156,7 +156,7 @@ def test_pashub_trigger__design_completed_and_pashub_link_set__returns_true() -> new_deal = make_new_deal( deal_id, pashub_link="www.google.co.uk", - design_status="uploaded", + retrofit_design_status="uploaded", ) assert ( @@ -177,7 +177,7 @@ def test_pashub_trigger__design_completed_and_pashub_link_not_set__returns_false new_deal = make_new_deal( deal_id, - design_status="uploaded", + retrofit_design_status="uploaded", ) assert ( @@ -270,6 +270,76 @@ def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_ ) +# ========================== +# MAGICPLAN TRIGGER TESTS +# ========================== + + +def test_magicplan_trigger__transitions_to_coordination_complete__returns_true() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal(id=deal_id, coordination_status="in progress") + new_deal = make_new_deal( + deal_id, + **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, + ) + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is True + + +def test_magicplan_trigger__already_in_coordination_complete_unrelated_change__returns_false() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal( + id=deal_id, + coordination_status="(v1) ioe/mtp complete", + outcome="pending", + ) + new_deal = make_new_deal( + deal_id, + **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, + outcome="won", + ) + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is False + + +def test_magicplan_trigger__transitions_to_non_complete_coordination_status__returns_false() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal(id=deal_id, coordination_status="in progress") + new_deal = make_new_deal( + deal_id, + **{"coordination_status__stage_1_": "design submitted"}, + ) + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is False + + # ======================= # DB UPDATE TRIGGER TESTS # ======================= From 69faa530a4c5a4dce34a00919baf965e2f59943a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 12:55:23 +0000 Subject: [PATCH 10/43] =?UTF-8?q?Fix=20coordination/design=20field=20names?= =?UTF-8?q?=20and=20add=20MagicPlan=20trigger=20to=20HubspotDealDiffer=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index a53df4f7..5435a46d 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -178,7 +178,7 @@ class HubspotDealDiffer: def _coordination_completed( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - new_status: str = new_deal.get("coordination_status") or "" + new_status: str = new_deal.get("coordination_status__stage_1_") or "" return ( new_status != "" and new_status.lower() in HubspotDealDiffer.COORDINATION_COMPLETE @@ -187,7 +187,7 @@ class HubspotDealDiffer: @staticmethod def _design_completed(new_deal: Dict[str, str], old_deal: HubspotDealData) -> bool: - new_status: str = new_deal.get("design_status") or "" + new_status: str = new_deal.get("retrofit_design_status") or "" return ( new_status != "" and new_status.lower() == HubspotDealDiffer.RETROFIT_DESIGN_COMPLETE @@ -198,7 +198,12 @@ class HubspotDealDiffer: def check_for_magicplan_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - raise NotImplementedError + new_status = (new_deal.get("coordination_status__stage_1_") or "").lower() + old_status = (old_deal.coordination_status or "").lower() + return ( + new_status in HubspotDealDiffer.COORDINATION_COMPLETE + and old_status not in HubspotDealDiffer.COORDINATION_COMPLETE + ) @staticmethod def _lodgement_completed( From 830d4ebb4bbe229c2e7eeef1357ed54c257e261c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:58:44 +0000 Subject: [PATCH 11/43] claude added from agentic-toolkit --- .devcontainer/backend/Dockerfile | 34 ++++++++++-------------- .devcontainer/backend/devcontainer.json | 14 ++++++---- .devcontainer/backend/docker-compose.yml | 5 ++++ 3 files changed, 28 insertions(+), 25 deletions(-) diff --git a/.devcontainer/backend/Dockerfile b/.devcontainer/backend/Dockerfile index a92d37f6..a8a25f27 100644 --- a/.devcontainer/backend/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -18,15 +18,6 @@ RUN curl -fsSL https://github.com/neovim/neovim/releases/latest/download/nvim-li | tar -xz -C /opt \ && ln -s /opt/nvim-linux-x86_64/bin/nvim /usr/local/bin/nvim -# # 2) Build and install libpostal from source -# RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \ -# && cd /tmp/libpostal \ -# && ./bootstrap.sh \ -# && ./configure --datadir=/usr/local/share/libpostal \ -# && make -j"$(nproc)" \ -# && make install \ -# && ldconfig \ -# && rm -rf /tmp/libpostal # 3) Create the user and grant sudo privileges RUN groupadd -g ${USER_GID} ${USER} \ @@ -34,10 +25,7 @@ RUN groupadd -g ${USER_GID} ${USER} \ && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ && chmod 0440 /etc/sudoers.d/${USER} -# # 4) Python deps - if you want to run assest list -# ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 -# ADD asset_list/requirements.txt requirements.txt -# RUN pip install -r requirements.txt + # ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 @@ -75,21 +63,27 @@ RUN wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key RUN apt update RUN apt install -y postgresql-14 -# Install Node.js + backlog.md +# Install Node.js RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \ && apt-get install -y nodejs \ - && npm install -g backlog.md \ && rm -rf /var/lib/apt/lists/* +# GitHub CLI — used by the postCreate skill installer to authenticate against +# private Hestia-Homes repos via the host's mounted ~/.config/gh. +RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \ + | dd of=/usr/share/keyrings/githubcli-archive-keyring.gpg \ + && chmod go+r /usr/share/keyrings/githubcli-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \ + > /etc/apt/sources.list.d/github-cli.list \ + && apt update && apt install -y gh \ + && rm -rf /var/lib/apt/lists/* + USER ${USER} # Bootstrap LazyVim starter config RUN git clone https://github.com/LazyVim/starter /home/${USER}/.config/nvim \ && rm -rf /home/${USER}/.config/nvim/.git -# Install Claude -RUN curl -fsSL https://claude.ai/install.sh | bash \ - && export PATH="/home/${USER}/.local/bin:${PATH}" \ - && claude plugin marketplace add JuliusBrussee/caveman \ - && claude plugin install caveman@caveman +# Install Claude Code CLI (skills are installed via postCreate from Hestia-Homes/agentic-toolkit) +RUN curl -fsSL https://claude.ai/install.sh | bash ENV PATH="/home/vscode/.local/bin:${PATH}" USER root diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index a9b7352a..1c5859e5 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -4,6 +4,14 @@ "service": "model-backend", "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", + + // Host preflight: ensure GitHub auth exists before we try to build. + // Either ~/.config/gh (from `gh auth login`) or a GITHUB_TOKEN env var. + "initializeCommand": "test -d \"$HOME/.config/gh\" || test -n \"$GITHUB_TOKEN\" || { echo >&2 'error: no GitHub auth found. Run `gh auth login && gh auth setup-git` on the host, or export GITHUB_TOKEN, then retry.'; exit 1; }", + + // Install Domna's curated skill set (pinned to 0.0.5) into this workspace. + // `gh repo clone` handles private-repo auth using the mounted host ~/.config/gh. + "postCreateCommand": "gh repo clone Hestia-Homes/agentic-toolkit /tmp/agentic-toolkit -- --branch 0.0.5 --depth 1 && bash /tmp/agentic-toolkit/setup.sh", "postStartCommand": "bash .devcontainer/backend/post-install.sh", "mounts": [ "source=${localEnv:HOME},target=/workspaces/home,type=bind", @@ -44,12 +52,8 @@ "containerEnv": { "PYTHONFLAGS": "-Xfrozen_modules=off" }, - "forwardPorts": [6421, 8000], + "forwardPorts": [8000], "portsAttributes": { - "6421": { - "label": "Backlog.md", - "onAutoForward": "notify" - }, "8000": { "label": "FastAPI", "onAutoForward": "notify" diff --git a/.devcontainer/backend/docker-compose.yml b/.devcontainer/backend/docker-compose.yml index 757cfbe0..cf3bb2c0 100644 --- a/.devcontainer/backend/docker-compose.yml +++ b/.devcontainer/backend/docker-compose.yml @@ -14,8 +14,13 @@ services: volumes: - ../../:/workspaces/model - ~/.gitconfig:/home/vscode/.gitconfig:ro + # GitHub CLI auth from host (created by `gh auth login`). Used by the + # postCreate skill installer to clone private Hestia-Homes repos. + - ~/.config/gh:/home/vscode/.config/gh:ro environment: - SSH_AUTH_SOCK=${SSH_AUTH_SOCK:-} + # Fallback HTTPS auth if ~/.config/gh isn't present on the host. + - GITHUB_TOKEN=${GITHUB_TOKEN:-} networks: - backend-net - shared-dev From 489b0ba30eb730139916901da7b585627428c3e9 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:05:38 +0000 Subject: [PATCH 12/43] =?UTF-8?q?Add=20MagicPlan=20SQS=20trigger=20to=20Hu?= =?UTF-8?q?bSpot=20orchestrator=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/config.py | 1 + .../hubspot_trigger_orchestrator/__init__.py | 0 .../tests/__init__.py | 0 .../tests/test_orchestrator.py | 148 ++++++++++++++++++ etl/hubspot/scripts/scraper/main.py | 21 +++ 5 files changed, 170 insertions(+) create mode 100644 backend/hubspot_trigger_orchestrator/__init__.py create mode 100644 backend/hubspot_trigger_orchestrator/tests/__init__.py create mode 100644 backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py diff --git a/backend/app/config.py b/backend/app/config.py index e939d6e4..21f12902 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -39,6 +39,7 @@ class Settings(BaseSettings): ENGINE_SQS_URL: str = "changeme" CATEGORISATION_SQS_URL: str = "changeme" PASHUB_TO_ARA_SQS_URL: str = "changeme" + MAGICPLAN_SQS_URL: str = "changeme" POSTCODE_SPLITTER_SQS_URL: str = "changeme" COMBINER_SQS_URL: str = "changeme" diff --git a/backend/hubspot_trigger_orchestrator/__init__.py b/backend/hubspot_trigger_orchestrator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/hubspot_trigger_orchestrator/tests/__init__.py b/backend/hubspot_trigger_orchestrator/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py b/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py new file mode 100644 index 00000000..6d18c4b4 --- /dev/null +++ b/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py @@ -0,0 +1,148 @@ +import json +import uuid +from typing import Any, Dict, Optional +from unittest.mock import MagicMock, patch + +import pytest + +from backend.app.db.models.hubspot_deal_data import HubspotDealData +from etl.hubspot.scripts.scraper.main import handler + +COORDINATION_COMPLETE = "(v1) ioe/mtp complete" +DEAL_NAME = "123 Main Street" +UPRN = "12345678" +DEAL_ID = "999" +MAGICPLAN_QUEUE_URL = "https://sqs.eu-west-2.amazonaws.com/123/magic-plan-dev" + + +def make_hubspot_deal( + coordination_status: Optional[str] = None, **kwargs: Any +) -> Dict[str, Any]: + deal: Dict[str, Any] = { + "hs_object_id": DEAL_ID, + "dealname": DEAL_NAME, + "pashub_link": None, + **kwargs, + } + if coordination_status is not None: + deal["coordination_status__stage_1_"] = coordination_status + return deal + + +def make_db_deal(coordination_status: Optional[str] = None, **kwargs: Any) -> HubspotDealData: + return HubspotDealData( + id=uuid.uuid4(), + deal_id=DEAL_ID, + coordination_status=coordination_status, + **kwargs, + ) + + +def run_handler( + hubspot_deal: Dict[str, Any], + db_deal: Optional[HubspotDealData], + listing: Optional[dict], +) -> MagicMock: + mock_sqs = MagicMock() + mock_sqs.send_message.return_value = {"MessageId": "test-id"} + + with ( + patch("etl.hubspot.scripts.scraper.main.HubspotDataToDb") as mock_db_cls, + patch("etl.hubspot.scripts.scraper.main.HubspotClient") as mock_hs_cls, + patch("etl.hubspot.scripts.scraper.main.boto3") as mock_boto3, + patch("etl.hubspot.scripts.scraper.main.get_settings") as mock_settings, + ): + mock_db_cls.return_value.find_deal_with_deal_id.return_value = db_deal + mock_db_cls.return_value.upsert_deal.return_value = None + mock_hs_cls.return_value.get_deal_and_company_and_listing.return_value = ( + hubspot_deal, + None, + listing, + ) + mock_boto3.client.return_value = mock_sqs + mock_settings.return_value.MAGICPLAN_SQS_URL = MAGICPLAN_QUEUE_URL + mock_settings.return_value.PASHUB_TO_ARA_SQS_URL = "https://sqs.test/pashub" + + handler.__wrapped__({"hubspot_deal_id": DEAL_ID}, "") + + return mock_sqs + + +# ======================= +# NEW DEAL PATH +# ======================= + + +def test_new_deal_in_coordination_complete__sends_sqs_message() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + listing = {"national_uprn": UPRN} + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=listing) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=MAGICPLAN_QUEUE_URL, + MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + ) + + +def test_new_deal_not_in_coordination_complete__no_sqs_message() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(coordination_status="in progress") + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() + + +def test_new_deal_with_no_listing__uprn_is_none_in_message() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=MAGICPLAN_QUEUE_URL, + MessageBody=json.dumps({"address": DEAL_NAME, "uprn": None}), + ) + + +# ======================= +# EXISTING DEAL PATH +# ======================= + + +def test_existing_deal_transitions_to_coordination_complete__sends_sqs_message() -> None: + # Arrange + db_deal = make_db_deal(coordination_status="in progress") + hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + listing = {"national_uprn": UPRN} + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=listing) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=MAGICPLAN_QUEUE_URL, + MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + ) + + +def test_existing_deal_already_in_coordination_complete_unrelated_change__no_sqs_message() -> None: + # Arrange + db_deal = make_db_deal(coordination_status=COORDINATION_COMPLETE, dealname="Old Name") + hubspot_deal = make_hubspot_deal( + coordination_status=COORDINATION_COMPLETE, dealname="New Name" + ) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 3ed208a2..cd76e26f 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -56,6 +56,13 @@ def handler(body: dict[str, Any], context: Any) -> None: f"Triggering Pas Hub file fetcher for HubSpot deal ID {hubspot_deal_id}" ) _trigger_pashub_fetcher(sqs_client, hubspot_deal_id, hubspot_deal) + + coordination_status = (hubspot_deal.get("coordination_status__stage_1_") or "").lower() + if coordination_status in HubspotDealDiffer.COORDINATION_COMPLETE: + logger.info( + f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" + ) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) else: # Deal already in db, check whether anything has changed logger.info( @@ -97,9 +104,23 @@ def handler(body: dict[str, Any], context: Any) -> None: f"Not Triggering PasHub file fetcher for HubSpot deal ID {hubspot_deal_id}" ) + if HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=hubspot_deal, old_deal=db_deal + ): + logger.info( + f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" + ) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) + print("done") +def _trigger_magicplan_fetcher( + sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]] +) -> None: + raise NotImplementedError + + def _trigger_pashub_fetcher( sqs_client: Any, deal_id: str, hubspot_deal: Dict[str, str] ) -> None: From a1a445f6f270f62b148e7d3f79eace348213e893 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:06:43 +0000 Subject: [PATCH 13/43] =?UTF-8?q?Add=20MagicPlan=20SQS=20trigger=20to=20Hu?= =?UTF-8?q?bSpot=20orchestrator=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/scripts/scraper/main.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index cd76e26f..a39e8b37 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -118,7 +118,17 @@ def handler(body: dict[str, Any], context: Any) -> None: def _trigger_magicplan_fetcher( sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]] ) -> None: - raise NotImplementedError + message_body = { + "address": hubspot_deal.get("dealname"), + "uprn": listing.get("national_uprn") if listing else None, + } + response = sqs_client.send_message( + QueueUrl=get_settings().MAGICPLAN_SQS_URL, + MessageBody=json.dumps(message_body), + ) + logger.info( + f"Sent message to MagicPlan queue. MessageId: {response['MessageId']}" + ) def _trigger_pashub_fetcher( From ed68a10127b132d883b927655c5f0ce2cdc41815 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:07:36 +0000 Subject: [PATCH 14/43] magic plan client terraform --- .../terraform/lambda/magic_plan/main.tf | 41 +++++++++++++++++++ .../terraform/lambda/magic_plan/outputs.tf | 9 ++++ .../terraform/lambda/magic_plan/provider.tf | 16 ++++++++ 3 files changed, 66 insertions(+) create mode 100644 infrastructure/terraform/lambda/magic_plan/main.tf create mode 100644 infrastructure/terraform/lambda/magic_plan/outputs.tf create mode 100644 infrastructure/terraform/lambda/magic_plan/provider.tf diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/infrastructure/terraform/lambda/magic_plan/main.tf new file mode 100644 index 00000000..56adac1b --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/main.tf @@ -0,0 +1,41 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} + +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} + +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "lambda" { + source = "../../modules/lambda_with_sqs" + + name = "magic_plan" + stage = var.stage + + image_uri = local.image_uri + + maximum_concurrency = var.maximum_concurrency + reserved_concurrent_executions = var.reserved_concurrent_executions + batch_size = var.batch_size + + environment = { + STAGE = var.stage + LOG_LEVEL = "info" + MAGICPLAN_CUSTOMER_ID = var.magicplan_customer_id + MAGICPLAN_API_KEY = var.magicplan_api_key + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + DB_HOST = var.db_host + DB_NAME = var.db_name + DB_PORT = var.db_port + } +} diff --git a/infrastructure/terraform/lambda/magic_plan/outputs.tf b/infrastructure/terraform/lambda/magic_plan/outputs.tf new file mode 100644 index 00000000..2082933f --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/outputs.tf @@ -0,0 +1,9 @@ +output "magic_plan_queue_url" { + value = module.lambda.queue_url + description = "URL of the MagicPlan SQS queue" +} + +output "magic_plan_queue_arn" { + value = module.lambda.queue_arn + description = "ARN of the MagicPlan SQS queue" +} diff --git a/infrastructure/terraform/lambda/magic_plan/provider.tf b/infrastructure/terraform/lambda/magic_plan/provider.tf new file mode 100644 index 00000000..9e7020ac --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.0" + } + } + + backend "s3" { + bucket = "magic-plan-hubspot-trigger-terraform-state" + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} From fd77fa51fdad5f1ea5dd3a6859f3051dd0665d84 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:07:53 +0000 Subject: [PATCH 15/43] magic plan client terraform --- .../terraform/lambda/magic_plan/variables.tf | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 infrastructure/terraform/lambda/magic_plan/variables.tf diff --git a/infrastructure/terraform/lambda/magic_plan/variables.tf b/infrastructure/terraform/lambda/magic_plan/variables.tf new file mode 100644 index 00000000..03f88e75 --- /dev/null +++ b/infrastructure/terraform/lambda/magic_plan/variables.tf @@ -0,0 +1,68 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} + +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = null + description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." +} + +variable "reserved_concurrent_executions" { + type = number + default = 1 +} + +variable "batch_size" { + type = number + default = 1 +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} + +variable "magicplan_customer_id" { + type = string + sensitive = true +} + +variable "magicplan_api_key" { + type = string + sensitive = true +} + +variable "db_host" { + type = string + sensitive = true +} + +variable "db_name" { + type = string + sensitive = true +} + +variable "db_port" { + type = string + sensitive = true +} From feaa1ea68093f74087952597be691ee5a387fc8a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:12:13 +0000 Subject: [PATCH 16/43] Add MagicPlan Lambda Dockerfile, CI/CD jobs, and SQS IAM wiring in hubspot_deal_etl --- .github/workflows/deploy_terraform.yml | 40 ++++++++++++++++++- backend/magic_plan/handler/Dockerfile | 26 ++++++++++++ backend/magic_plan/handler/requirements.txt | 7 ++++ .../terraform/lambda/hubspot_deal_etl/main.tf | 26 +++++++++++- 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 backend/magic_plan/handler/Dockerfile create mode 100644 backend/magic_plan/handler/requirements.txt diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 398232c6..e0343974 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -537,11 +537,49 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ + # Build MagicPlan Lambda image + # ============================================================ + magic_plan_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/magic_plan/handler/Dockerfile + build_context: . + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + + # ============================================================ + # Deploy MagicPlan Lambda + # ============================================================ + magic_plan_lambda: + needs: [magic_plan_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: magic_plan + lambda_path: infrastructure/terraform/lambda/magic_plan + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: magic-plan-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.magic_plan_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + TF_VAR_db_host: ${{ secrets.DEV_DB_HOST }} + TF_VAR_db_name: ${{ secrets.DEV_DB_NAME }} + TF_VAR_db_port: ${{ secrets.DEV_DB_PORT }} + TF_VAR_magicplan_customer_id: ${{ secrets.MAGICPLAN_CUSTOMER_ID }} + TF_VAR_magicplan_api_key: ${{ secrets.MAGICPLAN_API_KEY }} + # ============================================================ # Deploy Hubspot ETL Lambda # ============================================================ hubspot_etl_lambda: - needs: [hubspot_etl_image, determine_stage, pashub_to_ara_lambda] + needs: [hubspot_etl_image, determine_stage, pashub_to_ara_lambda, magic_plan_lambda] uses: ./.github/workflows/_deploy_lambda.yml with: lambda_name: hubspot-etl-to-ara diff --git a/backend/magic_plan/handler/Dockerfile b/backend/magic_plan/handler/Dockerfile new file mode 100644 index 00000000..7c83ebe6 --- /dev/null +++ b/backend/magic_plan/handler/Dockerfile @@ -0,0 +1,26 @@ +FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy + +# Install AWS Lambda RIE +ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie +RUN chmod +x /usr/local/bin/aws-lambda-rie + +# Set working directory (Lambda task root) +WORKDIR /var/task + +COPY backend/magic_plan/handler/requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +# Local lambda entrypoint +# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] + +# AWS lambda entrypoint +ENTRYPOINT ["python", "-m", "awslambdaric"] + +# ----------------------------- +# Lambda handler +# ----------------------------- +CMD ["backend.magic_plan.handler.handler"] diff --git a/backend/magic_plan/handler/requirements.txt b/backend/magic_plan/handler/requirements.txt new file mode 100644 index 00000000..cfacf455 --- /dev/null +++ b/backend/magic_plan/handler/requirements.txt @@ -0,0 +1,7 @@ +awslambdaric +requests +sqlalchemy==2.0.36 +sqlmodel +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 +boto3==1.35.44 diff --git a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf index 48dd6b78..800dc3b6 100644 --- a/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf +++ b/infrastructure/terraform/lambda/hubspot_deal_etl/main.tf @@ -12,7 +12,16 @@ data "terraform_remote_state" "pashub_to_ara" { config = { bucket = "pashub-to-ara-terraform-state" key = "env:/${var.stage}/terraform.tfstate" - region = "eu-west-2" + region = "eu-west-2" + } +} + +data "terraform_remote_state" "magic_plan" { + backend = "s3" + config = { + bucket = "magic-plan-hubspot-trigger-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" } } @@ -49,6 +58,7 @@ module "hubspot_deal_etl" { HUBSPOT_API_KEY = var.hubspot_api_key PASHUB_TO_ARA_SQS_URL = data.terraform_remote_state.pashub_to_ara.outputs.pashub_to_ara_queue_url + MAGICPLAN_SQS_URL = data.terraform_remote_state.magic_plan.outputs.magic_plan_queue_url } } @@ -76,4 +86,18 @@ module "hubspot_deal_etl_sqs_policy" { resource "aws_iam_role_policy_attachment" "hubspot_deal_etl_sqs_send" { role = module.hubspot_deal_etl.role_name policy_arn = module.hubspot_deal_etl_sqs_policy.policy_arn +} + +module "hubspot_deal_etl_magicplan_sqs_policy" { + source = "../../modules/general_iam_policy" + + policy_name = "hubspot-deal-etl-magicplan-sqs-send-${var.stage}" + policy_description = "Allow HubSpot ETL Lambda to send messages to MagicPlan queue" + actions = ["sqs:SendMessage"] + resources = [data.terraform_remote_state.magic_plan.outputs.magic_plan_queue_arn] +} + +resource "aws_iam_role_policy_attachment" "hubspot_deal_etl_magicplan_sqs_send" { + role = module.hubspot_deal_etl.role_name + policy_arn = module.hubspot_deal_etl_magicplan_sqs_policy.policy_arn } \ No newline at end of file From e30e06cb6e61f24668bba039e0fb17c36f4a9307 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:24:04 +0000 Subject: [PATCH 17/43] simplify dockerfile as playwright not used --- backend/magic_plan/handler/Dockerfile | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/backend/magic_plan/handler/Dockerfile b/backend/magic_plan/handler/Dockerfile index 7c83ebe6..ffd85c02 100644 --- a/backend/magic_plan/handler/Dockerfile +++ b/backend/magic_plan/handler/Dockerfile @@ -1,10 +1,5 @@ -FROM mcr.microsoft.com/playwright/python:v1.58.0-jammy +FROM public.ecr.aws/lambda/python:3.11 -# Install AWS Lambda RIE -ADD https://github.com/aws/aws-lambda-runtime-interface-emulator/releases/latest/download/aws-lambda-rie /usr/local/bin/aws-lambda-rie -RUN chmod +x /usr/local/bin/aws-lambda-rie - -# Set working directory (Lambda task root) WORKDIR /var/task COPY backend/magic_plan/handler/requirements.txt . @@ -14,13 +9,4 @@ COPY utils/ utils/ COPY backend/ backend/ COPY datatypes/ datatypes/ -# Local lambda entrypoint -# ENTRYPOINT ["/usr/local/bin/aws-lambda-rie", "python", "-m", "awslambdaric"] - -# AWS lambda entrypoint -ENTRYPOINT ["python", "-m", "awslambdaric"] - -# ----------------------------- -# Lambda handler -# ----------------------------- CMD ["backend.magic_plan.handler.handler"] From 74b3a7f297b90535f32208189c6d9a87c24f806a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:42:57 +0000 Subject: [PATCH 18/43] =?UTF-8?q?Add=20hubspot=5Fdeal=5Fid=20required=20fi?= =?UTF-8?q?eld=20to=20MagicPlanTriggerRequest=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../magic_plan/tests/test_magic_plan_trigger_request.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/backend/magic_plan/tests/test_magic_plan_trigger_request.py b/backend/magic_plan/tests/test_magic_plan_trigger_request.py index 46a20a37..131ea93b 100644 --- a/backend/magic_plan/tests/test_magic_plan_trigger_request.py +++ b/backend/magic_plan/tests/test_magic_plan_trigger_request.py @@ -38,3 +38,11 @@ def test_extra_fields_ignored() -> None: req = MagicPlanTriggerRequest.model_validate(payload) # Assert assert req.address == "123 High St London SW1A 1AA" + + +def test_missing_hubspot_deal_id_raises() -> None: + # Arrange + payload = {"address": "123 High St London SW1A 1AA"} + # Act / Assert + with pytest.raises(ValidationError): + MagicPlanTriggerRequest.model_validate(payload) From 4a9cabe1979a1f0e12ef97f13dc634d4928b4fef Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 13:45:10 +0000 Subject: [PATCH 19/43] =?UTF-8?q?Add=20hubspot=5Fdeal=5Fid=20required=20fi?= =?UTF-8?q?eld=20to=20MagicPlanTriggerRequest=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- backend/magic_plan/handler.py | 2 +- backend/magic_plan/magic_plan_trigger_request.py | 1 + .../magic_plan/tests/test_magic_plan_trigger_request.py | 9 +++++---- etl/hubspot/scripts/scraper/main.py | 7 ++++--- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index a592cc6a..22933e13 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -28,7 +28,7 @@ if __name__ == "__main__": event = { "Records": [ { - "body": '{"address": "2 Laburnum Way Bromley BR2 8BZ"}', + "body": '{"address": "2 Laburnum Way Bromley BR2 8BZ", "hubspot_deal_id": "local-test-deal"}', "messageId": "local-test", } ] diff --git a/backend/magic_plan/magic_plan_trigger_request.py b/backend/magic_plan/magic_plan_trigger_request.py index bb0151e4..e93c055c 100644 --- a/backend/magic_plan/magic_plan_trigger_request.py +++ b/backend/magic_plan/magic_plan_trigger_request.py @@ -7,4 +7,5 @@ class MagicPlanTriggerRequest(BaseModel): model_config = ConfigDict(extra="ignore") address: str + hubspot_deal_id: str uprn: Optional[str] = None diff --git a/backend/magic_plan/tests/test_magic_plan_trigger_request.py b/backend/magic_plan/tests/test_magic_plan_trigger_request.py index 131ea93b..9fb2754a 100644 --- a/backend/magic_plan/tests/test_magic_plan_trigger_request.py +++ b/backend/magic_plan/tests/test_magic_plan_trigger_request.py @@ -6,17 +6,18 @@ from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerReques def test_valid_payload_with_address_only() -> None: # Arrange - payload = {"address": "123 High St London SW1A 1AA"} + payload = {"address": "123 High St London SW1A 1AA", "hubspot_deal_id": "123456789"} # Act req = MagicPlanTriggerRequest.model_validate(payload) # Assert assert req.address == "123 High St London SW1A 1AA" + assert req.hubspot_deal_id == "123456789" assert req.uprn is None def test_valid_payload_with_uprn() -> None: # Arrange - payload = {"address": "123 High St London SW1A 1AA", "uprn": "100023336956"} + payload = {"address": "123 High St London SW1A 1AA", "hubspot_deal_id": "123456789", "uprn": "100023336956"} # Act req = MagicPlanTriggerRequest.model_validate(payload) # Assert @@ -25,7 +26,7 @@ def test_valid_payload_with_uprn() -> None: def test_missing_address_raises() -> None: # Arrange - payload = {"uprn": "100023336956"} + payload = {"hubspot_deal_id": "123456789", "uprn": "100023336956"} # Act / Assert with pytest.raises(ValidationError): MagicPlanTriggerRequest.model_validate(payload) @@ -33,7 +34,7 @@ def test_missing_address_raises() -> None: def test_extra_fields_ignored() -> None: # Arrange - payload = {"address": "123 High St London SW1A 1AA", "unknown_field": "whatever"} + payload = {"address": "123 High St London SW1A 1AA", "hubspot_deal_id": "123456789", "unknown_field": "whatever"} # Act req = MagicPlanTriggerRequest.model_validate(payload) # Assert diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index a39e8b37..32007cd4 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -62,7 +62,7 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" ) - _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing, hubspot_deal_id) else: # Deal already in db, check whether anything has changed logger.info( @@ -110,16 +110,17 @@ def handler(body: dict[str, Any], context: Any) -> None: logger.info( f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" ) - _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing) + _trigger_magicplan_fetcher(sqs_client, hubspot_deal, listing, hubspot_deal_id) print("done") def _trigger_magicplan_fetcher( - sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]] + sqs_client: Any, hubspot_deal: Dict[str, str], listing: Optional[dict[str, str]], hubspot_deal_id: str ) -> None: message_body = { "address": hubspot_deal.get("dealname"), + "hubspot_deal_id": hubspot_deal_id, "uprn": listing.get("national_uprn") if listing else None, } response = sqs_client.send_message( From c3aae8fd51373a9b38cbdb023275bbd12172519c Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:08:56 +0000 Subject: [PATCH 20/43] =?UTF-8?q?Expose=20get=5Fplan=5Fraw=20method=20on?= =?UTF-8?q?=20MagicPlanClient=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 3 ++ .../tests/test_magic_plan_client.py | 52 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 60f70fb1..172190fd 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -22,3 +22,6 @@ class MagicPlanClient: ) r.raise_for_status() return MagicPlanPlan.model_validate(r.json()["data"]) + + def get_plan_raw(self, plan_id: str) -> bytes: + raise NotImplementedError diff --git a/backend/magic_plan/tests/test_magic_plan_client.py b/backend/magic_plan/tests/test_magic_plan_client.py index 1be1448f..c96b9cdf 100644 --- a/backend/magic_plan/tests/test_magic_plan_client.py +++ b/backend/magic_plan/tests/test_magic_plan_client.py @@ -172,3 +172,55 @@ def test_get_plan_propagates_http_error( # Act / Assert with pytest.raises(requests.HTTPError): client.get_plan("some-id") + + +# --- get_plan_raw --- + + +def test_get_plan_raw_returns_bytes( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.content = b'{"data": "raw"}' + plan_id = "a7285ed1-878d-47eb-8aa6-85ef9e187516" + # Act + result = client.get_plan_raw(plan_id) + # Assert + assert isinstance(result, bytes) + + +def test_get_plan_raw_calls_correct_url( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.content = b"{}" + plan_id = "a7285ed1-878d-47eb-8aa6-85ef9e187516" + # Act + client.get_plan_raw(plan_id) + # Assert + mock_session.get.assert_called_once_with( + f"{BASE_URL}/plans/{plan_id}", params={"key": API_KEY} + ) + + +def test_get_plan_raw_calls_raise_for_status( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.content = b"{}" + # Act + client.get_plan_raw("a7285ed1-878d-47eb-8aa6-85ef9e187516") + # Assert + mock_session.get.return_value.raise_for_status.assert_called_once() + + +def test_get_plan_raw_propagates_http_error( + client: MagicPlanClient, mock_session: MagicMock +) -> None: + # Arrange + mock_session.get.return_value.raise_for_status.side_effect = requests.HTTPError( + "500" + ) + # Act / Assert + with pytest.raises(requests.HTTPError): + client.get_plan_raw("some-id") From f6c17be70a3d85e0638cea58f3edcdaf530c0aee Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:09:33 +0000 Subject: [PATCH 21/43] =?UTF-8?q?Expose=20get=5Fplan=5Fraw=20method=20on?= =?UTF-8?q?=20MagicPlanClient=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_client.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/backend/magic_plan/magic_plan_client.py b/backend/magic_plan/magic_plan_client.py index 172190fd..06905e6a 100644 --- a/backend/magic_plan/magic_plan_client.py +++ b/backend/magic_plan/magic_plan_client.py @@ -17,11 +17,14 @@ class MagicPlanClient: return PlansListResponse.model_validate(r.json()["data"]) def get_plan(self, plan_id: str) -> MagicPlanPlan: + return MagicPlanPlan.model_validate(self._fetch_plan(plan_id).json()["data"]) + + def get_plan_raw(self, plan_id: str) -> bytes: + return self._fetch_plan(plan_id).content + + def _fetch_plan(self, plan_id: str) -> requests.Response: r = self._session.get( f"{_BASE_URL}/plans/{plan_id}", params={"key": self._api_key} ) r.raise_for_status() - return MagicPlanPlan.model_validate(r.json()["data"]) - - def get_plan_raw(self, plan_id: str) -> bytes: - raise NotImplementedError + return r From 7c9cb5b161b4e910d95c96c6633362000c4b9e18 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:14:42 +0000 Subject: [PATCH 22/43] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20JS?= =?UTF-8?q?ON=20to=20S3=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/app/db/models/uploaded_file.py | 2 + backend/magic_plan/handler.py | 2 +- backend/magic_plan/magic_plan_service.py | 13 +++- .../tests/test_magic_plan_service.py | 66 +++++++++++++++++-- 4 files changed, 72 insertions(+), 11 deletions(-) diff --git a/backend/app/db/models/uploaded_file.py b/backend/app/db/models/uploaded_file.py index a516a1df..c629f574 100644 --- a/backend/app/db/models/uploaded_file.py +++ b/backend/app/db/models/uploaded_file.py @@ -17,6 +17,7 @@ class FileTypeEnum(enum.Enum): ECMK_SITE_NOTE = "ecmk_site_note" ECMK_RD_SAP_SITE_NOTE = "ecmk_rd_sap_site_note" ECMK_SURVEY_XML = "ecmk_survey_xml" + MAGIC_PLAN_JSON = "magic_plan_json" class FileSourceEnum(enum.Enum): @@ -24,6 +25,7 @@ class FileSourceEnum(enum.Enum): SHAREPOINT = "sharepoint" HUBSPOT = "hubspot" ECMK = "ecmk" + MAGIC_PLAN = "magic_plan" class UploadedFile(Base): diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index 22933e13..45de8554 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -19,7 +19,7 @@ def handler(body: dict[str, Any], context: Any) -> str: customer_id=settings.MAGICPLAN_CUSTOMER_ID, api_key=settings.MAGICPLAN_API_KEY, ) - plan: Plan = MagicPlanService(client).run(payload.address, payload.uprn) + plan: Plan = MagicPlanService(client, s3_bucket="retrofit-energy-assessments-dev").run(payload) logger.info("Saved MagicPlan plan uid=%s", plan.uid) return plan.uid diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 91b3cd13..6be6486c 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,3 +1,4 @@ +import gzip from typing import Optional from datatypes.magicplan.api.response import ( @@ -12,23 +13,29 @@ from backend.app.db.connection import db_session from backend.app.db.functions.magic_plan_functions import save_plan from backend.magic_plan.address_matcher import find_matching_plan from backend.magic_plan.magic_plan_client import MagicPlanClient +from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest from utils.logger import setup_logger +from utils.s3 import save_data_to_s3 logger = setup_logger() class MagicPlanService: - def __init__(self, client: MagicPlanClient) -> None: + def __init__(self, client: MagicPlanClient, s3_bucket: str) -> None: self._client = client + self._s3_bucket = s3_bucket + + def run(self, request: MagicPlanTriggerRequest) -> Plan: + address = request.address + uprn = request.uprn - def run(self, address: str, uprn: Optional[str] = None) -> Plan: if uprn is not None: logger.info("MagicPlanService.run uprn=%s", uprn) plans_response: PlansListResponse = self._client.get_plans() matched: Optional[PlanSummary] = find_matching_plan( plans_response.plans, address - ) # TODO: use address2UPRN instead? or create AddressMatch domain class + ) if matched is None: raise ValueError(f"No MagicPlan found for address: {address!r}") diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 8e433b87..87e20506 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import ANY, MagicMock, patch import pytest @@ -10,9 +10,11 @@ from datatypes.magicplan.domain.models import Plan from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_service import MagicPlanService +from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest FIXTURE_DIR = Path(__file__).parents[2] / "magic_plan" PLAN_ID = "a7285ed1-878d-47eb-8aa6-85ef9e187516" +S3_BUCKET = "test-bucket" @pytest.fixture(scope="module") @@ -45,7 +47,17 @@ def mock_client() -> MagicMock: def _make_service(mock_client: MagicMock) -> MagicPlanService: - return MagicPlanService(client=mock_client) + return MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + + +def _make_request( + address: str = "2 Laburnum Way Bromley BR2 8BZ", + hubspot_deal_id: str = "deal-123", + uprn: str | None = None, +) -> MagicPlanTriggerRequest: + return MagicPlanTriggerRequest( + address=address, hubspot_deal_id=hubspot_deal_id, uprn=uprn + ) # --- no match --- @@ -57,7 +69,7 @@ def test_run_raises_when_no_plan_found(mock_client: MagicMock) -> None: service = _make_service(mock_client) # Act / Assert with pytest.raises(ValueError, match="No MagicPlan found"): - service.run("99 Nowhere Road London SW1A 1AA") + service.run(_make_request(address="99 Nowhere Road London SW1A 1AA")) # --- match found --- @@ -78,8 +90,10 @@ def test_run_fetches_plan_with_matched_id( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - service.run("2 Laburnum Way Bromley BR2 8BZ") + service.run(_make_request()) # Assert mock_client.get_plan.assert_called_once_with(plan_summary.id) @@ -99,8 +113,10 @@ def test_run_returns_mapped_plan( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - result = service.run("2 Laburnum Way Bromley BR2 8BZ") + result = service.run(_make_request()) # Assert assert isinstance(result, Plan) assert result.uid == PLAN_ID @@ -120,8 +136,10 @@ def test_run_calls_save_plan_with_mapped_plan( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan") as mock_save, patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - service.run("2 Laburnum Way Bromley BR2 8BZ") + service.run(_make_request()) # Assert — save_plan called with a Plan whose uid matches call_args = mock_save.call_args saved_plan: Plan = call_args[0][1] @@ -142,5 +160,39 @@ def test_run_accepts_uprn_without_error( return_value=plan_summary, ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" ): - service.run("2 Laburnum Way Bromley BR2 8BZ", uprn="100023336956") + service.run(_make_request(uprn="100023336956")) + + +# --- S3 upload --- + + +def test_run_uploads_to_s3_with_uprn_key( + mock_client: MagicMock, + api_magic_plan: MagicPlanPlan, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plan.return_value = api_magic_plan + mock_client.get_plan_raw.return_value = b'{"raw": "data"}' + request = _make_request(uprn="100023336956") + service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( + "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ) as mock_s3: + # Act + service.run(request) + # Assert + mock_s3.assert_called_once_with( + ANY, + S3_BUCKET, + f"documents/uprn/100023336956/magic_plan_{plan_summary.id}.json.gz", + ) From 14a064fdefd1d6a9362cf3a4c078bbfc6da5b316 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:16:41 +0000 Subject: [PATCH 23/43] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20JS?= =?UTF-8?q?ON=20to=20S3=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 8 ++++++++ backend/magic_plan/tests/test_magic_plan_service.py | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 6be6486c..bb68fa42 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -41,8 +41,16 @@ class MagicPlanService: raise ValueError(f"No MagicPlan found for address: {address!r}") magic_plan: MagicPlanPlan = self._client.get_plan(matched.id) + raw_bytes: bytes = self._client.get_plan_raw(matched.id) plan: Plan = map_plan(magic_plan) + compressed = gzip.compress(raw_bytes) + if uprn is not None: + s3_key = f"documents/uprn/{uprn}/magic_plan_{matched.id}.json.gz" + else: + s3_key = f"documents/hubspot_deal_id/{request.hubspot_deal_id}/magic_plan_{matched.id}.json.gz" + save_data_to_s3(compressed, self._s3_bucket, s3_key) + with db_session() as session: save_plan(session, plan) diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 87e20506..65c19b7a 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -43,7 +43,9 @@ def plan_summary() -> PlanSummary: @pytest.fixture() def mock_client() -> MagicMock: - return MagicMock(spec=MagicPlanClient) + client = MagicMock(spec=MagicPlanClient) + client.get_plan_raw.return_value = b"{}" + return client def _make_service(mock_client: MagicMock) -> MagicPlanService: From 03e8750c1ac233b408820eb56b4976b7b5a97d0b Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:17:51 +0000 Subject: [PATCH 24/43] =?UTF-8?q?Upload=20MagicPlan=20JSON=20to=20S3=20usi?= =?UTF-8?q?ng=20hubspot=5Fdeal=5Fid=20key=20when=20UPRN=20absent=20?= =?UTF-8?q?=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tests/test_magic_plan_service.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 65c19b7a..70099e91 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -198,3 +198,31 @@ def test_run_uploads_to_s3_with_uprn_key( S3_BUCKET, f"documents/uprn/100023336956/magic_plan_{plan_summary.id}.json.gz", ) + + +def test_run_uploads_to_s3_with_deal_id_key_when_uprn_absent( + mock_client: MagicMock, + api_magic_plan: MagicPlanPlan, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plan.return_value = api_magic_plan + request = _make_request(hubspot_deal_id="deal-456", uprn=None) + service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( + "backend.magic_plan.magic_plan_service.db_session" + ), patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ) as mock_s3: + # Act + service.run(request) + # Assert + mock_s3.assert_called_once_with( + ANY, + S3_BUCKET, + f"documents/hubspot_deal_id/deal-456/magic_plan_{plan_summary.id}.json.gz", + ) From 8ac77ce8b967eb87884443ad4e40e5e2d8c6fa29 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:19:49 +0000 Subject: [PATCH 25/43] =?UTF-8?q?Persist=20UploadedFile=20record=20for=20e?= =?UTF-8?q?ach=20MagicPlan=20S3=20upload=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 5 +++ .../tests/test_magic_plan_service.py | 45 +++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index bb68fa42..0aeed686 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -11,6 +11,11 @@ from datatypes.magicplan.domain.models import Plan from backend.app.db.connection import db_session from backend.app.db.functions.magic_plan_functions import save_plan +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) from backend.magic_plan.address_matcher import find_matching_plan from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index 70099e91..b7580546 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -8,6 +8,11 @@ from datatypes.magicplan.api.response import MagicPlanPlan, PlanSummary from datatypes.magicplan.domain.mapper import map_plan from datatypes.magicplan.domain.models import Plan +from backend.app.db.models.uploaded_file import ( + FileSourceEnum, + FileTypeEnum, + UploadedFile, +) from backend.magic_plan.magic_plan_client import MagicPlanClient from backend.magic_plan.magic_plan_service import MagicPlanService from backend.magic_plan.magic_plan_trigger_request import MagicPlanTriggerRequest @@ -226,3 +231,43 @@ def test_run_uploads_to_s3_with_deal_id_key_when_uprn_absent( S3_BUCKET, f"documents/hubspot_deal_id/deal-456/magic_plan_{plan_summary.id}.json.gz", ) + + +# --- UploadedFile record --- + + +def test_run_creates_uploaded_file_record( + mock_client: MagicMock, + api_magic_plan: MagicPlanPlan, + plan_summary: PlanSummary, +) -> None: + # Arrange + mock_client.get_plans.return_value.plans = [plan_summary] + mock_client.get_plan.return_value = api_magic_plan + request = _make_request(hubspot_deal_id="deal-789", uprn="100023336956") + service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) + mock_session = MagicMock() + with patch( + "backend.magic_plan.magic_plan_service.find_matching_plan", + return_value=plan_summary, + ), patch("backend.magic_plan.magic_plan_service.save_plan"), patch( + "backend.magic_plan.magic_plan_service.db_session" + ) as mock_db, patch( + "backend.magic_plan.magic_plan_service.save_data_to_s3" + ): + mock_db.return_value.__enter__.return_value = mock_session + # Act + service.run(request) + # Assert + added_objects = [call.args[0] for call in mock_session.add.call_args_list] + uploaded_file = next( + (obj for obj in added_objects if isinstance(obj, UploadedFile)), None + ) + assert uploaded_file is not None + assert uploaded_file.file_source == FileSourceEnum.MAGIC_PLAN.value + assert uploaded_file.file_type == FileTypeEnum.MAGIC_PLAN_JSON.value + assert uploaded_file.s3_file_bucket == S3_BUCKET + assert uploaded_file.s3_file_key == f"documents/uprn/100023336956/magic_plan_{plan_summary.id}.json.gz" + assert uploaded_file.s3_upload_timestamp is not None + assert uploaded_file.uprn == 100023336956 + assert uploaded_file.hubspot_deal_id == "deal-789" From 337474e7733110e633601c5a7eaed26d6bbe4241 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:20:49 +0000 Subject: [PATCH 26/43] =?UTF-8?q?Persist=20UploadedFile=20record=20for=20e?= =?UTF-8?q?ach=20MagicPlan=20S3=20upload=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 0aeed686..7860fec9 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,4 +1,5 @@ import gzip +from datetime import datetime, timezone from typing import Optional from datatypes.magicplan.api.response import ( @@ -58,5 +59,16 @@ class MagicPlanService: with db_session() as session: save_plan(session, plan) + session.add( + UploadedFile( + s3_file_bucket=self._s3_bucket, + s3_file_key=s3_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn is not None else None, + hubspot_deal_id=request.hubspot_deal_id, + file_source=FileSourceEnum.MAGIC_PLAN.value, + file_type=FileTypeEnum.MAGIC_PLAN_JSON.value, + ) + ) return plan From e1972e4349d86db57eca6ad5230bcc900a10935a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:23:32 +0000 Subject: [PATCH 27/43] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20JS?= =?UTF-8?q?ON=20to=20S3=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 47 +++++++++++++++--------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 7860fec9..9ac17e55 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -50,25 +50,38 @@ class MagicPlanService: raw_bytes: bytes = self._client.get_plan_raw(matched.id) plan: Plan = map_plan(magic_plan) - compressed = gzip.compress(raw_bytes) - if uprn is not None: - s3_key = f"documents/uprn/{uprn}/magic_plan_{matched.id}.json.gz" - else: - s3_key = f"documents/hubspot_deal_id/{request.hubspot_deal_id}/magic_plan_{matched.id}.json.gz" - save_data_to_s3(compressed, self._s3_bucket, s3_key) + uploaded_file = self._upload_raw_plan_json( + plan_id=matched.id, + raw_bytes=raw_bytes, + uprn=uprn, + hubspot_deal_id=request.hubspot_deal_id, + ) with db_session() as session: save_plan(session, plan) - session.add( - UploadedFile( - s3_file_bucket=self._s3_bucket, - s3_file_key=s3_key, - s3_upload_timestamp=datetime.now(timezone.utc), - uprn=int(uprn) if uprn is not None else None, - hubspot_deal_id=request.hubspot_deal_id, - file_source=FileSourceEnum.MAGIC_PLAN.value, - file_type=FileTypeEnum.MAGIC_PLAN_JSON.value, - ) - ) + session.add(uploaded_file) return plan + + def _upload_raw_plan_json( + self, + plan_id: str, + raw_bytes: bytes, + uprn: Optional[str], + hubspot_deal_id: str, + ) -> UploadedFile: + compressed = gzip.compress(raw_bytes) + if uprn is not None: + s3_key = f"documents/uprn/{uprn}/magic_plan_{plan_id}.json.gz" + else: + s3_key = f"documents/hubspot_deal_id/{hubspot_deal_id}/magic_plan_{plan_id}.json.gz" + save_data_to_s3(compressed, self._s3_bucket, s3_key) + return UploadedFile( + s3_file_bucket=self._s3_bucket, + s3_file_key=s3_key, + s3_upload_timestamp=datetime.now(timezone.utc), + uprn=int(uprn) if uprn is not None else None, + hubspot_deal_id=hubspot_deal_id, + file_source=FileSourceEnum.MAGIC_PLAN.value, + file_type=FileTypeEnum.MAGIC_PLAN_JSON.value, + ) From 9f62e3c31abe61a9f67faf38e3bbc77730a1b64d Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 14:30:59 +0000 Subject: [PATCH 28/43] typehint --- backend/magic_plan/magic_plan_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 9ac17e55..6ed25c0c 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -50,7 +50,7 @@ class MagicPlanService: raw_bytes: bytes = self._client.get_plan_raw(matched.id) plan: Plan = map_plan(magic_plan) - uploaded_file = self._upload_raw_plan_json( + uploaded_file: UploadedFile = self._upload_raw_plan_json( plan_id=matched.id, raw_bytes=raw_bytes, uprn=uprn, From c9c43f178c51ae061dce767f1062981a3fa8acf3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 14:48:15 +0000 Subject: [PATCH 29/43] demo generated for use in address2uprn --- backend/address2UPRN/main.py | 76 +----- backend/address2UPRN/scoring.py | 57 +++++ datatypes/epc/domain/historic_epc.py | 88 +++++++ datatypes/epc/domain/historic_epc_matching.py | 114 +++++++++ datatypes/epc/domain/plan.md | 161 ------------ .../tests/test_historic_epc_matching.py | 239 ++++++++++++++++++ datatypes/epc/loaders/historic_epc.py | 2 +- datatypes/epc/schema/historic_epc.py | 98 ------- .../schema/tests/test_historic_epc_loading.py | 2 +- scripts/historic_epc_demo.py | 47 ++++ utils/s3.py | 15 ++ 11 files changed, 570 insertions(+), 329 deletions(-) create mode 100644 backend/address2UPRN/scoring.py create mode 100644 datatypes/epc/domain/historic_epc_matching.py delete mode 100644 datatypes/epc/domain/plan.md create mode 100644 datatypes/epc/domain/tests/test_historic_epc_matching.py delete mode 100644 datatypes/epc/schema/historic_epc.py create mode 100644 scripts/historic_epc_demo.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 28ad344f..b83c7a58 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -17,16 +17,12 @@ from utils.s3 import ( from datetime import datetime from backend.utils.addressMatch import AddressMatch - -logger = setup_logger() - - -EPC_AUTH_TOKEN = os.getenv( - "EPC_AUTH_TOKEN", +from backend.address2UPRN.scoring import ( # noqa: F401 (re-exported) + df_has_single_uprn, + get_uprn_candidates, ) -if EPC_AUTH_TOKEN is None: - raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +logger = setup_logger() def score_addresses( @@ -45,7 +41,10 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): Recursively fetch EPC data by postcode. If results hit the size limit, retry with double size up to max_attempts. """ - client = EpcClient(auth_token=EPC_AUTH_TOKEN) + auth_token = os.getenv("EPC_AUTH_TOKEN") + if auth_token is None: + raise RuntimeError("EPC_AUTH_TOKEN not defined in env") + client = EpcClient(auth_token=auth_token) url = os.path.join(client.domestic.host, "search") @@ -88,65 +87,6 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): return results_df -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: - """ - Returns True if all non-null UPRNs in df match the given uprn. - Returns False otherwise. - """ - - if column not in df.columns: - return False - - # Drop nulls and normalise to string - uprns = df[column].dropna().astype(str).str.strip().unique() - - # No valid UPRNs to compare - if len(uprns) == 0: - return False - - # Exactly one unique UPRN and it matches - return len(uprns) == 1 and uprns[0] == str(uprn) - - -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - - Returns a DataFrame sorted by descending lexiscore. - DOES NOT choose or return a UPRN. - """ - - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - - user_norm = AddressMatch.normalise_address(user_address) - - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - - # Normalise UPRN to string - out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) - - # Rank: 1 = best match - out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) - - return out.sort_values( - ["lexirank", "lexiscore"], - ascending=[True, False], - ) - - def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py new file mode 100644 index 00000000..d31b9aea --- /dev/null +++ b/backend/address2UPRN/scoring.py @@ -0,0 +1,57 @@ +import pandas as pd + +from backend.utils.addressMatch import AddressMatch + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """ + Returns True if all non-null UPRNs in df match the given uprn. + Returns False otherwise. + """ + + if column not in df.columns: + return False + + uprns = df[column].dropna().astype(str).str.strip().unique() + + if len(uprns) == 0: + return False + + return len(uprns) == 1 and uprns[0] == str(uprn) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + + Returns a DataFrame sorted by descending lexiscore. + DOES NOT choose or return a UPRN. + """ + + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + + user_norm = AddressMatch.normalise_address(user_address) + + out["lexiscore"] = out[address_column].apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) + + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) + + return out.sort_values( + ["lexirank", "lexiscore"], + ascending=[True, False], + ) diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py index 230c6327..f64ab8c4 100644 --- a/datatypes/epc/domain/historic_epc.py +++ b/datatypes/epc/domain/historic_epc.py @@ -3,8 +3,96 @@ from dataclasses import dataclass @dataclass class HistoricEpc: + lmk_key: str address1: str address2: str address3: str postcode: str + building_reference_number: str + current_energy_rating: str + potential_energy_rating: str + current_energy_efficiency: str + potential_energy_efficiency: str + property_type: str + built_form: str + inspection_date: str + local_authority: str + constituency: str + county: str + lodgement_date: str + transaction_type: str + environment_impact_current: str + environment_impact_potential: str + energy_consumption_current: str + energy_consumption_potential: str + co2_emissions_current: str + co2_emiss_curr_per_floor_area: str + co2_emissions_potential: str + lighting_cost_current: str + lighting_cost_potential: str + heating_cost_current: str + heating_cost_potential: str + hot_water_cost_current: str + hot_water_cost_potential: str + total_floor_area: str + energy_tariff: str + mains_gas_flag: str + floor_level: str + flat_top_storey: str + flat_storey_count: str + main_heating_controls: str + multi_glaze_proportion: str + glazed_type: str + glazed_area: str + extension_count: str + number_habitable_rooms: str + number_heated_rooms: str + low_energy_lighting: str + number_open_fireplaces: str + hotwater_description: str + hot_water_energy_eff: str + hot_water_env_eff: str + floor_description: str + floor_energy_eff: str + floor_env_eff: str + windows_description: str + windows_energy_eff: str + windows_env_eff: str + walls_description: str + walls_energy_eff: str + walls_env_eff: str + secondheat_description: str + sheating_energy_eff: str + sheating_env_eff: str + roof_description: str + roof_energy_eff: str + roof_env_eff: str + mainheat_description: str + mainheat_energy_eff: str + mainheat_env_eff: str + mainheatcont_description: str + mainheatc_energy_eff: str + mainheatc_env_eff: str + lighting_description: str + lighting_energy_eff: str + lighting_env_eff: str + main_fuel: str + wind_turbine_count: str + heat_loss_corridor: str + unheated_corridor_length: str + floor_height: str + photo_supply: str + solar_water_heating_flag: str + mechanical_ventilation: str + address: str + local_authority_label: str + constituency_label: str + posttown: str + construction_age_band: str + lodgement_datetime: str + tenure: str + fixed_lighting_outlets_count: str + low_energy_fixed_light_count: str uprn: str + uprn_source: str + report_type: str diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py new file mode 100644 index 00000000..53f602ae --- /dev/null +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -0,0 +1,114 @@ +from dataclasses import dataclass +from typing import Any, Optional + +import pandas as pd +from botocore.exceptions import ClientError + +from backend.address2UPRN.scoring import get_uprn_candidates +from backend.utils.addressMatch import AddressMatch +from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 + +DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" + +_EXTRA_COLS = {"lexiscore", "lexirank"} + + +def _cell_to_str(v: Any) -> str: + if v is None or (isinstance(v, float) and pd.isna(v)): + return "" + s = str(v).replace("\xa0", " ") + # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". + # Treat that as missing so unambiguous_uprn truthiness checks work. + if s.lower() == "nan": + return "" + return s + + +def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: + kwargs = { + col.lower(): _cell_to_str(val) + for col, val in row.items() + if col.lower() not in _EXTRA_COLS + } + return HistoricEpc(**kwargs) + + +@dataclass(frozen=True) +class ScoredHistoricEpc: + record: HistoricEpc + lexiscore: float + lexirank: int + + +@dataclass +class HistoricEpcMatches: + user_address: str + postcode: str + matches: list[ScoredHistoricEpc] + + def top(self) -> Optional[ScoredHistoricEpc]: + return self.matches[0] if self.matches else None + + def top_n(self, k: int) -> list[ScoredHistoricEpc]: + return self.matches[:k] + + def unambiguous_uprn(self) -> Optional[str]: + top = self.top() + if top is None or top.lexiscore <= 0: + return None + rank1 = [m for m in self.matches if m.lexirank == top.lexirank] + uprns = {m.record.uprn for m in rank1 if m.record.uprn} + return next(iter(uprns)) if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + cleaned = (postcode or "").upper().replace(" ", "") + if not cleaned: + raise ValueError("postcode must contain non-whitespace characters") + if not AddressMatch.is_valid_postcode(cleaned): + raise ValueError(f"postcode {cleaned!r} is not a valid UK postcode") + return cleaned + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + s3_root: str = DEFAULT_S3_ROOT, + address_column: str = "ADDRESS", + uprn_column: str = "UPRN", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + bucket, root_prefix = parse_s3_uri(s3_root) + key = f"{root_prefix.rstrip('/')}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + scored = get_uprn_candidates( + df, + user_address=user_address, + address_column=address_column, + uprn_column=uprn_column, + ) + + matches = [ + ScoredHistoricEpc( + record=_row_to_historic_epc(row), + lexiscore=float(row["lexiscore"]), + lexirank=int(row["lexirank"]), + ) + for _, row in scored.iterrows() + ] + + return HistoricEpcMatches(user_address=user_address, postcode=pc, matches=matches) diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md deleted file mode 100644 index 45cc495b..00000000 --- a/datatypes/epc/domain/plan.md +++ /dev/null @@ -1,161 +0,0 @@ -# Historic EPC address-match service - -## Context - -ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc//data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. - -Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. - -## Approach - -Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. - -### 1. Add gzip-CSV S3 reader - -In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): - -```python -def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: - if not file_key.endswith(".csv.gz"): - raise ValueError("file_key must end with .csv.gz") - buf = read_io_from_s3(bucket_name, file_key) - return pd.read_csv(buf, compression="gzip", low_memory=False) -``` - -Reuses existing `read_io_from_s3` (line 105). Caller catches `botocore.exceptions.ClientError` for missing-key handling. - -### 2. Append matcher to domain module - -In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: - -```python -from typing import Optional -import pandas as pd -from botocore.exceptions import ClientError - -from backend.utils.addressMatch import AddressMatch -from utils.s3 import read_csv_gz_from_s3 - - -@dataclass -class HistoricEpcMatches: - """Scored historic EPC rows for a single postcode.""" - user_address: str - postcode: str # sanitised - df: pd.DataFrame # has lexiscore + lexirank, sorted best-first - - def top(self) -> Optional[pd.Series]: - return None if self.df.empty else self.df.iloc[0] - - def top_n(self, k: int) -> pd.DataFrame: - return self.df.head(k) - - def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: - if self.df.empty: - return None - top_rank = self.df["lexirank"].min() - uprns = ( - self.df.loc[self.df["lexirank"] == top_rank, uprn_column] - .dropna().astype(str).str.replace(r"\.0$", "", regex=True) - .unique() - ) - return uprns[0] if len(uprns) == 1 else None - - -def _sanitise_postcode(postcode: str) -> str: - if not postcode: - raise ValueError("postcode must be non-empty") - return postcode.upper().replace(" ", "") - - -def match_addresses_for_postcode( - user_address: str, - postcode: str, - *, - bucket: str = "retrofit-data-dev", - prefix: str = "historical_epc", - address_column: str = "ADDRESS", -) -> HistoricEpcMatches: - if not user_address: - raise ValueError("user_address must be non-empty") - - pc = _sanitise_postcode(postcode) - key = f"{prefix}/{pc}/data.csv.gz" - - try: - df = read_csv_gz_from_s3(bucket, key) - except ClientError as e: - if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): - raise FileNotFoundError( - f"No historic EPC data at s3://{bucket}/{key}" - ) from e - raise - - if address_column not in df.columns: - raise ValueError( - f"Missing address column {address_column!r} in {key}" - ) - - user_norm = AddressMatch.normalise_address(user_address) - df = df.copy() - df["lexiscore"] = df[address_column].fillna("").apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - df["lexirank"] = ( - df["lexiscore"].rank(method="dense", ascending=False).astype(int) - ) - df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) - - return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) -``` - -### Reuse notes -- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. -- Score column copy uses `.fillna("")` to defend against NaN in `ADDRESS`. -- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). - -### 3. Tests - -New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). - -Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). - -Cases: -1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty raises. -2. Returned df has `lexiscore` + `lexirank` columns, row count preserved. -3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. -4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). -5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. -6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. -7. `top()` / `top_n(k)` shape checks. - -## Critical files -- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher -- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` -- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new - -## Out of scope -- Lambda handler / SQS wiring (deferred — lib only) -- Threshold logic (caller decides via wrapper helpers) -- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) -- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern - -## Verification -``` -cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v -``` - -Sample real-S3 call (needs AWS creds): -```python -from datatypes.epc.domain.historic_epc import match_addresses_for_postcode -m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") -print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) -print(m.unambiguous_uprn()) -``` - -## Sequencing -1. Add `read_csv_gz_from_s3` to `utils/s3.py`. -2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. -3. Add tests. - -Steps 2 & 3 depend on 1. No `__init__.py` re-exports needed. diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py new file mode 100644 index 00000000..c23846e1 --- /dev/null +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -0,0 +1,239 @@ +from unittest.mock import patch + +import numpy as np +import pandas as pd +import pytest +from botocore.exceptions import ClientError + +from datatypes.epc.domain import historic_epc_matching as matcher_mod +from datatypes.epc.domain.historic_epc_matching import ( + HistoricEpcMatches, + ScoredHistoricEpc, + _sanitise_postcode, + match_addresses_for_postcode, +) + + +# Columns required by the HistoricEpc dataclass (lower-cased CSV columns). +# The matcher only reads ADDRESS + UPRN to score; everything else is filled +# with "" but must be present for HistoricEpc(**kwargs) to construct. +_FULL_COLUMN_FIELDS = [ + "LMK_KEY", "ADDRESS1", "ADDRESS2", "ADDRESS3", "POSTCODE", + "BUILDING_REFERENCE_NUMBER", "CURRENT_ENERGY_RATING", "POTENTIAL_ENERGY_RATING", + "CURRENT_ENERGY_EFFICIENCY", "POTENTIAL_ENERGY_EFFICIENCY", "PROPERTY_TYPE", + "BUILT_FORM", "INSPECTION_DATE", "LOCAL_AUTHORITY", "CONSTITUENCY", "COUNTY", + "LODGEMENT_DATE", "TRANSACTION_TYPE", "ENVIRONMENT_IMPACT_CURRENT", + "ENVIRONMENT_IMPACT_POTENTIAL", "ENERGY_CONSUMPTION_CURRENT", + "ENERGY_CONSUMPTION_POTENTIAL", "CO2_EMISSIONS_CURRENT", + "CO2_EMISS_CURR_PER_FLOOR_AREA", "CO2_EMISSIONS_POTENTIAL", + "LIGHTING_COST_CURRENT", "LIGHTING_COST_POTENTIAL", "HEATING_COST_CURRENT", + "HEATING_COST_POTENTIAL", "HOT_WATER_COST_CURRENT", "HOT_WATER_COST_POTENTIAL", + "TOTAL_FLOOR_AREA", "ENERGY_TARIFF", "MAINS_GAS_FLAG", "FLOOR_LEVEL", + "FLAT_TOP_STOREY", "FLAT_STOREY_COUNT", "MAIN_HEATING_CONTROLS", + "MULTI_GLAZE_PROPORTION", "GLAZED_TYPE", "GLAZED_AREA", "EXTENSION_COUNT", + "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", "HOTWATER_DESCRIPTION", "HOT_WATER_ENERGY_EFF", + "HOT_WATER_ENV_EFF", "FLOOR_DESCRIPTION", "FLOOR_ENERGY_EFF", "FLOOR_ENV_EFF", + "WINDOWS_DESCRIPTION", "WINDOWS_ENERGY_EFF", "WINDOWS_ENV_EFF", + "WALLS_DESCRIPTION", "WALLS_ENERGY_EFF", "WALLS_ENV_EFF", + "SECONDHEAT_DESCRIPTION", "SHEATING_ENERGY_EFF", "SHEATING_ENV_EFF", + "ROOF_DESCRIPTION", "ROOF_ENERGY_EFF", "ROOF_ENV_EFF", "MAINHEAT_DESCRIPTION", + "MAINHEAT_ENERGY_EFF", "MAINHEAT_ENV_EFF", "MAINHEATCONT_DESCRIPTION", + "MAINHEATC_ENERGY_EFF", "MAINHEATC_ENV_EFF", "LIGHTING_DESCRIPTION", + "LIGHTING_ENERGY_EFF", "LIGHTING_ENV_EFF", "MAIN_FUEL", "WIND_TURBINE_COUNT", + "HEAT_LOSS_CORRIDOR", "UNHEATED_CORRIDOR_LENGTH", "FLOOR_HEIGHT", + "PHOTO_SUPPLY", "SOLAR_WATER_HEATING_FLAG", "MECHANICAL_VENTILATION", + "ADDRESS", "LOCAL_AUTHORITY_LABEL", "CONSTITUENCY_LABEL", "POSTTOWN", + "CONSTRUCTION_AGE_BAND", "LODGEMENT_DATETIME", "TENURE", + "FIXED_LIGHTING_OUTLETS_COUNT", "LOW_ENERGY_FIXED_LIGHT_COUNT", "UPRN", + "UPRN_SOURCE", "REPORT_TYPE", +] + + +def _row(address: str, uprn) -> dict: + row = {col: "" for col in _FULL_COLUMN_FIELDS} + row["ADDRESS"] = address + row["UPRN"] = uprn + return row + + +def _build_df(rows: list[dict]) -> pd.DataFrame: + return pd.DataFrame(rows, columns=_FULL_COLUMN_FIELDS) + + +@pytest.fixture +def patch_postcode_valid(): + with patch.object(matcher_mod.AddressMatch, "is_valid_postcode", return_value=True) as m: + yield m + + +@pytest.fixture +def patch_read(): + with patch.object(matcher_mod, "read_csv_gz_from_s3") as m: + yield m + + +# ---------- _sanitise_postcode ---------- + + +class TestSanitisePostcode: + + def test_uppercases_and_strips_spaces(self, patch_postcode_valid): + assert _sanitise_postcode("ab33 8al") == "AB338AL" + + def test_empty_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="non-whitespace"): + _sanitise_postcode("") + + def test_whitespace_only_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="non-whitespace"): + _sanitise_postcode(" ") + + def test_invalid_postcode_raises(self): + with patch.object( + matcher_mod.AddressMatch, "is_valid_postcode", return_value=False + ): + with pytest.raises(ValueError, match="not a valid UK postcode"): + _sanitise_postcode("NONSENSE") + + +# ---------- match_addresses_for_postcode ---------- + + +class TestMatchAddressesForPostcode: + + def test_preserves_row_count_including_zero_score_rows( + self, patch_read, patch_postcode_valid + ): + # Disjoint number sets => hard zero. Still kept in matches. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("999 SOMEWHERE ELSE", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert isinstance(result, HistoricEpcMatches) + assert len(result.matches) == 2 + + def test_top_has_lexirank_one_and_lexiscore_monotone( + self, patch_read, patch_postcode_valid + ): + patch_read.return_value = _build_df([ + _row("48 GORDON ROAD", "200"), # near miss + _row("47 GORDON ROAD", "100"), # exact (after normalisation) + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.top().lexirank == 1 + scores = [m.lexiscore for m in result.matches] + assert scores == sorted(scores, reverse=True) + + def test_s3_key_built_from_default_root(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([_row("47 GORDON ROAD", "100")]) + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + patch_read.assert_called_once_with( + "retrofit-data-dev", "historical_epc/AB338AL/data.csv.gz" + ) + + def test_s3_key_respects_custom_root_with_trailing_slash( + self, patch_read, patch_postcode_valid + ): + patch_read.return_value = _build_df([_row("47 GORDON ROAD", "100")]) + match_addresses_for_postcode( + "47 Gordon Road", + "AB33 8AL", + s3_root="s3://my-bucket/some/prefix/", + ) + patch_read.assert_called_once_with( + "my-bucket", "some/prefix/AB338AL/data.csv.gz" + ) + + def test_no_such_key_translates_to_filenotfound( + self, patch_read, patch_postcode_valid + ): + patch_read.side_effect = ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "missing"}}, "GetObject" + ) + with pytest.raises(FileNotFoundError): + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + + def test_other_client_error_propagates(self, patch_read, patch_postcode_valid): + patch_read.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "nope"}}, "GetObject" + ) + with pytest.raises(ClientError): + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + + def test_empty_user_address_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="user_address"): + match_addresses_for_postcode("", "AB33 8AL") + + +# ---------- unambiguous_uprn ---------- + + +class TestUnambiguousUprn: + + def test_exact_match_returns_uprn(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.unambiguous_uprn() == "100" + + def test_ambiguous_tie_returns_none(self, patch_read, patch_postcode_valid): + # Two duplicate addresses with different UPRNs share rank-1. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("47 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.unambiguous_uprn() is None + + def test_all_zero_score_returns_none_even_when_uprn_unique( + self, patch_read, patch_postcode_valid + ): + # User address has building number 47; no row has 47 -> all hard-zero. + patch_read.return_value = _build_df([ + _row("999 ELSEWHERE", "100"), + _row("888 ELSEWHERE", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert all(m.lexiscore == 0.0 for m in result.matches) + assert result.unambiguous_uprn() is None + + def test_nan_uprn_becomes_empty_string_not_nan( + self, patch_read, patch_postcode_valid + ): + # Use a real NaN in the UPRN cell. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", np.nan), + _row("48 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + top = result.top() + # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), + # so unambiguous_uprn's truthiness check correctly drops the row. + assert top.record.uprn == "" + + +# ---------- top / top_n ---------- + + +class TestTopHelpers: + + def test_top_n_returns_first_k(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + _row("49 GORDON ROAD", "300"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + top2 = result.top_n(2) + assert len(top2) == 2 + assert all(isinstance(m, ScoredHistoricEpc) for m in top2) + + def test_top_on_empty_matches_returns_none(self): + empty = HistoricEpcMatches(user_address="x", postcode="AB338AL", matches=[]) + assert empty.top() is None + assert empty.top_n(5) == [] + assert empty.unambiguous_uprn() is None diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py index 7b563315..a4757d23 100644 --- a/datatypes/epc/loaders/historic_epc.py +++ b/datatypes/epc/loaders/historic_epc.py @@ -1,6 +1,6 @@ import csv -from datatypes.epc.schema.historic_epc import HistoricEpc +from datatypes.epc.domain.historic_epc import HistoricEpc def _normalise(value: str | None) -> str: diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py deleted file mode 100644 index f64ab8c4..00000000 --- a/datatypes/epc/schema/historic_epc.py +++ /dev/null @@ -1,98 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class HistoricEpc: - lmk_key: str - address1: str - address2: str - address3: str - postcode: str - building_reference_number: str - current_energy_rating: str - potential_energy_rating: str - current_energy_efficiency: str - potential_energy_efficiency: str - property_type: str - built_form: str - inspection_date: str - local_authority: str - constituency: str - county: str - lodgement_date: str - transaction_type: str - environment_impact_current: str - environment_impact_potential: str - energy_consumption_current: str - energy_consumption_potential: str - co2_emissions_current: str - co2_emiss_curr_per_floor_area: str - co2_emissions_potential: str - lighting_cost_current: str - lighting_cost_potential: str - heating_cost_current: str - heating_cost_potential: str - hot_water_cost_current: str - hot_water_cost_potential: str - total_floor_area: str - energy_tariff: str - mains_gas_flag: str - floor_level: str - flat_top_storey: str - flat_storey_count: str - main_heating_controls: str - multi_glaze_proportion: str - glazed_type: str - glazed_area: str - extension_count: str - number_habitable_rooms: str - number_heated_rooms: str - low_energy_lighting: str - number_open_fireplaces: str - hotwater_description: str - hot_water_energy_eff: str - hot_water_env_eff: str - floor_description: str - floor_energy_eff: str - floor_env_eff: str - windows_description: str - windows_energy_eff: str - windows_env_eff: str - walls_description: str - walls_energy_eff: str - walls_env_eff: str - secondheat_description: str - sheating_energy_eff: str - sheating_env_eff: str - roof_description: str - roof_energy_eff: str - roof_env_eff: str - mainheat_description: str - mainheat_energy_eff: str - mainheat_env_eff: str - mainheatcont_description: str - mainheatc_energy_eff: str - mainheatc_env_eff: str - lighting_description: str - lighting_energy_eff: str - lighting_env_eff: str - main_fuel: str - wind_turbine_count: str - heat_loss_corridor: str - unheated_corridor_length: str - floor_height: str - photo_supply: str - solar_water_heating_flag: str - mechanical_ventilation: str - address: str - local_authority_label: str - constituency_label: str - posttown: str - construction_age_band: str - lodgement_datetime: str - tenure: str - fixed_lighting_outlets_count: str - low_energy_fixed_light_count: str - uprn: str - uprn_source: str - report_type: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index 2170a8a6..a42f383e 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -3,7 +3,7 @@ import os import pytest from datatypes.epc.loaders.historic_epc import read_historic_epc_csv -from datatypes.epc.schema.historic_epc import HistoricEpc +from datatypes.epc.domain.historic_epc import HistoricEpc FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") diff --git a/scripts/historic_epc_demo.py b/scripts/historic_epc_demo.py new file mode 100644 index 00000000..b47c3a3c --- /dev/null +++ b/scripts/historic_epc_demo.py @@ -0,0 +1,47 @@ +"""Demo: look up historic EPC records for an address + postcode. + +Reads the gzipped CSV at + s3://retrofit-data-dev/historical_epc//data.csv.gz +scores rows against the user-provided address, and prints the top matches. + +Usage: + python -m scripts.historic_epc_demo "47 Gordon Road" "AB33 8AL" + python -m scripts.historic_epc_demo # uses defaults below +""" + +import sys + +from datatypes.epc.domain.historic_epc_matching import match_addresses_for_postcode + + +def main(user_address: str, postcode: str) -> None: + print(f"Looking up: {user_address!r} @ {postcode!r}\n") + + result = match_addresses_for_postcode(user_address, postcode) + + print(f"Found {len(result.matches)} candidate row(s).\n") + + print("Top 3 matches:") + for m in result.top_n(3): + print( + f" rank={m.lexirank} score={m.lexiscore:.3f} " + f"uprn={m.record.uprn or '(none)':<14} {m.record.address}" + ) + + print() + uprn = result.unambiguous_uprn() + if uprn: + print(f"Unambiguous UPRN: {uprn}") + else: + print("No unambiguous UPRN (zero-score, tie, or empty result).") + + +if __name__ == "__main__": + args = sys.argv[1:] + if len(args) == 2: + main(args[0], args[1]) + elif len(args) == 0: + main("47 Gordon Road", "AB33 8AL") + else: + print(__doc__) + sys.exit(2) diff --git a/utils/s3.py b/utils/s3.py index 930e2e15..a28f074e 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -167,6 +167,21 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key): return df +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + """ + Read a gzipped CSV from S3 into a pandas DataFrame. + + :param bucket_name: Name of the S3 bucket. + :param file_key: Key of the file (must end in .csv.gz). + :return: A pandas DataFrame. + """ + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + + buffer = read_io_from_s3(bucket_name=bucket_name, file_key=file_key) + return pd.read_csv(buffer, compression="gzip", low_memory=False) + + def save_csv_to_s3(dataframe, bucket_name, file_name): """ Save a Pandas DataFrame to a CSV file in an S3 bucket. From 7ef5dc49223676cc833e7deead3e8dd2339c178e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 15:06:41 +0000 Subject: [PATCH 30/43] update csv --- backend/etl/etl_opendatacommunities/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md index bf16ba89..65441caf 100644 --- a/backend/etl/etl_opendatacommunities/README.md +++ b/backend/etl/etl_opendatacommunities/README.md @@ -1,6 +1,6 @@ This website https://epc.opendatacommunities.org/ has closed down on 30th May 2026 -So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/epc_opendatacommunities/master_backup/ ) +So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/histroical_epc/0_master_backup/ ) This scripts assumes the following: @@ -11,4 +11,4 @@ The script funciton is: 1) reads csv for all data, seperate each iteration by postcode 2) compresses the csv and save it in the location -2) only gets the postcode data, compresses and uploads to s3 -> location s3://retrofit-data-dev/epc_opendatacommunities//compressed data \ No newline at end of file +3) location s3://retrofit-data-dev/epc_opendatacommunities//compressed data.csv \ No newline at end of file From ce2b61d60b10b5608f390a267ad2fe2d92b7d164 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 15:07:09 +0000 Subject: [PATCH 31/43] =?UTF-8?q?Upload=20gzip-compressed=20MagicPlan=20JS?= =?UTF-8?q?ON=20to=20S3=20-=20only=20make=20one=20API=20call=20?= =?UTF-8?q?=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/magic_plan/magic_plan_service.py | 5 ++++- backend/magic_plan/tests/test_magic_plan_service.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/backend/magic_plan/magic_plan_service.py b/backend/magic_plan/magic_plan_service.py index 6ed25c0c..fb0a7610 100644 --- a/backend/magic_plan/magic_plan_service.py +++ b/backend/magic_plan/magic_plan_service.py @@ -1,4 +1,5 @@ import gzip +import json from datetime import datetime, timezone from typing import Optional @@ -46,8 +47,10 @@ class MagicPlanService: if matched is None: raise ValueError(f"No MagicPlan found for address: {address!r}") - magic_plan: MagicPlanPlan = self._client.get_plan(matched.id) raw_bytes: bytes = self._client.get_plan_raw(matched.id) + magic_plan: MagicPlanPlan = MagicPlanPlan.model_validate( + json.loads(raw_bytes)["data"] + ) plan: Plan = map_plan(magic_plan) uploaded_file: UploadedFile = self._upload_raw_plan_json( diff --git a/backend/magic_plan/tests/test_magic_plan_service.py b/backend/magic_plan/tests/test_magic_plan_service.py index b7580546..f6954824 100644 --- a/backend/magic_plan/tests/test_magic_plan_service.py +++ b/backend/magic_plan/tests/test_magic_plan_service.py @@ -49,7 +49,9 @@ def plan_summary() -> PlanSummary: @pytest.fixture() def mock_client() -> MagicMock: client = MagicMock(spec=MagicPlanClient) - client.get_plan_raw.return_value = b"{}" + client.get_plan_raw.return_value = ( + FIXTURE_DIR / "magicplan_api_plan_response_example.json" + ).read_bytes() return client @@ -102,7 +104,7 @@ def test_run_fetches_plan_with_matched_id( ): service.run(_make_request()) # Assert - mock_client.get_plan.assert_called_once_with(plan_summary.id) + mock_client.get_plan_raw.assert_called_once_with(plan_summary.id) def test_run_returns_mapped_plan( @@ -183,8 +185,6 @@ def test_run_uploads_to_s3_with_uprn_key( ) -> None: # Arrange mock_client.get_plans.return_value.plans = [plan_summary] - mock_client.get_plan.return_value = api_magic_plan - mock_client.get_plan_raw.return_value = b'{"raw": "data"}' request = _make_request(uprn="100023336956") service = MagicPlanService(client=mock_client, s3_bucket=S3_BUCKET) with patch( From 1243690d100a0d73b6a91100e02ed289f160b87a Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 15:41:12 +0000 Subject: [PATCH 32/43] give handler permission to write to s3 bucket --- backend/magic_plan/handler.py | 1 + infrastructure/terraform/lambda/magic_plan/main.tf | 5 +++++ infrastructure/terraform/shared/main.tf | 14 ++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/backend/magic_plan/handler.py b/backend/magic_plan/handler.py index 45de8554..f2c03b90 100644 --- a/backend/magic_plan/handler.py +++ b/backend/magic_plan/handler.py @@ -19,6 +19,7 @@ def handler(body: dict[str, Any], context: Any) -> str: customer_id=settings.MAGICPLAN_CUSTOMER_ID, api_key=settings.MAGICPLAN_API_KEY, ) + # TODO: read s3_bucket from env var so staging/prod use the correct bucket plan: Plan = MagicPlanService(client, s3_bucket="retrofit-energy-assessments-dev").run(payload) logger.info("Saved MagicPlan plan uid=%s", plan.uid) return plan.uid diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/infrastructure/terraform/lambda/magic_plan/main.tf index 56adac1b..e2017b42 100644 --- a/infrastructure/terraform/lambda/magic_plan/main.tf +++ b/infrastructure/terraform/lambda/magic_plan/main.tf @@ -15,6 +15,11 @@ locals { db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) } +resource "aws_iam_role_policy_attachment" "magic_plan_s3_write" { + role = module.lambda.role_name + policy_arn = data.terraform_remote_state.shared.outputs.magic_plan_s3_write_arn +} + module "lambda" { source = "../../modules/lambda_with_sqs" diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 050ebdc2..e32ce395 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -745,4 +745,18 @@ module "magic_plan_client_registry" { source = "../modules/container_registry" name = "magic-plan" stage = var.stage +} + +module "magic_plan_s3_write" { + source = "../modules/s3_iam_policy" + + policy_name = "MagicPlanWriteS3" + policy_description = "Allow MagicPlan Lambda to write to retrofit energy assessments bucket" + bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] + actions = ["s3:PutObject", "s3:AbortMultipartUpload"] + resource_paths = ["/*"] +} + +output "magic_plan_s3_write_arn" { + value = module.magic_plan_s3_write.policy_arn } \ No newline at end of file From aadf73ed87db510efc17a1579122d7bc5e65ca15 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Fri, 8 May 2026 15:44:14 +0000 Subject: [PATCH 33/43] combine s3 write policies into one and apply to pashub and magicplan lambdas --- .../terraform/lambda/magic_plan/main.tf | 2 +- .../terraform/lambda/pashub_to_ara/main.tf | 2 +- infrastructure/terraform/shared/main.tf | 23 ++++--------------- 3 files changed, 7 insertions(+), 20 deletions(-) diff --git a/infrastructure/terraform/lambda/magic_plan/main.tf b/infrastructure/terraform/lambda/magic_plan/main.tf index e2017b42..48fc3867 100644 --- a/infrastructure/terraform/lambda/magic_plan/main.tf +++ b/infrastructure/terraform/lambda/magic_plan/main.tf @@ -17,7 +17,7 @@ locals { resource "aws_iam_role_policy_attachment" "magic_plan_s3_write" { role = module.lambda.role_name - policy_arn = data.terraform_remote_state.shared.outputs.magic_plan_s3_write_arn + policy_arn = data.terraform_remote_state.shared.outputs.energy_assessments_s3_write_arn } module "lambda" { diff --git a/infrastructure/terraform/lambda/pashub_to_ara/main.tf b/infrastructure/terraform/lambda/pashub_to_ara/main.tf index 1a457617..902d7845 100644 --- a/infrastructure/terraform/lambda/pashub_to_ara/main.tf +++ b/infrastructure/terraform/lambda/pashub_to_ara/main.tf @@ -54,5 +54,5 @@ module "lambda" { resource "aws_iam_role_policy_attachment" "pashub_to_ara_s3_write" { role = module.lambda.role_name - policy_arn = data.terraform_remote_state.shared.outputs.pashub_to_ara_s3_write_arn + policy_arn = data.terraform_remote_state.shared.outputs.energy_assessments_s3_write_arn } diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index e32ce395..2c3200de 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -568,18 +568,18 @@ module "pashub_to_ara_registry" { stage = var.stage } -module "pashub_to_ara_s3_write" { +module "energy_assessments_s3_write" { source = "../modules/s3_iam_policy" - policy_name = "PashubToAraWriteS3" - policy_description = "Allow PasHub to ARA Lambda to write to retrofit energy assessments bucket" + policy_name = "EnergyAssessmentsWriteS3" + policy_description = "Allow lambdas to write to retrofit energy assessments bucket" bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] actions = ["s3:PutObject", "s3:AbortMultipartUpload"] resource_paths = ["/*"] } -output "pashub_to_ara_s3_write_arn" { - value = module.pashub_to_ara_s3_write.policy_arn +output "energy_assessments_s3_write_arn" { + value = module.energy_assessments_s3_write.policy_arn } ################################################ @@ -747,16 +747,3 @@ module "magic_plan_client_registry" { stage = var.stage } -module "magic_plan_s3_write" { - source = "../modules/s3_iam_policy" - - policy_name = "MagicPlanWriteS3" - policy_description = "Allow MagicPlan Lambda to write to retrofit energy assessments bucket" - bucket_arns = ["arn:aws:s3:::retrofit-energy-assessments-${var.stage}"] - actions = ["s3:PutObject", "s3:AbortMultipartUpload"] - resource_paths = ["/*"] -} - -output "magic_plan_s3_write_arn" { - value = module.magic_plan_s3_write.policy_arn -} \ No newline at end of file From fb758b76bf2dcecbed486b569b9fa5e345a85ddc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:37:44 +0000 Subject: [PATCH 34/43] changed to utils --- datatypes/epc/domain/historic_epc_matching.py | 16 +++------------- .../domain/tests/test_historic_epc_matching.py | 2 +- utils/pandas_utils.py | 14 ++++++++++++++ utils/s3.py | 2 -- 4 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 utils/pandas_utils.py diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 53f602ae..2eb590e8 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import pandas as pd from botocore.exceptions import ClientError @@ -7,6 +7,7 @@ from botocore.exceptions import ClientError from backend.address2UPRN.scoring import get_uprn_candidates from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.pandas_utils import pandas_cell_to_str from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" @@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _cell_to_str(v: Any) -> str: - if v is None or (isinstance(v, float) and pd.isna(v)): - return "" - s = str(v).replace("\xa0", " ") - # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". - # Treat that as missing so unambiguous_uprn truthiness checks work. - if s.lower() == "nan": - return "" - return s - - def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: kwargs = { - col.lower(): _cell_to_str(val) + col.lower(): pandas_cell_to_str(val) for col, val in row.items() if col.lower() not in _EXTRA_COLS } diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py index c23846e1..1c3ee6d4 100644 --- a/datatypes/epc/domain/tests/test_historic_epc_matching.py +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -211,7 +211,7 @@ class TestUnambiguousUprn: ]) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") top = result.top() - # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), + # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), # so unambiguous_uprn's truthiness check correctly drops the row. assert top.record.uprn == "" diff --git a/utils/pandas_utils.py b/utils/pandas_utils.py new file mode 100644 index 00000000..b32cde10 --- /dev/null +++ b/utils/pandas_utils.py @@ -0,0 +1,14 @@ +from typing import Any + +import pandas as pd + + +def pandas_cell_to_str(v: Any) -> str: + if v is None or (isinstance(v, float) and pd.isna(v)): + return "" + s = str(v).replace("\xa0", " ") + # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". + # Treat that as missing so unambiguous_uprn truthiness checks work. + if s.lower() == "nan": + return "" + return s diff --git a/utils/s3.py b/utils/s3.py index a28f074e..13d272e7 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -6,8 +6,6 @@ from io import BytesIO, StringIO from urllib.parse import unquote from utils.logger import setup_logger from botocore.exceptions import NoCredentialsError, PartialCredentialsError -from typing import Any - logger = setup_logger() From dccb35c2bc05f613bb767958b2a0ea7e517433e8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:44:55 +0000 Subject: [PATCH 35/43] fixed s3 location --- backend/etl/etl_opendatacommunities/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md index 65441caf..728ac468 100644 --- a/backend/etl/etl_opendatacommunities/README.md +++ b/backend/etl/etl_opendatacommunities/README.md @@ -11,4 +11,4 @@ The script funciton is: 1) reads csv for all data, seperate each iteration by postcode 2) compresses the csv and save it in the location -3) location s3://retrofit-data-dev/epc_opendatacommunities//compressed data.csv \ No newline at end of file +3) location s3://retrofit-data-dev/historical_epc//compressed data.csv \ No newline at end of file From bf91722f30699124af4b36c23efcb108d09eeab8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:45:26 +0000 Subject: [PATCH 36/43] renamed a function to be self commenting --- datatypes/epc/domain/historic_epc_matching.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 2eb590e8..95ca9d9f 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -15,7 +15,7 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: +def _map_historic_epc_pandas_row_to_domain(row: pd.Series) -> HistoricEpc: kwargs = { col.lower(): pandas_cell_to_str(val) for col, val in row.items() @@ -94,7 +94,7 @@ def match_addresses_for_postcode( matches = [ ScoredHistoricEpc( - record=_row_to_historic_epc(row), + record=_map_historic_epc_pandas_row_to_domain(row), lexiscore=float(row["lexiscore"]), lexirank=int(row["lexirank"]), ) From 2049553176ee88e1fefa2062f300d036e58206a1 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 11 May 2026 09:25:41 +0000 Subject: [PATCH 37/43] =?UTF-8?q?Trigger=20MagicPlan=20on=20outcome=20"sur?= =?UTF-8?q?veyed"=20transition=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 7 +-- etl/hubspot/tests/test_hubspot_deal_differ.py | 49 ++++++++++--------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 5435a46d..ba3dc27a 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -198,12 +198,7 @@ class HubspotDealDiffer: def check_for_magicplan_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - new_status = (new_deal.get("coordination_status__stage_1_") or "").lower() - old_status = (old_deal.coordination_status or "").lower() - return ( - new_status in HubspotDealDiffer.COORDINATION_COMPLETE - and old_status not in HubspotDealDiffer.COORDINATION_COMPLETE - ) + raise NotImplementedError @staticmethod def _lodgement_completed( diff --git a/etl/hubspot/tests/test_hubspot_deal_differ.py b/etl/hubspot/tests/test_hubspot_deal_differ.py index 273a82a0..94952424 100644 --- a/etl/hubspot/tests/test_hubspot_deal_differ.py +++ b/etl/hubspot/tests/test_hubspot_deal_differ.py @@ -275,15 +275,12 @@ def test_pashub_trigger__coordination_design_lodgement_not_completed_and_pashub_ # ========================== -def test_magicplan_trigger__transitions_to_coordination_complete__returns_true() -> None: +def test_magicplan_trigger__outcome_transitions_to_surveyed__returns_true() -> None: deal_id = uuid.uuid4() # Arrange - old_deal = make_old_deal(id=deal_id, coordination_status="in progress") - new_deal = make_new_deal( - deal_id, - **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, - ) + old_deal = make_old_deal(id=deal_id, outcome="assessed") + new_deal = make_new_deal(deal_id, outcome="surveyed") # Act result = HubspotDealDiffer.check_for_magicplan_trigger( @@ -295,20 +292,12 @@ def test_magicplan_trigger__transitions_to_coordination_complete__returns_true() assert result is True -def test_magicplan_trigger__already_in_coordination_complete_unrelated_change__returns_false() -> None: +def test_magicplan_trigger__outcome_already_surveyed__returns_false() -> None: deal_id = uuid.uuid4() # Arrange - old_deal = make_old_deal( - id=deal_id, - coordination_status="(v1) ioe/mtp complete", - outcome="pending", - ) - new_deal = make_new_deal( - deal_id, - **{"coordination_status__stage_1_": "(v1) ioe/mtp complete"}, - outcome="won", - ) + old_deal = make_old_deal(id=deal_id, outcome="surveyed") + new_deal = make_new_deal(deal_id, outcome="surveyed") # Act result = HubspotDealDiffer.check_for_magicplan_trigger( @@ -320,15 +309,12 @@ def test_magicplan_trigger__already_in_coordination_complete_unrelated_change__r assert result is False -def test_magicplan_trigger__transitions_to_non_complete_coordination_status__returns_false() -> None: +def test_magicplan_trigger__outcome_transitions_to_non_surveyed__returns_false() -> None: deal_id = uuid.uuid4() # Arrange - old_deal = make_old_deal(id=deal_id, coordination_status="in progress") - new_deal = make_new_deal( - deal_id, - **{"coordination_status__stage_1_": "design submitted"}, - ) + old_deal = make_old_deal(id=deal_id, outcome="assessed") + new_deal = make_new_deal(deal_id, outcome="assessed") # Act result = HubspotDealDiffer.check_for_magicplan_trigger( @@ -340,6 +326,23 @@ def test_magicplan_trigger__transitions_to_non_complete_coordination_status__ret assert result is False +def test_magicplan_trigger__outcome_surveyed_uppercase__returns_true() -> None: + deal_id = uuid.uuid4() + + # Arrange + old_deal = make_old_deal(id=deal_id, outcome="assessed") + new_deal = make_new_deal(deal_id, outcome="SURVEYED") + + # Act + result = HubspotDealDiffer.check_for_magicplan_trigger( + new_deal=new_deal, + old_deal=old_deal, + ) + + # Assert + assert result is True + + # ======================= # DB UPDATE TRIGGER TESTS # ======================= From c15ffdf2c01765b0baa3e1fb371afe8c54e462c4 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Mon, 11 May 2026 09:26:20 +0000 Subject: [PATCH 38/43] =?UTF-8?q?Trigger=20MagicPlan=20on=20outcome=20"sur?= =?UTF-8?q?veyed"=20transition=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index ba3dc27a..724a3e68 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -198,7 +198,9 @@ class HubspotDealDiffer: def check_for_magicplan_trigger( new_deal: Dict[str, str], old_deal: HubspotDealData ) -> bool: - raise NotImplementedError + new_outcome = (new_deal.get("outcome") or "").lower() + old_outcome = (old_deal.outcome or "").lower() + return new_outcome == "surveyed" and old_outcome != "surveyed" @staticmethod def _lodgement_completed( From 197e9a0e009565b72ad461f1f49ec26cb13226da Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 15:21:16 +0000 Subject: [PATCH 39/43] added histroci_epc.csv --- datatypes/epc/schema/tests/fixtures/historic_epc.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 datatypes/epc/schema/tests/fixtures/historic_epc.csv diff --git a/datatypes/epc/schema/tests/fixtures/historic_epc.csv b/datatypes/epc/schema/tests/fixtures/historic_epc.csv new file mode 100644 index 00000000..b4c10739 --- /dev/null +++ b/datatypes/epc/schema/tests/fixtures/historic_epc.csv @@ -0,0 +1,2 @@ +LMK_KEY,ADDRESS1,ADDRESS2,ADDRESS3,POSTCODE,BUILDING_REFERENCE_NUMBER,CURRENT_ENERGY_RATING,POTENTIAL_ENERGY_RATING,CURRENT_ENERGY_EFFICIENCY,POTENTIAL_ENERGY_EFFICIENCY,PROPERTY_TYPE,BUILT_FORM,INSPECTION_DATE,LOCAL_AUTHORITY,CONSTITUENCY,COUNTY,LODGEMENT_DATE,TRANSACTION_TYPE,ENVIRONMENT_IMPACT_CURRENT,ENVIRONMENT_IMPACT_POTENTIAL,ENERGY_CONSUMPTION_CURRENT,ENERGY_CONSUMPTION_POTENTIAL,CO2_EMISSIONS_CURRENT,CO2_EMISS_CURR_PER_FLOOR_AREA,CO2_EMISSIONS_POTENTIAL,LIGHTING_COST_CURRENT,LIGHTING_COST_POTENTIAL,HEATING_COST_CURRENT,HEATING_COST_POTENTIAL,HOT_WATER_COST_CURRENT,HOT_WATER_COST_POTENTIAL,TOTAL_FLOOR_AREA,ENERGY_TARIFF,MAINS_GAS_FLAG,FLOOR_LEVEL,FLAT_TOP_STOREY,FLAT_STOREY_COUNT,MAIN_HEATING_CONTROLS,MULTI_GLAZE_PROPORTION,GLAZED_TYPE,GLAZED_AREA,EXTENSION_COUNT,NUMBER_HABITABLE_ROOMS,NUMBER_HEATED_ROOMS,LOW_ENERGY_LIGHTING,NUMBER_OPEN_FIREPLACES,HOTWATER_DESCRIPTION,HOT_WATER_ENERGY_EFF,HOT_WATER_ENV_EFF,FLOOR_DESCRIPTION,FLOOR_ENERGY_EFF,FLOOR_ENV_EFF,WINDOWS_DESCRIPTION,WINDOWS_ENERGY_EFF,WINDOWS_ENV_EFF,WALLS_DESCRIPTION,WALLS_ENERGY_EFF,WALLS_ENV_EFF,SECONDHEAT_DESCRIPTION,SHEATING_ENERGY_EFF,SHEATING_ENV_EFF,ROOF_DESCRIPTION,ROOF_ENERGY_EFF,ROOF_ENV_EFF,MAINHEAT_DESCRIPTION,MAINHEAT_ENERGY_EFF,MAINHEAT_ENV_EFF,MAINHEATCONT_DESCRIPTION,MAINHEATC_ENERGY_EFF,MAINHEATC_ENV_EFF,LIGHTING_DESCRIPTION,LIGHTING_ENERGY_EFF,LIGHTING_ENV_EFF,MAIN_FUEL,WIND_TURBINE_COUNT,HEAT_LOSS_CORRIDOR,UNHEATED_CORRIDOR_LENGTH,FLOOR_HEIGHT,PHOTO_SUPPLY,SOLAR_WATER_HEATING_FLAG,MECHANICAL_VENTILATION,ADDRESS,LOCAL_AUTHORITY_LABEL,CONSTITUENCY_LABEL,POSTTOWN,CONSTRUCTION_AGE_BAND,LODGEMENT_DATETIME,TENURE,FIXED_LIGHTING_OUTLETS_COUNT,LOW_ENERGY_FIXED_LIGHT_COUNT,UPRN,UPRN_SOURCE,REPORT_TYPE +9292c3bf26a8876ce59274401ea73e3de5bd0b3e52a507c2162a46e57db8ea2f,47 GORDON ROAD,ALFORD,,AB33 8AL,10001111325,E,B,42,87,House,Semi-Detached,2021-04-11,,Unknown,,2021-04-12,ECO assessment,49,69,450,299,5.5,76,3.6,69,77,1579,715,349,118,72.0,Single,N,,,,,100.0,"double glazing, unknown install date",Normal,0.0,3.0,3.0,86.0,0.0,"Electric immersion, standard tariff",Very Poor,Poor,"Solid, no insulation (assumed)",,,Fully double glazed,Average,Average,"Granite or whinstone, as built, partial insulation (assumed)",Average,Average,,,,"Pitched, 100 mm loft insulation",Average,Average,"Room heaters, electric",Very Poor,Poor,Appliance thermostats,Good,Good,Low energy lighting in 86% of fixed outlets,Very Good,Very Good,electricity (not community),0.0,,,2.4,0.0,N,natural,"47 GORDON ROAD, ALFORD",,,ALFORD,England and Wales: 1976-1982,2021-04-12 21:45:35,Rented (private),7.0,,151020766.0,Energy Assessor,100 \ No newline at end of file From f0300eb8ff4749da93a49a259c88f448ab7dee08 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 08:57:24 +0000 Subject: [PATCH 40/43] =?UTF-8?q?Replace=20new-deal=20MagicPlan=20trigger?= =?UTF-8?q?=20to=20use=20outcome=3D=3D"surveyed"=20=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/hubspot_deal_differ.py | 16 ++--- .../hubspot/tests/test_scraper_handler.py | 65 +++++++++---------- pytest.ini | 2 +- 3 files changed, 39 insertions(+), 44 deletions(-) rename backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py => etl/hubspot/tests/test_scraper_handler.py (61%) diff --git a/etl/hubspot/hubspot_deal_differ.py b/etl/hubspot/hubspot_deal_differ.py index 724a3e68..da0072c1 100644 --- a/etl/hubspot/hubspot_deal_differ.py +++ b/etl/hubspot/hubspot_deal_differ.py @@ -162,6 +162,14 @@ class HubspotDealDiffer: return False + @staticmethod + def check_for_magicplan_trigger( + new_deal: Dict[str, str], old_deal: HubspotDealData + ) -> bool: + new_outcome = (new_deal.get("outcome") or "").lower() + old_outcome = (old_deal.outcome or "").lower() + return new_outcome == "surveyed" and old_outcome != "surveyed" + @staticmethod def _has_valid_pashub_link(new_pashub_link: str) -> bool: return bool(new_pashub_link) @@ -194,14 +202,6 @@ class HubspotDealDiffer: and new_status != old_deal.design_status ) - @staticmethod - def check_for_magicplan_trigger( - new_deal: Dict[str, str], old_deal: HubspotDealData - ) -> bool: - new_outcome = (new_deal.get("outcome") or "").lower() - old_outcome = (old_deal.outcome or "").lower() - return new_outcome == "surveyed" and old_outcome != "surveyed" - @staticmethod def _lodgement_completed( new_deal: Dict[str, str], old_deal: HubspotDealData diff --git a/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py b/etl/hubspot/tests/test_scraper_handler.py similarity index 61% rename from backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py rename to etl/hubspot/tests/test_scraper_handler.py index 6d18c4b4..e2f80d07 100644 --- a/backend/hubspot_trigger_orchestrator/tests/test_orchestrator.py +++ b/etl/hubspot/tests/test_scraper_handler.py @@ -3,37 +3,28 @@ import uuid from typing import Any, Dict, Optional from unittest.mock import MagicMock, patch -import pytest - from backend.app.db.models.hubspot_deal_data import HubspotDealData from etl.hubspot.scripts.scraper.main import handler -COORDINATION_COMPLETE = "(v1) ioe/mtp complete" DEAL_NAME = "123 Main Street" UPRN = "12345678" DEAL_ID = "999" MAGICPLAN_QUEUE_URL = "https://sqs.eu-west-2.amazonaws.com/123/magic-plan-dev" -def make_hubspot_deal( - coordination_status: Optional[str] = None, **kwargs: Any -) -> Dict[str, Any]: - deal: Dict[str, Any] = { +def make_hubspot_deal(**kwargs: Any) -> Dict[str, Any]: + return { "hs_object_id": DEAL_ID, "dealname": DEAL_NAME, "pashub_link": None, **kwargs, } - if coordination_status is not None: - deal["coordination_status__stage_1_"] = coordination_status - return deal -def make_db_deal(coordination_status: Optional[str] = None, **kwargs: Any) -> HubspotDealData: +def make_db_deal(**kwargs: Any) -> HubspotDealData: return HubspotDealData( id=uuid.uuid4(), deal_id=DEAL_ID, - coordination_status=coordination_status, **kwargs, ) @@ -68,14 +59,14 @@ def run_handler( return mock_sqs -# ======================= -# NEW DEAL PATH -# ======================= +# ==================================== +# NEW DEAL PATH - MagicPlan trigger +# ==================================== -def test_new_deal_in_coordination_complete__sends_sqs_message() -> None: +def test_new_deal__outcome_is_surveyed__triggers_magicplan() -> None: # Arrange - hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + hubspot_deal = make_hubspot_deal(outcome="surveyed") listing = {"national_uprn": UPRN} # Act @@ -84,13 +75,15 @@ def test_new_deal_in_coordination_complete__sends_sqs_message() -> None: # Assert mock_sqs.send_message.assert_called_once_with( QueueUrl=MAGICPLAN_QUEUE_URL, - MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + MessageBody=json.dumps( + {"address": DEAL_NAME, "hubspot_deal_id": DEAL_ID, "uprn": UPRN} + ), ) -def test_new_deal_not_in_coordination_complete__no_sqs_message() -> None: +def test_new_deal__outcome_is_not_surveyed__does_not_trigger_magicplan() -> None: # Arrange - hubspot_deal = make_hubspot_deal(coordination_status="in progress") + hubspot_deal = make_hubspot_deal(outcome="assessed") # Act mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) @@ -99,9 +92,9 @@ def test_new_deal_not_in_coordination_complete__no_sqs_message() -> None: mock_sqs.send_message.assert_not_called() -def test_new_deal_with_no_listing__uprn_is_none_in_message() -> None: +def test_new_deal__outcome_is_surveyed__no_listing__magicplan_message_uprn_is_null() -> None: # Arrange - hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + hubspot_deal = make_hubspot_deal(outcome="surveyed") # Act mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) @@ -109,19 +102,21 @@ def test_new_deal_with_no_listing__uprn_is_none_in_message() -> None: # Assert mock_sqs.send_message.assert_called_once_with( QueueUrl=MAGICPLAN_QUEUE_URL, - MessageBody=json.dumps({"address": DEAL_NAME, "uprn": None}), + MessageBody=json.dumps( + {"address": DEAL_NAME, "hubspot_deal_id": DEAL_ID, "uprn": None} + ), ) -# ======================= -# EXISTING DEAL PATH -# ======================= +# ========================================== +# EXISTING DEAL PATH - MagicPlan trigger +# ========================================== -def test_existing_deal_transitions_to_coordination_complete__sends_sqs_message() -> None: +def test_existing_deal__outcome_transitions_to_surveyed__triggers_magicplan() -> None: # Arrange - db_deal = make_db_deal(coordination_status="in progress") - hubspot_deal = make_hubspot_deal(coordination_status=COORDINATION_COMPLETE) + db_deal = make_db_deal(outcome="assessed") + hubspot_deal = make_hubspot_deal(outcome="surveyed") listing = {"national_uprn": UPRN} # Act @@ -130,16 +125,16 @@ def test_existing_deal_transitions_to_coordination_complete__sends_sqs_message() # Assert mock_sqs.send_message.assert_called_once_with( QueueUrl=MAGICPLAN_QUEUE_URL, - MessageBody=json.dumps({"address": DEAL_NAME, "uprn": UPRN}), + MessageBody=json.dumps( + {"address": DEAL_NAME, "hubspot_deal_id": DEAL_ID, "uprn": UPRN} + ), ) -def test_existing_deal_already_in_coordination_complete_unrelated_change__no_sqs_message() -> None: +def test_existing_deal__outcome_already_surveyed__unrelated_change__does_not_trigger_magicplan() -> None: # Arrange - db_deal = make_db_deal(coordination_status=COORDINATION_COMPLETE, dealname="Old Name") - hubspot_deal = make_hubspot_deal( - coordination_status=COORDINATION_COMPLETE, dealname="New Name" - ) + db_deal = make_db_deal(outcome="surveyed", dealname="Old Name") + hubspot_deal = make_hubspot_deal(outcome="surveyed", dealname="New Name") # Act mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) diff --git a/pytest.ini b/pytest.ini index 398c5b71..e2a4a25d 100644 --- a/pytest.ini +++ b/pytest.ini @@ -3,6 +3,6 @@ pythonpath = . log_cli = true log_cli_level = INFO addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/ecmk_fetcher/tests/ backend/pashub_fetcher/tests backend/documents_parser/tests backend/magic_plan/tests datatypes/magicplan/api/tests datatypes/magicplan/domain/tests backend/app/db/functions/tests markers = integration: mark a test as an integration test From 9386846044e087b38a15bbfae3dd306f376cd205 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 08:58:15 +0000 Subject: [PATCH 41/43] =?UTF-8?q?Replace=20new-deal=20MagicPlan=20trigger?= =?UTF-8?q?=20to=20use=20outcome=3D=3D"surveyed"=20=F0=9F=9F=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/hubspot/scripts/scraper/main.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/etl/hubspot/scripts/scraper/main.py b/etl/hubspot/scripts/scraper/main.py index 32007cd4..86844352 100644 --- a/etl/hubspot/scripts/scraper/main.py +++ b/etl/hubspot/scripts/scraper/main.py @@ -57,8 +57,7 @@ def handler(body: dict[str, Any], context: Any) -> None: ) _trigger_pashub_fetcher(sqs_client, hubspot_deal_id, hubspot_deal) - coordination_status = (hubspot_deal.get("coordination_status__stage_1_") or "").lower() - if coordination_status in HubspotDealDiffer.COORDINATION_COMPLETE: + if (hubspot_deal.get("outcome") or "").lower() == "surveyed": logger.info( f"Triggering MagicPlan fetcher for HubSpot deal ID {hubspot_deal_id}" ) From 9501146ec815b2f2998017acb646571c922ba277 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 08:59:32 +0000 Subject: [PATCH 42/43] =?UTF-8?q?Replace=20new-deal=20MagicPlan=20trigger?= =?UTF-8?q?=20to=20use=20outcome=3D=3D"surveyed"=20=F0=9F=9F=AA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/hubspot_trigger_orchestrator/__init__.py | 0 backend/hubspot_trigger_orchestrator/tests/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 backend/hubspot_trigger_orchestrator/__init__.py delete mode 100644 backend/hubspot_trigger_orchestrator/tests/__init__.py diff --git a/backend/hubspot_trigger_orchestrator/__init__.py b/backend/hubspot_trigger_orchestrator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/backend/hubspot_trigger_orchestrator/tests/__init__.py b/backend/hubspot_trigger_orchestrator/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 From 71aadfe78d1237cca0721707d97b7fc01e6bb6c3 Mon Sep 17 00:00:00 2001 From: Daniel Roth Date: Tue, 12 May 2026 09:31:47 +0000 Subject: [PATCH 43/43] add pashub functions to orchestrator tests, and rename existing magicplan ones --- etl/hubspot/tests/test_scraper_handler.py | 96 +++++++++++++++++++++-- 1 file changed, 90 insertions(+), 6 deletions(-) diff --git a/etl/hubspot/tests/test_scraper_handler.py b/etl/hubspot/tests/test_scraper_handler.py index e2f80d07..4810d171 100644 --- a/etl/hubspot/tests/test_scraper_handler.py +++ b/etl/hubspot/tests/test_scraper_handler.py @@ -9,7 +9,9 @@ from etl.hubspot.scripts.scraper.main import handler DEAL_NAME = "123 Main Street" UPRN = "12345678" DEAL_ID = "999" +PASHUB_LINK = "https://pashub.example.com/deal/999" MAGICPLAN_QUEUE_URL = "https://sqs.eu-west-2.amazonaws.com/123/magic-plan-dev" +PASHUB_QUEUE_URL = "https://sqs.test/pashub" def make_hubspot_deal(**kwargs: Any) -> Dict[str, Any]: @@ -52,7 +54,7 @@ def run_handler( ) mock_boto3.client.return_value = mock_sqs mock_settings.return_value.MAGICPLAN_SQS_URL = MAGICPLAN_QUEUE_URL - mock_settings.return_value.PASHUB_TO_ARA_SQS_URL = "https://sqs.test/pashub" + mock_settings.return_value.PASHUB_TO_ARA_SQS_URL = PASHUB_QUEUE_URL handler.__wrapped__({"hubspot_deal_id": DEAL_ID}, "") @@ -64,7 +66,7 @@ def run_handler( # ==================================== -def test_new_deal__outcome_is_surveyed__triggers_magicplan() -> None: +def test_new_deal_surveyed__sends_magicplan_sqs() -> None: # Arrange hubspot_deal = make_hubspot_deal(outcome="surveyed") listing = {"national_uprn": UPRN} @@ -81,7 +83,7 @@ def test_new_deal__outcome_is_surveyed__triggers_magicplan() -> None: ) -def test_new_deal__outcome_is_not_surveyed__does_not_trigger_magicplan() -> None: +def test_new_deal_not_surveyed__no_magicplan_sqs() -> None: # Arrange hubspot_deal = make_hubspot_deal(outcome="assessed") @@ -92,7 +94,7 @@ def test_new_deal__outcome_is_not_surveyed__does_not_trigger_magicplan() -> None mock_sqs.send_message.assert_not_called() -def test_new_deal__outcome_is_surveyed__no_listing__magicplan_message_uprn_is_null() -> None: +def test_new_deal_surveyed_no_listing__magicplan_sqs_uprn_is_null() -> None: # Arrange hubspot_deal = make_hubspot_deal(outcome="surveyed") @@ -113,7 +115,7 @@ def test_new_deal__outcome_is_surveyed__no_listing__magicplan_message_uprn_is_nu # ========================================== -def test_existing_deal__outcome_transitions_to_surveyed__triggers_magicplan() -> None: +def test_existing_deal_surveyed_transition__sends_magicplan_sqs() -> None: # Arrange db_deal = make_db_deal(outcome="assessed") hubspot_deal = make_hubspot_deal(outcome="surveyed") @@ -131,7 +133,7 @@ def test_existing_deal__outcome_transitions_to_surveyed__triggers_magicplan() -> ) -def test_existing_deal__outcome_already_surveyed__unrelated_change__does_not_trigger_magicplan() -> None: +def test_existing_deal_already_surveyed__no_magicplan_sqs() -> None: # Arrange db_deal = make_db_deal(outcome="surveyed", dealname="Old Name") hubspot_deal = make_hubspot_deal(outcome="surveyed", dealname="New Name") @@ -141,3 +143,85 @@ def test_existing_deal__outcome_already_surveyed__unrelated_change__does_not_tri # Assert mock_sqs.send_message.assert_not_called() + + +# ==================================== +# NEW DEAL PATH - PasHub trigger +# ==================================== + + +def test_new_deal_with_pashub_link__sends_pashub_sqs() -> None: + # Arrange + hubspot_deal = make_hubspot_deal(pashub_link=PASHUB_LINK) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=PASHUB_QUEUE_URL, + MessageBody=json.dumps( + { + "pashub_link": PASHUB_LINK, + "address": None, + "hubspot_deal_id": DEAL_ID, + "sharepoint_link": None, + "uprn": None, + "landlord_property_id": None, + "deal_stage": None, + } + ), + ) + + +def test_new_deal_no_pashub_link__no_pashub_sqs() -> None: + # Arrange + hubspot_deal = make_hubspot_deal() + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=None, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called() + + +# ========================================== +# EXISTING DEAL PATH - PasHub trigger +# ========================================== + + +def test_existing_deal_pashub_link_added__sends_pashub_sqs() -> None: + # Arrange + db_deal = make_db_deal(pashub_link=None) + hubspot_deal = make_hubspot_deal(pashub_link=PASHUB_LINK) + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) + + # Assert + mock_sqs.send_message.assert_called_once_with( + QueueUrl=PASHUB_QUEUE_URL, + MessageBody=json.dumps( + { + "pashub_link": PASHUB_LINK, + "address": None, + "hubspot_deal_id": DEAL_ID, + "sharepoint_link": None, + "uprn": None, + "landlord_property_id": None, + "deal_stage": None, + } + ), + ) + + +def test_existing_deal_pashub_link_unchanged__no_pashub_sqs() -> None: + # Arrange + db_deal = make_db_deal(pashub_link=PASHUB_LINK, dealname="Old Name") + hubspot_deal = make_hubspot_deal(pashub_link=PASHUB_LINK, dealname="New Name") + + # Act + mock_sqs = run_handler(hubspot_deal=hubspot_deal, db_deal=db_deal, listing=None) + + # Assert + mock_sqs.send_message.assert_not_called()