From 24ec68bb9f1d5f6e5ca8eb748b87cf64145ac7df Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 6 May 2026 07:55:37 +0000 Subject: [PATCH 01/13] save progress for historical epc procress --- backend/etl/etl_opendatacommunities/README.md | 14 ++ backend/etl/etl_opendatacommunities/main.py | 144 ++++++++++++++++++ 2 files changed, 158 insertions(+) create mode 100644 backend/etl/etl_opendatacommunities/README.md create mode 100644 backend/etl/etl_opendatacommunities/main.py diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md new file mode 100644 index 00000000..bf16ba89 --- /dev/null +++ b/backend/etl/etl_opendatacommunities/README.md @@ -0,0 +1,14 @@ +This website https://epc.opendatacommunities.org/ has closed down on 30th May 2026 + +So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/epc_opendatacommunities/master_backup/ ) + +This scripts assumes the following: + +1) You downloaded the master copy, uncompressed it and set it to a path so we can read the csv + + +The script funciton is: + +1) reads csv for all data, seperate each iteration by postcode +2) compresses the csv and save it in the location +2) only gets the postcode data, compresses and uploads to s3 -> location s3://retrofit-data-dev/epc_opendatacommunities//compressed data \ No newline at end of file diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py new file mode 100644 index 00000000..30b4045a --- /dev/null +++ b/backend/etl/etl_opendatacommunities/main.py @@ -0,0 +1,144 @@ +from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait +from io import BytesIO +from pathlib import Path +from typing import Any + +import boto3 +import pandas as pd +from botocore.config import Config +from tqdm import tqdm + +from utils.logger import setup_logger + +logger = setup_logger() + +SRC_ROOT = Path("/workspaces/home/epc_data") +TMP_ROOT = Path("/tmp/epc_postcodes") +S3_BUCKET = "retrofit-data-dev" +S3_PREFIX = "epc_opendatacommunities" + +REC_COLS = { + "IMPROVEMENT_ITEM", + "IMPROVEMENT_SUMMARY_TEXT", + "IMPROVEMENT_DESCR_TEXT", + "IMPROVEMENT_ID", + "IMPROVEMENT_ID_TEXT", + "INDICATIVE_COST", +} + +# This scripts assume you downloading the zip, unzip it, and running it locally + + +def sanitise(pc: pd.Series) -> pd.Series: + return pc.astype("string").str.upper().str.replace(" ", "", regex=False) + + +def shard_la(la_dir: Path) -> None: + certs = pd.read_csv(la_dir / "certificates.csv", low_memory=False) + recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) + merged = certs.merge(recs, on="LMK_KEY", how="left") + + merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) + before = len(merged) + merged = merged.dropna(subset=["POSTCODE_CLEAN"]) + merged = merged[merged["POSTCODE_CLEAN"] != ""] + dropped = before - len(merged) + if dropped: + logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") + + for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + out = TMP_ROOT / f"{pc}.csv" + group.drop(columns=["POSTCODE_CLEAN"]).to_csv( + out, mode="a", header=not out.exists(), index=False + ) + + +def list_existing_keys(s3: Any) -> set[str]: + existing: set[str] = set() + paginator = s3.get_paginator("list_objects_v2") + pages = paginator.paginate(Bucket=S3_BUCKET, Prefix=f"{S3_PREFIX}/") + for page in tqdm(pages, desc="list s3"): + for obj in page.get("Contents", []): + existing.add(obj["Key"]) + logger.info(f"Found {len(existing)} existing objects under {S3_PREFIX}/") + return existing + + +def upload_postcode(path: Path, s3: Any) -> None: + df = pd.read_csv(path, low_memory=False).drop_duplicates() + + cert_cols = [c for c in df.columns if c not in REC_COLS] + cert_only = df[cert_cols].drop_duplicates() + dupes = cert_only["LMK_KEY"].value_counts() + bad = dupes[dupes > 1] + if not bad.empty: + raise ValueError( + f"Postcode {path.stem}: LMK_KEY appears with conflicting cert data: " + f"{bad.index.tolist()[:5]}" + ) + + buf = BytesIO() + df.to_csv(buf, index=False, compression="gzip") + s3.put_object( + Bucket=S3_BUCKET, + Key=f"{S3_PREFIX}/{path.stem}/data.csv.gz", + Body=buf.getvalue(), + ContentType="text/csv", + ContentEncoding="gzip", + ) + + +def main(): + TMP_ROOT.mkdir(parents=True, exist_ok=True) + la_dirs = sorted( + p for p in SRC_ROOT.iterdir() if p.is_dir() and p.name.startswith("domestic-") + ) + logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") + + # for la in tqdm(la_dirs, desc="shard"): + # shard_la(la) + + s3 = boto3.client( + "s3", + config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + ) + pc_files = sorted(TMP_ROOT.glob("*.csv")) + logger.info(f"Found {len(pc_files)} local shards") + + existing = list_existing_keys(s3) + todo = [p for p in pc_files if f"{S3_PREFIX}/{p.stem}/data.csv.gz" not in existing] + skipped = len(pc_files) - len(todo) + logger.info( + f"Uploading {len(todo)} shards (skipping {skipped} already in S3) -> " + f"s3://{S3_BUCKET}/{S3_PREFIX}/" + ) + + workers = 256 + todo_iter = iter(todo) + inflight: dict[Any, Path] = {} + pbar = tqdm(total=len(todo), desc="upload") + with ThreadPoolExecutor(max_workers=workers) as pool: + for _ in range(workers * 2): + pc = next(todo_iter, None) + if pc is None: + break + inflight[pool.submit(upload_postcode, pc, s3)] = pc + + while inflight: + done, _ = wait(inflight.keys(), return_when=FIRST_COMPLETED) + for fut in done: + pc = inflight.pop(fut) + try: + fut.result() + except Exception as e: + logger.error(f"{pc.name}: {e}") + raise + pbar.update(1) + nxt = next(todo_iter, None) + if nxt is not None: + inflight[pool.submit(upload_postcode, nxt, s3)] = nxt + pbar.close() + + +if __name__ == "__main__": + main() From 4f45eeb3e9bf854b5f246916d04b99d1eb45020b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 15:55:40 +0000 Subject: [PATCH 02/13] save --- asset_list/AssetList.py | 323 ++++++++++++++++++------------------ asset_list/app.py | 30 +++- backend/app/config.py | 1 + backend/app/local/router.py | 17 +- backend/app/main.py | 16 +- recommendations/Costs.py | 229 +++++++++++++++---------- 6 files changed, 345 insertions(+), 271 deletions(-) diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index dede3162..573c4f7c 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -31,17 +31,19 @@ from recommendations.recommendation_utils import ( from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes -from dotenv import load_dotenv +# from dotenv import load_dotenv logger = setup_logger() -load_dotenv(dotenv_path="../backend/.env") +# load_dotenv(dotenv_path="../backend/.env") # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") class DataRemapper: - def __init__(self, standard_values, standard_map=None, max_tokens=1000): + def __init__( + self, standard_values, standard_map=None, max_tokens=1000, api_key=None + ): """ Initialize the remapper with standard values and a predefined mapping. @@ -75,7 +77,8 @@ class DataRemapper: "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, } - self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + print(f"DATA REMAPPER api key is {api_key}") + self.openai_client = OpenAI(api_key=api_key) @staticmethod def clean_string(text): @@ -136,12 +139,20 @@ class DataRemapper: raise ValueError("Input tokens exceed the maximum limit.") logger.info("Calling OpenAI API for standardization...") - response = self.openai_client.chat.completions.create( - model=self.ai_model, - messages=[{"role": "user", "content": prompt}], - max_tokens=self.max_tokens, - temperature=0.1, - ) + + try: + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + except Exception as e: + print(f"[debug] OpenAI call failed. type={type(e).__name__}") + print(f"[debug] status={getattr(e, 'status_code', None)}") + print(f"[debug] body={getattr(e, 'response', None) and e.response.text}") + print(f"[debug] model={self.ai_model}") + raise output_text = response.choices[0].message.content.strip() output_tokens = self.count_tokens(output_text) # Count output tokens @@ -504,6 +515,7 @@ class AssetList: landlord_block_reference=None, phase=False, header=0, + openai_api_key=None, ): self.local_filepath = local_filepath self.sheet_name = sheet_name @@ -529,6 +541,7 @@ class AssetList: self.ecosurv = None self.ecosurv_no_match = pd.DataFrame() self.geographical_areas = pd.DataFrame() + self.openai_api_key = openai_api_key # When this is True, we intend to break the programme into multiple phases. We may need to review # how this is structured in the future, as depending on how we get future data, we may need to @@ -1107,6 +1120,7 @@ class AssetList: remapper = DataRemapper( standard_values=config["standard_values"], standard_map=config["standard_map"], + api_key=self.openai_api_key, ) remap_dictionary = remapper.standardize_list( values_to_remap=values_to_remap.tolist() @@ -1296,8 +1310,8 @@ class AssetList: self.standardised_asset_list[ self.ATTRIBUTE_HAS_SOLAR ] = self.standardised_asset_list[ - self.FIND_EPC_DATA_NAMES["Solar photovoltaics"] - ] | ~self.standardised_asset_list[ + self.FIND_EPC_DATA_NAMES["Solar photovoltaics"] + ] | ~self.standardised_asset_list[ self.EPC_API_DATA_NAMES["photo-supply"] ].isin( ["0.0", 0, None, "", np.nan] @@ -1315,7 +1329,7 @@ class AssetList: property_type=( str(x[self.STANDARD_PROPERTY_TYPE]).title() if str(x[self.STANDARD_PROPERTY_TYPE]).title() - in accepted_epc_property_types + in accepted_epc_property_types else ( x[self.EPC_API_DATA_NAMES["property-type"]] if not pd.isnull( @@ -1373,9 +1387,9 @@ class AssetList: self.standardised_asset_list.apply( lambda x: estimate_perimeter( floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] - / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] - / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], ), axis=1, ) @@ -1460,7 +1474,7 @@ class AssetList: year_lower_bound = ( 2007 if x[self.EPC_API_DATA_NAMES["construction-age-band"]] - == "England and Wales: 2007 onwards" + == "England and Wales: 2007 onwards" else 2012 ) @@ -1515,7 +1529,7 @@ class AssetList: age_band_matches = ( "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] - == int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) + == int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) else "EPC Age Band is different from Year Built" ) @@ -1545,7 +1559,7 @@ class AssetList: age_band_matches = ( "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) - and (x[self.STANDARD_YEAR_BUILT] <= float(upper_date)) + and (x[self.STANDARD_YEAR_BUILT] <= float(upper_date)) else ( "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) @@ -1717,22 +1731,22 @@ class AssetList: if self.non_intrusives_present: if self.new_format_non_insturives_present_v2: non_intrusives_wall_filter = ( - self.standardised_asset_list["non-intrusives: Construction"] - == "CAVITY" - ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( + self.standardised_asset_list["non-intrusives: Construction"] + == "CAVITY" + ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EMPTY", "PARTIAL", "EMPTY CAVITY"] ) else: non_intrusives_wall_filter = ( - self.standardised_asset_list["non-intrusives: Construction"] - == "CAVITY" - ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( + self.standardised_asset_list["non-intrusives: Construction"] + == "CAVITY" + ) & self.standardised_asset_list["non-intrusives: Insulated"].isin( ["EMPTY", "PARTIAL"] ) elif self.old_format_non_intrusives_present: non_intrusives_wall_filter = self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().isin( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().isin( [ "empty cavity", "partial fill", @@ -1742,18 +1756,18 @@ class AssetList: "empty cav", ] ) | ( - ( - self.standardised_asset_list["non-intrusives: WFT Findings"] - .str.lower() - .str.strip() - .str.contains("empty cavity|partial fill") - & ~self.standardised_asset_list["non-intrusives: WFT Findings"] - .astype(str) - .str.lower() - .str.strip() - .str.contains("major access issues") - ) - ) + ( + self.standardised_asset_list["non-intrusives: WFT Findings"] + .str.lower() + .str.strip() + .str.contains("empty cavity|partial fill") + & ~self.standardised_asset_list["non-intrusives: WFT Findings"] + .astype(str) + .str.lower() + .str.strip() + .str.contains("major access issues") + ) + ) else: # We set the filter to False, as we have no non-intrusives non_intrusives_wall_filter = False @@ -1765,12 +1779,12 @@ class AssetList: ) else: year_built_filter = ( - self.standardised_asset_list[self.STANDARD_YEAR_BUILT] - <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) | ( - self.standardised_asset_list["epc_year_upper_bound"] - <= self.EMPTY_CAVITY_YEAR_THRESHOLD - ) + self.standardised_asset_list[self.STANDARD_YEAR_BUILT] + <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) | ( + self.standardised_asset_list["epc_year_upper_bound"] + <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) # Criteria: # The property isn't a bedsit @@ -1811,8 +1825,8 @@ class AssetList: ] = ( ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity_has_solar" - ] + "non_intrusive_indicates_empty_cavity_has_solar" + ] & ( ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin( ["bedsit"] @@ -1888,8 +1902,8 @@ class AssetList: .str.lower() .isin(self.EPC_NO_WALL_INSULATION_DESCRIPTIONS) | self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( - ["uninsulated cavity"] - ) + ["uninsulated cavity"] + ) ) ###################################################### @@ -1926,8 +1940,8 @@ class AssetList: extraction_wall_filter = ( extraction_wall_filter & ~self.standardised_asset_list[ - "non-intrusives: Eligibility (Red/Yellow/Green)" - ].isin(["RED"]) + "non-intrusives: Eligibility (Red/Yellow/Green)" + ].isin(["RED"]) ) self.standardised_asset_list[ @@ -2023,26 +2037,26 @@ class AssetList: self.standardised_asset_list[ "solar_epc_data_indicates_correct_heating_system" ] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .str.contains( - "air source heat pump|ground source heat pump|boiler and radiators, electric" - ) - ) | ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .str.contains("electric storage heaters") - & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheatcont-description"] - ] - == "Controls for high heat retention storage heaters" - ) + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains( + "air source heat pump|ground source heat pump|boiler and radiators, electric" ) + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains("electric storage heaters") + & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheatcont-description"] + ] + == "Controls for high heat retention storage heaters" + ) + ) # If the landlord has given us the heating system, we default to that on heating upgrades. Because of the # poor heating in place, if the EPC indicates that this property had a low efficiency heating system but the @@ -2050,25 +2064,25 @@ class AssetList: self.standardised_asset_list[ "solar_epc_data_indicates_requires_heating_upgrade" ] = ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .str.contains("electric storage heaters|room heaters") + & ( self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] + self.EPC_API_DATA_NAMES["mainheatcont-description"] ] - .str.lower() - .str.contains("electric storage heaters|room heaters") - & ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheatcont-description"] - ] - != "Controls for high heat retention storage heaters" - ) - ) & ( - ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( - ["district heating", "communal heating", "communal gas boiler"] - ) - & ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] - .astype(str) - .str.contains("gas ") + != "Controls for high heat retention storage heaters" ) + ) & ( + ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["district heating", "communal heating", "communal gas boiler"] + ) + & ~self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] + .astype(str) + .str.contains("gas ") + ) # Basic check - both of the previous two shouldn't be true simultaneously if ( @@ -2148,8 +2162,8 @@ class AssetList: self.standardised_asset_list[ "solar_non_intrusives_walls_insulated" ] = self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().isin( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().isin( [ "retro drilled", "retro filled", @@ -2158,8 +2172,8 @@ class AssetList: "retro drilled and filled", ] ) | self.standardised_asset_list[ - "non-intrusives: WFT Findings" - ].str.lower().str.strip().str.contains( + "non-intrusives: WFT Findings" + ].str.lower().str.strip().str.contains( "retro drilled" ) else: @@ -2176,19 +2190,14 @@ class AssetList: ) self.standardised_asset_list["solar_epc_walls_insulated"] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES[ - "walls-description"]] - .str.lower() - .str.contains("|".join( - self.EPC_INSULATED_WALLS_SUBSTRINGS)) - ) | ( - self.standardised_asset_list[ - "walls_u_value"].apply( - lambda x: x <= 0.7 if not pd.isnull( - x) else False - ) - ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]] + .str.lower() + .str.contains("|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)) + ) | ( + self.standardised_asset_list["walls_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) roof_data = [] for desc in self.standardised_asset_list[ @@ -2230,20 +2239,20 @@ class AssetList: self.standardised_asset_list[ "solar_epc_loft_needs_topup" ] = self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS - ].apply( + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS + ].apply( lambda x: int(x) < 200 if str(x).isdigit() else False ) | ( - ( - self.standardised_asset_list["is_loft"] - | self.standardised_asset_list["is_pitched"] - ) - & ( - self.standardised_asset_list[ - self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS - ].isin(["below average", "none"]) - ) + ( + self.standardised_asset_list["is_loft"] + | self.standardised_asset_list["is_pitched"] ) + & ( + self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS + ].isin(["below average", "none"]) + ) + ) self.standardised_asset_list["epc_has_floor_recommendation"] = ( self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) @@ -2252,16 +2261,15 @@ class AssetList: # Check if the boiler is electric # We check if it contains both the terms boiler & electric self.standardised_asset_list["has_electric_boiler"] = ( - self.standardised_asset_list[ - self.EPC_API_DATA_NAMES["mainheat-description"] - ] - .str.lower() - .isin(["boiler and radiators, electric"]) - ) | ( - self.standardised_asset_list[ - self.STANDARD_HEATING_SYSTEM] - == "electric boiler" - ) + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"] + ] + .str.lower() + .isin(["boiler and radiators, electric"]) + ) | ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] + == "electric boiler" + ) #################################### # Check solar eligibility @@ -2399,11 +2407,11 @@ class AssetList: empty_cavity_map = { "non_intrusive_indicates_empty_cavity": self.EMPTY_CAVITY_NON_INTRUSIVE - + ": ", + + ": ", "non_intrusive_indicates_empty_cavity_has_solar": f"{self.EMPTY_CAVITY_NON_INTRUSIVE} - property " - "already has solar: ", + "already has solar: ", "non_intrusive_indicates_empty_cavity_no_year_filter": f"{self.EMPTY_CAVITY_NON_INTRUSIVE}, " - f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", + f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", } for variable, description in empty_cavity_map.items(): self.standardised_asset_list["cavity_reason"] = np.where( @@ -2419,8 +2427,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: WFT Findings"] .str.lower() @@ -2445,8 +2453,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & self.standardised_asset_list[ "non_intrusive_indicates_cavity_extraction" ] @@ -2461,8 +2469,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: Insulated"] == "RETRO DRILLED" @@ -2478,8 +2486,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & ( self.standardised_asset_list["non-intrusives: Insulated"] == "FILLED AT BUILD" @@ -2495,8 +2503,8 @@ class AssetList: ( self.standardised_asset_list["epc_indicates_empty_cavity"] & ~self.standardised_asset_list[ - "non_intrusive_indicates_empty_cavity" - ] + "non_intrusive_indicates_empty_cavity" + ] & pd.isnull(self.standardised_asset_list["cavity_reason"]) ), f"{self.EPC_EMPTY}: " + self.standardised_asset_list["SAP Category"], @@ -2640,7 +2648,7 @@ class AssetList: identified_work = self.standardised_asset_list[ ~pd.isnull(self.standardised_asset_list["cavity_reason"]) | ~pd.isnull(self.standardised_asset_list["solar_reason"]) - ][self.DOMNA_PROPERTY_ID].values + ][self.DOMNA_PROPERTY_ID].values if self.DOMNA_PROPERTY_ID in self.outcomes.columns: self.outcomes_for_output = self.outcomes[ @@ -2675,12 +2683,12 @@ class AssetList: blocks_of_flats = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ] + ] non_blocks_of_flats = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] + ] # Produce some aggregate figures self.work_type_figures = { @@ -2723,7 +2731,7 @@ class AssetList: blocks = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" - ].copy() + ].copy() if blocks.empty: return @@ -2860,7 +2868,7 @@ class AssetList: self.standardised_asset_list = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" - ] + ] self.standardised_asset_list = pd.concat( [self.standardised_asset_list, expanded_blocks], ignore_index=True @@ -2940,7 +2948,7 @@ class AssetList: # find any block refs with more than 50% emptires viable_empty_blocks = self.block_analysis_df[ self.block_analysis_df["Percentage of Empties"] >= 0.50 - ] + ] if not viable_empty_blocks.empty: project_code_lookup = viable_empty_blocks[["Block Reference"]].copy() @@ -3179,7 +3187,7 @@ class AssetList: contact_details = pd.read_excel(local_filepath, sheet_name=sheet_name)[ [self.contact_detail_fields["landlord_property_id"]] + details_colnames - ] + ] contact_details = contact_details[ ~pd.isnull( contact_details[self.contact_detail_fields["landlord_property_id"]] @@ -3572,13 +3580,10 @@ class AssetList: "Non-Intrusives: Date Checked ": date_of_inspections, "Non-Intrusives: Wall Type ": non_intrusives_construction, "Non-intrusives: Insulation ": non_intrusives_insulated, - "Non-intrusives: Insulation Material ": - non_intrusives_insulation_material, - "Non-Intrusives: CIGA Check Required ": - non_intrusives_ciga_check_required, + "Non-intrusives: Insulation Material ": non_intrusives_insulation_material, + "Non-Intrusives: CIGA Check Required ": non_intrusives_ciga_check_required, "Non-Intrusives: PV Access Issues ": non_intrusives_pv_access, - "Non-Intrusives: Roof Orientation ": - non_intrusives_roof_orientation, + "Non-Intrusives: Roof Orientation ": non_intrusives_roof_orientation, "Non-Intrusives: Surveyor Notes ": non_intrusives_surveyor_notes, "Non-Intrusives: Surveyor Name ": non_intrusives_surveyor_name, "CIGA: Date Requested ": None, # TODO: Don't have this for the moment @@ -3755,8 +3760,8 @@ class AssetList: # We compare address line 1 to full address if any( df[self.STANDARD_FULL_ADDRESS] - .str.lower() - .str.contains(row["Address Line 1"].lower(), na=False) + .str.lower() + .str.contains(row["Address Line 1"].lower(), na=False) ): df = df[ df[self.STANDARD_FULL_ADDRESS] @@ -3996,7 +4001,7 @@ class AssetList: matched = matched[ matched["houseno"].astype(str) == house_no_to_match - ] + ] if matched.shape[0] == 1: lookup_i.append( { @@ -4021,7 +4026,7 @@ class AssetList: )[0] matched = matched[ matched[self.STANDARD_FULL_ADDRESS] == best_match - ] + ] lookup_i.append( { "row_id": x["row_id"], @@ -4332,7 +4337,7 @@ class AssetList: df = self.standardised_asset_list[ self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == row[master_id_colnames[idx]] - ] + ] if df.shape[0] == 1: matched.append( { @@ -4438,7 +4443,7 @@ class AssetList: )[1] ) > 90 - ] + ] if df.shape[0] == 0: unmatched.append(row["row_id"]) @@ -4446,8 +4451,8 @@ class AssetList: if any( df[self.STANDARD_FULL_ADDRESS] - .str.lower() - .str.contains( + .str.lower() + .str.contains( " ".join( [row[house_no_col], row["Street / Block Name"]] ).lower() @@ -4474,7 +4479,7 @@ class AssetList: row[property_type_col].split(" ")[-1].lower() ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats") - ] + ] if df.shape[0] != 1: # We have multiple matches - it's likely because the landlord has a duplicate diff --git a/asset_list/app.py b/asset_list/app.py index 49ec48a0..7413c7cb 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -21,6 +21,11 @@ EPC_AUTH_TOKEN = os.getenv( OPENAI_API_KEY = os.getenv( "OPENAI_API_KEY", ) +print( + f"[debug] OPENAI_API_KEY loaded: " + f"{OPENAI_API_KEY[:8]}...{OPENAI_API_KEY[-4:] if OPENAI_API_KEY else 'NONE'} " + f"(len={len(OPENAI_API_KEY) if OPENAI_API_KEY else 0})" +) def extract_address1( @@ -74,23 +79,23 @@ def app(): """ data_folder = "/workspaces/model/asset_list" - data_filename = "2026-04-22T08_22_00.779745_61049fd3.xlsx" - sheet_name = "in" - postcode_column = "postcode_clean" - address1_column = "address2uprn_address" + data_filename = "input.xlsx" + sheet_name = "Handovers" + postcode_column = "POSTCODE" + address1_column = "Full Addres" address1_method = None - fulladdress_column = "address2uprn_address" + fulladdress_column = "Full Addres" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "address2uprn_uprn" - landlord_property_type = "Property Type" # Good to include if landlord gave - landlord_built_form = "Built Form" # Good to include if landlord gave + landlord_os_uprn = "domna_found_uprn" + landlord_property_type = "PROPERTY TYPE" # Good to include if landlord gave + landlord_built_form = "Type Description" # Good to include if landlord gave landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "UPRN" + landlord_property_id = "PROP REF" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -131,6 +136,7 @@ def app(): landlord_sap=landlord_sap, landlord_block_reference=landlord_block_reference, phase=phase, + openai_api_key=OPENAI_API_KEY, ) asset_list.init_standardise() @@ -462,3 +468,9 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) + + + + +for key,value in dict.items(): + lsakjfldsa \ No newline at end of file diff --git a/backend/app/config.py b/backend/app/config.py index 70a6b50c..e72eb693 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -77,6 +77,7 @@ class Settings(BaseSettings): OSMOSIS_ACD_SHAREPOINT_ID: Optional[str] = None PRIVATE_PAY_SHAREPOINT_ID: Optional[str] = None SOCIAL_HOUSING_WAVE_3_SHAREPOINT_ID: Optional[str] = None + OPENAI_API_KEY: Optional[str] = None # Pas Hub PASHUB_EMAIL: Optional[str] = None diff --git a/backend/app/local/router.py b/backend/app/local/router.py index 0977be04..ea04dc49 100644 --- a/backend/app/local/router.py +++ b/backend/app/local/router.py @@ -2,8 +2,8 @@ from fastapi import APIRouter, HTTPException, status from jose import jwt, jwe import json import datetime -from app.config import get_settings -from app.dependencies import get_derived_encryption_key +from backend.app.config import get_settings +from backend.app.dependencies import get_derived_encryption_key router = APIRouter( prefix="/local", @@ -27,7 +27,12 @@ def create_dummy_token(secret: str) -> str: "dbId": "known_id", } - token = jwe.encrypt(json.dumps(claims), get_derived_encryption_key(secret), algorithm="dir", encryption="A256GCM") + token = jwe.encrypt( + json.dumps(claims), + get_derived_encryption_key(secret), + algorithm="dir", + encryption="A256GCM", + ) return token @@ -40,6 +45,8 @@ async def dummy_token(): async def dummy_token(): settings = get_settings() if settings.ENVIRONMENT != "local": - raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, - detail="Dummy token can only be generated in local environment") + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Dummy token can only be generated in local environment", + ) return {"dummy_token": create_dummy_token(settings.SECRET_KEY)} diff --git a/backend/app/main.py b/backend/app/main.py index c9733c18..55dfef7d 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -30,10 +30,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE logger.error(f"Validation Errors: {exc.errors()}") return JSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content=jsonable_encoder({ - "detail": exc.errors(), - "body": exc.body - }), + content=jsonable_encoder({"detail": exc.errors(), "body": exc.body}), ) @@ -63,7 +60,8 @@ app.include_router(tasks_router.router, prefix="/v1") app.include_router(bulk_uploads_router.router, prefix="/v1") if get_settings().ENVIRONMENT == "local": - from app.local import router as local_router + from backend.app.local import router as local_router + app.include_router(local_router.router) handler = Mangum(app) @@ -98,10 +96,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE logger.error(f"Validation Errors: {exc.errors()}") return JSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, - content=jsonable_encoder({ - "detail": exc.errors(), - "body": exc.body - }), + content=jsonable_encoder({"detail": exc.errors(), "body": exc.body}), ) @@ -130,7 +125,8 @@ app.include_router(whlg_router.router, prefix="/v1") app.include_router(bulk_uploads_router.router, prefix="/v1") if get_settings().ENVIRONMENT == "local": - from app.local import router as local_router + from backend.app.local import router as local_router + app.include_router(local_router.router) handler = Mangum(app) diff --git a/recommendations/Costs.py b/recommendations/Costs.py index bd8f160a..fc72d4d8 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -21,28 +21,28 @@ regional_labour_variations = [ {"Region": "Yorkshire and the Humber", "Adjustment_Factor": 0.86}, {"Region": "Wales", "Adjustment_Factor": 0.88}, {"Region": "Scotland", "Adjustment_Factor": 0.88}, - {"Region": "Northern Ireland", "Adjustment_Factor": 0.76} + {"Region": "Northern Ireland", "Adjustment_Factor": 0.76}, ] # Installers are now working with 435 watt panels PANEL_SIZE = 0.435 INSTALLER_SOLAR_COSTS = [ - {'n_panels': 4, 'array_kwp': 4 * PANEL_SIZE, 'cost': 4089.25, 'installer': 'CEG'}, - {'n_panels': 5, 'array_kwp': 5 * PANEL_SIZE, 'cost': 4242.48, 'installer': 'CEG'}, - {'n_panels': 6, 'array_kwp': 6 * PANEL_SIZE, 'cost': 4395.71, 'installer': 'CEG'}, - {'n_panels': 7, 'array_kwp': 7 * PANEL_SIZE, 'cost': 4548.94, 'installer': 'CEG'}, - {'n_panels': 8, 'array_kwp': 8 * PANEL_SIZE, 'cost': 4702.17, 'installer': 'CEG'}, - {'n_panels': 9, 'array_kwp': 9 * PANEL_SIZE, 'cost': 4855.41, 'installer': 'CEG'}, - {'n_panels': 10, 'array_kwp': 10 * PANEL_SIZE, 'cost': 5010.95, 'installer': 'CEG'}, - {'n_panels': 11, 'array_kwp': 11 * PANEL_SIZE, 'cost': 5166.49, 'installer': 'CEG'}, - {'n_panels': 12, 'array_kwp': 12 * PANEL_SIZE, 'cost': 5322.04, 'installer': 'CEG'}, - {'n_panels': 13, 'array_kwp': 13 * PANEL_SIZE, 'cost': 5657.6, 'installer': 'CEG'}, - {'n_panels': 14, 'array_kwp': 14 * PANEL_SIZE, 'cost': 5993.16, 'installer': 'CEG'}, - {'n_panels': 15, 'array_kwp': 15 * PANEL_SIZE, 'cost': 6328.71, 'installer': 'CEG'}, - {'n_panels': 16, 'array_kwp': 16 * PANEL_SIZE, 'cost': 6483.33, 'installer': 'CEG'}, - {'n_panels': 17, 'array_kwp': 17 * PANEL_SIZE, 'cost': 6637.95, 'installer': 'CEG'}, - {'n_panels': 18, 'array_kwp': 18 * PANEL_SIZE, 'cost': 6792.57, 'installer': 'CEG'} + {"n_panels": 4, "array_kwp": 4 * PANEL_SIZE, "cost": 4089.25, "installer": "CEG"}, + {"n_panels": 5, "array_kwp": 5 * PANEL_SIZE, "cost": 4242.48, "installer": "CEG"}, + {"n_panels": 6, "array_kwp": 6 * PANEL_SIZE, "cost": 4395.71, "installer": "CEG"}, + {"n_panels": 7, "array_kwp": 7 * PANEL_SIZE, "cost": 4548.94, "installer": "CEG"}, + {"n_panels": 8, "array_kwp": 8 * PANEL_SIZE, "cost": 4702.17, "installer": "CEG"}, + {"n_panels": 9, "array_kwp": 9 * PANEL_SIZE, "cost": 4855.41, "installer": "CEG"}, + {"n_panels": 10, "array_kwp": 10 * PANEL_SIZE, "cost": 5010.95, "installer": "CEG"}, + {"n_panels": 11, "array_kwp": 11 * PANEL_SIZE, "cost": 5166.49, "installer": "CEG"}, + {"n_panels": 12, "array_kwp": 12 * PANEL_SIZE, "cost": 5322.04, "installer": "CEG"}, + {"n_panels": 13, "array_kwp": 13 * PANEL_SIZE, "cost": 5657.6, "installer": "CEG"}, + {"n_panels": 14, "array_kwp": 14 * PANEL_SIZE, "cost": 5993.16, "installer": "CEG"}, + {"n_panels": 15, "array_kwp": 15 * PANEL_SIZE, "cost": 6328.71, "installer": "CEG"}, + {"n_panels": 16, "array_kwp": 16 * PANEL_SIZE, "cost": 6483.33, "installer": "CEG"}, + {"n_panels": 17, "array_kwp": 17 * PANEL_SIZE, "cost": 6637.95, "installer": "CEG"}, + {"n_panels": 18, "array_kwp": 18 * PANEL_SIZE, "cost": 6792.57, "installer": "CEG"}, ] # These are costs we received from CRG, for pricing up air source heat pumps @@ -80,7 +80,12 @@ INSTALLER_SOLAR_PV_INVERTER_COST = 7500 INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST = 500 # Just a rough guess to labour costs INSTALLER_SOLAR_BATTERY_COSTS = [ - {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 3769.89, 'installer': 'JJC'}, + { + "capacity_kwh": 5, + "description": "Battery Add on", + "cost": 3769.89, + "installer": "JJC", + }, # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, # {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} @@ -102,10 +107,14 @@ TTZC_SMART_THERMOSTAT_LABOUR_HOURS = 2 TTZC_ELECTRICIAN_HOURLY_RATE = 45 # Based on cost of a Nest temperature sensor TTZC_ROOM_TEMPERATURE_SENSOR_COST = 50 -TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS = 0.17 # (Assume ~ 10 mins install per sensor) +TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS = ( + 0.17 # (Assume ~ 10 mins install per sensor) +) # Basedon an average cost of smart radiator values TTZC_SMART_RADIATOR_VALUES = 50 -TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS = 0.37 # (Assume ~ 15-30 mins install per valve) +TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS = ( + 0.37 # (Assume ~ 15-30 mins install per valve) +) # boiler prices based on # This is the cost of a firs time central heating install from The Warm Front rate card @@ -169,7 +178,7 @@ class Costs: "heater_removal": 0.1, "sealing_open_fireplace": 0.1, "mechanical_ventilation": 0.26, - "sloping_ceiling_insulation": 0.26 # Similar to IWI so using the same contingency + "sloping_ceiling_insulation": 0.26, # Similar to IWI so using the same contingency } # Preliminaries are a percentage of the total cost of the work and covers the cost of site-specific costs @@ -195,36 +204,46 @@ class Costs: :param property_instance: Instance of a Property class containing relevant details like wall area. """ - if not hasattr(property_instance, 'insulation_wall_area'): - raise ValueError("Property instance must have an 'insulation_wall_area' attribute") + if not hasattr(property_instance, "insulation_wall_area"): + raise ValueError( + "Property instance must have an 'insulation_wall_area' attribute" + ) self.property = property_instance self.regional_labour_variations = regional_labour_variations self.region = county_to_region_map.get(self.property.epc_record.county, None) if self.region is None: # Try and grab using the local-authority-label - self.region = county_to_region_map.get(self.property.epc_record.local_authority_label, None) + self.region = county_to_region_map.get( + self.property.epc_record.local_authority_label, None + ) if self.region is None: # Try and get the region after converting the keys to lower - self.region = { - k.lower(): v for k, v in county_to_region_map.items() - }.get(self.property.epc_record.local_authority_label.lower(), None) + if self.property.epc_record.local_authority_label is not None: + self.region = { + k.lower(): v for k, v in county_to_region_map.items() + }.get(self.property.epc_record.local_authority_label.lower(), None) if self.region is None: - logger.warning("No region found for county %s, defaulting to South East England", - self.property.epc_record.county) + logger.warning( + "No region found for county %s, defaulting to South East England", + self.property.epc_record.county, + ) self.region = "South East England" self.labour_adjustment_factor = [ - x["Adjustment_Factor"] for x in self.regional_labour_variations if - x["Region"] == self.region + x["Adjustment_Factor"] + for x in self.regional_labour_variations + if x["Region"] == self.region ][0] if not self.labour_adjustment_factor: raise ValueError("Labour adjustment factor not found") - def cavity_wall_insulation(self, wall_area, material, is_extraction_and_refill=False): + def cavity_wall_insulation( + self, wall_area, material, is_extraction_and_refill=False + ): """ Calculates the total cost for cavity wall insulation based on material and labor costs, including contingency, preliminaries, profit, and VAT. @@ -318,7 +337,8 @@ class Costs: return { "total": total_cost, - "contingency": self.CONTINGENCIES["suspended_floor_insulation"] * total_cost, + "contingency": self.CONTINGENCIES["suspended_floor_insulation"] + * total_cost, "contingency_rate": self.CONTINGENCIES["suspended_floor_insulation"], "labour_hours": labour_hours, "labour_days": labour_days, @@ -370,8 +390,7 @@ class Costs: # - Apply sub-linear scaling for realism # - Enforce a minimum duration so estimates are not unrealistically low labour_days = max( - min_days, - base_days * (insulation_floor_area / base_area) ** labour_exponent + min_days, base_days * (insulation_floor_area / base_area) ** labour_exponent ) return labour_days @@ -388,7 +407,9 @@ class Costs: total_cost = material["total_cost"] * insulation_floor_area daily_labour_rate = 300 # Based on checkatrade - labour_days = self._estimate_number_of_days_for_solid_floor(insulation_floor_area) + labour_days = self._estimate_number_of_days_for_solid_floor( + insulation_floor_area + ) labour_cost = labour_days * daily_labour_rate total_cost = total_cost + labour_cost @@ -404,7 +425,6 @@ class Costs: } def low_energy_lighting(self, number_of_lights, material): - """ Calculates the total cost for low energy lighting based on material and labor costs, including contingency, preliminaries, profit, and VAT. @@ -419,7 +439,7 @@ class Costs: total_cost = material["total_cost"] * number_of_lights labour_hours = 1 - labour_days = (labour_hours / 8) + labour_days = labour_hours / 8 return { "total": total_cost, @@ -450,26 +470,22 @@ class Costs: } @classmethod - def solar_pv( - cls, - solar_product, - scaffolding_options, - n_floors - ): - - """ - - """ + def solar_pv(cls, solar_product, scaffolding_options, n_floors): + """ """ system_cost = solar_product["total_cost"] if not solar_product["includes_scaffolding"]: # We base this on the number of floors - scaffolding = [x["total_cost"] for x in scaffolding_options if x["size"] == n_floors] + scaffolding = [ + x["total_cost"] for x in scaffolding_options if x["size"] == n_floors + ] if not scaffolding: # If we have no options, handle this if n_floors <= 3: - raise ValueError("No scaffolding options available for 3 or fewer floors") + raise ValueError( + "No scaffolding options available for 3 or fewer floors" + ) # We take the largest scaffolding option available scaffolding_cost = max([x["total_cost"] for x in scaffolding_options]) else: @@ -523,9 +539,9 @@ class Costs: We base the estimates for the cost of electric room heaters on the cost per room as estimated by the following article: https://www.bestelectricradiators.co.uk/blog/cost-to-install-a-new-heating-system-uk/ - + :param number_heated_rooms: int, number of rooms to be heated - :return: + :return: """ total_cost = 500 * number_heated_rooms @@ -547,11 +563,11 @@ class Costs: } def high_heat_electric_storage_heaters( - self, number_heated_rooms: int, + self, + number_heated_rooms: int, needs_cylinder: bool, - product: dict | None = None + product: dict | None = None, ): - """ We base the estimates for the cost of electric storage heaters on the cost per room as estimated by the energy saving trust @@ -578,8 +594,11 @@ class Costs: return { "total": total_cost, - "contingency": total_cost * self.CONTINGENCIES["high_heat_retention_storage_heaters"], - "contingency_rate": self.CONTINGENCIES["high_heat_retention_storage_heaters"], + "contingency": total_cost + * self.CONTINGENCIES["high_heat_retention_storage_heaters"], + "contingency_rate": self.CONTINGENCIES[ + "high_heat_retention_storage_heaters" + ], "subtotal": subtotal_before_vat, "vat": vat, "labour_hours": labour_hours, @@ -690,14 +709,14 @@ class Costs: # The product costs are inclusive of VAT product_costs = ( - TTZC_SMART_THERMOSTAT_COST + - TTZC_ROOM_TEMPERATURE_SENSOR_COST * number_heated_rooms + - TTZC_SMART_RADIATOR_VALUES * number_heated_rooms + TTZC_SMART_THERMOSTAT_COST + + TTZC_ROOM_TEMPERATURE_SENSOR_COST * number_heated_rooms + + TTZC_SMART_RADIATOR_VALUES * number_heated_rooms ) labour_hours = ( - TTZC_SMART_THERMOSTAT_LABOUR_HOURS + - TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS * number_heated_rooms + - TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS * number_heated_rooms + TTZC_SMART_THERMOSTAT_LABOUR_HOURS + + TTZC_ROOM_TEMPERATURE_SENSOR_LABOUR_HOURS * number_heated_rooms + + TTZC_SMART_RADIATOR_VALUES_LABOUR_HOURS * number_heated_rooms ) labour_costs = TTZC_ELECTRICIAN_HOURLY_RATE * labour_hours # Add continency and preliminaries to the labour to account for the complexity of the job @@ -722,7 +741,9 @@ class Costs: "labour_days": labour_days, } - def programmer_trvs_bypass(self, number_heated_rooms, has_programmer, has_trvs, has_bypass): + def programmer_trvs_bypass( + self, number_heated_rooms, has_programmer, has_trvs, has_bypass + ): total_cost = 0 labour_hours = 0 @@ -779,7 +800,9 @@ class Costs: } @staticmethod - def _estimate_n_radiators(number_habitable_rooms, total_floor_area, property_type, built_form): + def _estimate_n_radiators( + number_habitable_rooms, total_floor_area, property_type, built_form + ): # Base number of radiators: one per habitable room base_radiators = number_habitable_rooms @@ -787,34 +810,49 @@ class Costs: additional_radiators = 3 # Initial assumption # Adjust additional radiators based on property type - if property_type == 'Flat': - additional_radiators -= 1 # Flats may need fewer radiators due to less exposure - elif property_type in ['House', 'Bungalow', 'Maisonette']: + if property_type == "Flat": + additional_radiators -= ( + 1 # Flats may need fewer radiators due to less exposure + ) + elif property_type in ["House", "Bungalow", "Maisonette"]: # Multiple floors in Maisonette may require additional heating points - additional_radiators += 2 # Houses and bungalows might need more due to greater exposure + additional_radiators += ( + 2 # Houses and bungalows might need more due to greater exposure + ) else: raise Exception("Invalid property type") # Adjust total radiator needs based on built form form_factor = { - 'Enclosed Mid-Terrace': 0.9, - 'Mid-Terrace': 0.95, - 'Enclosed End-Terrace': 0.95, - 'Semi-Detached': 1.05, - 'Detached': 1.25, - 'End-Terrace': 1.05 + "Enclosed Mid-Terrace": 0.9, + "Mid-Terrace": 0.95, + "Enclosed End-Terrace": 0.95, + "Semi-Detached": 1.05, + "Detached": 1.25, + "End-Terrace": 1.05, } # Calculate total heating power needed and number of radiators based on standard output total_heating_power_required = total_floor_area * 80 # Watts per square meter radiator_output = 1000 # Average wattage per radiator - total_radiators_based_on_power = (total_heating_power_required / radiator_output) * form_factor[built_form] + total_radiators_based_on_power = ( + total_heating_power_required / radiator_output + ) * form_factor[built_form] # Final estimation taking the higher of calculated needs or base room count - estimated_radiators = max(total_radiators_based_on_power, base_radiators + additional_radiators) + estimated_radiators = max( + total_radiators_based_on_power, base_radiators + additional_radiators + ) return round(estimated_radiators) - def boiler(self, exising_room_heaters, system_change, n_heated_rooms, n_rooms, is_electric=False): + def boiler( + self, + exising_room_heaters, + system_change, + n_heated_rooms, + n_rooms, + is_electric=False, + ): """ Based on a basic estimate of median value £2600 to install a low carbon combi boiler First time central heating vosts can als be found here: @@ -859,12 +897,14 @@ class Costs: number_habitable_rooms=n_rooms, total_floor_area=self.property.floor_area, property_type=self.property.epc_record.property_type, - built_form=self.property.epc_record.built_form + built_form=self.property.epc_record.built_form, ) additionals_labour_cost = labour_rate * self.labour_adjustment_factor radiator_cost = DOUBLE_RADIATOR_COST * n_radiators - system_change_cost = radiator_cost + FLUE_COST + PIPEWORK_COST + additionals_labour_cost + system_change_cost = ( + radiator_cost + FLUE_COST + PIPEWORK_COST + additionals_labour_cost + ) system_change_cost_before_vat = system_change_cost / (1 + self.VAT_RATE) system_change_vat = system_change_cost - system_change_cost_before_vat # We add an extra labour day for the system change @@ -897,14 +937,18 @@ class Costs: else: return 250 - def air_source_heat_pump(self, ashp_size: float, number_heated_rooms: int, total_floor_area: float) -> dict: + def air_source_heat_pump( + self, ashp_size: float, number_heated_rooms: int, total_floor_area: float + ) -> dict: """ We produce a cost estimation for an air source heat pump, based on costs we have received from installers. """ system_cost = ( - (ASHP_SMALL_SYSTEM_COST if ashp_size <= 8.5 else ASHP_LARGE_SYSTEM_COST) + ASHP_SECURITY + ASHP_WALL_BRACKET + (ASHP_SMALL_SYSTEM_COST if ashp_size <= 8.5 else ASHP_LARGE_SYSTEM_COST) + + ASHP_SECURITY + + ASHP_WALL_BRACKET ) available_n_rads = [x["n_radiators"] for x in ASHP_DISTRIBUTION_SYSTEM_COSTS] @@ -940,7 +984,9 @@ class Costs: } @staticmethod - def _estimate_number_of_days_for_sloping_ceiling(insulation_roof_area: float) -> float: + def _estimate_number_of_days_for_sloping_ceiling( + insulation_roof_area: float, + ) -> float: """ Estimate labour days required to insulate an existing sloping ceiling. @@ -965,14 +1011,15 @@ class Costs: min_days = 2 labour_days = max( - min_days, - base_days * (insulation_roof_area / base_area) ** labour_exponent + min_days, base_days * (insulation_roof_area / base_area) ** labour_exponent ) return labour_days @classmethod - def sloping_ceiling_insulation(cls, insulation_roof_area: float) -> Mapping[str, float]: + def sloping_ceiling_insulation( + cls, insulation_roof_area: float + ) -> Mapping[str, float]: """ This costing for this is based on Checkatrade desktop research, since we are yet to receive installer quotes. :param insulation_roof_area: Area of the sloping ceiling to be insulated @@ -985,14 +1032,20 @@ class Costs: # https://www.checkatrade.com/blog/cost-guides/vaulted-ceiling-cost/ # https://www.thegreenage.co.uk/can-i-insulate-my-sloping-ceiling/ # These assumptions last updated 21/02/2026 - insulation_cost_per_m2 = 52 # The actual install process is quite similar to IWI + insulation_cost_per_m2 = ( + 52 # The actual install process is quite similar to IWI + ) labour_rate = 250 # per day contingency_rate = cls.CONTINGENCIES["sloping_ceiling_insulation"] - labour_days = cls._estimate_number_of_days_for_sloping_ceiling(insulation_roof_area) + labour_days = cls._estimate_number_of_days_for_sloping_ceiling( + insulation_roof_area + ) labour_hours = labour_days * 8 - total = (insulation_cost_per_m2 * insulation_roof_area) + (labour_rate * labour_days) + total = (insulation_cost_per_m2 * insulation_roof_area) + ( + labour_rate * labour_days + ) # Assume VAT included in the total => total is 120% of subtotal vat = total - (total / 1.2) From 02fb3afbe41d479742ad4053f296c99e807f22ae Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:04:01 +0000 Subject: [PATCH 03/13] defined histrocial epc data shapre from csv --- datatypes/epc/schema/historic_epc.py | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 datatypes/epc/schema/historic_epc.py diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py new file mode 100644 index 00000000..e158ac1f --- /dev/null +++ b/datatypes/epc/schema/historic_epc.py @@ -0,0 +1,97 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + lmk_key: str + address1: str + address2: str + address3: str + postcode: str + building_reference_number: str + current_energy_rating: str + potential_energy_rating: str + current_energy_efficiency: str + potential_energy_efficiency: str + property_type: str + built_form: str + inspection_date: str + local_authority: str + constituency: str + county: str + lodgement_date: str + transaction_type: str + environment_impact_current: str + environment_impact_potential: str + energy_consumption_current: str + energy_consumption_potential: str + co2_emissions_current: str + co2_emiss_curr_per_floor_area: str + co2_emissions_potential: str + lighting_cost_current: str + lighting_cost_potential: str + heating_cost_current: str + heating_cost_potential: str + hot_water_cost_current: str + hot_water_cost_potential: str + total_floor_area: str + energy_tariff: str + mains_gas_flag: str + floor_level: str + flat_top_storey: str + flat_storey_count: str + main_heating_controls: str + multi_glaze_proportion: str + glazed_type: str + glazed_area: str + extension_count: str + number_habitable_rooms: str + number_heated_rooms: str + low_energy_lighting: str + number_open_fireplaces: str + hotwater_description: str + hot_water_energy_eff: str + hot_water_env_eff: str + floor_description: str + floor_energy_eff: str + floor_env_eff: str + windows_description: str + windows_energy_eff: str + windows_env_eff: str + walls_description: str + walls_energy_eff: str + walls_env_eff: str + secondheat_description: str + sheating_energy_eff: str + sheating_env_eff: str + roof_description: str + roof_energy_eff: str + roof_env_eff: str + mainheat_description: str + mainheat_energy_eff: str + mainheat_env_eff: str + mainheatcont_description: str + mainheatc_energy_eff: str + mainheatc_env_eff: str + lighting_description: str + lighting_energy_eff: str + lighting_env_eff: str + main_fuel: str + wind_turbine_count: str + heat_loss_corridor: str + unheated_corridor_length: str + floor_height: str + photo_supply: str + solar_water_heating_flag: str + mechanical_ventilation: str + address: str + local_authority_label: str + constituency_label: str + posttown: str + construction_age_band: str + lodgement_datetime: str + tenure: str + fixed_lighting_outlets_count: str + low_energy_fixed_light_count: str + uprn: str + uprn_source: str From 90a4d83243604a4830dc0f6a7752a67f928c571f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:04:27 +0000 Subject: [PATCH 04/13] added init files to make it a python module --- backend/etl/__init__.py | 0 backend/etl/etl_opendatacommunities/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 backend/etl/__init__.py create mode 100644 backend/etl/etl_opendatacommunities/__init__.py diff --git a/backend/etl/__init__.py b/backend/etl/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/etl/etl_opendatacommunities/__init__.py b/backend/etl/etl_opendatacommunities/__init__.py new file mode 100644 index 00000000..e69de29b From 74b7b87de6384419687bdb1bc57468ea3ee4959e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:22:41 +0000 Subject: [PATCH 05/13] =?UTF-8?q?load=20historic=20epc=20from=20csv=20?= =?UTF-8?q?=F0=9F=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datatypes/epc/loaders/__init__.py | 0 datatypes/epc/loaders/historic_epc.py | 5 ++ datatypes/epc/schema/historic_epc.py | 7 +++ .../schema/tests/test_historic_epc_loading.py | 55 +++++++++++++++++++ 4 files changed, 67 insertions(+) create mode 100644 datatypes/epc/loaders/__init__.py create mode 100644 datatypes/epc/loaders/historic_epc.py create mode 100644 datatypes/epc/schema/tests/test_historic_epc_loading.py diff --git a/datatypes/epc/loaders/__init__.py b/datatypes/epc/loaders/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py new file mode 100644 index 00000000..8555a706 --- /dev/null +++ b/datatypes/epc/loaders/historic_epc.py @@ -0,0 +1,5 @@ +from datatypes.epc.schema.historic_epc import HistoricEpc + + +def read_historic_epc_csv(path: str) -> list[HistoricEpc]: + raise NotImplementedError("read_historic_epc_csv not implemented yet") diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index e158ac1f..9ebe4b09 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -95,3 +95,10 @@ class HistoricEpc: low_energy_fixed_light_count: str uprn: str uprn_source: str + report_type: str + improvement_item: str + improvement_summary_text: str + improvement_descr_text: str + improvement_id: str + improvement_id_text: str + indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py new file mode 100644 index 00000000..d5d5ea22 --- /dev/null +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -0,0 +1,55 @@ +import os + +import pytest + +from datatypes.epc.loaders.historic_epc import read_historic_epc_csv +from datatypes.epc.schema.historic_epc import HistoricEpc + +FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") + + +class TestHistoricEpcLoading: + + @pytest.fixture + def epc(self) -> HistoricEpc: + rows = read_historic_epc_csv(os.path.join(FIXTURES, "historic_epc.csv")) + return rows[0] + + def test_returns_historic_epc_instance(self, epc: HistoricEpc) -> None: + assert isinstance(epc, HistoricEpc) + + def test_lmk_key(self, epc: HistoricEpc) -> None: + assert epc.lmk_key == "9292c3bf26a8876ce59274401ea73e3de5bd0b3e52a507c2162a46e57db8ea2f" + + def test_address1(self, epc: HistoricEpc) -> None: + assert epc.address1 == "47 GORDON ROAD" + + def test_postcode(self, epc: HistoricEpc) -> None: + assert epc.postcode == "AB33 8AL" + + def test_current_energy_rating(self, epc: HistoricEpc) -> None: + assert epc.current_energy_rating == "E" + + def test_property_type(self, epc: HistoricEpc) -> None: + assert epc.property_type == "House" + + def test_built_form(self, epc: HistoricEpc) -> None: + assert epc.built_form == "Semi-Detached" + + def test_inspection_date(self, epc: HistoricEpc) -> None: + assert epc.inspection_date == "2021-04-11" + + def test_uprn(self, epc: HistoricEpc) -> None: + assert epc.uprn == "151020766.0" + + def test_uprn_source(self, epc: HistoricEpc) -> None: + assert epc.uprn_source == "Energy Assessor" + + def test_report_type(self, epc: HistoricEpc) -> None: + assert epc.report_type == "100" + + def test_improvement_summary_text(self, epc: HistoricEpc) -> None: + assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" + + def test_indicative_cost(self, epc: HistoricEpc) -> None: + assert epc.indicative_cost == "£100 - £350" From 32bf1cc98de5bb73c753e772e81d0571595dc8fa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 7 May 2026 16:26:29 +0000 Subject: [PATCH 06/13] =?UTF-8?q?load=20historic=20epc=20from=20csv=20?= =?UTF-8?q?=F0=9F=9F=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- datatypes/epc/loaders/historic_epc.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py index 8555a706..7b563315 100644 --- a/datatypes/epc/loaders/historic_epc.py +++ b/datatypes/epc/loaders/historic_epc.py @@ -1,5 +1,18 @@ +import csv + from datatypes.epc.schema.historic_epc import HistoricEpc +def _normalise(value: str | None) -> str: + if value is None: + return "" + return value.replace("\xa0", " ") + + def read_historic_epc_csv(path: str) -> list[HistoricEpc]: - raise NotImplementedError("read_historic_epc_csv not implemented yet") + with open(path, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [ + HistoricEpc(**{k.lower(): _normalise(v) for k, v in row.items()}) + for row in reader + ] From a39c3a0772566e28fc00a6c6ee2507a1d8b12b27 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:03:35 +0000 Subject: [PATCH 07/13] added added historic epc data class with shape --- backend/etl/etl_opendatacommunities/main.py | 37 +++++++------------ datatypes/epc/domain/historic_epc.py | 10 +++++ datatypes/epc/schema/historic_epc.py | 6 --- .../schema/tests/test_historic_epc_loading.py | 6 --- sfr/principal_pitch/2_export_data.py | 8 ++-- 5 files changed, 27 insertions(+), 40 deletions(-) create mode 100644 datatypes/epc/domain/historic_epc.py diff --git a/backend/etl/etl_opendatacommunities/main.py b/backend/etl/etl_opendatacommunities/main.py index 30b4045a..2bd41005 100644 --- a/backend/etl/etl_opendatacommunities/main.py +++ b/backend/etl/etl_opendatacommunities/main.py @@ -15,16 +15,7 @@ logger = setup_logger() SRC_ROOT = Path("/workspaces/home/epc_data") TMP_ROOT = Path("/tmp/epc_postcodes") S3_BUCKET = "retrofit-data-dev" -S3_PREFIX = "epc_opendatacommunities" - -REC_COLS = { - "IMPROVEMENT_ITEM", - "IMPROVEMENT_SUMMARY_TEXT", - "IMPROVEMENT_DESCR_TEXT", - "IMPROVEMENT_ID", - "IMPROVEMENT_ID_TEXT", - "INDICATIVE_COST", -} +S3_PREFIX = "historical_epc" # This scripts assume you downloading the zip, unzip it, and running it locally @@ -35,18 +26,16 @@ def sanitise(pc: pd.Series) -> pd.Series: def shard_la(la_dir: Path) -> None: certs = pd.read_csv(la_dir / "certificates.csv", low_memory=False) - recs = pd.read_csv(la_dir / "recommendations.csv", low_memory=False) - merged = certs.merge(recs, on="LMK_KEY", how="left") - merged["POSTCODE_CLEAN"] = sanitise(merged["POSTCODE"]) - before = len(merged) - merged = merged.dropna(subset=["POSTCODE_CLEAN"]) - merged = merged[merged["POSTCODE_CLEAN"] != ""] - dropped = before - len(merged) + certs["POSTCODE_CLEAN"] = sanitise(certs["POSTCODE"]) + before = len(certs) + certs = certs.dropna(subset=["POSTCODE_CLEAN"]) + certs = certs[certs["POSTCODE_CLEAN"] != ""] + dropped = before - len(certs) if dropped: logger.warning(f"{la_dir.name}: dropped {dropped} rows with empty postcode") - for pc, group in merged.groupby("POSTCODE_CLEAN", sort=False): + for pc, group in certs.groupby("POSTCODE_CLEAN", sort=False): out = TMP_ROOT / f"{pc}.csv" group.drop(columns=["POSTCODE_CLEAN"]).to_csv( out, mode="a", header=not out.exists(), index=False @@ -67,9 +56,7 @@ def list_existing_keys(s3: Any) -> set[str]: def upload_postcode(path: Path, s3: Any) -> None: df = pd.read_csv(path, low_memory=False).drop_duplicates() - cert_cols = [c for c in df.columns if c not in REC_COLS] - cert_only = df[cert_cols].drop_duplicates() - dupes = cert_only["LMK_KEY"].value_counts() + dupes = df["LMK_KEY"].value_counts() bad = dupes[dupes > 1] if not bad.empty: raise ValueError( @@ -95,12 +82,14 @@ def main(): ) logger.info(f"Sharding {len(la_dirs)} LA folders -> {TMP_ROOT}") - # for la in tqdm(la_dirs, desc="shard"): - # shard_la(la) + for la in tqdm(la_dirs, desc="shard"): + shard_la(la) s3 = boto3.client( "s3", - config=Config(max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"}), + config=Config( + max_pool_connections=512, retries={"max_attempts": 5, "mode": "standard"} + ), ) pc_files = sorted(TMP_ROOT.glob("*.csv")) logger.info(f"Found {len(pc_files)} local shards") diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py new file mode 100644 index 00000000..230c6327 --- /dev/null +++ b/datatypes/epc/domain/historic_epc.py @@ -0,0 +1,10 @@ +from dataclasses import dataclass + + +@dataclass +class HistoricEpc: + address1: str + address2: str + address3: str + postcode: str + uprn: str diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py index 9ebe4b09..f64ab8c4 100644 --- a/datatypes/epc/schema/historic_epc.py +++ b/datatypes/epc/schema/historic_epc.py @@ -96,9 +96,3 @@ class HistoricEpc: uprn: str uprn_source: str report_type: str - improvement_item: str - improvement_summary_text: str - improvement_descr_text: str - improvement_id: str - improvement_id_text: str - indicative_cost: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index d5d5ea22..2170a8a6 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -47,9 +47,3 @@ class TestHistoricEpcLoading: def test_report_type(self, epc: HistoricEpc) -> None: assert epc.report_type == "100" - - def test_improvement_summary_text(self, epc: HistoricEpc) -> None: - assert epc.improvement_summary_text == "Increase loft insulation to 270 mm" - - def test_indicative_cost(self, epc: HistoricEpc) -> None: - assert epc.indicative_cost == "£100 - £350" diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index b275086d..5e3ce5d5 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -26,13 +26,13 @@ from backend.app.db.functions.materials_functions import get_materials from collections import defaultdict from sqlalchemy import func -PORTFOLIO_ID = 711 -SCENARIOS = [1233] +PORTFOLIO_ID = 632 +SCENARIOS = [1144] scenario_names = { - 1233: "Reach EPC C", + 1144: "EPC C", } -project_name = "Novus" +project_name = "Calico Refresh" def get_data(portfolio_id, scenario_ids): From 7a49f5df20e61836ee19fc19e77abd024fc35880 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 12:19:03 +0000 Subject: [PATCH 08/13] save plan temporary while i incorporate skills to claude --- datatypes/epc/domain/plan.md | 161 +++++++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 datatypes/epc/domain/plan.md diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md new file mode 100644 index 00000000..45cc495b --- /dev/null +++ b/datatypes/epc/domain/plan.md @@ -0,0 +1,161 @@ +# Historic EPC address-match service + +## Context + +ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc//data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. + +Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. + +## Approach + +Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. + +### 1. Add gzip-CSV S3 reader + +In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): + +```python +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + buf = read_io_from_s3(bucket_name, file_key) + return pd.read_csv(buf, compression="gzip", low_memory=False) +``` + +Reuses existing `read_io_from_s3` (line 105). Caller catches `botocore.exceptions.ClientError` for missing-key handling. + +### 2. Append matcher to domain module + +In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: + +```python +from typing import Optional +import pandas as pd +from botocore.exceptions import ClientError + +from backend.utils.addressMatch import AddressMatch +from utils.s3 import read_csv_gz_from_s3 + + +@dataclass +class HistoricEpcMatches: + """Scored historic EPC rows for a single postcode.""" + user_address: str + postcode: str # sanitised + df: pd.DataFrame # has lexiscore + lexirank, sorted best-first + + def top(self) -> Optional[pd.Series]: + return None if self.df.empty else self.df.iloc[0] + + def top_n(self, k: int) -> pd.DataFrame: + return self.df.head(k) + + def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: + if self.df.empty: + return None + top_rank = self.df["lexirank"].min() + uprns = ( + self.df.loc[self.df["lexirank"] == top_rank, uprn_column] + .dropna().astype(str).str.replace(r"\.0$", "", regex=True) + .unique() + ) + return uprns[0] if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + if not postcode: + raise ValueError("postcode must be non-empty") + return postcode.upper().replace(" ", "") + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + bucket: str = "retrofit-data-dev", + prefix: str = "historical_epc", + address_column: str = "ADDRESS", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + key = f"{prefix}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + if address_column not in df.columns: + raise ValueError( + f"Missing address column {address_column!r} in {key}" + ) + + user_norm = AddressMatch.normalise_address(user_address) + df = df.copy() + df["lexiscore"] = df[address_column].fillna("").apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + df["lexirank"] = ( + df["lexiscore"].rank(method="dense", ascending=False).astype(int) + ) + df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) + + return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) +``` + +### Reuse notes +- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. +- Score column copy uses `.fillna("")` to defend against NaN in `ADDRESS`. +- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). + +### 3. Tests + +New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). + +Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). + +Cases: +1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty raises. +2. Returned df has `lexiscore` + `lexirank` columns, row count preserved. +3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. +4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). +5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. +6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. +7. `top()` / `top_n(k)` shape checks. + +## Critical files +- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher +- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` +- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new + +## Out of scope +- Lambda handler / SQS wiring (deferred — lib only) +- Threshold logic (caller decides via wrapper helpers) +- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) +- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern + +## Verification +``` +cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v +``` + +Sample real-S3 call (needs AWS creds): +```python +from datatypes.epc.domain.historic_epc import match_addresses_for_postcode +m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") +print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) +print(m.unambiguous_uprn()) +``` + +## Sequencing +1. Add `read_csv_gz_from_s3` to `utils/s3.py`. +2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. +3. Add tests. + +Steps 2 & 3 depend on 1. No `__init__.py` re-exports needed. From c9c43f178c51ae061dce767f1062981a3fa8acf3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 14:48:15 +0000 Subject: [PATCH 09/13] demo generated for use in address2uprn --- backend/address2UPRN/main.py | 76 +----- backend/address2UPRN/scoring.py | 57 +++++ datatypes/epc/domain/historic_epc.py | 88 +++++++ datatypes/epc/domain/historic_epc_matching.py | 114 +++++++++ datatypes/epc/domain/plan.md | 161 ------------ .../tests/test_historic_epc_matching.py | 239 ++++++++++++++++++ datatypes/epc/loaders/historic_epc.py | 2 +- datatypes/epc/schema/historic_epc.py | 98 ------- .../schema/tests/test_historic_epc_loading.py | 2 +- scripts/historic_epc_demo.py | 47 ++++ utils/s3.py | 15 ++ 11 files changed, 570 insertions(+), 329 deletions(-) create mode 100644 backend/address2UPRN/scoring.py create mode 100644 datatypes/epc/domain/historic_epc_matching.py delete mode 100644 datatypes/epc/domain/plan.md create mode 100644 datatypes/epc/domain/tests/test_historic_epc_matching.py delete mode 100644 datatypes/epc/schema/historic_epc.py create mode 100644 scripts/historic_epc_demo.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 28ad344f..b83c7a58 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -17,16 +17,12 @@ from utils.s3 import ( from datetime import datetime from backend.utils.addressMatch import AddressMatch - -logger = setup_logger() - - -EPC_AUTH_TOKEN = os.getenv( - "EPC_AUTH_TOKEN", +from backend.address2UPRN.scoring import ( # noqa: F401 (re-exported) + df_has_single_uprn, + get_uprn_candidates, ) -if EPC_AUTH_TOKEN is None: - raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +logger = setup_logger() def score_addresses( @@ -45,7 +41,10 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): Recursively fetch EPC data by postcode. If results hit the size limit, retry with double size up to max_attempts. """ - client = EpcClient(auth_token=EPC_AUTH_TOKEN) + auth_token = os.getenv("EPC_AUTH_TOKEN") + if auth_token is None: + raise RuntimeError("EPC_AUTH_TOKEN not defined in env") + client = EpcClient(auth_token=auth_token) url = os.path.join(client.domestic.host, "search") @@ -88,65 +87,6 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): return results_df -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: - """ - Returns True if all non-null UPRNs in df match the given uprn. - Returns False otherwise. - """ - - if column not in df.columns: - return False - - # Drop nulls and normalise to string - uprns = df[column].dropna().astype(str).str.strip().unique() - - # No valid UPRNs to compare - if len(uprns) == 0: - return False - - # Exactly one unique UPRN and it matches - return len(uprns) == 1 and uprns[0] == str(uprn) - - -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - - Returns a DataFrame sorted by descending lexiscore. - DOES NOT choose or return a UPRN. - """ - - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - - user_norm = AddressMatch.normalise_address(user_address) - - out["lexiscore"] = out[address_column].apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - - # Normalise UPRN to string - out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) - - # Rank: 1 = best match - out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) - - return out.sort_values( - ["lexirank", "lexiscore"], - ascending=[True, False], - ) - - def get_uprn_with_epc_df( user_inputed_address: str, epc_df: pd.DataFrame, diff --git a/backend/address2UPRN/scoring.py b/backend/address2UPRN/scoring.py new file mode 100644 index 00000000..d31b9aea --- /dev/null +++ b/backend/address2UPRN/scoring.py @@ -0,0 +1,57 @@ +import pandas as pd + +from backend.utils.addressMatch import AddressMatch + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """ + Returns True if all non-null UPRNs in df match the given uprn. + Returns False otherwise. + """ + + if column not in df.columns: + return False + + uprns = df[column].dropna().astype(str).str.strip().unique() + + if len(uprns) == 0: + return False + + return len(uprns) == 1 and uprns[0] == str(uprn) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + + Returns a DataFrame sorted by descending lexiscore. + DOES NOT choose or return a UPRN. + """ + + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + + user_norm = AddressMatch.normalise_address(user_address) + + out["lexiscore"] = out[address_column].apply( + lambda x: AddressMatch.levenshtein(user_norm, x) + ) + + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) + + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) + + return out.sort_values( + ["lexirank", "lexiscore"], + ascending=[True, False], + ) diff --git a/datatypes/epc/domain/historic_epc.py b/datatypes/epc/domain/historic_epc.py index 230c6327..f64ab8c4 100644 --- a/datatypes/epc/domain/historic_epc.py +++ b/datatypes/epc/domain/historic_epc.py @@ -3,8 +3,96 @@ from dataclasses import dataclass @dataclass class HistoricEpc: + lmk_key: str address1: str address2: str address3: str postcode: str + building_reference_number: str + current_energy_rating: str + potential_energy_rating: str + current_energy_efficiency: str + potential_energy_efficiency: str + property_type: str + built_form: str + inspection_date: str + local_authority: str + constituency: str + county: str + lodgement_date: str + transaction_type: str + environment_impact_current: str + environment_impact_potential: str + energy_consumption_current: str + energy_consumption_potential: str + co2_emissions_current: str + co2_emiss_curr_per_floor_area: str + co2_emissions_potential: str + lighting_cost_current: str + lighting_cost_potential: str + heating_cost_current: str + heating_cost_potential: str + hot_water_cost_current: str + hot_water_cost_potential: str + total_floor_area: str + energy_tariff: str + mains_gas_flag: str + floor_level: str + flat_top_storey: str + flat_storey_count: str + main_heating_controls: str + multi_glaze_proportion: str + glazed_type: str + glazed_area: str + extension_count: str + number_habitable_rooms: str + number_heated_rooms: str + low_energy_lighting: str + number_open_fireplaces: str + hotwater_description: str + hot_water_energy_eff: str + hot_water_env_eff: str + floor_description: str + floor_energy_eff: str + floor_env_eff: str + windows_description: str + windows_energy_eff: str + windows_env_eff: str + walls_description: str + walls_energy_eff: str + walls_env_eff: str + secondheat_description: str + sheating_energy_eff: str + sheating_env_eff: str + roof_description: str + roof_energy_eff: str + roof_env_eff: str + mainheat_description: str + mainheat_energy_eff: str + mainheat_env_eff: str + mainheatcont_description: str + mainheatc_energy_eff: str + mainheatc_env_eff: str + lighting_description: str + lighting_energy_eff: str + lighting_env_eff: str + main_fuel: str + wind_turbine_count: str + heat_loss_corridor: str + unheated_corridor_length: str + floor_height: str + photo_supply: str + solar_water_heating_flag: str + mechanical_ventilation: str + address: str + local_authority_label: str + constituency_label: str + posttown: str + construction_age_band: str + lodgement_datetime: str + tenure: str + fixed_lighting_outlets_count: str + low_energy_fixed_light_count: str uprn: str + uprn_source: str + report_type: str diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py new file mode 100644 index 00000000..53f602ae --- /dev/null +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -0,0 +1,114 @@ +from dataclasses import dataclass +from typing import Any, Optional + +import pandas as pd +from botocore.exceptions import ClientError + +from backend.address2UPRN.scoring import get_uprn_candidates +from backend.utils.addressMatch import AddressMatch +from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 + +DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" + +_EXTRA_COLS = {"lexiscore", "lexirank"} + + +def _cell_to_str(v: Any) -> str: + if v is None or (isinstance(v, float) and pd.isna(v)): + return "" + s = str(v).replace("\xa0", " ") + # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". + # Treat that as missing so unambiguous_uprn truthiness checks work. + if s.lower() == "nan": + return "" + return s + + +def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: + kwargs = { + col.lower(): _cell_to_str(val) + for col, val in row.items() + if col.lower() not in _EXTRA_COLS + } + return HistoricEpc(**kwargs) + + +@dataclass(frozen=True) +class ScoredHistoricEpc: + record: HistoricEpc + lexiscore: float + lexirank: int + + +@dataclass +class HistoricEpcMatches: + user_address: str + postcode: str + matches: list[ScoredHistoricEpc] + + def top(self) -> Optional[ScoredHistoricEpc]: + return self.matches[0] if self.matches else None + + def top_n(self, k: int) -> list[ScoredHistoricEpc]: + return self.matches[:k] + + def unambiguous_uprn(self) -> Optional[str]: + top = self.top() + if top is None or top.lexiscore <= 0: + return None + rank1 = [m for m in self.matches if m.lexirank == top.lexirank] + uprns = {m.record.uprn for m in rank1 if m.record.uprn} + return next(iter(uprns)) if len(uprns) == 1 else None + + +def _sanitise_postcode(postcode: str) -> str: + cleaned = (postcode or "").upper().replace(" ", "") + if not cleaned: + raise ValueError("postcode must contain non-whitespace characters") + if not AddressMatch.is_valid_postcode(cleaned): + raise ValueError(f"postcode {cleaned!r} is not a valid UK postcode") + return cleaned + + +def match_addresses_for_postcode( + user_address: str, + postcode: str, + *, + s3_root: str = DEFAULT_S3_ROOT, + address_column: str = "ADDRESS", + uprn_column: str = "UPRN", +) -> HistoricEpcMatches: + if not user_address: + raise ValueError("user_address must be non-empty") + + pc = _sanitise_postcode(postcode) + bucket, root_prefix = parse_s3_uri(s3_root) + key = f"{root_prefix.rstrip('/')}/{pc}/data.csv.gz" + + try: + df = read_csv_gz_from_s3(bucket, key) + except ClientError as e: + if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): + raise FileNotFoundError( + f"No historic EPC data at s3://{bucket}/{key}" + ) from e + raise + + scored = get_uprn_candidates( + df, + user_address=user_address, + address_column=address_column, + uprn_column=uprn_column, + ) + + matches = [ + ScoredHistoricEpc( + record=_row_to_historic_epc(row), + lexiscore=float(row["lexiscore"]), + lexirank=int(row["lexirank"]), + ) + for _, row in scored.iterrows() + ] + + return HistoricEpcMatches(user_address=user_address, postcode=pc, matches=matches) diff --git a/datatypes/epc/domain/plan.md b/datatypes/epc/domain/plan.md deleted file mode 100644 index 45cc495b..00000000 --- a/datatypes/epc/domain/plan.md +++ /dev/null @@ -1,161 +0,0 @@ -# Historic EPC address-match service - -## Context - -ETL `backend/etl/etl_opendatacommunities/main.py` shards `certificates.csv` by sanitised postcode and uploads gzipped CSVs to `s3://retrofit-data-dev/historical_epc//data.csv.gz`. Need a pure-python lib that, given `(user_address, postcode)`, fetches the corresponding shard and scores every row against the user address using the same lexiscore as `address2UPRN` — but returning the full scored df (not a single UPRN), so callers can apply their own thresholding. - -Mirrors pattern in [backend/address2UPRN/main.py:111-147](backend/address2UPRN/main.py#L111-L147) (`get_uprn_candidates`) but reads from S3 historic CSV instead of the EPC live API. No Lambda, no script — lib only for now. - -## Approach - -Add a wrapper class `HistoricEpcMatches` and a function `match_addresses_for_postcode` to the existing domain file. Add a small gzip-CSV S3 helper to `utils/s3.py`. - -### 1. Add gzip-CSV S3 reader - -In [utils/s3.py](utils/s3.py) (after `read_dataframe_from_s3_parquet` ~line 167): - -```python -def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: - if not file_key.endswith(".csv.gz"): - raise ValueError("file_key must end with .csv.gz") - buf = read_io_from_s3(bucket_name, file_key) - return pd.read_csv(buf, compression="gzip", low_memory=False) -``` - -Reuses existing `read_io_from_s3` (line 105). Caller catches `botocore.exceptions.ClientError` for missing-key handling. - -### 2. Append matcher to domain module - -In [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — keep existing `HistoricEpc` dataclass intact, append: - -```python -from typing import Optional -import pandas as pd -from botocore.exceptions import ClientError - -from backend.utils.addressMatch import AddressMatch -from utils.s3 import read_csv_gz_from_s3 - - -@dataclass -class HistoricEpcMatches: - """Scored historic EPC rows for a single postcode.""" - user_address: str - postcode: str # sanitised - df: pd.DataFrame # has lexiscore + lexirank, sorted best-first - - def top(self) -> Optional[pd.Series]: - return None if self.df.empty else self.df.iloc[0] - - def top_n(self, k: int) -> pd.DataFrame: - return self.df.head(k) - - def unambiguous_uprn(self, uprn_column: str = "UPRN") -> Optional[str]: - if self.df.empty: - return None - top_rank = self.df["lexirank"].min() - uprns = ( - self.df.loc[self.df["lexirank"] == top_rank, uprn_column] - .dropna().astype(str).str.replace(r"\.0$", "", regex=True) - .unique() - ) - return uprns[0] if len(uprns) == 1 else None - - -def _sanitise_postcode(postcode: str) -> str: - if not postcode: - raise ValueError("postcode must be non-empty") - return postcode.upper().replace(" ", "") - - -def match_addresses_for_postcode( - user_address: str, - postcode: str, - *, - bucket: str = "retrofit-data-dev", - prefix: str = "historical_epc", - address_column: str = "ADDRESS", -) -> HistoricEpcMatches: - if not user_address: - raise ValueError("user_address must be non-empty") - - pc = _sanitise_postcode(postcode) - key = f"{prefix}/{pc}/data.csv.gz" - - try: - df = read_csv_gz_from_s3(bucket, key) - except ClientError as e: - if e.response.get("Error", {}).get("Code") in ("NoSuchKey", "404"): - raise FileNotFoundError( - f"No historic EPC data at s3://{bucket}/{key}" - ) from e - raise - - if address_column not in df.columns: - raise ValueError( - f"Missing address column {address_column!r} in {key}" - ) - - user_norm = AddressMatch.normalise_address(user_address) - df = df.copy() - df["lexiscore"] = df[address_column].fillna("").apply( - lambda x: AddressMatch.levenshtein(user_norm, x) - ) - df["lexirank"] = ( - df["lexiscore"].rank(method="dense", ascending=False).astype(int) - ) - df = df.sort_values(["lexirank", "lexiscore"], ascending=[True, False]).reset_index(drop=True) - - return HistoricEpcMatches(user_address=user_address, postcode=pc, df=df) -``` - -### Reuse notes -- `AddressMatch.normalise_address` + `AddressMatch.levenshtein` from [backend/utils/addressMatch.py](backend/utils/addressMatch.py) — same scoring as address2UPRN. -- Score column copy uses `.fillna("")` to defend against NaN in `ADDRESS`. -- Defaults match ETL output: bucket `retrofit-data-dev`, prefix `historical_epc`, column `ADDRESS` (uppercase). - -### 3. Tests - -New: [datatypes/epc/domain/tests/__init__.py](datatypes/epc/domain/tests/__init__.py) (empty) and [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py). - -Reuse existing fixture `datatypes/epc/schema/tests/fixtures/historic_epc.csv` — read it in-memory in tests; do NOT commit a `.csv.gz` fixture. Patch target: `datatypes.epc.domain.historic_epc.read_csv_gz_from_s3` (local binding, not `utils.s3.read_csv_gz_from_s3`). - -Cases: -1. `_sanitise_postcode("ab33 8al") == "AB338AL"`; empty raises. -2. Returned df has `lexiscore` + `lexirank` columns, row count preserved. -3. df sorted: `iloc[0]["lexirank"] == 1`, `lexiscore` monotone non-increasing. -4. S3 key built correctly: `"AB33 8AL"` → key `"historical_epc/AB338AL/data.csv.gz"` (spy on patched helper). -5. `ClientError` with code `NoSuchKey` → `FileNotFoundError`. -6. Exact-match address → `unambiguous_uprn()` returns that UPRN; ambiguous tie → `None`. -7. `top()` / `top_n(k)` shape checks. - -## Critical files -- [datatypes/epc/domain/historic_epc.py](datatypes/epc/domain/historic_epc.py) — append matcher -- [utils/s3.py](utils/s3.py) — add `read_csv_gz_from_s3` -- [datatypes/epc/domain/tests/test_historic_epc_match.py](datatypes/epc/domain/tests/test_historic_epc_match.py) — new - -## Out of scope -- Lambda handler / SQS wiring (deferred — lib only) -- Threshold logic (caller decides via wrapper helpers) -- Postcode validation via `postcodes.io` (`AddressMatch.is_valid_postcode` exists if needed later) -- Refactoring `sanitise(pd.Series)` in `etl_opendatacommunities/main.py` — separate concern - -## Verification -``` -cd /workspaces/model && pytest datatypes/epc/domain/tests/test_historic_epc_match.py -v -``` - -Sample real-S3 call (needs AWS creds): -```python -from datatypes.epc.domain.historic_epc import match_addresses_for_postcode -m = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") -print(m.df[["ADDRESS", "UPRN", "lexiscore", "lexirank"]].head()) -print(m.unambiguous_uprn()) -``` - -## Sequencing -1. Add `read_csv_gz_from_s3` to `utils/s3.py`. -2. Append matcher + wrapper to `datatypes/epc/domain/historic_epc.py`. -3. Add tests. - -Steps 2 & 3 depend on 1. No `__init__.py` re-exports needed. diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py new file mode 100644 index 00000000..c23846e1 --- /dev/null +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -0,0 +1,239 @@ +from unittest.mock import patch + +import numpy as np +import pandas as pd +import pytest +from botocore.exceptions import ClientError + +from datatypes.epc.domain import historic_epc_matching as matcher_mod +from datatypes.epc.domain.historic_epc_matching import ( + HistoricEpcMatches, + ScoredHistoricEpc, + _sanitise_postcode, + match_addresses_for_postcode, +) + + +# Columns required by the HistoricEpc dataclass (lower-cased CSV columns). +# The matcher only reads ADDRESS + UPRN to score; everything else is filled +# with "" but must be present for HistoricEpc(**kwargs) to construct. +_FULL_COLUMN_FIELDS = [ + "LMK_KEY", "ADDRESS1", "ADDRESS2", "ADDRESS3", "POSTCODE", + "BUILDING_REFERENCE_NUMBER", "CURRENT_ENERGY_RATING", "POTENTIAL_ENERGY_RATING", + "CURRENT_ENERGY_EFFICIENCY", "POTENTIAL_ENERGY_EFFICIENCY", "PROPERTY_TYPE", + "BUILT_FORM", "INSPECTION_DATE", "LOCAL_AUTHORITY", "CONSTITUENCY", "COUNTY", + "LODGEMENT_DATE", "TRANSACTION_TYPE", "ENVIRONMENT_IMPACT_CURRENT", + "ENVIRONMENT_IMPACT_POTENTIAL", "ENERGY_CONSUMPTION_CURRENT", + "ENERGY_CONSUMPTION_POTENTIAL", "CO2_EMISSIONS_CURRENT", + "CO2_EMISS_CURR_PER_FLOOR_AREA", "CO2_EMISSIONS_POTENTIAL", + "LIGHTING_COST_CURRENT", "LIGHTING_COST_POTENTIAL", "HEATING_COST_CURRENT", + "HEATING_COST_POTENTIAL", "HOT_WATER_COST_CURRENT", "HOT_WATER_COST_POTENTIAL", + "TOTAL_FLOOR_AREA", "ENERGY_TARIFF", "MAINS_GAS_FLAG", "FLOOR_LEVEL", + "FLAT_TOP_STOREY", "FLAT_STOREY_COUNT", "MAIN_HEATING_CONTROLS", + "MULTI_GLAZE_PROPORTION", "GLAZED_TYPE", "GLAZED_AREA", "EXTENSION_COUNT", + "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "LOW_ENERGY_LIGHTING", + "NUMBER_OPEN_FIREPLACES", "HOTWATER_DESCRIPTION", "HOT_WATER_ENERGY_EFF", + "HOT_WATER_ENV_EFF", "FLOOR_DESCRIPTION", "FLOOR_ENERGY_EFF", "FLOOR_ENV_EFF", + "WINDOWS_DESCRIPTION", "WINDOWS_ENERGY_EFF", "WINDOWS_ENV_EFF", + "WALLS_DESCRIPTION", "WALLS_ENERGY_EFF", "WALLS_ENV_EFF", + "SECONDHEAT_DESCRIPTION", "SHEATING_ENERGY_EFF", "SHEATING_ENV_EFF", + "ROOF_DESCRIPTION", "ROOF_ENERGY_EFF", "ROOF_ENV_EFF", "MAINHEAT_DESCRIPTION", + "MAINHEAT_ENERGY_EFF", "MAINHEAT_ENV_EFF", "MAINHEATCONT_DESCRIPTION", + "MAINHEATC_ENERGY_EFF", "MAINHEATC_ENV_EFF", "LIGHTING_DESCRIPTION", + "LIGHTING_ENERGY_EFF", "LIGHTING_ENV_EFF", "MAIN_FUEL", "WIND_TURBINE_COUNT", + "HEAT_LOSS_CORRIDOR", "UNHEATED_CORRIDOR_LENGTH", "FLOOR_HEIGHT", + "PHOTO_SUPPLY", "SOLAR_WATER_HEATING_FLAG", "MECHANICAL_VENTILATION", + "ADDRESS", "LOCAL_AUTHORITY_LABEL", "CONSTITUENCY_LABEL", "POSTTOWN", + "CONSTRUCTION_AGE_BAND", "LODGEMENT_DATETIME", "TENURE", + "FIXED_LIGHTING_OUTLETS_COUNT", "LOW_ENERGY_FIXED_LIGHT_COUNT", "UPRN", + "UPRN_SOURCE", "REPORT_TYPE", +] + + +def _row(address: str, uprn) -> dict: + row = {col: "" for col in _FULL_COLUMN_FIELDS} + row["ADDRESS"] = address + row["UPRN"] = uprn + return row + + +def _build_df(rows: list[dict]) -> pd.DataFrame: + return pd.DataFrame(rows, columns=_FULL_COLUMN_FIELDS) + + +@pytest.fixture +def patch_postcode_valid(): + with patch.object(matcher_mod.AddressMatch, "is_valid_postcode", return_value=True) as m: + yield m + + +@pytest.fixture +def patch_read(): + with patch.object(matcher_mod, "read_csv_gz_from_s3") as m: + yield m + + +# ---------- _sanitise_postcode ---------- + + +class TestSanitisePostcode: + + def test_uppercases_and_strips_spaces(self, patch_postcode_valid): + assert _sanitise_postcode("ab33 8al") == "AB338AL" + + def test_empty_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="non-whitespace"): + _sanitise_postcode("") + + def test_whitespace_only_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="non-whitespace"): + _sanitise_postcode(" ") + + def test_invalid_postcode_raises(self): + with patch.object( + matcher_mod.AddressMatch, "is_valid_postcode", return_value=False + ): + with pytest.raises(ValueError, match="not a valid UK postcode"): + _sanitise_postcode("NONSENSE") + + +# ---------- match_addresses_for_postcode ---------- + + +class TestMatchAddressesForPostcode: + + def test_preserves_row_count_including_zero_score_rows( + self, patch_read, patch_postcode_valid + ): + # Disjoint number sets => hard zero. Still kept in matches. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("999 SOMEWHERE ELSE", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert isinstance(result, HistoricEpcMatches) + assert len(result.matches) == 2 + + def test_top_has_lexirank_one_and_lexiscore_monotone( + self, patch_read, patch_postcode_valid + ): + patch_read.return_value = _build_df([ + _row("48 GORDON ROAD", "200"), # near miss + _row("47 GORDON ROAD", "100"), # exact (after normalisation) + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.top().lexirank == 1 + scores = [m.lexiscore for m in result.matches] + assert scores == sorted(scores, reverse=True) + + def test_s3_key_built_from_default_root(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([_row("47 GORDON ROAD", "100")]) + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + patch_read.assert_called_once_with( + "retrofit-data-dev", "historical_epc/AB338AL/data.csv.gz" + ) + + def test_s3_key_respects_custom_root_with_trailing_slash( + self, patch_read, patch_postcode_valid + ): + patch_read.return_value = _build_df([_row("47 GORDON ROAD", "100")]) + match_addresses_for_postcode( + "47 Gordon Road", + "AB33 8AL", + s3_root="s3://my-bucket/some/prefix/", + ) + patch_read.assert_called_once_with( + "my-bucket", "some/prefix/AB338AL/data.csv.gz" + ) + + def test_no_such_key_translates_to_filenotfound( + self, patch_read, patch_postcode_valid + ): + patch_read.side_effect = ClientError( + {"Error": {"Code": "NoSuchKey", "Message": "missing"}}, "GetObject" + ) + with pytest.raises(FileNotFoundError): + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + + def test_other_client_error_propagates(self, patch_read, patch_postcode_valid): + patch_read.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "nope"}}, "GetObject" + ) + with pytest.raises(ClientError): + match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + + def test_empty_user_address_raises(self, patch_postcode_valid): + with pytest.raises(ValueError, match="user_address"): + match_addresses_for_postcode("", "AB33 8AL") + + +# ---------- unambiguous_uprn ---------- + + +class TestUnambiguousUprn: + + def test_exact_match_returns_uprn(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.unambiguous_uprn() == "100" + + def test_ambiguous_tie_returns_none(self, patch_read, patch_postcode_valid): + # Two duplicate addresses with different UPRNs share rank-1. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("47 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert result.unambiguous_uprn() is None + + def test_all_zero_score_returns_none_even_when_uprn_unique( + self, patch_read, patch_postcode_valid + ): + # User address has building number 47; no row has 47 -> all hard-zero. + patch_read.return_value = _build_df([ + _row("999 ELSEWHERE", "100"), + _row("888 ELSEWHERE", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + assert all(m.lexiscore == 0.0 for m in result.matches) + assert result.unambiguous_uprn() is None + + def test_nan_uprn_becomes_empty_string_not_nan( + self, patch_read, patch_postcode_valid + ): + # Use a real NaN in the UPRN cell. + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", np.nan), + _row("48 GORDON ROAD", "200"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + top = result.top() + # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), + # so unambiguous_uprn's truthiness check correctly drops the row. + assert top.record.uprn == "" + + +# ---------- top / top_n ---------- + + +class TestTopHelpers: + + def test_top_n_returns_first_k(self, patch_read, patch_postcode_valid): + patch_read.return_value = _build_df([ + _row("47 GORDON ROAD", "100"), + _row("48 GORDON ROAD", "200"), + _row("49 GORDON ROAD", "300"), + ]) + result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") + top2 = result.top_n(2) + assert len(top2) == 2 + assert all(isinstance(m, ScoredHistoricEpc) for m in top2) + + def test_top_on_empty_matches_returns_none(self): + empty = HistoricEpcMatches(user_address="x", postcode="AB338AL", matches=[]) + assert empty.top() is None + assert empty.top_n(5) == [] + assert empty.unambiguous_uprn() is None diff --git a/datatypes/epc/loaders/historic_epc.py b/datatypes/epc/loaders/historic_epc.py index 7b563315..a4757d23 100644 --- a/datatypes/epc/loaders/historic_epc.py +++ b/datatypes/epc/loaders/historic_epc.py @@ -1,6 +1,6 @@ import csv -from datatypes.epc.schema.historic_epc import HistoricEpc +from datatypes.epc.domain.historic_epc import HistoricEpc def _normalise(value: str | None) -> str: diff --git a/datatypes/epc/schema/historic_epc.py b/datatypes/epc/schema/historic_epc.py deleted file mode 100644 index f64ab8c4..00000000 --- a/datatypes/epc/schema/historic_epc.py +++ /dev/null @@ -1,98 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class HistoricEpc: - lmk_key: str - address1: str - address2: str - address3: str - postcode: str - building_reference_number: str - current_energy_rating: str - potential_energy_rating: str - current_energy_efficiency: str - potential_energy_efficiency: str - property_type: str - built_form: str - inspection_date: str - local_authority: str - constituency: str - county: str - lodgement_date: str - transaction_type: str - environment_impact_current: str - environment_impact_potential: str - energy_consumption_current: str - energy_consumption_potential: str - co2_emissions_current: str - co2_emiss_curr_per_floor_area: str - co2_emissions_potential: str - lighting_cost_current: str - lighting_cost_potential: str - heating_cost_current: str - heating_cost_potential: str - hot_water_cost_current: str - hot_water_cost_potential: str - total_floor_area: str - energy_tariff: str - mains_gas_flag: str - floor_level: str - flat_top_storey: str - flat_storey_count: str - main_heating_controls: str - multi_glaze_proportion: str - glazed_type: str - glazed_area: str - extension_count: str - number_habitable_rooms: str - number_heated_rooms: str - low_energy_lighting: str - number_open_fireplaces: str - hotwater_description: str - hot_water_energy_eff: str - hot_water_env_eff: str - floor_description: str - floor_energy_eff: str - floor_env_eff: str - windows_description: str - windows_energy_eff: str - windows_env_eff: str - walls_description: str - walls_energy_eff: str - walls_env_eff: str - secondheat_description: str - sheating_energy_eff: str - sheating_env_eff: str - roof_description: str - roof_energy_eff: str - roof_env_eff: str - mainheat_description: str - mainheat_energy_eff: str - mainheat_env_eff: str - mainheatcont_description: str - mainheatc_energy_eff: str - mainheatc_env_eff: str - lighting_description: str - lighting_energy_eff: str - lighting_env_eff: str - main_fuel: str - wind_turbine_count: str - heat_loss_corridor: str - unheated_corridor_length: str - floor_height: str - photo_supply: str - solar_water_heating_flag: str - mechanical_ventilation: str - address: str - local_authority_label: str - constituency_label: str - posttown: str - construction_age_band: str - lodgement_datetime: str - tenure: str - fixed_lighting_outlets_count: str - low_energy_fixed_light_count: str - uprn: str - uprn_source: str - report_type: str diff --git a/datatypes/epc/schema/tests/test_historic_epc_loading.py b/datatypes/epc/schema/tests/test_historic_epc_loading.py index 2170a8a6..a42f383e 100644 --- a/datatypes/epc/schema/tests/test_historic_epc_loading.py +++ b/datatypes/epc/schema/tests/test_historic_epc_loading.py @@ -3,7 +3,7 @@ import os import pytest from datatypes.epc.loaders.historic_epc import read_historic_epc_csv -from datatypes.epc.schema.historic_epc import HistoricEpc +from datatypes.epc.domain.historic_epc import HistoricEpc FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures") diff --git a/scripts/historic_epc_demo.py b/scripts/historic_epc_demo.py new file mode 100644 index 00000000..b47c3a3c --- /dev/null +++ b/scripts/historic_epc_demo.py @@ -0,0 +1,47 @@ +"""Demo: look up historic EPC records for an address + postcode. + +Reads the gzipped CSV at + s3://retrofit-data-dev/historical_epc//data.csv.gz +scores rows against the user-provided address, and prints the top matches. + +Usage: + python -m scripts.historic_epc_demo "47 Gordon Road" "AB33 8AL" + python -m scripts.historic_epc_demo # uses defaults below +""" + +import sys + +from datatypes.epc.domain.historic_epc_matching import match_addresses_for_postcode + + +def main(user_address: str, postcode: str) -> None: + print(f"Looking up: {user_address!r} @ {postcode!r}\n") + + result = match_addresses_for_postcode(user_address, postcode) + + print(f"Found {len(result.matches)} candidate row(s).\n") + + print("Top 3 matches:") + for m in result.top_n(3): + print( + f" rank={m.lexirank} score={m.lexiscore:.3f} " + f"uprn={m.record.uprn or '(none)':<14} {m.record.address}" + ) + + print() + uprn = result.unambiguous_uprn() + if uprn: + print(f"Unambiguous UPRN: {uprn}") + else: + print("No unambiguous UPRN (zero-score, tie, or empty result).") + + +if __name__ == "__main__": + args = sys.argv[1:] + if len(args) == 2: + main(args[0], args[1]) + elif len(args) == 0: + main("47 Gordon Road", "AB33 8AL") + else: + print(__doc__) + sys.exit(2) diff --git a/utils/s3.py b/utils/s3.py index 930e2e15..a28f074e 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -167,6 +167,21 @@ def read_dataframe_from_s3_parquet(bucket_name, file_key): return df +def read_csv_gz_from_s3(bucket_name: str, file_key: str) -> pd.DataFrame: + """ + Read a gzipped CSV from S3 into a pandas DataFrame. + + :param bucket_name: Name of the S3 bucket. + :param file_key: Key of the file (must end in .csv.gz). + :return: A pandas DataFrame. + """ + if not file_key.endswith(".csv.gz"): + raise ValueError("file_key must end with .csv.gz") + + buffer = read_io_from_s3(bucket_name=bucket_name, file_key=file_key) + return pd.read_csv(buffer, compression="gzip", low_memory=False) + + def save_csv_to_s3(dataframe, bucket_name, file_name): """ Save a Pandas DataFrame to a CSV file in an S3 bucket. From 7ef5dc49223676cc833e7deead3e8dd2339c178e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 8 May 2026 15:06:41 +0000 Subject: [PATCH 10/13] update csv --- backend/etl/etl_opendatacommunities/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md index bf16ba89..65441caf 100644 --- a/backend/etl/etl_opendatacommunities/README.md +++ b/backend/etl/etl_opendatacommunities/README.md @@ -1,6 +1,6 @@ This website https://epc.opendatacommunities.org/ has closed down on 30th May 2026 -So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/epc_opendatacommunities/master_backup/ ) +So we downloaded the data and moved everything to S3 ( s3://retrofit-data-dev/histroical_epc/0_master_backup/ ) This scripts assumes the following: @@ -11,4 +11,4 @@ The script funciton is: 1) reads csv for all data, seperate each iteration by postcode 2) compresses the csv and save it in the location -2) only gets the postcode data, compresses and uploads to s3 -> location s3://retrofit-data-dev/epc_opendatacommunities//compressed data \ No newline at end of file +3) location s3://retrofit-data-dev/epc_opendatacommunities//compressed data.csv \ No newline at end of file From fb758b76bf2dcecbed486b569b9fa5e345a85ddc Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:37:44 +0000 Subject: [PATCH 11/13] changed to utils --- datatypes/epc/domain/historic_epc_matching.py | 16 +++------------- .../domain/tests/test_historic_epc_matching.py | 2 +- utils/pandas_utils.py | 14 ++++++++++++++ utils/s3.py | 2 -- 4 files changed, 18 insertions(+), 16 deletions(-) create mode 100644 utils/pandas_utils.py diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 53f602ae..2eb590e8 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import pandas as pd from botocore.exceptions import ClientError @@ -7,6 +7,7 @@ from botocore.exceptions import ClientError from backend.address2UPRN.scoring import get_uprn_candidates from backend.utils.addressMatch import AddressMatch from datatypes.epc.domain.historic_epc import HistoricEpc +from utils.pandas_utils import pandas_cell_to_str from utils.s3 import parse_s3_uri, read_csv_gz_from_s3 DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" @@ -14,20 +15,9 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _cell_to_str(v: Any) -> str: - if v is None or (isinstance(v, float) and pd.isna(v)): - return "" - s = str(v).replace("\xa0", " ") - # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". - # Treat that as missing so unambiguous_uprn truthiness checks work. - if s.lower() == "nan": - return "" - return s - - def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: kwargs = { - col.lower(): _cell_to_str(val) + col.lower(): pandas_cell_to_str(val) for col, val in row.items() if col.lower() not in _EXTRA_COLS } diff --git a/datatypes/epc/domain/tests/test_historic_epc_matching.py b/datatypes/epc/domain/tests/test_historic_epc_matching.py index c23846e1..1c3ee6d4 100644 --- a/datatypes/epc/domain/tests/test_historic_epc_matching.py +++ b/datatypes/epc/domain/tests/test_historic_epc_matching.py @@ -211,7 +211,7 @@ class TestUnambiguousUprn: ]) result = match_addresses_for_postcode("47 Gordon Road", "AB33 8AL") top = result.top() - # _cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), + # pandas_cell_to_str must turn NaN/"nan" into "" (not the literal string "nan"), # so unambiguous_uprn's truthiness check correctly drops the row. assert top.record.uprn == "" diff --git a/utils/pandas_utils.py b/utils/pandas_utils.py new file mode 100644 index 00000000..b32cde10 --- /dev/null +++ b/utils/pandas_utils.py @@ -0,0 +1,14 @@ +from typing import Any + +import pandas as pd + + +def pandas_cell_to_str(v: Any) -> str: + if v is None or (isinstance(v, float) and pd.isna(v)): + return "" + s = str(v).replace("\xa0", " ") + # get_uprn_candidates runs .astype(str) on UPRN, turning NaN into "nan". + # Treat that as missing so unambiguous_uprn truthiness checks work. + if s.lower() == "nan": + return "" + return s diff --git a/utils/s3.py b/utils/s3.py index a28f074e..13d272e7 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -6,8 +6,6 @@ from io import BytesIO, StringIO from urllib.parse import unquote from utils.logger import setup_logger from botocore.exceptions import NoCredentialsError, PartialCredentialsError -from typing import Any - logger = setup_logger() From dccb35c2bc05f613bb767958b2a0ea7e517433e8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:44:55 +0000 Subject: [PATCH 12/13] fixed s3 location --- backend/etl/etl_opendatacommunities/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/etl/etl_opendatacommunities/README.md b/backend/etl/etl_opendatacommunities/README.md index 65441caf..728ac468 100644 --- a/backend/etl/etl_opendatacommunities/README.md +++ b/backend/etl/etl_opendatacommunities/README.md @@ -11,4 +11,4 @@ The script funciton is: 1) reads csv for all data, seperate each iteration by postcode 2) compresses the csv and save it in the location -3) location s3://retrofit-data-dev/epc_opendatacommunities//compressed data.csv \ No newline at end of file +3) location s3://retrofit-data-dev/historical_epc//compressed data.csv \ No newline at end of file From bf91722f30699124af4b36c23efcb108d09eeab8 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 11 May 2026 08:45:26 +0000 Subject: [PATCH 13/13] renamed a function to be self commenting --- datatypes/epc/domain/historic_epc_matching.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datatypes/epc/domain/historic_epc_matching.py b/datatypes/epc/domain/historic_epc_matching.py index 2eb590e8..95ca9d9f 100644 --- a/datatypes/epc/domain/historic_epc_matching.py +++ b/datatypes/epc/domain/historic_epc_matching.py @@ -15,7 +15,7 @@ DEFAULT_S3_ROOT = "s3://retrofit-data-dev/historical_epc" _EXTRA_COLS = {"lexiscore", "lexirank"} -def _row_to_historic_epc(row: pd.Series) -> HistoricEpc: +def _map_historic_epc_pandas_row_to_domain(row: pd.Series) -> HistoricEpc: kwargs = { col.lower(): pandas_cell_to_str(val) for col, val in row.items() @@ -94,7 +94,7 @@ def match_addresses_for_postcode( matches = [ ScoredHistoricEpc( - record=_row_to_historic_epc(row), + record=_map_historic_epc_pandas_row_to_domain(row), lexiscore=float(row["lexiscore"]), lexirank=int(row["lexirank"]), )