diff --git a/.devcontainer/asset_list/Dockerfile b/.devcontainer/asset_list/Dockerfile new file mode 100644 index 00000000..512ab109 --- /dev/null +++ b/.devcontainer/asset_list/Dockerfile @@ -0,0 +1,39 @@ +FROM python:3.11.10-bullseye + + +ARG USER=vscode +ARG DEBIAN_FRONTEND=noninteractive + +# 1) Toolchain + utilities for building libpostal +RUN apt-get update && apt-get install -y --no-install-recommends \ + sudo jq vim curl git ca-certificates \ + build-essential pkg-config automake autoconf libtool \ + && rm -rf /var/lib/apt/lists/* + +# # 2) Build and install libpostal from source +RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \ + && cd /tmp/libpostal \ + && ./bootstrap.sh \ + && ./configure --datadir=/usr/local/share/libpostal \ + && make -j"$(nproc)" \ + && make install \ + && ldconfig \ + && rm -rf /tmp/libpostal + +# 3) Create the user and grant sudo privileges +RUN useradd -m -s /usr/bin/bash ${USER} \ + && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ + && chmod 0440 /etc/sudoers.d/${USER} + +# # 4) Python deps - if you want to run assest list +ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 +ADD asset_list/requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN pip install -r requirements.txt +# 5) Workdir +WORKDIR /workspaces/model + +# 6) Make Python find your package +# Add project root to PYTHONPATH for all processes +ENV PYTHONPATH=/workspaces/model:${PYTHONPATH} diff --git a/.devcontainer/devcontainer.json b/.devcontainer/asset_list/devcontainer.json similarity index 95% rename from .devcontainer/devcontainer.json rename to .devcontainer/asset_list/devcontainer.json index 5e23ae0d..4834d559 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -1,7 +1,7 @@ { - "name": "Basic Python", + "name": "SAL ENV", "dockerComposeFile": "docker-compose.yml", - "service": "model", + "service": "model-sal", "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", "postStartCommand": "bash .devcontainer/post-install.sh", diff --git a/.devcontainer/docker-compose.yml b/.devcontainer/asset_list/docker-compose.yml similarity index 95% rename from .devcontainer/docker-compose.yml rename to .devcontainer/asset_list/docker-compose.yml index 7f60d34d..67b27444 100644 --- a/.devcontainer/docker-compose.yml +++ b/.devcontainer/asset_list/docker-compose.yml @@ -1,7 +1,7 @@ version: '3.8' services: - model: + model-sal: user: "${UID}:${GID}" build: context: .. diff --git a/.devcontainer/post-install.sh b/.devcontainer/asset_list/post-install.sh similarity index 98% rename from .devcontainer/post-install.sh rename to .devcontainer/asset_list/post-install.sh index dc6da006..48fbfde1 100644 --- a/.devcontainer/post-install.sh +++ b/.devcontainer/asset_list/post-install.sh @@ -11,4 +11,4 @@ if os.path.exists(env_path): print("✔ Loaded .env into Jupyter kernel") else: print("⚠ No .env file found to load") -EOF \ No newline at end of file +EOF diff --git a/.devcontainer/asset_list/requirements.txt b/.devcontainer/asset_list/requirements.txt new file mode 100644 index 00000000..cfab95ec --- /dev/null +++ b/.devcontainer/asset_list/requirements.txt @@ -0,0 +1,23 @@ +fastapi==0.115.2 +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +python-jose==3.3.0 +cryptography==43.0.3 +mangum==0.19.0 +# AWS +boto3==1.35.44 +# Data +openpyxl==3.1.2 +# Basic +pytz +uvicorn[standard] +# Testing +pytest==9.0.2 +pytest-cov==7.0.0 +ipykernel>=6.25,<7 +pydantic-settings<2 +pyyaml>=6.0.1 +pydantic>=1.10.7,<2 +sqlmodel +# Formatting +black==26.1.0 diff --git a/.devcontainer/Dockerfile b/.devcontainer/backend/Dockerfile similarity index 96% rename from .devcontainer/Dockerfile rename to .devcontainer/backend/Dockerfile index ccfb55b6..4c5d16f5 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/backend/Dockerfile @@ -34,7 +34,7 @@ RUN useradd -m -s /usr/bin/bash ${USER} \ ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 ADD backend/engine/requirements.txt requirements1.txt ADD backend/app/requirements/requirements.txt requirements2.txt -ADD .devcontainer/requirements.txt requirements3.txt +ADD .devcontainer/backend/requirements.txt requirements3.txt RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt RUN pip install -r requirements.txt diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json new file mode 100644 index 00000000..c672b1bf --- /dev/null +++ b/.devcontainer/backend/devcontainer.json @@ -0,0 +1,40 @@ +{ + "name": "Backend Model Env", + "dockerComposeFile": "docker-compose.yml", + "service": "model-backend", + "remoteUser": "vscode", + "workspaceFolder": "/workspaces/model", + "postStartCommand": "bash .devcontainer/backend/post-install.sh", + "mounts": [ + "source=${localEnv:HOME},target=/workspaces/home,type=bind" + ], + "customizations": { + "vscode": { + "extensions": [ + "ms-python.python", + "ms-toolsai.jupyter", + "mechatroner.rainbow-csv", + "ms-toolsai.datawrangler", + "lindacong.vscode-book-reader", + "4ops.terraform", + "fabiospampinato.vscode-todo-plus", + "jgclark.vscode-todo-highlight", + "corentinartaud.pdfpreview", + "ms-python.vscode-python-envs", + "ms-python.black-formatter", + "waderyan.gitblame" + ], + "settings": { + "files.defaultWorkspace": "/workspaces/model", + "[python]": { + "editor.defaultFormatter": "ms-python.black-formatter", + "editor.formatOnSave": true + }, + "python.formatting.provider": "none" + } + } + }, + "containerEnv": { + "PYTHONFLAGS": "-Xfrozen_modules=off" + } +} diff --git a/.devcontainer/backend/docker-compose.yml b/.devcontainer/backend/docker-compose.yml new file mode 100644 index 00000000..75526e79 --- /dev/null +++ b/.devcontainer/backend/docker-compose.yml @@ -0,0 +1,18 @@ +version: '3.8' + +services: + model-backend: + user: "${UID}:${GID}" + build: + context: ../.. + dockerfile: .devcontainer/backend/Dockerfile + command: sleep infinity + volumes: + - ../../:/workspaces/model + networks: + - model-net + +networks: + model-net: + driver: bridge + diff --git a/.devcontainer/backend/post-install.sh b/.devcontainer/backend/post-install.sh new file mode 100644 index 00000000..48fbfde1 --- /dev/null +++ b/.devcontainer/backend/post-install.sh @@ -0,0 +1,14 @@ +mkdir -p ~/.ipython/profile_default/startup + +cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py +from dotenv import load_dotenv +import os + +# Adjust path as needed +env_path = "/workspaces/model/backend/.env" +if os.path.exists(env_path): + load_dotenv(env_path) + print("✔ Loaded .env into Jupyter kernel") +else: + print("⚠ No .env file found to load") +EOF diff --git a/.devcontainer/requirements.txt b/.devcontainer/backend/requirements.txt similarity index 96% rename from .devcontainer/requirements.txt rename to .devcontainer/backend/requirements.txt index 5e7753a6..9562aa6a 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/backend/requirements.txt @@ -1,4 +1,4 @@ -# fastapi + fastapi==0.115.2 sqlalchemy==2.0.36 pydantic-settings==2.6.0 diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 95155c86..14d5a06f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -2,6 +2,12 @@ name: Run unit tests on: pull_request: + branches: + - "**" + push: + branches: + - "**" + jobs: test: diff --git a/.vscode/settings.json b/.vscode/settings.json index 88c2ae2d..3d4c6b42 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -9,9 +9,12 @@ "path": "/bin/bash" } }, +<<<<<<< HEAD +======= "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "python.testing.pytestArgs": ["-s", "-q", "--no-cov"] +>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d // Hot reload setting that needs to be in user settings // "jupyter.runStartupCommands": [ diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index 940c723a..ea4d8b34 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -34,7 +34,8 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") + class DataRemapper: @@ -1159,13 +1160,17 @@ class AssetList: ), axis=1 ) + + col = self.EPC_API_DATA_NAMES["roof-description"] self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( - lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + lambda x: RoofAttributes(description=x[col]).process()[ "insulation_thickness"] if not pd.isnull( - x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, + x[col]) else None, axis=1 ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = ( self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].str.replace("+", "") ) diff --git a/asset_list/DataMapper.py b/asset_list/DataMapper.py index ac1b8db3..0751a7cf 100644 --- a/asset_list/DataMapper.py +++ b/asset_list/DataMapper.py @@ -1,5 +1,5 @@ # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") class DataRemapper: diff --git a/asset_list/__init__.py b/asset_list/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/asset_list/app.py b/asset_list/app.py index 01906c5f..9907a609 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -12,9 +12,8 @@ from asset_list.utils import get_data from dotenv import load_dotenv from backend.SearchEpc import SearchEpc - load_dotenv(dotenv_path="backend/.env") -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=") def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"): @@ -58,6 +57,10 @@ def app(): EPC recommendations Property UPRN """ +<<<<<<< HEAD + data_folder = ("/workspaces/model/asset_list") + data_filename = "assets.xlsx" +======= data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hackney" data_filename = "Domna SHF Wave 3 (3).xlsx" @@ -96,22 +99,23 @@ def app(): data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " "Project/data_validation") data_filename = "to_standardise_uprns.xlsx" +>>>>>>> 3874da6177cbcc37f7a488bec0a06e387906653c sheet_name = "Sheet1" postcode_column = 'Postcode' - address1_column = "Address 1" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["Address 1", "Address 2", "Address 3"] + address1_column = None + address1_method = 'house_number_extraction' + fulladdress_column = 'Address' + address_cols_to_concat = None missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Type" - landlord_built_form = "Attachment" + landlord_property_type = None + landlord_built_form = None landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Org Ref" + landlord_property_id = "LLUPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -127,40 +131,6 @@ def app(): asset_list_header = 0 landlord_block_reference = None - # Lambeth: - # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th" - # data_filename = "lambeth_sw2_leigham court estate.xlsx" - # sheet_name = "Sheet1" - # postcode_column = 'Postcode' - # address1_column = "Address" - # address1_method = None - # fulladdress_column = None - # address_cols_to_concat = ["Address"] - # missing_postcodes_method = None - # landlord_year_built = None - # landlord_os_uprn = None - # landlord_property_type = None - # landlord_built_form = None - # landlord_wall_construction = None - # landlord_roof_construction = None - # landlord_heating_system = None - # landlord_existing_pv = None - # landlord_property_id = "row_id" - # landlord_sap = None - # outcomes_filename = None - # outcomes_sheetname = None - # outcomes_postcode = None - # outcomes_houseno = None - # outcomes_id = None - # outcomes_address = None - # master_filepaths = [] - # master_id_colnames = [] - # master_to_asset_list_filepath = None - # phase = False - # ecosurv_landlords = None - # asset_list_header = 0 - # landlord_block_reference = None - # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -439,6 +409,10 @@ def app(): ) asset_list.merge_data(epc_df) + # asset_list.standardised_asset_list = asset_list.standardised_asset_list[ + # asset_list.standardised_asset_list["domna_full_address"] + # != "120 Airdrie Crescent, Burnley, Lancashire" + # ] asset_list.extract_attributes() asset_list.identify_worktypes() diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index b68706be..dc7e572e 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,7 +1,6 @@ postal pandas usaddress -pydantic-settings==2.6.0 epc-api-python==1.0.2 thefuzz boto3 @@ -10,6 +9,5 @@ openai>=1.3.5 tiktoken msgpack beautifulsoup4 -pydantic>=1.10.7 typing-extensions>=4.5.0 -requests>=2.28.2 +requests>=2.28.2 \ No newline at end of file diff --git a/backend/address2UPRN/README.md b/backend/address2UPRN/README.md new file mode 100644 index 00000000..b4876340 --- /dev/null +++ b/backend/address2UPRN/README.md @@ -0,0 +1,20 @@ +We have list of address as input. + +It'll come in batches of the same post code and from then we want to somehow convert that into UPRN + +if this lambda/function can do that we'll be speeding ahead + + +Energy Performance Information: https://epc.opendatacommunities.org/ + +guidance page: https://epc.opendatacommunities.org/docs/guidance#field_domestic_LMK_KEY + +Example of past khalims code that he wrote some tests for: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/tests/test_search_epc.py#L11 + + +Example of EPC search: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/SearchEpc.py#L118 + + + +Khalim has made a python package to help scrape data: https://github.com/KhalimCK/epc-api-python + diff --git a/backend/address2UPRN/__init__.py b/backend/address2UPRN/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py new file mode 100644 index 00000000..58b25d74 --- /dev/null +++ b/backend/address2UPRN/main.py @@ -0,0 +1,520 @@ +from epc_api.client import EpcClient +import os +from urllib.parse import urlencode +import pandas as pd +from difflib import SequenceMatcher +from tqdm import tqdm +from utils.logger import setup_logger + +logger = setup_logger() + +import re + +EPC_AUTH_TOKEN = os.getenv( + "EPC_AUTH_TOKEN", +) + +import re +from difflib import SequenceMatcher +from typing import Set + + +def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. + + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + + def extract_numbers(s: str) -> Set[str]: + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> Set[str]: + return set(s.split()) + + a_norm = normalise_address(a) + b_norm = normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and not nums_b: + return 0.0 + + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): + return 0.0 + + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- + toks_a = tokenise(a_norm) + toks_b = tokenise(b_norm) + + if not toks_a or not toks_b: + token_score = 0.0 + else: + token_score = len(toks_a & toks_b) / len(toks_a | toks_b) + + # --- character similarity (soft signal) --- + char_score = SequenceMatcher(None, a_norm, b_norm).ratio() + + # --- weighted blend --- + return round( + 0.65 * token_score + 0.35 * char_score, + 4, + ) + + +def normalise_address(s: str) -> str: + """ + Canonical UK-focused address normalisation. + + - Lowercases + - Removes punctuation (keeps / for flats) + - Normalises whitespace + - Applies synonym compression at token level + """ + + if not s: + return "" + + ADDRESS_SYNONYMS = { + # street types + "rd": "road", + "rd.": "road", + "st": "street", + "st.": "street", + "ave": "avenue", + "ave.": "avenue", + "ln": "lane", + "ln.": "lane", + "cres": "crescent", + "ct": "court", + "dr": "drive", + # flats / units + "apt": "flat", + "apartment": "flat", + "unit": "flat", + "ste": "suite", + # numbering noise + "no": "", + "no.": "", + } + # 1. lowercase + s = s.lower() + + # 1.5 split digit-letter suffixes + s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) + + # 2. remove punctuation except / + s = re.sub(r"[^\w\s/]", " ", s) + + # 3. normalise whitespace + s = re.sub(r"\s+", " ", s).strip() + + # 4. tokenise + synonym normalisation + tokens = [] + for tok in s.split(): + replacement = ADDRESS_SYNONYMS.get(tok, tok) + if replacement: + tokens.append(replacement) + + return " ".join(tokens) + + +def score_addresses( + df: pd.DataFrame, + user_address: str, + column: str = "address", +) -> pd.Series: + if column not in df.columns: + raise ValueError(f"Missing column: {column}") + + return df[column].apply(lambda x: levenshtein(user_address, x)) + + +def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): + """ + Recursively fetch EPC data by postcode. + If results hit the size limit, retry with double size up to max_attempts. + """ + client = EpcClient(auth_token=EPC_AUTH_TOKEN) + + url = os.path.join(client.domestic.host, "search") + + if size: + url += "?" + urlencode({"size": size}) + + search_resp = client.domestic.call( + url=url, + method="get", + params={"postcode": postcode}, + ) + + results_df = pd.DataFrame(search_resp["rows"], columns=search_resp["column-names"]) + + row_count = len(results_df) + + # If we hit the size limit, there *may* be more results + if row_count == size: + print( + f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. " + f"Attempt {attempt}/{max_attempts}." + ) + + if attempt < max_attempts: + print(f"🔁 Retrying with size={size * 2}") + return get_epc_data_with_postcode( + postcode=postcode, + size=size * 2, + attempt=attempt + 1, + max_attempts=max_attempts, + ) + else: + print( + "🚨 Max attempts reached. Results may be truncated. " + "(Please do a manual review by the tech team.)" + ) + + return results_df + + +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """ + Returns True if all non-null UPRNs in df match the given uprn. + Returns False otherwise. + """ + + if column not in df.columns: + return False + + # Drop nulls and normalise to string + uprns = df[column].dropna().astype(str).str.strip().unique() + + # No valid UPRNs to compare + if len(uprns) == 0: + return False + + # Exactly one unique UPRN and it matches + return len(uprns) == 1 and uprns[0] == str(uprn) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + + Returns a DataFrame sorted by descending lexiscore. + DOES NOT choose or return a UPRN. + """ + + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + + user_norm = normalise_address(user_address) + + out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) + + # Normalise UPRN to string + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) + + # Rank: 1 = best match + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) + + return out.sort_values( + ["lexirank", "lexiscore"], + ascending=[True, False], + ) + + +def get_uprn(user_inputed_address: str, postcode: str): + """ + Return uprn (str) + Return False if failed to find a sensible matching epc + Return Nons when epc found but no UPRN + """ + df = get_epc_data_with_postcode(postcode=postcode) + + if df.empty: + return None + + scored_df = get_uprn_candidates( + df, + user_address=user_inputed_address, + ) + + # Best score + best_score = scored_df.iloc[0]["lexiscore"] + + if best_score <= 0: + return None + + # All rank-1 rows (possible draw) + top_rank_df = scored_df[scored_df["lexirank"] == 1] + + # If rank-1 rows do not agree on a single UPRN → ambiguous + if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]): + return None + + address = top_rank_df["address"].values[0] + lexiscore = float(top_rank_df["lexiscore"].values[0]) + + logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}") + # Safe to return the agreed UPRN + found_uprn = top_rank_df.iloc[0]["uprn"] + + if found_uprn == "": + return None + + return found_uprn + + +def resolve_uprns_for_postcode_group( + group_df: pd.DataFrame, + epc_df: pd.DataFrame, + address_col: str = "Address 1", +) -> pd.DataFrame: + """ + Given: + - group_df: rows sharing the same postcode + - epc_df: EPC search results for that postcode + + Returns: + group_df + found_uprn + diagnostics + """ + + results = [] + + for _, row in group_df.iterrows(): + user_address = str(row[address_col]).strip() + + scored_df = get_uprn_candidates( + epc_df, + user_address=user_address, + ) + + if scored_df.empty: + results.append( + { + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "no_epc_candidates", + } + ) + continue + + best_score = scored_df.iloc[0]["lexiscore"] + + if best_score <= 0: + results.append( + { + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": best_score, + "status": "zero_score", + } + ) + continue + + top_rank_df = scored_df[scored_df["lexirank"] == 1] + + if not df_has_single_uprn(top_rank_df, top_rank_df.iloc[0]["uprn"]): + results.append( + { + "found_uprn": None, + "best_match_uprn": top_rank_df.iloc[0]["uprn"], + "best_match_address": top_rank_df.iloc[0]["address"], + "best_match_lexiscore": best_score, + "status": "ambiguous", + } + ) + continue + + results.append( + { + "found_uprn": str(top_rank_df.iloc[0]["uprn"]), + "best_match_uprn": str(top_rank_df.iloc[0]["uprn"]), + "best_match_address": top_rank_df.iloc[0]["address"], + "best_match_lexiscore": best_score, + "status": "matched", + } + ) + + return pd.concat( + [group_df.reset_index(drop=True), pd.DataFrame(results)], + axis=1, + ) + + +def test(a, b): + assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}" + + +def run_all_test(): + # Basic usage with different post codes styles + test(get_epc_data_with_postcode("b93 8sy").shape[0], 63) + test(get_epc_data_with_postcode("B938sy").shape[0], 63) + test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) + test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) + + test(get_uprn("68", "b93 8sy"), "100070989938") + test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") + test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633") + test(get_uprn("28 A", "se6 4tf"), "100023278633") + test(get_uprn("28A", "se6 4tf"), "100023278633") + test(get_uprn("6 Aitken Close", "E8 4SQ"), False) + + # unique case + test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("1, 5 Semley Gate", "e9 5nh"), False) + test( + get_uprn("1 Semley Gate", "e9 5nh"), "10008238188" + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("48 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("42 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("46 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street") + + +if __name__ == "__main__": + INPUT_FILE = "hackney.xlsx" + + ADDRESS_COL = "Address 1" + POSTCODE_COL = "Postcode" + UPRN_COL = "UPRN" + + df = pd.read_excel(INPUT_FILE) + + failures = [] + + for _, row in tqdm( + df.iterrows(), + total=len(df), + desc="Auditing UPRNs", + ): + input_address = str(row[ADDRESS_COL]).strip() + postcode = str(row[POSTCODE_COL]).strip() + + expected_uprn = None if pd.isna(row[UPRN_COL]) else str(int(row[UPRN_COL])) + + try: + epc_df = get_epc_data_with_postcode(postcode) + + if epc_df.empty: + failures.append( + { + **row.to_dict(), + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "no_epc_results", + } + ) + continue + + scored_df = get_uprn_candidates( + epc_df, + user_address=input_address, + ) + + best_row = scored_df.iloc[0] + + best_match_uprn = str(best_row["uprn"]) + best_match_address = best_row["address"] + best_match_lexiscore = round(float(best_row["lexiscore"]), 4) + + found_uprn = get_uprn(input_address, postcode) + + except Exception as e: + failures.append( + { + **row.to_dict(), + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "exception", + "error": str(e), + } + ) + continue + + found_uprn_norm = None if not found_uprn else str(found_uprn) + + if found_uprn_norm != expected_uprn: + failures.append( + { + **row.to_dict(), + "found_uprn": found_uprn_norm, + "best_match_uprn": best_match_uprn, + "best_match_address": best_match_address, + "best_match_lexiscore": best_match_lexiscore, + "status": ("no_match" if found_uprn_norm is None else "mismatch"), + } + ) + + failures_df = pd.DataFrame(failures) + + print("===================================") + print(f"Total rows : {len(df)}") + print(f"Failures : {len(failures_df)}") + print("===================================") + + failures_df.to_excel( + "hackney_uprn_failures.xlsx", + index=False, + ) + + +# TO do function dispatcher, + +# get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"Flat 1, 5 Semley Gate" and Flat 5, 1 Semley Gate) +# fix that +# Look again at flat 1 +# pandas reader the seperate postcode_splitter +# dump into s3 diff --git a/backend/address2UPRN/script.py b/backend/address2UPRN/script.py new file mode 100644 index 00000000..bd8f8017 --- /dev/null +++ b/backend/address2UPRN/script.py @@ -0,0 +1,17 @@ +import pandas as pd + + +# use Address 1 +junte_df = pd.read_excel("hackney_uprn_failures.xlsx") + + +# use domna_address_1 +khalim_df = pd.read_excel("khalim_standard.xlsx") + + +combined_df = junte_df.merge(khalim_df, how="left", left_on="Address 1", right_on='domna_address_1') + +# Find the row in khalim_df that does not app + +result = combined_df[~pd.isnull(combined_df["epc_os_uprn"])] + diff --git a/backend/address2UPRN/tests/test_csv.py b/backend/address2UPRN/tests/test_csv.py new file mode 100644 index 00000000..70e7a9f9 --- /dev/null +++ b/backend/address2UPRN/tests/test_csv.py @@ -0,0 +1,40 @@ +# tests/test_address_to_uprn_csv.py + +import csv +import pytest +from pathlib import Path +from backend.address2UPRN.main import get_uprn + +FIXTURE_PATH = Path(__file__).parent / "test_data.csv" + + +def load_test_cases(): + with open(FIXTURE_PATH, newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + return [ + pytest.param( + row["User Input"], + row["Postcode"], + row["Manual UPRN Code"], + id=f'{row["User Input"]} [{row["Postcode"]}]', + ) + for row in reader + ] + + +@pytest.mark.parametrize( + "user_input,postcode,expected_uprn", + load_test_cases(), +) +def test_uprn_resolution_matches_manual( + user_input: str, + postcode: str, + expected_uprn: str, +): + from utils.logger import setup_logger + + uprn = get_uprn(user_input, postcode) + if uprn: + assert uprn == expected_uprn + else: + assert str(uprn) == expected_uprn diff --git a/backend/address2UPRN/tests/test_data.csv b/backend/address2UPRN/tests/test_data.csv new file mode 100644 index 00000000..f3d9b64c --- /dev/null +++ b/backend/address2UPRN/tests/test_data.csv @@ -0,0 +1,167 @@ +User Input,Postcode,Manual UPRN Code +47 The Fairway,OX16 0RR,100120771697 +11 REGENT COURT,SL1 3LG,100081041562 +3/137a Windmill Road,TW8 9NH,100021516998 +Flat 33,SW18 4BE,100023328943 +FLAT 1 Brendon Grove,N2 8JE,200013412 +Flat 15,KT8 2NE,100062123759 +FLAT 5 Stonehill Road,W4 3AH,100021589829 +10 Douglas Court,SL7 1UQ,100081278099 +1 Windmill Road,HP17 8JA,766034606 +31 Denewood,HP13 7LH,100081095964 +"10, Greenways Drive",TW4 5DD,10091597009 +Flat 10,W4 3AH,"100021589834" +Flat 11,TW4 5DD,10091597010 +Flat 11,W4 3AH,100021589835 +"12, Greenways Drive",TW4 5DD,10091597011 +"Flat 12, Forbes House",W4 3AH,100021589836 +FLAT 1 Goodstone Court,HA1 4FL,10070269053 +Flat 13,TW4 5DD,10091597012 +Flat 13,W4 3AH,100021589837 +Flat 14,TW4 5DD,10091597013 +Flat 14,W4 3AH,100021589838 +Flat 15,TW4 5DD,10091597014 +Flat 15,W4 3AH,100021589839 +Flat 16,TW4 5DD,"10091597015" +Flat 16,W4 3AH,100021589840 +Flat 17,TW4 5DD,10091597016 +Flat 17,W4 3AH,100021589841 +Flat 18,TW4 5DD,10091597017 +Flat 19,W4 3AH,100021589843 +Flat 20,W4 3AH,100021589844 +Flat 21,W4 3AH,100021589845 +Flat 22,W4 3AH,100021589846 +FLAT 2 Goodstone Court,HA1 4FL,10070269054 +Flat 23,W4 3AH,100021589847 +Flat 24,W4 3AH,100021589848 +"30c, Bosanquet Close",UB8 3PE,100021475316 +"30e, Bosanquet Close",UB8 3PE,100021475318 +FLAT 3 Goodstone Court,HA1 4FL,10070269055 +FLAT 4 Goodstone Court,HA1 4FL,10070269056 +FLAT 5 Goodstone Court,HA1 4FL,10070269057 +FLAT 6 Goodstone Court,HA1 4FL,10070269058 +FLAT 7 Goodstone Court,HA1 4FL,10070269059 +FLAT 8 Goodstone Court,HA1 4FL,10070269060 +FLAT 9 Goodstone Court,HA1 4FL,10070269061 +FLAT 10 Goodstone Court,HA1 4FL,10070269062 +FLAT 11 Goodstone Court,HA1 4FL,10070269063 +FLAT 12 Goodstone Court,HA1 4FL,10070269064 +FLAT 13 Goodstone Court,HA1 4FL,10070269065 +FLAT 14 Goodstone Court,HA1 4FL,10070269066 +FLAT 15 Goodstone Court,HA1 4FL,10070269067 +FLAT 16 Goodstone Court,HA1 4FL,10070269068 +FLAT 17 Goodstone Court,HA1 4FL,10070269069 +FLAT 18 Goodstone Court,HA1 4FL,10070269070 +FLAT 19 Goodstone Court,HA1 4FL,10070269071 +FLAT 20 Goodstone Court,HA1 4FL,10070269072 +FLAT 21 Goodstone Court,HA1 4FL,10070269073 +FLAT 22 Goodstone Court,HA1 4FL,10070269074 +FLAT 23 Goodstone Court,HA1 4FL,10070269075 +FLAT 24 Goodstone Court,HA1 4FL,10070269076 +FLAT 25 Goodstone Court,HA1 4FL,10070269077 +FLAT 26 Goodstone Court,HA1 4FL,10070269078 +FLAT 27 Goodstone Court,HA1 4FL,10070269079 +FLAT 28 Goodstone Court,HA1 4FL,10070269080 +FLAT 29 Goodstone Court,HA1 4FL,10070269081 +FLAT 30 Goodstone Court,HA1 4FL,10070269082 +FLAT 31 Goodstone Court,HA1 4FL,10070269083 +FLAT 32 Goodstone Court,HA1 4FL,10070269084 +FLAT 33 Goodstone Court,HA1 4FL,10070269085 +FLAT 34 Goodstone Court,HA1 4FL,10070269086 +FLAT 35 Goodstone Court,HA1 4FL,10070269087 +FLAT 36 Goodstone Court,HA1 4FL,10070269088 +FLAT 37 Goodstone Court,HA1 4FL,10070269089 +FLAT 38 Goodstone Court,HA1 4FL,10070269090 +FLAT 39 Goodstone Court,HA1 4FL,10070269091 +FLAT 40 Goodstone Court,HA1 4FL,10070269092 +FLAT 41 Goodstone Court,HA1 4FL,10070269093 +FLAT 42 Goodstone Court,HA1 4FL,10070269094 +FLAT 43 Goodstone Court,HA1 4FL,10070269095 +"13 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778260 +"14 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778259 +"15 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778258 +"16 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778263 +"17 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778262 +"18 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778261 +"19 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778266 +"20 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778265 +"21 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778264 +90a Murray Road,W5 4DA,12135293 +"Flat 1, 6 Wolverton Gardens",W5 3LJ,"12119972" +"1, Monsted House",UB1 1FG,12189944 +"10, Monsted House",UB1 1FG,12189953 +"20, Monsted House",UB1 1FG,12189963 +"2, Monsted House",UB1 1FG,12189945 +"3, Monsted House",UB1 1FG,12189946 +"4, Monsted House",UB1 1FG,12189947 +"5, Monsted House",UB1 1FG,12189948 +"6, Monsted House",UB1 1FG,12189949 +"7, Monsted House",UB1 1FG,12189950 +"8, Monsted House",UB1 1FG,12189951 +"9, Monsted House",UB1 1FG,12189952 +"1 Cullis House, 1, Accolade Avenue",UB1 1FH,12189904 +"2 Cullis House, 1, Accolade Avenue",UB1 1FH,12189905 +"3 Cullis House, 1, Accolade Avenue",UB1 1FH,12189906 +"4 Cullis House, 1, Accolade Avenue",UB1 1FH,12189907 +"5 Cullis House, 1, Accolade Avenue",UB1 1FH,12189908 +"6 Cullis House, 1, Accolade Avenue",UB1 1FH,12189909 +1 Genteel House Samara Drive,UB1 1FJ,12189835 +2 Genteel House Samara Drive,UB1 1FJ,12189836 +3 Genteel House Samara Drive,UB1 1FJ,12189837 +4 Genteel House Samara Drive,UB1 1FJ,12189838 +5 Genteel House Samara Drive,UB1 1FJ,12189839 +6 Genteel House Samara Drive,UB1 1FJ,12189840 +7 Genteel House Samara Drive,UB1 1FJ,12189841 +8 Genteel House Samara Drive,UB1 1FJ,12189842 +9 Genteel House Samara Drive,UB1 1FJ,12189843 +10 Genteel House Samara Drive,UB1 1FJ,12189844 +1 ASH TREE HOUSE,SE5 0TE,10009803979 +3 ASH TREE HOUSE,SE5 0TE,10009803981 +5 ASH TREE HOUSE,SE5 0TE,10009803983 +8 ASH TREE HOUSE,SE5 0TE,10009803986 +12 ASH TREE HOUSE,SE5 0TE,10009803990 +FLAT 1 599 HARROW ROAD,W10 4RA,217113930 +FLAT 2 599 HARROW ROAD,W10 4RA,217113931 +FLAT 3 599 HARROW ROAD,W10 4RA,None +FLAT 4 599 HARROW ROAD,W10 4RA,None +FLAT 5 599 HARROW ROAD,W10 4RA,217113934 +FLAT 6 599 HARROW ROAD,W10 4RA,None +FLAT 7 599 HARROW ROAD,W10 4RA,None +FLAT 8 599 HARROW ROAD,W10 4RA,None +"Flat 1, Ohio Building",SE13 7RX,10023226256 +"Flat 2, Ohio Building",SE13 7RX,10023226257 +"Apartment 1 Block B, 105, Benwell Road",N7 7BW,10012792307 +"Apartment 2 Block B, 105, Benwell Road",N7 7BW,10012792308 +"Apartment 3 Block B, 105, Benwell Road",N7 7BW,10012792309 +"Apartment 4 Block B, 105, Benwell Road",N7 7BW,10012792310 +"Apartment 5 Block B, 105, Benwell Road",N7 7BW,10012792311 +"Apartment 6 Block B, 105, Benwell Road",N7 7BW,10012792312 +"Apartment 7 Block B, 105, Benwell Road",N7 7BW,10012792313 +"Apartment 8 Block B, 105, Benwell Road",N7 7BW,10012792314 +"Apartment 9 Block B, 105, Benwell Road",N7 7BW,10012792315 +"Apartment 10 Block B, 105, Benwell Road",N7 7BW,10012792316 +"Apartment 11 Block B, 105, Benwell Road",N7 7BW,10012792317 +"Apartment 12 Block B, 105, Benwell Road",N7 7BW,10012792318 +"Apartment 13 Block B, 105, Benwell Road",N7 7BW,10012792319 +"Apartment 1 Block D, 32, Hornsey Road",N7 7AT,10012792366 +"Apartment 2 Block D, 32, Hornsey Road",N7 7AT,10012792367 +"Apartment 3 Block D, 32, Hornsey Road",N7 7AT,10012792368 +"Apartment 4 Block D, 32, Hornsey Road",N7 7AT,10012792369 +"Apartment 5 Block D, 32, Hornsey Road",N7 7AT,10012792370 +"Apartment 6 Block D, 32, Hornsey Road",N7 7AT,"10012792371" +"Apartment 7 Block D, 32, Hornsey Road",N7 7AT,10012792372 +"Apartment 8 Block D, 32, Hornsey Road",N7 7AT,10012792373 +"Apartment 9 Block D, 32, Hornsey Road",N7 7AT,10012792374 +"Apartment 10 Block D, 32, Hornsey Road",N7 7AT,10012792375 +"Apartment 11 Block D, 32, Hornsey Road",N7 7AT,10012792376 +"Apartment 12 Block D, 32, Hornsey Road",N7 7AT,10012792377 +"Apartment 13 Block D, 32, Hornsey Road",N7 7AT,10012792378 +"Apartment 14 Block D, 32, Hornsey Road",N7 7AT,10012792379 +"Apartment 15 Block D, 32, Hornsey Road",N7 7AT,10012792380 +"Apartment 16 Block D, 32, Hornsey Road",N7 7AT,"10012792381" +"Apartment 17Block D, 32, Hornsey Road",N7 7AT,10012792382 +"Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383 +24b Honley Road,SE6 2HZ,None +FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974 +2 COLLEGE HOUSE,CM7 1JS,100091449870 +3 COLLEGE HOUSE,CM7 1JS,100091449871 \ No newline at end of file diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt index dff7a546..3124034e 100644 --- a/backend/app/requirements/requirements.txt +++ b/backend/app/requirements/requirements.txt @@ -1,3 +1,4 @@ + # fastapi fastapi==0.115.2 sqlalchemy==2.0.36 @@ -12,5 +13,4 @@ boto3==1.35.44 openpyxl==3.1.2 # Basic pytz -sqlmodel - +sqlmodel \ No newline at end of file diff --git a/backend/engine/requirements.txt b/backend/engine/requirements.txt index b565e9d3..5cca1211 100644 --- a/backend/engine/requirements.txt +++ b/backend/engine/requirements.txt @@ -1,3 +1,4 @@ + # Pandas and numpy numpy==2.1.2 pandas==2.2.3 @@ -22,4 +23,4 @@ pyarrow==17.0.0 fastparquet==2024.5.0 aiohttp==3.10.10 # find my epc -beautifulsoup4 +beautifulsoup4 \ No newline at end of file diff --git a/backend/postcode_splitter/hackney.xlsx b/backend/postcode_splitter/hackney.xlsx new file mode 100644 index 00000000..64892f3a Binary files /dev/null and b/backend/postcode_splitter/hackney.xlsx differ diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py new file mode 100644 index 00000000..d417c8f1 --- /dev/null +++ b/backend/postcode_splitter/main.py @@ -0,0 +1,114 @@ +import pandas as pd +import requests +from backend.address2UPRN.main import resolve_uprns_for_postcode_group, get_epc_data_with_postcode +from tqdm import tqdm + + + +def sanitise_postcode(postcode: str) -> str | None: + """ + Normalise postcode for grouping. + + - Uppercase + - Remove all whitespace + """ + if pd.isna(postcode): + return None + + return postcode.upper().replace(" ", "") + + +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + +def main(): + df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability") + df = df.head(500) + + # Sanitise postcodes + df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) + + # --- validate AFTER grouping (save API calls) --- + + # Get unique, non-null postcodes + unique_postcodes = ( + df["postcode_clean"] + .dropna() + .unique() + ) + + # Validate each postcode once, TODOadd a progress bar + postcode_validity = { + pc: is_valid_postcode(pc) + for pc in tqdm(unique_postcodes, total=len(unique_postcodes)) + } + + # Map validity back onto dataframe + df["postcode_valid"] = df["postcode_clean"].map(postcode_validity) + + + results = [] + + for postcode, group_df in tqdm( + df[df["postcode_valid"]].groupby("postcode_clean"), + desc="Resolving UPRNs by postcode", + ): + try: + epc_df = get_epc_data_with_postcode(postcode) + + if epc_df.empty: + tmp = group_df.copy() + tmp["found_uprn"] = None + tmp["status"] = "no_epc_results" + results.append(tmp) + continue + + resolved = resolve_uprns_for_postcode_group( + group_df=group_df, + epc_df=epc_df, + ) + + results.append(resolved) + + except Exception as e: + tmp = group_df.copy() + tmp["found_uprn"] = None + tmp["status"] = "exception" + tmp["error"] = str(e) + results.append(tmp) + + final_df = pd.concat(results, ignore_index=True) + a = final_df[[ + "best_match_lexiscore","Address 1", + "best_match_address", "Postcode", + "UPRN", "best_match_uprn" + ]] # add levi score to viewing + b = final_df[final_df["best_match_lexiscore"]>0] # add levi score to viewing + b = b[[ + "best_match_lexiscore","Address 1", + "best_match_address", "Postcode", + "UPRN", "best_match_uprn" + ]] + +if __name__ == "__main__": + main() diff --git a/conftest.py b/conftest.py index e3add6e6..d93f0023 100644 --- a/conftest.py +++ b/conftest.py @@ -1,5 +1,11 @@ import os from backend.app.config import get_settings +import os +from dotenv import load_dotenv +import os + +# Load .env in conftest.py directory for local development +load_dotenv() DEFAULT_ENV = { "API_KEY": "test", @@ -8,7 +14,10 @@ DEFAULT_ENV = { "DATA_BUCKET": "test", "PLAN_TRIGGER_BUCKET": "test", "ENGINE_SQS_URL": "test", - "EPC_AUTH_TOKEN": "test", # overridden in GitHub Actions + "EPC_AUTH_TOKEN": os.getenv( + "EPC_AUTH_TOKEN", + "test", + ), # overridden in GitHub Actions "GOOGLE_SOLAR_API_KEY": "test", "DB_HOST": "localhost", "DB_USERNAME": "test", diff --git a/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py b/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py index b6fc0f8f..68655e80 100644 --- a/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py +++ b/etl/customers/peabody/Nov 2025 Consulting Project/k_deck_stats.py @@ -1,111 +1,111 @@ import pandas as pd -epc_c_recommendations = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, ashp 3.0 - corrected.xlsx" -) -epc_b_recommendations = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no " - "solid floor, ashp 3.0 - corrected.xlsx" -) +# epc_c_recommendations = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, ashp 3.0 - corrected.xlsx" +# ) +# epc_b_recommendations = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no " +# "solid floor, ashp 3.0 - corrected.xlsx" +# ) -epc_c_movers = epc_b_recommendations[ - epc_b_recommendations["current_epc_rating"] == "Epc.C" - ] -epc_c_movers["property_type"].value_counts() +# epc_c_movers = epc_b_recommendations[ +# epc_b_recommendations["current_epc_rating"] == "Epc.C" +# ] +# epc_c_movers["property_type"].value_counts() -house_epc_c_movers = epc_c_movers[ - epc_c_movers["property_type"] == "House" - ] -house_epc_c_movers_with_solar = house_epc_c_movers[ - ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"]) - ] +# house_epc_c_movers = epc_c_movers[ +# epc_c_movers["property_type"] == "House" +# ] +# house_epc_c_movers_with_solar = house_epc_c_movers[ +# ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"]) +# ] -house_epc_c_movers_with_a_heatpump = house_epc_c_movers[ - ~pd.isnull(house_epc_c_movers["air_source_heat_pump"]) -] +# house_epc_c_movers_with_a_heatpump = house_epc_c_movers[ +# ~pd.isnull(house_epc_c_movers["air_source_heat_pump"]) +# ] -flat_epc_c_movers = epc_c_movers[ - epc_c_movers["property_type"] == "Flat" - ] +# flat_epc_c_movers = epc_c_movers[ +# epc_c_movers["property_type"] == "Flat" +# ] -epc_c_recommendations["sap_points"].mean() -epc_c_recommendations["sap_points"].mean() +# epc_c_recommendations["sap_points"].mean() +# epc_c_recommendations["sap_points"].mean() -measure_cols = [ - "air_source_heat_pump", - "boiler_upgrade", - "cavity_wall_insulation", - "double_glazing", - "external_wall_insulation", - "flat_roof_insulation", - "high_heat_retention_storage_heaters", - "internal_wall_insulation", - "loft_insulation", - "low_energy_lighting", - "mechanical_ventilation", - "room_roof_insulation", - "roomstat_programmer_trvs", - "sealing_open_fireplace", - "secondary_glazing", - "secondary_heating", - "solar_pv", - "solar_pv_with_battery", - "suspended_floor_insulation", - "time_temperature_zone_control", -] +# measure_cols = [ +# "air_source_heat_pump", +# "boiler_upgrade", +# "cavity_wall_insulation", +# "double_glazing", +# "external_wall_insulation", +# "flat_roof_insulation", +# "high_heat_retention_storage_heaters", +# "internal_wall_insulation", +# "loft_insulation", +# "low_energy_lighting", +# "mechanical_ventilation", +# "room_roof_insulation", +# "roomstat_programmer_trvs", +# "sealing_open_fireplace", +# "secondary_glazing", +# "secondary_heating", +# "solar_pv", +# "solar_pv_with_battery", +# "suspended_floor_insulation", +# "time_temperature_zone_control", +# ] -epc_c_melted = ( - epc_c_recommendations - .melt( - id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols], - value_vars=measure_cols, - var_name="measure_type", - value_name="value", - ) - .dropna(subset=["value"]) -) -epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0] -epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() +# epc_c_melted = ( +# epc_c_recommendations +# .melt( +# id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols], +# value_vars=measure_cols, +# var_name="measure_type", +# value_name="value", +# ) +# .dropna(subset=["value"]) +# ) +# epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0] +# epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() -epc_b_melted = ( - epc_b_recommendations - .melt( - id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols], - value_vars=measure_cols, - var_name="measure_type", - value_name="value", - ) - .dropna(subset=["value"]) -) +# epc_b_melted = ( +# epc_b_recommendations +# .melt( +# id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols], +# value_vars=measure_cols, +# var_name="measure_type", +# value_name="value", +# ) +# .dropna(subset=["value"]) +# ) -epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0] -epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() +# epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0] +# epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index() -measures_compared = epc_c_measures.merge( - epc_b_measures, - left_on="measure_type", - right_on="measure_type", - suffixes=("_epc_c", "_epc_b"), -) +# measures_compared = epc_c_measures.merge( +# epc_b_measures, +# left_on="measure_type", +# right_on="measure_type", +# suffixes=("_epc_c", "_epc_b"), +# ) -epc_c_retrofits = epc_c_recommendations[ - epc_c_recommendations["total_retrofit_cost"] > 0 - ] +# epc_c_retrofits = epc_c_recommendations[ +# epc_c_recommendations["total_retrofit_cost"] > 0 +# ] -epc_b_retrofits = epc_b_recommendations[ - epc_b_recommendations["total_retrofit_cost"] > 0 - ] +# epc_b_retrofits = epc_b_recommendations[ +# epc_b_recommendations["total_retrofit_cost"] > 0 +# ] -epc_c_retrofits["sap_points"].mean() -epc_b_retrofits["sap_points"].mean() +# epc_c_retrofits["sap_points"].mean() +# epc_b_retrofits["sap_points"].mean() -properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b")) +# properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b")) -properties_in_both["total_retrofit_cost_epc_c"].mean() -properties_in_both["sap_points_epc_c"].mean() -properties_in_both["total_retrofit_cost_epc_b"].mean() -properties_in_both["sap_points_epc_b"].mean() +# properties_in_both["total_retrofit_cost_epc_c"].mean() +# properties_in_both["sap_points_epc_c"].mean() +# properties_in_both["total_retrofit_cost_epc_b"].mean() +# properties_in_both["sap_points_epc_b"].mean() # Solar PV savings - we need the amount of solar PV bill savings from sqlalchemy.orm import sessionmaker @@ -114,16 +114,12 @@ from backend.app.db.models.recommendations import Recommendation, Plan, PlanReco from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel from collections import defaultdict -PORTFOLIO_ID = 435 # Peabody +PORTFOLIO_ID = 485 # Peabody SCENARIOS = [ - 908, - 909, - 910, + 970 ] scenario_names = { - 908: "EPC C - no solid floor, ashp 3.0", - 909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0", - 910: "EPC B - no solid floor, no EWI, ashp 3.0" + 970: "EPC C - no solid floor, ashp 3.0", } @@ -236,307 +232,266 @@ recommendations_df = pd.DataFrame(recommendations_data) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) -s_id = 910 -ps_w_a_plan = plans_df[plans_df["scenario_id"] == s_id].copy() -# Take the newest by scenario id -ps_w_a_plan = ps_w_a_plan.sort_values("created_at", ascending=False).drop_duplicates( - subset=["property_id"] -) -z = ps_w_a_plan[ - ps_w_a_plan["cost_of_works"] > 0 - ].copy() -z2 = properties_df[properties_df["property_id"].isin(z["property_id"].values)] -# '', 'hot_water_cost_current', -# 'lighting_cost_current', 'appliances_cost_current', -# 'gas_standing_charge', 'electricity_standing_charge' -z2["total_bills"] = z2["heating_cost_current"] + z2["hot_water_cost_current"] + z2["lighting_cost_current"] + z2[ - "appliances_cost_current" -] + z2["gas_standing_charge"] + z2["electricity_standing_charge"] +with pd.ExcelWriter("hackney.xlsx", engine="openpyxl") as writer: + recommendations_df.to_excel(writer, sheet_name="recommendations", index=False) + properties_df.to_excel(writer, sheet_name="properties", index=False) -from tqdm import tqdm + +# solar_pv_recommendations = recommendations_df[recommendations_df["measure_type"] == "solar_pv"] +# average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index() -# For a property ID, find a property where the no EWI/IWI approach is more expensive than the EWI approach -pids = properties_df["property_id"].unique() -for pid in tqdm(pids): - if pid in [603272, 550550, 574493]: - continue - # get the plans - property_plan = plans_df[plans_df["property_id"] == int(pid)] - # Take the newest plan by scenario id - property_plan = property_plan.sort_values("created_at", ascending=False).drop_duplicates( - subset=["scenario_id"] - ) - a = property_plan[property_plan["scenario_id"] == 909].squeeze() # no EWI/IWI - b = property_plan[property_plan["scenario_id"] == 908].squeeze() # EWI - if (a["cost_of_works"] > b["cost_of_works"]) and ( - a["post_epc_rating"].value == "C") and (b["cost_of_works"] > 5000): - bah +# # Check tenures +# initial_asset_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " +# "- Data Extracts for Domna.xlsx", +# sheet_name="Properties" +# ) +# sustainability_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " +# "- Data Extracts for Domna.xlsx", +# sheet_name="Sustainability" +# ) -solar_pv_recommendations = recommendations_df[ - recommendations_df["measure_type"] == "solar_pv" - ] +# sustainability_sample = sustainability_data[ +# sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values) +# ] -solid_wall_recommendation = recommendations_df[ - recommendations_df["scenario_id"].isin([908]) & - recommendations_df["measure_type"].isin(["internal_wall_insulation"]) & - recommendations_df["default"] - ] -average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index() -# Add on scenarion names -average_savings["scenario_name"] = average_savings["scenario_id"].map(scenario_names) +# sustainability_sample = sustainability_sample.merge( +# initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset") +# ) -# Check tenures -initial_asset_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Properties" -) -sustainability_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody " - "- Data Extracts for Domna.xlsx", - sheet_name="Sustainability" -) +# block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) +# block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False) -sustainability_sample = sustainability_data[ - sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values) -] +# initial_asset_data.columns +# initial_asset_data["LeaseType"].value_counts() -sustainability_sample = sustainability_sample.merge( - initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset") -) +# # sustainability_sample["Tenure Group"].value_counts() +# # Tenure Group +# # General Needs 57787 +# # Home Ownership 25471 +# # Care & Supported Housing 4239 +# # Rental 2677 +# # Other 188 -block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) -block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False) +# df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index() +# df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False) -initial_asset_data.columns -initial_asset_data["LeaseType"].value_counts() +# tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index() +# tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False) -# sustainability_sample["Tenure Group"].value_counts() -# Tenure Group -# General Needs 57787 -# Home Ownership 25471 -# Care & Supported Housing 4239 -# Rental 2677 -# Other 188 +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts() -df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index() -df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False) +# sample_data = initial_asset_data[ +# ~initial_asset_data["Ownership Type"].isin( +# [ +# # Commercial # Everything is resi - based on the Residential Indicator variable - all are true +# # Freeholder +# "FREEHOLDER", # 19517 properties +# # HOMEBUY / EQUITY LOAN +# "Rent to Homebuy", # 1 property +# # Leaseholder +# "LEASEHOLD 100%", # 8455 properties +# "Owned and Managed - 999 year lease", # 2076 properties +# "Managed but not Owned-Private Lease", # 159 properties +# "Owned and managed LEASEHOLD", # 26 properties +# # Outright Sale - can't find anything matching +# # SHARED EQUITY +# "Shared Ownership", # 4065 properties +# "Shared Ownership Owned Not Managed", # 23 properties +# # Extra categories which seem sensible to exclude +# "NOT MANAGED AND NOT OWNED" +# ] +# ) +# ] -tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index() -tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False) +# sample_data["Ownership Type"].value_counts() -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts() +# sample_data = initial_asset_data[ +# initial_asset_data["Ownership Type"].isin( +# [ +# "Owned and Managed", +# "Owned and Managed - 999 year lease", +# "Owned and managed LEASEHOLD", +# "LEASEHOLD 100%", +# "DATALOAD DEFAULT" +# ] +# ) +# ] +# dropped = initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)] +# dropped["Ownership Type"].value_counts() -sample_data = initial_asset_data[ - ~initial_asset_data["Ownership Type"].isin( - [ - # Commercial # Everything is resi - based on the Residential Indicator variable - all are true - # Freeholder - "FREEHOLDER", # 19517 properties - # HOMEBUY / EQUITY LOAN - "Rent to Homebuy", # 1 property - # Leaseholder - "LEASEHOLD 100%", # 8455 properties - "Owned and Managed - 999 year lease", # 2076 properties - "Managed but not Owned-Private Lease", # 159 properties - "Owned and managed LEASEHOLD", # 26 properties - # Outright Sale - can't find anything matching - # SHARED EQUITY - "Shared Ownership", # 4065 properties - "Shared Ownership Owned Not Managed", # 23 properties - # Extra categories which seem sensible to exclude - "NOT MANAGED AND NOT OWNED" - ] - ) -] +# for value in [ +# # Commercial # Everything is resi, so should be fine. No matches +# # Freeholder +# "FREEHOLDER", # 19517 properties +# # HOMEBUY / EQUITY LOAN +# "Rent to Homebuy", # 1 property +# # Leaseholder +# "LEASEHOLD 100%", # 8455 properties +# "Owned and Managed - 999 year lease", # 2076 properties +# "Managed but not Owned-Private Lease", # 159 properties +# "Owned and managed LEASEHOLD", # 26 properties +# # Outright Sale - can't find anything matching +# # SHARED EQUITY +# "Shared Ownership", # 4065 properties +# "Shared Ownership Owned Not Managed", # 23 properties +# ]: +# print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0]) -sample_data["Ownership Type"].value_counts() +# house_types = [ +# "HOUSE", +# "BUNGALOW", +# "MAISONETTE", +# "DUPLEX", +# ] -sample_data = initial_asset_data[ - initial_asset_data["Ownership Type"].isin( - [ - "Owned and Managed", - "Owned and Managed - 999 year lease", - "Owned and managed LEASEHOLD", - "LEASEHOLD 100%", - "DATALOAD DEFAULT" - ] - ) -] -dropped = initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)] -dropped["Ownership Type"].value_counts() +# guaranteed_control = [ +# "Owned and Managed", +# "Owned and Managed - 999 year lease", +# "Owned and managed LEASEHOLD", +# "LEASEHOLD 100%", +# "DATALOAD DEFAULT", +# ] -for value in [ - # Commercial # Everything is resi, so should be fine. No matches - # Freeholder - "FREEHOLDER", # 19517 properties - # HOMEBUY / EQUITY LOAN - "Rent to Homebuy", # 1 property - # Leaseholder - "LEASEHOLD 100%", # 8455 properties - "Owned and Managed - 999 year lease", # 2076 properties - "Managed but not Owned-Private Lease", # 159 properties - "Owned and managed LEASEHOLD", # 26 properties - # Outright Sale - can't find anything matching - # SHARED EQUITY - "Shared Ownership", # 4065 properties - "Shared Ownership Owned Not Managed", # 23 properties -]: - print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0]) +# sample_data = initial_asset_data[ +# ( +# initial_asset_data["Ownership Type"].isin(guaranteed_control) +# ) +# | +# ( +# (initial_asset_data["Ownership Type"] == "FREEHOLDER") +# & +# (initial_asset_data["Property Type"].isin(house_types)) +# ) +# ] -house_types = [ - "HOUSE", - "BUNGALOW", - "MAISONETTE", - "DUPLEX", -] +# fabric_retrofit_sample = initial_asset_data[ +# initial_asset_data["Ownership Type"].isin( +# [ +# "Owned and Managed", +# "FREEHOLDER", +# "DATALOAD DEFAULT", +# ] +# ) +# ] -guaranteed_control = [ - "Owned and Managed", - "Owned and Managed - 999 year lease", - "Owned and managed LEASEHOLD", - "LEASEHOLD 100%", - "DATALOAD DEFAULT", -] +# initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() -sample_data = initial_asset_data[ - ( - initial_asset_data["Ownership Type"].isin(guaranteed_control) - ) - | - ( - (initial_asset_data["Ownership Type"] == "FREEHOLDER") - & - (initial_asset_data["Property Type"].isin(house_types)) - ) - ] +# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts() +# z = initial_asset_data[ +# ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types) +# ] -fabric_retrofit_sample = initial_asset_data[ - initial_asset_data["Ownership Type"].isin( - [ - "Owned and Managed", - "FREEHOLDER", - "DATALOAD DEFAULT", - ] - ) -] +# block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) +# zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"] -initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts() +# potential_sample = initial_asset_data[ +# ~pd.isnull(initial_asset_data["BlockCode"]) +# ] -initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts() -z = initial_asset_data[ - ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types) - ] +# compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( +# initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Property Type", +# right_on="Property Type", +# suffixes=("_on_block_codes", "_overall") +# ) -block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False) -zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"] +# # Comparison of smaller sample vs overall +# new_asset_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " +# "- Peabody " +# "- Data Extracts for Domna v2.xlsx", +# sheet_name="Properties" +# ) -potential_sample = initial_asset_data[ - ~pd.isnull(initial_asset_data["BlockCode"]) -] +# new_sustainability_data = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " +# "- Peabody " +# "- Data Extracts for Domna v2.xlsx", +# sheet_name="Sustainability" +# ) -compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( - initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Property Type", - right_on="Property Type", - suffixes=("_on_block_codes", "_overall") -) +# sap_bands = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data " +# "08012026.xlsx", +# ) -# Comparison of smaller sample vs overall -new_asset_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " - "- Peabody " - "- Data Extracts for Domna v2.xlsx", - sheet_name="Properties" -) +# combined = new_asset_data.merge( +# new_sustainability_data, +# left_on="UPRN", +# right_on="Org Ref", +# suffixes=("_asset", "_sustainability") +# ).merge( +# sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef" +# ) +# reduced_sample = combined[ +# ~combined["AH Tenure"].isin( +# ["Commercial", +# "Freeholder", +# "HOMEBUY / EQUITY LOAN", +# "Leaseholder", +# "Outright Sale", +# "SHARED EQUITY", +# "Shared Ownership"] +# ) +# ].copy() -new_sustainability_data = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 " - "- Peabody " - "- Data Extracts for Domna v2.xlsx", - sheet_name="Sustainability" -) +# # property types +# property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( +# combined["Property Type"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Property Type", +# right_on="Property Type", +# suffixes=("_reduced_sample", "_overall") +# ) -sap_bands = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data " - "08012026.xlsx", -) +# # lodged ratings +# lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts( +# normalize=True).to_frame().reset_index().merge( +# combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="Lodged EPC Band", +# right_on="Lodged EPC Band", +# suffixes=("_reduced_sample", "_overall") +# ) -combined = new_asset_data.merge( - new_sustainability_data, - left_on="UPRN", - right_on="Org Ref", - suffixes=("_asset", "_sustainability") -).merge( - sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef" -) -reduced_sample = combined[ - ~combined["AH Tenure"].isin( - ["Commercial", - "Freeholder", - "HOMEBUY / EQUITY LOAN", - "Leaseholder", - "Outright Sale", - "SHARED EQUITY", - "Shared Ownership"] - ) -].copy() +# # modelled ratings +# modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts( +# normalize=True).to_frame().reset_index().merge( +# combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(), +# left_on="SAP Band", +# right_on="SAP Band", +# suffixes=("_reduced_sample", "_overall") +# ) -# property types -property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge( - combined["Property Type"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Property Type", - right_on="Property Type", - suffixes=("_reduced_sample", "_overall") -) +# # Testing measures +# m1 = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, ashp 3.0 - 20250113 final.xlsx" +# ) +# m2 = pd.read_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " +# "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx" +# ) -# lodged ratings -lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts( - normalize=True).to_frame().reset_index().merge( - combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(), - left_on="Lodged EPC Band", - right_on="Lodged EPC Band", - suffixes=("_reduced_sample", "_overall") -) +# compare = m1.merge( +# m2, +# left_on="uprn", +# right_on="uprn", +# suffixes=("_ewi_iwi", "_no_ewi_iwi") +# ) -# modelled ratings -modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts( - normalize=True).to_frame().reset_index().merge( - combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(), - left_on="SAP Band", - right_on="SAP Band", - suffixes=("_reduced_sample", "_overall") -) +# # Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario +# only_no_ewi_iwi = compare[ +# (compare["total_retrofit_cost_ewi_iwi"] == 0) & +# (compare["total_retrofit_cost_no_ewi_iwi"] != 0) +# ] -# Testing measures -m1 = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, ashp 3.0 - 20250113 final.xlsx" -) -m2 = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no " - "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx" -) +# (m1["total_retrofit_cost"] > 0).sum() +# (m2["total_retrofit_cost"] > 0).sum() -compare = m1.merge( - m2, - left_on="uprn", - right_on="uprn", - suffixes=("_ewi_iwi", "_no_ewi_iwi") -) +# with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0] -# Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario -only_no_ewi_iwi = compare[ - (compare["total_retrofit_cost_ewi_iwi"] == 0) & - (compare["total_retrofit_cost_no_ewi_iwi"] != 0) - ] - -(m1["total_retrofit_cost"] > 0).sum() -(m2["total_retrofit_cost"] > 0).sum() - -with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0] - -z = with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])] +# z = with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])] diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt index 845166d9..bbf75df5 100644 --- a/model_data/requirements/requirements.txt +++ b/model_data/requirements/requirements.txt @@ -1,4 +1,4 @@ -pydantic==2.9.2 +pydantic>=1.10.7 pydantic-settings==2.6.0 epc-api-python==1.0.2 numpy==2.1.2 diff --git a/pytest.ini b/pytest.ini index 1422657b..0a0bbf73 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] pythonpath = . addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 2184d074..f0fc5cd1 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -15,16 +15,12 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 435 # Peabody +PORTFOLIO_ID = 485 # Peabody SCENARIOS = [ - 908, - 909, - 910, + 970, ] scenario_names = { - 908: "EPC C - no solid floor, ashp 3.0", - 909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0", - 910: "EPC B - no solid floor, no EWI, ashp 3.0" + 970: "EPC C - No solid floor, EQI, IWI", } @@ -295,6 +291,11 @@ for scenario_id in SCENARIOS: df[df["predicted_post_works_sap"] == ""] # Create excel to store to +<<<<<<< HEAD + filename = (f"{scenario_names[scenario_id]} - 20250113 final.xlsx") + with pd.ExcelWriter(filename) as writer: + df.to_excel(writer, sheet_name="properties", index=False) +======= filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " f"Project/Final SAL/scenarios/{scenario_names[scenario_id]} - 20250114 final.xlsx") with pd.ExcelWriter(filename) as writer: @@ -475,3 +476,4 @@ dupes = plans_df2[plans_df2["property_id"].duplicated()] example = example.merge( plans_df, how="left", ) +>>>>>>> 3874da6177cbcc37f7a488bec0a06e387906653c