Merge pull request #694 from Hestia-Homes/feature/address_to_uprn

Feature/address to uprn
This commit is contained in:
Jun-te Kim 2026-01-29 12:21:20 +00:00 committed by GitHub
commit 535e5befb5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 1398 additions and 433 deletions

View file

@ -0,0 +1,39 @@
# Dev-container image: Python 3.11 with libpostal built from source.
FROM python:3.11.10-bullseye

ARG USER=vscode
# Build-time only, so apt stays non-interactive without leaking into the runtime env.
ARG DEBIAN_FRONTEND=noninteractive

# 1) Toolchain + utilities for building libpostal
RUN apt-get update && apt-get install -y --no-install-recommends \
        autoconf \
        automake \
        build-essential \
        ca-certificates \
        curl \
        git \
        jq \
        libtool \
        pkg-config \
        sudo \
        vim \
    && rm -rf /var/lib/apt/lists/*

# 2) Build and install libpostal from source; the checkout is removed in the
#    same layer so it does not bloat the image.
#    NOTE(review): the clone tracks the default branch, so rebuilds are not
#    reproducible - consider pinning a known-good commit or tag.
RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \
    && cd /tmp/libpostal \
    && ./bootstrap.sh \
    && ./configure --datadir=/usr/local/share/libpostal \
    && make -j"$(nproc)" \
    && make install \
    && ldconfig \
    && rm -rf /tmp/libpostal

# 3) Create the user and grant sudo privileges.
#    /bin/bash is used because it resolves on both merged-/usr and classic layouts.
RUN useradd -m -s /bin/bash "${USER}" \
    && echo "${USER} ALL=(ALL) NOPASSWD: ALL" > "/etc/sudoers.d/${USER}" \
    && chmod 0440 "/etc/sudoers.d/${USER}"

# 4) Python deps - needed if you want to run the asset list tooling.
#    COPY (not ADD) for a plain local file; installed once.
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
COPY asset_list/requirements.txt /requirements.txt
RUN pip install -r /requirements.txt

# 5) Workdir
WORKDIR /workspaces/model

# 6) Make Python find your package: add the project root to PYTHONPATH for all
#    processes. PYTHONPATH is not defined earlier in this file, so it is set
#    outright instead of appending an empty value (which left a trailing ':').
ENV PYTHONPATH=/workspaces/model

View file

@ -1,7 +1,7 @@
{
"name": "Basic Python",
"name": "SAL ENV",
"dockerComposeFile": "docker-compose.yml",
"service": "model",
"service": "model-sal",
"remoteUser": "vscode",
"workspaceFolder": "/workspaces/model",
"postStartCommand": "bash .devcontainer/post-install.sh",

View file

@ -1,7 +1,7 @@
version: '3.8'
services:
model:
model-sal:
user: "${UID}:${GID}"
build:
context: ..

View file

@ -11,4 +11,4 @@ if os.path.exists(env_path):
print("✔ Loaded .env into Jupyter kernel")
else:
print("⚠ No .env file found to load")
EOF
EOF

View file

@ -0,0 +1,23 @@
fastapi==0.115.2
sqlalchemy==2.0.36
psycopg2-binary==2.9.10
python-jose==3.3.0
cryptography==43.0.3
mangum==0.19.0
# AWS
boto3==1.35.44
# Data
openpyxl==3.1.2
# Basic
pytz
uvicorn[standard]
# Testing
pytest==9.0.2
pytest-cov==7.0.0
ipykernel>=6.25,<7
pydantic-settings<2
pyyaml>=6.0.1
pydantic>=1.10.7,<2
sqlmodel
# Formatting
black==26.1.0

View file

@ -34,7 +34,7 @@ RUN useradd -m -s /usr/bin/bash ${USER} \
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
ADD backend/engine/requirements.txt requirements1.txt
ADD backend/app/requirements/requirements.txt requirements2.txt
ADD .devcontainer/requirements.txt requirements3.txt
ADD .devcontainer/backend/requirements.txt requirements3.txt
RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
RUN pip install -r requirements.txt

View file

@ -0,0 +1,40 @@
{
"name": "Backend Model Env",
"dockerComposeFile": "docker-compose.yml",
"service": "model-backend",
"remoteUser": "vscode",
"workspaceFolder": "/workspaces/model",
"postStartCommand": "bash .devcontainer/backend/post-install.sh",
"mounts": [
"source=${localEnv:HOME},target=/workspaces/home,type=bind"
],
"customizations": {
"vscode": {
"extensions": [
"ms-python.python",
"ms-toolsai.jupyter",
"mechatroner.rainbow-csv",
"ms-toolsai.datawrangler",
"lindacong.vscode-book-reader",
"4ops.terraform",
"fabiospampinato.vscode-todo-plus",
"jgclark.vscode-todo-highlight",
"corentinartaud.pdfpreview",
"ms-python.vscode-python-envs",
"ms-python.black-formatter",
"waderyan.gitblame"
],
"settings": {
"files.defaultWorkspace": "/workspaces/model",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter",
"editor.formatOnSave": true
},
"python.formatting.provider": "none"
}
}
},
"containerEnv": {
"PYTHONFLAGS": "-Xfrozen_modules=off"
}
}

View file

@ -0,0 +1,18 @@
version: '3.8'
services:
  model-backend:
    # Run as the invoking host user so files written to the bind mount keep
    # host ownership. UID/GID are frequently not exported by the shell, in
    # which case the bare "${UID}:${GID}" would interpolate to ":"; fall back
    # to 1000:1000.
    # NOTE(review): 1000 is assumed to match the image's dev user - confirm.
    user: "${UID:-1000}:${GID:-1000}"
    build:
      context: ../..
      dockerfile: .devcontainer/backend/Dockerfile
    # Keep the container alive so the editor can attach to it.
    command: sleep infinity
    volumes:
      - ../../:/workspaces/model
    networks:
      - model-net
networks:
  model-net:
    driver: bridge

View file

@ -0,0 +1,14 @@
#!/usr/bin/env bash
# Post-start hook: installs an IPython startup script that loads
# backend/.env into every Jupyter/IPython kernel started in the container.

# Fail fast instead of continuing after a failed mkdir or write.
set -euo pipefail

startup_dir=~/.ipython/profile_default/startup
mkdir -p "${startup_dir}"

# Quoted heredoc delimiter: the Python below is written verbatim, with no
# shell expansion.
cat << 'EOF' > "${startup_dir}/00-load-env.py"
from dotenv import load_dotenv
import os

# Adjust path as needed
env_path = "/workspaces/model/backend/.env"
if os.path.exists(env_path):
    load_dotenv(env_path)
    print("✔ Loaded .env into Jupyter kernel")
else:
    print("⚠ No .env file found to load")
EOF

View file

@ -1,4 +1,4 @@
# fastapi
fastapi==0.115.2
sqlalchemy==2.0.36
pydantic-settings==2.6.0

View file

@ -2,6 +2,12 @@ name: Run unit tests
on:
pull_request:
branches:
- "**"
push:
branches:
- "**"
jobs:
test:

View file

@ -9,9 +9,12 @@
"path": "/bin/bash"
}
},
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"python.testing.pytestArgs": ["-s", "-q", "--no-cov"]
// Hot reload setting that needs to be in user settings
// "jupyter.runStartupCommands": [

View file

@ -34,7 +34,8 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
logger = setup_logger()
# OpenAI API key: read from the environment only - never commit a literal fallback.
# NOTE(review): a real-looking key was previously hard-coded on this line; it is
# exposed in version-control history and should be revoked and re-issued.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
class DataRemapper:
@ -1159,13 +1160,17 @@ class AssetList:
),
axis=1
)
col = self.EPC_API_DATA_NAMES["roof-description"]
self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply(
lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[
lambda x: RoofAttributes(description=x[col]).process()[
"insulation_thickness"] if not pd.isnull(
x[self.EPC_API_DATA_NAMES["roof-description"]]) else None,
x[col]) else None,
axis=1
)
self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = (
self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].str.replace("+", "")
)

View file

@ -1,5 +1,5 @@
# OpenAI API key: read from the environment only - never commit a literal fallback.
# NOTE(review): a real-looking key was previously hard-coded on this line; it is
# exposed in version-control history and should be revoked and re-issued.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
class DataRemapper:

0
asset_list/__init__.py Normal file
View file

View file

@ -12,9 +12,8 @@ from asset_list.utils import get_data
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
load_dotenv(dotenv_path="backend/.env")
# EPC API credentials come from the environment (or the .env file loaded above) only.
# NOTE(review): an encoded credential was previously hard-coded here as the
# fallback; it is exposed in version-control history and should be rotated.
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
@ -58,6 +57,10 @@ def app():
EPC recommendations
Property UPRN
"""
<<<<<<< HEAD
data_folder = ("/workspaces/model/asset_list")
data_filename = "assets.xlsx"
=======
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Hackney"
data_filename = "Domna SHF Wave 3 (3).xlsx"
@ -96,22 +99,23 @@ def app():
data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
"Project/data_validation")
data_filename = "to_standardise_uprns.xlsx"
>>>>>>> 3874da6177cbcc37f7a488bec0a06e387906653c
sheet_name = "Sheet1"
postcode_column = 'Postcode'
address1_column = "Address 1"
address1_method = None
fulladdress_column = None
address_cols_to_concat = ["Address 1", "Address 2", "Address 3"]
address1_column = None
address1_method = 'house_number_extraction'
fulladdress_column = 'Address'
address_cols_to_concat = None
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = "Type"
landlord_built_form = "Attachment"
landlord_property_type = None
landlord_built_form = None
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "Org Ref"
landlord_property_id = "LLUPRN"
landlord_sap = None
outcomes_filename = None
outcomes_sheetname = None
@ -127,40 +131,6 @@ def app():
asset_list_header = 0
landlord_block_reference = None
# Lambeth:
# data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Lambeth/December 10th"
# data_filename = "lambeth_sw2_leigham court estate.xlsx"
# sheet_name = "Sheet1"
# postcode_column = 'Postcode'
# address1_column = "Address"
# address1_method = None
# fulladdress_column = None
# address_cols_to_concat = ["Address"]
# missing_postcodes_method = None
# landlord_year_built = None
# landlord_os_uprn = None
# landlord_property_type = None
# landlord_built_form = None
# landlord_wall_construction = None
# landlord_roof_construction = None
# landlord_heating_system = None
# landlord_existing_pv = None
# landlord_property_id = "row_id"
# landlord_sap = None
# outcomes_filename = None
# outcomes_sheetname = None
# outcomes_postcode = None
# outcomes_houseno = None
# outcomes_id = None
# outcomes_address = None
# master_filepaths = []
# master_id_colnames = []
# master_to_asset_list_filepath = None
# phase = False
# ecosurv_landlords = None
# asset_list_header = 0
# landlord_block_reference = None
# Maps addresses to uprn in problematic cases
manual_uprn_map = {}
@ -439,6 +409,10 @@ def app():
)
asset_list.merge_data(epc_df)
# asset_list.standardised_asset_list = asset_list.standardised_asset_list[
# asset_list.standardised_asset_list["domna_full_address"]
# != "120 Airdrie Crescent, Burnley, Lancashire"
# ]
asset_list.extract_attributes()
asset_list.identify_worktypes()

View file

@ -1,7 +1,6 @@
postal
pandas
usaddress
pydantic-settings==2.6.0
epc-api-python==1.0.2
thefuzz
boto3
@ -10,6 +9,5 @@ openai>=1.3.5
tiktoken
msgpack
beautifulsoup4
pydantic>=1.10.7
typing-extensions>=4.5.0
requests>=2.28.2
requests>=2.28.2

View file

@ -0,0 +1,20 @@
We have a list of addresses as input.
They arrive in batches that share the same postcode, and for each address we want to resolve the matching UPRN.
If this lambda/function can do that, we'll be speeding ahead.
Energy Performance Information: https://epc.opendatacommunities.org/
guidance page: https://epc.opendatacommunities.org/docs/guidance#field_domestic_LMK_KEY
Example of Khalim's earlier code, which he wrote some tests for: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/tests/test_search_epc.py#L11
Example of EPC search: https://github.com/Hestia-Homes/Model/blob/941be42b83a590e838fd3ee475bfd1ff31438789/backend/SearchEpc.py#L118
Khalim has made a python package to help scrape data: https://github.com/KhalimCK/epc-api-python

View file

View file

@ -0,0 +1,520 @@
from epc_api.client import EpcClient
import os
from urllib.parse import urlencode
import pandas as pd
from difflib import SequenceMatcher
from tqdm import tqdm
from utils.logger import setup_logger
logger = setup_logger()
import re
EPC_AUTH_TOKEN = os.getenv(
"EPC_AUTH_TOKEN",
)
import re
from difflib import SequenceMatcher
from typing import Set
def levenshtein(a: str, b: str) -> float:
    """
    Address similarity score in [0, 1].

    Despite the name this is not an edit distance. Both inputs are normalised
    with ``normalise_address`` and then scored as follows:

    - Hard rejections (score 0.0):
        * ``a`` contains numbers but ``b`` contains none;
        * both contain numbers but share none;
        * ``a`` has exactly two numbers and no flat-style word, ``b`` is a
          "flat" address with at least two numbers, and the first two numbers
          of ``b`` are not the same two numbers in the same order.
    - Otherwise a weighted blend of token overlap (Jaccard, 65%) and
      character similarity (``SequenceMatcher`` ratio, 35%), rounded to 4 dp.

    Args:
        a: The user-supplied address.
        b: The candidate (EPC) address.

    Returns:
        Similarity score between 0.0 and 1.0.
    """
    flat_words = {"flat", "apt", "apartment", "unit"}

    def extract_number_sequence(s: str) -> list[str]:
        return re.findall(r"\d+[a-z]?", s)

    a_norm = normalise_address(a)
    b_norm = normalise_address(b)

    # --- hard signal: numbers ---
    seq_a = extract_number_sequence(a_norm)
    seq_b = extract_number_sequence(b_norm)
    nums_a = set(seq_a)
    nums_b = set(seq_b)

    if nums_a and not nums_b:
        return 0.0

    # No shared numbers at all -> impossible match
    if nums_a and nums_b and nums_a.isdisjoint(nums_b):
        return 0.0

    toks_a = set(a_norm.split())
    toks_b = set(b_norm.split())

    # --- order-sensitive flat/building guard ---
    # Match whole tokens, not substrings: a substring test would treat
    # "unity" as "unit" or "chapter" as "apt" and silently disable the guard.
    has_flat_token_user = not toks_a.isdisjoint(flat_words)
    has_flat_token_epc = "flat" in toks_b

    if (
        len(seq_a) == 2
        and len(seq_b) >= 2
        and has_flat_token_epc
        and not has_flat_token_user
        and seq_a != seq_b[:2]
    ):
        return 0.0

    # --- token similarity (order-independent) ---
    if not toks_a or not toks_b:
        token_score = 0.0
    else:
        token_score = len(toks_a & toks_b) / len(toks_a | toks_b)

    # --- character similarity (soft signal) ---
    char_score = SequenceMatcher(None, a_norm, b_norm).ratio()

    # --- weighted blend ---
    return round(
        0.65 * token_score + 0.35 * char_score,
        4,
    )
# Token-level synonym table used by normalise_address. Built once at import
# rather than on every call. Keys never contain punctuation because it is
# stripped before tokens are looked up. An empty replacement drops the token.
_ADDRESS_SYNONYMS = {
    # street types
    "rd": "road",
    "st": "street",
    "ave": "avenue",
    "ln": "lane",
    "cres": "crescent",
    "ct": "court",
    "dr": "drive",
    # flats / units
    "apt": "flat",
    "apartment": "flat",
    "unit": "flat",
    "ste": "suite",
    # numbering noise
    "no": "",
}


def normalise_address(s: str) -> str:
    """
    Canonical UK-focused address normalisation.

    - Lowercases
    - Splits a single letter suffix from a number ("28a" -> "28 a")
    - Removes punctuation (keeps / for flats)
    - Normalises whitespace
    - Applies synonym compression at token level

    Args:
        s: Raw address. ``None``, NaN and other falsy values give "";
           other non-string values are converted with ``str()``.

    Returns:
        The normalised address as a single space-separated string.
    """
    # Missing values: None, float NaN (the only value with s != s), "" or 0.
    if s is None or (isinstance(s, float) and s != s) or not s:
        return ""
    if not isinstance(s, str):
        s = str(s)

    # 1. lowercase
    s = s.lower()
    # 1.5 split digit-letter suffixes
    s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s)
    # 2. remove punctuation except /
    s = re.sub(r"[^\w\s/]", " ", s)
    # 3. normalise whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # 4. tokenise + synonym normalisation (empty replacements are dropped)
    replaced = (_ADDRESS_SYNONYMS.get(tok, tok) for tok in s.split())
    return " ".join(tok for tok in replaced if tok)
def score_addresses(
    df: pd.DataFrame,
    user_address: str,
    column: str = "address",
) -> pd.Series:
    """
    Score every address in ``df[column]`` against ``user_address``.

    Raises:
        ValueError: if ``column`` is not present in ``df``.
    """
    if column in df.columns:
        candidates = df[column]
        return candidates.apply(lambda candidate: levenshtein(user_address, candidate))
    raise ValueError(f"Missing column: {column}")
def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
    """
    Recursively fetch EPC data by postcode.

    If the number of rows returned equals ``size`` the result may be
    truncated, so the request is retried with double the size, up to
    ``max_attempts`` attempts in total.

    Args:
        postcode: Postcode passed to the EPC domestic search.
        size: Requested page size; a falsy value omits the size parameter.
        attempt: Current attempt number (used by the recursion).
        max_attempts: Maximum number of attempts before giving up on growing.

    Returns:
        DataFrame built from the response's "rows" and "column-names".
    """
    client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    # Join URL segments as plain strings: os.path.join is for filesystem
    # paths and would insert a backslash separator on Windows.
    url = client.domestic.host.rstrip("/") + "/search"
    if size:
        url += "?" + urlencode({"size": size})

    search_resp = client.domestic.call(
        url=url,
        method="get",
        params={"postcode": postcode},
    )
    results_df = pd.DataFrame(search_resp["rows"], columns=search_resp["column-names"])
    row_count = len(results_df)

    # If we hit the size limit, there *may* be more results
    if row_count == size:
        print(
            f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. "
            f"Attempt {attempt}/{max_attempts}."
        )
        if attempt < max_attempts:
            print(f"🔁 Retrying with size={size * 2}")
            return get_epc_data_with_postcode(
                postcode=postcode,
                size=size * 2,
                attempt=attempt + 1,
                max_attempts=max_attempts,
            )
        else:
            print(
                "🚨 Max attempts reached. Results may be truncated. "
                "(Please do a manual review by the tech team.)"
            )
    return results_df
def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
    """
    Check that the non-null values of ``df[column]`` are all the given UPRN.

    Values are compared as stripped strings. Returns False when the column is
    missing, when there are no non-null values, or when more than one
    distinct value is present.
    """
    if column not in df.columns:
        return False

    # Distinct non-null UPRNs, normalised to stripped strings.
    distinct = set(df[column].dropna().astype(str).str.strip())

    # True only for exactly one distinct value that equals the target.
    return distinct == {str(uprn)}
def get_uprn_candidates(
    df: pd.DataFrame,
    user_address: str,
    address_column: str = "address",
    uprn_column: str = "uprn",
) -> pd.DataFrame:
    """
    Annotate EPC results with similarity scores and ranks.

    Works on a copy of ``df`` and adds:
      - ``lexiscore``: similarity between the user address and each row;
      - ``lexirank``: dense rank of ``lexiscore`` (1 = best match).
    The UPRN column is normalised to strings with a trailing ".0" removed;
    missing UPRNs become "" rather than the literal text "nan"/"None".

    Returns a DataFrame sorted best match first.
    DOES NOT choose or return a UPRN.

    Raises:
        ValueError: if ``address_column`` or ``uprn_column`` is missing.
    """
    if address_column not in df.columns:
        raise ValueError(f"Missing column: {address_column}")
    if uprn_column not in df.columns:
        raise ValueError(f"Missing column: {uprn_column}")

    out = df.copy()
    user_norm = normalise_address(user_address)

    # Force a float column so ranking also works when there are no rows.
    out["lexiscore"] = (
        out[address_column].apply(lambda x: levenshtein(user_norm, x)).astype(float)
    )

    # Normalise UPRN to string. Record missing values first: astype(str)
    # would otherwise turn them into text that looks like a real value.
    missing_uprn = out[uprn_column].isna()
    out[uprn_column] = (
        out[uprn_column]
        .astype(str)
        .str.replace(r"\.0$", "", regex=True)
        .mask(missing_uprn, "")
    )

    # Rank: 1 = best match
    out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int)

    return out.sort_values(
        ["lexirank", "lexiscore"],
        ascending=[True, False],
    )
def get_uprn(user_inputed_address: str, postcode: str):
    """
    Resolve a user-supplied address to a UPRN via an EPC postcode search.

    Returns:
        The UPRN (str) when the best-scoring EPC rows agree on a single UPRN.
        None in every other case:
          - the postcode search returned no rows;
          - the best similarity score is <= 0;
          - the top-ranked rows do not agree on one UPRN (ambiguous);
          - an EPC row matched but carries no UPRN.
    """
    # Text forms a missing UPRN can take after string conversion.
    missing_markers = {"", "nan", "None", "<NA>"}

    df = get_epc_data_with_postcode(postcode=postcode)
    if df.empty:
        return None

    scored_df = get_uprn_candidates(
        df,
        user_address=user_inputed_address,
    )

    # Best score
    best_score = scored_df.iloc[0]["lexiscore"]
    if best_score <= 0:
        return None

    # All rank-1 rows (possible draw)
    top_rank_df = scored_df[scored_df["lexirank"] == 1]

    # If rank-1 rows do not agree on a single UPRN -> ambiguous
    if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]):
        return None

    address = top_rank_df["address"].values[0]
    lexiscore = float(top_rank_df["lexiscore"].values[0])
    logger.info(f"Address found to be: {address}, with lexiscore {lexiscore}")

    # Safe to return the agreed UPRN, unless the matched EPC has none.
    found_uprn = top_rank_df.iloc[0]["uprn"]
    if str(found_uprn).strip() in missing_markers:
        return None
    return found_uprn
def resolve_uprns_for_postcode_group(
    group_df: pd.DataFrame,
    epc_df: pd.DataFrame,
    address_col: str = "Address 1",
) -> pd.DataFrame:
    """
    Resolve a UPRN for every row of a same-postcode group.

    Given:
      - group_df: rows sharing the same postcode
      - epc_df: EPC search results for that postcode

    Returns:
      group_df (index reset) plus the columns found_uprn, best_match_uprn,
      best_match_address, best_match_lexiscore and status, where status is
      one of "no_epc_candidates", "zero_score", "ambiguous" or "matched".
      The diagnostic columns are present even when group_df has no rows.
    """
    result_columns = [
        "found_uprn",
        "best_match_uprn",
        "best_match_address",
        "best_match_lexiscore",
        "status",
    ]

    def outcome(status, found=None, uprn=None, address=None, score=None):
        # One diagnostics record; keys follow result_columns order.
        return dict(zip(result_columns, (found, uprn, address, score, status)))

    results = []
    for _, row in group_df.iterrows():
        user_address = str(row[address_col]).strip()
        scored_df = get_uprn_candidates(
            epc_df,
            user_address=user_address,
        )

        if scored_df.empty:
            results.append(outcome("no_epc_candidates"))
            continue

        best_score = scored_df.iloc[0]["lexiscore"]
        if best_score <= 0:
            results.append(outcome("zero_score", score=best_score))
            continue

        # All rank-1 rows; a draw is only acceptable if they share one UPRN.
        top_rank_df = scored_df[scored_df["lexirank"] == 1]
        top_row = top_rank_df.iloc[0]

        if not df_has_single_uprn(top_rank_df, top_row["uprn"]):
            results.append(
                outcome(
                    "ambiguous",
                    uprn=top_row["uprn"],
                    address=top_row["address"],
                    score=best_score,
                )
            )
            continue

        results.append(
            outcome(
                "matched",
                found=str(top_row["uprn"]),
                uprn=str(top_row["uprn"]),
                address=top_row["address"],
                score=best_score,
            )
        )

    # Explicit columns keep the output schema stable for an empty group.
    diagnostics = pd.DataFrame(results, columns=result_columns)
    return pd.concat(
        [group_df.reset_index(drop=True), diagnostics],
        axis=1,
    )
def test(a, b):
assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}"
def run_all_test():
    """
    Ad-hoc smoke checks against the EPC service.

    Every call goes through get_epc_data_with_postcode, so valid EPC
    credentials and network access are required. Cases with no usable
    match expect None, which is what get_uprn returns for them.
    """
    # Basic usage with different postcode styles
    test(get_epc_data_with_postcode("b93 8sy").shape[0], 63)
    test(get_epc_data_with_postcode("B938sy").shape[0], 63)
    test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63)

    test(get_uprn("68", "b93 8sy"), "100070989938")
    test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938")
    test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633")
    test(get_uprn("28 A", "se6 4tf"), "100023278633")
    test(get_uprn("28A", "se6 4tf"), "100023278633")
    test(get_uprn("6 Aitken Close", "E8 4SQ"), None)

    # unique case
    test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198")
    test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198")
    test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198")
    test(get_uprn("1, 5 Semley Gate", "e9 5nh"), None)
    test(
        get_uprn("1 Semley Gate", "e9 5nh"), "10008238188"
    )  # this one returns "flat 1, in 1 semley gate"

    # Oswald Street: no UPRN is expected to be resolved for these inputs.
    test(get_uprn("48 Oswald Street", "E5 0BT"), None)
    test(get_uprn("42 Oswald Street", "E5 0BT"), None)
    test(get_uprn("46 Oswald Street", "E5 0BT"), None)

    # Result is discarded; kept as a hook for inspecting candidates in a debugger.
    get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street")
if __name__ == "__main__":
    # Audit script: resolve a UPRN for every row of the input workbook and
    # write the rows whose result differs from the recorded UPRN to Excel.
    INPUT_FILE = "hackney.xlsx"
    ADDRESS_COL = "Address 1"
    POSTCODE_COL = "Postcode"
    UPRN_COL = "UPRN"

    def _failure_row(row, status, found=None, uprn=None, address=None, score=None, **extra):
        # Input row plus the diagnostic fields, in output column order.
        record = row.to_dict()
        record.update(
            {
                "found_uprn": found,
                "best_match_uprn": uprn,
                "best_match_address": address,
                "best_match_lexiscore": score,
                "status": status,
            }
        )
        record.update(extra)
        return record

    df = pd.read_excel(INPUT_FILE)
    failures = []

    progress = tqdm(
        df.iterrows(),
        total=len(df),
        desc="Auditing UPRNs",
    )
    for _, row in progress:
        input_address = str(row[ADDRESS_COL]).strip()
        postcode = str(row[POSTCODE_COL]).strip()
        if pd.isna(row[UPRN_COL]):
            expected_uprn = None
        else:
            expected_uprn = str(int(row[UPRN_COL]))

        try:
            epc_df = get_epc_data_with_postcode(postcode)
            if epc_df.empty:
                failures.append(_failure_row(row, "no_epc_results"))
                continue

            scored_df = get_uprn_candidates(
                epc_df,
                user_address=input_address,
            )
            # Best candidate, recorded for diagnostics even if it is rejected.
            best_row = scored_df.iloc[0]
            best_match_uprn = str(best_row["uprn"])
            best_match_address = best_row["address"]
            best_match_lexiscore = round(float(best_row["lexiscore"]), 4)

            found_uprn = get_uprn(input_address, postcode)
        except Exception as e:
            failures.append(_failure_row(row, "exception", error=str(e)))
            continue

        found_uprn_norm = str(found_uprn) if found_uprn else None
        if found_uprn_norm == expected_uprn:
            continue

        failures.append(
            _failure_row(
                row,
                "no_match" if found_uprn_norm is None else "mismatch",
                found=found_uprn_norm,
                uprn=best_match_uprn,
                address=best_match_address,
                score=best_match_lexiscore,
            )
        )

    failures_df = pd.DataFrame(failures)

    print("===================================")
    print(f"Total rows : {len(df)}")
    print(f"Failures : {len(failures_df)}")
    print("===================================")

    failures_df.to_excel(
        "hackney_uprn_failures.xlsx",
        index=False,
    )
# TO do function dispatcher,
# get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"Flat 1, 5 Semley Gate" and Flat 5, 1 Semley Gate)
# fix that
# Look again at flat 1
# pandas reader the seperate postcode_splitter
# dump into s3

View file

@ -0,0 +1,17 @@
import pandas as pd

# Audit failures; joined on the "Address 1" column.
junte_df = pd.read_excel("hackney_uprn_failures.xlsx")

# Reference workbook; joined on the "domna_address_1" column.
khalim_df = pd.read_excel("khalim_standard.xlsx")

# Left join: every failure row is kept, with reference columns where the address matches.
combined_df = pd.merge(
    junte_df,
    khalim_df,
    how="left",
    left_on="Address 1",
    right_on="domna_address_1",
)

# Keep the failure rows for which the reference workbook has an epc_os_uprn value.
result = combined_df[combined_df["epc_os_uprn"].notna()]

View file

@ -0,0 +1,40 @@
# tests/test_address_to_uprn_csv.py
import csv
import pytest
from pathlib import Path
from backend.address2UPRN.main import get_uprn
FIXTURE_PATH = Path(__file__).parent / "test_data.csv"
def load_test_cases():
    """
    Build one pytest parameter set per row of the CSV fixture.

    Each case is (user input, postcode, manual UPRN code), with an id of
    the form "<user input> [<postcode>]".
    """
    cases = []
    with open(FIXTURE_PATH, newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            user_input = row["User Input"]
            postcode = row["Postcode"]
            cases.append(
                pytest.param(
                    user_input,
                    postcode,
                    row["Manual UPRN Code"],
                    id=f"{user_input} [{postcode}]",
                )
            )
    return cases
@pytest.mark.parametrize(
    "user_input,postcode,expected_uprn",
    load_test_cases(),
)
def test_uprn_resolution_matches_manual(
    user_input: str,
    postcode: str,
    expected_uprn: str,
):
    """
    get_uprn must reproduce the manually recorded UPRN for each fixture row.

    The fixture spells "no UPRN expected" as the literal text "None".
    NOTE(review): get_uprn performs a live EPC search, so this test needs
    network access and valid EPC credentials.
    """
    expected = None if expected_uprn == "None" else expected_uprn
    uprn = get_uprn(user_input, postcode)
    assert uprn == expected

View file

@ -0,0 +1,167 @@
User Input,Postcode,Manual UPRN Code
47 The Fairway,OX16 0RR,100120771697
11 REGENT COURT,SL1 3LG,100081041562
3/137a Windmill Road,TW8 9NH,100021516998
Flat 33,SW18 4BE,100023328943
FLAT 1 Brendon Grove,N2 8JE,200013412
Flat 15,KT8 2NE,100062123759
FLAT 5 Stonehill Road,W4 3AH,100021589829
10 Douglas Court,SL7 1UQ,100081278099
1 Windmill Road,HP17 8JA,766034606
31 Denewood,HP13 7LH,100081095964
"10, Greenways Drive",TW4 5DD,10091597009
Flat 10,W4 3AH,"100021589834"
Flat 11,TW4 5DD,10091597010
Flat 11,W4 3AH,100021589835
"12, Greenways Drive",TW4 5DD,10091597011
"Flat 12, Forbes House",W4 3AH,100021589836
FLAT 1 Goodstone Court,HA1 4FL,10070269053
Flat 13,TW4 5DD,10091597012
Flat 13,W4 3AH,100021589837
Flat 14,TW4 5DD,10091597013
Flat 14,W4 3AH,100021589838
Flat 15,TW4 5DD,10091597014
Flat 15,W4 3AH,100021589839
Flat 16,TW4 5DD,"10091597015"
Flat 16,W4 3AH,100021589840
Flat 17,TW4 5DD,10091597016
Flat 17,W4 3AH,100021589841
Flat 18,TW4 5DD,10091597017
Flat 19,W4 3AH,100021589843
Flat 20,W4 3AH,100021589844
Flat 21,W4 3AH,100021589845
Flat 22,W4 3AH,100021589846
FLAT 2 Goodstone Court,HA1 4FL,10070269054
Flat 23,W4 3AH,100021589847
Flat 24,W4 3AH,100021589848
"30c, Bosanquet Close",UB8 3PE,100021475316
"30e, Bosanquet Close",UB8 3PE,100021475318
FLAT 3 Goodstone Court,HA1 4FL,10070269055
FLAT 4 Goodstone Court,HA1 4FL,10070269056
FLAT 5 Goodstone Court,HA1 4FL,10070269057
FLAT 6 Goodstone Court,HA1 4FL,10070269058
FLAT 7 Goodstone Court,HA1 4FL,10070269059
FLAT 8 Goodstone Court,HA1 4FL,10070269060
FLAT 9 Goodstone Court,HA1 4FL,10070269061
FLAT 10 Goodstone Court,HA1 4FL,10070269062
FLAT 11 Goodstone Court,HA1 4FL,10070269063
FLAT 12 Goodstone Court,HA1 4FL,10070269064
FLAT 13 Goodstone Court,HA1 4FL,10070269065
FLAT 14 Goodstone Court,HA1 4FL,10070269066
FLAT 15 Goodstone Court,HA1 4FL,10070269067
FLAT 16 Goodstone Court,HA1 4FL,10070269068
FLAT 17 Goodstone Court,HA1 4FL,10070269069
FLAT 18 Goodstone Court,HA1 4FL,10070269070
FLAT 19 Goodstone Court,HA1 4FL,10070269071
FLAT 20 Goodstone Court,HA1 4FL,10070269072
FLAT 21 Goodstone Court,HA1 4FL,10070269073
FLAT 22 Goodstone Court,HA1 4FL,10070269074
FLAT 23 Goodstone Court,HA1 4FL,10070269075
FLAT 24 Goodstone Court,HA1 4FL,10070269076
FLAT 25 Goodstone Court,HA1 4FL,10070269077
FLAT 26 Goodstone Court,HA1 4FL,10070269078
FLAT 27 Goodstone Court,HA1 4FL,10070269079
FLAT 28 Goodstone Court,HA1 4FL,10070269080
FLAT 29 Goodstone Court,HA1 4FL,10070269081
FLAT 30 Goodstone Court,HA1 4FL,10070269082
FLAT 31 Goodstone Court,HA1 4FL,10070269083
FLAT 32 Goodstone Court,HA1 4FL,10070269084
FLAT 33 Goodstone Court,HA1 4FL,10070269085
FLAT 34 Goodstone Court,HA1 4FL,10070269086
FLAT 35 Goodstone Court,HA1 4FL,10070269087
FLAT 36 Goodstone Court,HA1 4FL,10070269088
FLAT 37 Goodstone Court,HA1 4FL,10070269089
FLAT 38 Goodstone Court,HA1 4FL,10070269090
FLAT 39 Goodstone Court,HA1 4FL,10070269091
FLAT 40 Goodstone Court,HA1 4FL,10070269092
FLAT 41 Goodstone Court,HA1 4FL,10070269093
FLAT 42 Goodstone Court,HA1 4FL,10070269094
FLAT 43 Goodstone Court,HA1 4FL,10070269095
"13 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778260
"14 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778259
"15 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778258
"16 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778263
"17 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778262
"18 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778261
"19 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778266
"20 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778265
"21 Stubwick Court, Old Saw Mill Place",HP6 6FF,10013778264
90a Murray Road,W5 4DA,12135293
"Flat 1, 6 Wolverton Gardens",W5 3LJ,"12119972"
"1, Monsted House",UB1 1FG,12189944
"10, Monsted House",UB1 1FG,12189953
"20, Monsted House",UB1 1FG,12189963
"2, Monsted House",UB1 1FG,12189945
"3, Monsted House",UB1 1FG,12189946
"4, Monsted House",UB1 1FG,12189947
"5, Monsted House",UB1 1FG,12189948
"6, Monsted House",UB1 1FG,12189949
"7, Monsted House",UB1 1FG,12189950
"8, Monsted House",UB1 1FG,12189951
"9, Monsted House",UB1 1FG,12189952
"1 Cullis House, 1, Accolade Avenue",UB1 1FH,12189904
"2 Cullis House, 1, Accolade Avenue",UB1 1FH,12189905
"3 Cullis House, 1, Accolade Avenue",UB1 1FH,12189906
"4 Cullis House, 1, Accolade Avenue",UB1 1FH,12189907
"5 Cullis House, 1, Accolade Avenue",UB1 1FH,12189908
"6 Cullis House, 1, Accolade Avenue",UB1 1FH,12189909
1 Genteel House Samara Drive,UB1 1FJ,12189835
2 Genteel House Samara Drive,UB1 1FJ,12189836
3 Genteel House Samara Drive,UB1 1FJ,12189837
4 Genteel House Samara Drive,UB1 1FJ,12189838
5 Genteel House Samara Drive,UB1 1FJ,12189839
6 Genteel House Samara Drive,UB1 1FJ,12189840
7 Genteel House Samara Drive,UB1 1FJ,12189841
8 Genteel House Samara Drive,UB1 1FJ,12189842
9 Genteel House Samara Drive,UB1 1FJ,12189843
10 Genteel House Samara Drive,UB1 1FJ,12189844
1 ASH TREE HOUSE,SE5 0TE,10009803979
3 ASH TREE HOUSE,SE5 0TE,10009803981
5 ASH TREE HOUSE,SE5 0TE,10009803983
8 ASH TREE HOUSE,SE5 0TE,10009803986
12 ASH TREE HOUSE,SE5 0TE,10009803990
FLAT 1 599 HARROW ROAD,W10 4RA,217113930
FLAT 2 599 HARROW ROAD,W10 4RA,217113931
FLAT 3 599 HARROW ROAD,W10 4RA,None
FLAT 4 599 HARROW ROAD,W10 4RA,None
FLAT 5 599 HARROW ROAD,W10 4RA,217113934
FLAT 6 599 HARROW ROAD,W10 4RA,None
FLAT 7 599 HARROW ROAD,W10 4RA,None
FLAT 8 599 HARROW ROAD,W10 4RA,None
"Flat 1, Ohio Building",SE13 7RX,10023226256
"Flat 2, Ohio Building",SE13 7RX,10023226257
"Apartment 1 Block B, 105, Benwell Road",N7 7BW,10012792307
"Apartment 2 Block B, 105, Benwell Road",N7 7BW,10012792308
"Apartment 3 Block B, 105, Benwell Road",N7 7BW,10012792309
"Apartment 4 Block B, 105, Benwell Road",N7 7BW,10012792310
"Apartment 5 Block B, 105, Benwell Road",N7 7BW,10012792311
"Apartment 6 Block B, 105, Benwell Road",N7 7BW,10012792312
"Apartment 7 Block B, 105, Benwell Road",N7 7BW,10012792313
"Apartment 8 Block B, 105, Benwell Road",N7 7BW,10012792314
"Apartment 9 Block B, 105, Benwell Road",N7 7BW,10012792315
"Apartment 10 Block B, 105, Benwell Road",N7 7BW,10012792316
"Apartment 11 Block B, 105, Benwell Road",N7 7BW,10012792317
"Apartment 12 Block B, 105, Benwell Road",N7 7BW,10012792318
"Apartment 13 Block B, 105, Benwell Road",N7 7BW,10012792319
"Apartment 1 Block D, 32, Hornsey Road",N7 7AT,10012792366
"Apartment 2 Block D, 32, Hornsey Road",N7 7AT,10012792367
"Apartment 3 Block D, 32, Hornsey Road",N7 7AT,10012792368
"Apartment 4 Block D, 32, Hornsey Road",N7 7AT,10012792369
"Apartment 5 Block D, 32, Hornsey Road",N7 7AT,10012792370
"Apartment 6 Block D, 32, Hornsey Road",N7 7AT,"10012792371"
"Apartment 7 Block D, 32, Hornsey Road",N7 7AT,10012792372
"Apartment 8 Block D, 32, Hornsey Road",N7 7AT,10012792373
"Apartment 9 Block D, 32, Hornsey Road",N7 7AT,10012792374
"Apartment 10 Block D, 32, Hornsey Road",N7 7AT,10012792375
"Apartment 11 Block D, 32, Hornsey Road",N7 7AT,10012792376
"Apartment 12 Block D, 32, Hornsey Road",N7 7AT,10012792377
"Apartment 13 Block D, 32, Hornsey Road",N7 7AT,10012792378
"Apartment 14 Block D, 32, Hornsey Road",N7 7AT,10012792379
"Apartment 15 Block D, 32, Hornsey Road",N7 7AT,10012792380
"Apartment 16 Block D, 32, Hornsey Road",N7 7AT,"10012792381"
"Apartment 17Block D, 32, Hornsey Road",N7 7AT,10012792382
"Apartment 18 Block D, 32, Hornsey Road",N7 7AT,10012792383
24b Honley Road,SE6 2HZ,None
FLAT B 158 LEAHURST ROAD,SE13 5NL,100021976974
2 COLLEGE HOUSE,CM7 1JS,100091449870
3 COLLEGE HOUSE,CM7 1JS,100091449871
1 User Input Postcode Manual UPRN Code
2 47 The Fairway OX16 0RR 100120771697
3 11 REGENT COURT SL1 3LG 100081041562
4 3/137a Windmill Road TW8 9NH 100021516998
5 Flat 33 SW18 4BE 100023328943
6 FLAT 1 Brendon Grove N2 8JE 200013412
7 Flat 15 KT8 2NE 100062123759
8 FLAT 5 Stonehill Road W4 3AH 100021589829
9 10 Douglas Court SL7 1UQ 100081278099
10 1 Windmill Road HP17 8JA 766034606
11 31 Denewood HP13 7LH 100081095964
12 10, Greenways Drive TW4 5DD 10091597009
13 Flat 10 W4 3AH 100021589834
14 Flat 11 TW4 5DD 10091597010
15 Flat 11 W4 3AH 100021589835
16 12, Greenways Drive TW4 5DD 10091597011
17 Flat 12, Forbes House W4 3AH 100021589836
18 FLAT 1 Goodstone Court HA1 4FL 10070269053
19 Flat 13 TW4 5DD 10091597012
20 Flat 13 W4 3AH 100021589837
21 Flat 14 TW4 5DD 10091597013
22 Flat 14 W4 3AH 100021589838
23 Flat 15 TW4 5DD 10091597014
24 Flat 15 W4 3AH 100021589839
25 Flat 16 TW4 5DD 10091597015
26 Flat 16 W4 3AH 100021589840
27 Flat 17 TW4 5DD 10091597016
28 Flat 17 W4 3AH 100021589841
29 Flat 18 TW4 5DD 10091597017
30 Flat 19 W4 3AH 100021589843
31 Flat 20 W4 3AH 100021589844
32 Flat 21 W4 3AH 100021589845
33 Flat 22 W4 3AH 100021589846
34 FLAT 2 Goodstone Court HA1 4FL 10070269054
35 Flat 23 W4 3AH 100021589847
36 Flat 24 W4 3AH 100021589848
37 30c, Bosanquet Close UB8 3PE 100021475316
38 30e, Bosanquet Close UB8 3PE 100021475318
39 FLAT 3 Goodstone Court HA1 4FL 10070269055
40 FLAT 4 Goodstone Court HA1 4FL 10070269056
41 FLAT 5 Goodstone Court HA1 4FL 10070269057
42 FLAT 6 Goodstone Court HA1 4FL 10070269058
43 FLAT 7 Goodstone Court HA1 4FL 10070269059
44 FLAT 8 Goodstone Court HA1 4FL 10070269060
45 FLAT 9 Goodstone Court HA1 4FL 10070269061
46 FLAT 10 Goodstone Court HA1 4FL 10070269062
47 FLAT 11 Goodstone Court HA1 4FL 10070269063
48 FLAT 12 Goodstone Court HA1 4FL 10070269064
49 FLAT 13 Goodstone Court HA1 4FL 10070269065
50 FLAT 14 Goodstone Court HA1 4FL 10070269066
51 FLAT 15 Goodstone Court HA1 4FL 10070269067
52 FLAT 16 Goodstone Court HA1 4FL 10070269068
53 FLAT 17 Goodstone Court HA1 4FL 10070269069
54 FLAT 18 Goodstone Court HA1 4FL 10070269070
55 FLAT 19 Goodstone Court HA1 4FL 10070269071
56 FLAT 20 Goodstone Court HA1 4FL 10070269072
57 FLAT 21 Goodstone Court HA1 4FL 10070269073
58 FLAT 22 Goodstone Court HA1 4FL 10070269074
59 FLAT 23 Goodstone Court HA1 4FL 10070269075
60 FLAT 24 Goodstone Court HA1 4FL 10070269076
61 FLAT 25 Goodstone Court HA1 4FL 10070269077
62 FLAT 26 Goodstone Court HA1 4FL 10070269078
63 FLAT 27 Goodstone Court HA1 4FL 10070269079
64 FLAT 28 Goodstone Court HA1 4FL 10070269080
65 FLAT 29 Goodstone Court HA1 4FL 10070269081
66 FLAT 30 Goodstone Court HA1 4FL 10070269082
67 FLAT 31 Goodstone Court HA1 4FL 10070269083
68 FLAT 32 Goodstone Court HA1 4FL 10070269084
69 FLAT 33 Goodstone Court HA1 4FL 10070269085
70 FLAT 34 Goodstone Court HA1 4FL 10070269086
71 FLAT 35 Goodstone Court HA1 4FL 10070269087
72 FLAT 36 Goodstone Court HA1 4FL 10070269088
73 FLAT 37 Goodstone Court HA1 4FL 10070269089
74 FLAT 38 Goodstone Court HA1 4FL 10070269090
75 FLAT 39 Goodstone Court HA1 4FL 10070269091
76 FLAT 40 Goodstone Court HA1 4FL 10070269092
77 FLAT 41 Goodstone Court HA1 4FL 10070269093
78 FLAT 42 Goodstone Court HA1 4FL 10070269094
79 FLAT 43 Goodstone Court HA1 4FL 10070269095
80 13 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778260
81 14 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778259
82 15 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778258
83 16 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778263
84 17 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778262
85 18 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778261
86 19 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778266
87 20 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778265
88 21 Stubwick Court, Old Saw Mill Place HP6 6FF 10013778264
89 90a Murray Road W5 4DA 12135293
90 Flat 1, 6 Wolverton Gardens W5 3LJ 12119972
91 1, Monsted House UB1 1FG 12189944
92 10, Monsted House UB1 1FG 12189953
93 20, Monsted House UB1 1FG 12189963
94 2, Monsted House UB1 1FG 12189945
95 3, Monsted House UB1 1FG 12189946
96 4, Monsted House UB1 1FG 12189947
97 5, Monsted House UB1 1FG 12189948
98 6, Monsted House UB1 1FG 12189949
99 7, Monsted House UB1 1FG 12189950
100 8, Monsted House UB1 1FG 12189951
101 9, Monsted House UB1 1FG 12189952
102 1 Cullis House, 1, Accolade Avenue UB1 1FH 12189904
103 2 Cullis House, 1, Accolade Avenue UB1 1FH 12189905
104 3 Cullis House, 1, Accolade Avenue UB1 1FH 12189906
105 4 Cullis House, 1, Accolade Avenue UB1 1FH 12189907
106 5 Cullis House, 1, Accolade Avenue UB1 1FH 12189908
107 6 Cullis House, 1, Accolade Avenue UB1 1FH 12189909
108 1 Genteel House Samara Drive UB1 1FJ 12189835
109 2 Genteel House Samara Drive UB1 1FJ 12189836
110 3 Genteel House Samara Drive UB1 1FJ 12189837
111 4 Genteel House Samara Drive UB1 1FJ 12189838
112 5 Genteel House Samara Drive UB1 1FJ 12189839
113 6 Genteel House Samara Drive UB1 1FJ 12189840
114 7 Genteel House Samara Drive UB1 1FJ 12189841
115 8 Genteel House Samara Drive UB1 1FJ 12189842
116 9 Genteel House Samara Drive UB1 1FJ 12189843
117 10 Genteel House Samara Drive UB1 1FJ 12189844
118 1 ASH TREE HOUSE SE5 0TE 10009803979
119 3 ASH TREE HOUSE SE5 0TE 10009803981
120 5 ASH TREE HOUSE SE5 0TE 10009803983
121 8 ASH TREE HOUSE SE5 0TE 10009803986
122 12 ASH TREE HOUSE SE5 0TE 10009803990
123 FLAT 1 599 HARROW ROAD W10 4RA 217113930
124 FLAT 2 599 HARROW ROAD W10 4RA 217113931
125 FLAT 3 599 HARROW ROAD W10 4RA None
126 FLAT 4 599 HARROW ROAD W10 4RA None
127 FLAT 5 599 HARROW ROAD W10 4RA 217113934
128 FLAT 6 599 HARROW ROAD W10 4RA None
129 FLAT 7 599 HARROW ROAD W10 4RA None
130 FLAT 8 599 HARROW ROAD W10 4RA None
131 Flat 1, Ohio Building SE13 7RX 10023226256
132 Flat 2, Ohio Building SE13 7RX 10023226257
133 Apartment 1 Block B, 105, Benwell Road N7 7BW 10012792307
134 Apartment 2 Block B, 105, Benwell Road N7 7BW 10012792308
135 Apartment 3 Block B, 105, Benwell Road N7 7BW 10012792309
136 Apartment 4 Block B, 105, Benwell Road N7 7BW 10012792310
137 Apartment 5 Block B, 105, Benwell Road N7 7BW 10012792311
138 Apartment 6 Block B, 105, Benwell Road N7 7BW 10012792312
139 Apartment 7 Block B, 105, Benwell Road N7 7BW 10012792313
140 Apartment 8 Block B, 105, Benwell Road N7 7BW 10012792314
141 Apartment 9 Block B, 105, Benwell Road N7 7BW 10012792315
142 Apartment 10 Block B, 105, Benwell Road N7 7BW 10012792316
143 Apartment 11 Block B, 105, Benwell Road N7 7BW 10012792317
144 Apartment 12 Block B, 105, Benwell Road N7 7BW 10012792318
145 Apartment 13 Block B, 105, Benwell Road N7 7BW 10012792319
146 Apartment 1 Block D, 32, Hornsey Road N7 7AT 10012792366
147 Apartment 2 Block D, 32, Hornsey Road N7 7AT 10012792367
148 Apartment 3 Block D, 32, Hornsey Road N7 7AT 10012792368
149 Apartment 4 Block D, 32, Hornsey Road N7 7AT 10012792369
150 Apartment 5 Block D, 32, Hornsey Road N7 7AT 10012792370
151 Apartment 6 Block D, 32, Hornsey Road N7 7AT 10012792371
152 Apartment 7 Block D, 32, Hornsey Road N7 7AT 10012792372
153 Apartment 8 Block D, 32, Hornsey Road N7 7AT 10012792373
154 Apartment 9 Block D, 32, Hornsey Road N7 7AT 10012792374
155 Apartment 10 Block D, 32, Hornsey Road N7 7AT 10012792375
156 Apartment 11 Block D, 32, Hornsey Road N7 7AT 10012792376
157 Apartment 12 Block D, 32, Hornsey Road N7 7AT 10012792377
158 Apartment 13 Block D, 32, Hornsey Road N7 7AT 10012792378
159 Apartment 14 Block D, 32, Hornsey Road N7 7AT 10012792379
160 Apartment 15 Block D, 32, Hornsey Road N7 7AT 10012792380
161 Apartment 16 Block D, 32, Hornsey Road N7 7AT 10012792381
162 Apartment 17Block D, 32, Hornsey Road N7 7AT 10012792382
163 Apartment 18 Block D, 32, Hornsey Road N7 7AT 10012792383
164 24b Honley Road SE6 2HZ None
165 FLAT B 158 LEAHURST ROAD SE13 5NL 100021976974
166 2 COLLEGE HOUSE CM7 1JS 100091449870
167 3 COLLEGE HOUSE CM7 1JS 100091449871

View file

@ -1,3 +1,4 @@
# fastapi
fastapi==0.115.2
sqlalchemy==2.0.36
@ -12,5 +13,4 @@ boto3==1.35.44
openpyxl==3.1.2
# Basic
pytz
sqlmodel
sqlmodel

View file

@ -1,3 +1,4 @@
# Pandas and numpy
numpy==2.1.2
pandas==2.2.3
@ -22,4 +23,4 @@ pyarrow==17.0.0
fastparquet==2024.5.0
aiohttp==3.10.10
# find my epc
beautifulsoup4
beautifulsoup4

Binary file not shown.

View file

@ -0,0 +1,114 @@
import pandas as pd
import requests
from backend.address2UPRN.main import resolve_uprns_for_postcode_group, get_epc_data_with_postcode
from tqdm import tqdm
def sanitise_postcode(postcode: str) -> str | None:
"""
Normalise postcode for grouping.
- Uppercase
- Remove all whitespace
"""
if pd.isna(postcode):
return None
return postcode.upper().replace(" ", "")
def is_valid_postcode(postcode_clean: str) -> bool:
    """
    Validate a postcode using the postcodes.io validate endpoint.

    Expects a sanitised postcode (e.g. E84SQ).

    Args:
        postcode_clean: Sanitised postcode; None or empty is treated as
            invalid without making a request.

    Returns:
        True if the service reports the postcode as valid. False otherwise,
        including when the request fails, returns an error status, or the
        response body cannot be decoded as JSON.
    """
    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
    if not postcode_clean:
        return False

    # Local import so the block stays self-contained.
    from urllib.parse import quote

    # Percent-encode the postcode: dirty input containing "/", "?" or "#"
    # would otherwise change the path or query of the request URL.
    url = POSTCODES_IO_VALIDATE_URL.format(postcode=quote(str(postcode_clean), safe=""))
    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        # Coerce so callers always get a real bool, even if "result" is null.
        return bool(resp.json().get("result", False))
    except (requests.RequestException, ValueError):
        # Network issues, rate limits, etc. ValueError covers a body that is
        # not valid JSON on requests versions where that error is not a
        # RequestException subclass.
        return False
def main():
    """
    Resolve UPRNs for addresses in the "Sustainability" sheet of hackney.xlsx.

    Steps:
      1. Read the first 500 rows and sanitise their postcodes.
      2. Validate each unique postcode once (one API call per postcode).
      3. For each valid postcode, fetch EPC data and resolve UPRNs for the
         rows sharing that postcode; failures are recorded per group rather
         than aborting the run.
      4. Concatenate the per-postcode results and build comparison views.

    Returns:
        The concatenated results DataFrame, or None when no postcode passed
        validation (there is nothing to concatenate).
    """
    df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability")
    df = df.head(500)

    # Sanitise postcodes
    df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)

    # --- validate once per unique postcode (save API calls) ---
    # Get unique, non-null postcodes
    unique_postcodes = (
        df["postcode_clean"]
        .dropna()
        .unique()
    )

    # Validate each postcode once
    postcode_validity = {
        pc: is_valid_postcode(pc)
        for pc in tqdm(unique_postcodes, total=len(unique_postcodes))
    }

    # Map validity back onto the dataframe. Rows with a missing postcode are
    # not in the dict; default them to False so the column is strictly
    # boolean. A plain .map(dict) would leave NaN there and boolean indexing
    # below would raise on the NA-containing mask.
    df["postcode_valid"] = df["postcode_clean"].map(
        lambda pc: bool(postcode_validity.get(pc, False))
    )

    results = []
    for postcode, group_df in tqdm(
        df[df["postcode_valid"]].groupby("postcode_clean"),
        desc="Resolving UPRNs by postcode",
    ):
        try:
            epc_df = get_epc_data_with_postcode(postcode)
            if epc_df.empty:
                tmp = group_df.copy()
                tmp["found_uprn"] = None
                tmp["status"] = "no_epc_results"
                results.append(tmp)
                continue
            resolved = resolve_uprns_for_postcode_group(
                group_df=group_df,
                epc_df=epc_df,
            )
            results.append(resolved)
        except Exception as e:
            # Best-effort: record the failure against the group and carry on
            # with the remaining postcodes.
            tmp = group_df.copy()
            tmp["found_uprn"] = None
            tmp["status"] = "exception"
            tmp["error"] = str(e)
            results.append(tmp)

    if not results:
        # pd.concat raises ValueError on an empty list.
        return None

    final_df = pd.concat(results, ignore_index=True)

    view_columns = [
        "best_match_lexiscore", "Address 1",
        "best_match_address", "Postcode",
        "UPRN", "best_match_uprn",
    ]
    # reindex (rather than []-selection) so the views still build when every
    # group took the "no_epc_results"/"exception" path and the best_match_*
    # columns were never created; missing columns come back as NaN.
    # NOTE(review): best_match_* columns are presumably added by
    # resolve_uprns_for_postcode_group - confirm against that function.
    all_matches = final_df.reindex(columns=view_columns)

    # Rows with a positive match score only. Coerce to numeric so None/NaN
    # scores compare as False instead of raising.
    scores = pd.to_numeric(all_matches["best_match_lexiscore"], errors="coerce")
    positive_matches = all_matches[scores > 0]  # noqa: F841 - kept for inspection in a debugger/REPL

    return final_df


if __name__ == "__main__":
    main()

View file

@ -1,5 +1,11 @@
import os
from backend.app.config import get_settings
import os
from dotenv import load_dotenv
import os
# Load .env in conftest.py directory for local development
load_dotenv()
DEFAULT_ENV = {
"API_KEY": "test",
@ -8,7 +14,10 @@ DEFAULT_ENV = {
"DATA_BUCKET": "test",
"PLAN_TRIGGER_BUCKET": "test",
"ENGINE_SQS_URL": "test",
"EPC_AUTH_TOKEN": "test", # overridden in GitHub Actions
"EPC_AUTH_TOKEN": os.getenv(
"EPC_AUTH_TOKEN",
"test",
), # overridden in GitHub Actions
"GOOGLE_SOLAR_API_KEY": "test",
"DB_HOST": "localhost",
"DB_USERNAME": "test",

View file

@ -1,111 +1,111 @@
import pandas as pd
epc_c_recommendations = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no "
"solid floor, ashp 3.0 - corrected.xlsx"
)
epc_b_recommendations = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no "
"solid floor, ashp 3.0 - corrected.xlsx"
)
# epc_c_recommendations = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no "
# "solid floor, ashp 3.0 - corrected.xlsx"
# )
# epc_b_recommendations = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC B - no "
# "solid floor, ashp 3.0 - corrected.xlsx"
# )
epc_c_movers = epc_b_recommendations[
epc_b_recommendations["current_epc_rating"] == "Epc.C"
]
epc_c_movers["property_type"].value_counts()
# epc_c_movers = epc_b_recommendations[
# epc_b_recommendations["current_epc_rating"] == "Epc.C"
# ]
# epc_c_movers["property_type"].value_counts()
house_epc_c_movers = epc_c_movers[
epc_c_movers["property_type"] == "House"
]
house_epc_c_movers_with_solar = house_epc_c_movers[
~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"])
]
# house_epc_c_movers = epc_c_movers[
# epc_c_movers["property_type"] == "House"
# ]
# house_epc_c_movers_with_solar = house_epc_c_movers[
# ~pd.isnull(house_epc_c_movers["solar_pv"]) | ~pd.isnull(house_epc_c_movers["solar_pv_with_battery"])
# ]
house_epc_c_movers_with_a_heatpump = house_epc_c_movers[
~pd.isnull(house_epc_c_movers["air_source_heat_pump"])
]
# house_epc_c_movers_with_a_heatpump = house_epc_c_movers[
# ~pd.isnull(house_epc_c_movers["air_source_heat_pump"])
# ]
flat_epc_c_movers = epc_c_movers[
epc_c_movers["property_type"] == "Flat"
]
# flat_epc_c_movers = epc_c_movers[
# epc_c_movers["property_type"] == "Flat"
# ]
epc_c_recommendations["sap_points"].mean()
epc_c_recommendations["sap_points"].mean()
# epc_c_recommendations["sap_points"].mean()
# epc_c_recommendations["sap_points"].mean()
measure_cols = [
"air_source_heat_pump",
"boiler_upgrade",
"cavity_wall_insulation",
"double_glazing",
"external_wall_insulation",
"flat_roof_insulation",
"high_heat_retention_storage_heaters",
"internal_wall_insulation",
"loft_insulation",
"low_energy_lighting",
"mechanical_ventilation",
"room_roof_insulation",
"roomstat_programmer_trvs",
"sealing_open_fireplace",
"secondary_glazing",
"secondary_heating",
"solar_pv",
"solar_pv_with_battery",
"suspended_floor_insulation",
"time_temperature_zone_control",
]
# measure_cols = [
# "air_source_heat_pump",
# "boiler_upgrade",
# "cavity_wall_insulation",
# "double_glazing",
# "external_wall_insulation",
# "flat_roof_insulation",
# "high_heat_retention_storage_heaters",
# "internal_wall_insulation",
# "loft_insulation",
# "low_energy_lighting",
# "mechanical_ventilation",
# "room_roof_insulation",
# "roomstat_programmer_trvs",
# "sealing_open_fireplace",
# "secondary_glazing",
# "secondary_heating",
# "solar_pv",
# "solar_pv_with_battery",
# "suspended_floor_insulation",
# "time_temperature_zone_control",
# ]
epc_c_melted = (
epc_c_recommendations
.melt(
id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols],
value_vars=measure_cols,
var_name="measure_type",
value_name="value",
)
.dropna(subset=["value"])
)
epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0]
epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index()
# epc_c_melted = (
# epc_c_recommendations
# .melt(
# id_vars=[c for c in epc_c_recommendations.columns if c not in measure_cols],
# value_vars=measure_cols,
# var_name="measure_type",
# value_name="value",
# )
# .dropna(subset=["value"])
# )
# epc_c_melted = epc_c_melted[epc_c_melted["value"] > 0]
# epc_c_measures = epc_c_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index()
epc_b_melted = (
epc_b_recommendations
.melt(
id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols],
value_vars=measure_cols,
var_name="measure_type",
value_name="value",
)
.dropna(subset=["value"])
)
# epc_b_melted = (
# epc_b_recommendations
# .melt(
# id_vars=[c for c in epc_b_recommendations.columns if c not in measure_cols],
# value_vars=measure_cols,
# var_name="measure_type",
# value_name="value",
# )
# .dropna(subset=["value"])
# )
epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0]
epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index()
# epc_b_melted = epc_b_melted[epc_b_melted["value"] > 0]
# epc_b_measures = epc_b_melted["measure_type"].value_counts(normalize=True).to_frame().reset_index()
measures_compared = epc_c_measures.merge(
epc_b_measures,
left_on="measure_type",
right_on="measure_type",
suffixes=("_epc_c", "_epc_b"),
)
# measures_compared = epc_c_measures.merge(
# epc_b_measures,
# left_on="measure_type",
# right_on="measure_type",
# suffixes=("_epc_c", "_epc_b"),
# )
epc_c_retrofits = epc_c_recommendations[
epc_c_recommendations["total_retrofit_cost"] > 0
]
# epc_c_retrofits = epc_c_recommendations[
# epc_c_recommendations["total_retrofit_cost"] > 0
# ]
epc_b_retrofits = epc_b_recommendations[
epc_b_recommendations["total_retrofit_cost"] > 0
]
# epc_b_retrofits = epc_b_recommendations[
# epc_b_recommendations["total_retrofit_cost"] > 0
# ]
epc_c_retrofits["sap_points"].mean()
epc_b_retrofits["sap_points"].mean()
# epc_c_retrofits["sap_points"].mean()
# epc_b_retrofits["sap_points"].mean()
properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b"))
# properties_in_both = epc_c_retrofits.merge(epc_b_retrofits, on="uprn", suffixes=("_epc_c", "_epc_b"))
properties_in_both["total_retrofit_cost_epc_c"].mean()
properties_in_both["sap_points_epc_c"].mean()
properties_in_both["total_retrofit_cost_epc_b"].mean()
properties_in_both["sap_points_epc_b"].mean()
# properties_in_both["total_retrofit_cost_epc_c"].mean()
# properties_in_both["sap_points_epc_c"].mean()
# properties_in_both["total_retrofit_cost_epc_b"].mean()
# properties_in_both["sap_points_epc_b"].mean()
# Solar PV savings - we need the amount of solar PV bill savings
from sqlalchemy.orm import sessionmaker
@ -114,16 +114,12 @@ from backend.app.db.models.recommendations import Recommendation, Plan, PlanReco
from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel
from collections import defaultdict
PORTFOLIO_ID = 435 # Peabody
PORTFOLIO_ID = 485 # Peabody
SCENARIOS = [
908,
909,
910,
970
]
scenario_names = {
908: "EPC C - no solid floor, ashp 3.0",
909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0",
910: "EPC B - no solid floor, no EWI, ashp 3.0"
970: "EPC C - no solid floor, ashp 3.0",
}
@ -236,307 +232,266 @@ recommendations_df = pd.DataFrame(recommendations_data)
properties_df = pd.DataFrame(properties_data)
plans_df = pd.DataFrame(plans_data)
s_id = 910
ps_w_a_plan = plans_df[plans_df["scenario_id"] == s_id].copy()
# Take the newest by scenario id
ps_w_a_plan = ps_w_a_plan.sort_values("created_at", ascending=False).drop_duplicates(
subset=["property_id"]
)
z = ps_w_a_plan[
ps_w_a_plan["cost_of_works"] > 0
].copy()
z2 = properties_df[properties_df["property_id"].isin(z["property_id"].values)]
# '', 'hot_water_cost_current',
# 'lighting_cost_current', 'appliances_cost_current',
# 'gas_standing_charge', 'electricity_standing_charge'
z2["total_bills"] = z2["heating_cost_current"] + z2["hot_water_cost_current"] + z2["lighting_cost_current"] + z2[
"appliances_cost_current"
] + z2["gas_standing_charge"] + z2["electricity_standing_charge"]
with pd.ExcelWriter("hackney.xlsx", engine="openpyxl") as writer:
recommendations_df.to_excel(writer, sheet_name="recommendations", index=False)
properties_df.to_excel(writer, sheet_name="properties", index=False)
from tqdm import tqdm
# solar_pv_recommendations = recommendations_df[recommendations_df["measure_type"] == "solar_pv"]
# average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index()
# For a property ID, find a property where the no EWI/IWI approach is more expensive than the EWI approach
pids = properties_df["property_id"].unique()
for pid in tqdm(pids):
if pid in [603272, 550550, 574493]:
continue
# get the plans
property_plan = plans_df[plans_df["property_id"] == int(pid)]
# Take the newest plan by scenario id
property_plan = property_plan.sort_values("created_at", ascending=False).drop_duplicates(
subset=["scenario_id"]
)
a = property_plan[property_plan["scenario_id"] == 909].squeeze() # no EWI/IWI
b = property_plan[property_plan["scenario_id"] == 908].squeeze() # EWI
if (a["cost_of_works"] > b["cost_of_works"]) and (
a["post_epc_rating"].value == "C") and (b["cost_of_works"] > 5000):
bah
# # Check tenures
# initial_asset_data = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
# "- Data Extracts for Domna.xlsx",
# sheet_name="Properties"
# )
# sustainability_data = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
# "- Data Extracts for Domna.xlsx",
# sheet_name="Sustainability"
# )
solar_pv_recommendations = recommendations_df[
recommendations_df["measure_type"] == "solar_pv"
]
# sustainability_sample = sustainability_data[
# sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values)
# ]
solid_wall_recommendation = recommendations_df[
recommendations_df["scenario_id"].isin([908]) &
recommendations_df["measure_type"].isin(["internal_wall_insulation"]) &
recommendations_df["default"]
]
average_savings = solar_pv_recommendations.groupby("scenario_id")["energy_cost_savings"].mean().reset_index()
# Add on scenario names
average_savings["scenario_name"] = average_savings["scenario_id"].map(scenario_names)
# sustainability_sample = sustainability_sample.merge(
# initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset")
# )
# Check tenures
initial_asset_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
"- Data Extracts for Domna.xlsx",
sheet_name="Properties"
)
sustainability_data = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/2025_11_11 - Peabody "
"- Data Extracts for Domna.xlsx",
sheet_name="Sustainability"
)
# block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False)
# block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False)
sustainability_sample = sustainability_data[
sustainability_data["UPRN"].isin(properties_df["uprn"].astype(int).astype(str).values)
]
# initial_asset_data.columns
# initial_asset_data["LeaseType"].value_counts()
sustainability_sample = sustainability_sample.merge(
initial_asset_data, left_on="Org Ref", right_on="UPRN", suffixes=("_sustainability", "_initial_asset")
)
# # sustainability_sample["Tenure Group"].value_counts()
# # Tenure Group
# # General Needs 57787
# # Home Ownership 25471
# # Care & Supported Housing 4239
# # Rental 2677
# # Other 188
block_sizes = initial_asset_data["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False)
block_sizes.to_excel("/Users/khalimconn-kowlessar/Downloads/peabody_block_sizes.xlsx", index=False)
# df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index()
# df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False)
initial_asset_data.columns
initial_asset_data["LeaseType"].value_counts()
# tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index()
# tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False)
# sustainability_sample["Tenure Group"].value_counts()
# Tenure Group
# General Needs 57787
# Home Ownership 25471
# Care & Supported Housing 4239
# Rental 2677
# Other 188
# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts()
df = sustainability_sample["Ownership Type"].value_counts().to_frame().reset_index()
df.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenures.xlsx", index=False)
# sample_data = initial_asset_data[
# ~initial_asset_data["Ownership Type"].isin(
# [
# # Commercial # Everything is resi - based on the Residential Indicator variable - all are true
# # Freeholder
# "FREEHOLDER", # 19517 properties
# # HOMEBUY / EQUITY LOAN
# "Rent to Homebuy", # 1 property
# # Leaseholder
# "LEASEHOLD 100%", # 8455 properties
# "Owned and Managed - 999 year lease", # 2076 properties
# "Managed but not Owned-Private Lease", # 159 properties
# "Owned and managed LEASEHOLD", # 26 properties
# # Outright Sale - can't find anything matching
# # SHARED EQUITY
# "Shared Ownership", # 4065 properties
# "Shared Ownership Owned Not Managed", # 23 properties
# # Extra categories which seem sensible to exclude
# "NOT MANAGED AND NOT OWNED"
# ]
# )
# ]
tenure_groups = sustainability_sample["Tenure Group"].value_counts().to_frame().reset_index()
tenure_groups.to_excel("/Users/khalimconn-kowlessar/Downloads/sustainability_tenure_groups.xlsx", index=False)
# sample_data["Ownership Type"].value_counts()
initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Tenure Group"].value_counts()
# sample_data = initial_asset_data[
# initial_asset_data["Ownership Type"].isin(
# [
# "Owned and Managed",
# "Owned and Managed - 999 year lease",
# "Owned and managed LEASEHOLD",
# "LEASEHOLD 100%",
# "DATALOAD DEFAULT"
# ]
# )
# ]
# dropped = initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)]
# dropped["Ownership Type"].value_counts()
sample_data = initial_asset_data[
~initial_asset_data["Ownership Type"].isin(
[
# Commercial # Everything is resi - based on the Residential Indicator variable - all are true
# Freeholder
"FREEHOLDER", # 19517 properties
# HOMEBUY / EQUITY LOAN
"Rent to Homebuy", # 1 property
# Leaseholder
"LEASEHOLD 100%", # 8455 properties
"Owned and Managed - 999 year lease", # 2076 properties
"Managed but not Owned-Private Lease", # 159 properties
"Owned and managed LEASEHOLD", # 26 properties
# Outright Sale - can't find anything matching
# SHARED EQUITY
"Shared Ownership", # 4065 properties
"Shared Ownership Owned Not Managed", # 23 properties
# Extra categories which seem sensible to exclude
"NOT MANAGED AND NOT OWNED"
]
)
]
# for value in [
# # Commercial # Everything is resi, so should be fine. No matches
# # Freeholder
# "FREEHOLDER", # 19517 properties
# # HOMEBUY / EQUITY LOAN
# "Rent to Homebuy", # 1 property
# # Leaseholder
# "LEASEHOLD 100%", # 8455 properties
# "Owned and Managed - 999 year lease", # 2076 properties
# "Managed but not Owned-Private Lease", # 159 properties
# "Owned and managed LEASEHOLD", # 26 properties
# # Outright Sale - can't find anything matching
# # SHARED EQUITY
# "Shared Ownership", # 4065 properties
# "Shared Ownership Owned Not Managed", # 23 properties
# ]:
# print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0])
sample_data["Ownership Type"].value_counts()
# house_types = [
# "HOUSE",
# "BUNGALOW",
# "MAISONETTE",
# "DUPLEX",
# ]
sample_data = initial_asset_data[
initial_asset_data["Ownership Type"].isin(
[
"Owned and Managed",
"Owned and Managed - 999 year lease",
"Owned and managed LEASEHOLD",
"LEASEHOLD 100%",
"DATALOAD DEFAULT"
]
)
]
dropped = initial_asset_data[~initial_asset_data["UPRN"].isin(sample_data["UPRN"].values)]
dropped["Ownership Type"].value_counts()
# guaranteed_control = [
# "Owned and Managed",
# "Owned and Managed - 999 year lease",
# "Owned and managed LEASEHOLD",
# "LEASEHOLD 100%",
# "DATALOAD DEFAULT",
# ]
for value in [
# Commercial # Everything is resi, so should be fine. No matches
# Freeholder
"FREEHOLDER", # 19517 properties
# HOMEBUY / EQUITY LOAN
"Rent to Homebuy", # 1 property
# Leaseholder
"LEASEHOLD 100%", # 8455 properties
"Owned and Managed - 999 year lease", # 2076 properties
"Managed but not Owned-Private Lease", # 159 properties
"Owned and managed LEASEHOLD", # 26 properties
# Outright Sale - can't find anything matching
# SHARED EQUITY
"Shared Ownership", # 4065 properties
"Shared Ownership Owned Not Managed", # 23 properties
]:
print(initial_asset_data[initial_asset_data["Ownership Type"] == value].shape[0])
# sample_data = initial_asset_data[
# (
# initial_asset_data["Ownership Type"].isin(guaranteed_control)
# )
# |
# (
# (initial_asset_data["Ownership Type"] == "FREEHOLDER")
# &
# (initial_asset_data["Property Type"].isin(house_types))
# )
# ]
house_types = [
"HOUSE",
"BUNGALOW",
"MAISONETTE",
"DUPLEX",
]
# fabric_retrofit_sample = initial_asset_data[
# initial_asset_data["Ownership Type"].isin(
# [
# "Owned and Managed",
# "FREEHOLDER",
# "DATALOAD DEFAULT",
# ]
# )
# ]
guaranteed_control = [
"Owned and Managed",
"Owned and Managed - 999 year lease",
"Owned and managed LEASEHOLD",
"LEASEHOLD 100%",
"DATALOAD DEFAULT",
]
# initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts()
# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts()
sample_data = initial_asset_data[
(
initial_asset_data["Ownership Type"].isin(guaranteed_control)
)
|
(
(initial_asset_data["Ownership Type"] == "FREEHOLDER")
&
(initial_asset_data["Property Type"].isin(house_types))
)
]
# initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts()
# z = initial_asset_data[
# ~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types)
# ]
fabric_retrofit_sample = initial_asset_data[
initial_asset_data["Ownership Type"].isin(
[
"Owned and Managed",
"FREEHOLDER",
"DATALOAD DEFAULT",
]
)
]
# block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False)
# zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"]
initial_asset_data[pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts()
initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Ownership Type"].value_counts()
# potential_sample = initial_asset_data[
# ~pd.isnull(initial_asset_data["BlockCode"])
# ]
initial_asset_data[~pd.isnull(initial_asset_data["BlockCode"])]["Property Type"].value_counts()
z = initial_asset_data[
~pd.isnull(initial_asset_data["BlockCode"]) & initial_asset_data["Property Type"].isin(house_types)
]
# compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge(
# initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(),
# left_on="Property Type",
# right_on="Property Type",
# suffixes=("_on_block_codes", "_overall")
# )
block_code_agg = z["BlockCode"].value_counts().reset_index().sort_values("count", ascending=False)
zz = initial_asset_data[initial_asset_data["BlockCode"] == "CHAT3343FM"]
# # Comparison of smaller sample vs overall
# new_asset_data = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 "
# "- Peabody "
# "- Data Extracts for Domna v2.xlsx",
# sheet_name="Properties"
# )
potential_sample = initial_asset_data[
~pd.isnull(initial_asset_data["BlockCode"])
]
# new_sustainability_data = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/2025_11_11 "
# "- Peabody "
# "- Data Extracts for Domna v2.xlsx",
# sheet_name="Sustainability"
# )
compare = potential_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge(
initial_asset_data["Property Type"].value_counts(normalize=True).to_frame().reset_index(),
left_on="Property Type",
right_on="Property Type",
suffixes=("_on_block_codes", "_overall")
)
# sap_bands = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/Parity Data "
# "08012026.xlsx",
# )
# Comparison of smaller sample vs overall
# Load the source extracts.
# The "Properties" and "Sustainability" sheets live in the same workbook, so the
# workbook is read once with both sheet names (read_excel then returns a dict
# keyed by sheet name) instead of being opened and parsed once per sheet.
final_sal_dir = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/"
    "Nov 2025 Consulting Project/Final SAL"
)
data_extract_sheets = pd.read_excel(
    final_sal_dir + "/2025_11_11 - Peabody - Data Extracts for Domna v2.xlsx",
    sheet_name=["Properties", "Sustainability"],
)
new_asset_data = data_extract_sheets["Properties"]
# combined = new_asset_data.merge(
# new_sustainability_data,
# left_on="UPRN",
# right_on="Org Ref",
# suffixes=("_asset", "_sustainability")
# ).merge(
# sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]], how="left", left_on="Org Ref", right_on="OrgRef"
# )
# reduced_sample = combined[
# ~combined["AH Tenure"].isin(
# ["Commercial",
# "Freeholder",
# "HOMEBUY / EQUITY LOAN",
# "Leaseholder",
# "Outright Sale",
# "SHARED EQUITY",
# "Shared Ownership"]
# )
# ].copy()
new_sustainability_data = data_extract_sheets["Sustainability"]
# # property types
# property_type_comparison = reduced_sample["Property Type"].value_counts(normalize=True).to_frame().reset_index().merge(
# combined["Property Type"].value_counts(normalize=True).to_frame().reset_index(),
# left_on="Property Type",
# right_on="Property Type",
# suffixes=("_reduced_sample", "_overall")
# )
# Separate workbook; no sheet_name is given, so the default (first) sheet is read.
sap_bands = pd.read_excel(final_sal_dir + "/Parity Data 08012026.xlsx")
# # lodged ratings
# lodged_epc_band_comparison = reduced_sample["Lodged EPC Band"].value_counts(
# normalize=True).to_frame().reset_index().merge(
# combined["Lodged EPC Band"].value_counts(normalize=True).to_frame().reset_index(),
# left_on="Lodged EPC Band",
# right_on="Lodged EPC Band",
# suffixes=("_reduced_sample", "_overall")
# )
# Join the asset extract to the sustainability extract (UPRN against "Org Ref"),
# then left-join the SAP / lodged EPC band columns from sap_bands on the org ref.
asset_with_sustainability = new_asset_data.merge(
    new_sustainability_data,
    left_on="UPRN",
    right_on="Org Ref",
    suffixes=("_asset", "_sustainability"),
)
band_columns = sap_bands[["OrgRef", "SAP Band", "Lodged EPC Band"]]
combined = asset_with_sustainability.merge(
    band_columns,
    how="left",
    left_on="Org Ref",
    right_on="OrgRef",
)

# Rows with any of these "AH Tenure" values are dropped from the reduced sample.
excluded_tenures = [
    "Commercial",
    "Freeholder",
    "HOMEBUY / EQUITY LOAN",
    "Leaseholder",
    "Outright Sale",
    "SHARED EQUITY",
    "Shared Ownership",
]
is_excluded_tenure = combined["AH Tenure"].isin(excluded_tenures)
# .copy() so reduced_sample is an independent frame rather than a slice of combined.
reduced_sample = combined[~is_excluded_tenure].copy()
# # modelled ratings
# modelled_epc_band_comparison = reduced_sample["SAP Band"].value_counts(
# normalize=True).to_frame().reset_index().merge(
# combined["SAP Band"].value_counts(normalize=True).to_frame().reset_index(),
# left_on="SAP Band",
# right_on="SAP Band",
# suffixes=("_reduced_sample", "_overall")
# )
def _share_comparison(sample: pd.DataFrame, overall: pd.DataFrame, column: str) -> pd.DataFrame:
    """Compare the distribution of ``column`` in ``sample`` against ``overall``.

    Each frame's ``column`` is reduced to its normalised value counts, and the
    two results are inner-joined on ``column``, so every value present in both
    frames gets one row carrying its share in each.

    Args:
        sample: Frame holding the subset being examined.
        overall: Frame holding the full population.
        column: Name of the column whose distribution is compared.

    Returns:
        A frame with ``column`` plus the share column from each side, suffixed
        ``_reduced_sample`` and ``_overall`` respectively.
    """
    sample_share = sample[column].value_counts(normalize=True).to_frame().reset_index()
    overall_share = overall[column].value_counts(normalize=True).to_frame().reset_index()
    return sample_share.merge(
        overall_share,
        on=column,
        suffixes=("_reduced_sample", "_overall"),
    )


# property types
property_type_comparison = _share_comparison(reduced_sample, combined, "Property Type")
# # Testing measures
# m1 = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no "
# "solid floor, ashp 3.0 - 20250113 final.xlsx"
# )
# m2 = pd.read_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no "
# "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx"
# )
# lodged ratings
lodged_epc_band_comparison = _share_comparison(reduced_sample, combined, "Lodged EPC Band")
# compare = m1.merge(
# m2,
# left_on="uprn",
# right_on="uprn",
# suffixes=("_ewi_iwi", "_no_ewi_iwi")
# )
# modelled ratings
modelled_epc_band_comparison = _share_comparison(reduced_sample, combined, "SAP Band")
# # Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario
# only_no_ewi_iwi = compare[
# (compare["total_retrofit_cost_ewi_iwi"] == 0) &
# (compare["total_retrofit_cost_no_ewi_iwi"] != 0)
# ]
# Testing measures: load the exported results of the two scenarios to compare.
m1_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no "
    "solid floor, ashp 3.0 - 20250113 final.xlsx"
)
m2_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting Project/Final SAL/EPC C - no "
    "solid floor, no EWI or IWI, ashp 3.0 - 20250113 final.xlsx"
)
m1 = pd.read_excel(m1_path)
m2 = pd.read_excel(m2_path)
# (m1["total_retrofit_cost"] > 0).sum()
# (m2["total_retrofit_cost"] > 0).sum()
# Pair the two scenario exports row by row on "uprn"; overlapping columns from
# m1 get the "_ewi_iwi" suffix and those from m2 get "_no_ewi_iwi".
# Note: this rebinds the module-level name `compare`.
compare = m1.merge(
    m2,
    on="uprn",
    suffixes=("_ewi_iwi", "_no_ewi_iwi"),
)
# with_ewi_projects = compare[compare["total_retrofit_cost_no_ewi_iwi"] > 0]
cost_ewi_iwi = compare["total_retrofit_cost_ewi_iwi"]
cost_no_ewi_iwi = compare["total_retrofit_cost_no_ewi_iwi"]

# Which properties get done under the no EWI/IWI scenario that do not under the EWI/IWI scenario
only_no_ewi_iwi = compare[cost_ewi_iwi.eq(0) & cost_no_ewi_iwi.ne(0)]

# Number of rows with a positive retrofit cost in each export. The results are
# not stored; these lines only show a value when run interactively.
m1["total_retrofit_cost"].gt(0).sum()
m2["total_retrofit_cost"].gt(0).sum()

# NOTE(review): despite the name, this selects rows with a positive cost in the
# "_no_ewi_iwi" column - confirm that is the intended scenario.
with_ewi_projects = compare[cost_no_ewi_iwi > 0]
# Of those, the rows whose "_ewi_iwi" cost is missing.
z = with_ewi_projects[with_ewi_projects["total_retrofit_cost_ewi_iwi"].isna()]
# z = with_ewi_projects[pd.isnull(with_ewi_projects["total_retrofit_cost_ewi_iwi"])]

View file

@ -1,4 +1,4 @@
pydantic==2.9.2
pydantic>=1.10.7
pydantic-settings==2.6.0
epc-api-python==1.0.2
numpy==2.1.2

View file

@ -1,4 +1,4 @@
[pytest]
pythonpath = .
addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests
testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests

View file

@ -15,16 +15,12 @@ from sqlalchemy import func
# PORTFOLIO_ID = 206
# SCENARIOS = [389]
PORTFOLIO_ID = 435 # Peabody
PORTFOLIO_ID = 485 # Peabody
SCENARIOS = [
908,
909,
910,
970,
]
scenario_names = {
908: "EPC C - no solid floor, ashp 3.0",
909: "EPC C - no solid floor, no EWI or IWI, ashp 3.0",
910: "EPC B - no solid floor, no EWI, ashp 3.0"
970: "EPC C - No solid floor, EQI, IWI",
}
@ -295,6 +291,11 @@ for scenario_id in SCENARIOS:
df[df["predicted_post_works_sap"] == ""]
# Create excel to store to
<<<<<<< HEAD
filename = (f"{scenario_names[scenario_id]} - 20250113 final.xlsx")
with pd.ExcelWriter(filename) as writer:
df.to_excel(writer, sheet_name="properties", index=False)
=======
filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
f"Project/Final SAL/scenarios/{scenario_names[scenario_id]} - 20250114 final.xlsx")
with pd.ExcelWriter(filename) as writer:
@ -475,3 +476,4 @@ dupes = plans_df2[plans_df2["property_id"].duplicated()]
example = example.merge(
plans_df, how="left",
)
>>>>>>> 3874da6177cbcc37f7a488bec0a06e387906653c