mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
It's working the way Khalim wanted: with the postcode first, and then a search within that
This commit is contained in:
parent
786e310f7c
commit
d5c9fd9390
8 changed files with 319 additions and 53 deletions
|
|
@ -34,10 +34,11 @@ RUN useradd -m -s /usr/bin/bash ${USER} \
|
|||
ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
ADD backend/engine/requirements.txt requirements1.txt
|
||||
ADD backend/app/requirements/requirements.txt requirements2.txt
|
||||
ADD .devcontainer/requirements.txt requirements3.txt
|
||||
RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
|
||||
RUN pip install -r requirements.txt
|
||||
# ADD .devcontainer/requirements.txt requirements3.txt
|
||||
# RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
|
||||
RUN cat requirements1.txt requirements2.txt > requirements.txt
|
||||
|
||||
RUN pip install -r requirements.txt
|
||||
# 5) Workdir
|
||||
WORKDIR /workspaces/model
|
||||
|
||||
|
|
|
|||
|
|
@ -23,11 +23,6 @@
|
|||
"4ops.terraform",
|
||||
"fabiospampinato.vscode-todo-plus",
|
||||
"jgclark.vscode-todo-highlight",
|
||||
<<<<<<< HEAD
|
||||
"corentinartaud.pdfpreview"
|
||||
]
|
||||
}
|
||||
=======
|
||||
"corentinartaud.pdfpreview",
|
||||
"ms-python.vscode-python-envs"
|
||||
]
|
||||
|
|
@ -35,6 +30,5 @@
|
|||
},
|
||||
"containerEnv": {
|
||||
"PYTHONFLAGS": "-Xfrozen_modules=off"
|
||||
>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
# fastapi
|
||||
fastapi==0.115.2
|
||||
sqlalchemy==2.0.36
|
||||
pydantic-settings==2.6.0
|
||||
psycopg2-binary==2.9.10
|
||||
python-jose==3.3.0
|
||||
cryptography==43.0.3
|
||||
|
|
@ -13,8 +12,10 @@ openpyxl==3.1.2
|
|||
# Basic
|
||||
pytz
|
||||
uvicorn[standard]
|
||||
sqlmodel
|
||||
# Testing
|
||||
pytest==9.0.2
|
||||
pytest-cov==7.0.0
|
||||
ipykernel>=6.25,<7
|
||||
ipykernel>=6.25,<7
|
||||
pydantic-settings<2
|
||||
pyyaml>=6.0.1
|
||||
pydantic>=1.10.7,<2
|
||||
|
|
@ -57,21 +57,6 @@ def app():
|
|||
EPC recommendations
|
||||
Property UPRN
|
||||
"""
|
||||
<<<<<<< HEAD
|
||||
data_folder = ("/workspaces/model/asset_list")
|
||||
data_filename = "assets.xlsx"
|
||||
sheet_name = "Sheet1"
|
||||
postcode_column = 'Postcode'
|
||||
address1_column = None
|
||||
address1_method = 'house_number_extraction'
|
||||
fulladdress_column = 'Address'
|
||||
address_cols_to_concat = None
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = None
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = None
|
||||
landlord_built_form = None
|
||||
=======
|
||||
|
||||
data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Warmfront/SCIS")
|
||||
data_filename = "SCIS_Historic_Deemed_Combined_Workings.xlsx"
|
||||
|
|
@ -86,16 +71,11 @@ def app():
|
|||
landlord_os_uprn = None
|
||||
landlord_property_type = "PROPERTY TYPE As per table emailed"
|
||||
landlord_built_form = "PROPERTY TYPE As per table emailed"
|
||||
>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = None
|
||||
landlord_heating_system = None
|
||||
landlord_existing_pv = None
|
||||
<<<<<<< HEAD
|
||||
landlord_property_id = "LLUPRN"
|
||||
=======
|
||||
landlord_property_id = "Row ID"
|
||||
>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
|
||||
landlord_sap = None
|
||||
outcomes_filename = None
|
||||
outcomes_sheetname = None
|
||||
|
|
@ -111,8 +91,6 @@ def app():
|
|||
asset_list_header = 0
|
||||
landlord_block_reference = None
|
||||
|
||||
<<<<<<< HEAD
|
||||
=======
|
||||
# Peabody data for cleaning
|
||||
data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
|
||||
"Project/data_validation")
|
||||
|
|
@ -181,7 +159,6 @@ def app():
|
|||
# ecosurv_landlords = None
|
||||
# asset_list_header = 0
|
||||
# landlord_block_reference = None
|
||||
>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
|
||||
|
||||
# Maps addresses to uprn in problematic cases
|
||||
manual_uprn_map = {}
|
||||
|
|
@ -516,11 +493,7 @@ def app():
|
|||
|
||||
if not asset_list.geographical_areas.empty:
|
||||
asset_list.geographical_areas.to_excel(writer, sheet_name="Geographical Areas", index=False)
|
||||
<<<<<<< HEAD
|
||||
print("done")
|
||||
=======
|
||||
|
||||
# Store dupes
|
||||
if not asset_list.duplicated_addresses.empty:
|
||||
asset_list.duplicated_addresses.to_excel(writer, sheet_name="Duplicate Properties", index=False)
|
||||
>>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
postal
|
||||
pandas
|
||||
usaddress
|
||||
pydantic-settings==2.6.0
|
||||
epc-api-python==1.0.2
|
||||
thefuzz
|
||||
boto3
|
||||
|
|
@ -10,7 +9,5 @@ openai>=1.3.5
|
|||
tiktoken
|
||||
msgpack
|
||||
beautifulsoup4
|
||||
pydantic>=1.10.7
|
||||
typing-extensions>=4.5.0
|
||||
requests>=2.28.2
|
||||
tiktoken
|
||||
|
|
@ -1,18 +1,321 @@
|
|||
from epc_api.client import EpcClient
|
||||
import os
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "")
|
||||
from urllib.parse import urlencode
|
||||
import pandas as pd
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
import re
|
||||
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "")
|
||||
client = EpcClient(auth_token=EPC_AUTH_TOKEN)
|
||||
|
||||
import re
|
||||
from difflib import SequenceMatcher
|
||||
from typing import Set
|
||||
|
||||
search_resp = client.domestic.search(
|
||||
params={
|
||||
"postcode": "b938sy"
|
||||
|
||||
def levenshtein(a: str, b: str) -> float:
    """
    Score how alike two addresses are, in the range [0, 1].

    Despite the name this is not an edit distance. Both inputs are passed
    through ``normalise_address`` and the result is built from:

    - a hard veto: when both addresses contain number tokens and the two
      sets of numbers differ, the score is 0.0;
    - the Jaccard overlap of the whitespace-separated tokens (weight 0.65);
    - the ``SequenceMatcher`` character ratio (weight 0.35).

    The blended value is rounded to 4 decimal places.
    """
    left = normalise_address(a)
    right = normalise_address(b)

    # House/flat numbers, optionally followed by one letter (e.g. "28a").
    number_pattern = r"\d+[a-z]?"
    left_numbers = set(re.findall(number_pattern, left))
    right_numbers = set(re.findall(number_pattern, right))
    if left_numbers and right_numbers and left_numbers != right_numbers:
        # Both sides name numbers and they disagree: treat as no match.
        return 0.0

    # Order-independent word overlap.
    left_tokens = set(left.split())
    right_tokens = set(right.split())
    if left_tokens and right_tokens:
        shared = left_tokens & right_tokens
        combined = left_tokens | right_tokens
        jaccard = len(shared) / len(combined)
    else:
        jaccard = 0.0

    # Character-level similarity as the softer signal.
    ratio = SequenceMatcher(None, left, right).ratio()

    return round(0.65 * jaccard + 0.35 * ratio, 4)
|
||||
|
||||
|
||||
# Abbreviation -> canonical token, built once at import time instead of on
# every call. Keys carry no trailing "." because punctuation is replaced by
# spaces before the lookup happens, so a dotted key could never match.
_ADDRESS_SYNONYMS = {
    # street types
    "rd": "road",
    "st": "street",
    "ave": "avenue",
    "ln": "lane",
    "cres": "crescent",
    "ct": "court",
    "dr": "drive",
    # flats / units
    "apt": "flat",
    "apartment": "flat",
    "unit": "flat",
    "ste": "suite",
    # numbering noise (an empty replacement drops the token)
    "no": "",
}


def normalise_address(s: str) -> str:
    """
    Canonical UK-focused address normalisation.

    Steps, in order:

    1. lowercase;
    2. replace every character that is not a word character, whitespace or
       ``/`` with a space (``/`` is kept for flat notation such as ``2/1``);
    3. collapse runs of whitespace and trim;
    4. map each token through the synonym table, dropping tokens whose
       replacement is empty.

    Args:
        s: The raw address. Empty strings and non-string values (for example
            ``None`` or the float NaN pandas uses for missing cells) are
            treated as a missing address.

    Returns:
        The normalised address, or ``""`` when there is nothing to normalise.
    """
    # The isinstance check comes first so objects that cannot be truth-tested
    # never reach ``not s``.
    if not isinstance(s, str) or not s:
        return ""

    lowered = s.lower()
    without_punctuation = re.sub(r"[^\w\s/]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", without_punctuation).strip()

    tokens = []
    for tok in collapsed.split():
        replacement = _ADDRESS_SYNONYMS.get(tok, tok)
        if replacement:
            tokens.append(replacement)

    return " ".join(tokens)
|
||||
|
||||
|
||||
def score_addresses(
    df: pd.DataFrame,
    user_address: str,
    column: str = "address",
) -> pd.Series:
    """
    Score every address in ``df[column]`` against ``user_address``.

    Each value of the column is compared with ``user_address`` using
    ``levenshtein``; the scores are returned as a Series aligned with ``df``.

    Raises:
        ValueError: if ``column`` is not a column of ``df``.
    """
    if column in df.columns:
        candidates = df[column]
        return candidates.apply(
            lambda candidate: levenshtein(user_address, candidate)
        )

    raise ValueError(f"Missing column: {column}")
|
||||
|
||||
def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
    """
    Fetch EPC search results for a postcode as a DataFrame.

    The search endpoint is called with ``size`` as the page size. If the
    number of rows returned equals ``size`` the result may be truncated, so
    the call is repeated recursively with the size doubled, up to
    ``max_attempts`` attempts in total. When the last attempt still hits the
    limit, a warning is printed and the (possibly truncated) rows are
    returned.

    Args:
        postcode: Postcode to search for, passed to the API as given.
        size: Requested page size; a falsy value omits the ``size`` query
            parameter.
        attempt: 1-based number of the current attempt (used by the retries).
        max_attempts: Maximum number of attempts before giving up on growing
            the page size.

    Returns:
        A DataFrame built from the response's ``rows`` and ``column-names``,
        or an empty DataFrame when the response itself is empty.
    """
    # Build the URL with string handling: os.path.join is meant for
    # filesystem paths and would insert "\" as the separator on Windows.
    url = f"{client.domestic.host.rstrip('/')}/search"

    if size:
        url += "?" + urlencode({"size": size})

    search_resp = client.domestic.call(
        url=url,
        method="get",
        params={"postcode": postcode},
    )

    # NOTE(review): assumes a falsy response (None / "" / {}) means the
    # postcode has no certificates - confirm against the client. Without this
    # guard such a response would fail on the subscripting below.
    if not search_resp:
        return pd.DataFrame()

    results_df = pd.DataFrame(
        search_resp["rows"],
        columns=search_resp["column-names"],
    )

    # Fewer (or more) rows than requested: nothing was cut off.
    if len(results_df) != size:
        return results_df

    # Exactly `size` rows came back, so there *may* be more results.
    print(
        f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. "
        f"Attempt {attempt}/{max_attempts}."
    )

    if attempt >= max_attempts:
        print(
            "🚨 Max attempts reached. Results may be truncated. "
            "(Please do a manual review by the tech team.)"
        )
        return results_df

    print(f"🔁 Retrying with size={size * 2}")
    return get_epc_data_with_postcode(
        postcode=postcode,
        size=size * 2,
        attempt=attempt + 1,
        max_attempts=max_attempts,
    )
|
||||
|
||||
|
||||
def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
    """
    Check that every non-null UPRN in ``df[column]`` equals ``uprn``.

    Values on both sides are compared as text after the same clean-up:
    converted to ``str``, surrounding whitespace removed, and a trailing
    ``.0`` (left behind when an integer id was stored as a float) dropped.
    The same ``.0`` clean-up is applied by ``get_uprn_candidates``.

    Args:
        df: Frame holding the UPRNs to check.
        uprn: The UPRN every row is expected to carry.
        column: Name of the UPRN column.

    Returns:
        True when the column exists, has at least one non-null value, and all
        non-null values reduce to the single value ``uprn``. False otherwise.
    """
    if column not in df.columns:
        return False

    float_suffix = r"\.0$"

    # The reference gets the same normalisation as the column; otherwise
    # " 123" or 123.0 would never equal a cleaned "123".
    wanted = re.sub(float_suffix, "", str(uprn).strip())

    seen = (
        df[column]
        .dropna()
        .astype(str)
        .str.strip()
        .str.replace(float_suffix, "", regex=True)
        .unique()
    )

    # No values at all, or more than one distinct value: not a single UPRN.
    if len(seen) != 1:
        return False

    return bool(seen[0] == wanted)
|
||||
|
||||
|
||||
def get_uprn_candidates(
    df: pd.DataFrame,
    user_address: str,
    address_column: str = "address",
    uprn_column: str = "uprn",
) -> pd.DataFrame:
    """
    Rank EPC rows by how closely their address matches ``user_address``.

    Works on a copy of ``df`` and adds two columns:

    - ``lexiscore``: the ``levenshtein`` similarity between the normalised
      user address and the row's address;
    - ``lexirank``: dense rank of that score, 1 being the best match (ties
      share a rank).

    The UPRN column is converted to text with any trailing ``.0`` removed.
    The result is sorted best match first. No UPRN is chosen here.

    Raises:
        ValueError: if ``address_column`` or ``uprn_column`` is missing.
    """
    for required in (address_column, uprn_column):
        if required not in df.columns:
            raise ValueError(f"Missing column: {required}")

    query = normalise_address(user_address)
    ranked = df.copy()

    ranked["lexiscore"] = ranked[address_column].apply(
        lambda candidate: levenshtein(query, candidate)
    )

    # UPRNs as plain text; a float-rendered id such as "123.0" becomes "123".
    uprn_text = ranked[uprn_column].astype(str)
    ranked[uprn_column] = uprn_text.str.replace(r"\.0$", "", regex=True)

    # Highest score gets rank 1.
    dense_rank = ranked["lexiscore"].rank(method="dense", ascending=False)
    ranked["lexirank"] = dense_rank.astype(int)

    return ranked.sort_values(
        ["lexirank", "lexiscore"],
        ascending=[True, False],
    )
|
||||
|
||||
|
||||
def get_uprn(user_inputed_address: str, postcode: str):
    """
    Resolve a user-typed address within a postcode to a single UPRN.

    The EPC rows for ``postcode`` are fetched and ranked against
    ``user_inputed_address``. The UPRN of the best-ranked row is returned
    only when every row tied for first place agrees on it.

    Returns:
        The UPRN taken from the top-ranked row, or ``False`` when the
        postcode has no rows, the best score is not positive, or the
        top-ranked rows do not agree on one UPRN.
    """
    epc_rows = get_epc_data_with_postcode(postcode=postcode)
    if epc_rows.empty:
        return False

    ranked = get_uprn_candidates(
        epc_rows,
        user_address=user_inputed_address,
    )

    # Rows are sorted best first, so the first row carries the top score.
    if ranked.iloc[0]["lexiscore"] <= 0:
        return False

    # Everything sharing rank 1 (there may be a draw).
    leaders = ranked[ranked["lexirank"] == 1]
    agreed_uprn = leaders.iloc[0]["uprn"]

    # A draw between different UPRNs is ambiguous, so no answer is given.
    if df_has_single_uprn(leaders, uprn=agreed_uprn):
        return agreed_uprn

    return False
|
||||
|
||||
|
||||
def test(a, b):
    """
    Check that ``a == b`` and raise ``AssertionError`` if it does not hold.

    The error message shows both values and their types. The exception is
    raised explicitly rather than via an ``assert`` statement, because
    ``assert`` is removed when Python runs with ``-O`` and the checks would
    then silently pass.
    """
    if not a == b:
        raise AssertionError(
            f"errr a {a} - {type(a)}, does not equal b {b} - {type(b)}"
        )


# The helper's name matches pytest's default ``test*`` pattern; without this
# flag pytest would collect it and fail looking for fixtures "a" and "b".
test.__test__ = False
|
||||
|
||||
|
||||
def run_all_test():
    """
    Run the ad-hoc checks for the postcode and UPRN lookups.

    Every line calls the real lookup helpers, so the outcome depends on what
    ``get_epc_data_with_postcode`` returns at the time of the run. The
    ``get_uprn_candidates`` calls at the end are only exercised; their
    results are not checked.
    """
    # Same postcode with different spacing/casing; the last spelling is
    # checked twice, as in the original list of checks.
    for spelling in ("b93 8sy", "B938sy", "b93 8Sy", "b93 8Sy"):
        test(get_epc_data_with_postcode(spelling).shape[0], 63)

    # House number alone and house number plus street resolve to one UPRN.
    for typed_address in ("68", "68 Glendon Way"):
        test(get_uprn(typed_address, "b93 8sy"), "100070989938")

    get_uprn_candidates(get_epc_data_with_postcode("b93 8sy"), "68")

    # Different ways of typing a lettered flat/house number.
    for typed_address in (
        "Flat A, 28, Nelgarde Road",
        "28 A",
        "A 28",
        "28A",
        "A28",
    ):
        get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), typed_address)

    get_uprn_candidates(get_epc_data_with_postcode("E8 4SQ"), "6 Aitken Close")  # no epc
|
||||
|
||||
|
||||
# # Example of more than one results for the same address
|
||||
# test(get_uprn("se6 4tf", house_number="flat A 28"), "100023278633")
|
||||
# test(get_uprn("se6 4tf", house_number="28 A"), "100023278633")
|
||||
# test(get_uprn("se6 4tf", house_number="A 28"), "100023278633")
|
||||
|
||||
# test(get_uprn("se6 4tf", house_number="A28"), "100023278633") # this one
|
||||
# test(get_uprn("se6 4tf", house_number="28A"), "100023278633") # investigate this one
|
||||
|
||||
# # Example of flats that have different uprn
|
||||
# test(get_uprn("se6 4tf", house_number="28"), "100023278633")
|
||||
|
||||
# house number nlp, address1
|
||||
|
||||
# get postcode
|
||||
# make input data from Peabody with 3 postcodes so I have a sample of input
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
# fastapi
|
||||
fastapi==0.115.2
|
||||
sqlalchemy==2.0.36
|
||||
pydantic-settings==2.6.0
|
||||
|
|
@ -11,6 +10,4 @@ boto3==1.35.44
|
|||
# Data
|
||||
openpyxl==3.1.2
|
||||
# Basic
|
||||
pytz
|
||||
sqlmodel
|
||||
|
||||
pytz
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
pydantic==2.9.2
|
||||
pydantic>=1.10.7
|
||||
pydantic-settings==2.6.0
|
||||
epc-api-python==1.0.2
|
||||
numpy==2.1.2
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue