From d5c9fd9390f189666fcd6c512d9444570c8bff87 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 22 Jan 2026 15:17:13 +0000 Subject: [PATCH] its working the way khalim wanted wiht postcode and then search that --- .devcontainer/Dockerfile | 7 +- .devcontainer/devcontainer.json | 6 - .devcontainer/requirements.txt | 7 +- asset_list/app.py | 27 -- asset_list/requirements.txt | 3 - backend/address2UPRN/main.py | 315 +++++++++++++++++++++- backend/app/requirements/requirements.txt | 5 +- model_data/requirements/requirements.txt | 2 +- 8 files changed, 319 insertions(+), 53 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index a2fd9b31..ccd056c7 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -34,10 +34,11 @@ RUN useradd -m -s /usr/bin/bash ${USER} \ ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1 ADD backend/engine/requirements.txt requirements1.txt ADD backend/app/requirements/requirements.txt requirements2.txt -ADD .devcontainer/requirements.txt requirements3.txt -RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt -RUN pip install -r requirements.txt +# ADD .devcontainer/requirements.txt requirements3.txt +# RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt +RUN cat requirements1.txt requirements2.txt > requirements.txt +RUN pip install -r requirements.txt # 5) Workdir WORKDIR /workspaces/model diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 80a56bf2..761786cd 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -23,11 +23,6 @@ "4ops.terraform", "fabiospampinato.vscode-todo-plus", "jgclark.vscode-todo-highlight", -<<<<<<< HEAD - "corentinartaud.pdfpreview" - ] - } -======= "corentinartaud.pdfpreview", "ms-python.vscode-python-envs" ] @@ -35,6 +30,5 @@ }, "containerEnv": { "PYTHONFLAGS": "-Xfrozen_modules=off" ->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d } } diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 300b86b0..a6aebdaf 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -1,7 +1,6 @@ # fastapi fastapi==0.115.2 sqlalchemy==2.0.36 -pydantic-settings==2.6.0 psycopg2-binary==2.9.10 python-jose==3.3.0 cryptography==43.0.3 @@ -13,8 +12,10 @@ openpyxl==3.1.2 # Basic pytz uvicorn[standard] -sqlmodel # Testing pytest==9.0.2 pytest-cov==7.0.0 -ipykernel>=6.25,<7 \ No newline at end of file +ipykernel>=6.25,<7 +pydantic-settings<2 +pyyaml>=6.0.1 +pydantic>=1.10.7,<2 \ No newline at end of file diff --git a/asset_list/app.py b/asset_list/app.py index a832784c..d3ca9337 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -57,21 +57,6 @@ def app(): EPC recommendations Property UPRN """ -<<<<<<< HEAD - data_folder = ("/workspaces/model/asset_list") - data_filename = "assets.xlsx" - sheet_name = "Sheet1" - postcode_column = 'Postcode' - address1_column = None - address1_method = 'house_number_extraction' - fulladdress_column = 'Address' - address_cols_to_concat = None - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = None - landlord_built_form = None -======= data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Warmfront/SCIS") data_filename = "SCIS_Historic_Deemed_Combined_Workings.xlsx" @@ -86,16 +71,11 @@ def app(): landlord_os_uprn = None landlord_property_type = "PROPERTY TYPE As per table emailed" landlord_built_form = "PROPERTY TYPE As per table emailed" ->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d landlord_wall_construction = None landlord_roof_construction = None landlord_heating_system = None landlord_existing_pv = None -<<<<<<< HEAD - landlord_property_id = "LLUPRN" -======= landlord_property_id = "Row ID" ->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -111,8 +91,6 @@ def app(): asset_list_header = 0 landlord_block_reference = None -<<<<<<< HEAD -======= # Peabody data for cleaning data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " "Project/data_validation") @@ -181,7 +159,6 @@ def app(): # ecosurv_landlords = None # asset_list_header = 0 # landlord_block_reference = None ->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d # Maps addresses to uprn in problematic cases manual_uprn_map = {} @@ -516,11 +493,7 @@ def app(): if not asset_list.geographical_areas.empty: asset_list.geographical_areas.to_excel(writer, sheet_name="Geographical Areas", index=False) -<<<<<<< HEAD - print("done") -======= # Store dupes if not asset_list.duplicated_addresses.empty: asset_list.duplicated_addresses.to_excel(writer, sheet_name="Duplicate Properties", index=False) ->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt index f9be495b..1fa08aca 100644 --- a/asset_list/requirements.txt +++ b/asset_list/requirements.txt @@ -1,7 +1,6 @@ postal pandas usaddress -pydantic-settings==2.6.0 epc-api-python==1.0.2 thefuzz boto3 @@ -10,7 +9,5 @@ openai>=1.3.5 tiktoken msgpack beautifulsoup4 -pydantic>=1.10.7 typing-extensions>=4.5.0 requests>=2.28.2 -tiktoken \ No newline at end of file diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index fc081fab..1d8a9b68 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -1,18 +1,321 @@ from epc_api.client import EpcClient import os -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "") +from urllib.parse import urlencode +import pandas as pd +from difflib import SequenceMatcher +import re + +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "") client = EpcClient(auth_token=EPC_AUTH_TOKEN) +import re +from difflib import SequenceMatcher +from typing import Set -search_resp = client.domestic.search( - params={ - "postcode": "b938sy" + +def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. + + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + def extract_numbers(s: str) -> Set[str]: + """Extract all numeric tokens (house numbers, flat numbers).""" + return set(re.findall(r"\d+[a-z]?", s)) + + def tokenise(s: str) -> Set[str]: + return set(s.split()) + + a_norm = normalise_address(a) + b_norm = normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and nums_b and nums_a != nums_b: + # Different house/flat numbers → near impossible match + return 0.0 + + # --- token similarity (order-independent) --- + toks_a = tokenise(a_norm) + toks_b = tokenise(b_norm) + + if not toks_a or not toks_b: + token_score = 0.0 + else: + token_score = len(toks_a & toks_b) / len(toks_a | toks_b) + + # --- character similarity (soft signal) --- + char_score = SequenceMatcher(None, a_norm, b_norm).ratio() + + # --- weighted blend --- + return round( + 0.65 * token_score + + 0.35 * char_score, + 4, + ) + + +def normalise_address(s: str) -> str: + """ + Canonical UK-focused address normalisation. + + - Lowercases + - Removes punctuation (keeps / for flats) + - Normalises whitespace + - Applies synonym compression at token level + """ + + if not s: + return "" + + ADDRESS_SYNONYMS = { + # street types + "rd": "road", + "rd.": "road", + "st": "street", + "st.": "street", + "ave": "avenue", + "ave.": "avenue", + "ln": "lane", + "ln.": "lane", + "cres": "crescent", + "ct": "court", + "dr": "drive", + + # flats / units + "apt": "flat", + "apartment": "flat", + "unit": "flat", + "ste": "suite", + + # numbering noise + "no": "", + "no.": "", } -) -print(search_resp) + # 1. lowercase + s = s.lower() + + # 2. remove punctuation except / + s = re.sub(r"[^\w\s/]", " ", s) + + # 3. normalise whitespace + s = re.sub(r"\s+", " ", s).strip() + + # 4. tokenise + synonym normalisation + tokens = [] + for tok in s.split(): + replacement = ADDRESS_SYNONYMS.get(tok, tok) + if replacement: + tokens.append(replacement) + + return " ".join(tokens) +def score_addresses( + df: pd.DataFrame, + user_address: str, + column: str = "address", +) -> pd.Series: + if column not in df.columns: + raise ValueError(f"Missing column: {column}") + + return df[column].apply( + lambda x: levenshtein(user_address, x) + ) + +def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): + """ + Recursively fetch EPC data by postcode. + If results hit the size limit, retry with double size up to max_attempts. + """ + + url = os.path.join(client.domestic.host, "search") + + if size: + url += "?" + urlencode({"size": size}) + + search_resp = client.domestic.call( + url=url, + method="get", + params={"postcode": postcode}, + ) + + results_df = pd.DataFrame( + search_resp["rows"], + columns=search_resp["column-names"] + ) + + row_count = len(results_df) + + # If we hit the size limit, there *may* be more results + if row_count == size: + print( + f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. " + f"Attempt {attempt}/{max_attempts}." + ) + + if attempt < max_attempts: + print(f"🔁 Retrying with size={size * 2}") + return get_epc_data_with_postcode( + postcode=postcode, + size=size * 2, + attempt=attempt + 1, + max_attempts=max_attempts, + ) + else: + print( + "🚨 Max attempts reached. Results may be truncated. " + "(Please do a manual review by the tech team.)" + ) + + return results_df +def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: + """ + Returns True if all non-null UPRNs in df match the given uprn. + Returns False otherwise. + """ + + if column not in df.columns: + return False + + # Drop nulls and normalise to string + uprns = ( + df[column] + .dropna() + .astype(str) + .str.strip() + .unique() + ) + + # No valid UPRNs to compare + if len(uprns) == 0: + return False + + # Exactly one unique UPRN and it matches + return len(uprns) == 1 and uprns[0] == str(uprn) + + +def get_uprn_candidates( + df: pd.DataFrame, + user_address: str, + address_column: str = "address", + uprn_column: str = "uprn", +) -> pd.DataFrame: + """ + Annotate EPC results with lexicographical similarity scores and ranks. + + Returns a DataFrame sorted by descending lexiscore. + DOES NOT choose or return a UPRN. + """ + + if address_column not in df.columns: + raise ValueError(f"Missing column: {address_column}") + + if uprn_column not in df.columns: + raise ValueError(f"Missing column: {uprn_column}") + + out = df.copy() + + user_norm = normalise_address(user_address) + + out["lexiscore"] = out[address_column].apply( + lambda x: levenshtein(user_norm, x) + ) + + # Normalise UPRN to string + out[uprn_column] = ( + out[uprn_column] + .astype(str) + .str.replace(r"\.0$", "", regex=True) + ) + + # Rank: 1 = best match + out["lexirank"] = ( + out["lexiscore"] + .rank(method="dense", ascending=False) + .astype(int) + ) + + return out.sort_values( + ["lexirank", "lexiscore"], + ascending=[True, False], + ) + + +def get_uprn(user_inputed_address: str, postcode: str): + df = get_epc_data_with_postcode(postcode=postcode) + + if df.empty: + return False + + scored_df = get_uprn_candidates( + df, + user_address=user_inputed_address, + ) + + # Best score + best_score = scored_df.iloc[0]["lexiscore"] + + if best_score <= 0: + return False + + # All rank-1 rows (possible draw) + top_rank_df = scored_df[scored_df["lexirank"] == 1] + + # If rank-1 rows do not agree on a single UPRN → ambiguous + if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]): + return False + + # Safe to return the agreed UPRN + return top_rank_df.iloc[0]["uprn"] + + +def test(a,b): + assert a == b, f"errr a {a} - {type(a)}, does not equal b {b} - {type(b)}" + + +def run_all_test(): + # Basic usage with different post codes styles + test(get_epc_data_with_postcode("b93 8sy").shape[0], 63) + test(get_epc_data_with_postcode("B938sy").shape[0], 63) + test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) + test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) + + test(get_uprn("68", "b93 8sy"), "100070989938") + test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") + + get_uprn_candidates(get_epc_data_with_postcode("b93 8sy"), "68") + get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "Flat A, 28, Nelgarde Road") + get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "28 A") + get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "A 28") + get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "28A") + get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "A28") + + get_uprn_candidates(get_epc_data_with_postcode("E8 4SQ"), "6 Aitken Close") # no epc + + + # # Example of more than one results for the same address + # test(get_uprn("se6 4tf", house_number="flat A 28"), "100023278633") + # test(get_uprn("se6 4tf", house_number="28 A"), "100023278633") + # test(get_uprn("se6 4tf", house_number="A 28"), "100023278633") + + # test(get_uprn("se6 4tf", house_number="A28"), "100023278633") # this one + # test(get_uprn("se6 4tf", house_number="28A"), "100023278633") # investigate this one + + # # Example of flats that have different uprn + # test(get_uprn("se6 4tf", house_number="28"), "100023278633") + + # house number nlp, address1 + + # get postcode + # make input data with peabody with 3 postcode so i have sample of iput + diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt index dff7a546..a45738c6 100644 --- a/backend/app/requirements/requirements.txt +++ b/backend/app/requirements/requirements.txt @@ -1,4 +1,3 @@ -# fastapi fastapi==0.115.2 sqlalchemy==2.0.36 pydantic-settings==2.6.0 @@ -11,6 +10,4 @@ boto3==1.35.44 # Data openpyxl==3.1.2 # Basic -pytz -sqlmodel - +pytz \ No newline at end of file diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt index 845166d9..bbf75df5 100644 --- a/model_data/requirements/requirements.txt +++ b/model_data/requirements/requirements.txt @@ -1,4 +1,4 @@ -pydantic==2.9.2 +pydantic>=1.10.7 pydantic-settings==2.6.0 epc-api-python==1.0.2 numpy==2.1.2