Model/backend/postcode_splitter/main.py
2026-02-04 15:07:34 +00:00

127 lines
3.3 KiB
Python

import logging
from urllib.parse import quote

import pandas as pd
import requests
from tqdm import tqdm

from backend.address2UPRN.main import (
    get_epc_data_with_postcode,
    resolve_uprns_for_postcode_group,
)
def sanitise_postcode(postcode: str) -> str | None:
"""
Normalise postcode for grouping.
- Uppercase
- Remove all whitespace
"""
if pd.isna(postcode):
return None
return postcode.upper().replace(" ", "")
def is_valid_postcode(postcode_clean: str) -> bool:
    """
    Validate a postcode using the postcodes.io ``/validate`` endpoint.

    Expects a sanitised postcode (e.g. E84SQ).

    Returns True only when the service responds with ``"result": true``.
    Returns False when:
    - ``postcode_clean`` is empty or None (no request is made);
    - the request fails (network error, timeout, non-2xx status);
    - the response body is not the expected JSON object.

    A False result therefore means "not confirmed valid"; request failures
    are logged as warnings so they can be told apart from real rejections.
    """
    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"
    if not postcode_clean:
        return False

    # Percent-encode the value so characters such as "/", "?" or "#" in
    # dirty input cannot change the path of the request.
    url = POSTCODES_IO_VALIDATE_URL.format(
        postcode=quote(str(postcode_clean), safe="")
    )
    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # RequestException: network issues, timeouts, rate limits, HTTP errors.
        # ValueError: body was not valid JSON.
        logging.getLogger(__name__).warning(
            "Postcode validation request failed for %r: %s", postcode_clean, exc
        )
        return False

    # Always hand back a real bool, whatever shape the payload has.
    return isinstance(payload, dict) and payload.get("result") is True
def _mark_unresolved(group_df, status, error=None):
    """Return a copy of *group_df* flagged with no UPRN and the given status."""
    flagged = group_df.copy()
    flagged["found_uprn"] = None
    flagged["status"] = status
    if error is not None:
        flagged["error"] = error
    return flagged


def main(
    input_path: str = "hackney.xlsx",
    sheet_name: str = "Sustainability",
    max_rows: int | None = 500,
) -> pd.DataFrame:
    """
    Resolve UPRNs for the addresses in a spreadsheet, one postcode at a time.

    Steps:
    1. Read ``sheet_name`` from ``input_path`` and keep the first
       ``max_rows`` rows (all rows when ``max_rows`` is None).
    2. Add a ``postcode_clean`` column via ``sanitise_postcode``.
    3. Validate every distinct postcode once with ``is_valid_postcode`` and
       store the outcome in a boolean ``postcode_valid`` column.
    4. For each valid postcode, fetch EPC data and pass the group to
       ``resolve_uprns_for_postcode_group``. Groups with no EPC rows get
       ``status="no_epc_results"``; groups that raise get
       ``status="exception"`` plus an ``error`` column.

    Rows whose postcode is missing or not confirmed valid are not part of
    the result.

    Returns the concatenation of all per-postcode frames. When no postcode
    validated, returns an empty frame with the input columns.
    """
    log = logging.getLogger(__name__)

    df = pd.read_excel(input_path, sheet_name=sheet_name)
    if max_rows is not None:
        # copy() so the column assignments below act on an independent frame.
        df = df.head(max_rows).copy()

    # Sanitise postcodes
    df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)

    # Validate each distinct, non-null postcode once (saves API calls).
    unique_postcodes = df["postcode_clean"].dropna().unique()
    valid_postcodes = {
        pc
        for pc in tqdm(unique_postcodes, total=len(unique_postcodes))
        if is_valid_postcode(pc)
    }

    # isin() always yields a strict boolean column, with False for rows that
    # have no postcode. Mapping a validity dict would leave NaN on those
    # rows, and a mask containing NaN makes the boolean indexing below raise.
    df["postcode_valid"] = df["postcode_clean"].isin(valid_postcodes)

    results = []
    for postcode, group_df in tqdm(
        df[df["postcode_valid"]].groupby("postcode_clean"),
        desc="Resolving UPRNs by postcode",
    ):
        try:
            epc_df = get_epc_data_with_postcode(postcode)
            if epc_df.empty:
                results.append(_mark_unresolved(group_df, "no_epc_results"))
                continue
            results.append(
                resolve_uprns_for_postcode_group(
                    group_df=group_df,
                    epc_df=epc_df,
                )
            )
        except Exception as e:
            # Deliberate best-effort: one failing postcode must not abort the
            # batch, so the failure is recorded on the rows instead.
            results.append(_mark_unresolved(group_df, "exception", error=str(e)))

    if not results:
        # pd.concat() raises ValueError on an empty list.
        log.warning("No valid postcodes to resolve in %s", input_path)
        return df.head(0)

    final_df = pd.concat(results, ignore_index=True)

    view_columns = [
        "best_match_lexiscore",
        "Address 1",
        "best_match_address",
        "Postcode",
        "UPRN",
        "best_match_uprn",
    ]
    # reindex() tolerates columns that are absent (e.g. when no group was
    # resolved) by filling them with NaN, where plain selection would raise
    # KeyError.
    match_view = final_df.reindex(columns=view_columns)
    # Coerce to numeric so None / non-numeric scores compare as "no match"
    # instead of raising.
    scores = pd.to_numeric(match_view["best_match_lexiscore"], errors="coerce")
    matched_view = match_view[scores > 0]
    log.info(
        "%d of %d rows have a positive best-match score",
        len(matched_view),
        len(match_view),
    )

    return final_df
def handler(event, context):
    """
    Print a greeting and return a fixed HTTP-style 200 response.

    Neither ``event`` nor ``context`` is read.
    """
    print("hello Postcode splitter world")
    response = dict(statusCode=200, body="hello world")
    return response
# Run the batch job only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()