# Model/backend/postcode_splitter/main.py
# 2026-01-23 16:39:16 +00:00
#
# 81 lines
# 1.9 KiB
# Python

import pandas as pd
import requests
def sanitise_postcode(postcode: str) -> str | None:
"""
Normalise postcode for grouping.
- Uppercase
- Remove all whitespace
"""
if pd.isna(postcode):
return None
return postcode.upper().replace(" ", "")
def is_valid_postcode(postcode_clean: str) -> bool:
    """
    Validate a postcode using postcodes.io.

    Expects a sanitised postcode (e.g. E84SQ).

    Args:
        postcode_clean: Sanitised postcode; empty or None is treated as
            invalid without making a request.

    Returns:
        True only when the service answers successfully and its "result"
        field is True. Any request failure (network error, timeout, HTTP
        error status) or unparseable/unexpected response body yields False,
        so a False result does not strictly prove the postcode is invalid.
    """
    from urllib.parse import quote

    POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate"

    if not postcode_clean:
        return False

    # Percent-encode the value so stray characters ("/", "?", "#", ...) in the
    # data cannot change which URL path is requested.
    url = POSTCODES_IO_VALIDATE_URL.format(
        postcode=quote(str(postcode_clean), safe=""),
    )

    try:
        resp = requests.get(url, timeout=5)
        resp.raise_for_status()
        payload = resp.json()
    except (requests.RequestException, ValueError):
        # Network issues, rate limits, etc. ValueError covers a non-JSON body
        # on requests versions whose JSON error is not a RequestException.
        return False

    # Guard against a JSON body that is not an object, and make sure the
    # function really returns a bool rather than whatever "result" holds.
    if not isinstance(payload, dict):
        return False
    return payload.get("result") is True
def main(path: str = "hackney.xlsx") -> None:
    """
    Load an address spreadsheet, validate its postcodes and split it by postcode.

    Steps:
    1. Read the workbook and add a ``postcode_clean`` column.
    2. Validate each distinct postcode once (saves API calls) and add a
       boolean ``postcode_valid`` column.
    3. Print the number of rows per valid postcode, most frequent first.
    4. Print the rows belonging to each postcode, in sorted postcode order.

    Args:
        path: Excel file to read. It must contain a "Postcode" column.
    """
    df = pd.read_excel(path)

    # Sanitise postcodes
    df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode)

    # --- validate AFTER de-duplicating (save API calls) ---
    # Get unique, non-null postcodes
    unique_postcodes = df["postcode_clean"].dropna().unique()

    # Validate each postcode once
    postcode_validity = {pc: is_valid_postcode(pc) for pc in unique_postcodes}

    # Map validity back onto the dataframe. isin() always yields a plain
    # boolean column: rows with a missing postcode become False instead of
    # NaN, which would make the boolean mask below raise.
    valid_postcodes = {pc for pc, ok in postcode_validity.items() if ok}
    df["postcode_valid"] = df["postcode_clean"].isin(valid_postcodes)

    # Group only valid postcodes
    grouped = df[df["postcode_valid"]].groupby("postcode_clean")

    # Example: count addresses per postcode
    postcode_counts = grouped.size().sort_values(ascending=False)
    print(postcode_counts)

    # Split the full frame by postcode. groupby() visits the non-null keys in
    # sorted order in a single pass, rather than re-filtering the whole frame
    # once per postcode.
    for pc, pc_df in df.groupby("postcode_clean"):
        print(f"--- {pc} ({len(pc_df)} rows) ---")
        print(pc_df)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()