From aaa1bdc077d3131cf09a24b371b097bba7e2733b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 2 Mar 2026 15:15:39 +0000 Subject: [PATCH 01/21] added utils to allow easier subtask management --- .devcontainer/asset_list/Dockerfile | 7 +- .devcontainer/asset_list/devcontainer.json | 7 +- .devcontainer/asset_list/docker-compose.yml | 6 +- .vscode/settings.json | 8 +- asset_list/app.py | 22 +++-- backend/app/utils.py | 4 +- backend/ordanceSurvey/handler/Dockerfile | 0 .../ordanceSurvey/handler/requirements.txt | 0 backend/ordanceSurvey/main.py | 37 ++++++++ backend/utils/subtasks.py | 95 +++++++++++++++++++ utils/s3.py | 3 +- 11 files changed, 169 insertions(+), 20 deletions(-) create mode 100644 backend/ordanceSurvey/handler/Dockerfile create mode 100644 backend/ordanceSurvey/handler/requirements.txt create mode 100644 backend/ordanceSurvey/main.py create mode 100644 backend/utils/subtasks.py diff --git a/.devcontainer/asset_list/Dockerfile b/.devcontainer/asset_list/Dockerfile index 72a5de53..be869637 100644 --- a/.devcontainer/asset_list/Dockerfile +++ b/.devcontainer/asset_list/Dockerfile @@ -21,7 +21,7 @@ RUN git clone --depth 1 https://github.com/openvenues/libpostal /tmp/libpostal \ && rm -rf /tmp/libpostal # 3) Create the user and grant sudo privileges -RUN useradd -m -s /usr/bin/bash ${USER} \ +RUN useradd -m -s /bin/bash ${USER} \ && echo "${USER} ALL=(ALL) NOPASSWD: ALL" >/etc/sudoers.d/${USER} \ && chmod 0440 /etc/sudoers.d/${USER} @@ -32,6 +32,11 @@ ADD asset_list/requirements.txt requirements1.txt RUN cat requirements1.txt requirements2.txt >> requirements.txt RUN pip install -r requirements.txt + +# Install code server +RUN curl -fsSL https://code-server.dev/install.sh | sh + + # 5) Workdir WORKDIR /workspaces/model diff --git a/.devcontainer/asset_list/devcontainer.json b/.devcontainer/asset_list/devcontainer.json index 945dcd88..83e5a276 100644 --- a/.devcontainer/asset_list/devcontainer.json +++ b/.devcontainer/asset_list/devcontainer.json @@ -2,13 +2,14 @@ "name": "SAL ENV", "dockerComposeFile": "docker-compose.yml", "service": "model-sal", - "remoteUser": "vscode", + // "remoteUser": "vscode", "workspaceFolder": "/workspaces/model", - "postStartCommand": "bash .devcontainer/post-install.sh", + "postStartCommand": "bash .devcontainer/asset_list/post-install.sh", "mounts": [ // Optional, just makes getting from Downloads (local env) easier - "source=${localEnv:HOME},target=/workspaces/home,type=bind" + "source=${localEnv:HOME},target=/home/vscode,type=bind" ], + "forwardPorts": [8081], "customizations": { "vscode": { "extensions": [ diff --git a/.devcontainer/asset_list/docker-compose.yml b/.devcontainer/asset_list/docker-compose.yml index 06e4124d..0568393b 100644 --- a/.devcontainer/asset_list/docker-compose.yml +++ b/.devcontainer/asset_list/docker-compose.yml @@ -2,15 +2,17 @@ version: '3.8' services: model-sal: - user: "${UID}:${GID}" build: context: ../.. dockerfile: .devcontainer/asset_list/Dockerfile - command: sleep infinity + command: code-server --bind-addr 0.0.0.0:8080 + user: vscode volumes: - ../../:/workspaces/model networks: - model-net + ports: + - "8081:8080" networks: model-net: diff --git a/.vscode/settings.json b/.vscode/settings.json index b294c736..56299a40 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -16,7 +16,13 @@ "python.languageServer": "Pylance", "python.analysis.typeCheckingMode": "strict", "python.analysis.autoSearchPaths": true, - "python.analysis.extraPaths": ["./src"] + "python.analysis.extraPaths": ["./src"], + + "vim.useCtrlKeys": true, + "vim.handleKeys": { + "": false, + "": false + } // Hot reload setting that needs to be in user settings // "jupyter.runStartupCommands": [ diff --git a/asset_list/app.py b/asset_list/app.py index 3e492118..02f930b4 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -77,21 +77,21 @@ def app(): data_folder = "/workspaces/model/asset_list" data_filename = "assests.xlsx" sheet_name = "Sheet1" - postcode_column = "Postcode" - address1_column = "Address" + postcode_column = "POSTCODE" + address1_column = "ADDRESS" address1_method = "house_number_extraction" fulladdress_column = None - address_cols_to_concat = ["Address"] + address_cols_to_concat = ["ADDRESS"] missing_postcodes_method = None landlord_year_built = None - landlord_os_uprn = "UPRN" - landlord_property_type = "Archetype" - landlord_built_form = "Bedroom Count" - landlord_wall_construction = "Wall Insulation Type" - landlord_roof_construction = "Roof Type" - landlord_heating_system = "Boiler Type" + landlord_os_uprn = None + landlord_property_type = None + landlord_built_form = None + landlord_wall_construction = None + landlord_roof_construction = None + landlord_heating_system = None landlord_existing_pv = None - landlord_property_id = "Tab" + landlord_property_id = "UPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -488,3 +488,5 @@ def app(): asset_list.duplicated_addresses.to_excel( writer, sheet_name="Duplicate Properties", index=False ) + + diff --git a/backend/app/utils.py b/backend/app/utils.py index b3843206..c1ad54f6 100644 --- a/backend/app/utils.py +++ b/backend/app/utils.py @@ -43,7 +43,7 @@ def generate_api_key(): # Define the characters that will be used to generate the api key characters = string.ascii_letters + string.digits # Generate a 40 character long api key - api_key = ''.join(secrets.choice(characters) for _ in range(40)) + api_key = "".join(secrets.choice(characters) for _ in range(40)) return api_key @@ -113,7 +113,7 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key): df.to_parquet(parquet_buffer) # Create the boto3 client - s3 = boto3.resource('s3') + s3 = boto3.resource("s3") # Upload the Parquet file to S3 s3.Object(bucket_name, file_key).put(Body=parquet_buffer.getvalue()) diff --git a/backend/ordanceSurvey/handler/Dockerfile b/backend/ordanceSurvey/handler/Dockerfile new file mode 100644 index 00000000..e69de29b diff --git a/backend/ordanceSurvey/handler/requirements.txt b/backend/ordanceSurvey/handler/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/backend/ordanceSurvey/main.py b/backend/ordanceSurvey/main.py new file mode 100644 index 00000000..bc482d56 --- /dev/null +++ b/backend/ordanceSurvey/main.py @@ -0,0 +1,37 @@ +from typing import Any +import json +from utils.logger import setup_logger +import logging + +from backend.utils.subtasks import subtask_handler + +logger: logging.Logger = setup_logger() +import time + + +@subtask_handler() +def handler(event: dict[str, Any], context: Any, local: bool = False) -> None: + + local = True + # Example SQS message for testing (copy and paste into SQS): + if local is True: + event = { + "Records": [ + { + "body": json.dumps( + { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + } + ) + } + ] + } + + print("sleeping for 30 seconds, subtask should be in progress") + raise RuntimeError("test") + time.sleep(30) + print("subtask should be marked as done") + # ------------------------------ + # YOUR BUSINESS LOGIC HERE + # ------------------------------ diff --git a/backend/utils/subtasks.py b/backend/utils/subtasks.py new file mode 100644 index 00000000..041494e9 --- /dev/null +++ b/backend/utils/subtasks.py @@ -0,0 +1,95 @@ +# decorators/subtask_handler.py + +from functools import wraps +from typing import Callable, Any +from uuid import UUID +import json + +from backend.app.db.functions.tasks.Tasks import SubTaskInterface + + +def subtask_handler(): + """ + Decorator that wraps your existing handler and automatically: + + - Extracts task_id + sub_task_id from event + - Marks subtask as in progress + - Executes handler logic + - Marks subtask complete on success + - Marks failed on exception + """ + + def decorator(func: Callable[..., Any]): + + @wraps(func) + def wrapper(event: dict[str, Any], context: Any, *args, **kwargs): + + records = event.get("Records", [event]) + + interface = SubTaskInterface() + + for record in records: + + # ------------------------------- + # Parse body safely + # ------------------------------- + body = {} + + if isinstance(record.get("body"), str): + try: + body = json.loads(record["body"]) + except Exception: + body = {} + else: + body = record.get("body", {}) or {} + + task_id_raw = body.get("task_id") + subtask_id_raw = body.get("sub_task_id") + + task_id = UUID(task_id_raw) if isinstance(task_id_raw, str) else None + subtask_id = ( + UUID(subtask_id_raw) if isinstance(subtask_id_raw, str) else None + ) + + if not task_id or not subtask_id: + raise RuntimeError("task_id or sub_task_id missing") + + # ------------------------------- + # Mark in progress + # ------------------------------- + interface.update_subtask_status( + subtask_id=subtask_id, + status="in progress", + ) + + try: + # Pass the parsed body into your function + result = func(body, context, *args, **kwargs) + + # ------------------------------- + # Success → mark complete + # ------------------------------- + interface.update_subtask_status( + subtask_id=subtask_id, + status="complete", + outputs={"result": result} if result else None, + ) + + except Exception as e: + + # ------------------------------- + # Failure → mark failed + # ------------------------------- + interface.update_subtask_status( + subtask_id=subtask_id, + status="failed", + outputs={"error": str(e)}, + ) + + raise + + return None + + return wrapper + + return decorator diff --git a/utils/s3.py b/utils/s3.py index b3a96dba..6aa3f44e 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -6,6 +6,7 @@ from io import BytesIO, StringIO from urllib.parse import unquote from utils.logger import setup_logger from botocore.exceptions import NoCredentialsError, PartialCredentialsError +from typing import Any logger = setup_logger() @@ -316,7 +317,7 @@ def save_excel_to_s3(df, bucket_name, file_key): logger.info(f"Excel file saved to S3 bucket '{bucket_name}' with key '{file_key}'") -def read_csv_from_s3(bucket_name, filepath): +def read_csv_from_s3(bucket_name: str, filepath: str) -> list[dict[str, str]]: logger.info( f"Reading CSV file from S3 bucket '{bucket_name}' with key '{filepath}'" ) From 2cd24ae3d012226b9ebb15986a012571d9c2a28e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 2 Mar 2026 17:23:57 +0000 Subject: [PATCH 02/21] todo list added --- backend/ordanceSurvey/main.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/backend/ordanceSurvey/main.py b/backend/ordanceSurvey/main.py index bc482d56..21090f7b 100644 --- a/backend/ordanceSurvey/main.py +++ b/backend/ordanceSurvey/main.py @@ -2,11 +2,9 @@ from typing import Any import json from utils.logger import setup_logger import logging - from backend.utils.subtasks import subtask_handler logger: logging.Logger = setup_logger() -import time @subtask_handler() @@ -22,16 +20,15 @@ def handler(event: dict[str, Any], context: Any, local: bool = False) -> None: { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "s3_uri" } ) } ] } - print("sleeping for 30 seconds, subtask should be in progress") - raise RuntimeError("test") - time.sleep(30) - print("subtask should be marked as done") - # ------------------------------ - # YOUR BUSINESS LOGIC HERE - # ------------------------------ + # Add business logic to do handling + # TODO: Copy s3_uri importing from address2uprn + # TODO: Copy s3_uri logic to read csv from address2uprn and search for ones without UPRN/score is low + # TODO: Copy and do ordant survey logic + # TODO: Save new results to s3 ( ask Khalim if we want to save to db) From db251c185728d4f3f76c008593602368db96572d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 4 Mar 2026 16:14:27 +0000 Subject: [PATCH 03/21] removed duplicate code --- backend/address2UPRN/main.py | 199 +------------------------- backend/app/config.py | 2 + backend/ordanceSurvey/main.py | 113 ++++++++++++--- backend/utils/addressMatch.py | 201 +++++++++++++++++++++++++++ backend/utils/ordnance_survey.py | 44 ++++++ sfr/principal_pitch/2_export_data.py | 14 +- 6 files changed, 356 insertions(+), 217 deletions(-) create mode 100644 backend/utils/addressMatch.py create mode 100644 backend/utils/ordnance_survey.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index af29a095..7d52c562 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -2,12 +2,8 @@ from epc_api.client import EpcClient import os from urllib.parse import urlencode import pandas as pd -from difflib import SequenceMatcher from utils.logger import setup_logger -import re -from typing import Set import json -import requests from uuid import UUID import uuid from backend.app.db.functions.tasks.Tasks import SubTaskInterface @@ -18,6 +14,8 @@ from utils.s3 import ( ) from datetime import datetime +from backend.utils.addressMatch import addressMatch + logger = setup_logger() @@ -29,191 +27,6 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") -def is_valid_postcode(postcode_clean: str) -> bool: - """ - Validate postcode using postcodes.io. - - Expects a sanitised postcode (e.g. E84SQ). - Returns True if valid, False otherwise. - """ - POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" - if not postcode_clean: - return False - - try: - resp = requests.get( - POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), - timeout=5, - ) - resp.raise_for_status() - return resp.json().get("result", False) - except requests.RequestException: - # Network issues, rate limits, etc. - return False - - -def levenshtein(a: str, b: str) -> float: - """ - Address similarity score in [0, 1]. - - Strategy: - - Normalise - - Strongly penalise mismatched house/flat numbers - - Combine token overlap + character similarity - """ - - def extract_number_sequence(s: str) -> list[str]: - return re.findall(r"\d+[a-z]?", s) - - def extract_numbers(s: str) -> Set[str]: - return set(extract_number_sequence(s)) - - def tokenise(s: str) -> Set[str]: - return set(s.split()) - - def extract_building_number(s: str) -> str | None: - """ - Extract the main building number (NOT flat/unit). - Assumes formats like: - - '42 moreton road' - - 'flat 3 42 moreton road' - """ - tokens = s.split() - - # remove flat/unit context - cleaned = [] - skip_next = False - for t in tokens: - if t in ("flat", "apt", "apartment", "unit"): - skip_next = True - continue - if skip_next: - skip_next = False - continue - cleaned.append(t) - - # first remaining number is building number - for t in cleaned: - if re.fullmatch(r"\d+[a-z]?", t): - return t - - return None - - a_norm = normalise_address(a) - b_norm = normalise_address(b) - - # --- hard signal: numbers --- - nums_a = extract_numbers(a_norm) - nums_b = extract_numbers(b_norm) - - if nums_a and not nums_b: - return 0.0 - - # No shared numbers at all → impossible match - if nums_a and nums_b and nums_a.isdisjoint(nums_b): - return 0.0 - - # 🔒 HARD GUARD: building number must match - bld_a = extract_building_number(a_norm) - bld_b = extract_building_number(b_norm) - - if bld_a and bld_b and bld_a != bld_b: - return 0.0 - - # --- order-sensitive flat/building guard --- - seq_a = extract_number_sequence(a_norm) - seq_b = extract_number_sequence(b_norm) - - has_flat_token_user = any( - tok in a_norm for tok in ("flat", "apt", "apartment", "unit") - ) - has_flat_token_epc = "flat" in b_norm - - if ( - len(seq_a) == 2 - and len(seq_b) >= 2 - and has_flat_token_epc - and not has_flat_token_user - and seq_a != seq_b[:2] - ): - return 0.0 - - # --- token similarity (order-independent) --- - toks_a = tokenise(a_norm) - toks_b = tokenise(b_norm) - - if not toks_a or not toks_b: - token_score = 0.0 - else: - token_score = len(toks_a & toks_b) / len(toks_a | toks_b) - - # --- character similarity (soft signal) --- - char_score = SequenceMatcher(None, a_norm, b_norm).ratio() - - # --- weighted blend --- - return round( - 0.65 * token_score + 0.35 * char_score, - 4, - ) - - -def normalise_address(s: str) -> str: - """ - Canonical UK-focused address normalisation. - - - Lowercases - - Removes punctuation (keeps / for flats) - - Normalises whitespace - - Applies synonym compression at token level - """ - - if not s: - return "" - - ADDRESS_SYNONYMS = { - # street types - "rd": "road", - "rd.": "road", - "st": "street", - "st.": "street", - "ave": "avenue", - "ave.": "avenue", - "ln": "lane", - "ln.": "lane", - "cres": "crescent", - "ct": "court", - "dr": "drive", - # flats / units - "apt": "flat", - "apartment": "flat", - "unit": "flat", - "ste": "suite", - # numbering noise - "no": "", - "no.": "", - } - # 1. lowercase - s = s.lower() - - # 1.5 split digit-letter suffixes - s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) - - # 2. remove punctuation except / - s = re.sub(r"[^\w\s/]", " ", s) - - # 3. normalise whitespace - s = re.sub(r"\s+", " ", s).strip() - - # 4. tokenise + synonym normalisation - tokens = [] - for tok in s.split(): - replacement = ADDRESS_SYNONYMS.get(tok, tok) - if replacement: - tokens.append(replacement) - - return " ".join(tokens) - - def score_addresses( df: pd.DataFrame, user_address: str, @@ -222,7 +35,7 @@ def score_addresses( if column not in df.columns: raise ValueError(f"Missing column: {column}") - return df[column].apply(lambda x: levenshtein(user_address, x)) + return df[column].apply(lambda x: addressMatch.score(user_address, x)) def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): @@ -314,9 +127,9 @@ def get_uprn_candidates( out = df.copy() - user_norm = normalise_address(user_address) + user_norm = addressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) + out["lexiscore"] = out[address_column].apply(lambda x: addressMatch.levenshtein(user_norm, x)) # Normalise UPRN to string out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) @@ -653,7 +466,7 @@ def handler(event, context, local=False): ) # Validate postcode before processing - if not is_valid_postcode(postcode): + if not addressMatch.is_valid_postcode(postcode): logger.warning(f"Postcode {postcode} is invalid, skipping") continue diff --git a/backend/app/config.py b/backend/app/config.py index 26fb6b8b..b5b29137 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -63,6 +63,8 @@ class Settings(BaseSettings): # Other S3 buckts ENERGY_ASSESSMENTS_BUCKET: str = "changeme" + ORDNANCE_SURVEY_API_KEY: str = "changeme" + # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None AWS_SECRET_KEY_ID: Optional[str] = None diff --git a/backend/ordanceSurvey/main.py b/backend/ordanceSurvey/main.py index 21090f7b..5961aa16 100644 --- a/backend/ordanceSurvey/main.py +++ b/backend/ordanceSurvey/main.py @@ -3,32 +3,113 @@ import json from utils.logger import setup_logger import logging from backend.utils.subtasks import subtask_handler +from utils.s3 import ( + # save_csv_to_s3, + read_csv_from_s3 as read_csv_from_s3_dict, + parse_s3_uri, +) +from backend.utils.addressMatch import addressMatch +from backend.app.db.connection import get_db_session +from backend.app.db.models.postcode_search import PostcodeSearchModel +from backend.utils.ordnance_survey import ( + lookup_os_places, + os_places_results_to_dataframe, +) +from backend.app.config import get_settings +from sqlalchemy import select + +import pandas as pd logger: logging.Logger = setup_logger() -@subtask_handler() -def handler(event: dict[str, Any], context: Any, local: bool = False) -> None: +def check_if_post_code_exists_in_db_cache(postcode): + with get_db_session() as session: + result = ( + session.execute( + select(PostcodeSearchModel).where( + PostcodeSearchModel.postcode == postcode + ) + ) + .scalars() + .first() + ) + if result: + return os_places_results_to_dataframe(result.result_data) + + # Cache miss — fetch from OS Places API + api_key = get_settings().ORDNANCE_SURVEY_API_KEY + response = lookup_os_places(postcode, api_key) + + if response.get("status") != 200 or "data" not in response: + logger.error(f"OS Places API failed for {postcode}: {response}") + raise RuntimeError( + "A postcode that doesn't exists in ordant survey and check if its real in postcode validator!!! Postcode: {postcode}" + ) + return None + + # Save to cache + new_record = PostcodeSearchModel( + postcode=postcode, + result_data=response["data"], + ) + session.add(new_record) + session.commit() + + return os_places_results_to_dataframe(response["data"]) + + +def get_ordance_survey_record(row, cache=None): + if cache is None: + cache = check_if_post_code_exists_in_db_cache(postcode) + + # process cache with row + + +@subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body +def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: + + # delete this line after test local = True # Example SQS message for testing (copy and paste into SQS): if local is True: - event = { - "Records": [ - { - "body": json.dumps( - { - "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", - "s3_uri" - } - ) - } - ] + body = { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/09cc7368-0850-4145-8b04-ebd84b3263c4/2026-02-18T14:00:13.228611_d2f675c3.csv", } + s3_uri: str = body.get("s3_uri", "") + lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5) + + if s3_uri == "": + raise RuntimeError("Missing s3_uri in message body") + + bucket, key = parse_s3_uri(s3_uri) + + # Assumption designing with address2uprn was ran first + csv_data = read_csv_from_s3_dict(bucket, key) + df = pd.DataFrame(csv_data) + df["domna_lexiscore"] = pd.to_numeric(df["domna_lexiscore"], errors="coerce") + needs_processing = df[ + df["domna_lexiscore"].isna() | (df["domna_lexiscore"] < lexiscore_threshold) + ] + + grouped = needs_processing.groupby("postcode_clean") + + # Process each postcode group at a time + for postcode, group in grouped: + print(f"Processing postcode: {postcode} ({len(group)} rows)") + valid_group = addressMatch.is_valid_postcode(postcode) + if valid_group: + postcode_cache = None + if postcode_cache is None: + postcode_cache = get_ordance_survey_record(postcode) + for index, row in group.iterrows(): + print("do something") + break + # Add business logic to do handling - # TODO: Copy s3_uri importing from address2uprn - # TODO: Copy s3_uri logic to read csv from address2uprn and search for ones without UPRN/score is low # TODO: Copy and do ordant survey logic # TODO: Save new results to s3 ( ask Khalim if we want to save to db) diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py new file mode 100644 index 00000000..b09c1672 --- /dev/null +++ b/backend/utils/addressMatch.py @@ -0,0 +1,201 @@ +import re +from typing import Any, Optional +from difflib import SequenceMatcher +import requests + + +class addressMatch: + def __init__(self): + return None + + @staticmethod + def score(a: str, b: str) -> float: + score: float = addressMatch.levenshtein(a, b) + + return score + + @staticmethod + def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = ( + "https://api.postcodes.io/postcodes/{postcode}/validate" + ) + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + @staticmethod + def normalise_address(s: str) -> str: + """ + Canonical UK-focused address normalisation. + + - Lowercases + - Removes punctuation (keeps / for flats) + - Normalises whitespace + - Applies synonym compression at token level + """ + + if not s: + return "" + + ADDRESS_SYNONYMS = { + # street types + "rd": "road", + "rd.": "road", + "st": "street", + "st.": "street", + "ave": "avenue", + "ave.": "avenue", + "ln": "lane", + "ln.": "lane", + "cres": "crescent", + "ct": "court", + "dr": "drive", + # flats / units + "apt": "flat", + "apartment": "flat", + "unit": "flat", + "ste": "suite", + # numbering noise + "no": "", + "no.": "", + } + # 1. lowercase + s = s.lower() + + # 1.5 split digit-letter suffixes + s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) + + # 2. remove punctuation except / + s = re.sub(r"[^\w\s/]", " ", s) + + # 3. normalise whitespace + s = re.sub(r"\s+", " ", s).strip() + + # 4. tokenise + synonym normalisation + tokens: list[str] = [] + for tok in s.split(): + replacement = ADDRESS_SYNONYMS.get(tok, tok) + if replacement: + tokens.append(replacement) + return " ".join(tokens) + + @staticmethod + def levenshtein(a: str, b: str) -> float: + """ + Address similarity score in [0, 1]. + + Strategy: + - Normalise + - Strongly penalise mismatched house/flat numbers + - Combine token overlap + character similarity + """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + + def extract_numbers(s: str) -> set[str]: + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> set[str]: + return set(s.split()) + + def extract_building_number(s: str) -> Optional[str]: + """ + Extract the main building number (NOT flat/unit). + Assumes formats like: + - '42 moreton road' + - 'flat 3 42 moreton road' + """ + tokens = s.split() + + # remove flat/unit context + cleaned: list[Any] = [] + skip_next = False + for t in tokens: + if t in ("flat", "apt", "apartment", "unit"): + skip_next = True + continue + if skip_next: + skip_next = False + continue + cleaned.append(t) + + # first remaining number is building number + for t in cleaned: + if re.fullmatch(r"\d+[a-z]?", t): + return t + + return None + + a_norm = addressMatch.normalise_address(a) + b_norm = addressMatch.normalise_address(b) + + # --- hard signal: numbers --- + nums_a = extract_numbers(a_norm) + nums_b = extract_numbers(b_norm) + + if nums_a and not nums_b: + return 0.0 + + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): + return 0.0 + + # 🔒 HARD GUARD: building number must match + bld_a = extract_building_number(a_norm) + bld_b = extract_building_number(b_norm) + + if bld_a and bld_b and bld_a != bld_b: + return 0.0 + + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- + toks_a: set[str] = tokenise(a_norm) + toks_b: set[str] = tokenise(b_norm) + + if not toks_a or not toks_b: + token_score = 0.0 + else: + token_score = len(toks_a & toks_b) / len(toks_a | toks_b) + + # --- character similarity (soft signal) --- + char_score: float = SequenceMatcher(None, a_norm, b_norm).ratio() + + # --- weighted blend --- + return round( + 0.65 * token_score + 0.35 * char_score, + 4, + ) diff --git a/backend/utils/ordnance_survey.py b/backend/utils/ordnance_survey.py new file mode 100644 index 00000000..03a0e57b --- /dev/null +++ b/backend/utils/ordnance_survey.py @@ -0,0 +1,44 @@ +import urllib.parse +import requests +import pandas as pd +from utils.logger import setup_logger + +logger = setup_logger() + + +def os_places_results_to_dataframe(data: dict) -> pd.DataFrame: + """ + Flatten the OS Places API response results into a DataFrame. + Each result contains either a DPA or LPI record. + """ + results = data.get("results", []) + rows = [] + for r in results: + if "DPA" in r: + rows.append(r["DPA"]) + elif "LPI" in r: + rows.append(r["LPI"]) + return pd.DataFrame(rows) + + +def lookup_os_places(postcode: str, api_key: str) -> dict: + """ + Lookup a postcode using the OS Places API. + Returns the full API response data or an error dict. + """ + if not api_key: + return {"error": "Ordnance Survey API key not specified", "status": 400} + + encoded_postcode = urllib.parse.quote(postcode) + url = ( + f"https://api.os.uk/search/places/v1/postcode?postcode={encoded_postcode}" + f"&dataset=DPA,LPI&key={api_key}" + ) + + response = requests.get(url) + if response.status_code != 200: + logger.error(f"OS Places API error for postcode {postcode}: {response.status_code}") + return {"error": "Failed to fetch address data", "status": response.status_code} + + data = response.json() + return {"data": data, "status": 200} diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py index 4f430209..b1c3a88a 100644 --- a/sfr/principal_pitch/2_export_data.py +++ b/sfr/principal_pitch/2_export_data.py @@ -28,15 +28,15 @@ from sqlalchemy import func # PORTFOLIO_ID = 206 # SCENARIOS = [389] -PORTFOLIO_ID = 568 -SCENARIOS = [ - 1059, -] +PORTFOLIO_ID = 404 +SCENARIOS = [819, 829, 872] scenario_names = { - 1059: "EPC C - 10k budget", + 819: "EPC C", + 829: "EPC C - no solid floor", + 872: "EPC C - no solid floor, refresh", } -project_name = "manchester" +project_name = "lincs_rural" def get_data(portfolio_id, scenario_ids): @@ -330,8 +330,6 @@ for scenario_id in SCENARIOS: getting_works = df[df["total_retrofit_cost"] > 0] getting_works["predicted_post_works_epc"].value_counts() - 32565 / getting_works.shape[0] - df[df["predicted_post_works_sap"] == ""] # Expected columns list From 1b3a942c308e54a23e5543c739f1acff30da95c1 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 4 Mar 2026 16:58:23 +0000 Subject: [PATCH 04/21] ordance survey logic basically finsihed --- backend/address2UPRN/main.py | 24 ++++++----- backend/ordanceSurvey/main.py | 78 +++++++++++++++++++++++++++-------- 2 files changed, 74 insertions(+), 28 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 7d52c562..53e50617 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -129,7 +129,9 @@ def get_uprn_candidates( user_norm = addressMatch.normalise_address(user_address) - out["lexiscore"] = out[address_column].apply(lambda x: addressMatch.levenshtein(user_norm, x)) + out["lexiscore"] = out[address_column].apply( + lambda x: addressMatch.levenshtein(user_norm, x) + ) # Normalise UPRN to string out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) @@ -346,7 +348,7 @@ def handler(event, context, local=False): { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "6a427b6e-1ece-4983-b1e5-9bffccc53d1d", - "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-16T12:00:20.257856_7b520c0e.csv", + "s3_uri": "s3://retrofit-data-dev/ara_postcode_splitter_batches/e31f2f21-175b-4a91-a3ec-a6baa325e917/8673913b-1a88-42d7-8578-0449123d94b0/2026-02-18T11:47:00.822579_f95467f5.csv", } ) } @@ -507,9 +509,9 @@ def handler(event, context, local=False): results_data.append( { **row, # Include all original data - "uprn": uprn, - "domna_found_address": found_address, - "domna_lexiscore": score, + "address2uprn_uprn": uprn, + "address2uprn_address": found_address, + "address2uprn_lexiscore": score, } ) else: @@ -519,9 +521,9 @@ def handler(event, context, local=False): results_data.append( { **row, # Include all original data - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, } ) @@ -533,9 +535,9 @@ def handler(event, context, local=False): results_data.append( { **row, - "uprn": None, - "domna_found_address": None, - "domna_lexiscore": None, + "address2uprn_uprn": None, + "address2uprn_address": None, + "address2uprn_lexiscore": None, "error": str(e), } ) diff --git a/backend/ordanceSurvey/main.py b/backend/ordanceSurvey/main.py index 5961aa16..4200bd24 100644 --- a/backend/ordanceSurvey/main.py +++ b/backend/ordanceSurvey/main.py @@ -44,10 +44,7 @@ def check_if_post_code_exists_in_db_cache(postcode): if response.get("status") != 200 or "data" not in response: logger.error(f"OS Places API failed for {postcode}: {response}") - raise RuntimeError( - "A postcode that doesn't exists in ordant survey and check if its real in postcode validator!!! Postcode: {postcode}" - ) - return None + return pd.DataFrame() # Save to cache new_record = PostcodeSearchModel( @@ -77,7 +74,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: body = { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", - "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/09cc7368-0850-4145-8b04-ebd84b3263c4/2026-02-18T14:00:13.228611_d2f675c3.csv", + "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv", } s3_uri: str = body.get("s3_uri", "") @@ -91,25 +88,72 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # Assumption designing with address2uprn was ran first csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - df["domna_lexiscore"] = pd.to_numeric(df["domna_lexiscore"], errors="coerce") + df["address2uprn_lexiscore"] = pd.to_numeric( + df["address2uprn_lexiscore"], errors="coerce" + ) needs_processing = df[ - df["domna_lexiscore"].isna() | (df["domna_lexiscore"] < lexiscore_threshold) + df["address2uprn_lexiscore"].isna() + | (df["address2uprn_lexiscore"] < lexiscore_threshold) ] grouped = needs_processing.groupby("postcode_clean") + # Initialise new columns + df["ordnance_survey_address"] = None + df["ordnance_survey_uprn"] = None + df["ordnance_survey_lexiscore"] = None + # Process each postcode group at a time for postcode, group in grouped: print(f"Processing postcode: {postcode} ({len(group)} rows)") valid_group = addressMatch.is_valid_postcode(postcode) - if valid_group: - postcode_cache = None - if postcode_cache is None: - postcode_cache = get_ordance_survey_record(postcode) - for index, row in group.iterrows(): - print("do something") - break + if not valid_group: + logger.warning(f"Postcode {postcode} is invalid, skipping") + for idx in group.index: + df.at[idx, "ordnance_survey_address"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_uprn"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_lexiscore"] = ( + "postcode not found in ordnance survey" + ) + continue - # Add business logic to do handling - # TODO: Copy and do ordant survey logic - # TODO: Save new results to s3 ( ask Khalim if we want to save to db) + postcode_cache = check_if_post_code_exists_in_db_cache(postcode) + if postcode_cache.empty: + logger.warning(f"No OS Places data for {postcode}") + for idx in group.index: + df.at[idx, "ordnance_survey_address"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_uprn"] = ( + "postcode not found in ordnance survey" + ) + df.at[idx, "ordnance_survey_lexiscore"] = ( + "postcode not found in ordnance survey" + ) + continue + + for idx, row in group.iterrows(): + user_address = str(row.get("user_input", "")).strip() + if not user_address: + continue + + # Score against OS Places addresses + scores = postcode_cache["ADDRESS"].apply( + lambda addr: addressMatch.score(user_address, addr) + ) + best_idx = scores.idxmax() + best_score = scores[best_idx] + + df.at[idx, "ordnance_survey_address"] = postcode_cache.at[ + best_idx, "ADDRESS" + ] + df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"] + df.at[idx, "ordnance_survey_lexiscore"] = best_score + + # TODO: Save new results to s3 (ask Khalim if we want to save to db) + df.to_csv("ordnance_survey_results.csv", index=False) + print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)") From 24b19dbf9a9701a80a83cbfd4a03287bd71e7d84 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 4 Mar 2026 17:04:14 +0000 Subject: [PATCH 05/21] spelling mistake --- backend/{ordanceSurvey => ordnanceSurvey}/handler/Dockerfile | 0 .../{ordanceSurvey => ordnanceSurvey}/handler/requirements.txt | 0 backend/{ordanceSurvey => ordnanceSurvey}/main.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename backend/{ordanceSurvey => ordnanceSurvey}/handler/Dockerfile (100%) rename backend/{ordanceSurvey => ordnanceSurvey}/handler/requirements.txt (100%) rename backend/{ordanceSurvey => ordnanceSurvey}/main.py (100%) diff --git a/backend/ordanceSurvey/handler/Dockerfile b/backend/ordnanceSurvey/handler/Dockerfile similarity index 100% rename from backend/ordanceSurvey/handler/Dockerfile rename to backend/ordnanceSurvey/handler/Dockerfile diff --git a/backend/ordanceSurvey/handler/requirements.txt b/backend/ordnanceSurvey/handler/requirements.txt similarity index 100% rename from backend/ordanceSurvey/handler/requirements.txt rename to backend/ordnanceSurvey/handler/requirements.txt diff --git a/backend/ordanceSurvey/main.py b/backend/ordnanceSurvey/main.py similarity index 100% rename from backend/ordanceSurvey/main.py rename to backend/ordnanceSurvey/main.py From eda2fb36c66c5b591236b656c6f6a8507d2f0477 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 4 Mar 2026 17:19:47 +0000 Subject: [PATCH 06/21] get rid of ordancy survey --- backend/OrdnanceSurvey.py | 131 -------------------------------------- 1 file changed, 131 deletions(-) delete mode 100644 backend/OrdnanceSurvey.py diff --git a/backend/OrdnanceSurvey.py b/backend/OrdnanceSurvey.py deleted file mode 100644 index a4d716d0..00000000 --- a/backend/OrdnanceSurvey.py +++ /dev/null @@ -1,131 +0,0 @@ -from functools import lru_cache -import urllib.parse -import requests -from utils.logger import setup_logger - -logger = setup_logger() - - -class OrdnanceSuveyClient: - - def __init__(self, address, postcode, api_key): - """ - This class is tasked with interaction with the ordnance survey API. - :param address: The address for the property to search for - :param postcode: The postcode for the property to search for - """ - - self.address = address - self.postcode = postcode - self.full_address = ", ".join([self.address, self.postcode]) - self.api_key = api_key - - self.results = None - - self.most_relevant_result = None - self.property_type = None - self.built_form = None - # This will be postcode and address, as returned by the ordnance survey - self.address_os = None - self.postcode_os = None - - def set_places_address(self): - """ - Given a response from the places api, this function will set the address and postcode of the property - """ - - if self.most_relevant_result is None: - raise ValueError("No results found - run get_places_api first") - - self.address_os = self.most_relevant_result["ADDRESS"] - - if "POSTCODE" in self.most_relevant_result: - self.postcode_os = self.most_relevant_result["POSTCODE"] - else: - self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"] - # We strip out the postcode from the address as this is already stored separately - self.address_os = self.address_os.replace(self.postcode_os, "").strip() - # Remove trailing comma - self.address_os = self.address_os.rstrip(",").strip() - # Convert to title case - self.address_os = self.address_os.title() - # Make sure postcode is upper case - self.postcode_os = self.postcode_os.upper() - - @lru_cache(maxsize=128) - def get_places_api(self, filter_by_postcode=False): - """ - This method is tasked with getting the places api from the Ordnance Survey. - """ - - if not self.api_key: - raise ValueError("Ordnance Survey API key not specified") - - encoded_address_query = urllib.parse.quote(self.full_address) - - url = ( - f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10" - f"&key={self.api_key}" - ) - - response = requests.get(url) - if response.status_code == 200: - data = response.json() - res = data["results"] - - if filter_by_postcode: - results = [] - for r in res: - if "DPA" in r: - if r["DPA"]["POSTCODE"] == self.postcode: - results.append(r) - elif "LPI" in r: - if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode: - results.append(r) - else: - raise ValueError("Could not find postcode in either DPA or LPI") - else: - results = res - - self.results = results - - # Extract some details about the best match - self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"] - - self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"]) - self.set_places_address() - - else: - logger.info("Could not find any results for the provided address and postcode") - - return {"status": response.status_code} - - def parse_classification_code(self, classification_code: str): - """ - This function will convert the classification code, returned by the OS places api, to a property type that is - compatible with the EPC database. - - The various classifications cane be found here: - https://osdatahub.os.uk/docs/places/technicalSpecification - - Under LPI Output, CLASSIFICATION_CODE is described, and a link is provided to the full table of classifications - For these purposes, we do not need the full classification as this includes non-residential properties. We only - parse the ones of interest to us - :return: - """ - - value_map = { - # In the OS api, "RD" is a "Dwelling" however this is not valid property type in the EPC database - 'RD': {}, - 'RD02': {'property_type': 'House', 'built_form': 'Detached'}, - 'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'}, - 'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'}, - 'RD06': {'property_type': 'Flat'}, - } - # Other classifications can be found in here: - # https://osdatahub.os.uk/docs/places/technicalSpecification in the CLASSIFICATION_CODE description. - # A lookup table csv can be downloaded which contains all of the codes - - mapped = value_map.get(classification_code, {}) - self.property_type = mapped.get("property_type", "") - self.built_form = mapped.get("built_form", "") From 815ce010827bd46a72e22f1936146c7c68d36f2a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 12:51:08 +0000 Subject: [PATCH 07/21] lambda code works locally --- backend/address2UPRN/main.py | 33 +++++++--------- backend/app/db/models/postcode_search.py | 24 ++++++++++++ backend/ordnanceSurvey/handler/Dockerfile | 25 ++++++++++++ .../ordnanceSurvey/handler/requirements.txt | 11 ++++++ .../local_handler/docker-compose.yml | 11 ++++++ .../local_handler/invoke_local_lambda.py | 29 ++++++++++++++ backend/ordnanceSurvey/main.py | 38 +++++++++++++------ 7 files changed, 140 insertions(+), 31 deletions(-) create mode 100644 backend/app/db/models/postcode_search.py create mode 100644 backend/ordnanceSurvey/local_handler/docker-compose.yml create mode 100644 backend/ordnanceSurvey/local_handler/invoke_local_lambda.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 53e50617..ea588a77 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -436,19 +436,6 @@ def handler(event, context, local=False): # Process the rows logger.info(f"Processing {len(df)} rows for task {task_id}") - # Create user_input column by concatenating Address columns if not already present - if "user_input" not in df.columns: - df["user_input"] = ( - df["Address 1"].fillna("") - + " " - + df["Address 2"].fillna("") - + " " - + df["Address 3"].fillna("") - ).str.strip() - logger.info(f"Created user_input column from Address 1 and Address 2") - else: - logger.info(f"user_input column already present in data") - clean_df = df.dropna(subset=["postcode_clean"]) postcode_to_addresses = { @@ -487,23 +474,29 @@ def handler(event, context, local=False): # Process each address in this postcode with the same EPC data for row in postcode_rows: try: - user_input = row.get("user_input", "") - if not user_input: + # Concatenate Address columns directly + address2uprn_user_input = ( + str(row.get("Address 1", "")).strip() + " " + + str(row.get("Address 2", "")).strip() + " " + + str(row.get("Address 3", "")).strip() + ).strip() + + if not address2uprn_user_input: logger.warning( - f"Skipping row with missing user_input for postcode {postcode}" + f"Skipping row with missing address components for postcode {postcode}" ) continue # Get UPRN using the pre-fetched EPC data with all return options result = get_uprn_with_epc_df( - user_inputed_address=user_input, epc_df=epc_df, verbose=True + user_inputed_address=address2uprn_user_input, epc_df=epc_df, verbose=True ) # Parse result tuple if successful if result: uprn, found_address, score = result logger.info( - f"Found UPRN for {user_input} in {postcode}: {uprn} (score: {score})" + f"Found UPRN for {address2uprn_user_input} in {postcode}: {uprn} (score: {score})" ) results_data.append( @@ -516,7 +509,7 @@ def handler(event, context, local=False): ) else: logger.warning( - f"No UPRN found for {user_input} in {postcode}" + f"No UPRN found for {address2uprn_user_input} in {postcode}" ) results_data.append( { @@ -529,7 +522,7 @@ def handler(event, context, local=False): except Exception as e: logger.error( - f"Error processing address {row.get('user_input', 'unknown')}: {e}" + f"Error processing address {row.get('address2uprn_user_input', 'unknown')}: {e}" ) # Still add the row with error markers results_data.append( diff --git a/backend/app/db/models/postcode_search.py b/backend/app/db/models/postcode_search.py new file mode 100644 index 00000000..1e3e03b8 --- /dev/null +++ b/backend/app/db/models/postcode_search.py @@ -0,0 +1,24 @@ +import pytz +import datetime +from sqlalchemy import ( + Column, + BigInteger, + Text, + DateTime, +) +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.declarative import declarative_base + +Base = declarative_base() + + +class PostcodeSearchModel(Base): + __tablename__ = "postcode_search" + + id = Column(BigInteger, primary_key=True, autoincrement=True) + postcode = Column(Text, nullable=False) + result_data = Column(JSONB, nullable=True) + + created_at = Column( + DateTime(timezone=True), nullable=False, default=datetime.datetime.now(pytz.utc) + ) diff --git a/backend/ordnanceSurvey/handler/Dockerfile b/backend/ordnanceSurvey/handler/Dockerfile index e69de29b..6a3cbe26 100644 --- a/backend/ordnanceSurvey/handler/Dockerfile +++ b/backend/ordnanceSurvey/handler/Dockerfile @@ -0,0 +1,25 @@ +FROM public.ecr.aws/lambda/python:3.11 + +ARG DEV_DB_HOST +ARG DEV_DB_PORT +ARG DEV_DB_NAME + +ENV DB_HOST=${DEV_DB_HOST} +ENV DB_PORT=${DEV_DB_PORT} +ENV DB_NAME=${DEV_DB_NAME} + +# Set working directory (Lambda task root) +WORKDIR /var/task + +COPY backend/ordnanceSurvey/handler/requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +# Copy necessary files for database and utility imports +COPY utils/ utils/ +COPY backend/ backend/ +COPY datatypes/ datatypes/ + +# Lambda handler +CMD ["backend/ordnanceSurvey/main.handler"] + diff --git a/backend/ordnanceSurvey/handler/requirements.txt b/backend/ordnanceSurvey/handler/requirements.txt index e69de29b..6ef41b2d 100644 --- a/backend/ordnanceSurvey/handler/requirements.txt +++ b/backend/ordnanceSurvey/handler/requirements.txt @@ -0,0 +1,11 @@ +pandas==2.2.2 +numpy<2.0 +requests +tqdm +openpyxl +epc-api-python==1.0.2 +boto3==1.35.44 +sqlmodel +sqlalchemy==2.0.36 +psycopg2-binary==2.9.10 +pydantic-settings==2.6.0 \ No newline at end of file diff --git a/backend/ordnanceSurvey/local_handler/docker-compose.yml b/backend/ordnanceSurvey/local_handler/docker-compose.yml new file mode 100644 index 00000000..5f54e7da --- /dev/null +++ b/backend/ordnanceSurvey/local_handler/docker-compose.yml @@ -0,0 +1,11 @@ +version: "3.9" + +services: + ordnance-survey-lambda: + build: + context: ../../../ + dockerfile: backend/ordnanceSurvey/handler/Dockerfile + ports: + - "9000:8080" + env_file: + - ../../../.env \ No newline at end of file diff --git a/backend/ordnanceSurvey/local_handler/invoke_local_lambda.py b/backend/ordnanceSurvey/local_handler/invoke_local_lambda.py new file mode 100644 index 00000000..c25f2d20 --- /dev/null +++ b/backend/ordnanceSurvey/local_handler/invoke_local_lambda.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +import json +import requests + +HOST = "localhost" +PORT = "9000" + +LAMBDA_URL = f"http://{HOST}:{PORT}/2015-03-31/functions/function/invocations" + +payload = { + "Records": [ + { + "body": json.dumps( + { + "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", + "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", + "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv", + "lexiscore_column": "address2uprn_lexiscore", + } + ) + } + ] +} + +response = requests.post(LAMBDA_URL, json=payload) + +print("Status code:", response.status_code) +print("Response:") +print(response.text) diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 4200bd24..6c4f3080 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -68,17 +68,19 @@ def get_ordance_survey_record(row, cache=None): def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # delete this line after test - local = True + # local = True # Example SQS message for testing (copy and paste into SQS): if local is True: body = { "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", "sub_task_id": "8673913b-1a88-42d7-8578-0449123d94b0", "s3_uri": "s3://retrofit-data-dev/ara_raw_outputs/e31f2f21-175b-4a91-a3ec-a6baa325e917/6a427b6e-1ece-4983-b1e5-9bffccc53d1d/2026-03-04T16:48:22.339995_634c88fc.csv", + "lexiscore_column": "address2uprn_lexiscore", } s3_uri: str = body.get("s3_uri", "") lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5) + lexiscore_column: str = body.get("lexiscore_column", None) if s3_uri == "": raise RuntimeError("Missing s3_uri in message body") @@ -88,13 +90,17 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # Assumption designing with address2uprn was ran first csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - df["address2uprn_lexiscore"] = pd.to_numeric( - df["address2uprn_lexiscore"], errors="coerce" - ) - needs_processing = df[ - df["address2uprn_lexiscore"].isna() - | (df["address2uprn_lexiscore"] < lexiscore_threshold) - ] + df = df.head(5) + + # If lexiscore_column is specified, use it; otherwise process all rows + if lexiscore_column and lexiscore_column in df.columns: + df[lexiscore_column] = pd.to_numeric(df[lexiscore_column], errors="coerce") + needs_processing = df[ + df[lexiscore_column].isna() | (df[lexiscore_column] < lexiscore_threshold) + ] + else: + # Default: process all rows + needs_processing = df grouped = needs_processing.groupby("postcode_clean") @@ -137,13 +143,21 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: continue for idx, row in group.iterrows(): - user_address = str(row.get("user_input", "")).strip() - if not user_address: + # Concatenate Address columns directly + ordnancy_survey_user_input = ( + str(row.get("Address 1", "")).strip() + + " " + + str(row.get("Address 2", "")).strip() + + " " + + str(row.get("Address 3", "")).strip() + ).strip() + + if not ordnancy_survey_user_input: continue # Score against OS Places addresses scores = postcode_cache["ADDRESS"].apply( - lambda addr: addressMatch.score(user_address, addr) + lambda addr: addressMatch.score(ordnancy_survey_user_input, addr) ) best_idx = scores.idxmax() best_score = scores[best_idx] @@ -157,3 +171,5 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # TODO: Save new results to s3 (ask Khalim if we want to save to db) df.to_csv("ordnance_survey_results.csv", index=False) print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)") + + # TODO upload to s3 once you get confirmation from Khalim or db From 071a67e501bb760692925e7fe30bd584b3708169 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 13:29:25 +0000 Subject: [PATCH 08/21] ordnancesurvey deployment --- .github/workflows/deploy_terraform.yml | 39 +++++++++++++ backend/address2UPRN/main.py | 12 ++-- backend/ordnanceSurvey/main.py | 57 ++++++++++++++++++- .../terraform/lambda/ordnanceSurvey/main.tf | 57 +++++++++++++++++++ .../lambda/ordnanceSurvey/provider.tf | 16 ++++++ .../lambda/ordnanceSurvey/variables.tf | 32 +++++++++++ infrastructure/terraform/shared/main.tf | 34 ++++++++++- 7 files changed, 239 insertions(+), 8 deletions(-) create mode 100644 infrastructure/terraform/lambda/ordnanceSurvey/main.tf create mode 100644 infrastructure/terraform/lambda/ordnanceSurvey/provider.tf create mode 100644 infrastructure/terraform/lambda/ordnanceSurvey/variables.tf diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 4c9ce44a..aac49923 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -242,3 +242,42 @@ jobs: AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + # ============================================================ + # 2️⃣ Build OrdanceSurvey image and Push + # ============================================================ + ordnanceSurvey_image: + needs: [determine_stage, shared_terraform] + uses: ./.github/workflows/_build_image.yml + with: + ecr_repo: ordnance-${{ needs.determine_stage.outputs.stage }} + dockerfile_path: backend/ordnanceSurvey/handler/Dockerfile + build_context: . + build_args: | + DEV_DB_HOST=$DEV_DB_HOST + DEV_DB_PORT=$DEV_DB_PORT + DEV_DB_NAME=$DEV_DB_NAME + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} + DEV_DB_HOST: ${{ secrets.DEV_DB_HOST }} + DEV_DB_PORT: ${{ secrets.DEV_DB_PORT }} + DEV_DB_NAME: ${{ secrets.DEV_DB_NAME }} + + # ============================================================ + # 3️⃣ Deploy OrdanceSurvey Lambda + # ============================================================ + ordnanceSurvey_lambda: + needs: [ordnanceSurvey_image, determine_stage] + uses: ./.github/workflows/_deploy_lambda.yml + with: + lambda_name: ordnanceSurvey + lambda_path: infrastructure/terraform/lambda/ordnanceSurvey + stage: ${{ needs.determine_stage.outputs.stage }} + ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} + image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }} + terraform_apply: ${{ needs.determine_stage.outputs.terraform_apply }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} \ No newline at end of file diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index ea588a77..33cb6ff9 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -476,9 +476,11 @@ def handler(event, context, local=False): try: # Concatenate Address columns directly address2uprn_user_input = ( - str(row.get("Address 1", "")).strip() + " " + - str(row.get("Address 2", "")).strip() + " " + - str(row.get("Address 3", "")).strip() + str(row.get("Address 1", "")).strip() + + " " + + str(row.get("Address 2", "")).strip() + + " " + + str(row.get("Address 3", "")).strip() ).strip() if not address2uprn_user_input: @@ -489,7 +491,9 @@ def handler(event, context, local=False): # Get UPRN using the pre-fetched EPC data with all return options result = get_uprn_with_epc_df( - user_inputed_address=address2uprn_user_input, epc_df=epc_df, verbose=True + user_inputed_address=address2uprn_user_input, + epc_df=epc_df, + verbose=True, ) # Parse result tuple if successful diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 6c4f3080..0a0e2a8a 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -4,7 +4,7 @@ from utils.logger import setup_logger import logging from backend.utils.subtasks import subtask_handler from utils.s3 import ( - # save_csv_to_s3, + save_csv_to_s3, read_csv_from_s3 as read_csv_from_s3_dict, parse_s3_uri, ) @@ -17,6 +17,9 @@ from backend.utils.ordnance_survey import ( ) from backend.app.config import get_settings from sqlalchemy import select +from datetime import datetime +import uuid +import os import pandas as pd @@ -64,6 +67,47 @@ def get_ordance_survey_record(row, cache=None): # process cache with row +def save_results_to_s3( + results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None +) -> bool: + """ + Save results DataFrame to S3 as CSV in a parent folder structure. + + :param results_df: The DataFrame containing results + :param task_id: The task ID (used for file naming) + :param sub_task_id: The subtask ID (used for file naming) + :param bucket_name: The S3 bucket name (defaults to env variable) + :return: True if successful, False otherwise + """ + if bucket_name is None: + bucket_name = os.getenv("S3_BUCKET_NAME") + + if not bucket_name: + logger.error( + "S3 bucket name not provided and S3_BUCKET_NAME environment variable not set" + ) + return False + + try: + # Create a filename with timestamp and UUID + file_name = f"{datetime.now().isoformat()}_{str(uuid.uuid4())[:8]}" + file_key = f"ara_ordnance_survey_outputs/{task_id}/{sub_task_id}/ordnanceSurvey/{file_name}.csv" + + # Save to S3 + success = save_csv_to_s3(results_df, bucket_name, file_key) + + if success: + logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}") + return True + else: + logger.error(f"Failed to save results to S3") + return False + + except Exception as e: + logger.error(f"Error saving results to S3: {str(e)}") + return False + + @subtask_handler() # This assumes task_id and subtask_id is defined in event.Records.body def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: @@ -81,6 +125,8 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: s3_uri: str = body.get("s3_uri", "") lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5) lexiscore_column: str = body.get("lexiscore_column", None) + task_id: str = body.get("task_id", "") + sub_task_id: str = body.get("sub_task_id", "") if s3_uri == "": raise RuntimeError("Missing s3_uri in message body") @@ -168,8 +214,13 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: df.at[idx, "ordnance_survey_uprn"] = postcode_cache.at[best_idx, "UPRN"] df.at[idx, "ordnance_survey_lexiscore"] = best_score - # TODO: Save new results to s3 (ask Khalim if we want to save to db) + # Save results locally df.to_csv("ordnance_survey_results.csv", index=False) print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)") - # TODO upload to s3 once you get confirmation from Khalim or db + # Save results to S3 + if task_id and sub_task_id: + try: + save_results_to_s3(df, task_id, sub_task_id) + except Exception as s3_error: + logger.error(f"Failed to save results to S3: {s3_error}") diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/main.tf b/infrastructure/terraform/lambda/ordnanceSurvey/main.tf new file mode 100644 index 00000000..baa673e1 --- /dev/null +++ b/infrastructure/terraform/lambda/ordnanceSurvey/main.tf @@ -0,0 +1,57 @@ +data "terraform_remote_state" "shared" { + backend = "s3" + config = { + bucket = "assessment-model-terraform-state" + key = "env:/${var.stage}/terraform.tfstate" + region = "eu-west-2" + } +} +data "aws_secretsmanager_secret_version" "db_credentials" { + secret_id = "${var.stage}/assessment_model/db_credentials" +} +locals { + db_credentials = jsondecode(data.aws_secretsmanager_secret_version.db_credentials.secret_string) +} + +module "ordnance" { + source = "../modules/lambda_with_sqs" + + name = ordnanceSurvey #"address2uprn" for example + stage = var.stage + + image_uri = local.image_uri + + timeout = 900 + + # Optional: Set maximum_concurrency to limit concurrent SQS-triggered invocations (2-1000) + maximum_concurrency = var.maximum_concurrency + + environment = merge( + { + STAGE = var.stage + LOG_LEVEL = "info" + DB_USERNAME = local.db_credentials.db_assessment_model_username + DB_PASSWORD = local.db_credentials.db_assessment_model_password + GOOGLE_SOLAR_API_KEY = "test" + SAP_PREDICTIONS_BUCKET = "test" + CARBON_PREDICTIONS_BUCKET = "test" + HEAT_PREDICTIONS_BUCKET = "test" + HEATING_KWH_PREDICTIONS_BUCKET = "test" + HOTWATER_KWH_PREDICTIONS_BUCKET = "test" + API_KEY = "test" + ENVIRONMENT = "test" + SECRET_KEY = "test" + PLAN_TRIGGER_BUCKET = "test" + DATA_BUCKET = "test" + ENGINE_SQS_URL = "test" + ENERGY_ASSESSMENTS_BUCKET = "test" + S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name + }, + ) +} + +# Attach S3 read policy to the Lambda execution role +resource "aws_iam_role_policy_attachment" "ordanceSurvey_read_and_write" { + role = module.ordnance.role_name + policy_arn = data.terraform_remote_state.shared.outputs.ordnance_s3_read_and_write_arn +} diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf b/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf new file mode 100644 index 00000000..37c412ce --- /dev/null +++ b/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 4.16" + } + } + + backend "s3" { + bucket = REPLACE_ME + key = "terraform.tfstate" + region = "eu-west-2" + } + + required_version = ">= 1.2.0" +} \ No newline at end of file diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf b/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf new file mode 100644 index 00000000..e0061321 --- /dev/null +++ b/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf @@ -0,0 +1,32 @@ +variable "lambda_name" { + type = string + description = "Logical name of the lambda (e.g. address2uprn)" +} + +variable "stage" { + description = "Deployment stage (e.g. dev, prod)" + type = string +} +variable "ecr_repo_url" { + type = string + description = "ECR repository URL (no tag, no digest)" +} + +variable "image_digest" { + type = string + description = "Image digest (sha256:...)" +} + +variable "maximum_concurrency" { + type = number + default = null + description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." +} + +locals { + image_uri = "${var.ecr_repo_url}@${var.image_digest}" +} + +output "resolved_image_uri" { + value = local.image_uri +} diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index cca3394f..df519f4f 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -414,4 +414,36 @@ module "categorisation_registry" { source = "../modules/container_registry" name = "categorisation" stage = var.stage -} \ No newline at end of file +} + + +################################################ +# OrdnanceSurveyAPI – Lambda +################################################ +module "ordnance_state_bucket" { + source = "../modules/tf_state_bucket" + bucket_name = "ordnance-terraform-state" + +} + +module "ordnance_registry" { + source = "../modules/container_registry" + name = "ordnance" + stage = var.stage + +} + +# S3 policy for postcode splitter to read from retrofit data bucket +module "ordnance_s3_read_and_write" { + source = "../modules/s3_iam_policy" + + policy_name = "Address2UPRNReadandWriteS3" + policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket" + bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] + actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] + resource_paths = ["/*"] +} + +output "ordnance_s3_read_and_write_arn" { + value = module.ordnance_s3_read_and_write.policy_arn +} From a147c9111191e803473c891328542a5856998ba3 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 13:41:54 +0000 Subject: [PATCH 09/21] get rid of redudant env --- .../terraform/lambda/address2UPRN/main.tf | 13 ------------- .../terraform/lambda/ordnanceSurvey/main.tf | 14 +------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/infrastructure/terraform/lambda/address2UPRN/main.tf b/infrastructure/terraform/lambda/address2UPRN/main.tf index 2d185497..3afc8738 100644 --- a/infrastructure/terraform/lambda/address2UPRN/main.tf +++ b/infrastructure/terraform/lambda/address2UPRN/main.tf @@ -33,19 +33,6 @@ module "address2uprn" { LOG_LEVEL = "info" DB_USERNAME = local.db_credentials.db_assessment_model_username DB_PASSWORD = local.db_credentials.db_assessment_model_password - GOOGLE_SOLAR_API_KEY = "test" - SAP_PREDICTIONS_BUCKET = "test" - CARBON_PREDICTIONS_BUCKET = "test" - HEAT_PREDICTIONS_BUCKET = "test" - HEATING_KWH_PREDICTIONS_BUCKET = "test" - HOTWATER_KWH_PREDICTIONS_BUCKET = "test" - API_KEY = "test" - ENVIRONMENT = "test" - SECRET_KEY = "test" - PLAN_TRIGGER_BUCKET = "test" - DATA_BUCKET = "test" - ENGINE_SQS_URL = "test" - ENERGY_ASSESSMENTS_BUCKET = "test" S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name }, ) diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/main.tf b/infrastructure/terraform/lambda/ordnanceSurvey/main.tf index baa673e1..3af33ad7 100644 --- a/infrastructure/terraform/lambda/ordnanceSurvey/main.tf +++ b/infrastructure/terraform/lambda/ordnanceSurvey/main.tf @@ -32,20 +32,8 @@ module "ordnance" { LOG_LEVEL = "info" DB_USERNAME = local.db_credentials.db_assessment_model_username DB_PASSWORD = local.db_credentials.db_assessment_model_password - GOOGLE_SOLAR_API_KEY = "test" - SAP_PREDICTIONS_BUCKET = "test" - CARBON_PREDICTIONS_BUCKET = "test" - HEAT_PREDICTIONS_BUCKET = "test" - HEATING_KWH_PREDICTIONS_BUCKET = "test" - HOTWATER_KWH_PREDICTIONS_BUCKET = "test" - API_KEY = "test" - ENVIRONMENT = "test" - SECRET_KEY = "test" - PLAN_TRIGGER_BUCKET = "test" - DATA_BUCKET = "test" - ENGINE_SQS_URL = "test" - ENERGY_ASSESSMENTS_BUCKET = "test" S3_BUCKET_NAME = data.terraform_remote_state.shared.outputs.retrofit_sap_data_bucket_name + ORDNANCE_SURVEY_API_KEY:= "Reminder to add This somehow, ask if we are doing aws secret method or github secret method" }, ) } From ac344be09408e890be56ba348e18beb9919a5d1a Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 13:44:57 +0000 Subject: [PATCH 10/21] re add --- OrdnanceSurvey.py | 139 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 OrdnanceSurvey.py diff --git a/OrdnanceSurvey.py b/OrdnanceSurvey.py new file mode 100644 index 00000000..1ae66152 --- /dev/null +++ b/OrdnanceSurvey.py @@ -0,0 +1,139 @@ +from functools import lru_cache +import urllib.parse +import requests +from utils.logger import setup_logger + +logger = setup_logger() + + +class OrdnanceSuveyClient: + + def __init__(self, address, postcode, api_key): + """ + This class is tasked with interaction with the ordnance survey API. + :param address: The address for the property to search for + :param postcode: The postcode for the property to search for + """ + + self.address = address + self.postcode = postcode + self.full_address = ", ".join([self.address, self.postcode]) + self.api_key = api_key + + self.results = None + + self.most_relevant_result = None + self.property_type = None + self.built_form = None + # This will be postcode and address, as returned by the ordnance survey + self.address_os = None + self.postcode_os = None + + def set_places_address(self): + """ + Given a response from the places api, this function will set the address and postcode of the property + """ + + if self.most_relevant_result is None: + raise ValueError("No results found - run get_places_api first") + + self.address_os = self.most_relevant_result["ADDRESS"] + + if "POSTCODE" in self.most_relevant_result: + self.postcode_os = self.most_relevant_result["POSTCODE"] + else: + self.postcode_os = self.most_relevant_result["POSTCODE_LOCATOR"] + # We strip out the postcode from the address as this is already stored separately + self.address_os = self.address_os.replace(self.postcode_os, "").strip() + # Remove trailing comma + self.address_os = self.address_os.rstrip(",").strip() + # Convert to title case + self.address_os = self.address_os.title() + # Make sure postcode is upper case + self.postcode_os = self.postcode_os.upper() + + @lru_cache(maxsize=128) + def get_places_api(self, filter_by_postcode=False): + """ + This method is tasked with getting the places api from the Ordnance Survey. + """ + + if not self.api_key: + raise ValueError("Ordnance Survey API key not specified") + + encoded_address_query = urllib.parse.quote(self.full_address) + + url = ( + f"https://api.os.uk/search/places/v1/find?query={encoded_address_query}&dataset=DPA,LPI&matchprecision=10" + f"&key={self.api_key}" + ) + + response = requests.get(url) + if response.status_code == 200: + data = response.json() + res = data["results"] + + if filter_by_postcode: + results = [] + for r in res: + if "DPA" in r: + if r["DPA"]["POSTCODE"] == self.postcode: + results.append(r) + elif "LPI" in r: + if r["LPI"]["POSTCODE_LOCATOR"] == self.postcode: + results.append(r) + else: + raise ValueError("Could not find postcode in either DPA or LPI") + else: + results = res + + self.results = results + + # Extract some details about the best match + self.most_relevant_result = ( + self.results[0]["DPA"] + if "DPA" in self.results[0] + else self.results[0]["LPI"] + ) + + self.parse_classification_code( + self.most_relevant_result["CLASSIFICATION_CODE"] + ) + self.set_places_address() + + else: + logger.info( + "Could not find any results for the provided address and postcode" + ) + + return {"status": response.status_code} + + def parse_classification_code(self, classification_code: str): + """ + This function will convert the classification code, returned by the OS places api, to a property type that is + compatible with the EPC database. + + The various classifications cane be found here: + https://osdatahub.os.uk/docs/places/technicalSpecification + + Under LPI Output, CLASSIFICATION_CODE is described, and a link is provided to the full table of classifications + For these purposes, we do not need the full classification as this includes non-residential properties. We only + parse the ones of interest to us + :return: + """ + + value_map = { + # In the OS api, "RD" is a "Dwelling" however this is not valid property type in the EPC database + "RD": {}, + "RD02": {"property_type": "House", "built_form": "Detached"}, + "RD03": {"property_type": "House", "built_form": "Semi-Detached"}, + "RD04": {"property_type": "House", "built_form": "Mid-Terrace"}, + "RD06": {"property_type": "Flat"}, + } + # Other classifications can be found in here: + # https://osdatahub.os.uk/docs/places/technicalSpecification in the CLASSIFICATION_CODE description. + # A lookup table csv can be downloaded which contains all of the codes + + mapped = value_map.get(classification_code, {}) + self.property_type = mapped.get("property_type", "") + self.built_form = mapped.get("built_form", "") From 9852eb631bd4c1a2a0c878abca6b63c61be3d534 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 13:49:05 +0000 Subject: [PATCH 11/21] added ordnance state --- infrastructure/terraform/lambda/ordnanceSurvey/provider.tf | 2 +- infrastructure/terraform/shared/main.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf b/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf index 37c412ce..b7f453f1 100644 --- a/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf +++ b/infrastructure/terraform/lambda/ordnanceSurvey/provider.tf @@ -7,7 +7,7 @@ terraform { } backend "s3" { - bucket = REPLACE_ME + bucket = "ordnance-terraform-state" key = "terraform.tfstate" region = "eu-west-2" } diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 7e46e787..905fbc78 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -472,7 +472,7 @@ module "ordnance_registry" { module "ordnance_s3_read_and_write" { source = "../modules/s3_iam_policy" - policy_name = "Address2UPRNReadandWriteS3" + policy_name = "OrdnanceSurveyReadandWriteS3" policy_description = "Allow ordnance Lambda to read and write from retrofit-data bucket" bucket_arns = ["arn:aws:s3:::retrofit-data-${var.stage}"] actions = ["s3:GetObject", "s3:ListBucket", "s3:PutObject"] From 58843a5409209d23abc99d0273a19b4948ca802f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 13:51:47 +0000 Subject: [PATCH 12/21] put batch size as well --- infrastructure/terraform/lambda/ordnanceSurvey/variables.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf b/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf index e0061321..e7646811 100644 --- a/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf +++ b/infrastructure/terraform/lambda/ordnanceSurvey/variables.tf @@ -23,6 +23,11 @@ variable "maximum_concurrency" { description = "Maximum number of concurrent Lambda invocations from SQS (2-1000). null = no limit." } +variable "batch_size" { + type = number + default = 1 +} + locals { image_uri = "${var.ecr_repo_url}@${var.image_digest}" } From 8163a97b97d43fb3cc6d9ca020e82835b67e3d52 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 13:57:33 +0000 Subject: [PATCH 13/21] get rid of df.head and control the inputs instead --- backend/ordnanceSurvey/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 0a0e2a8a..bf8cfdaf 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -136,7 +136,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # Assumption designing with address2uprn was ran first csv_data = read_csv_from_s3_dict(bucket, key) df = pd.DataFrame(csv_data) - df = df.head(5) + # df = df.head(5) # If lexiscore_column is specified, use it; otherwise process all rows if lexiscore_column and lexiscore_column in df.columns: From ef45ed62f8542899b6ea0c61978a9cf2156db097 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 14:53:51 +0000 Subject: [PATCH 14/21] ordance survey init file --- backend/ordnanceSurvey/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 backend/ordnanceSurvey/__init__.py diff --git a/backend/ordnanceSurvey/__init__.py b/backend/ordnanceSurvey/__init__.py new file mode 100644 index 00000000..e69de29b From 19ab0ad7574b076b01d47d516f4cab47695cb79c Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 6 Mar 2026 14:58:49 +0000 Subject: [PATCH 15/21] deleted the wrong folder aded back the origional --- .../OrdnanceSurvey.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) rename OrdnanceSurvey.py => backend/OrdnanceSurvey.py (86%) diff --git a/OrdnanceSurvey.py b/backend/OrdnanceSurvey.py similarity index 86% rename from OrdnanceSurvey.py rename to backend/OrdnanceSurvey.py index 1ae66152..a4d716d0 100644 --- a/OrdnanceSurvey.py +++ b/backend/OrdnanceSurvey.py @@ -90,21 +90,13 @@ class OrdnanceSuveyClient: self.results = results # Extract some details about the best match - self.most_relevant_result = ( - self.results[0]["DPA"] - if "DPA" in self.results[0] - else self.results[0]["LPI"] - ) + self.most_relevant_result = self.results[0]["DPA"] if "DPA" in self.results[0] else self.results[0]["LPI"] - self.parse_classification_code( - self.most_relevant_result["CLASSIFICATION_CODE"] - ) + self.parse_classification_code(self.most_relevant_result["CLASSIFICATION_CODE"]) self.set_places_address() else: - logger.info( - "Could not find any results for the provided address and postcode" - ) + logger.info("Could not find any results for the provided address and postcode") return {"status": response.status_code} @@ -124,11 +116,11 @@ class OrdnanceSuveyClient: value_map = { # In the OS api, "RD" is a "Dwelling" however this is not valid property type in the EPC database - "RD": {}, - "RD02": {"property_type": "House", "built_form": "Detached"}, - "RD03": {"property_type": "House", "built_form": "Semi-Detached"}, - "RD04": {"property_type": "House", "built_form": "Mid-Terrace"}, - "RD06": {"property_type": "Flat"}, + 'RD': {}, + 'RD02': {'property_type': 'House', 'built_form': 'Detached'}, + 'RD03': {'property_type': 'House', 'built_form': 'Semi-Detached'}, + 'RD04': {'property_type': 'House', 'built_form': 'Mid-Terrace'}, + 'RD06': {'property_type': 'Flat'}, } # Other classifications can be found in here: # https://osdatahub.os.uk/docs/places/technicalSpecification in the CLASSIFICATION_CODE description. From 124a34597a5e5a373b757e7b2cc36e6238e6578d Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Mar 2026 12:15:22 +0000 Subject: [PATCH 16/21] save so i can run it from mealcraft --- .devcontainer/backend/post-install.sh | 24 +++++----- backend/app/config.py | 1 + backend/ordnanceSurvey/helpers.py | 66 +++++++++++++++++++++++++++ backend/ordnanceSurvey/main.py | 21 +++------ backend/ordnanceSurvey/types.py | 44 ++++++++++++++++++ backend/utils/ordnance_survey.py | 44 ------------------ 6 files changed, 129 insertions(+), 71 deletions(-) create mode 100644 backend/ordnanceSurvey/helpers.py create mode 100644 backend/ordnanceSurvey/types.py delete mode 100644 backend/utils/ordnance_survey.py diff --git a/.devcontainer/backend/post-install.sh b/.devcontainer/backend/post-install.sh index 48fbfde1..20c699d6 100644 --- a/.devcontainer/backend/post-install.sh +++ b/.devcontainer/backend/post-install.sh @@ -1,14 +1,14 @@ -mkdir -p ~/.ipython/profile_default/startup +# mkdir -p ~/.ipython/profile_default/startup -cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py -from dotenv import load_dotenv -import os +# cat << 'EOF' > ~/.ipython/profile_default/startup/00-load-env.py +# from dotenv import load_dotenv +# import os -# Adjust path as needed -env_path = "/workspaces/model/backend/.env" -if os.path.exists(env_path): - load_dotenv(env_path) - print("✔ Loaded .env into Jupyter kernel") -else: - print("⚠ No .env file found to load") -EOF +# # Adjust path as needed +# env_path = "/workspaces/model/backend/.env" +# if os.path.exists(env_path): +# load_dotenv(env_path) +# print("✔ Loaded .env into Jupyter kernel") +# else: +# print("⚠ No .env file found to load") +# EOF diff --git a/backend/app/config.py b/backend/app/config.py index b5b29137..e87f8374 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -64,6 +64,7 @@ class Settings(BaseSettings): ENERGY_ASSESSMENTS_BUCKET: str = "changeme" ORDNANCE_SURVEY_API_KEY: str = "changeme" + PLAN_TRIGGER_BUCKET: str = "changeme" # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None diff --git a/backend/ordnanceSurvey/helpers.py b/backend/ordnanceSurvey/helpers.py new file mode 100644 index 00000000..d59060b8 --- /dev/null +++ b/backend/ordnanceSurvey/helpers.py @@ -0,0 +1,66 @@ +import urllib.parse +from pydantic import ValidationError +import requests +import pandas as pd +from utils.logger import setup_logger +from backend.ordnanceSurvey.types import PostcodeResponse + +logger = setup_logger() + + +def os_places_results_to_dataframe(data: dict) -> pd.DataFrame: + """ + Flatten the OS Places API response results into a DataFrame. + Each result contains either a DPA or LPI record. + """ + results = data.get("results", []) + rows = [] + for r in results: + if "DPA" in r: + rows.append(r["DPA"]) + elif "LPI" in r: + rows.append(r["LPI"]) + return pd.DataFrame(rows) + + +import urllib.parse +import requests +import logging +from pydantic import ValidationError + +logger = logging.getLogger(__name__) + + +def lookup_os_places(postcode: str, api_key: str) -> PostcodeResponse: + """ + Lookup a postcode using the OS Places API. + Returns a validated PostcodeResponse. + Raises exceptions on failure. + """ + if not api_key: + raise ValueError("Ordnance Survey API key not specified") + + encoded_postcode = urllib.parse.quote(postcode) + + url = ( + f"https://api.os.uk/search/places/v1/postcode?postcode={encoded_postcode}" + f"&dataset=DPA,LPI&key={api_key}" + ) + + response = requests.get(url) + + if response.status_code != 200: + logger.error( + "OS Places API error for postcode %s: %s", + postcode, + response.status_code, + ) + raise RuntimeError(f"OS Places lookup failed for postcode {postcode}") + + try: + raw = response.json() + return PostcodeResponse.model_validate(raw) + + except ValidationError as e: + logger.error("OS Places response validation failed: %s", e) + raise RuntimeError("Invalid response format from OS Places API") from e diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index bf8cfdaf..835910f5 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -1,5 +1,6 @@ from typing import Any import json +from backend.ordnanceSurvey.types import PostcodeResponse from utils.logger import setup_logger import logging from backend.utils.subtasks import subtask_handler @@ -11,7 +12,7 @@ from utils.s3 import ( from backend.utils.addressMatch import addressMatch from backend.app.db.connection import get_db_session from backend.app.db.models.postcode_search import PostcodeSearchModel -from backend.utils.ordnance_survey import ( +from backend.ordnanceSurvey.helpers import ( lookup_os_places, os_places_results_to_dataframe, ) @@ -27,6 +28,7 @@ logger: logging.Logger = setup_logger() def check_if_post_code_exists_in_db_cache(postcode): + postcode = "SE22 9AL" with get_db_session() as session: result = ( @@ -43,16 +45,12 @@ def check_if_post_code_exists_in_db_cache(postcode): # Cache miss — fetch from OS Places API api_key = get_settings().ORDNANCE_SURVEY_API_KEY - response = lookup_os_places(postcode, api_key) - - if response.get("status") != 200 or "data" not in response: - logger.error(f"OS Places API failed for {postcode}: {response}") - return pd.DataFrame() + response: PostcodeResponse = lookup_os_places(postcode, api_key) # Save to cache new_record = PostcodeSearchModel( postcode=postcode, - result_data=response["data"], + result_data=response.results, ) session.add(new_record) session.commit() @@ -60,13 +58,6 @@ def check_if_post_code_exists_in_db_cache(postcode): return os_places_results_to_dataframe(response["data"]) -def get_ordance_survey_record(row, cache=None): - if cache is None: - cache = check_if_post_code_exists_in_db_cache(postcode) - - # process cache with row - - def save_results_to_s3( results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None ) -> bool: @@ -100,7 +91,7 @@ def save_results_to_s3( logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}") return True else: - logger.error(f"Failed to save results to S3") + logger.error(f"Failed to save results to S3 {bucket_name}/{file_key}") return False except Exception as e: diff --git a/backend/ordnanceSurvey/types.py b/backend/ordnanceSurvey/types.py new file mode 100644 index 00000000..0631ff67 --- /dev/null +++ b/backend/ordnanceSurvey/types.py @@ -0,0 +1,44 @@ +from pydantic import BaseModel +from typing import List + + +class OrdnanceSurveyResponse(BaseModel): + RPC: str + UPRN: str + MATCH: int + UDPRN: str + STATUS: str + ADDRESS: str + LANGUAGE: str + POSTCODE: str + POST_TOWN: str + WARD_CODE: str + ENTRY_DATE: str + COUNTRY_CODE: str + X_COORDINATE: int + Y_COORDINATE: int + BUILDING_NAME: str + BLPU_STATE_CODE: str + BLPU_STATE_DATE: str + LAST_UPDATE_DATE: str + MATCH_DESCRIPTION: str + THOROUGHFARE_NAME: str + CLASSIFICATION_CODE: str + LOGICAL_STATUS_CODE: str + POSTAL_ADDRESS_CODE: str + LOCAL_CUSTODIAN_CODE: int + DELIVERY_POINT_SUFFIX: str + TOPOGRAPHY_LAYER_TOID: str + COUNTRY_CODE_DESCRIPTION: str + BLPU_STATE_CODE_DESCRIPTION: str + CLASSIFICATION_CODE_DESCRIPTION: str + POSTAL_ADDRESS_CODE_DESCRIPTION: str + LOCAL_CUSTODIAN_CODE_DESCRIPTION: str + + +class Result(BaseModel): + DPA: OrdnanceSurveyResponse + + +class PostcodeResponse(BaseModel): + results: List[Result] diff --git a/backend/utils/ordnance_survey.py b/backend/utils/ordnance_survey.py deleted file mode 100644 index 03a0e57b..00000000 --- a/backend/utils/ordnance_survey.py +++ /dev/null @@ -1,44 +0,0 @@ -import urllib.parse -import requests -import pandas as pd -from utils.logger import setup_logger - -logger = setup_logger() - - -def os_places_results_to_dataframe(data: dict) -> pd.DataFrame: - """ - Flatten the OS Places API response results into a DataFrame. - Each result contains either a DPA or LPI record. - """ - results = data.get("results", []) - rows = [] - for r in results: - if "DPA" in r: - rows.append(r["DPA"]) - elif "LPI" in r: - rows.append(r["LPI"]) - return pd.DataFrame(rows) - - -def lookup_os_places(postcode: str, api_key: str) -> dict: - """ - Lookup a postcode using the OS Places API. - Returns the full API response data or an error dict. - """ - if not api_key: - return {"error": "Ordnance Survey API key not specified", "status": 400} - - encoded_postcode = urllib.parse.quote(postcode) - url = ( - f"https://api.os.uk/search/places/v1/postcode?postcode={encoded_postcode}" - f"&dataset=DPA,LPI&key={api_key}" - ) - - response = requests.get(url) - if response.status_code != 200: - logger.error(f"OS Places API error for postcode {postcode}: {response.status_code}") - return {"error": "Failed to fetch address data", "status": response.status_code} - - data = response.json() - return {"data": data, "status": 200} From 1b9c26a2b62cb32e3bffdfbc1035954acfc6bebb Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Mar 2026 12:32:20 +0000 Subject: [PATCH 17/21] back to origional format --- backend/app/config.py | 1 - backend/ordnanceSurvey/helpers.py | 32 +++++----------------- backend/ordnanceSurvey/main.py | 21 ++++++++++----- backend/ordnanceSurvey/types.py | 44 ------------------------------- 4 files changed, 22 insertions(+), 76 deletions(-) delete mode 100644 backend/ordnanceSurvey/types.py diff --git a/backend/app/config.py b/backend/app/config.py index e87f8374..b5b29137 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -64,7 +64,6 @@ class Settings(BaseSettings): ENERGY_ASSESSMENTS_BUCKET: str = "changeme" ORDNANCE_SURVEY_API_KEY: str = "changeme" - PLAN_TRIGGER_BUCKET: str = "changeme" # Optional AWS creds (only required in local) AWS_ACCESS_KEY_ID: Optional[str] = None diff --git a/backend/ordnanceSurvey/helpers.py b/backend/ordnanceSurvey/helpers.py index d59060b8..fcaa148a 100644 --- a/backend/ordnanceSurvey/helpers.py +++ b/backend/ordnanceSurvey/helpers.py @@ -23,44 +23,26 @@ def os_places_results_to_dataframe(data: dict) -> pd.DataFrame: return pd.DataFrame(rows) -import urllib.parse -import requests -import logging -from pydantic import ValidationError - -logger = logging.getLogger(__name__) - - -def lookup_os_places(postcode: str, api_key: str) -> PostcodeResponse: +def lookup_os_places(postcode: str, api_key: str) -> dict: """ Lookup a postcode using the OS Places API. - Returns a validated PostcodeResponse. - Raises exceptions on failure. + Returns the full API response data or an error dict. """ if not api_key: - raise ValueError("Ordnance Survey API key not specified") + return {"error": "Ordnance Survey API key not specified", "status": 400} encoded_postcode = urllib.parse.quote(postcode) - url = ( f"https://api.os.uk/search/places/v1/postcode?postcode={encoded_postcode}" f"&dataset=DPA,LPI&key={api_key}" ) response = requests.get(url) - if response.status_code != 200: logger.error( - "OS Places API error for postcode %s: %s", - postcode, - response.status_code, + f"OS Places API error for postcode {postcode}: {response.status_code}" ) - raise RuntimeError(f"OS Places lookup failed for postcode {postcode}") + return {"error": "Failed to fetch address data", "status": response.status_code} - try: - raw = response.json() - return PostcodeResponse.model_validate(raw) - - except ValidationError as e: - logger.error("OS Places response validation failed: %s", e) - raise RuntimeError("Invalid response format from OS Places API") from e + data = response.json() + return {"data": data, "status": 200} diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 835910f5..bf8cfdaf 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -1,6 +1,5 @@ from typing import Any import json -from backend.ordnanceSurvey.types import PostcodeResponse from utils.logger import setup_logger import logging from backend.utils.subtasks import subtask_handler @@ -12,7 +11,7 @@ from utils.s3 import ( from backend.utils.addressMatch import addressMatch from backend.app.db.connection import get_db_session from backend.app.db.models.postcode_search import PostcodeSearchModel -from backend.ordnanceSurvey.helpers import ( +from backend.utils.ordnance_survey import ( lookup_os_places, os_places_results_to_dataframe, ) @@ -28,7 +27,6 @@ logger: logging.Logger = setup_logger() def check_if_post_code_exists_in_db_cache(postcode): - postcode = "SE22 9AL" with get_db_session() as session: result = ( @@ -45,12 +43,16 @@ def check_if_post_code_exists_in_db_cache(postcode): # Cache miss — fetch from OS Places API api_key = get_settings().ORDNANCE_SURVEY_API_KEY - response: PostcodeResponse = lookup_os_places(postcode, api_key) + response = lookup_os_places(postcode, api_key) + + if response.get("status") != 200 or "data" not in response: + logger.error(f"OS Places API failed for {postcode}: {response}") + return pd.DataFrame() # Save to cache new_record = PostcodeSearchModel( postcode=postcode, - result_data=response.results, + result_data=response["data"], ) session.add(new_record) session.commit() @@ -58,6 +60,13 @@ def check_if_post_code_exists_in_db_cache(postcode): return os_places_results_to_dataframe(response["data"]) +def get_ordance_survey_record(row, cache=None): + if cache is None: + cache = check_if_post_code_exists_in_db_cache(postcode) + + # process cache with row + + def save_results_to_s3( results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None ) -> bool: @@ -91,7 +100,7 @@ def save_results_to_s3( logger.info(f"Successfully saved results to s3://{bucket_name}/{file_key}") return True else: - logger.error(f"Failed to save results to S3 {bucket_name}/{file_key}") + logger.error(f"Failed to save results to S3") return False except Exception as e: diff --git a/backend/ordnanceSurvey/types.py b/backend/ordnanceSurvey/types.py deleted file mode 100644 index 0631ff67..00000000 --- a/backend/ordnanceSurvey/types.py +++ /dev/null @@ -1,44 +0,0 @@ -from pydantic import BaseModel -from typing import List - - -class OrdnanceSurveyResponse(BaseModel): - RPC: str - UPRN: str - MATCH: int - UDPRN: str - STATUS: str - ADDRESS: str - LANGUAGE: str - POSTCODE: str - POST_TOWN: str - WARD_CODE: str - ENTRY_DATE: str - COUNTRY_CODE: str - X_COORDINATE: int - Y_COORDINATE: int - BUILDING_NAME: str - BLPU_STATE_CODE: str - BLPU_STATE_DATE: str - LAST_UPDATE_DATE: str - MATCH_DESCRIPTION: str - THOROUGHFARE_NAME: str - CLASSIFICATION_CODE: str - LOGICAL_STATUS_CODE: str - POSTAL_ADDRESS_CODE: str - LOCAL_CUSTODIAN_CODE: int - DELIVERY_POINT_SUFFIX: str - TOPOGRAPHY_LAYER_TOID: str - COUNTRY_CODE_DESCRIPTION: str - BLPU_STATE_CODE_DESCRIPTION: str - CLASSIFICATION_CODE_DESCRIPTION: str - POSTAL_ADDRESS_CODE_DESCRIPTION: str - LOCAL_CUSTODIAN_CODE_DESCRIPTION: str - - -class Result(BaseModel): - DPA: OrdnanceSurveyResponse - - -class PostcodeResponse(BaseModel): - results: List[Result] From 4d013f329592569b06c0cabc9d4d0e8899283d76 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Mar 2026 13:23:20 +0000 Subject: [PATCH 18/21] dan's pr --- backend/address2UPRN/main.py | 10 +++++----- backend/ordnanceSurvey/main.py | 20 +++++++++----------- backend/utils/addressMatch.py | 8 ++++---- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 33cb6ff9..4c22db44 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -14,7 +14,7 @@ from utils.s3 import ( ) from datetime import datetime -from backend.utils.addressMatch import addressMatch +from backend.utils.addressMatch import AddressMatch logger = setup_logger() @@ -35,7 +35,7 @@ def score_addresses( if column not in df.columns: raise ValueError(f"Missing column: {column}") - return df[column].apply(lambda x: addressMatch.score(user_address, x)) + return df[column].apply(lambda x: AddressMatch.score(user_address, x)) def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): @@ -127,10 +127,10 @@ def get_uprn_candidates( out = df.copy() - user_norm = addressMatch.normalise_address(user_address) + user_norm = AddressMatch.normalise_address(user_address) out["lexiscore"] = out[address_column].apply( - lambda x: addressMatch.levenshtein(user_norm, x) + lambda x: AddressMatch.levenshtein(user_norm, x) ) # Normalise UPRN to string @@ -455,7 +455,7 @@ def handler(event, context, local=False): ) # Validate postcode before processing - if not addressMatch.is_valid_postcode(postcode): + if not AddressMatch.is_valid_postcode(postcode): logger.warning(f"Postcode {postcode} is invalid, skipping") continue diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index bf8cfdaf..5c8373a1 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Optional import json from utils.logger import setup_logger import logging @@ -8,7 +8,7 @@ from utils.s3 import ( read_csv_from_s3 as read_csv_from_s3_dict, parse_s3_uri, ) -from backend.utils.addressMatch import addressMatch +from backend.utils.addressMatch import AddressMatch from backend.app.db.connection import get_db_session from backend.app.db.models.postcode_search import PostcodeSearchModel from backend.utils.ordnance_survey import ( @@ -124,7 +124,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: s3_uri: str = body.get("s3_uri", "") lexiscore_threshold: float = body.get("lexiscore_threshold", 0.5) - lexiscore_column: str = body.get("lexiscore_column", None) + lexiscore_column: Optional[str] = body.get("lexiscore_column", None) task_id: str = body.get("task_id", "") sub_task_id: str = body.get("sub_task_id", "") @@ -158,7 +158,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # Process each postcode group at a time for postcode, group in grouped: print(f"Processing postcode: {postcode} ({len(group)} rows)") - valid_group = addressMatch.is_valid_postcode(postcode) + valid_group = AddressMatch.is_valid_postcode(postcode) if not valid_group: logger.warning(f"Postcode {postcode} is invalid, skipping") for idx in group.index: @@ -203,7 +203,7 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: # Score against OS Places addresses scores = postcode_cache["ADDRESS"].apply( - lambda addr: addressMatch.score(ordnancy_survey_user_input, addr) + lambda addr: AddressMatch.score(ordnancy_survey_user_input, addr) ) best_idx = scores.idxmax() best_score = scores[best_idx] @@ -215,12 +215,10 @@ def handler(body: dict[str, Any], context: Any, local: bool = False) -> None: df.at[idx, "ordnance_survey_lexiscore"] = best_score # Save results locally - df.to_csv("ordnance_survey_results.csv", index=False) - print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)") + if local: + df.to_csv("ordnance_survey_results.csv", index=False) + print(f"Results saved to ordnance_survey_results.csv ({len(df)} rows)") # Save results to S3 if task_id and sub_task_id: - try: - save_results_to_s3(df, task_id, sub_task_id) - except Exception as s3_error: - logger.error(f"Failed to save results to S3: {s3_error}") + save_results_to_s3(df, task_id, sub_task_id) diff --git a/backend/utils/addressMatch.py b/backend/utils/addressMatch.py index b09c1672..411bb07c 100644 --- a/backend/utils/addressMatch.py +++ b/backend/utils/addressMatch.py @@ -4,13 +4,13 @@ from difflib import SequenceMatcher import requests -class addressMatch: +class AddressMatch: def __init__(self): return None @staticmethod def score(a: str, b: str) -> float: - score: float = addressMatch.levenshtein(a, b) + score: float = AddressMatch.levenshtein(a, b) return score @@ -143,8 +143,8 @@ class addressMatch: return None - a_norm = addressMatch.normalise_address(a) - b_norm = addressMatch.normalise_address(b) + a_norm = AddressMatch.normalise_address(a) + b_norm = AddressMatch.normalise_address(b) # --- hard signal: numbers --- nums_a = extract_numbers(a_norm) From 147982cb7c9939f5237522169eb38f0d4cada53b Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Mar 2026 13:28:16 +0000 Subject: [PATCH 19/21] optional none --- backend/address2UPRN/main.py | 7 ++++++- backend/ordnanceSurvey/main.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 4c22db44..d0ba36e6 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -1,3 +1,5 @@ +from typing import Optional + from epc_api.client import EpcClient import os from urllib.parse import urlencode @@ -295,7 +297,10 @@ def resolve_uprns_for_postcode_group( def save_results_to_s3( - results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None + results_df: pd.DataFrame, + task_id: str, + sub_task_id: str, + bucket_name: Optional[str] = None, ) -> bool: """ Save results DataFrame to S3 as CSV. diff --git a/backend/ordnanceSurvey/main.py b/backend/ordnanceSurvey/main.py index 5c8373a1..70b45079 100644 --- a/backend/ordnanceSurvey/main.py +++ b/backend/ordnanceSurvey/main.py @@ -68,7 +68,10 @@ def get_ordance_survey_record(row, cache=None): def save_results_to_s3( - results_df: pd.DataFrame, task_id: str, sub_task_id: str, bucket_name: str = None + results_df: pd.DataFrame, + task_id: str, + sub_task_id: str, + bucket_name: Optional[str] = None, ) -> bool: """ Save results DataFrame to S3 as CSV in a parent folder structure. From cd822a3eedf6bbce7705d9622ea2c7e74ec6e6fa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Mar 2026 15:26:01 +0000 Subject: [PATCH 20/21] deploy terraform --- .github/workflows/deploy_terraform.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/deploy_terraform.yml b/.github/workflows/deploy_terraform.yml index 0fae2110..bde7eb21 100644 --- a/.github/workflows/deploy_terraform.yml +++ b/.github/workflows/deploy_terraform.yml @@ -316,3 +316,7 @@ jobs: stage: ${{ needs.determine_stage.outputs.stage }} ecr_repo: postcode_splitter-${{ needs.determine_stage.outputs.stage }} image_digest: ${{ needs.ordnanceSurvey_image.outputs.image_digest }} + secrets: + AWS_ACCESS_KEY_ID: ${{ secrets.DEV_AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.DEV_AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.DEV_AWS_REGION }} From 4cf0194b7ec5fbe8cf2a0aff57d3c957df853d17 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 9 Mar 2026 15:36:33 +0000 Subject: [PATCH 21/21] missing bracket --- infrastructure/terraform/shared/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/infrastructure/terraform/shared/main.tf b/infrastructure/terraform/shared/main.tf index 905fbc78..4cf5ac46 100644 --- a/infrastructure/terraform/shared/main.tf +++ b/infrastructure/terraform/shared/main.tf @@ -481,6 +481,8 @@ module "ordnance_s3_read_and_write" { output "ordnance_s3_read_and_write_arn" { value = module.ordnance_s3_read_and_write.policy_arn +} + ################################################ # Engine – Lambda ECR ################################################