From d5c9fd9390f189666fcd6c512d9444570c8bff87 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <junte.kim@mealcraft.com>
Date: Thu, 22 Jan 2026 15:17:13 +0000
Subject: [PATCH] It's working the way Khalim wanted with postcode and then
 search that

---
 .devcontainer/Dockerfile                  |   7 +-
 .devcontainer/devcontainer.json           |   6 -
 .devcontainer/requirements.txt            |   7 +-
 asset_list/app.py                         |  27 --
 asset_list/requirements.txt               |   3 -
 backend/address2UPRN/main.py              | 315 +++++++++++++++++++++-
 backend/app/requirements/requirements.txt |   5 +-
 model_data/requirements/requirements.txt  |   2 +-
 8 files changed, 319 insertions(+), 53 deletions(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index a2fd9b31..ccd056c7 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -34,10 +34,11 @@ RUN useradd -m -s /usr/bin/bash ${USER} \
 ENV PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1
 ADD backend/engine/requirements.txt requirements1.txt
 ADD backend/app/requirements/requirements.txt requirements2.txt
-ADD .devcontainer/requirements.txt requirements3.txt
-RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
-RUN pip install -r requirements.txt
+# ADD .devcontainer/requirements.txt requirements3.txt
+# RUN cat requirements1.txt requirements2.txt requirements3.txt > requirements.txt
+RUN cat requirements1.txt requirements2.txt > requirements.txt
 
+RUN pip install -r requirements.txt
 # 5) Workdir
 WORKDIR /workspaces/model
 
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 80a56bf2..761786cd 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -23,11 +23,6 @@
         "4ops.terraform",
         "fabiospampinato.vscode-todo-plus",
         "jgclark.vscode-todo-highlight",
-<<<<<<< HEAD
-        "corentinartaud.pdfpreview"
-      ]
-    }
-=======
         "corentinartaud.pdfpreview",
         "ms-python.vscode-python-envs"
       ]
@@ -35,6 +30,5 @@
   },
   "containerEnv": {
     "PYTHONFLAGS": "-Xfrozen_modules=off"
->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
   }
 }
diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt
index 300b86b0..a6aebdaf 100644
--- a/.devcontainer/requirements.txt
+++ b/.devcontainer/requirements.txt
@@ -1,7 +1,6 @@
 # fastapi
 fastapi==0.115.2
 sqlalchemy==2.0.36
-pydantic-settings==2.6.0
 psycopg2-binary==2.9.10
 python-jose==3.3.0
 cryptography==43.0.3
@@ -13,8 +12,10 @@ openpyxl==3.1.2
 # Basic
 pytz
 uvicorn[standard]
-sqlmodel
 # Testing
 pytest==9.0.2
 pytest-cov==7.0.0
-ipykernel>=6.25,<7
\ No newline at end of file
+ipykernel>=6.25,<7
+pydantic-settings<2
+pyyaml>=6.0.1
+pydantic>=1.10.7,<2
\ No newline at end of file
diff --git a/asset_list/app.py b/asset_list/app.py
index a832784c..d3ca9337 100644
--- a/asset_list/app.py
+++ b/asset_list/app.py
@@ -57,21 +57,6 @@ def app():
     EPC recommendations
     Property UPRN
     """
-<<<<<<< HEAD
-    data_folder = ("/workspaces/model/asset_list")
-    data_filename = "assets.xlsx"
-    sheet_name = "Sheet1"
-    postcode_column = 'Postcode'
-    address1_column = None
-    address1_method = 'house_number_extraction'
-    fulladdress_column = 'Address'
-    address_cols_to_concat = None
-    missing_postcodes_method = None
-    landlord_year_built = None
-    landlord_os_uprn = None
-    landlord_property_type = None
-    landlord_built_form = None
-=======
 
     data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Warmfront/SCIS")
     data_filename = "SCIS_Historic_Deemed_Combined_Workings.xlsx"
@@ -86,16 +71,11 @@ def app():
     landlord_os_uprn = None
     landlord_property_type = "PROPERTY TYPE As per table emailed"
     landlord_built_form = "PROPERTY TYPE As per table emailed"
->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
     landlord_wall_construction = None
     landlord_roof_construction = None
     landlord_heating_system = None
     landlord_existing_pv = None
-<<<<<<< HEAD
-    landlord_property_id = "LLUPRN"
-=======
     landlord_property_id = "Row ID"
->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
     landlord_sap = None
     outcomes_filename = None
     outcomes_sheetname = None
@@ -111,8 +91,6 @@ def app():
     asset_list_header = 0
     landlord_block_reference = None
 
-<<<<<<< HEAD
-=======
     # Peabody data for cleaning
     data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
                    "Project/data_validation")
@@ -181,7 +159,6 @@ def app():
     # ecosurv_landlords = None
     # asset_list_header = 0
     # landlord_block_reference = None
->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
 
     # Maps addresses to uprn in problematic cases
     manual_uprn_map = {}
@@ -516,11 +493,7 @@ def app():
 
         if not asset_list.geographical_areas.empty:
             asset_list.geographical_areas.to_excel(writer, sheet_name="Geographical Areas", index=False)
-<<<<<<< HEAD
-        print("done")
-=======
 
         # Store dupes
         if not asset_list.duplicated_addresses.empty:
             asset_list.duplicated_addresses.to_excel(writer, sheet_name="Duplicate Properties", index=False)
->>>>>>> 11b482838efcf46f376fd3ecbf2c1bb0be6d097d
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
index f9be495b..1fa08aca 100644
--- a/asset_list/requirements.txt
+++ b/asset_list/requirements.txt
@@ -1,7 +1,6 @@
 postal
 pandas
 usaddress
-pydantic-settings==2.6.0
 epc-api-python==1.0.2
 thefuzz
 boto3
@@ -10,7 +9,5 @@ openai>=1.3.5
 tiktoken
 msgpack
 beautifulsoup4
-pydantic>=1.10.7
 typing-extensions>=4.5.0
 requests>=2.28.2
-tiktoken
\ No newline at end of file
diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py
index fc081fab..1d8a9b68 100644
--- a/backend/address2UPRN/main.py
+++ b/backend/address2UPRN/main.py
@@ -1,18 +1,321 @@
 from epc_api.client import EpcClient
 import os
-EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "")
+from urllib.parse import urlencode
+import pandas as pd
+from difflib import SequenceMatcher
 
+import re
+
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "")
 client = EpcClient(auth_token=EPC_AUTH_TOKEN)
 
+import re
+from difflib import SequenceMatcher
+from typing import Set
 
-search_resp = client.domestic.search(
-    params={
-        "postcode": "b938sy"
+
+def levenshtein(a: str, b: str) -> float:
+    """
+    Address similarity score in [0, 1]; NOT an edit distance despite the name.
+
+    Both inputs go through normalise_address. If both contain number tokens
+    and the two sets differ, the score is 0.0. Otherwise the score is
+    0.65 * Jaccard overlap of whitespace tokens + 0.35 * SequenceMatcher
+    ratio of the normalised strings, rounded to 4 decimal places.
+    """
+    def extract_numbers(s: str) -> Set[str]:
+        """Return digit runs with an optional trailing letter, e.g. 28 or 28a."""
+        return set(re.findall(r"\d+[a-z]?", s))
+
+    def tokenise(s: str) -> Set[str]:
+        return set(s.split())
+
+    a_norm = normalise_address(a)
+    b_norm = normalise_address(b)
+
+    # --- hard signal: numbers ---
+    nums_a = extract_numbers(a_norm)
+    nums_b = extract_numbers(b_norm)
+
+    if nums_a and nums_b and nums_a != nums_b:
+        # Sets must be equal, so "28 a" vs "28a" or an extra flat number → 0.0
+        return 0.0
+
+    # --- token similarity (order-independent) ---
+    toks_a = tokenise(a_norm)
+    toks_b = tokenise(b_norm)
+
+    if not toks_a or not toks_b:
+        token_score = 0.0
+    else:
+        token_score = len(toks_a & toks_b) / len(toks_a | toks_b)
+
+    # --- character similarity (soft signal) ---
+    char_score = SequenceMatcher(None, a_norm, b_norm).ratio()
+
+    # --- weighted blend ---
+    return round(
+        0.65 * token_score +
+        0.35 * char_score,
+        4,
+    )
+
+
+def normalise_address(s: str) -> str:
+    """
+    Canonical UK-focused address normalisation.
+
+    Falsy input (None or empty) returns an empty string. Otherwise lowercase,
+    replace every character that is not a word character, whitespace or /
+    with a space, collapse whitespace, map each token through ADDRESS_SYNONYMS
+    and drop tokens whose replacement is empty (the numbering noise "no").
+    """
+
+    if not s:
+        return ""
+
+    ADDRESS_SYNONYMS = {
+        # street types (dotted keys cannot match: step 2 strips "." first)
+        "rd": "road",
+        "rd.": "road",
+        "st": "street",
+        "st.": "street",
+        "ave": "avenue",
+        "ave.": "avenue",
+        "ln": "lane",
+        "ln.": "lane",
+        "cres": "crescent",
+        "ct": "court",
+        "dr": "drive",
+
+        # flats / units
+        "apt": "flat",
+        "apartment": "flat",
+        "unit": "flat",
+        "ste": "suite",
+
+        # numbering noise (mapped to "" so the token is dropped in step 4)
+        "no": "",
+        "no.": "",
     }
-)
 
-print(search_resp)
+    # 1. lowercase
+    s = s.lower()
+
+    # 2. replace punctuation (anything but \w, whitespace, /) with a space
+    s = re.sub(r"[^\w\s/]", " ", s)
+
+    # 3. normalise whitespace
+    s = re.sub(r"\s+", " ", s).strip()
+
+    # 4. tokenise + synonym normalisation (empty replacements are skipped)
+    tokens = []
+    for tok in s.split():
+        replacement = ADDRESS_SYNONYMS.get(tok, tok)
+        if replacement:
+            tokens.append(replacement)
+
+    return " ".join(tokens)
 
 
+def score_addresses(  # similarity of user_address to each value in df[column]
+    df: pd.DataFrame,
+    user_address: str,
+    column: str = "address",
+) -> pd.Series:
+    if column not in df.columns:
+        raise ValueError(f"Missing column: {column}")
+
+    return df[column].apply(  # one levenshtein() score per row
+        lambda x: levenshtein(user_address, x)
+    )
+
+def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3):
+    """
+    Fetch EPC domestic search rows for a postcode and return them as a DataFrame.
+    If exactly `size` rows come back, re-query with size doubled (max_attempts calls in total).
+    """
+
+    url = os.path.join(client.domestic.host, "search")  # NOTE(review): URL built with os.path.join - confirm on non-POSIX hosts
+
+    if size:
+        url += "?" + urlencode({"size": size})
+
+    search_resp = client.domestic.call(
+        url=url,
+        method="get",
+        params={"postcode": postcode},
+    )
+
+    results_df = pd.DataFrame(
+        search_resp["rows"],
+        columns=search_resp["column-names"]
+    )
+
+    row_count = len(results_df)
+
+    # A full page (row_count == size) means there *may* be more results
+    if row_count == size:
+        print(
+            f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. "
+            f"Attempt {attempt}/{max_attempts}."
+        )
+
+        if attempt < max_attempts:
+            print(f"🔁 Retrying with size={size * 2}")
+            return get_epc_data_with_postcode(
+                postcode=postcode,
+                size=size * 2,
+                attempt=attempt + 1,
+                max_attempts=max_attempts,
+            )
+        else:
+            print(
+                "🚨 Max attempts reached. Results may be truncated. "
+                "(Please do a manual review by the tech team.)"
+            )
+
+    return results_df  # possibly truncated if the last attempt still hit the limit
 
 
+def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool:
+    """
+    Return True only if df[column] has exactly one distinct non-null value
+    (as a stripped string) equal to str(uprn); a missing column gives False.
+    """
+
+    if column not in df.columns:
+        return False
+
+    # Drop nulls, cast to string, strip whitespace, keep distinct values
+    uprns = (
+        df[column]
+        .dropna()
+        .astype(str)
+        .str.strip()
+        .unique()
+    )
+
+    # No valid UPRNs to compare
+    if len(uprns) == 0:
+        return False
+
+    # Exactly one unique UPRN and it matches
+    return len(uprns) == 1 and uprns[0] == str(uprn)
+
+
+def get_uprn_candidates(
+    df: pd.DataFrame,
+    user_address: str,
+    address_column: str = "address",
+    uprn_column: str = "uprn",
+) -> pd.DataFrame:
+    """
+    Return a copy of df with "lexiscore" (levenshtein() against user_address)
+    and "lexirank" (dense rank, 1 = best) columns, best matches first.
+
+    Raises ValueError if either column is missing. DOES NOT choose a UPRN.
+    """
+
+    if address_column not in df.columns:
+        raise ValueError(f"Missing column: {address_column}")
+
+    if uprn_column not in df.columns:
+        raise ValueError(f"Missing column: {uprn_column}")
+
+    out = df.copy()
+
+    user_norm = normalise_address(user_address)  # levenshtein() normalises both arguments again
+
+    out["lexiscore"] = out[address_column].apply(
+        lambda x: levenshtein(user_norm, x)
+    )
+
+    # UPRN to string, dropping a float-style ".0" suffix. NOTE(review): nulls likely become "nan" - confirm
+    out[uprn_column] = (
+        out[uprn_column]
+        .astype(str)
+        .str.replace(r"\.0$", "", regex=True)
+    )
+
+    # Rank: 1 = best match; dense ranking gives tied scores the same rank
+    out["lexirank"] = (
+        out["lexiscore"]
+        .rank(method="dense", ascending=False)
+        .astype(int)
+    )
+
+    return out.sort_values(
+        ["lexirank", "lexiscore"],
+        ascending=[True, False],
+    )
+
+
+def get_uprn(user_inputed_address: str, postcode: str):  # returns the UPRN string, or False when there is no unambiguous match
+    df = get_epc_data_with_postcode(postcode=postcode)
+
+    if df.empty:
+        return False
+
+    scored_df = get_uprn_candidates(
+        df,
+        user_address=user_inputed_address,
+    )
+
+    # Best score (get_uprn_candidates returns rows sorted best-first)
+    best_score = scored_df.iloc[0]["lexiscore"]
+
+    if best_score <= 0:
+        return False
+
+    # All rank-1 rows (possible draw)
+    top_rank_df = scored_df[scored_df["lexirank"] == 1]
+
+    # If rank-1 rows do not agree on a single UPRN → ambiguous
+    if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]):
+        return False
+
+    # Safe to return the agreed UPRN
+    return top_rank_df.iloc[0]["uprn"]
+
+
+def test(a,b):  # assert-equal helper whose failure message shows both values and their types
+    assert a == b, f"errr a {a} - {type(a)}, does not equal b {b} - {type(b)}"
+
+
+def run_all_test():
+    # Basic usage with different postcode styles (each call queries the EPC client)
+    test(get_epc_data_with_postcode("b93 8sy").shape[0], 63)
+    test(get_epc_data_with_postcode("B938sy").shape[0], 63)
+    test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63)
+    test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63)  # NOTE(review): duplicate of the line above
+
+    test(get_uprn("68", "b93 8sy"), "100070989938")
+    test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938")
+
+    get_uprn_candidates(get_epc_data_with_postcode("b93 8sy"), "68")
+    get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "Flat A, 28, Nelgarde Road")
+    get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "28 A")
+    get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "A 28")
+    get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "28A")
+    get_uprn_candidates(get_epc_data_with_postcode("se6 4tf"), "A28")
+
+    get_uprn_candidates(get_epc_data_with_postcode("E8 4SQ"), "6 Aitken Close") # no epc
+    
+
+    # # Example of more than one result for the same address (calls below use an old get_uprn signature)
+    # test(get_uprn("se6 4tf", house_number="flat A 28"), "100023278633")
+    # test(get_uprn("se6 4tf", house_number="28 A"), "100023278633")
+    # test(get_uprn("se6 4tf", house_number="A 28"), "100023278633")
+    
+    # test(get_uprn("se6 4tf", house_number="A28"), "100023278633") # this one
+    # test(get_uprn("se6 4tf", house_number="28A"), "100023278633")  # investigate this one
+
+    # # Example of flats that have different UPRNs
+    # test(get_uprn("se6 4tf", house_number="28"), "100023278633")
+
+    # TODO: house number NLP, address1
+
+    # TODO: get postcode
+    # TODO: make input data from Peabody with 3 postcodes so I have a sample of input
+         
diff --git a/backend/app/requirements/requirements.txt b/backend/app/requirements/requirements.txt
index dff7a546..a45738c6 100644
--- a/backend/app/requirements/requirements.txt
+++ b/backend/app/requirements/requirements.txt
@@ -1,4 +1,3 @@
-# fastapi
 fastapi==0.115.2
 sqlalchemy==2.0.36
 pydantic-settings==2.6.0
@@ -11,6 +10,4 @@ boto3==1.35.44
 # Data
 openpyxl==3.1.2
 # Basic
-pytz
-sqlmodel
-
+pytz
\ No newline at end of file
diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt
index 845166d9..bbf75df5 100644
--- a/model_data/requirements/requirements.txt
+++ b/model_data/requirements/requirements.txt
@@ -1,4 +1,4 @@
-pydantic==2.9.2
+pydantic>=1.10.7
 pydantic-settings==2.6.0
 epc-api-python==1.0.2
 numpy==2.1.2