From ef40363a69500801297bc192cfb811f22abd331f Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 28 Jan 2026 15:54:38 +0000 Subject: [PATCH] run all tests --- .devcontainer/backend/devcontainer.json | 3 +- .github/workflows/unit_tests.yml | 3 + backend/address2UPRN/main.py | 225 +++++++++++------------- conftest.py | 2 +- pytest.ini | 2 +- 5 files changed, 114 insertions(+), 121 deletions(-) diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index 1782189a..c672b1bf 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -21,7 +21,8 @@ "jgclark.vscode-todo-highlight", "corentinartaud.pdfpreview", "ms-python.vscode-python-envs", - "ms-python.black-formatter" + "ms-python.black-formatter", + "waderyan.gitblame" ], "settings": { "files.defaultWorkspace": "/workspaces/model", diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 95155c86..bbe05753 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -2,6 +2,9 @@ name: Run unit tests on: pull_request: + branches: + - '*' + jobs: test: diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 406a8ffb..3b02151a 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -7,8 +7,10 @@ from tqdm import tqdm import re -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=") -client = EpcClient(auth_token=EPC_AUTH_TOKEN) +EPC_AUTH_TOKEN = os.getenv( + "EPC_AUTH_TOKEN", + "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=", +) import re from difflib import SequenceMatcher @@ -25,14 +27,12 @@ def levenshtein(a: str, b: str) -> float: - Combine token overlap + character similarity """ - def extract_number_sequence(s: str) -> list[str]: + def extract_number_sequence(s: str) -> list[str]: return re.findall(r"\d+[a-z]?", s) def extract_numbers(s: str) -> Set[str]: return set(extract_number_sequence(s)) - - def tokenise(s: str) -> Set[str]: return set(s.split()) @@ -42,7 +42,7 @@ def levenshtein(a: str, b: str) -> float: # --- hard signal: numbers --- nums_a = extract_numbers(a_norm) nums_b = extract_numbers(b_norm) - + if nums_a and not nums_b: return 0.0 @@ -54,9 +54,10 @@ def levenshtein(a: str, b: str) -> float: seq_a = extract_number_sequence(a_norm) seq_b = extract_number_sequence(b_norm) - has_flat_token_user = any(tok in a_norm for tok in ("flat", "apt", "apartment", "unit")) - has_flat_token_epc = "flat" in b_norm - + has_flat_token_user = any( + tok in a_norm for tok in ("flat", "apt", "apartment", "unit") + ) + has_flat_token_epc = "flat" in b_norm if ( len(seq_a) == 2 @@ -67,7 +68,6 @@ def levenshtein(a: str, b: str) -> float: ): return 0.0 - # --- token similarity (order-independent) --- toks_a = tokenise(a_norm) toks_b = tokenise(b_norm) @@ -82,8 +82,7 @@ def levenshtein(a: str, b: str) -> float: # --- weighted blend --- return round( - 0.65 * token_score + - 0.35 * char_score, + 0.65 * token_score + 0.35 * char_score, 4, ) @@ -114,13 +113,11 @@ def normalise_address(s: str) -> str: "cres": "crescent", "ct": "court", "dr": "drive", - # flats / units "apt": "flat", "apartment": "flat", "unit": "flat", "ste": "suite", - # numbering noise "no": "", "no.": "", @@ -155,15 +152,15 @@ def score_addresses( if column not in df.columns: raise ValueError(f"Missing column: {column}") - return df[column].apply( - lambda x: levenshtein(user_address, x) - ) + return df[column].apply(lambda x: levenshtein(user_address, x)) + def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): """ Recursively fetch EPC data by postcode. If results hit the size limit, retry with double size up to max_attempts. """ + client = EpcClient(auth_token=EPC_AUTH_TOKEN) url = os.path.join(client.domestic.host, "search") @@ -176,10 +173,7 @@ def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): params={"postcode": postcode}, ) - results_df = pd.DataFrame( - search_resp["rows"], - columns=search_resp["column-names"] - ) + results_df = pd.DataFrame(search_resp["rows"], columns=search_resp["column-names"]) row_count = len(results_df) @@ -217,13 +211,7 @@ def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> boo return False # Drop nulls and normalise to string - uprns = ( - df[column] - .dropna() - .astype(str) - .str.strip() - .unique() - ) + uprns = df[column].dropna().astype(str).str.strip().unique() # No valid UPRNs to compare if len(uprns) == 0: @@ -256,23 +244,13 @@ def get_uprn_candidates( user_norm = normalise_address(user_address) - out["lexiscore"] = out[address_column].apply( - lambda x: levenshtein(user_norm, x) - ) + out["lexiscore"] = out[address_column].apply(lambda x: levenshtein(user_norm, x)) # Normalise UPRN to string - out[uprn_column] = ( - out[uprn_column] - .astype(str) - .str.replace(r"\.0$", "", regex=True) - ) + out[uprn_column] = out[uprn_column].astype(str).str.replace(r"\.0$", "", regex=True) # Rank: 1 = best match - out["lexirank"] = ( - out["lexiscore"] - .rank(method="dense", ascending=False) - .astype(int) - ) + out["lexirank"] = out["lexiscore"].rank(method="dense", ascending=False).astype(int) return out.sort_values( ["lexirank", "lexiscore"], @@ -307,6 +285,7 @@ def get_uprn(user_inputed_address: str, postcode: str): # Safe to return the agreed UPRN return top_rank_df.iloc[0]["uprn"] + def resolve_uprns_for_postcode_group( group_df: pd.DataFrame, epc_df: pd.DataFrame, @@ -332,46 +311,54 @@ def resolve_uprns_for_postcode_group( ) if scored_df.empty: - results.append({ - "found_uprn": None, - "best_match_uprn": None, - "best_match_address": None, - "best_match_lexiscore": None, - "status": "no_epc_candidates", - }) + results.append( + { + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "no_epc_candidates", + } + ) continue best_score = scored_df.iloc[0]["lexiscore"] if best_score <= 0: - results.append({ - "found_uprn": None, - "best_match_uprn": None, - "best_match_address": None, - "best_match_lexiscore": best_score, - "status": "zero_score", - }) + results.append( + { + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": best_score, + "status": "zero_score", + } + ) continue top_rank_df = scored_df[scored_df["lexirank"] == 1] if not df_has_single_uprn(top_rank_df, top_rank_df.iloc[0]["uprn"]): - results.append({ - "found_uprn": None, - "best_match_uprn": top_rank_df.iloc[0]["uprn"], - "best_match_address": top_rank_df.iloc[0]["address"], - "best_match_lexiscore": best_score, - "status": "ambiguous", - }) + results.append( + { + "found_uprn": None, + "best_match_uprn": top_rank_df.iloc[0]["uprn"], + "best_match_address": top_rank_df.iloc[0]["address"], + "best_match_lexiscore": best_score, + "status": "ambiguous", + } + ) continue - results.append({ - "found_uprn": str(top_rank_df.iloc[0]["uprn"]), - "best_match_uprn": str(top_rank_df.iloc[0]["uprn"]), - "best_match_address": top_rank_df.iloc[0]["address"], - "best_match_lexiscore": best_score, - "status": "matched", - }) + results.append( + { + "found_uprn": str(top_rank_df.iloc[0]["uprn"]), + "best_match_uprn": str(top_rank_df.iloc[0]["uprn"]), + "best_match_address": top_rank_df.iloc[0]["address"], + "best_match_lexiscore": best_score, + "status": "matched", + } + ) return pd.concat( [group_df.reset_index(drop=True), pd.DataFrame(results)], @@ -379,8 +366,7 @@ def resolve_uprns_for_postcode_group( ) - -def test(a,b): +def test(a, b): assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}" @@ -394,20 +380,27 @@ def run_all_test(): test(get_uprn("68", "b93 8sy"), "100070989938") test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633") - test(get_uprn("28 A", "se6 4tf"), "100023278633") + test(get_uprn("28 A", "se6 4tf"), "100023278633") test(get_uprn("28A", "se6 4tf"), "100023278633") test(get_uprn("6 Aitken Close", "E8 4SQ"), False) - # unique case - test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198") test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198") - test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198" ) + test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198") test(get_uprn("1, 5 Semley Gate", "e9 5nh"), False) - test(get_uprn("1 Semley Gate", "e9 5nh"), "10008238188") # this one return "flat 1, in 1 semley gate" - test(get_uprn("48 Oswald Street", "E5 0BT"), False) # this one return "flat 1, in 1 semley gate" - test(get_uprn("42 Oswald Street", "E5 0BT"), False) # this one return "flat 1, in 1 semley gate" - test(get_uprn("46 Oswald Street", "E5 0BT"), False) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("1 Semley Gate", "e9 5nh"), "10008238188" + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("48 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("42 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" + test( + get_uprn("46 Oswald Street", "E5 0BT"), False + ) # this one return "flat 1, in 1 semley gate" get_uprn_candidates(get_epc_data_with_postcode("e5 0bt"), "48 Oswald Street") @@ -430,24 +423,22 @@ if __name__ == "__main__": input_address = str(row[ADDRESS_COL]).strip() postcode = str(row[POSTCODE_COL]).strip() - expected_uprn = ( - None - if pd.isna(row[UPRN_COL]) - else str(int(row[UPRN_COL])) - ) + expected_uprn = None if pd.isna(row[UPRN_COL]) else str(int(row[UPRN_COL])) try: epc_df = get_epc_data_with_postcode(postcode) if epc_df.empty: - failures.append({ - **row.to_dict(), - "found_uprn": None, - "best_match_uprn": None, - "best_match_address": None, - "best_match_lexiscore": None, - "status": "no_epc_results", - }) + failures.append( + { + **row.to_dict(), + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "no_epc_results", + } + ) continue scored_df = get_uprn_candidates( @@ -464,34 +455,32 @@ if __name__ == "__main__": found_uprn = get_uprn(input_address, postcode) except Exception as e: - failures.append({ - **row.to_dict(), - "found_uprn": None, - "best_match_uprn": None, - "best_match_address": None, - "best_match_lexiscore": None, - "status": "exception", - "error": str(e), - }) + failures.append( + { + **row.to_dict(), + "found_uprn": None, + "best_match_uprn": None, + "best_match_address": None, + "best_match_lexiscore": None, + "status": "exception", + "error": str(e), + } + ) continue - found_uprn_norm = ( - None if not found_uprn else str(found_uprn) - ) + found_uprn_norm = None if not found_uprn else str(found_uprn) if found_uprn_norm != expected_uprn: - failures.append({ - **row.to_dict(), - "found_uprn": found_uprn_norm, - "best_match_uprn": best_match_uprn, - "best_match_address": best_match_address, - "best_match_lexiscore": best_match_lexiscore, - "status": ( - "no_match" - if found_uprn_norm is None - else "mismatch" - ), - }) + failures.append( + { + **row.to_dict(), + "found_uprn": found_uprn_norm, + "best_match_uprn": best_match_uprn, + "best_match_address": best_match_address, + "best_match_lexiscore": best_match_lexiscore, + "status": ("no_match" if found_uprn_norm is None else "mismatch"), + } + ) failures_df = pd.DataFrame(failures) @@ -510,6 +499,6 @@ if __name__ == "__main__": # get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"Flat 1, 5 Semley Gate" and Flat 5, 1 Semley Gate) # fix that -# Look again at flat 1 +# Look again at flat 1 # pandas reader the seperate postcode_splitter -# dump into s3 \ No newline at end of file +# dump into s3 diff --git a/conftest.py b/conftest.py index e3add6e6..be5c54a4 100644 --- a/conftest.py +++ b/conftest.py @@ -8,7 +8,7 @@ DEFAULT_ENV = { "DATA_BUCKET": "test", "PLAN_TRIGGER_BUCKET": "test", "ENGINE_SQS_URL": "test", - "EPC_AUTH_TOKEN": "test", # overridden in GitHub Actions + "EPC_AUTH_TOKEN": "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=", # overridden in GitHub Actions "GOOGLE_SOLAR_API_KEY": "test", "DB_HOST": "localhost", "DB_USERNAME": "test", diff --git a/pytest.ini b/pytest.ini index 1422657b..0a0bbf73 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] pythonpath = . addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial -testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests +testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests