From e1188ebc180416b1b07a7194a7510e2e993ec7b6 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Fri, 23 Jan 2026 16:39:16 +0000 Subject: [PATCH] address 2 uprn --- backend/address2UPRN/main.py | 356 +++---------------------- backend/address2UPRN/script.py | 17 ++ backend/postcode_splitter/hackney.xlsx | Bin 0 -> 19557 bytes backend/postcode_splitter/main.py | 81 ++++++ 4 files changed, 136 insertions(+), 318 deletions(-) create mode 100644 backend/address2UPRN/script.py create mode 100644 backend/postcode_splitter/hackney.xlsx create mode 100644 backend/postcode_splitter/main.py diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 29c3c456..e4939836 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -24,9 +24,14 @@ def levenshtein(a: str, b: str) -> float: - Strongly penalise mismatched house/flat numbers - Combine token overlap + character similarity """ + + def extract_number_sequence(s: str) -> list[str]: + return re.findall(r"\d+[a-z]?", s) + def extract_numbers(s: str) -> Set[str]: - """Extract all numeric tokens (house numbers, flat numbers).""" - return set(re.findall(r"\d+[a-z]?", s)) + return set(extract_number_sequence(s)) + + def tokenise(s: str) -> Set[str]: return set(s.split()) @@ -38,10 +43,28 @@ def levenshtein(a: str, b: str) -> float: nums_a = extract_numbers(a_norm) nums_b = extract_numbers(b_norm) - if nums_a and nums_b and nums_a != nums_b: - # Different house/flat numbers → near impossible match + # No shared numbers at all → impossible match + if nums_a and nums_b and nums_a.isdisjoint(nums_b): return 0.0 + # --- order-sensitive flat/building guard --- + seq_a = extract_number_sequence(a_norm) + seq_b = extract_number_sequence(b_norm) + + has_flat_token_user = any(tok in a_norm for tok in ("flat", "apt", "apartment", "unit")) + has_flat_token_epc = "flat" in b_norm + + + if ( + len(seq_a) == 2 + and len(seq_b) >= 2 + and has_flat_token_epc + and not has_flat_token_user + and seq_a != seq_b[:2] + ): + return 0.0 + + # --- token similarity (order-independent) --- toks_a = tokenise(a_norm) toks_b = tokenise(b_norm) @@ -99,10 +122,12 @@ def normalise_address(s: str) -> str: "no": "", "no.": "", } - # 1. lowercase s = s.lower() + # 1.5 split digit-letter suffixes + s = re.sub(r"(\d+)([a-z])\b", r"\1 \2", s) + # 2. remove punctuation except / s = re.sub(r"[^\w\s/]", " ", s) @@ -281,7 +306,7 @@ def get_uprn(user_inputed_address: str, postcode: str): def test(a,b): - assert a == b, f"errr a {a} - {type(a)}, does not equal b {b} - {type(b)}" + assert a == b, f"erorr: {a}{type(a)} != {b}: {type(b)}" def run_all_test(): @@ -294,321 +319,17 @@ def run_all_test(): test(get_uprn("68", "b93 8sy"), "100070989938") test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633") - test(get_uprn("28 A", "se6 4tf"), "100023278633") + test(get_uprn("28 A", "se6 4tf"), "100023278633") test(get_uprn("28A", "se6 4tf"), "100023278633") test(get_uprn("6 Aitken Close", "E8 4SQ"), False) - from epc_api.client import EpcClient -import os -from urllib.parse import urlencode -import pandas as pd -from difflib import SequenceMatcher -from tqdm import tqdm -import re -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN", "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=") -client = EpcClient(auth_token=EPC_AUTH_TOKEN) - -import re -from difflib import SequenceMatcher -from typing import Set - - -def levenshtein(a: str, b: str) -> float: - """ - Address similarity score in [0, 1]. - - Strategy: - - Normalise - - Strongly penalise mismatched house/flat numbers - - Combine token overlap + character similarity - """ - def extract_numbers(s: str) -> Set[str]: - """Extract all numeric tokens (house numbers, flat numbers).""" - return set(re.findall(r"\d+[a-z]?", s)) - - def tokenise(s: str) -> Set[str]: - return set(s.split()) - - a_norm = normalise_address(a) - b_norm = normalise_address(b) - - # --- hard signal: numbers --- - nums_a = extract_numbers(a_norm) - nums_b = extract_numbers(b_norm) - - if nums_a and nums_b and nums_a != nums_b: - # Different house/flat numbers → near impossible match - return 0.0 - - # --- token similarity (order-independent) --- - toks_a = tokenise(a_norm) - toks_b = tokenise(b_norm) - - if not toks_a or not toks_b: - token_score = 0.0 - else: - token_score = len(toks_a & toks_b) / len(toks_a | toks_b) - - # --- character similarity (soft signal) --- - char_score = SequenceMatcher(None, a_norm, b_norm).ratio() - - # --- weighted blend --- - return round( - 0.65 * token_score + - 0.35 * char_score, - 4, - ) - - -def normalise_address(s: str) -> str: - """ - Canonical UK-focused address normalisation. - - - Lowercases - - Removes punctuation (keeps / for flats) - - Normalises whitespace - - Applies synonym compression at token level - """ - - if not s: - return "" - - ADDRESS_SYNONYMS = { - # street types - "rd": "road", - "rd.": "road", - "st": "street", - "st.": "street", - "ave": "avenue", - "ave.": "avenue", - "ln": "lane", - "ln.": "lane", - "cres": "crescent", - "ct": "court", - "dr": "drive", - - # flats / units - "apt": "flat", - "apartment": "flat", - "unit": "flat", - "ste": "suite", - - # numbering noise - "no": "", - "no.": "", - } - - # 1. lowercase - s = s.lower() - - # 2. remove punctuation except / - s = re.sub(r"[^\w\s/]", " ", s) - - # 3. normalise whitespace - s = re.sub(r"\s+", " ", s).strip() - - # 4. tokenise + synonym normalisation - tokens = [] - for tok in s.split(): - replacement = ADDRESS_SYNONYMS.get(tok, tok) - if replacement: - tokens.append(replacement) - - return " ".join(tokens) - - -def score_addresses( - df: pd.DataFrame, - user_address: str, - column: str = "address", -) -> pd.Series: - if column not in df.columns: - raise ValueError(f"Missing column: {column}") - - return df[column].apply( - lambda x: levenshtein(user_address, x) - ) - -def get_epc_data_with_postcode(postcode, size=500, attempt=1, max_attempts=3): - """ - Recursively fetch EPC data by postcode. - If results hit the size limit, retry with double size up to max_attempts. - """ - - url = os.path.join(client.domestic.host, "search") - - if size: - url += "?" + urlencode({"size": size}) - - search_resp = client.domestic.call( - url=url, - method="get", - params={"postcode": postcode}, - ) - - results_df = pd.DataFrame( - search_resp["rows"], - columns=search_resp["column-names"] - ) - - row_count = len(results_df) - - # If we hit the size limit, there *may* be more results - if row_count == size: - print( - f"⚠️ Warning: hit size limit ({size}) for postcode '{postcode}'. " - f"Attempt {attempt}/{max_attempts}." - ) - - if attempt < max_attempts: - print(f"🔁 Retrying with size={size * 2}") - return get_epc_data_with_postcode( - postcode=postcode, - size=size * 2, - attempt=attempt + 1, - max_attempts=max_attempts, - ) - else: - print( - "🚨 Max attempts reached. Results may be truncated. " - "(Please do a manual review by the tech team.)" - ) - - return results_df - - -def df_has_single_uprn(df: pd.DataFrame, uprn: str, column: str = "uprn") -> bool: - """ - Returns True if all non-null UPRNs in df match the given uprn. - Returns False otherwise. - """ - - if column not in df.columns: - return False - - # Drop nulls and normalise to string - uprns = ( - df[column] - .dropna() - .astype(str) - .str.strip() - .unique() - ) - - # No valid UPRNs to compare - if len(uprns) == 0: - return False - - # Exactly one unique UPRN and it matches - return len(uprns) == 1 and uprns[0] == str(uprn) - - -def get_uprn_candidates( - df: pd.DataFrame, - user_address: str, - address_column: str = "address", - uprn_column: str = "uprn", -) -> pd.DataFrame: - """ - Annotate EPC results with lexicographical similarity scores and ranks. - - Returns a DataFrame sorted by descending lexiscore. - DOES NOT choose or return a UPRN. - """ - - if address_column not in df.columns: - raise ValueError(f"Missing column: {address_column}") - - if uprn_column not in df.columns: - raise ValueError(f"Missing column: {uprn_column}") - - out = df.copy() - - user_norm = normalise_address(user_address) - - out["lexiscore"] = out[address_column].apply( - lambda x: levenshtein(user_norm, x) - ) - - # Normalise UPRN to string - out[uprn_column] = ( - out[uprn_column] - .astype(str) - .str.replace(r"\.0$", "", regex=True) - ) - - # Rank: 1 = best match - out["lexirank"] = ( - out["lexiscore"] - .rank(method="dense", ascending=False) - .astype(int) - ) - - return out.sort_values( - ["lexirank", "lexiscore"], - ascending=[True, False], - ) - - -def get_uprn(user_inputed_address: str, postcode: str): - df = get_epc_data_with_postcode(postcode=postcode) - - if df.empty: - return False - - scored_df = get_uprn_candidates( - df, - user_address=user_inputed_address, - ) - - # Best score - best_score = scored_df.iloc[0]["lexiscore"] - - if best_score <= 0: - return False - - # All rank-1 rows (possible draw) - top_rank_df = scored_df[scored_df["lexirank"] == 1] - - # If rank-1 rows do not agree on a single UPRN → ambiguous - if not df_has_single_uprn(top_rank_df, uprn=top_rank_df.iloc[0]["uprn"]): - return False - - # Safe to return the agreed UPRN - return top_rank_df.iloc[0]["uprn"] - - -def test(a,b): - assert a == b, f"errr a {a} - {type(a)}, does not equal b {b} - {type(b)}" - - -def run_all_test(): - # Basic usage with different post codes styles - test(get_epc_data_with_postcode("b93 8sy").shape[0], 63) - test(get_epc_data_with_postcode("B938sy").shape[0], 63) - test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) - test(get_epc_data_with_postcode("b93 8Sy").shape[0], 63) - - test(get_uprn("68", "b93 8sy"), "100070989938") - test(get_uprn("68 Glendon Way", "b93 8sy"), "100070989938") - test(get_uprn("Flat A, 28, Nelgarde Road", "se6 4tf"), "100023278633") - test(get_uprn("28 A", "se6 4tf"), "100023278633") - test(get_uprn("28A", "se6 4tf"), "100023278633") - test(get_uprn("6 Aitken Close", "E8 4SQ"), False) - - - get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"5 Semley Gate") - - # TODO - # Lets write some tests with hackney and then peabody data - - - - get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"5 Semley Gate") - - # TODO - # Lets write some tests with hackney and then peabody data + # unique case + test(get_uprn("Flat 5, 1, Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("5 , 1 Semley Gate", "e9 5nh"), "10008238198") + test(get_uprn("5 Semley Gate", "e9 5nh"), "10008238198" ) + test(get_uprn("1, 5 Semley Gate", "e9 5nh"), False) + test(get_uprn("1 Semley Gate", "e9 5nh"), "10008238188") # this one return "flat 1, in 1 semley gate" if __name__ == "__main__": @@ -706,7 +427,6 @@ if __name__ == "__main__": ) - # TO do function dispatcher, # get_uprn_candidates(get_epc_data_with_postcode("E9 5NH"),"Flat 1, 5 Semley Gate" and Flat 5, 1 Semley Gate) diff --git a/backend/address2UPRN/script.py b/backend/address2UPRN/script.py new file mode 100644 index 00000000..bd8f8017 --- /dev/null +++ b/backend/address2UPRN/script.py @@ -0,0 +1,17 @@ +import pandas as pd + + +# use Address 1 +junte_df = pd.read_excel("hackney_uprn_failures.xlsx") + + +# use domna_address_1 +khalim_df = pd.read_excel("khalim_standard.xlsx") + + +combined_df = junte_df.merge(khalim_df, how="left", left_on="Address 1", right_on='domna_address_1') + +# Find the row in khalim_df that does not app + +result = combined_df[~pd.isnull(combined_df["epc_os_uprn"])] + diff --git a/backend/postcode_splitter/hackney.xlsx b/backend/postcode_splitter/hackney.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..b6d3786efc8c8cc0dd706371bb1f664ee9b978e1 GIT binary patch literal 19557 zcmbTe1yr2Lwl<8rI|K;s?v~*0?(Xg`3GVJ5+}+*Xo!}58Sdid^@HJ%aolNdI^RM;S zV$t2ZYd^KEq<2-la+086Xh2X#7nZdUZV4#rlF^scrJmIk)AmUM2` zR&+0viuf^$J_a>rges{Pe>6$5Qx6Jg7q0s(w6i z=3xHPMcqd7Y^3-Lndvz6gpZ1yjFqTY-nXq@xyK&z);Y;3(InOaME6mWeir@bixg8P0BOSO?{j{09%Zq-@2eu<<7Q_B#ntHP1 zvq)DTG2N~|9&M>LqL$_7o`NcZhvM-VJ4Z(l0Z+vabyJllJW%zX?+IQ`5)x|r4aMgh zfNj`8fq>-xtMw3nS?_nd6)0=lt}vi^Z&nS{)U%7hk>*9n)c{!(d=s=*&F@f>*?xOLT}Rb#a^Nojx)dhUFX|apZB%;Hk?2bC)P_G9s0ls$z!4 z37ICOWwdEeeP7RD@U;lY;er$_qzpe)8XU`jQS_%64F$g8G7PktzP!V2&L~bco{2Kn z0TSxxV15I;(w)%!!7si}O`KM7&eu~@gnXy^z`!fNiU-{de-@lVt7bI&UBa^j&4=KI z`q+?*c!FaYj=tOzoBU-0Pf+a&~K|o5daXjyAtb$ESf!m)sgQrVtupG@|M*Rfb zc|tT4^R@Em5$O6|a}rZQq_lxN6otC{u&OY@@H;sp%MBhpeQNH+7J^B?^5VHNsJ=;Q zko(i~Hym=DsPPXNpwzGIHGX+1HX$naQ-a<`aHF*W;_OdW2L6sE4z zx-{|hqF7d(1Ul{2$z7KYiYHG3!_d>Ky$ zR99^>7?8ZrRMpzqA(D=%(u*xxj|Axz;(b0YrAQjW)n?{D-r~V+*H#}&tmhNdxOwf5 z*O*$>B`c&4zn#zwVgf=3rL{GBI!Uk*4#*`FR7llHjB<^@Wni@@YDpthjuXRDde`o?LXNYOW#Y zqBs*A2fZFAHW&(VNJVd9Ovo=WeYXxYqfI>B>XGyiS~n=m3NgAz%la^c50SpcJrN!E z`5STa*Vyw&8pddST&r3+cts z=NS<|m%as0aS+&?o#)RG=7+E@I2-msCt!$QFIXJ%t(=<@XGPca&`~i-7gY=RM{zo| z5m~b7(>$xEwum0E(%)$*VLKs10?lOd4R91@ zp-3!6M8+#ZOF>cJBPn6**dBEvQNBQseOVah2u4Jq zzMeP6F|H16-|}sJMU5ia$ay8#id0Uk!bFJ9R$!_pMVE?r6JN%GGzeRUz}H2bfWApa z^w<)6f=!NW{P7!1zreCNKh7J@T;WN{nR7EX{cD0*T#C*m5s@&I$v6`qZiPu7hEim9 z;rt23w1Y~}ZGWjuYD>An?>Pe-)W+5n$cga*cxh7^O&mHfm?F=#wr-stTT&XQQ6uKy z)AbVO?FxIpfmIp9%O?)g2)RM%QYkg{RTRv!3+K<`LTcP*<$3e&2`y;E<2IT$vUJ9i7~*j2&P4&Z)Y#?CR^BQvKuoCBLSgL>QK0QO759H1pYdNOpS^ z*r6y~Obwr>6AtOj*`!%uDor_drUqOXg8g>NE%f z&z)93UbE$HTbU^N4(A_GXWWWv2G{q=3&=Hus<~*LSf&L?VL}wVmkFNN|m@Va22CR3;rO7F~Yg^$b9=sJH{ zD0YlN*!A*Qd%iYGmvG@xE_IyTiU!aM0~`zqDTY-(KA$&l7G@Zb;@C_;$&aja8Speu zpG^=Y%tY5-;qs)e%+D!&QgWl%x$MGFs6vu(Z{{$4rZ5Ke`Rs|5_4sF{bV=!P@q{F@ z%F@9_ir8XxSwM@qVoRf>oa5jRkKVAZLMeJw0Q1K0CA>@2PvWs*E%%Valb2N9OcT2X zrGnvlYmb*tC1ZG})lx5onUGI2L>;vWp3D)ad{8mo2Skg@KM3wFm*d=#?_4w&sSziZ zKE6vB<3{$uE>sF_;85q2L3u=EZoq)D0jr9MU-c^E+gKB|5hWHDpuHK|C`=M&6zbx= z!|^!ug0%vfjZUrw9@W6wiAII+G&f$OA5r5FOrBL0QBtCX#B;HS!zY4;u3lmvC_Oj$ zYYy?y-aN-FXW*fvE|PUDSGeOlQ>LGaAxz3%0aG< zELC0q%EVJelPdDnuGP`3HLD*}1^e@lY{$=w6@q1hFjwM2P*qpHm<^5Q@sv4n4BvNo zAha%dL}5QvZGzz1f_>Eeb_T&><_lrTuSm!cz7M~jA9!fpSMQM7Xh(i*w^=j&)Zm-Z zK=9;@l!tR(5$1S;S0BflMOuLxr`DB89;GEABVI16I6HO?X%q#njX$Q8la0#z8SO)M z|3gz+RUOX;-p|)bZr4rM(G1WL>0oKzLIM!Nddlv+w4M83}%SyN1*K5Nu&?GiJ0On!MpY-=XI%w53LM zcNmc-MEp42#v(3*RPxkK~ zSFYL`SKC`8bSA1yCiW{e8`7t2D}UZUw(+8?)~#2sl_wqB%(b^X%$!vI^m?>q-)#Lk z_ASReNS&hkb4QB@R?zA3@$ve);P>XWj-$p2vLCyh$8VDb?<}A1J(^C__beVnqmM1w zvk#V{$6Op6s?J+^i$q*MY;LeRyWUnuJa%%GZItTzzy^D) zWyfom-}1UwCg;nY)pQ)UX}j{aw;tmowgri*>m<`hefK>3`5`fm2KQ*m#j9woX8FrH zr|5n2Nf(dxGn$1hXo>7;E1r)Oq?fZN^TM+zZ=3Cs;M{YCW+3~$Ps;}Dw(08L!M-8$ z4@d4Nue**%QMc{8FUPCt$9=q=9qJ9+Pgu8Bi|n@mYs9rz#=C16g?o)l*F0OK+qT8? zPPrhiH@WK5XD^Pe*L=O&`EYf5n)}UG~HIrZk_cSB@<& zZ@`n+<{IJ@Y4BcA^h9gRo3w?)uE%qh@;;IGKiz)JS7gJMr#+S9jcx zC2xYh+5fn|YU7Ep)!%%wVw>}1&cWVnIYoP~#HDiQ9Mbi(b2Iz%w+{$stkjOJ6LS*I zD3q)}&+qGTqJLaX4LKgHI~fEsU!JQ3HXTO0u=0S}eN)b&A8l7NnzA+Gu=cSWj3@fR z(e80u58LMKW&FJO6OMfPBC(Hh?sO{hc4El{k4oy%?LlLC|0ivWPmN>~`!>(A^fTU6 z$hAY;o2eGTnWsj?$JWg&!8Q%nH?5z|)1$}E^t<|=S0}EX*Ut^P%6ji>=)c6NdYUA! z=W|=l#!4qW$I(9YVLW}uSpSaE_o$JOylLdIY&6=kf1jGp)9$k2nfkcXydGiY$`x3& z>HCyg|4gNUFt(cB2^cJkPJE6Z}_2N|tk&=2oDFgYJcy!Yt5?`&3TPK<^W z?C(;B`?t?f?-Q0!fF}+dlgGb%>4ext?m5WxzKK7tHiYE|L5&NDj-lA&^xZ>4DPasn zb16q6VIUOGXGFQo=yre!9ZwO0;8uY_Nr@q;#exQI90btTQiKq=lZL0nQDWaFv<&v5 z8&mbNE-I?gM5xo8^9`K22ouE(5DoUiVyMaL%VX710EnHcBt+_H$QY?S^wn9txhy8~ zJHDgziYT$!n54$?9LWVOcZp>+8Gb~mZ|bnvDWX|08Pr5(IgB_E+)ZNqaH3O;e@eQM zxRuY+t3kMtz##Hwn_gZKfV1K+Zlr#KB%5UdknQf*zQ<(o!&vZ?pEC2mk;=EMhCAPa z;kp>!F5R^PmoCR+;U{;p(+|Sl?GbjG5?sNgEWW5_n*p1cCXItMiGf_2c^9p4$)PEV zYWY@r8X+@*A&wtkOPm}Z_KxSBrZ}o4x%4zrCK_XsIB9f95=P*>f`UA#0c|0)MeZ00 zQrrk>Oh`J0Fr9A(ZBb}LCK*LI0$hO67TmU9E2Kg%wnJksm!+_?S-1=p)Yvd{zrFCf zZpW~0Z<^BlM>XzJL2aj5el>`XcB1cq#7Sf7NEcE!afNlS6a>^nbT+uI5zK*hw&6He z6*3zNxX?uCyPVYrB?*oqXMv0eSp!}$EK3Cquk1iGjetTtx>$fH^Mhl;aF8rzojr}U zGVsfPtT-1ZLJ4(PbAiNS{>;yX=+{ufSijO3xYucxb?L^6(mSp^ zFAarQIG_tgMeX4!q!%Mum*QtKCgKY)L6Ml0h6jQg(VbB){zYfS>#q@)p$Y>`mljK+QqEaDRpXxvk1Aejo1D`MG64Z9 zyUk_3SS`I&)O37HGsp-Cpu|Nb_b_%kyuIl6*)F1!dGh?Y`3F>%h_yYrH?6y??^i-s+`mzJ_-$HKeV)8 znWCc;EGDI)=q4caG64xS%p-@?E2+o>8(5c6UI-r&(17wON#gl-L+#bkvNkl%C*0*a zAteD5lomh`I8@tCP*L%f)3O3U>7M|g#YtQszEd%ORfQr}!yW-by7oDO0`oDRT$~@* zP-HfUP#9Aa0U4QpEFnfZG%n8t>0eP~&W)G_P&Z&-$iN}Ea1T15&571}wgueG zni$g|@_tf%7`}#{7+i`0YS7*XSsN`@9T2 zLQXecd~y>FC=yHn5IIOm9)AeV;Q(Pmv*Ee2025G-5Y#0lX;~hhnh@>PEE0y}8sokB z2-79eUJPd-(tBJma22sUB;~1aMKQghEf-~Bfe=DMtlc3H>U{kWn1caE4nO_}c4$Hf zQ3kP7lRy?Y`H6N82w4WPV=}`2Eog|tr1F>`LaJoUp%@f61zK?^#(XgU_JtuBbF5qm zcY8SO9)hRP03^X6tgR917tFBT0mj`B{>m%GSR{jCXo$NI5&&x9Lo;82q@FYhiU8c# z=uj=W;+M7xL0b4q{Gq|n0qwJh;kr!$P^Hivd^M80dwEF$u%-!7TMT3-U?{O*6hmpI zk5SY%g93mA`DqmB;1ufwa3B=JX+ng1ndks@Lxe}KWi3?1`t&Sf!E40SDgpwKKmo6M zAJJzSQZ$VCX;2m^2nqZ7Xi!8ntN)jGl;jsL`n%tFef$G=lZl!Svb ziet$MP?g|HN!JpPK_7!nBiRv9L4OewBaIX^FO4IW7f*qg72FZrwHDVx$qI~@pn_CP zrU{iCJsd-a2Zkhycc^uee&IK+C1HYQPB#a%l%DWC+fqS%PtPKrv_?E-T1v3n_iH|` zqtYvPCQYbJFOLaMT(q^sPP_w%-7C9yElDXa~o4^2D>P^xeK?KnbzJtv2lR(VzdNtodgDeOIVkHsN_RE5I|E-qR5xQrr{ ze+VfZlk^L5!C@tXM=AwUND&O8Mv_yv*rdS;tI&tj8bPl+ZIy-86c|sDUSF(0?}Z!( zAS0kICKpl$@bf6jWR|GY;suqM%cT4F1vC_xOH>ZKNOc7GW@8_$nlQ01Oqe@fGU%g# zN+V&JLK9^wa6jGfERi25=PP#%p}npn}(gmJZPw zt(ilYM#d2msF4N+6sCEB05Hd95jgdL=*}oTf@M zVD>4LD`lturK&OuKckaFzx#lq7^w1mPqnoQwLssZFe-~6q+pXfjQOW}7uXa>|6?W_QeH~@7)Thzm}>Nx>l7`CX>Ugy$oU>sgD$id;k z9TZaXu;P{$`y;QHtL)BKE6WEPeAQE82k2UyivZzLr-6*qNOc6L>f&rnDexl1oz067 zD#CbSIeqwrG{R<|!nm;ZUxg?wCZ-C*<5mxwCZK}J>rk(Ng4aZM780)EC6xpiPgsAN zy$n4^zbrROjEt?w2w=QoiijtZ1|oo_^T;FplL-JgB=kVSt#Xxspo%aqqErUB41n=+ za~+)QCM*tFb~>WzWcpCi69qq0rwQO>6&K^G>AVU_cNA(o3VizFrGDWYP3V_~j&rlr z$gXg9Da|gAL&Hby=UD%TT8O)*bAdKXKnGJ+So5Oe3%dX!XBrVpNcbr5!T(kyyOPWU z$k|K*muC7Rr}jMRSb*>M7bjx|=6rvZGu*=gUOA6nO~mfAmKDr}G_kjGL?!@)1kIz&k43>u^#5-8%g2fasnd^Pub9&} zVXv4ofQn9?xv%0#G3&jG!$O(_tq5-hus9u+DargVc?$cP_MAe0F*{re{1qS)G^kuS zGiW+}g++88D-h#FsXwwx)hD3PHOZwztpzlR(}QsN0@nHGM>g7T@s2@qmwdDtFiWZjB-`<$&5Y#hJhiA9t- zAZqu8-AUf+S2t=ki!%ATHy;-UkiE4yIiAPjS4cRr2-5!#Xa}JEUr{B|3eInO6IQgF zrZW=$O^E(m<|7&gqF6RuD%9V!8yxr@|P zprT9lge+- z93zE~<(VYQd#w$p+NJjw28sLE=3d1RQiB%k`j3Tw5y{*$tVMqj={&k$^q@^KT#aub;B@Hf^625##Ki&iCYKx)i3LcKB`tTiGXI_lRH>UT1{nMB zXt%&knI}-GdG}k%R!gr1AfWJ)uiP#<-lh~lp*w#|qOYNas3=jWQP!Hh7?FHaVHPI0 z9oAL^a;sq`ofuZJ9rsff-&C>=EhjfxnZjP?wJy5WEIdpWnkr8s2)c8*09SIJu7!qT zk->3OK6^qihBXhkW2?l1ZCxDPJ5ym+n#o4`L}KmS_6#VUi-i82{nAYc8iu1ZjD?z% zhYI4ST%K{d1Iqr~htm*YpRE`7Dt0;NJEBE276SsR*e5*^Kk4ixND6(9#SkeG>(0!> z^yJTWqSW$#)$Vc5PfrM@3ju_1v8%l(&8(d{z~&{}T(hd9eMexw$eh8%KkT(7`AWZlaNB$Gy_InrMH2%c1u|I%pOvDgwN;u!lh zz;_!1cs0YTi7&A-?|^5{yO%RTAv;1%#2&3ix7Jx={^HaFYS9NdcQcMr#gC++0CE?A z{2Bp4O8AQ{TQ3o-)jVDL7jaSvpR0Bjtp}w}>t0@I?pJKnmhzN?3Mhg}jyvcRGHLvZ zK-I+Hqy~U4{WO?C4S!XXE^Ei-(Y#1#hjo^+r>Rasp#2q>9ix4hj68vA21&5DP3H2~ z-`03&YwxgZZcCf7aBEM$t~%UX>z?7KnG`4YUEY}7vpAitTu)!sb+){<#qaz{iyhzP zW9M0Sx;k;L&t1Q6!tQC7ROa&C{kEMs`_bk4y;W-aWsUnrM{uR>2bXmH7N1osfd$<* zRzQAP&@-5~89`IW$L43_32&2r-|_x?rSvYC!i_UTuZQo`)i$uRS}no=#skRX#Ii#x*^|zHA9trfTI)U;+W@od4UF0NP*M zm1g=5#zu-x4(2wdue+87nqv;DY-sDZ)C?=^M0|eH5vLKP+~TZ0#Yu|n>pes?_Cm3A z{GYEkF_jc0(_JPzmfZ}3Z?izx*rBq#9xOG(F;`nktz|?n1zAuU)f9q1tR#bgQ-bMQQU{dD!UNWN~uMSl;GxJ3MeXD8R z)mNS9y7h3p%*^D1=O)xkqqXeJh+y*PgG2XYk1h(>cR#FsTtVO6(FdzV3>0kU zeJ|Zy7Lht$)F(=A5yR*bh~D^|Xzc;>7JCj_e2~b9KY3=S-5k!o$OiwDD08WexJi77 z_>xVz8^bjO`xTrMaV~(AH;iI0lQ@m6E)Yxlm3j&ybg9rrcR<@xV(k`V`Mt2#lWU3( z!ZXV@Q~_L0264UaPxf&ygUiK+c{oqUXb4vM7L52a*CuuY59hY#ezuT^*pRBiW2#j} z%y*FSq}tw-F_c(L@9F0PQ5Y?kB8j<2aK|RoayH1nTGm-mQ5BR;r96z6ogED|TDq1s zaD1qlK!8lu_lc8Wy!EvPjD^c}XDO1#PS@`I$rl}?+SfbjTN|1&N*$(ZDDudV1oAW` zDWrR@mS%^Vo6@rnnB`BixKEU#6knjk2jUDWBrdV`+wrtgP6278!!NLdlqlhix~r2h z@yn`MpPHsDTYYG3hjwyGB_);K_qD#4R>_Jby%7#b+F-qnS@QHr9GaahgxTn z?}}>4Crj~}AWUQoa6nL)}l%n?j~X42|LXr*;%|tZX>)* z)j#w0IT!yz(pMXF+Y!ndsqT``9PNwjV$f!aCR!V#7c0gQfIy*#KL#RSJv`Yb*S$;! zBG1qUH$wU~W(R|dgGr|V zv7uw(${i92YM)+1r;j}@s=>W_k7b}#z!w zsl7?ZUhn?}`Y5VD` zUPj=va3>K_2DQOtdt3<6Nd*mv9ul*aC4C07oddTqU=jaBCfHRLc%8$r0VB81Yc!!} zH5kj+cW(nw(5b7b=ivk((avwgwE}YV*dh||j-<^VvznWhP0P-_;=tMwo$Nq`C#y3hhPSCTVp;Y_aT3$7vS^|q{=3ZAr{SmguIq1{K__cutu9jn z3G)2N4-O#lvr<-TEk|rV?s0D-#4QZLrM@E_XV@4SKJxySJGSeDjV`>?dxCFmxCwPV zME?^d++*N-Wyhu=MzniuBEx{2eA2m}FtcIA7wA6GFz%*}M!bTt?=BB88(CrAgbwdc z->eaiR#$abm9_emT4{65#-vS)-}fZZDNY-=^A9t%(66uG3@v5%dqZsYBg9Kc4G9FKBJppN?O*%v zMh^O}FY_$@UvFNI9Aw5!AowvL1)RMH|J;W54omQ^1u2{Ft)3vJnLv|t6HI4^bWGa? zN9uTTYRSsNR=vZ4*$qju=;fe#7iFHv% zL;|}6lxT%}$KCo9y+&4$;!6?j{e4o`*$8WloF7>7S3Mi;uP*bxymn}#L)E9Z=#0}2 zqeH&2-4t^6an)>Ot!WA*u32)=--RC2uns*uW+hW@q{Zj@=+2;=Fy_kXDn<;`(j_gE zbko#(qJ3hHP+x*3+m0a$0boKmuzw30WPllsYz^fcZ0#KB4Q(BaUk;T;DvZehjM@Gj zJ@zd0JCeLHYl)D&tdf8QH4?j_t)k$LTH`9qhv;~g96rP~7t_Hy#4|7ZN{KVDl@JXR z3K)T1Un5q`+Tm%NwoB7t<|GN3F%5wr4F{?yy=}4WOt#h5>Z7(UO8@=BtI>5W1uc>7=9Lv5k++aHL8E z6+QAu+qMqIb?u^@pO@fhQ^hKzrXM zi37|(VKg7W4a?aW>BQ}QaS6QSh*0E>_`$Iu0u4=!QfTY=BP$!sH@7brPd2}88$CvJY%>Qx-{(fh{y!OC^$6Y%rkMfbJt!JnnD zH&eihC}g`nF05N-)2fShDC?yA^14nEZIu%jPoODyB55d?n);GhTZ}&(Gw(59yhYEA zeryZy)*;{!?SDU1j`X{?^zH0koRt~V12~8e7j)(srspAj5=0iT(`*%y3A0K_-3C6T zq7mKCR--W6l)YNpcyLJagp!?~ZM&Z%8^r+e#Y#E}n;2ScufFCn&5O;}TsZFIti@+; zSXNGcf;SiaydnqsLZF19U~eoO_^vD^{JFXb1Tj|GT)5Z~Pz*W0l^k>BZ7r0T1BJ7g zmgh4NtHE}?rjuR5 zz)8qz;uh*G8J&f`mP*t3-oWyNrAb%$b5hx^er_c1kB&$4sX9fRMbXX7p02E# zG~p2g&2T@=Gd8@@yI$;Tk99oHN=nE_`jUNdV<+~ zr%&L^2PV``WV}E(YeQ`W$KQ+ic~VmOxkE-Vt+cGv+66z_c@i%Ymp^xPL*Kga}cVUU#XA1z*`G#IbQKl11IvM@Ws3t9K1XgBXdI>IG ziZXX8lJM->012|d7>8STD(@ot+u_1Qed-y{G&?uNQ2EE!sRHU>@pv6v}pM5UgRp|JcY492k@;wWy41e<}2Y)9id9+ zIJ4GotbD{2W}4XDYPWj6Oip_BDNQI&o$$Y>u;f5fM~H~1?_eShwsc+jmQ%vM5NLuN zPbxJEp2S3Emtx=q*-t<|OJuYdE1uosL2JPE6e2U+h6O#wvzha3!q;H{7$UY-JAoEE zaG&}b`#zE2RcNuK-sOSpq2w;Gzsv5Bjx0y@$sgtyo_$*~j)KZ?+PBdC@as~{yr#ha zf6aeC7Wuo+kNJh9{@3Ty|HtRS`{v{~-6;{SAf>rrK{EFn9JE_2nTyY=`g6T!iHGu0w3!D zKkGm0Gr&aaOkm(W6>nL{?4C}@9DH>oJ~o;c1A)tV5>t*%?f(A06HBe@b-7Y&(CGUA zt@+F1?C)rQiSySf_f_$NSbobs0<@vCN3@dOvr2bi^6$~RER1IygcGz*67=;GT)x~#TMU@hfIF_FF1 z%kIYD+`rVcnqe$3mljhJt!AS80Y=B(fT&Wa6$E;r@dms@)fAQvStE%$cQVy@^8^|_ z)tbp+v*~O6_tLVrZvNPqXmuu}=iaLFG&mdVlKknFRbA6AH!W~p5O^Pwna^;?-!fV% zozEQC%WeGR(AJ_M;j|Skj5!r#LPEZWoaZd6^PhH4_TybAD}!mqg$NR* z3To#vZq}FDA{`D2_gzXGOW#gQ#D&n&rz7c;+WR8jFyEY?M`1ZS55y2WvamgTJ!JFC z+E^RHUKD`dh4QcS_p!#w z&9V-R=ui$^57Y*A8gHX9*;v_xjDf34pnsaMtF(y^Jf)I<_P?CT-Q;-F@dm{Fc%w&O z=uOjTDPC4pBR=0)!DIl?iTkGD(42$`_)_b~+~I4_z#zUL0cX4-Z#wSLvW&SfM{!uC z#F$Kq208Kp)CWc_=I68Vm9(eDa}wpD$gt`zyr8ny-I8t@BPFo`O2|}2@W4Xzjj{T1 zPA9i4_LM|cLBU+!E#aFvp_516`qxqY#}j4?C&2Q;|7!XF66dc&oAKW(l`YTQe)HCP z5=u?MPfImDm4mTw1g*hWDQOMQ-Unwc;(dVjehyCSo6w4*=BgTLEFX4A`jp6G;c4@E z=rLyTWcnRz-exor)Z5{0%1`1usgcZ^wUhzwh+BIBB6krc_4Xyh*vJM>rUV7`{aL#P z(-2$ko!~7(N%t(6NkWZL!-|`=RXeF&N>#??d)~1&QP*_q%AB%I&Ry6+hB_CucP8sc z-!<=oMxP_8ZS5bjQ)Xsk$(jjhQ71MnX0>Vf5a7xpgi10p=J9hIn4UO9{Rpl*5y z`pY;f^SD5wdscLbh1rT0dIbbjOXE((QEGSC2YxH*y6=y%g50g}#im{DWsh@aM^ zV+tWh&rOX_0RpXY>~KYdJcpICbpQgS*Uq4wO4myEHF;+Np2x`s}KFEwDg8y4MNx{ABR1xsDy460$AjG8>q)TIs zrGOHycjad>cH{;t)uiqjBJv*JAN0ocKNw6xV{!*a<*kZE7ewt9OXfbBP^KJL?Itzt zaUr2XI%OLhCu(0=!aVsQfGANug%Dk3(Bnc1@K5)M6*t8(XW>&`%%Do^5E4WiQ+%}J zeTcA6(a#?fy-FhPJ_)(gx?8 zHvh}Q)PPf*5D$Y*uT>`|kT~{H+dCup!FVf#V&Z1rCk4La(*qx2L;=AZL1d^iI8WqOc=*4 zu%3Dg3GuGi{7aM97oN$OX%J$`>^Gxp_QeJh9W6VP$shMQqDtfu7f(Qf$+l6^Q=|52 zoMBNFYYrq!-R5T!PC{!*90hON3l8Wiv>=7uhT^0K9AmBBD@*j^k3QATqt~mAqc5*z za-7>oOZ3IQmh!$?xjEXncB3X`*VINSypSv6~TOA@j zWp8+$_PD$#XA=UQJ82sCVhHwtPzb~&^r|;T2r$bI@Li|`9f$RJskO^S5 z?X?{b)$Gn#i!X-ww4LGixNnlCB&l0GI!h|aZ0GT?kMiRA?~SzE11gJG`w)H9Qtw0R zH)`6~K1<>(C3+uqY*l^chrYkZPi82v@oW_%PA)(~n9bzl^5x|6pWk;0iW@Ak+}ygl zWJ>SX8}J|H_d7TRxx59hZNI%B&XIUNz3so*&Bj#u5dxb+d2CCQZ%%8U2G#-tIA_MB zdYxLyR{I%j3=;h^Q6r9DI=f_+DqDF!?)%vez3HXXZJgY-`&T}KNDU&J?sLhGD^)O3 z*Gk6BPqpX8T(Fk#ajm{@jTa9*3O1?k#uutSg6O1`nv|pgr!X)7@N&h4nQB^oH^jxT zU7g09a?{a8aeRH#H0ifu%T6kwdvhfU>}P9ta~MHtT15`p+QJLY*VPsQ+KRaAq6*_|3X@p;iywL-hxf=dz&?zqGz) zFJFfs5SpY`#oO)SHP1Iq<}2r0s?%A$w1W(M;lVZ_nHX<5l13xG!Yx1La}>u%9Y8Ls za=NNP;6*0gEAGNRikf>Zm)mlhE>G=$HcJ3e2^wnkmO>0jzH7 zyw^wB^My(FOl$f)ryVXMsY1*bK@Bzv$_-R$dbQzMO)lyT%{n zpT=&4Ufy8j`9ps{yL^^Gr)MWY1*`q)SToj01Z-Q~@G~piQo8_B{%a%^uD-=Ivs>i> za-2XJjDJ~Aq+lPP_=(%7@DvYz0m+=dC57W67|96-^H-8<>5s@2rktmH>_d#?ZOcSC z%I1_eCfd2~%^{|qu+k~Xl%yYuWh(Zrmlc8g*Oz?s38V9eqA zY~KQfYei3{DBvv+MS|w9^}gsIx5r{7`-p0wGw8Vjqa?vbj8T2qc%>r{jv$=2dtuaW-%vA%QVyo9KF3(c(I zj;R%1;;>(Qs1pvHVCbyG87{wlT7{4DU^BlVD$8R5HMapXbL_zfRpfw+5PZduO%u_d zv9a#;z_tEFzmpYMirZXnDUv2IAD|3QjBe;R&aa|1-)?pPJxn6WzSv!+IVOUE6ozP_ zom71$q?{(DbQ`bTCaS}kxMHnm-WcocMSdTBT!XbXy*+Os(pBA+Sx=Q=xZ6@k+K+yY zD+TXps<+o!ke6Xg9G8zhuBa9AXsso?eBW7$Y@vnD$uc6`R!Hzf2Q)v+tFjaOzQ+=O zI3H8{3H}n$r4z|8xd0zq{cHC6C7?A0ZEc*4ZJczK-0h4VwO?{56Urkm`Rk7F=<=1y zJB?ycIdqUo#7enhTGBk`QeT8`oE|-`iYh+AP?8Goppf6J@=TKWUuY!EphWCRAbinT zi6o9mHPD(2x){46Y>67m3-58t>k&bqK4qXox`roYqvn%Kr-GlY*d2um`!+KkQusMz z^5a$8I&~anC;=mX6>>C$Ys{Q*=*-iKv}$LIkVx#{jCJ6(OJ9nG_GLt_e~fcn8EweC zM7w1Q-g(P8cCWxY(J_y(>?%_%3Twg90wc&hvBm;1JZ3$so`zG7ZgvEpf*d&Xvyse{ zZMa-$);qC4!B}T;mC^CdgtVdwB34S}tnY9sqZqyN3T5zeCa^$g#yEpYa&+7iKfmfg z6<2mFV*8CF3x=nooU}B(5sKz`RDkp&)i>+ftRlR57^0Gs13kfxv|O&k+^M^X#T}nc z3_i74Su&)a-#xBI&8v6L#sV#R^G#!}4)rvko51${eMerrO^4vQ_SDDBqfA;Fs+pC} zxceM;a36HiZ78B$vk(2AGb6UlSI_)GW4z;bj0D?D+z+q6k@6Fo$I}3=EdlIk$w>l( zpaK2;{F&dMt@85xnSYl5{xq6DaejXu#mkde{w^=S%m6sQKbPfC=HIX4zeL#IMF9|v z0Omg*VDTrw?-zw%ZtVYE5&$~?N{#zuhbT6Y%$gIxoZG z--QA0Pr$$L>;8%I`wryGp`O2sAjvO{{ygIIC&KT$djFK8n)(&tKki%piSqlV*2@my z-z5t$^Zm9T_$TA<`ztSdUVoQ8+TZsE{@MZill}K?hL=s4ze_y*Z|r|<$^6OvdqVu5 z0X4?(8~3k_)yn|(C&2G{<(KU7-$j?{_e|F>1N<|K{wLPT(tl$8 zp8NbKmLki4#`-(c`zO}#nZ17oh&1c(SpUqO{)zH?mg=9TZe#ls$_1Hhf(BS+N<#ko^C&2IPoPPq$^8X3&A0_yG0{y2RZ32G+yz22E l#rS=!{-+puLjQ5 str | None: + """ + Normalise postcode for grouping. + + - Uppercase + - Remove all whitespace + """ + if pd.isna(postcode): + return None + + return postcode.upper().replace(" ", "") + + +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + +def main(): + df = pd.read_excel("hackney.xlsx") + + # Sanitise postcodes + df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) + + # --- validate AFTER grouping (save API calls) --- + + # Get unique, non-null postcodes + unique_postcodes = ( + df["postcode_clean"] + .dropna() + .unique() + ) + + # Validate each postcode once + postcode_validity = { + pc: is_valid_postcode(pc) + for pc in unique_postcodes + } + + # Map validity back onto dataframe + df["postcode_valid"] = df["postcode_clean"].map(postcode_validity) + + # Group only valid postcodes + grouped = ( + df[df["postcode_valid"]] + .groupby("postcode_clean") + ) + + # Example: count addresses per postcode + postcode_counts = grouped.size().sort_values(ascending=False) + + for pc in sorted(unique_postcodes): + pc_df = df[df["postcode_clean"] == pc] + pd_df + +if __name__ == "__main__": + main()