From d4ac6aee71df211e5c31238fc046a23991839faf Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 11 Feb 2026 11:50:02 +0000 Subject: [PATCH] mount home directory to devcontainer home directory --- .devcontainer/backend/devcontainer.json | 2 +- asset_list/AssetList.py | 2 +- asset_list/app.py | 82 ++++---------- backend/address2UPRN/main.py | 23 ++++ backend/postcode_splitter/main.py | 143 ++++++------------------ 5 files changed, 76 insertions(+), 176 deletions(-) diff --git a/.devcontainer/backend/devcontainer.json b/.devcontainer/backend/devcontainer.json index 5d728dcd..6e2edc93 100644 --- a/.devcontainer/backend/devcontainer.json +++ b/.devcontainer/backend/devcontainer.json @@ -6,7 +6,7 @@ "workspaceFolder": "/workspaces/model", "postStartCommand": "bash .devcontainer/backend/post-install.sh", "mounts": [ - "source=${localEnv:HOME},target=/workspaces/home,type=bind" + "source=${localEnv:HOME},target=/home/vscode,type=bind" ], "customizations": { "vscode": { diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py index ea4d8b34..36b3d58e 100644 --- a/asset_list/AssetList.py +++ b/asset_list/AssetList.py @@ -34,7 +34,7 @@ from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "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") +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") diff --git a/asset_list/app.py b/asset_list/app.py index 43c653a7..02557831 100644 --- a/asset_list/app.py +++ b/asset_list/app.py @@ -13,11 +13,15 @@ from asset_list.utils import get_data from dotenv import load_dotenv from backend.SearchEpc import SearchEpc -load_dotenv(dotenv_path="backend/.env") +load_dotenv(dotenv_path="../backend/.env") EPC_AUTH_TOKEN = os.getenv( "EPC_AUTH_TOKEN", ) +OPENAI_API_KEY = os.getenv( + "OPENAI_API_KEY", +) + def extract_address1( asset_list, full_address_col, postcode_col, method="first_two_words" @@ -69,72 +73,24 @@ def app(): Property UPRN """ -<<<<<<< HEAD - data_folder = "/workspaces/model/asset_list/" - data_filename = "manchester.xlsx" - sheet_name = "PW0099 - Property List" - postcode_column = "post Code" - address1_column = "address" - address1_method = None - fulladdress_column = None - address_cols_to_concat = ["address"] -======= - data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Aspire" - data_filename = "ASPIRE ASSET LIST.xlsx" - sheet_name = "Asset List" - postcode_column = "Postcode" + data_folder = "/workspaces/model/asset_list" + data_filename = "assets.xlsx" + sheet_name = "Sheet1" + postcode_column = "POSTCODE" address1_column = None address1_method = "house_number_extraction" - fulladdress_column = "Address" + fulladdress_column = "ADDRESS" address_cols_to_concat = [] missing_postcodes_method = None landlord_year_built = None landlord_os_uprn = None - landlord_property_type = "Property Type" - landlord_built_form = None - landlord_wall_construction = None - landlord_roof_construction = None - landlord_heating_system = None + landlord_property_type = "PROPERTY TYPE" + landlord_built_form = None # Skipped as empty + landlord_wall_construction = "wall combined" # combin F + G + landlord_roof_construction = "HEATING SYSTEM" # Combine I + J + landlord_heating_system = None # Check with Khalim landlord_existing_pv = None - landlord_property_id = "LLUPRN" - landlord_sap = None - outcomes_filename = None - outcomes_sheetname = None - outcomes_postcode = None - outcomes_houseno = None - outcomes_id = None - outcomes_address = None - master_filepaths = [] - master_id_colnames = [] - master_to_asset_list_filepath = None - phase = False - ecosurv_landlords = None - asset_list_header = 0 - landlord_block_reference = None - - # Peabody data for cleaning - data_folder = ( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting " - "Project/data_validation" - ) - data_filename = "to_standardise_uprns.xlsx" - sheet_name = "Sheet1" - postcode_column = "Postcode" - address1_column = None - address1_method = "house_number_extraction" - fulladdress_column = "Address" - address_cols_to_concat = None ->>>>>>> d4064da36565f87c2b72d10e9f3604cc6c37bdb6 - missing_postcodes_method = None - landlord_year_built = None - landlord_os_uprn = None - landlord_property_type = None - landlord_built_form = None - landlord_wall_construction = None - landlord_roof_construction = None - landlord_heating_system = None - landlord_existing_pv = None - landlord_property_id = "UHTprop Ref" + landlord_property_id = "UPRN" landlord_sap = None outcomes_filename = None outcomes_sheetname = None @@ -286,7 +242,7 @@ def app(): if skip is not None and not force_retrieve_data: if i <= skip: continue - chunk = asset_list.standardised_asset_list[i: i + chunk_size] + chunk = asset_list.standardised_asset_list[i : i + chunk_size] epc_data_chunk, errors_chunk, no_epc_chunk = get_data( df=chunk, row_id_name=asset_list.DOMNA_PROPERTY_ID, @@ -429,7 +385,7 @@ def app(): # Retrieve just the data we need epc_df = epc_df[ [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) - ].rename(columns=asset_list.EPC_API_DATA_NAMES) + ].rename(columns=asset_list.EPC_API_DATA_NAMES) # Look for columns not in the find my EPC data, which will have happened if we didn't # retrieve it in the first place @@ -446,7 +402,7 @@ def app(): find_my_epc_data[ [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) - ].rename(columns=asset_list.FIND_EPC_DATA_NAMES), + ].rename(columns=asset_list.FIND_EPC_DATA_NAMES), how="left", on=asset_list.DOMNA_PROPERTY_ID, ) diff --git a/backend/address2UPRN/main.py b/backend/address2UPRN/main.py index 2cc604cb..fb812d67 100644 --- a/backend/address2UPRN/main.py +++ b/backend/address2UPRN/main.py @@ -20,6 +20,29 @@ if EPC_AUTH_TOKEN is None: raise RuntimeError("EPC_AUTH_TOKEN not defined in env") +def is_valid_postcode(postcode_clean: str) -> bool: + """ + Validate postcode using postcodes.io. + + Expects a sanitised postcode (e.g. E84SQ). + Returns True if valid, False otherwise. + """ + POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" + if not postcode_clean: + return False + + try: + resp = requests.get( + POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), + timeout=5, + ) + resp.raise_for_status() + return resp.json().get("result", False) + except requests.RequestException: + # Network issues, rate limits, etc. + return False + + def levenshtein(a: str, b: str) -> float: """ Address similarity score in [0, 1]. diff --git a/backend/postcode_splitter/main.py b/backend/postcode_splitter/main.py index 06a9d1a3..0f21a67f 100644 --- a/backend/postcode_splitter/main.py +++ b/backend/postcode_splitter/main.py @@ -78,112 +78,14 @@ def sanitise_postcode(postcode: str) -> str | None: return postcode.upper().replace(" ", "") -def is_valid_postcode(postcode_clean: str) -> bool: - """ - Validate postcode using postcodes.io. - - Expects a sanitised postcode (e.g. E84SQ). - Returns True if valid, False otherwise. - """ - POSTCODES_IO_VALIDATE_URL = "https://api.postcodes.io/postcodes/{postcode}/validate" - if not postcode_clean: - return False - - try: - resp = requests.get( - POSTCODES_IO_VALIDATE_URL.format(postcode=postcode_clean), - timeout=5, - ) - resp.raise_for_status() - return resp.json().get("result", False) - except requests.RequestException: - # Network issues, rate limits, etc. - return False - - -def main(): - df = pd.read_excel("hackney.xlsx", sheet_name="Sustainability") - df = df.head(500) - - # Sanitise postcodes - df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) - - # --- validate AFTER grouping (save API calls) --- - - # Get unique, non-null postcodes - unique_postcodes = df["postcode_clean"].dropna().unique() - - # Validate each postcode once, TODOadd a progress bar - postcode_validity = { - pc: is_valid_postcode(pc) - for pc in tqdm(unique_postcodes, total=len(unique_postcodes)) - } - - # Map validity back onto dataframe - df["postcode_valid"] = df["postcode_clean"].map(postcode_validity) - - results = [] - - for postcode, group_df in tqdm( - df[df["postcode_valid"]].groupby("postcode_clean"), - desc="Resolving UPRNs by postcode", - ): - try: - epc_df = get_epc_data_with_postcode(postcode) - - if epc_df.empty: - tmp = group_df.copy() - tmp["found_uprn"] = None - tmp["status"] = "no_epc_results" - results.append(tmp) - continue - - resolved = resolve_uprns_for_postcode_group( - group_df=group_df, - epc_df=epc_df, - ) - - results.append(resolved) - - except Exception as e: - tmp = group_df.copy() - tmp["found_uprn"] = None - tmp["status"] = "exception" - tmp["error"] = str(e) - results.append(tmp) - - final_df = pd.concat(results, ignore_index=True) - a = final_df[ - [ - "best_match_lexiscore", - "Address 1", - "best_match_address", - "Postcode", - "UPRN", - "best_match_uprn", - ] - ] # add levi score to viewing - b = final_df[final_df["best_match_lexiscore"] > 0] # add levi score to viewing - b = b[ - [ - "best_match_lexiscore", - "Address 1", - "best_match_address", - "Postcode", - "UPRN", - "best_match_uprn", - ] - ] - - -def handler(event, context): +def handler(event, context, local=False): print(f"Function: {context.function_name}") print(f"Request ID: {context.aws_request_id}") # Example SQS message for testing (copy and paste into SQS): # { - # "task_id": "e31f2f21-175b-4a91-a3ec-a6baa325e917", - # "s3_uri": "https://337213553626-7ovirzjr.eu-west-2.console.aws.amazon.com/s3/object/retrofit-data-dev?region=eu-west-2&prefix=ara_raw_inputs/peabody/2025_11_11+-+Peabody+-+Data+Extracts+for+Domna_transformed.csv", + # "task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917", + # "s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv" # } # Handle both single event and batch events (SQS, etc.) @@ -196,7 +98,13 @@ def handler(event, context): task_id = None subtask_id = None try: - # Parse body + # For local development + if local is True: + record = {} + record["body"] = ( + '{"task_id":"e31f2f21-175b-4a91-a3ec-a6baa325e917","s3_uri":"s3://retrofit-data-dev/ara_raw_inputs/peabody/2025_11_11 - Peabody - Data Extracts for Domna_transformed.csv"}' + ) + # Parse body (inputs) if isinstance(record.get("body"), str): body = json.loads(record["body"]) else: @@ -236,17 +144,33 @@ def handler(event, context): df = pd.DataFrame(csv_data) logger.info(f"CSV loaded: {len(df)} rows, {len(df.columns)} columns") - # Get head for demo - df_head = df.head() - logger.info("DataFrame head:") - logger.info(f"\n{df_head}") + # Sanitise postcodes + df["postcode_clean"] = df["Postcode"].apply(sanitise_postcode) + + # Group by sanitised postcode (excluding null values) + grouped_data = [] + for postcode, group_df in df.dropna(subset=["postcode_clean"]).groupby( + "postcode_clean" + ): + group_info = { + "postcode": postcode, + "row_count": len(group_df), + "rows": group_df.to_dict(orient="records"), + } + grouped_data.append(group_info) + logger.info(f"Postcode: {postcode}, Rows: {len(group_df)}") + + logger.info(f"Total postcodes: {len(grouped_data)}") results.append( { - "message": "Postcode splitter processing started", + "message": "Postcode splitter processing completed", "task_id": str(task_id), "s3_uri": s3_uri, "subtask_id": str(subtask_id), + "total_rows": len(df), + "total_postcodes": len(grouped_data), + "grouped_data": grouped_data, } ) @@ -258,6 +182,7 @@ def handler(event, context): "status": "processing_complete", "s3_uri": s3_uri, "rows_processed": len(df), + "total_postcodes": len(grouped_data), }, ) logger.info(f"Subtask {subtask_id} marked as complete") @@ -295,7 +220,3 @@ def handler(event, context): {"processed": results, "errors": errors if errors else None} ), } - - -if __name__ == "__main__": - main()