From dbee05e555d758d464efe2a43c18d6c3b017cef8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 29 Oct 2024 18:37:47 +0000 Subject: [PATCH] working on matching lookup --- .../stonewater/Wave 3 Preparation.py | 48 ++++++++++++++++++- .../requirements/requirements-wave-3-prep.txt | 1 + 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 7bedef29..d90360aa 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -5,7 +5,8 @@ import pandas as pd from tqdm import tqdm from collections import Counter -FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys" +CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" +FILE_PATH = os.path.join(CUSTOMER_FOLDER_PATH, "Wave 3 Surveys") def extract_summary_report(pdf_path): @@ -653,6 +654,51 @@ def main(): extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + # We now merge on the coordinator data so that against each property, we can map the measures + retrofit_packages_board = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in retrofit_packages_board.iterrows(): + filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()] + if filtered.empty: + print("Check this once we have full data") + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Osm. ID": home["Osm. 
ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case where some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if filtered.empty: + raise Exception("somethign went wrong") + if filtered.shape[0] != 1: + raise Exception("somethign went wrong2") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Osm. ID": home["Osm. ID"], + "Name": home["Name"] + } + ) + + matching_lookup = pd.DataFrame(matching_lookup) + # Save this as a csv # extracted_data.to_csv("Wave 3 Summary Data - first 200 files.csv", index=False) diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt index 70bec3cc..97314b32 100644 --- a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt +++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt @@ -1,3 +1,4 @@ PyPDF2 pandas tqdm +openpyxl