From 8983ebec2fd9ea593f19990f5c02847da4adbc45 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 30 Oct 2024 10:03:10 +0000 Subject: [PATCH] adding epc band --- .../stonewater/Wave 3 Preparation.py | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index fe1faa9d..2654fae5 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -11,6 +11,32 @@ SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") NUM_FOLDERS = 14 +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. + :param sap_points: numerical value of SAP points, typically between 0 and 100 + :return: + """ + + if sap_points <= 0: + raise ValueError("SAP points should be above 0.") + + if sap_points >= 92: + return "A" + elif sap_points >= 81: + return "B" + elif sap_points >= 69: + return "C" + elif sap_points >= 55: + return "D" + elif sap_points >= 39: + return "E" + elif sap_points >= 21: + return "F" + else: + return "G" + + def extract_summary_report(pdf_path): """ Extracts specific data from the provided PDF file. @@ -23,6 +49,7 @@ def extract_summary_report(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Current EPC Band": None, "Fuel Bill": None, "Number of Storeys": None, "Window Age Description": None, @@ -57,7 +84,7 @@ def extract_summary_report(pdf_path): # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) - data["Current SAP Rating"] = sap_match.group(1) + data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -367,6 +394,7 @@ def extract_epr(pdf_path): "Address": None, "Postcode": None, "Current SAP Rating": None, + "Current EPC Band": None, "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, @@ -621,6 +649,9 @@ def main(): folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] survey_folders.extend(folder_contents) # Append contents to the master list + # Get rid of .DS_Store files + survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")] + extracted_data = [] for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) @@ -643,6 +674,16 @@ def main(): retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) else: retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + if os.listdir(retrofit_folder_path): # If not empty summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) if summary_data: @@ -673,14 +714,24 @@ def main(): extracted_data = pd.DataFrame(extracted_data) - # What was missed??? - extracted_data["Primary Energy Use (kWh/yr)"] = ( extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"] ) + extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int) + extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc) + # TODO: Clean up SAP and extract EPC # TODO: RIR floor area!!! + # Remove some definite duplicates + extracted_data = extracted_data[ + ~extracted_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + ] + ) + ] + # We now merge on the coordinator data so that against each property, we can map the measures retrofit_packages_board = pd.read_excel( os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater_SHDF_3_0_Board_work_in_progress_- 22.10.24.xlsx"), @@ -715,9 +766,11 @@ def main(): filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] # We have an edge case wher some properties have two outputs in Sharepoint if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + bl1h2 filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + blah1 filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] if filtered.empty: