From a9ea89d2ae5253453e227c83c067f8a248d3f893 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 31 Oct 2024 12:03:17 +0000 Subject: [PATCH] done with stonewater for now --- .../stonewater/Wave 3 Preparation.py | 144 ++++++++++++++++-- 1 file changed, 133 insertions(+), 11 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bfdc8beb..477a73c8 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -76,10 +76,13 @@ def extract_summary_report(pdf_path): 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, - "Number of fittings needing LEL": None + "Number of fittings needing LEL": None, + "Main Roof Type": None, + "Main Roof Insulation": None, + "Main Roof Insulation Thickness": None, } - with open(pdf_path, "rb") as file: + with (open(pdf_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" for page in reader.pages: @@ -205,6 +208,27 @@ def extract_summary_report(pdf_path): data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + roof_section = re.search(r"8\.0 Roofs:\n(.*?)\n9\.0 Floors:", text, re.DOTALL) + roof_text = roof_section.group(1).strip() + roof_type_match = re.search(r"Type\s*([A-Za-z0-9\s]+)", roof_text) + data["Main Roof Type"] = roof_type_match.group(1).strip() if roof_type_match else None + + # Check if "Insulation" exists between Type and Insulation Thickness + insulation_search = re.search( + r"Type\s+.*?\n(Insulation\s+(.*?)\n)?(Insulation Thickness\s+(.*?)\n)", roof_text, re.DOTALL + ) + + if insulation_search: + # Insulation match will be present if it exists, otherwise it will be None + insulation_match = insulation_search.group(2) # Optional group for Insulation + insulation_thickness_match = insulation_search.group(4) # Required group for Insulation Thickness + + # Populate insulation fields + data["Main Roof Insulation"] = insulation_match.strip() if insulation_match else None + data["Main Roof Insulation Thickness"] = ( + insulation_thickness_match.strip() if insulation_thickness_match else None + ) + return data @@ -434,6 +458,49 @@ def extract_building_parts_summary(text): return dimensions +import re + + +def extract_roof_details_epr(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the provided EPR PDF text. + """ + # Define data structure to hold results + roof_data = [] + + # Locate each building part section + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + + # Extract each building part's data, including roof details + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + + # Clean up the building part name + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + + part_details = match.group(2) + + # Extract Roof Type, Roof Insulation, and Roof Insulation Thickness + roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) + + # Store results for this building part + roof_data.append({ + "Building Part": cleaned_part_name, + "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, + "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, + "Roof Insulation Thickness": roof_insulation_thickness_match.group( + 1).strip() if roof_insulation_thickness_match else None, + }) + + return roof_data + + def extract_epr(pdf_path): """ Extracts specific data from an Energy Report (EPR) PDF file. @@ -471,7 +538,10 @@ def extract_epr(pdf_path): 'First Extension Wall Area (m2)': None, "Number of Light Fittings": None, "Number of LEL Fittings": None, - "Number of fittings needing LEL": None + "Number of fittings needing LEL": None, + "Main Roof Type": None, + "Main Roof Insulation": None, + "Main Roof Insulation Thickness": None, } with open(pdf_path, "rb") as file: @@ -590,6 +660,13 @@ def extract_epr(pdf_path): data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + roof_details = extract_roof_details_epr(text) + # Get from the main building + main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]] + data["Main Roof Type"] = main_roof_details[0]["Roof Type"] + data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"] + return data @@ -1077,13 +1154,11 @@ def main(): # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False) - stonewater_data["Room in Roof"].value_counts() - # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values create_proposed_wave_3_bid( costed_packages_filepath=os.path.join( - CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP).xlsx" + CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) V2.xlsx" ), archetypes_sheet_filepath=os.path.join( CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx" @@ -1098,11 +1173,30 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa archetypes_to_cost = costed_packages[ [ "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band", - "Modelled SAP Rating", 'Total Cost of Measures', 'Contingency Cost', - 'Total Cost of Measures inc Contingency' + "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost', + 'Total Cost of Measures inc Contingency', 'Main Roof Type', 'Main Roof Insulation', + 'Main Roof Insulation Thickness', 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference' ] ].copy() + # Combine 'Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', separating by colons! + archetypes_to_cost['Surveyed Main Roof'] = ( + archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' + + archetypes_to_cost['Main Roof Insulation Thickness'].astype(str) + ) + + # Combine the heating systems, separating by colons! + archetypes_to_cost['Surveyed Main Heating'] = ( + archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[ + 'Existing Primary Heating PCDF Reference'].astype(str) + ) + + archetypes_to_cost = archetypes_to_cost.drop( + columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference']) + # We take properties that are EPC D and below (61% of units) archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] @@ -1139,7 +1233,19 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa match_classification = [] for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): - surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]] + + surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy() + surveyed["Package Ref"] = surveyed["Package Ref"].astype(str) + + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + + surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + # We now check if we have a perfect match surveyed = surveyed[ (surveyed["Property Type"] == home["Property Type"]) & @@ -1149,17 +1255,33 @@ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepa ] if surveyed.empty: + if package == "2B2A": + raise Exception("Fix me") match_classification.append( { "Address ID": home["Address ID"], - "Match to Surveyed": "Approximate" + "Match to Surveyed": "Approximate", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating } ) continue + # Re-do + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + match_classification.append( { "Address ID": home["Address ID"], - "Match to Surveyed": "Exact" + "Match to Surveyed": "Exact", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating } )