# Matches one "Construction details: Building part: ..." section: group 1 is
# the part's title line, group 2 is everything after the column-header line up
# to the next "Construction details" / "Data inputs" marker or end of text.
_BUILDING_PART_PATTERN = re.compile(
    r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party "
    r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)",
    re.DOTALL,
)

# Matches one floor row: the floor label followed by four numeric columns
# (floor area, room height, perimeter, party wall length).
# NOTE(review): only "Lowest floor" and "First floor" rows are recognised;
# rows with any other label are ignored -- confirm this is intended.
_FLOOR_PATTERN = re.compile(
    r"(Lowest floor|First floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
)


def _parse_building_part_floors(text):
    """Return one dict per floor row found in each building-part section of *text*."""
    floors = []
    for part_match in _BUILDING_PART_PATTERN.finditer(text):
        # Keep only the descriptor (e.g. "Main" or "1st Extension") by
        # dropping a trailing " - built in ..." suffix from the title.
        part_name = re.sub(r" - built in.*", "", part_match.group(1).strip())

        for floor_match in _FLOOR_PATTERN.finditer(part_match.group(2)):
            level, area, height, perimeter, party_wall = floor_match.groups()
            floors.append({
                "Building Part": part_name,
                "Floor Level": level,
                "Floor Area (m2)": float(area),
                "Room Height (m)": float(height),
                "Perimeter (m)": float(perimeter),
                "Party Wall Length (m)": float(party_wall),
            })
    return floors


def extract_building_parts_epr(text):
    """
    Aggregate building-part dimensions from the text of an energy report.

    The text is scanned for "Construction details: Building part: ..."
    sections; every recognised floor row in each section contributes its
    floor area, room height and perimeter to the totals below.

    Args:
        text: Report text to search (a string, not a file path).

    Returns:
        A dict with the keys:
        - "Total Floor Area (m2)": sum of the floor areas of all parsed floors.
        - "Total Ground Floor Area": sum of the floor areas of floors whose
          level contains "Lowest floor".
        - "RIR Floor Area": always 0 (not derived from the text).
        - "Main Building Wall Area (m2)": sum of perimeter * room height over
          floors whose building part name contains "Main".
        - "First Extension Wall Area (m2)": the same sum over floors whose
          building part name contains "1st Extension".
        Every total is 0 when nothing matching is found.

    Raises:
        ValueError: if a matched numeric column is not a valid float
            (e.g. "1.2.3").
    """
    floors = _parse_building_part_floors(text)

    def wall_area(part_label):
        # Gross wall area approximated as perimeter x room height per floor.
        return sum(
            floor["Perimeter (m)"] * floor["Room Height (m)"]
            for floor in floors
            if part_label in floor["Building Part"]
        )

    return {
        "Total Floor Area (m2)": sum(floor["Floor Area (m2)"] for floor in floors),
        "Total Ground Floor Area": sum(
            floor["Floor Area (m2)"]
            for floor in floors
            if "Lowest floor" in floor["Floor Level"]
        ),
        "RIR Floor Area": 0,
        "Main Building Wall Area (m2)": wall_area("Main"),
        "First Extension Wall Area (m2)": wall_area("1st Extension"),
    }