def extract_building_parts_summary(text):
    """
    Extracts building parts and associated dimensions from the summary report PDF text.

    The "Dimensions" section (between "Dimension type: Internal" and
    "5.0 Conservatory:") is split into building parts ("Main Property",
    "1st Extension", "2nd Extension", ...). Within each part, every floor row of
    the form "<floor level>: <floor area> <room height> <perimeter> <party wall length>"
    is parsed and the rows are aggregated.

    Args:
        text: Full text extracted from the summary report PDF.

    Returns:
        A dict with the keys:
            - "Total Floor Area (m2)": sum of floor areas across all parts and floors.
            - "Total Ground Floor Area (m2)": sum of floor areas of "Lowest Floor" rows.
            - "RIR Floor Area": always 0 (not extracted from the summary report).
            - "Main Building Wall Area (m2)": sum of perimeter * room height over
              the Main Property floors.
            - "First Extension Wall Area (m2)": sum of perimeter * room height over
              the 1st Extension floors.

    Raises:
        ValueError: If the dimensions section cannot be located in the text.
    """
    # Locate the Dimensions section. The captured group stops *before* the
    # "5.0 Conservatory:" heading, so that heading is not part of dimensions_text.
    dimensions_section = re.search(
        r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
    )
    if not dimensions_section:
        raise ValueError("Failed to locate dimensions section in the text.")

    dimensions_text = dimensions_section.group(1)

    # Each building part runs from its heading up to the next extension heading.
    # The final part has no following heading inside dimensions_text, so the end
    # of the text (\Z) must also terminate a part; otherwise the last part
    # (or the only part, when there are no extensions) would never match.
    building_part_pattern = re.compile(
        r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|\Z)",
        re.DOTALL,
    )

    # Floor row: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length.
    # Any ordinal floor is accepted so storeys above the first are not dropped.
    floor_pattern = re.compile(
        r"(\d+(?:st|nd|rd|th) Floor|Lowest Floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )

    data = []
    for part_match in building_part_pattern.finditer(dimensions_text):
        part_name = part_match.group(1)
        floor_data = part_match.group(2)

        for floor_match in floor_pattern.finditer(floor_data):
            data.append({
                "Building Part": part_name,
                "Floor Level": floor_match.group(1),
                "Floor Area (m2)": float(floor_match.group(2)),
                "Room Height (m)": float(floor_match.group(3)),
                "Perimeter (m)": float(floor_match.group(4)),
                "Party Wall Length (m)": float(floor_match.group(5)),
            })

    # Exact comparisons: a substring test for "1st Extension" would also pick up
    # e.g. "21st Extension".
    main_property = [part for part in data if part["Building Part"] == "Main Property"]
    first_extensions = [part for part in data if part["Building Part"] == "1st Extension"]

    dimensions = {
        "Total Floor Area (m2)": sum(part["Floor Area (m2)"] for part in data),
        "Total Ground Floor Area (m2)": sum(
            part["Floor Area (m2)"] for part in data if part["Floor Level"] == "Lowest Floor"
        ),
        "RIR Floor Area": 0,
        "Main Building Wall Area (m2)": sum(
            part["Perimeter (m)"] * part["Room Height (m)"] for part in main_property
        ),
        "First Extension Wall Area (m2)": sum(
            part["Perimeter (m)"] * part["Room Height (m)"] for part in first_extensions
        ),
    }

    return dimensions