diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 3688ca19..629c10e0 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -170,8 +170,8 @@ def handler(): epr_to_insert = { "Postcode": extracted_contents["elmhurst epr"]["Postcode"], - "City/County": None, - "District/Town": None, + "City/County": extracted_contents["elmhurst epr"]["County"], + "District/Town": extracted_contents["elmhurst epr"]["Town"], "Local Authority": None, 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], 'Pre Heat Transfer': pre_heat_transfer, @@ -207,6 +207,35 @@ def handler(): cr_to_insert ) + if extracted_contents.get("elmhurst summary report"): + total_floor_area = sum( + [x["Floor Area (m2)"] for x in extracted_contents["elmhurst summary report"]["Building Parts"]] + + # Get the conservatory floor area + [extracted_contents["elmhurst summary report"]["Conservatory"]["Conservatory Floor Area"]] + ) + + pre_heat_transfer = ( + extracted_contents["elmhurst summary report"]["Primary Energy Use Intensity (kWh/m2/yr)"] + ) + pre_heat_demand = None # Don't have this + + summary_to_insert = { + "Postcode": extracted_contents["elmhurst summary report"]["Postcode"], + "City/County": extracted_contents["elmhurst summary report"]["County"], + "District/Town": extracted_contents["elmhurst summary report"]["Town"], + 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst summary report"]["Current SAP Rating"], + 'Pre Heat Transfer': pre_heat_transfer, + 'Pre Total Floor Area': total_floor_area, + 'Pre Heat Demand': pre_heat_demand, + "R. 
Assessor - Name": extracted_contents["elmhurst summary report"]["Assessor Name"], + "Retrofit Assessment Date": extracted_contents["elmhurst summary report"]["Assessment Date"], + } + + update_dictionary_with_check( + output_row_data, + summary_to_insert + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index f5e014a4..c3cc8a10 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -398,6 +398,15 @@ class ElmhurstEprExtractor: data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() + # TODO: + data["Region"] = None + data["House Name"] = None + data["House No"] = None + data["Street"] = None + data["Locality"] = None + data["Town"] = None + data["County"] = None + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) if not sap_match: logger.error("Failed to extract SAP rating.") @@ -657,26 +666,7 @@ class ElmhurstSummaryReportExtractor: } ) - # Calculate aggregated dimensions - main_property = [part for part in data if "Main Property" in part["Building Part"]] - first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] - dimensions = { - "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area (m2)": sum( - [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] - ), - "RIR Floor Area": sum( - [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] - ), - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if - x["Perimeter (m)"] and x["Room Height (m)"]]), - "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if - x["Perimeter (m)"] and x["Room Height (m)"]] - ), - } - - return dimensions + return data @staticmethod def 
extract_roof_details(text): @@ -869,7 +859,6 @@ class ElmhurstSummaryReportExtractor: """ data = {} - with (open(self.file_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" @@ -885,29 +874,51 @@ class ElmhurstSummaryReportExtractor: # Address and postcode postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + postcode = postcode.group(1).strip() if postcode else "" + region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + region = region.group(1).strip() if region else "" + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_name = house_name.group(1).strip() if house_name else "" + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + house_no = house_no.group(1).strip() if house_no else "" + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + street = street.group(1).strip() if street else "" + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + locality = locality.group(1).strip() if locality else "" + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + town = town.group(1).strip() if town else "" + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + county = county.group(1).strip() if county else "" # Clean extracted values and remove any prefixes address_parts = [ - house_no.group(1).strip() if house_no else "", - house_name.group(1).strip() if house_name else "", - street.group(1).strip() if street else "", - locality.group(1).strip() if locality else "", - town.group(1).strip() if town else "", - county.group(1).strip() if county else "", - region.group(1).strip() if region else "", - postcode.group(1).strip() if postcode else "" + house_no, + house_name, + street, + locality, + town, + county, + region, + postcode ] # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) - data["Postcode"] = postcode.group(1).strip() + data["Postcode"] = postcode  # postcode is already a stripped str here, not a re.Match + data["Region"] = region + data["House Name"] = house_name + data["House No"] = house_no + data["Street"] = street + 
data["Locality"] = locality + data["Town"] = town + data["County"] = county # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)