diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index ede644b8..a395508c 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -43,5 +43,4 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue - extracted_contents[report_type] = file_extractor(filepath).extract() diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index cdd25f8a..15b183dc 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -69,6 +69,10 @@ def is_pdf(filename): class ElmhurstEprExtractor: + """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ + def __init__(self, file_path): self.file_path = file_path @@ -219,7 +223,30 @@ class ElmhurstEprExtractor: return wall_data @staticmethod - def extract_primary_heating(text): + def _extract_heating_details(section_text): + """ + Extracts heating details from a given section of text. + + Args: + section_text (str): The section of text containing heating details. + + Returns: + dict: A dictionary containing heating system details. 
+ """ + + system_search = re.search(r"Main Heating Code\s*(.*?)\n", section_text) + pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text) + controls_search = re.search(r"Main Heating Controls\s*(.*?)\n", section_text) + heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text) + + return { + "System": system_search.group(1).strip() if system_search else "", + "PCDF Reference": pcdf_search.group(1) if pcdf_search else "", + "Controls": controls_search.group(1).strip() if controls_search else "", + "% of Heat": int(heat_search.group(1)) if heat_search else 0, + } + + def extract_primary_heating(self, text): # Extract Primary Heating Section (Main Heating 1) primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) @@ -228,61 +255,46 @@ class ElmhurstEprExtractor: primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 primary_text = primary_heating_section.group(1) - primary_heating_output = { - "Existing Primary Heating System": re.search( - r"Main Heating Code\s*(.*?)\n", primary_text - ).group(1).strip(), - "Existing Primary Heating PCDF Reference": re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1), - "Existing Primary Heating Controls": re.search( - r"Main Heating Controls\s*(.*?)\n", primary_text - ).group(1).strip(), - "Existing Primary Heating % of Heat": int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) - ) - } + return self._extract_heating_details(primary_text) - return primary_heating_output - - @staticmethod - def extract_secondary_heating(text): + def extract_secondary_heating(self, text): # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + output = {} if secondary_heating_section is None: - output["Existing Heating System"] = "" - output["Existing Heating PCDF 
Reference"] = "" - output["Existing Heating Controls"] = "" - output["Existing Heating % of Heat"] = 0 + + output["System"] = "" + output["PCDF Reference"] = "" + output["Controls"] = "" + output["% of Heat"] = 0 else: secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + output.update( + **self._extract_heating_details(secondary_text) ) - output["Existing Heating System"] = main_heating_code_match_secondary.group(1).strip() - output["Existing Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", secondary_text - ).group(1) - - if output["Existing Heating System"] == "": - output["Existing Heating Controls"] = "" - else: - # Might not have heating controls on 2nd system - secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - output["Existing Heating Controls"] = ( - secondary_controls_match.group(1).strip() if secondary_controls_match else "" - ) - output["Existing Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) - ) + output["Heating Code"] = ( + re.search(r"Secondary Heating Code\s*(.*?)\n", text).group(1).strip() + if output["System"] and re.search(r"Secondary Heating Code\s*(.*?)\n", text) + else "" + ) return output def extract(self): + """ + Extracts all relevant data from the EPR PDF. 
+ + Returns: + dict: A dictionary containing extracted data, including: + - Address and Postcode + - SAP Rating and Primary Energy Use + - Lighting, Doors, Windows, Roof, and Wall Details + - Heating systems (Primary and Secondary) + - Building Parts + """ data = {} with open(self.file_path, "rb") as file: @@ -291,36 +303,56 @@ class ElmhurstEprExtractor: # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + if not address_match: + raise ValueError("Failed to extract address.") data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) + if not sap_match: + raise ValueError("Failed to extract SAP rating.") data["Current SAP Rating"] = int(sap_match.group(1)) energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + if not energy_match: + raise ValueError("Failed to extract primary energy use.") data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + raise ValueError("Failed to extract number of storeys.") data["Number of Storeys"] = int(storeys_match.group(1)) fuel_match = re.search(r"TOTAL\s*£(\d+)", text) + if not fuel_match: + raise ValueError("Failed to extract fuel bill.") data["Fuel Bill"] = f"£{fuel_match.group(1)}" total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + if not total_doors_match: + raise ValueError("Failed to extract total doors.") data["Total Number of Doors"] = int(total_doors_match.group(1)) # Extract Number of Insulated Doors insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + if not insulated_doors_match: + raise ValueError("Failed to extract insulated doors.") data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) # Get number of lighting outlets and number of 
fittings needing LEL lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) + if not lighting_fittings_match: + raise ValueError("Failed to extract lighting") data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) + if not lel_fittings_match: + raise ValueError("Failed to extract LEL fittings.") data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + if not windows_section: + raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) data["Primary Heating"] = self.extract_primary_heating(text) @@ -329,15 +361,9 @@ class ElmhurstEprExtractor: data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - - if data["Secondary Heating"]["Existing Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" - + if not water_heating_code_match: + raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data