mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
refactoring epr extraction
This commit is contained in:
parent
965cf975e2
commit
0efd0163ee
2 changed files with 76 additions and 51 deletions
|
|
@ -43,5 +43,4 @@ def handler():
|
|||
file_extractor = extractors.get(report_type)
|
||||
if file_extractor is None:
|
||||
continue
|
||||
|
||||
extracted_contents[report_type] = file_extractor(filepath).extract()
|
||||
|
|
|
|||
|
|
@ -69,6 +69,10 @@ def is_pdf(filename):
|
|||
|
||||
|
||||
class ElmhurstEprExtractor:
|
||||
"""
|
||||
A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR).
|
||||
"""
|
||||
|
||||
def __init__(self, file_path):
|
||||
self.file_path = file_path
|
||||
|
||||
|
|
@ -219,7 +223,30 @@ class ElmhurstEprExtractor:
|
|||
return wall_data
|
||||
|
||||
@staticmethod
|
||||
def extract_primary_heating(text):
|
||||
def _extract_heating_details(section_text):
    """
    Extracts heating details from a given section of text.

    Each field is searched for independently. A field whose label is not
    present falls back to an empty string (or 0 for the percentage) instead
    of raising.

    Args:
        section_text (str): The section of text containing heating details.

    Returns:
        dict: A dictionary containing heating system details, with the keys
            "System", "PCDF Reference" and "Controls" (strings) and
            "% of Heat" (int).
    """

    # `.` never matches a newline, so `(.*)` captures the remainder of the
    # line. Unlike a mandatory trailing "\n", this also matches when the
    # label sits on the last line of the section with no newline after it.
    system_search = re.search(r"Main Heating Code\s*(.*)", section_text)
    pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text)
    controls_search = re.search(r"Main Heating Controls\s*(.*)", section_text)
    heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text)

    return {
        "System": system_search.group(1).strip() if system_search else "",
        "PCDF Reference": pcdf_search.group(1) if pcdf_search else "",
        "Controls": controls_search.group(1).strip() if controls_search else "",
        "% of Heat": int(heat_search.group(1)) if heat_search else 0,
    }
|
||||
|
||||
def extract_primary_heating(self, text):
|
||||
|
||||
# Extract Primary Heating Section (Main Heating 1)
|
||||
primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
|
||||
|
|
@ -228,61 +255,46 @@ class ElmhurstEprExtractor:
|
|||
primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
|
||||
primary_text = primary_heating_section.group(1)
|
||||
|
||||
primary_heating_output = {
|
||||
"Existing Primary Heating System": re.search(
|
||||
r"Main Heating Code\s*(.*?)\n", primary_text
|
||||
).group(1).strip(),
|
||||
"Existing Primary Heating PCDF Reference": re.search(
|
||||
r"PCDF boiler Reference\s*(\d+)", primary_text
|
||||
).group(1),
|
||||
"Existing Primary Heating Controls": re.search(
|
||||
r"Main Heating Controls\s*(.*?)\n", primary_text
|
||||
).group(1).strip(),
|
||||
"Existing Primary Heating % of Heat": int(
|
||||
re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
|
||||
)
|
||||
}
|
||||
return self._extract_heating_details(primary_text)
|
||||
|
||||
return primary_heating_output
|
||||
|
||||
@staticmethod
|
||||
def extract_secondary_heating(text):
|
||||
def extract_secondary_heating(self, text):
|
||||
# Extract Secondary Heating Section (Main Heating 2)
|
||||
secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
|
||||
|
||||
output = {}
|
||||
if secondary_heating_section is None:
|
||||
output["Existing Heating System"] = ""
|
||||
output["Existing Heating PCDF Reference"] = ""
|
||||
output["Existing Heating Controls"] = ""
|
||||
output["Existing Heating % of Heat"] = 0
|
||||
|
||||
output["System"] = ""
|
||||
output[" PCDF Reference"] = ""
|
||||
output["Controls"] = ""
|
||||
output["% of Heat"] = 0
|
||||
|
||||
else:
|
||||
secondary_text = secondary_heating_section.group(1)
|
||||
|
||||
main_heating_code_match_secondary = re.search(
|
||||
r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
|
||||
output.update(
|
||||
**self._extract_heating_details(secondary_text)
|
||||
)
|
||||
output["Existing Heating System"] = main_heating_code_match_secondary.group(1).strip()
|
||||
|
||||
output["Existing Heating PCDF Reference"] = re.search(
|
||||
r"PCDF boiler Reference\s*(\d+)", secondary_text
|
||||
).group(1)
|
||||
|
||||
if output["Existing Heating System"] == "":
|
||||
output["Existing Heating Controls"] = ""
|
||||
else:
|
||||
# Might not have heating controls on 2nd system
|
||||
secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
|
||||
output["Existing Heating Controls"] = (
|
||||
secondary_controls_match.group(1).strip() if secondary_controls_match else ""
|
||||
)
|
||||
output["Existing Heating % of Heat"] = int(
|
||||
re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
|
||||
)
|
||||
output["Heating Code"] = (
|
||||
re.search(r"Secondary Heating Code\s*(.*?)\n", text).group(1).strip()
|
||||
if output["System"] and re.search(r"Secondary Heating Code\s*(.*?)\n", text)
|
||||
else ""
|
||||
)
|
||||
|
||||
return output
|
||||
|
||||
def extract(self):
|
||||
"""
|
||||
Extracts all relevant data from the EPR PDF.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing extracted data, including:
|
||||
- Address and Postcode
|
||||
- SAP Rating and Primary Energy Use
|
||||
- Lighting, Doors, Windows, Roof, and Wall Details
|
||||
- Heating systems (Primary and Secondary)
|
||||
- Building Parts
|
||||
"""
|
||||
data = {}
|
||||
|
||||
with open(self.file_path, "rb") as file:
|
||||
|
|
@ -291,36 +303,56 @@ class ElmhurstEprExtractor:
|
|||
|
||||
# Extracting individual components
|
||||
address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
|
||||
if not address_match:
|
||||
raise ValueError("Failed to extract address.")
|
||||
data["Address"] = address_match.group(1).strip()
|
||||
data["Postcode"] = data["Address"].split(",")[-1].strip()
|
||||
|
||||
sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
|
||||
if not sap_match:
|
||||
raise ValueError("Failed to extract SAP rating.")
|
||||
data["Current SAP Rating"] = int(sap_match.group(1))
|
||||
|
||||
energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
|
||||
if not energy_match:
|
||||
raise ValueError("Failed to extract primary energy use.")
|
||||
data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1))
|
||||
|
||||
storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
|
||||
if not storeys_match:
|
||||
raise ValueError("Failed to extract number of storeys.")
|
||||
data["Number of Storeys"] = int(storeys_match.group(1))
|
||||
|
||||
fuel_match = re.search(r"TOTAL\s*£(\d+)", text)
|
||||
if not fuel_match:
|
||||
raise ValueError("Failed to extract fuel bill.")
|
||||
data["Fuel Bill"] = f"£{fuel_match.group(1)}"
|
||||
|
||||
total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
|
||||
if not total_doors_match:
|
||||
raise ValueError("Failed to extract total doors.")
|
||||
data["Total Number of Doors"] = int(total_doors_match.group(1))
|
||||
|
||||
# Extract Number of Insulated Doors
|
||||
insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
|
||||
if not insulated_doors_match:
|
||||
raise ValueError("Failed to extract insulated doors.")
|
||||
data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
|
||||
|
||||
# Get number of lighting outlets and number of fittings needing LEL
|
||||
lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text)
|
||||
if not lighting_fittings_match:
|
||||
raise ValueError("Failed to extract lighting")
|
||||
data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
|
||||
lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text)
|
||||
if not lel_fittings_match:
|
||||
raise ValueError("Failed to extract LEL fittings.")
|
||||
data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
|
||||
data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
|
||||
|
||||
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
|
||||
if not windows_section:
|
||||
raise ValueError("Failed to extract window data.")
|
||||
data["Windows"] = self.extract_window_age_description(windows_section.group(1))
|
||||
|
||||
data["Primary Heating"] = self.extract_primary_heating(text)
|
||||
|
|
@ -329,15 +361,9 @@ class ElmhurstEprExtractor:
|
|||
data["Roof Details"] = self.extract_roof_details(text)
|
||||
data["Wall Details"] = self.extract_wall_details(text)
|
||||
|
||||
secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
|
||||
water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
|
||||
|
||||
if data["Secondary Heating"]["Existing Heating System"] == "":
|
||||
data["Secondary Heating Code"] = ""
|
||||
else:
|
||||
data["Secondary Heating Code"] = secondary_heating_code_match.group(
|
||||
1).strip() if secondary_heating_code_match else ""
|
||||
|
||||
if not water_heating_code_match:
|
||||
raise ValueError("Failed to extract water heating code.")
|
||||
data["Water Heating Code"] = water_heating_code_match.group(1).strip()
|
||||
|
||||
return data
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue