refactoring epr extraction

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-27 13:30:36 +00:00
parent 965cf975e2
commit 0efd0163ee
2 changed files with 76 additions and 51 deletions

View file

@ -43,5 +43,4 @@ def handler():
file_extractor = extractors.get(report_type)
if file_extractor is None:
continue
extracted_contents[report_type] = file_extractor(filepath).extract()

View file

@ -69,6 +69,10 @@ def is_pdf(filename):
class ElmhurstEprExtractor:
"""
A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR).
"""
def __init__(self, file_path):
self.file_path = file_path
@ -219,7 +223,30 @@ class ElmhurstEprExtractor:
return wall_data
@staticmethod
def extract_primary_heating(text):
def _extract_heating_details(section_text):
    """
    Extracts heating details from a given section of text.

    Each field is looked up independently; a field whose label is not found
    falls back to an empty string (or 0 for the percentage) instead of raising.

    Args:
        section_text (str): The section of text containing heating details.

    Returns:
        dict: A dictionary containing heating system details, with the keys
            "System", "PCDF Reference", "Controls" and "% of Heat".
    """
    # Free-text fields run to the end of their line. The end of the text is
    # accepted as a terminator as well: callers slice the section out with a
    # pattern that drops trailing whitespace, so the last line of a section
    # normally has no newline after it.
    system_search = re.search(r"Main Heating Code\s*(.*?)(?:\n|$)", section_text)
    pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text)
    controls_search = re.search(r"Main Heating Controls\s*(.*?)(?:\n|$)", section_text)
    heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text)

    return {
        "System": system_search.group(1).strip() if system_search else "",
        "PCDF Reference": pcdf_search.group(1) if pcdf_search else "",
        "Controls": controls_search.group(1).strip() if controls_search else "",
        "% of Heat": int(heat_search.group(1)) if heat_search else 0,
    }
def extract_primary_heating(self, text):
# Extract Primary Heating Section (Main Heating 1)
primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
@ -228,61 +255,46 @@ class ElmhurstEprExtractor:
primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
primary_text = primary_heating_section.group(1)
primary_heating_output = {
"Existing Primary Heating System": re.search(
r"Main Heating Code\s*(.*?)\n", primary_text
).group(1).strip(),
"Existing Primary Heating PCDF Reference": re.search(
r"PCDF boiler Reference\s*(\d+)", primary_text
).group(1),
"Existing Primary Heating Controls": re.search(
r"Main Heating Controls\s*(.*?)\n", primary_text
).group(1).strip(),
"Existing Primary Heating % of Heat": int(
re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
)
}
return self._extract_heating_details(primary_text)
return primary_heating_output
@staticmethod
def extract_secondary_heating(text):
def extract_secondary_heating(self, text):
    """
    Extracts the details of the second main heating system ("Main Heating 2").

    Args:
        text (str): The full report text.

    Returns:
        dict: The keys "System", "PCDF Reference", "Controls" and "% of Heat"
            for the second main heating system, plus "Heating Code" holding the
            value that follows "Secondary Heating Code". When no
            "Main Heating 2" section is found, the values are empty strings
            and 0. "Heating Code" is only filled in when "System" is non-empty.
    """
    # The section is the text between the "Main Heating 2" heading and the
    # following "Secondary Heating" heading.
    section_match = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)

    if section_match is None:
        # Use exactly the keys _extract_heating_details returns, so callers see
        # one schema whether or not the section exists.
        output = {
            "System": "",
            "PCDF Reference": "",
            "Controls": "",
            "% of Heat": 0,
        }
    else:
        output = dict(self._extract_heating_details(section_match.group(1)))

    # The secondary heating code is only reported when a second system exists.
    code_match = None
    if output["System"]:
        code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
    output["Heating Code"] = code_match.group(1).strip() if code_match else ""

    return output
def extract(self):
"""
Extracts all relevant data from the EPR PDF.
Returns:
dict: A dictionary containing extracted data, including:
- Address and Postcode
- SAP Rating and Primary Energy Use
- Lighting, Doors, Windows, Roof, and Wall Details
- Heating systems (Primary and Secondary)
- Building Parts
"""
data = {}
with open(self.file_path, "rb") as file:
@ -291,36 +303,56 @@ class ElmhurstEprExtractor:
# Extracting individual components
address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
if not address_match:
raise ValueError("Failed to extract address.")
data["Address"] = address_match.group(1).strip()
data["Postcode"] = data["Address"].split(",")[-1].strip()
sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
if not sap_match:
raise ValueError("Failed to extract SAP rating.")
data["Current SAP Rating"] = int(sap_match.group(1))
energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
if not energy_match:
raise ValueError("Failed to extract primary energy use.")
data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1))
storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
if not storeys_match:
raise ValueError("Failed to extract number of storeys.")
data["Number of Storeys"] = int(storeys_match.group(1))
fuel_match = re.search(r"TOTAL\s*£(\d+)", text)
if not fuel_match:
raise ValueError("Failed to extract fuel bill.")
data["Fuel Bill"] = f"£{fuel_match.group(1)}"
total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
if not total_doors_match:
raise ValueError("Failed to extract total doors.")
data["Total Number of Doors"] = int(total_doors_match.group(1))
# Extract Number of Insulated Doors
insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
if not insulated_doors_match:
raise ValueError("Failed to extract insulated doors.")
data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
# Get number of lighting outlets and number of fittings needing LEL
lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text)
if not lighting_fittings_match:
raise ValueError("Failed to extract lighting")
data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text)
if not lel_fittings_match:
raise ValueError("Failed to extract LEL fittings.")
data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
if not windows_section:
raise ValueError("Failed to extract window data.")
data["Windows"] = self.extract_window_age_description(windows_section.group(1))
data["Primary Heating"] = self.extract_primary_heating(text)
@ -329,15 +361,9 @@ class ElmhurstEprExtractor:
data["Roof Details"] = self.extract_roof_details(text)
data["Wall Details"] = self.extract_wall_details(text)
secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
if data["Secondary Heating"]["Existing Heating System"] == "":
data["Secondary Heating Code"] = ""
else:
data["Secondary Heating Code"] = secondary_heating_code_match.group(
1).strip() if secondary_heating_code_match else ""
if not water_heating_code_match:
raise ValueError("Failed to extract water heating code.")
data["Water Heating Code"] = water_heating_code_match.group(1).strip()
return data