extracting epr

This commit is contained in:
Khalim Conn-Kowlessar 2024-10-28 12:04:57 +00:00
parent 0332c77098
commit cf2a94cb36

View file

@ -43,6 +43,65 @@ def extract_summary_report(pdf_path):
return data
def _first_group(pattern, text, flags=0):
    """Return the stripped first capture group of *pattern* in *text*, or None if there is no match."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else None


def _parse_epr_text(text):
    """Parse the concatenated page text of an Energy Report into a field dict."""
    # Keys are created in the order the report data has always been emitted,
    # so any consumer relying on dict/column order keeps working.
    data = {
        "Address": None,
        "Estimated Annual Costs": None,
        "Current SAP": None,
        "Space Heating": None,
        "Water Heating": None,
        "Fuel Bill": None,
        "Total Floor Area": None,
        "Potential SAP": None,
    }

    # Address sits between the report heading and the "Reference" line and may
    # span several lines, hence DOTALL with a non-greedy capture.
    data["Address"] = _first_group(
        r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL
    )
    data["Total Floor Area"] = _first_group(r"Total Floor Area\s*(\d+ m2)", text)

    # "Estimated Annual Costs" and "Fuel Bill" are both read from the same
    # TOTAL line, so the pattern is evaluated once.
    total_cost = _first_group(r"TOTAL\s*£(\d+)", text)
    if total_cost is not None:
        data["Estimated Annual Costs"] = f"£{total_cost}"
        data["Fuel Bill"] = f"£{total_cost}"

    # The current and potential ratings are extracted as two adjacent numbers
    # directly after the "GG (1-20)" band label.
    sap_match = re.search(r"GG \(1-20\)(\d{1,2})(\d{1,2})", text)
    if sap_match is None:
        raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
    current_sap, potential_sap = int(sap_match.group(1)), int(sap_match.group(2))
    # Sanity check: both in range and potential is never below current. This
    # also rejects a wrong split of the run-together digits.
    if not (1 <= current_sap <= 99 and 1 <= potential_sap <= 99 and potential_sap >= current_sap):
        raise ValueError("Failed to parse SAP ratings correctly due to unexpected format.")
    data["Current SAP"] = current_sap
    data["Potential SAP"] = potential_sap

    space_heating = _first_group(r"Space Heating\s+(\d+)\s+kWh", text)
    if space_heating is not None:
        data["Space Heating"] = int(space_heating)

    water_heating = _first_group(r"Water Heating\s+(\d+)\s+kWh", text)
    if water_heating is not None:
        data["Water Heating"] = int(water_heating)

    return data


def extract_epr(pdf_path):
    """
    Extracts specific data from an Energy Report (EPR) PDF file.

    The text of every page is concatenated and searched with regular
    expressions for each field.

    Args:
        pdf_path: Path to the Energy Report PDF.

    Returns:
        A dict with the keys "Address", "Estimated Annual Costs",
        "Current SAP", "Space Heating", "Water Heating", "Fuel Bill",
        "Total Floor Area" and "Potential SAP". A field whose pattern is not
        found in the text is left as None instead of raising.

    Raises:
        ValueError: If the SAP ratings cannot be found or fail the sanity
            check (both within 1-99 and potential >= current).
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # A page can yield no text (e.g. image-only pages); treat it as empty
        # rather than failing the whole document.
        text = "".join(page.extract_text() or "" for page in reader.pages)
    return _parse_epr_text(text)
def extract_retrofit_assessment_folder(retrofit_folder_path):
"""
Handles extraction from a retrofit assessment folder if it exists and has content.
@ -61,22 +120,38 @@ def extract_retrofit_assessment_folder(retrofit_folder_path):
return None # If no relevant PDF is found
def is_energy_report(text):
    """
    Determines if the provided text indicates that the PDF is an Energy Report.

    Args:
        text: Text extracted from the first page of a PDF; may be empty or None.

    Returns:
        True if the text, ignoring leading whitespace, starts with the
        upper-case heading 'ENERGY REPORT'; otherwise False.
    """
    if not text:
        # No extractable text (empty page or failed extraction).
        return False
    # Extracted PDF text can carry leading blank lines/spaces before the heading.
    return text.lstrip().startswith("ENERGY REPORT")
def extract_from_survey_folder_files(survey_folder_path):
    """
    Handles extraction directly from files in the survey folder when no 'retrofit assessment' folder exists.

    Each PDF in the folder is inspected in turn: a PDF whose first page is an
    Energy Report is passed to extract_epr, and a PDF with "summary" in its
    file name is passed to extract_summary_report. The first recognised PDF
    wins.

    Args:
        survey_folder_path: Path of the survey folder to scan.

    Returns:
        The data returned by the matching extractor, or None if the folder
        contains no PDF files.

    Raises:
        NotImplementedError: If the folder contains PDFs but none of them is a
            recognised report type.
    """
    # Sorted so the chosen file does not depend on the arbitrary order in
    # which os.listdir returns entries; extension match is case-insensitive.
    survey_files = sorted(
        f for f in os.listdir(survey_folder_path) if f.lower().endswith(".pdf")
    )

    unrecognised = []
    for pdf_file in survey_files:
        pdf_path = os.path.join(survey_folder_path, pdf_file)

        # Read only the first page of the PDF to determine its type.
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            first_page_text = (reader.pages[0].extract_text() or "") if reader.pages else ""

        if is_energy_report(first_page_text):
            # Treat this as an Energy Report
            return extract_epr(pdf_path)
        if "summary" in pdf_file.lower():
            # Treat this as a Summary Report
            return extract_summary_report(pdf_path)

        # Unknown type: keep scanning, a later file may be a supported report.
        unrecognised.append(pdf_file)

    if unrecognised:
        # PDFs were present but none is a supported report type.
        raise NotImplementedError("Implement me")
    return None  # No PDF files in the folder
def main():
@ -109,7 +184,6 @@ def main():
}
extracted_data.append(summary_data)
continue
# If no retrofit folder or it was empty, check files in survey_folder
summary_data = extract_from_survey_folder_files(survey_folder_path)
if summary_data: