Model/survey_report/app.py
Khalim Conn-Kowlessar daabf2a586 extracting epr
2025-01-30 01:09:41 +00:00

50 lines
1.9 KiB
Python

import os
import PyPDF2
from survey_report.extraction.detect_report_type import detect_report_type
from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
# Folder scanned when the caller does not supply one (the original hard-coded location).
_DEFAULT_DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2"

# Report types that must be detected in the folder before extraction can run.
_REQUIRED_REPORT_TYPES = ("quidos_site_notes", "quidos_epr")


def handle(data_folder=_DEFAULT_DATA_FOLDER):
    """
    Performs the data extraction process for the survey report.

    Every PDF in ``data_folder`` is read, its report type is detected from the
    text of its first page, and the full text of each recognised report is
    handed to the matching extractor. The EPR and site-notes results are then
    merged into a single flat data sheet.

    :param data_folder: Directory containing the survey PDFs. Defaults to the
        example data folder that was previously hard-coded.
    :return: The combined data sheet. Site-notes values take precedence over
        EPR values when both provide the same key. The nested
        "Total Building Dimensions" entry is replaced by two flat keys and
        "Building Dimensions" is removed.
    :raises KeyError: If a required report type (Quidos site notes or Quidos
        EPR) was not found in the folder, or an expected key is missing from
        the extracted data.
    """
    file_mapping = {}

    # Sorted so that, when two files resolve to the same report type, the one
    # that wins is deterministic rather than dependent on os.listdir order.
    for file in sorted(os.listdir(data_folder)):
        # Only PDF files are considered (extension check is case-insensitive).
        if not file.lower().endswith(".pdf"):
            continue

        filepath = os.path.join(data_folder, file)
        with open(filepath, "rb") as f:
            pdf = PyPDF2.PdfReader(f)
            # Extract each page exactly once, while the file is still open.
            page_texts = [page.extract_text() for page in pdf.pages]

        # A PDF without pages has no first page to classify; skip it.
        if not page_texts:
            continue

        # The report type is detected from the first page only.
        report_type = detect_report_type(page_texts[0])
        if report_type is not None:
            file_mapping[report_type] = "".join(page_texts)

    # This is only set up to work with Quidos site notes and a Quidos EPR, so
    # fail early with a message that says what is missing and where we looked.
    missing = [name for name in _REQUIRED_REPORT_TYPES if name not in file_mapping]
    if missing:
        raise KeyError(
            f"Required report type(s) {missing} not found in {data_folder!r}"
        )

    site_notes = SiteNotesExtractor(file_mapping["quidos_site_notes"]).extract_all()
    epr = EPRExtractor(file_mapping["quidos_epr"]).extract_all()

    # We now produce the combined data sheet which is the starting figure.
    # Site notes are unpacked last, so they override EPR values on key clashes.
    data_sheet = {**epr, **site_notes}
    del data_sheet["Building Dimensions"]

    # We unnest the Total Building Dimensions into two flat entries.
    total_dimensions = data_sheet.pop("Total Building Dimensions")
    data_sheet["Total Building Floor Area (m2)"] = total_dimensions["floor_area"]
    data_sheet["Total Building Heat Loss Area (m2)"] = total_dimensions["heat_loss_area"]

    return data_sheet