extending extraction

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-27 17:18:17 +00:00
parent 749faaebca
commit 63521dd1e3
2 changed files with 17 additions and 6 deletions

View file

@@ -126,7 +126,7 @@ def handler():
file_extractor = extractors.get(report_type)
if file_extractor is None:
continue
extracted_contents[report_type] = file_extractor(filepath).extract()
if file_extraction_tools.is_xml(filepath):
@@ -136,6 +136,7 @@ def handler():
file_extractor = extractors.get(xml_type)
if file_extractor is None:
continue
extracted_contents[xml_type] = file_extractor(filepath).extract()
output_row_data = output_template.copy()
@@ -144,10 +145,12 @@ def handler():
# 'Local Authority',
# 'Trustmark Lodgement ID',
# 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR',
# 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Name', 'Phone', 'Email', 'Secondary Contact
# 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date',
# 'Name', 'Phone', 'Email', (owner)
# 'Secondary Contact
# Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name',
# 'Retrofit Designer Name', , 'No. of Bedrooms',
# , 'Pre Heat Transfer', 'Pre Total Floor Area', 'Pre Heat Demand',
# ,
# 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area',
# 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works',
# 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative
@@ -159,7 +162,12 @@ def handler():
total_floor_area = sum(
[x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] +
# Get the conservatory floor area
extracted_contents["elmhurst epr"]["Conservatory"]["Floor Area (m2)"]
[extracted_contents["elmhurst epr"]["Conservatory"]["Conservatory Floor Area"]]
)
pre_heat_transfer = extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"]
pre_heat_demand = (
extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area
)
to_insert = {
@@ -172,8 +180,9 @@ def handler():
"Local Authority": None,
'Property Age': extracted_contents["elmhurst epr"]["Property Age"],
'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"],
'Pre Heat Transfer': extracted_contents["elmhurst epr"][
"Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area,
'Pre Heat Transfer': pre_heat_transfer,
'Pre Total Floor Area': total_floor_area,
'Pre Heat Demand': pre_heat_demand,
}
output_row_data["Property Address"] = property_folder.split(")")[1].strip()

View file

@@ -387,6 +387,8 @@ class ElmhurstEprExtractor:
reader = PyPDF2.PdfReader(file)
text = "".join(page.extract_text() for page in reader.pages)
data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip()
# Extracting individual components
address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
if not address_match: