implementing summary report extraction

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-28 12:00:43 +00:00
parent 8b875cbccf
commit 5a2ffe646c
2 changed files with 71 additions and 31 deletions

View file

@ -170,8 +170,8 @@ def handler():
epr_to_insert = {
"Postcode": extracted_contents["elmhurst epr"]["Postcode"],
"City/County": None,
"District/Town": None,
"City/County": extracted_contents["elmhurst epr"]["County"],
"District/Town": extracted_contents["elmhurst epr"]["Town"],
"Local Authority": None,
'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"],
'Pre Heat Transfer': pre_heat_transfer,
@ -207,6 +207,35 @@ def handler():
cr_to_insert
)
if extracted_contents.get("elmhurst summary report"):
total_floor_area = sum(
[x["Floor Area (m2)"] for x in extracted_contents["elmhurst summary report"]["Building Parts"]] +
# Get the conservatory floor area
[extracted_contents["elmhurst summary report"]["Conservatory"]["Conservatory Floor Area"]]
)
pre_heat_transfer = (
extracted_contents["elmhurst summary report"]["Primary Energy Use Intensity (kWh/m2/yr)"]
)
pre_heat_demand = None # Don't have this
summary_to_insert = {
"Postcode": extracted_contents["elmhurst summary report"]["Postcode"],
"City/County": extracted_contents["elmhurst summary report"]["County"],
"District/Town": extracted_contents["elmhurst summary report"]["Town"],
'SAP Rating Pre (from IMA)': extracted_contents["elmhurst summary report"]["Current SAP Rating"],
'Pre Heat Transfer': pre_heat_transfer,
'Pre Total Floor Area': total_floor_area,
'Pre Heat Demand': pre_heat_demand,
"R. Assessor - Name": extracted_contents["elmhurst summary report"]["Assessor Name"],
"Retrofit Assessment Date": extracted_contents["elmhurst summary report"]["Assessment Date"],
}
update_dictionary_with_check(
output_row_data,
summary_to_insert
)
extracted.append(output_row_data)
extracted_df = pd.DataFrame(extracted)

View file

@ -398,6 +398,15 @@ class ElmhurstEprExtractor:
data["Address"] = address_match.group(1).strip()
data["Postcode"] = data["Address"].split(",")[-1].strip()
# TODO: extract the address components below; they are set to None as placeholders for now.
data["Region"] = None
data["House Name"] = None
data["House No"] = None
data["Street"] = None
data["Locality"] = None
data["Town"] = None
data["County"] = None
sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
if not sap_match:
logger.error("Failed to extract SAP rating.")
@ -657,26 +666,7 @@ class ElmhurstSummaryReportExtractor:
}
)
# Calculate aggregated dimensions
main_property = [part for part in data if "Main Property" in part["Building Part"]]
first_extensions = [part for part in data if "1st Extension" in part["Building Part"]]
dimensions = {
"Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]),
"Total Ground Floor Area (m2)": sum(
[part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]]
),
"RIR Floor Area": sum(
[part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]]
),
"Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if
x["Perimeter (m)"] and x["Room Height (m)"]]),
"First Extension Wall Area (m2)": sum(
[x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if
x["Perimeter (m)"] and x["Room Height (m)"]]
),
}
return dimensions
return data
@staticmethod
def extract_roof_details(text):
@ -869,7 +859,6 @@ class ElmhurstSummaryReportExtractor:
"""
data = {}
with (open(self.file_path, "rb") as file):
reader = PyPDF2.PdfReader(file)
text = ""
@ -885,29 +874,51 @@ class ElmhurstSummaryReportExtractor:
# Address and postcode
postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
postcode = postcode.group(1).strip() if postcode else ""
region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
region = region.group(1).strip() if region else ""
house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
house_name = house_name.group(1).strip() if house_name else ""
house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
house_no = house_no.group(1).strip() if house_no else ""
street = re.search(r"Street:\s*(.*?)\nLocality:", text)
street = street.group(1).strip() if street else ""
locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
locality = locality.group(1).strip() if locality else ""
town = re.search(r"Town:\s*(.*?)\nCounty:", text)
town = town.group(1).strip() if town else ""
county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)
county = county.group(1).strip() if county else ""
# Clean extracted values and remove any prefixes
address_parts = [
house_no.group(1).strip() if house_no else "",
house_name.group(1).strip() if house_name else "",
street.group(1).strip() if street else "",
locality.group(1).strip() if locality else "",
town.group(1).strip() if town else "",
county.group(1).strip() if county else "",
region.group(1).strip() if region else "",
postcode.group(1).strip() if postcode else ""
house_no,
house_name,
street,
locality,
town,
county,
region,
postcode
]
# Join non-empty parts with a comma
data["Address"] = ", ".join([part for part in address_parts if part])
data["Postcode"] = postcode.group(1).strip()
data["Region"] = region
data["House Name"] = house_name
data["House No"] = house_no
data["Street"] = street
data["Locality"] = locality
data["Town"] = town
data["County"] = county
# Extract Current SAP rating
sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)