done with summary report extraction

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-28 09:44:26 +00:00
parent 26e0206f37
commit 8b875cbccf

View file

@ -801,6 +801,64 @@ class ElmhurstSummaryReportExtractor:
return wall_data
@staticmethod
def extract_conservatory(text):
    """
    Extract the conservatory details from the Summary Report text.

    The conservatory section is everything between the "5.0 Conservatory:"
    and "7.0 Walls:" headings.

    Args:
        text (str): The full text of the Summary Report PDF.

    Returns:
        dict: A dictionary with conservatory details:
            - "Conservatory Present": "Yes" or "No".
            - "Conservatory Separated": "Yes", "No", or "" if not found.
            - "Conservatory Floor Area": float, or 0 if not found.
            - "Conservatory Double Glazed": "Yes", "No", or "" if not found.
            - "Conservatory Glazed Perimeter": float, or 0 if not found.
            - "Heated Conservatory Height": the rest of the "Room Height"
              line, or "" if not found.
        When the report says there is no conservatory (or the question is
        missing altogether), "Conservatory Present" is "No" and every other
        field holds its empty default.

    Raises:
        ValueError: If the conservatory section cannot be located in ``text``.
    """
    # Isolate the section between "5.0 Conservatory:" and "7.0 Walls:".
    section_match = re.search(r"5\.0 Conservatory:(.*?)7\.0 Walls:", text, re.DOTALL)
    if not section_match:
        logger.error("Failed to extract conservatory data.")
        raise ValueError("Could not extract conservatory data.")
    section = section_match.group(1)

    def find_yes_no(pattern):
        # The patterns match case-insensitively, so normalise the captured
        # answer to the canonical "Yes"/"No" spelling used for
        # "Conservatory Present"; "" when the field is absent.
        found = re.search(pattern, section, re.IGNORECASE)
        return found.group(1).strip().capitalize() if found else ""

    def find_number(pattern):
        # Numeric fields default to 0 when the label is not in the section.
        found = re.search(pattern, section, re.IGNORECASE)
        return float(found.group(1)) if found else 0

    # A missing question is treated the same as an explicit "No".
    if find_yes_no(r"Is there a conservatory\?\s*(Yes|No)") != "Yes":
        return {
            "Conservatory Present": "No",
            "Conservatory Separated": "",
            "Conservatory Floor Area": 0,
            "Conservatory Double Glazed": "",
            "Conservatory Glazed Perimeter": 0,
            "Heated Conservatory Height": "",
        }

    # NOTE(review): no real report with a conservatory has been seen yet, so
    # the field labels below are unverified and should be checked against the
    # first such report.
    height_match = re.search(r"Room Height\s*(.*?)(?=\n|$)", section, re.IGNORECASE)
    return {
        "Conservatory Present": "Yes",
        "Conservatory Separated": find_yes_no(r"Is it thermally separated\?\s*(Yes|No)"),
        "Conservatory Floor Area": find_number(r"Floor Area \[m2\]\s*([\d.]+)"),
        "Conservatory Double Glazed": find_yes_no(r"Double Glazed\s*(Yes|No)"),
        "Conservatory Glazed Perimeter": find_number(r"Glazed Perimeter \[m\]\s*([\d.]+)"),
        "Heated Conservatory Height": height_match.group(1).strip() if height_match else "",
    }
def extract(self):
"""
Extracts specific data from the provided PDF file.
@ -810,11 +868,6 @@ class ElmhurstSummaryReportExtractor:
- Address
"""
# Expected keys:
# dict_keys([
# 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory',
# 'Water Heating Code'])
data = {}
with (open(self.file_path, "rb") as file):
@ -904,6 +957,7 @@ class ElmhurstSummaryReportExtractor:
data["Building Parts"] = self.extract_building_parts(text)
data["Roof Details"] = self.extract_roof_details(text)
data["Wall Details"] = self.extract_wall_details(text)
data["Conservatory"] = self.extract_conservatory(text)
water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
if not water_heating_code_match:
@ -911,15 +965,4 @@ class ElmhurstSummaryReportExtractor:
data["Water Heating Code"] = water_heating_code_match.group(1).strip()
# Get the main building wall data
main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0]
data["Main Wall Type"] = main_building_walls["Wall Type"]
data["Main Wall Insulation"] = main_building_walls["Wall Insulation"]
data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"]
data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"]
data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"]
data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"]
data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"]
data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"]
return data