diff --git a/backend/documents_parser/extractor.py b/backend/documents_parser/extractor.py
index 7ea3ccd1..62d5d667 100644
--- a/backend/documents_parser/extractor.py
+++ b/backend/documents_parser/extractor.py
@@ -3,6 +3,7 @@ from typing import List, Optional
 
 from datatypes.epc.surveys.pashub_rdsap_site_notes import (
     BuildingConstruction,
+    InspectionMetadata,
     BuildingMeasurements,
     Conservatories,
     CustomerResponse,
@@ -81,9 +82,73 @@ class PasHubRdSapSiteNotesExtractor:
 
+    @staticmethod
+    def _parse_long_date(raw: str, field: str) -> datetime:
+        """Parse a date written like "25 September 2025".
+
+        Raises:
+            ValueError: naming *field* when *raw* is not in ``%d %B %Y`` form.
+        """
+        try:
+            return datetime.strptime(raw.strip(), "%d %B %Y")
+        except ValueError as err:
+            raise ValueError(f"{field} has unexpected date format: {raw!r}") from err
+
     # --- public extract methods ---
 
+    def extract_inspection_metadata(self) -> InspectionMetadata:
+        """Extract the inspection metadata: surveyor, reference, dates, address.
+
+        The address is the run of entries between the "Property Address:" and
+        "Property Photo" markers, joined with ", ".
+
+        Returns:
+            InspectionMetadata; falsy text results from ``self._get`` become
+            "", as does an address whose markers cannot both be found.
+
+        Raises:
+            ValueError: if "Date of Inspection:" is absent, or a date value is
+                not in ``%d %B %Y`` form.
+        """
+        try:
+            addr_start = self.text_list.index("Property Address:") + 1
+            addr_end = self.text_list.index("Property Photo", addr_start)
+        except ValueError:
+            # A marker is missing, so the address block cannot be delimited.
+            property_address = ""
+        else:
+            # Skip blank entries so the result never contains empty ", ,"
+            # components; trailing commas are dropped before re-joining.
+            fragments = (
+                t.strip().rstrip(", ") for t in self.text_list[addr_start:addr_end]
+            )
+            property_address = ", ".join(f for f in fragments if f)
+
+        created_on_raw = self._get("Created On:")
+        created_on = (
+            self._parse_long_date(created_on_raw, "Created On").strftime("%Y-%m-%d")
+            if created_on_raw
+            else ""
+        )
+
+        date_of_inspection_raw = self._get("Date of Inspection:")
+        if not date_of_inspection_raw:
+            raise ValueError("Date of Inspection not found in document")
+        date_of_inspection = self._parse_long_date(
+            date_of_inspection_raw, "Date of Inspection"
+        ).date()
+
+        return InspectionMetadata(
+            inspection_surveyor=self._get("Inspection Surveyor:") or "",
+            email_address=self._get("E-Mail Address:") or "",
+            report_reference=self._get("Report Reference:") or "",
+            created_on=created_on,
+            date_of_inspection=date_of_inspection,
+            property_address=property_address,
+            property_photo="Property Photo" in self.text_list,
+        )
+
     def extract(self) -> PasHubRdSapSiteNotes:
         return PasHubRdSapSiteNotes(
-            inspection_metadata=None,
+            inspection_metadata=self.extract_inspection_metadata(),
             general=self.extract_general(),
             building_construction=self.extract_building_construction(),
             building_measurements=self.extract_building_measurements(),
diff --git 
a/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf b/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf
index 402d38aa..361482ee 100644
Binary files a/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf and b/backend/documents_parser/tests/fixtures/ExampleSiteNotes.pdf differ
diff --git a/backend/documents_parser/tests/fixtures/site_notes_example_text.json b/backend/documents_parser/tests/fixtures/site_notes_example_text.json
index 17d0265a..b18ac082 100644
--- a/backend/documents_parser/tests/fixtures/site_notes_example_text.json
+++ b/backend/documents_parser/tests/fixtures/site_notes_example_text.json
@@ -1,4 +1,24 @@
 [
+    "SMART EPC: Record of",
+    "Inspection & Site Notes",
+    "Inspection Surveyor:",
+    "Benjamin Burke",
+    "E-Mail Address:",
+    "ben@mbsolutionsgroup.co.uk",
+    "Report Reference:",
+    "6EA2A86D-94CE-4792-8D49-AB495C744EDD",
+    "Created On:",
+    "10 November 2025",
+    "Date of Inspection:",
+    "25 September 2025",
+    "Property Address:",
+    "40,",
+    "Abbey Place,",
+    "Crewe,",
+    "Cheshire,",
+    "CW1 4JR",
+    "Property Photo",
+    "Page 1",
     "",
     "Photo of electricity meter:",
     "Photo of electricity meter:",
diff --git a/backend/documents_parser/tests/test_end_to_end.py b/backend/documents_parser/tests/test_end_to_end.py
new file mode 100644
index 00000000..359a4ea8
--- /dev/null
+++ b/backend/documents_parser/tests/test_end_to_end.py
@@ -0,0 +1,241 @@
+import os
+from datetime import date
+
+import pytest
+
+from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
+from backend.documents_parser.pdf import pdf_to_text_list
+from datatypes.epc.domain.epc_property_data import (
+    EpcPropertyData,
+    InstantaneousWwhrs,
+    MainHeatingDetail,
+    SapBuildingPart,
+    SapEnergySource,
+    SapFloorDimension,
+    SapHeating,
+    SapWindow,
+)
+from datatypes.epc.domain.mapper import EpcPropertyDataMapper
+
+PDF_PATH = os.path.join(os.path.dirname(__file__), "fixtures", "ExampleSiteNotes.pdf")
+
+
+class TestPdfToEpcPropertyData:
+    """End-to-end check: example PDF -> site notes -> EpcPropertyData."""
+
+    @pytest.fixture
+    def result(self) -> EpcPropertyData:
+        """Run the extractor and the mapper over the example PDF fixture."""
+        with open(PDF_PATH, "rb") as pdf_file:
+            text_list = pdf_to_text_list(pdf_file.read())
+        site_notes = PasHubRdSapSiteNotesExtractor(text_list).extract()
+        return EpcPropertyDataMapper.from_site_notes(site_notes)
+
+    def test_full_epc_property_data(self, result: EpcPropertyData) -> None:
+        """The mapped result equals the full expected EpcPropertyData."""
+        expected = EpcPropertyData(
+            dwelling_type="Mid-terrace house",
+            inspection_date=date(2025, 9, 25),
+            tenure="Rented Social",
+            transaction_type="Grant-Scheme (ECO, RHI, etc.)",
+            roofs=[],
+            walls=[],
+            floors=[],
+            main_heating=[],
+            door_count=2,
+            sap_heating=SapHeating(
+                instantaneous_wwhrs=InstantaneousWwhrs(),
+                main_heating_details=[
+                    MainHeatingDetail(
+                        has_fghrs=False,
+                        main_fuel_type="Mains gas",
+                        heat_emitter_type="Radiators",
+                        emitter_temperature="Unknown",
+                        main_heating_control="Programmer, room thermostat and TRVs",
+                        fan_flue_present=True,
+                    )
+                ],
+                has_fixed_air_conditioning=False,
+            ),
+            sap_windows=[
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North West",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=2.3,
+                    window_height=1.2,
+                    draught_proofed=True,
+                    window_location="Main Building",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North West",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=1.0,
+                    window_height=1.2,
+                    draught_proofed=True,
+                    window_location="Main Building",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North East",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=1.0,
+                    window_height=0.9,
+                    draught_proofed=True,
+                    window_location="Main Building",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=1.0,
+                    window_height=0.9,
+                    draught_proofed=True,
+                    window_location="Extension 1",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North East",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=1.7,
+                    window_height=0.9,
+                    draught_proofed=True,
+                    window_location="Extension 1",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North West",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=2.3,
+                    window_height=0.9,
+                    draught_proofed=True,
+                    window_location="Extension 1",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North West",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=1.2,
+                    window_height=1.0,
+                    draught_proofed=True,
+                    window_location="Extension 1",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+                SapWindow(
+                    pvc_frame="Wooden or PVC",
+                    glazing_gap="16 mm or more",
+                    orientation="North East",
+                    window_type="Window",
+                    glazing_type="Double glazing, Unknown install date",
+                    window_width=1.0,
+                    window_height=0.9,
+                    draught_proofed=True,
+                    window_location="Extension 1",
+                    window_wall_type="External wall",
+                    permanent_shutters_present=False,
+                ),
+            ],
+            sap_energy_source=SapEnergySource(
+                mains_gas=True,
+                meter_type="Single",
+                pv_battery_count=0,
+                wind_turbines_count=0,
+                gas_smart_meter_present=True,
+                is_dwelling_export_capable=True,
+                wind_turbines_terrain_type="Suburban",
+                electricity_smart_meter_present=True,
+            ),
+            sap_building_parts=[
+                SapBuildingPart(
+                    identifier="main",
+                    construction_age_band="1950-1966",
+                    wall_construction="Cavity",
+                    wall_insulation_type="Filled Cavity",
+                    wall_thickness_measured=True,
+                    party_wall_construction="Cavity Masonry, Filled",
+                    sap_floor_dimensions=[
+                        SapFloorDimension(
+                            room_height_m=2.19,
+                            total_floor_area_m2=35.68,
+                            party_wall_length_m=10.62,
+                            heat_loss_perimeter_m=13.44,
+                        ),
+                        SapFloorDimension(
+                            room_height_m=2.17,
+                            total_floor_area_m2=35.68,
+                            party_wall_length_m=10.62,
+                            heat_loss_perimeter_m=11.0,
+                        ),
+                    ],
+                    wall_thickness_mm=310,
+                ),
+                SapBuildingPart(
+                    identifier="extension_1",
+                    construction_age_band="2003-2006",
+                    wall_construction="Cavity",
+                    wall_insulation_type="As built",
+                    wall_thickness_measured=True,
+                    party_wall_construction="Cavity Masonry, Filled",
+                    sap_floor_dimensions=[
+                        SapFloorDimension(
+                            room_height_m=2.0,
+                            total_floor_area_m2=3.8,
+                            party_wall_length_m=0.0,
+                            heat_loss_perimeter_m=5.7,
+                        ),
+                    ],
+                    wall_thickness_mm=310,
+                ),
+            ],
+            solar_water_heating=False,
+            has_hot_water_cylinder=False,
+            has_fixed_air_conditioning=False,
+            wet_rooms_count=0,
+            extensions_count=1,
+            heated_rooms_count=0,
+            open_chimneys_count=0,
+            habitable_rooms_count=3,
+            insulated_door_count=0,
+            cfl_fixed_lighting_bulbs_count=1,
+            led_fixed_lighting_bulbs_count=0,
+            incandescent_fixed_lighting_bulbs_count=4,
+            total_floor_area_m2=75.16,
+            built_form="Mid-terrace",
+            property_type="House",
+            has_conservatory=False,
+            blocked_chimneys_count=0,
+            draughtproofed_door_count=2,
+            address_line_1="40, Abbey Place",
+            post_town="Crewe",
+            postcode="CW1 4JR",
+            report_reference="6EA2A86D-94CE-4792-8D49-AB495C744EDD",
+        )
+        assert result == expected
diff --git a/backend/documents_parser/tests/test_extractor.py b/backend/documents_parser/tests/test_extractor.py
index 222a30e6..9f672956 100644
--- 
a/backend/documents_parser/tests/test_extractor.py
+++ b/backend/documents_parser/tests/test_extractor.py
@@ -17,6 +17,7 @@ from datatypes.epc.surveys.pashub_rdsap_site_notes import (
     FloorMeasurement,
     General,
     HeatingAndHotWater,
+    InspectionMetadata,
     MainBuildingConstruction,
     MainBuildingMeasurements,
     MainHeating,
@@ -40,6 +41,23 @@ def load_text_fixture() -> list[str]:
         return json.load(f)
 
 
+class TestInspectionMetadata:
+    """Checks the inspection metadata extracted from the text fixture."""
+
+    def test_full_inspection_metadata(self) -> None:
+        extractor = PasHubRdSapSiteNotesExtractor(load_text_fixture())
+        expected = InspectionMetadata(
+            inspection_surveyor="Benjamin Burke",
+            email_address="ben@mbsolutionsgroup.co.uk",
+            report_reference="6EA2A86D-94CE-4792-8D49-AB495C744EDD",
+            created_on="2025-11-10",
+            date_of_inspection=date(2025, 9, 25),
+            property_address="40, Abbey Place, Crewe, Cheshire, CW1 4JR",
+            property_photo=True,
+        )
+        assert extractor.extract_inspection_metadata() == expected
+
+
 class TestGeneral:
     @pytest.fixture
     def general(self) -> General:
@@ -530,7 +548,7 @@ class TestCustomerResponse:
 class TestExtract:
     def test_full_extract(self) -> None:
         result = PasHubRdSapSiteNotesExtractor(load_text_fixture()).extract()
-        assert result.inspection_metadata is None
+        assert result.inspection_metadata.inspection_surveyor == "Benjamin Burke"
         assert result.general.inspection_date == date(2025, 9, 25)
         assert result.building_construction.main_building.wall_thickness_mm == 310
         assert result.building_measurements.main_building.floors[0].area_m2 == 35.68
diff --git a/datatypes/epc/domain/epc_property_data.py b/datatypes/epc/domain/epc_property_data.py
index b92a46aa..ef01155f 100644
--- a/datatypes/epc/domain/epc_property_data.py
+++ b/datatypes/epc/domain/epc_property_data.py
@@ -280,6 +280,7 @@ class EpcPropertyData:
     schema_type: Optional[str] = None
     schema_versions_original: Optional[str] = None
     report_type: Optional[str] = None  # TODO: make enum?
+    report_reference: Optional[str] = None
     uprn_source: Optional[str] = None
     address_line_2: Optional[str] = None
     region_code: Optional[str] = None  # TODO: make enum?
diff --git a/datatypes/epc/domain/mapper.py b/datatypes/epc/domain/mapper.py
index 61793c7d..bdcbbb12 100644
--- a/datatypes/epc/domain/mapper.py
+++ b/datatypes/epc/domain/mapper.py
@@ -78,6 +78,11 @@ class EpcPropertyDataMapper:
     @staticmethod
     def from_site_notes(survey: PasHubRdSapSiteNotes) -> EpcPropertyData:
         general = survey.general
+        metadata = survey.inspection_metadata
+        address_line_1, post_town, postcode = EpcPropertyDataMapper._split_property_address(
+            metadata.property_address
+        )
+
         construction = survey.building_construction
         measurements = survey.building_measurements
         heating = survey.heating_and_hot_water
@@ -145,6 +150,39 @@ class EpcPropertyDataMapper:
             has_conservatory=survey.conservatories.has_conservatory,
             blocked_chimneys_count=room_counts.number_of_blocked_chimneys,
             draughtproofed_door_count=room_counts.number_of_draughtproofed_external_doors,
+            address_line_1=address_line_1,
+            post_town=post_town,
+            postcode=postcode,
+            report_reference=metadata.report_reference,
         )
 
+    @staticmethod
+    def _split_property_address(property_address: str) -> tuple:
+        """Split a comma-separated address into EPC address fields.
+
+        The last component is taken as the postcode. With four or more
+        components the layout is assumed to be
+        "<line 1 ...>, <town>, <county>, <postcode>".
+
+        Returns:
+            ``(address_line_1, post_town, postcode)``; a field that cannot
+            be derived is None.
+        """
+        parts = [p.strip() for p in (property_address or "").split(",")]
+        parts = [p for p in parts if p]
+        if not parts:
+            return None, None, None
+        if len(parts) == 1:
+            # A lone component is kept as the address line only, rather than
+            # being reported as both address line and postcode.
+            return parts[0], None, None
+        postcode = parts[-1]
+        if len(parts) == 2:
+            return parts[0], None, postcode
+        if len(parts) == 3:
+            return parts[0], parts[1], postcode
+        # NOTE(review): a four-part address without a county would put the
+        # street into post_town -- confirm the source always has a county.
+        return ", ".join(parts[:-3]), parts[-3], postcode
+
     @staticmethod
diff --git a/datatypes/epc/domain/tests/test_from_site_notes.py b/datatypes/epc/domain/tests/test_from_site_notes.py
index 47327ff7..81efe32d 100644
--- a/datatypes/epc/domain/tests/test_from_site_notes.py
+++ b/datatypes/epc/domain/tests/test_from_site_notes.py
@@ -330,14 +330,14 @@ class TestFromSiteNotesExample1:
     def test_uprn_absent(self, result: EpcPropertyData) -> None:
         assert result.uprn is None
 
-    def test_address_absent(self, result: EpcPropertyData) -> None:
-        assert 
result.address_line_1 is None + def test_address_line_1(self, result: EpcPropertyData) -> None: + assert result.address_line_1 == "1, Test Street" - def test_postcode_absent(self, result: EpcPropertyData) -> None: - assert result.postcode is None + def test_postcode(self, result: EpcPropertyData) -> None: + assert result.postcode == "TE1 1ST" - def test_post_town_absent(self, result: EpcPropertyData) -> None: - assert result.post_town is None + def test_post_town(self, result: EpcPropertyData) -> None: + assert result.post_town == "Test Town" def test_status_absent(self, result: EpcPropertyData) -> None: assert result.status is None @@ -352,9 +352,9 @@ class TestFromSiteNotesExample1: sap_version=None, dwelling_type="Mid-terrace house", uprn=None, - address_line_1=None, - postcode=None, - post_town=None, + address_line_1="1, Test Street", + postcode="TE1 1ST", + post_town="Test Town", inspection_date=date(2026, 3, 31), status=None, tenure="Rented Social", @@ -495,5 +495,6 @@ class TestFromSiteNotesExample1: has_conservatory=False, blocked_chimneys_count=0, draughtproofed_door_count=2, + report_reference="49D422A9-0779-44DD-9665-464D35DFF1A8", ) assert result == expected diff --git a/datatypes/epc/surveys/pashub_rdsap_site_notes.py b/datatypes/epc/surveys/pashub_rdsap_site_notes.py index 54c84a17..31e189a5 100644 --- a/datatypes/epc/surveys/pashub_rdsap_site_notes.py +++ b/datatypes/epc/surveys/pashub_rdsap_site_notes.py @@ -276,7 +276,7 @@ class SurveyAddendum: @dataclass class PasHubRdSapSiteNotes: - inspection_metadata: Optional[InspectionMetadata] + inspection_metadata: InspectionMetadata general: General building_construction: BuildingConstruction building_measurements: BuildingMeasurements diff --git a/datatypes/epc/surveys/tests/fixtures/pashub_rdsap_site_notes_example1.json b/datatypes/epc/surveys/tests/fixtures/pashub_rdsap_site_notes_example1.json index b5772e24..f19bea20 100644 --- 
a/datatypes/epc/surveys/tests/fixtures/pashub_rdsap_site_notes_example1.json +++ b/datatypes/epc/surveys/tests/fixtures/pashub_rdsap_site_notes_example1.json @@ -5,7 +5,7 @@ "report_reference": "49D422A9-0779-44DD-9665-464D35DFF1A8", "created_on": "2026-03-31", "date_of_inspection": "2026-03-31", - "property_address": "test" + "property_address": "1, Test Street, Test Town, Test County, TE1 1ST" }, "general": { "epc_checked_before_assessment": true, @@ -229,4 +229,4 @@ "hard_to_treat_cavity_high_exposure": false, "hard_to_treat_cavity_narrow_cavities": false } -} +} \ No newline at end of file