diff --git a/backend/documents_parser/__init__.py b/backend/documents_parser/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/documents_parser/extractor.py b/backend/documents_parser/extractor.py
new file mode 100644
index 00000000..268c86de
--- /dev/null
+++ b/backend/documents_parser/extractor.py
@@ -0,0 +1,13 @@
+from datatypes.epc.surveys.pashub_rdsap_site_notes import PasHubRdSapSiteNotes
+
+
+class PasHubRdSapSiteNotesExtractor:
+    """Turn a list of text lines into a PasHubRdSapSiteNotes object."""
+
+    def __init__(self, text_list: list[str]) -> None:
+        # Stored as given; nothing is parsed until extract() is called.
+        self.text_list = text_list
+
+    def extract(self) -> PasHubRdSapSiteNotes:
+        """Return the parsed site notes. Not implemented yet: always raises."""
+        raise NotImplementedError
diff --git a/backend/documents_parser/handler/handler.py b/backend/documents_parser/handler/handler.py
new file mode 100644
index 00000000..321a4a10
--- /dev/null
+++ b/backend/documents_parser/handler/handler.py
@@ -0,0 +1,141 @@
+import json
+import os
+import time
+from typing import Any, List, Mapping, Optional
+
+import boto3
+
+from utils.logger import setup_logger
+from utils.s3 import upload_file_to_s3
+
+logger = setup_logger()
+
+BUCKET = "retrofit-energy-assessments-dev"
+PDF_S3_KEY = "example/SiteNotesExample.pdf"
+# Sample PDF: two directories above this file, under tests/test_data.
+PDF_LOCAL_PATH = os.path.join(
+    os.path.dirname(__file__),
+    "..",
+    "..",
+    "tests",
+    "test_data",
+    "SiteNotesExample.pdf",
+)
+
+
+def upload_pdf(local_path: str, bucket: str, key: str) -> None:
+    """Upload local_path to s3://bucket/key via upload_file_to_s3."""
+    logger.info(f"Uploading {local_path} to s3://{bucket}/{key}")
+    upload_file_to_s3(local_path, bucket, key)
+    logger.info("Upload complete")
+
+
+def start_textract_job(bucket: str, key: str) -> str:
+    """Start a Textract FORMS analysis of an S3 object and return its JobId."""
+    client = boto3.client("textract")
+    response = client.start_document_analysis(
+        DocumentLocation={"S3Object": {"Bucket": bucket, "Name": key}},
+        FeatureTypes=["FORMS"],
+    )
+    job_id: str = response["JobId"]
+    logger.info(f"Started Textract job {job_id}")
+    return job_id
+
+
+def wait_for_job(
+    job_id: str,
+    poll_interval_seconds: int = 5,
+    timeout_seconds: Optional[float] = None,
+) -> None:
+    """Block until the Textract job reports SUCCEEDED.
+
+    Polls get_document_analysis every poll_interval_seconds while the job is
+    IN_PROGRESS. timeout_seconds=None (the default) keeps waiting indefinitely.
+
+    Raises:
+        RuntimeError: the job ended in any status other than SUCCEEDED.
+        TimeoutError: timeout_seconds elapsed while the job was still running.
+    """
+    client = boto3.client("textract")
+    deadline = None if timeout_seconds is None else time.monotonic() + timeout_seconds
+    logger.info(f"Polling Textract job {job_id}...")
+    while True:
+        response = client.get_document_analysis(JobId=job_id, MaxResults=1)
+        status = response["JobStatus"]
+        logger.info(f"Status: {status}")
+        if status == "SUCCEEDED":
+            return
+        if status == "FAILED":
+            raise RuntimeError(
+                f"Textract job {job_id} failed: {response.get('StatusMessage')}"
+            )
+        if status != "IN_PROGRESS":
+            # Any other status (e.g. PARTIAL_SUCCESS) is terminal; polling again
+            # would never change it, so stop instead of looping forever.
+            raise RuntimeError(
+                f"Textract job {job_id} ended with status {status}: "
+                f"{response.get('StatusMessage')}"
+            )
+        if deadline is not None and time.monotonic() >= deadline:
+            raise TimeoutError(
+                f"Textract job {job_id} still {status} after {timeout_seconds}s"
+            )
+        time.sleep(poll_interval_seconds)
+
+
+def collect_blocks(job_id: str) -> List[Any]:
+    """Return all Blocks of a Textract job, following NextToken pagination."""
+    client = boto3.client("textract")
+    blocks: List[Any] = []
+    next_token = None
+
+    while True:
+        kwargs: dict = {"JobId": job_id, "MaxResults": 1000}
+        if next_token:
+            kwargs["NextToken"] = next_token
+
+        response = client.get_document_analysis(**kwargs)
+        blocks.extend(response.get("Blocks", []))
+
+        next_token = response.get("NextToken")
+        if not next_token:
+            break
+
+    logger.info(f"Collected {len(blocks)} blocks")
+    return blocks
+
+
+def save_blocks(blocks: List[Any], output_path: str) -> None:
+    """Write blocks to output_path as indented JSON.
+
+    Values json cannot serialise natively are written via str() (default=str).
+    """
+    # Explicit encoding so the file does not depend on the platform default.
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(blocks, f, indent=2, default=str)
+    logger.info(f"Saved blocks to {output_path}")
+
+
+def handler(event: Mapping[str, Any], context: Any) -> None:
+    """Upload the sample PDF, run Textract on it and save the blocks as JSON.
+
+    event and context are accepted but not read.
+    """
+    logger.info("Entered handler")
+
+    # NOTE(review): this writes next to the package source; if this runs as an
+    # AWS Lambda only /tmp is writable - confirm the intended runtime.
+    output_path = os.path.join(os.path.dirname(__file__), "..", "textract_blocks.json")
+
+    upload_pdf(PDF_LOCAL_PATH, BUCKET, PDF_S3_KEY)
+
+    job_id = start_textract_job(BUCKET, PDF_S3_KEY)
+    wait_for_job(job_id)
+    blocks = collect_blocks(job_id)
+    save_blocks(blocks, output_path)
+
+    logger.info("Done")
+
+
+if __name__ == "__main__":
+    handler({}, None)
diff --git a/backend/documents_parser/tests/__init__.py b/backend/documents_parser/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/documents_parser/tests/fixtures/site_notes_example_text.json b/backend/documents_parser/tests/fixtures/site_notes_example_text.json
new file mode 100644
index 00000000..17d0265a
--- /dev/null
+++ b/backend/documents_parser/tests/fixtures/site_notes_example_text.json
@@ -0,0 +1,623 @@
+[
+  "",
+  "Photo of electricity meter:",
+  "Photo of electricity
meter:", + "Photo of electricity meter:", + "Photo of electricity meter:", + "RdSAP Assessment", + "General", + "Confirm you have checked for the existence of an", + "EPC before carrying out another energy assessment.", + "Yes", + "Does an EPC exist at the point of carrying out this", + "energy assessment?", + "No", + "Inspection Date:", + "25/09/2025", + "Transaction Type:", + "Grant-Scheme (ECO, RHI, etc.)", + "Tenure:", + "Rented Social", + "Type of Property:", + "House", + "Detachment Type:", + "Mid-terrace", + "Number of storeys:", + "2 Storeys", + "Terrain Type:", + "Suburban", + "Number of Extensions:", + "1 Extension", + "Is an electricity smart meter present?", + "Yes", + "Electric meter type:", + "Single", + "Is the dwelling export-capable?", + "Yes", + "Is mains gas available?", + "Yes", + "Is there a gas smart meter?", + "Yes", + "Is the gas meter accessible?", + "Yes", + "Page 2", + "", + "Photo of Gas Meter:", + "External indicators of Cavity Wall Construction:", + "External indicators of Cavity Wall Construction:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Select Measurements Location:", + "Internal", + "Building Construction", + "Main Building", + "Age Range:", + "1950-1966", + "Record indicators of property age:", + "local knowledge, enquiries of owner", + "Walls - Construction Type:", + "Cavity", + "Record external indicators of Cavity Construction:", + "wall thickness over 270 mm", + "Walls - Insulation Type:", + "Filled Cavity", + "Record indicators of filled cavity:", + "evidence of cavity fill drill holes", + "Page 3", + "", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + 
"Page 4", + "", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Photo indicators of filled cavity insulation:", + "Thermal conductivity of wall insulation:", + "Unknown", + "Wall U-Value known?", + "Not Known", + "Wall thickness:", + "310 mm", + "Page 5", + "", + "Photo wall thickness:", + "External indicators of Cavity Wall Construction:", + "Party wall construction type:", + "Cavity Masonry, Filled", + "Floor type:", + "Ground Floor", + "Floor Construction:", + "Solid", + "Floor Insulation Type:", + "As Built", + "Floor U-Value known?", + "Not Known", + "Extension 1", + "Age Range:", + "2003-2006", + "Record indicators of property age:", + "local knowledge, enquiries of owner", + "Walls - Construction Type:", + "Cavity", + "Record external indicators of Cavity Construction:", + "wall thickness over 270 mm", + "Walls - Insulation Type:", + "As built", + "Thermal conductivity of wall insulation:", + "Unknown", + "Wall U-Value known?", + "Not Known", + "Wall thickness:", + "310 mm", + "Page 6", + "", + "Photo wall thickness:", + "Party wall construction type:", + "Cavity Masonry, Filled", + "Floor type:", + "Ground Floor", + "Floor Construction:", + "Solid", + "Floor Insulation Type:", + "As Built", + "Floor U-Value known?", + "Not Known", + "Building Measurements", + "Area (m2)", + "Height (m)", + "Heat Loss Perimeter (m)", + "PWL (m)", + "Main Building", + "Floor 1", + "35.68", + "2.19", + "13.44", + "10.62", + "Floor 0", + "35.68", + "2.17", + "11", + "10.62", + "Extension 1", + "Floor 0", + "3.8", + "2", + "5.7", + "0", + "Roof Space", + "Main Building", + "Roofs - Construction Type:", + "Pitched roof (Slates or tiles), Access to loft", + "Roofs - Insulation At:", + "Joists", + "Roof U-Value:", + "Not Known", + "Roofs - Insulation Thickness:", + "100 mm", + "Page 7", + "", + "Loft insulation:", + "Loft insulation:", + "Loft insulation:", + "Loft 
insulation:", + "Loft insulation:", + "Page 8", + "", + "Loft insulation:", + "Loft insulation:", + "Loft insulation:", + "Loft insulation:", + "Loft insulation:", + "Loft insulation:", + "Page 9", + "", + "Loft insulation:", + "Indicators of Cavity Wall Construction in roof space:", + "Indicators of Cavity Wall Construction in roof space:", + "Record indicators of party wall construction in roof space:", + "Record indicators of party wall construction in roof space:", + "Record indicators of Cavity Wall Construction in roof", + "space:", + "cavity visible in roof space", + "Are there rooms in the roof?", + "No", + "Extension 1", + "Roofs - Construction Type:", + "Pitched roof, Sloping ceiling", + "Page 10", + "", + "Photo of glazing type:", + "Photo of glazing type:", + "Roofs - Insulation At:", + "Sloping ceiling insulation", + "Roof U-Value:", + "Not Known", + "Roofs - Insulation Thickness:", + "As built", + "Record indicators of Cavity Wall Construction in roof", + "space:", + "No indicator of construction visible", + "Are there rooms in the roof?", + "No", + "Windows", + "Window 1", + "Window location:", + "Main Building", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "1.2 m", + "Window width:", + "2.3 m", + "Orientation:", + "North West", + "Window 2", + "Window location:", + "Main Building", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Page 11", + "", + "Photo of glazing type:", + 
"Photo of glazing type:", + "Photo of glazing type:", + "Photo of glazing type:", + "Window height:", + "1.2 m", + "Window width:", + "1 m", + "Orientation:", + "North West", + "Window 3", + "Window location:", + "Main Building", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "0.9 m", + "Window width:", + "1 m", + "Orientation:", + "North East", + "Window 4", + "Window location:", + "Extension 1", + "Window wall type:", + "External wall", + "Page 12", + "", + "Photo of glazing type:", + "Photo of glazing type:", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "0.9 m", + "Window width:", + "1 m", + "Orientation:", + "North", + "Window 5", + "Window location:", + "Extension 1", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "0.9 m", + "Window width:", + "1.7 m", + "Orientation:", + "North East", + "Page 13", + "", + "Photo of glazing type:", + "Photo of glazing type:", + "Photo of glazing type:", + "Photo of glazing type:", + "Window 6", + "Window location:", + "Extension 1", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame 
type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "0.9 m", + "Window width:", + "2.3 m", + "Orientation:", + "North West", + "Window 7", + "Window location:", + "Extension 1", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "Page 14", + "", + "Photo of glazing type:", + "Photo of glazing type:", + "Photo of glazing type:", + "Photo of glazing type:", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "1 m", + "Window width:", + "1.2 m", + "Orientation:", + "North West", + "Window 8", + "Window location:", + "Extension 1", + "Window wall type:", + "External wall", + "Glazing Type:", + "Double glazing, Unknown install date", + "Window type:", + "Window", + "Window frame type:", + "Wooden or PVC", + "What size is the glazing gap?", + "16 mm or more", + "Is the window draught proofed?", + "Yes", + "Are there permanent shutters present?", + "No", + "Window height:", + "0.9 m", + "Window width:", + "1 m", + "Orientation:", + "North East", + "Page 15", + "", + "Photo of heating system:", + "Heating & Hot Water", + "Main Heating Systems", + "Main Heating 1", + "How would you like to select the Heating System?", + "PCDF Search", + "System type:", + "Boiler with radiators or underfloor heating", + "Product Id", + "16839", + "Manufacturer", + "Vaillant", + "Model", + "ecoTEC pro 28", + "Orig Manuf", + "Vaillant", + "Fuel", + "Mains gas", + "S. 
Efficiency", + "0", + "Type", + "Combi", + "Condensing", + "Yes", + "Year", + "2005 - 2015", + "Mount", + "Wall", + "Open Flue", + "Room-sealed", + "Fan Assist", + "Yes", + "Status", + "Normal status for an actual product", + "Central heating pump age:", + "Unknown", + "Controls:", + "Programmer, room thermostat and TRVs", + "Does the boiler have a Flue Gas Heat Recover", + "System (FGHRS)?", + "No", + "Is there a weather compensator?", + "No", + "Emitter:", + "Radiators", + "Emitter Temperature:", + "Unknown", + "Page 16", + "", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating system:", + "Page 17", + "", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating system:", + "Photo of heating controls:", + "Photo of heating controls:", + "Secondary Heating System", + "Secondary Fuel", + "No Secondary Heating", + "Water Heating & Cylinder", + "Water Heating Type:", + "Regular", + "Water Heating System:", + "From main heating 1", + "Cylinder Size:", + "No Cylinder", + "Ventilation", + "Ventilation type:", + "Mechanical Extract - Decentralised", + "Page 18", + "", + "Photo of ventilation type:", + "Has fixed air conditioning?", + "No", + "Is the ventilation in the PCDF database?", + "No", + "Number of open flues:", + "0", + "Number of closed flues:", + "0", + "Number of boiler flues:", + "0", + "Number of other flues:", + "0", + "Number of extract fans:", + "0", + "Number of passive vents:", + "0", + "Number of flueless gas fires:", + "0", + "Pressure test:", + "No test", + "Is there a draught lobby?", + "No", + "Conservatories", + "Is there conservatory?", + "No conservatory", + "Page 19", + "", + "Photo of incandescent bulbs:", + "Photo of incandescent bulbs:", + "Renewables", + "Wind Turbines", + "Has wind turbines?", + "No", + "Solar hot water", + "Has solar hot water?", + "No", + 
"Photovoltaics", + "Has photovoltaic array?", + "No", + "Number of PV batteries:", + "None", + "Hydro", + "Is the dwelling connected to Hydro?", + "No", + "Room Count Elements", + "Number of habitable rooms?", + "3", + "Are any of these rooms unheated?", + "No", + "Number of external doors?", + "2", + "Number of insulated external doors?", + "0", + "Number of draughtproofed external doors?", + "2", + "Number of open chimneys?", + "0", + "Number of blocked chimneys?", + "0", + "Number of fixed incandescent bulbs:", + "4", + "Page 20", + "", + "Photo of incandescent bulbs:", + "Photo of incandescent bulbs:", + "Photo of CFL bulbs:", + "Is the exact number of LED and CFL bulbs known?", + "Yes", + "Number of fixed LED bulbs:", + "0", + "Number of fixed CFL bulbs:", + "1", + "Are there any waste water heat recovery systems?", + "None", + "Number of baths:", + "1", + "How many special features are there at the", + "property?", + "0", + "Showers", + "Shower 1", + "Shower outlet type:", + "Non-Electric Shower", + "Page 21", + "", + "Photo of shower:", + "Photo of shower:", + "General Photos:", + "Customer Response", + "Customer present?", + "Yes", + "Customer willing to answer satisfaction survey?", + "No", + "Addendum + Related Party Disclosure", + "Addendum", + "None", + "Related party disclosure", + "No related party", + "Hard to treat cavity walls: Property has access", + "issues?", + "No", + "Hard to treat cavity walls: Property has high", + "exposure?", + "No", + "Hard to treat cavity walls: Property has narrow", + "cavities?", + "No", + "Photographs Required", + "Page 22", + "", + "External Elevations:", + "External Elevations:", + "External Elevations:", + "External Elevations:", + "External Elevations:", + "External Elevations:", + "External Elevations:", + "Page 23", + "", + "Page 24", + "", + "Page 25", + "", + "Page 26", + "", + "Page 27", + "" +] \ No newline at end of file diff --git a/backend/documents_parser/tests/test_extractor.py 
b/backend/documents_parser/tests/test_extractor.py
new file mode 100644
index 00000000..dea3ab92
--- /dev/null
+++ b/backend/documents_parser/tests/test_extractor.py
@@ -0,0 +1,66 @@
+import json
+import os
+
+import pytest
+
+from backend.documents_parser.extractor import PasHubRdSapSiteNotesExtractor
+from datatypes.epc.surveys.pashub_rdsap_site_notes import General, PasHubRdSapSiteNotes
+
+FIXTURES = os.path.join(os.path.dirname(__file__), "fixtures")
+
+
+def load_text_fixture() -> list[str]:
+    """Load fixtures/site_notes_example_text.json, a JSON array of text lines."""
+    # Explicit encoding so the read does not depend on the platform default.
+    with open(
+        os.path.join(FIXTURES, "site_notes_example_text.json"), encoding="utf-8"
+    ) as f:
+        return json.load(f)
+
+
+class TestGeneral:
+    """Expected `general` values extracted from the example site-notes text."""
+
+    # NOTE(review): PasHubRdSapSiteNotesExtractor.extract() currently raises
+    # NotImplementedError, so every test here errors in this fixture until the
+    # extractor is implemented.
+    # Class scope: the fixture is read and extracted once for the whole class.
+    @pytest.fixture(scope="class")
+    def general(self) -> General:
+        return PasHubRdSapSiteNotesExtractor(load_text_fixture()).extract().general
+
+    def test_epc_checked_before_assessment(self, general: General) -> None:
+        assert general.epc_checked_before_assessment is True
+
+    def test_epc_exists_at_point_of_assessment(self, general: General) -> None:
+        assert general.epc_exists_at_point_of_assessment is False
+
+    def test_inspection_date(self, general: General) -> None:
+        assert general.inspection_date == "2025-09-25"
+
+    def test_transaction_type(self, general: General) -> None:
+        assert general.transaction_type == "Grant-Scheme (ECO, RHI, etc.)"
+
+    def test_tenure(self, general: General) -> None:
+        assert general.tenure == "Rented Social"
+
+    def test_property_type(self, general: General) -> None:
+        assert general.property_type == "House"
+
+    def test_detachment_type(self, general: General) -> None:
+        assert general.detachment_type == "Mid-terrace"
+
+    def test_number_of_storeys(self, general: General) -> None:
+        assert general.number_of_storeys == 2
+
+    def test_number_of_extensions(self, general: General) -> None:
+        assert general.number_of_extensions == 1
+
+    def test_electricity_smart_meter(self, general: General) -> None:
+        assert general.electricity_smart_meter is True
+
+    def test_mains_gas_available(self, general: General) -> None:
+        assert general.mains_gas_available is True
+
+    def test_measurements_location(self, general: General) -> None:
+        assert general.measurements_location == "Internal"
diff --git a/pytest.ini b/pytest.ini
index 55c2873a..6cb3b611 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -3,6 +3,6 @@ pythonpath = .
 log_cli = true
 log_cli_level = INFO
 addopts = --cov-report term-missing --cov=etl/epc --cov=recommendations --cov=backend --cov=etl/epc_clean --cov=etl/spatial
-testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests
+testpaths = recommendations/tests backend/tests etl/epc/tests etl/epc_clean/tests etl/spatial/tests backend/condition/tests backend/address2UPRN/tests backend/onboarders/tests backend/categorisation/tests backend/export/tests etl/hubspot/tests backend/hubspot_trigger_orchestrator/tests datatypes/epc/schema/tests datatypes/epc/surveys/tests datatypes/epc/domain/tests backend/documents_parser/tests
 markers =
     integration: mark a test as an integration test