"""Debug helper: dump everything the PDF reader extracts from one survey PDF.

The script prints, in order:

1. the reader's detected type and the number of raw text tokens,
2. every raw token with a zero-padded index,
3. each structured section of the extracted master object,
4. every extracted window entry,
5. the whole master object serialised as indented JSON.
"""

import sys
import json
from pprint import pprint

# The project root must be on sys.path before the project import below.
sys.path.insert(0, '/workspaces/survey-extractor')

from etl.fileReader.pdfReaderToText import pdfReaderToText  # noqa: E402

# Set your PDF path here
SOURCE_PDF_PATH = "/workspaces/survey-extractor/RdSAP_SiteNote_95053636_V3_Assessment (1).pdf"

# Display title -> attribute name on the master object, in print order.
SECTION_FIELDS = (
    ("Header", "header"),
    ("General", "general"),
    ("Building Construction", "building_construction"),
    ("Roof Space", "roof_space"),
    ("Main Heating", "main_heating"),
    ("Secondary Heating", "secondary_heating"),
    ("Water Heating", "water_heating"),
    ("Ventilation", "ventilation"),
    ("Renewables", "renewables"),
    ("Room Count", "room_count"),
    ("Misc", "misc"),
    ("Customer Response", "customer_response"),
    ("Addendum", "addendum"),
)

reader = pdfReaderToText(SOURCE_PDF_PATH)

# --- Summary and raw tokens -------------------------------------------------
print(f"Detected type: {reader.type}")
print(f"Total text tokens: {len(reader.text_list)}\n")

print("--- Raw Token List ---")
for i, line in enumerate(reader.text_list):
    print(f"[{i:04d}] {repr(line)}")

# --- Structured data --------------------------------------------------------
print("\n\n--- Extracted Structured Data ---")
extractor = reader.get_reader()
obj = extractor.master_obj

# Resolve every section up front, so a missing attribute fails before any
# section is printed.
sections = [(label, getattr(obj, field)) for label, field in SECTION_FIELDS]

for title, model in sections:
    print(f"\n=== {title} ===")
    pprint(model.model_dump())

print(f"\n=== Windows ({len(obj.windows)} found) ===")
for window in obj.windows:
    pprint(window.model_dump())

# --- Full JSON --------------------------------------------------------------
print("\n\n--- Full JSON ---")
# default=str stringifies any value json cannot serialise natively.
full_dump = obj.model_dump()
print(json.dumps(full_dump, indent=2, default=str))