import json
import os
import tempfile
import uuid
from datetime import datetime, timezone
from typing import Any, Union
from urllib.parse import urlparse
from uuid import UUID

import boto3
import requests
from sqlalchemy import update
from sqlmodel import select

from etl.db.db import get_db_session, init_db
from etl.fileReader.pdfReaderToText import pdfReaderToText
from etl.fileReader.sitenotes import (
    SiteNotesExtractor,
    WarmHomesConditionReport,
)
from etl.models.topLevel import uploaded_files

# Region used for the S3 client and for building the returned HTTPS URL.
_AWS_REGION = "eu-west-2"


def _s3_client():
    """Build an S3 client using boto3's default credential chain.

    SECURITY: an access key and secret used to be hard-coded in this module.
    They have been removed; credentials must now come from the environment
    (IAM execution role, AWS_* environment variables, or a shared profile).
    The previously committed key pair should be treated as leaked and rotated.
    """
    return boto3.client("s3", region_name=_AWS_REGION)


def update_uploaded_file_json_uri_by_query(
    db_session,
    file_id: Union[str, uuid.UUID],
    json_uri: str,
):
    """
    Query uploaded_files by id, update s3_json_uri and
    s3_json_upload_timestamp, commit, refresh, and return the ORM object.

    Args:
        db_session: Open database session exposing ``query``/``add``/
            ``commit``/``refresh``.
        file_id: Primary key of the row, as a UUID or its string form.
        json_uri: Location of the generated JSON to store on the row.

    Returns:
        The refreshed ``uploaded_files`` ORM object.

    Raises:
        ValueError: If no row with the given id exists.
    """
    try:
        file_id_norm = uuid.UUID(str(file_id))
    except (ValueError, AttributeError, TypeError):
        # Not parseable as a UUID: compare against the raw value instead.
        file_id_norm = file_id

    obj = (
        db_session
        .query(uploaded_files)
        .filter(uploaded_files.id == file_id_norm)
        .first()
    )
    if obj is None:
        # Without this check the attribute assignment below would fail with
        # an AttributeError on None instead of the documented ValueError.
        raise ValueError(f"uploaded_files record with id {file_id} not found")

    obj.s3_json_uri = json_uri
    obj.s3_json_upload_timestamp = datetime.now(timezone.utc)

    db_session.add(obj)
    db_session.commit()
    db_session.refresh(obj)
    return obj


def serialize_model(model: Any):
    """Recursively convert Pydantic models/lists into plain dicts.

    Objects exposing a ``dict`` attribute are expanded via ``model.dict()``,
    lists are converted element by element, and anything else is returned
    unchanged.
    """
    if hasattr(model, "dict"):
        return {k: serialize_model(v) for k, v in model.dict().items()}
    if isinstance(model, list):
        return [serialize_model(item) for item in model]
    return model


def make_final_json(
    rooms_obj,
    heating_system_obj,
    occupant,
    access_and_elevations,
    bepoke_info,
):
    """Combine the extracted report sections into one JSON-ready dict.

    The first four arguments are passed through ``serialize_model``;
    ``bepoke_info`` is stored as-is under the ``"bespoke_data"`` key. The
    parameter name keeps its original spelling so existing callers that pass
    it by keyword continue to work.

    Returns:
        A dict with keys ``rooms``, ``heating_system``, ``occupant_info``,
        ``access_and_elevations`` and ``bespoke_data``. This is a dict, not a
        JSON string; encoding is left to the caller.
    """
    return {
        "rooms": serialize_model(rooms_obj),
        "heating_system": serialize_model(heating_system_obj),
        "occupant_info": serialize_model(occupant),
        "access_and_elevations": serialize_model(access_and_elevations),
        "bespoke_data": bepoke_info,
    }


def parse_s3_uri(uri: str):
    """
    Parse an S3 URI or HTTPS S3 URL into bucket and key.

    Supports formats:
    - s3://bucket-name/path/to/file
    - https://bucket-name.s3.region.amazonaws.com/path/to/file

    Returns:
        A ``(bucket, key)`` tuple; the key has no leading slash.

    Raises:
        ValueError: If the scheme is unsupported, or an HTTP(S) host is not
            of the ``<bucket>.s3.<...>`` form.
    """
    parsed = urlparse(uri)

    if parsed.scheme == "s3":
        # s3://bucket/key
        bucket = parsed.netloc
        key = parsed.path.lstrip("/")
    elif parsed.scheme in ("http", "https"):
        # https://bucket-name.s3.region.amazonaws.com/key
        host_parts = parsed.netloc.split(".")
        if len(host_parts) >= 3 and host_parts[1] == "s3":
            bucket = host_parts[0]
        else:
            raise ValueError("Not a valid S3 HTTPS URL format")
        key = parsed.path.lstrip("/")
    else:
        raise ValueError("Unsupported URI scheme")

    return bucket, key


def download_private_s3_file(uri) -> str:
    """
    Download a private S3 file into the system temp directory.

    Credentials are resolved by boto3's default chain (see ``_s3_client``).

    Args:
        uri: ``s3://`` URI or HTTPS S3 URL of the object.

    Returns:
        The local file path: the temp directory joined with the base name of
        the object key. A file with the same name is overwritten.

    Raises:
        ValueError: If the URI cannot be parsed or its key has no file name.
    """
    bucket_name, key = parse_s3_uri(uri)

    filename = os.path.basename(key)
    if not filename:
        # A key ending in "/" (or an empty key) would make us try to write
        # onto the temp directory itself.
        raise ValueError(f"S3 URI does not point at a file: {uri}")

    file_path = os.path.join(tempfile.gettempdir(), filename)
    _s3_client().download_file(bucket_name, key, file_path)
    return file_path


def upload_json_to_s3(json_obj, dest_uri: str) -> str:
    """
    Upload a JSON-serializable object to S3 next to the object at dest_uri.

    The JSON is written to ``<folder of dest_uri key>/jsonified/<UTC
    timestamp>.json`` in the same bucket.

    Args:
        json_obj: Object accepted by ``json.dumps``.
        dest_uri: ``s3://`` URI or HTTPS S3 URL of the source document.

    Returns:
        The HTTPS-style S3 URL of the uploaded JSON (still private if the
        bucket is private).
    """
    bucket, pdf_key = parse_s3_uri(dest_uri)
    base_folder = os.path.dirname(pdf_key)  # e.g. ".../report"

    # Timezone-aware replacement for the deprecated datetime.utcnow();
    # the formatted value is identical.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

    # Avoid a leading "/" in the key when the source object is at the
    # bucket root (base_folder == "").
    prefix = f"{base_folder}/" if base_folder else ""
    json_key = f"{prefix}jsonified/{timestamp}.json"

    body = json.dumps(json_obj, ensure_ascii=False, indent=2).encode("utf-8")

    _s3_client().put_object(
        Bucket=bucket,
        Key=json_key,
        Body=body,
        ContentType="application/json",
        # Optional hardening:
        # ServerSideEncryption="AES256",
    )

    # Return an HTTPS-style S3 URL (matches the input style)
    return f"https://{bucket}.s3.{_AWS_REGION}.amazonaws.com/{json_key}"


def get_file_uri(id):
    """Return ``s3_file_uri`` of the uploaded_files row with the given id.

    Raises:
        RuntimeError: If no such row exists.
    """
    with get_db_session() as session:
        obj = (
            session
            .query(uploaded_files)
            .filter(uploaded_files.id == id)
            .first()
        )
        if obj is None:
            raise RuntimeError(
                f"Failed to find uploaded_files record with id {id}"
            )
        return obj.s3_file_uri


def handler(event, context):
    """Lambda entry point: turn each queued PDF into JSON and record its URI.

    For every entry in ``event["Records"]`` the JSON ``body`` must carry an
    ``id``. The matching file is downloaded, parsed with
    ``WarmHomesConditionReport``, serialized, uploaded next to the source
    object, and the database row is updated with the JSON location.

    Returns:
        ``None`` when every record succeeds; on any exception a dict
        ``{"statusCode": 500, "body": <message>}``. Processing stops at the
        first failing record.

    NOTE(review): the exception is swallowed and turned into a return value;
    if this runs behind an SQS trigger the invocation will presumably count
    as successful. Confirm that this is intended.
    """
    try:
        print("trying to connect to db")
        init_db()
        print("connected to db")

        for r in event.get("Records", []):
            body = json.loads(r["body"])
            id_ = body.get("id")
            if not id_:  # covers None or empty string
                raise ValueError(f"❌ Missing 'id' in SQS body: {body}")

            print(f"Retrieving file uri with id {id_}")
            file_uri = get_file_uri(id_)
            print(f"Retrieved file uri with {file_uri}")

            print("Downloading file locally for extraction...")
            # For local development, point local_path at a PDF on disk
            # instead of downloading.
            local_path = download_private_s3_file(file_uri)

            try:
                print("Extracting file...")
                reader = pdfReaderToText(local_path)
                # Pass debug=True to WarmHomesConditionReport for verbose output.
                obj = WarmHomesConditionReport(reader.text_list)

                json_ = make_final_json(
                    obj.master_obj[0],
                    obj.master_obj[1],
                    obj.master_obj[2],
                    obj.master_obj[3],
                    {},
                )
            finally:
                # The temp dir persists across warm invocations; remove the
                # downloaded file so it does not accumulate.
                try:
                    os.remove(local_path)
                except OSError:
                    pass

            print("Extracted completed, made json")
            print("uploading json to s3 bucket...")
            json_uri = upload_json_to_s3(json_, file_uri)

            print("Updating Database with json_uri")
            with get_db_session() as session:
                update_uploaded_file_json_uri_by_query(
                    session,
                    id_,
                    json_uri,
                )

            print("job completed successfully")

    except Exception as e:
        print(f"❌ Error: {e}")
        return {
            "statusCode": 500,
            "body": str(e),
        }