mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
245 lines
No EOL
7.9 KiB
Python
245 lines
No EOL
7.9 KiB
Python
import os
|
|
import tempfile
|
|
import requests
|
|
import boto3
|
|
from urllib.parse import urlparse
|
|
from etl.fileReader.pdfReaderToText import pdfReaderToText
|
|
from etl.fileReader.sitenotes import (
|
|
SiteNotesExtractor,
|
|
WarmHomesConditionReport
|
|
)
|
|
|
|
from uuid import UUID
|
|
import json
|
|
from typing import Any
|
|
from etl.db.db import get_db_session, init_db
|
|
from typing import Union
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from sqlmodel import select
|
|
from sqlalchemy import update
|
|
from etl.models.topLevel import uploaded_files
|
|
|
|
def update_uploaded_file_json_uri_by_query(
    db_session,
    file_id: Union[str, uuid.UUID],
    json_uri: str,
):
    """
    Record the location of the extracted JSON on an uploaded_files row.

    Looks the row up by id, sets ``s3_json_uri`` to *json_uri* and
    ``s3_json_upload_timestamp`` to the current UTC time, then commits,
    refreshes and returns the ORM object.

    Args:
        db_session: Open session providing ``query``, ``add``, ``commit``
            and ``refresh``.
        file_id: Row id as a ``uuid.UUID`` or its string form. A value that
            does not parse as a UUID is used for the lookup unchanged.
        json_uri: URI of the uploaded JSON to store on the row.

    Returns:
        The refreshed ``uploaded_files`` instance.

    Raises:
        ValueError: If no row with the given id exists.
    """
    try:
        file_id_norm = uuid.UUID(str(file_id))
    except (ValueError, AttributeError, TypeError):
        file_id_norm = file_id  # leave as-is if not a UUID

    obj = (
        db_session
        .query(uploaded_files)
        .filter(uploaded_files.id == file_id_norm)
        .first()
    )
    if obj is None:
        # Without this check a missing row surfaced as an AttributeError on
        # None instead of the ValueError this function documents.
        raise ValueError(f"No uploaded_files record found with id {file_id!r}")

    obj.s3_json_uri = json_uri
    obj.s3_json_upload_timestamp = datetime.now(timezone.utc)

    db_session.add(obj)
    db_session.commit()
    db_session.refresh(obj)
    return obj
|
|
|
|
def serialize_model(model: Any):
    """
    Recursively turn models and lists of models into plain Python data.

    Any object exposing a ``dict`` attribute is treated as a model: its
    ``dict()`` result is walked and every value is serialized in turn.
    Lists are serialized element by element. Every other value (including
    plain dicts and tuples) is returned unchanged.
    """
    # Model check comes first so an object that is both list-like and
    # model-like is handled as a model.
    if hasattr(model, "dict"):
        plain = {}
        for field_name, field_value in model.dict().items():
            plain[field_name] = serialize_model(field_value)
        return plain

    if isinstance(model, list):
        return list(map(serialize_model, model))

    return model
|
|
|
|
def make_final_json(rooms_obj, heating_system_obj, occupant, access_and_elevations, bepoke_info):
    """
    Combine the extracted report sections into one JSON-ready dict.

    The first four arguments are passed through ``serialize_model``;
    ``bepoke_info`` is stored as given. The result is a dict (not a JSON
    string) with the keys ``rooms``, ``heating_system``, ``occupant_info``,
    ``access_and_elevations`` and ``bespoke_data``.
    """
    return {
        "rooms": serialize_model(rooms_obj),
        "heating_system": serialize_model(heating_system_obj),
        "occupant_info": serialize_model(occupant),
        "access_and_elevations": serialize_model(access_and_elevations),
        "bespoke_data": bepoke_info,
    }
|
|
|
|
def parse_s3_uri(uri: str):
    """
    Parse an S3 URI or HTTPS S3 URL into bucket and key.

    Supports formats:
    - s3://bucket-name/path/to/file
    - https://bucket-name.s3.region.amazonaws.com/path/to/file
      (virtual-hosted style; the bucket name may itself contain dots)

    For ``s3://`` URIs, any query string or fragment that ``urlparse``
    splits off is re-attached, because ``?`` and ``#`` are ordinary key
    characters there. For HTTP(S) URLs only the URL path is used as the key.
    Leading slashes are stripped from the key in both forms.

    NOTE(review): percent-encoded characters in HTTP(S) paths are not
    decoded here - confirm stored URLs are not percent-encoded.

    Args:
        uri: The S3 URI or virtual-hosted HTTP(S) URL to parse.

    Returns:
        A ``(bucket, key)`` tuple of strings.

    Raises:
        ValueError: If the scheme is unsupported, the HTTP(S) host is not in
            ``<bucket>.s3.<...>`` form, or the bucket name is empty.
    """
    parsed = urlparse(uri)

    if parsed.scheme == "s3":
        # s3://bucket/key
        bucket = parsed.netloc
        raw_key = parsed.path
        if parsed.query:
            raw_key += "?" + parsed.query
        if parsed.fragment:
            raw_key += "#" + parsed.fragment
    elif parsed.scheme in ("http", "https"):
        # https://bucket-name.s3.region.amazonaws.com/key
        host_parts = parsed.netloc.split(".")
        try:
            # The first "s3" label that has at least one label on each side
            # separates the bucket name from the endpoint.
            s3_index = host_parts.index("s3", 1, len(host_parts) - 1)
        except ValueError:
            raise ValueError("Not a valid S3 HTTPS URL format") from None
        bucket = ".".join(host_parts[:s3_index])
        raw_key = parsed.path
    else:
        raise ValueError("Unsupported URI scheme")

    if not bucket:
        raise ValueError("S3 URI is missing a bucket name")

    return bucket, raw_key.lstrip("/")
|
|
|
|
|
|
def download_private_s3_file(uri) -> str:
    """
    Download a private S3 object into the local temp directory.

    The object is saved under its base file name inside
    ``tempfile.gettempdir()``; an existing file of the same name is
    overwritten. The caller is responsible for deleting the file.

    Credentials are resolved by boto3's default provider chain (environment
    variables, shared config, or an attached IAM role) rather than being
    embedded in the source.

    Args:
        uri: ``s3://`` URI or virtual-hosted HTTPS URL of the object.

    Returns:
        The local path the object was written to.

    Raises:
        ValueError: If the URI cannot be parsed or its key has no file name
            (for example it ends with ``/``).
    """
    bucket_name, key = parse_s3_uri(uri)

    # Where to store the file locally
    filename = os.path.basename(key)
    if not filename:
        # Otherwise the target path would be the temp directory itself.
        raise ValueError(f"S3 URI does not point to a file: {uri}")
    file_path = os.path.join(tempfile.gettempdir(), filename)

    # SECURITY: no access key or secret in code. Any key that was previously
    # committed here should be treated as compromised and rotated.
    s3 = boto3.client("s3", region_name="eu-west-2")

    # Download file
    s3.download_file(bucket_name, key, file_path)

    return file_path
|
|
|
|
def upload_json_to_s3(json_obj, dest_uri: str) -> str:
    """
    Upload a JSON-serializable object to S3 next to an existing object.

    *dest_uri* identifies an existing object (s3:// or virtual-hosted HTTPS
    form). The JSON is written to ``<folder of that object>/jsonified/``
    under a UTC timestamp file name (``YYYYMMDD_HHMMSS.json``), so two
    uploads to the same folder within one second use the same key.

    Credentials are resolved by boto3's default provider chain (environment
    variables, shared config, or an attached IAM role) rather than being
    embedded in the source.

    Args:
        json_obj: Object accepted by ``json.dumps``.
        dest_uri: URI of the object whose folder receives the JSON.

    Returns:
        The HTTPS-style S3 URL of the uploaded JSON (still private if the
        bucket is private).
    """
    bucket, pdf_key = parse_s3_uri(dest_uri)
    base_folder = os.path.dirname(pdf_key)  # e.g. ".../report"

    # Build jsonified folder + timestamp filename. Timezone-aware "now"
    # replaces the deprecated naive datetime.utcnow(); the formatted value
    # is the same.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    relative_key = f"jsonified/{timestamp}.json"
    # An object at the bucket root has no folder; avoid a key that starts
    # with "/" (parse_s3_uri would strip it when the returned URL is read back).
    json_key = f"{base_folder}/{relative_key}" if base_folder else relative_key

    aws_region = "eu-west-2"

    # SECURITY: no access key or secret in code. Any key that was previously
    # committed here should be treated as compromised and rotated.
    s3 = boto3.client("s3", region_name=aws_region)

    body = json.dumps(json_obj, ensure_ascii=False, indent=2).encode("utf-8")

    s3.put_object(
        Bucket=bucket,
        Key=json_key,
        Body=body,
        ContentType="application/json",
        # Optional hardening:
        # ServerSideEncryption="AES256",
    )

    # Return an HTTPS-style S3 URL (matches the input style)
    return f"https://{bucket}.s3.{aws_region}.amazonaws.com/{json_key}"
|
|
|
|
def get_file_uri(id):
    """
    Return the ``s3_file_uri`` of the uploaded_files row with the given id.

    Opens its own database session for the lookup.

    Raises:
        RuntimeError: If no row with that id exists.
    """
    with get_db_session() as session:
        lookup = session.query(uploaded_files).filter(uploaded_files.id == id)
        record = lookup.first()
        if record is not None:
            # Read the attribute while the session is still open.
            return record.s3_file_uri
        raise RuntimeError(f"Failed to find uploaded_files record with id {id}")
|
|
|
|
|
|
def handler(event, context):
    """
    Entry point: extract each queued survey PDF to JSON and record the result.

    For every entry in ``event["Records"]`` the JSON ``body`` must carry an
    ``id``. The matching uploaded_files row gives the PDF location; the PDF
    is downloaded, parsed into a report, serialized, uploaded back to S3 as
    JSON, and the row is updated with the JSON URI. The downloaded PDF is
    removed from local storage once the record has been handled.

    Args:
        event: Invocation payload; only the ``Records`` list is read.
        context: Invocation context (unused).

    Returns:
        ``None`` when every record succeeds. On the first failure, processing
        stops and ``{"statusCode": 500, "body": <error text>}`` is returned.

    NOTE(review): the error path returns normally instead of raising. If this
    runs behind an SQS trigger, a normal return presumably marks the batch as
    processed so failed messages are not retried - confirm this is intended.
    """
    try:
        print("trying to connect to db")
        init_db()
        print("connected to db")
        for r in event.get("Records", []):
            body = json.loads(r["body"])
            id_ = body.get("id")
            if not id_:  # covers None or empty string
                raise ValueError(f"❌ Missing 'id' in SQS body: {body}")

            print(f"Retrieving file uri with id {id_}")
            file_uri = get_file_uri(id_)
            print(f"Retrieved file uri with {file_uri}")

            print("Downloading file locally for extraction...")
            # For local development, point local_path at a PDF on disk
            # instead of downloading.
            local_path = download_private_s3_file(file_uri)

            try:
                print("Extracting file...")
                reader = pdfReaderToText(local_path)
                obj = WarmHomesConditionReport(reader.text_list)
                json_ = make_final_json(
                    obj.master_obj[0],
                    obj.master_obj[1],
                    obj.master_obj[2],
                    obj.master_obj[3],
                    {},
                )
                print("Extracted completed, made json")

                print("uploading json to s3 bucket...")
                json_uri = upload_json_to_s3(json_, file_uri)

                print("Updating Database with json_uri")
                with get_db_session() as session:
                    update_uploaded_file_json_uri_by_query(
                        session,
                        id_,
                        json_uri,
                    )
                print("job completed successfully")
            finally:
                # The download lives in the temp directory; without cleanup,
                # files pile up whenever the process is reused between
                # invocations. Best-effort: a failed delete must not mask
                # the real outcome of the record.
                try:
                    os.remove(local_path)
                except OSError as cleanup_error:
                    print(f"Could not remove temporary file {local_path}: {cleanup_error}")

    except Exception as e:
        print(f"❌ Error: {e}")
        return {
            "statusCode": 500,
            "body": str(e),
        }