mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
273 lines
11 KiB
Python
273 lines
11 KiB
Python
import os
|
|
from io import BytesIO
|
|
from typing import List
|
|
|
|
from fastapi import APIRouter, Depends
|
|
from starlette.responses import Response
|
|
|
|
from backend.app.config import get_settings
|
|
from backend.app.dependencies import validate_token
|
|
from backend.app.energy_assessments.schemas import EnergyAssessmentUploadPayload
|
|
|
|
from sqlalchemy.orm import sessionmaker
|
|
from sqlalchemy.exc import IntegrityError, OperationalError
|
|
from backend.app.db.connection import db_engine
|
|
from backend.app.db.functions.energy_assessment_functions import (
|
|
bulk_insert_energy_assessments, create_scenarios_for_documents, create_documents
|
|
)
|
|
|
|
from etl.xml_survey_extraction.XmlParser import XmlParser
|
|
|
|
from utils.s3 import (
|
|
read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder, save_csv_to_s3,
|
|
list_files_in_s3_folder
|
|
)
|
|
from utils.logger import setup_logger
|
|
|
|
# Module-level logger used by the functions defined in this file.
logger = setup_logger()
|
|
|
|
|
|
def insert_energy_assessment_documents(document_list: List[dict], uprn_to_assessment_id: dict):
    """
    Attach the matching energy_assessment_id to each document dictionary, in place.

    Despite its name, this function performs no database write itself: it only sets the
    'energy_assessment_id' key on the dictionaries it is given and logs what it did.
    Documents whose UPRN has no entry in the mapping are left untouched (they keep no
    'energy_assessment_id' key) and remain in ``document_list``.

    :param document_list: A list of dictionaries containing document data. Each one must
        have a 'uprn' key; a missing key raises KeyError.
    :param uprn_to_assessment_id: A dictionary mapping UPRN to energy_assessment_id.
    :return: None. ``document_list`` is mutated.
    """
    for document in document_list:
        uprn = document['uprn']
        # Assign the energy_assessment_id based on uprn
        energy_assessment_id = uprn_to_assessment_id.get(uprn)

        # Test for None explicitly: a truthiness test would also skip a legitimate falsy id such as 0.
        if energy_assessment_id is None:
            logger.info(f"No energy_assessment_id found for UPRN: {uprn}. Skipping document.")
            continue

        # Attach energy_assessment_id to each document
        document['energy_assessment_id'] = energy_assessment_id

    logger.info("Energy Assessment IDs assigned to documents.")
|
|
|
|
|
|
# Router for the energy-assessment endpoints: routes are mounted under
# "/energy-assessments" and validate_token is declared as a router-level dependency.
router = APIRouter(
    prefix="/energy-assessments",
    dependencies=[Depends(validate_token)],
    tags=["energy-assessments"],
    responses={404: {"description": "Not found"}},
)
|
|
|
|
|
|
def _s3_join(*parts):
    """Join S3 key segments with "/" regardless of the operating system running the service."""
    # Local import: S3 keys always use "/", whereas os.path.join follows the host OS separator.
    import posixpath

    return posixpath.join(*parts)


def _basename_contains(files, marker):
    """Return the files whose base name contains ``marker`` once spaces are removed and case is ignored."""
    return [file for file in files if marker in file.split("/")[-1].replace(" ", "").lower()]


def _collect_scenario_documents(bucket, assessment):
    """List, for every "scenario" sub-folder of an assessment folder, its site notes and remaining files."""
    sub_folders = list_files_and_subfolders_in_s3_folder(bucket_name=bucket, folder_name=assessment)
    # Only folders whose own name contains the word "scenario" are scenario folders
    scenario_folders = [
        folder for folder in sub_folders if "scenario" in folder.rstrip("/").split("/")[-1].lower()
    ]

    scenario_documents = []
    for sf in scenario_folders:
        scenario_files = list_files_in_s3_folder(bucket_name=bucket, folder_name=sf)
        notes = _basename_contains(scenario_files, "sitenotes")
        scenario_documents.append(
            {
                "identifier": sf.rstrip("/").split("/")[-1],
                "Scenario Site Notes": notes,
                # Whatever is not a site-notes file is treated as the draft EPC
                "Scenario Draft EPC": [file for file in scenario_files if file not in notes],
            }
        )
    return scenario_documents


def _collect_assessment_files(bucket, assessment):
    """Classify the files found under one assessment folder into the document types stored in the database."""
    docs_folder = _s3_join(assessment, "docs & plans")
    uploaded_xmls = list_xmls_in_s3_folder(bucket_name=bucket, folder_name=docs_folder)
    all_files = list_files_in_s3_folder(bucket_name=bucket, folder_name=docs_folder)
    # Remove xmls from the list of files
    other_files = [file for file in all_files if file not in uploaded_xmls]

    return {
        "xmls": uploaded_xmls,
        "EPR": _basename_contains(other_files, "epr.pdf"),
        "Condition Report": _basename_contains(other_files, "cr.pdf"),
        "Evidence Report": _basename_contains(other_files, "evidence.pdf"),
        "Summary Information": _basename_contains(other_files, "sn.pdf"),
        # Floor plans - these are just the jpgs
        "Floor Plan": [file for file in other_files if file.endswith(".jpg")],
        "scenario_documents": _collect_scenario_documents(bucket, assessment),
    }


def _build_document_rows(uprn, files):
    """Turn one property's classified files into document rows (one dictionary per file, xmls excluded)."""
    rows = []
    for doc_type, doc_files in files.items():
        if doc_type == "xmls":
            continue

        if doc_type == "scenario_documents":
            for scenario in doc_files:
                # The folder name is a placeholder scenario id, used to associate the scenario
                # documents with the correct scenario.
                scenario_id = scenario["identifier"]
                for scenario_doc_type in ("Scenario Site Notes", "Scenario Draft EPC"):
                    rows.extend(
                        {
                            "uprn": uprn,
                            "document_type": scenario_doc_type,
                            "document_location": location,
                            "scenario_id": scenario_id,
                        }
                        for location in scenario[scenario_doc_type]
                    )
            continue

        rows.extend(
            {
                "uprn": uprn,
                "document_type": doc_type,
                "document_location": location,
                "scenario_id": None,
            }
            for location in doc_files
        )
    return rows


def _extract_xml_data(bucket, uprn, xmls, surveyor):
    """Parse a property's XML files and merge the data of those the parser flags ``is_lig`` into one dictionary."""
    extracted_data = {}
    for xml in xmls:
        xml_data = read_from_s3(bucket_name=bucket, s3_file_name=xml)
        xml_parser = XmlParser(
            file=BytesIO(xml_data),
            filekey=_s3_join(f"s3://{bucket}", xml),
            uprn=uprn,
            surveyor_company=surveyor,
        )
        xml_parser.run()
        if not xml_parser.is_lig:
            continue

        logger.info(f"Extracted data from {xml}")
        merged = {**xml_parser.epc, **xml_parser.additional_data}
        # We need to update the keys to match the database schema - i.e. we should replace all hyphens with
        # underscores
        extracted_data.update({k.replace("-", "_"): v for k, v in merged.items()})
    return extracted_data


@router.post("/upload")
async def upload(body: EnergyAssessmentUploadPayload):
    """
    Given a location in S3, this service will retrieve the data in S3 and perform the following:
    1) Extract the data and store it to the database
    2) Extract the links to other artefacts collected during the energy assessment, such as EPRs, floor plans and
    condition reports

    This will allow us to do the following:
    1) Present the findings of the energy assessment to the client
    2) Allow the end user to download the artefacts collected during the energy assessment

    Eventually, we will want this service to collect the key documents from the service where they're uploaded
    (e.g. Onedrive) and store them to S3, but for the moment, this is sufficient

    :param body: Upload payload; its ``surveyor`` and ``project_code`` select the S3 folder
        "<surveyor>/<project_code>/" that is scanned.
    :return: An empty 200 response on success; 400 when a ValueError is raised while processing (for example a
        folder name that is not an integer UPRN); 500 for database or any other errors.
    """
    # NOTE(review): the S3 and database helpers called below look synchronous while this handler is
    # declared ``async`` - confirm whether blocking the event loop here is acceptable.
    logger.info("Connecting to db")
    session = sessionmaker(bind=db_engine)()

    try:
        # Resolve the bucket once instead of calling get_settings() for every S3 request
        bucket = get_settings().ENERGY_ASSESSMENTS_BUCKET

        logger.info("Extracting energy assessment data")
        energy_assessments = list_files_and_subfolders_in_s3_folder(
            bucket_name=bucket,
            folder_name=f"{body.surveyor}/{body.project_code}/"
        )

        logger.info(
            f"Found {len(energy_assessments)} energy assessments for {body.surveyor} and {body.project_code}"
        )
        assessments_map = {}
        for assessment in energy_assessments:
            files = _collect_assessment_files(bucket, assessment)
            # The last path segment of the assessment folder is the UPRN; a non-numeric name raises ValueError
            uprn = int(assessment.rstrip("/").split("/")[-1])
            assessments_map[uprn] = files

        logger.info("Extracted energy assessment data and storing file locations to database")
        xml_data_to_store = []
        energy_assessment_documents = []
        for uprn, files in assessments_map.items():
            energy_assessment_documents.extend(_build_document_rows(uprn, files))
            # NOTE(review): this dictionary is empty when the property has no XML flagged is_lig -
            # confirm bulk_insert_energy_assessments tolerates empty rows.
            xml_data_to_store.append(_extract_xml_data(bucket, uprn, files["xmls"], body.surveyor))

        logger.info("Storing energy assessment xml data to database")
        uprn_to_assessment_id = bulk_insert_energy_assessments(session, xml_data_to_store)

        # Insert energy assessment id into the documents data
        insert_energy_assessment_documents(energy_assessment_documents, uprn_to_assessment_id)

        create_scenarios_for_documents(session, energy_assessment_documents, uprn_to_assessment_id)

        create_documents(session, energy_assessment_documents)

    except IntegrityError:
        logger.error("Database integrity error occurred", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="Database integrity error.")
    except OperationalError:
        logger.error("Database operational error occurred", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="Database operational error.")
    except ValueError:
        logger.error("Value error - possibly due to malformed data", exc_info=True)
        session.rollback()
        return Response(status_code=400, content="Bad request: malformed data.")
    except Exception as e:  # General exception handling
        # Include the traceback, as the more specific handlers above do
        logger.error(f"An error occurred: {e}", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="An unexpected error occurred.")
    finally:
        # Single place where the session is released, on both the success and the error paths
        session.close()

    return Response(status_code=200)
|