import os
from io import BytesIO
from typing import List

from fastapi import APIRouter, Depends
from starlette.responses import Response

from backend.app.config import get_settings
from backend.app.dependencies import validate_token
from backend.app.energy_assessments.schemas import EnergyAssessmentUploadPayload
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError, OperationalError
from backend.app.db.connection import db_engine
from backend.app.db.functions.energy_assessment_functions import (
    bulk_insert_energy_assessments,
    create_scenarios_for_documents,
    create_documents
)
from etl.xml_survey_extraction.XmlParser import XmlParser
from utils.s3 import (
    read_from_s3,
    list_files_and_subfolders_in_s3_folder,
    list_xmls_in_s3_folder,
    save_csv_to_s3,
    list_files_in_s3_folder
)
from utils.logger import setup_logger

logger = setup_logger()


def insert_energy_assessment_documents(document_list: List[dict], uprn_to_assessment_id: dict) -> None:
    """
    Attach the matching energy_assessment_id to each document dictionary, in place.

    Nothing is written to the database here; the dictionaries in ``document_list`` are mutated so that
    later steps can persist them. Documents whose UPRN has no entry in the mapping are logged and left
    without an ``energy_assessment_id`` key (they remain in the list).

    :param document_list: A list of dictionaries containing document data. Each must have a 'uprn' key.
    :param uprn_to_assessment_id: A dictionary mapping UPRN to energy_assessment_id.
    """
    for document in document_list:
        uprn = document['uprn']

        # Assign the energy_assessment_id based on uprn
        energy_assessment_id = uprn_to_assessment_id.get(uprn)
        if not energy_assessment_id:
            logger.info(f"No energy_assessment_id found for UPRN: {uprn}. Skipping document.")
            continue

        # Attach energy_assessment_id to each document
        document['energy_assessment_id'] = energy_assessment_id

    logger.info("Energy Assessment IDs assigned to documents.")


router = APIRouter(
    prefix="/energy-assessments",
    tags=["energy-assessments"],
    dependencies=[Depends(validate_token)],
    responses={404: {"description": "Not found"}}
)


def _last_path_component(path: str) -> str:
    """Return the final '/'-separated component of an S3 key, ignoring any trailing slash."""
    return path.rstrip("/").split("/")[-1]


def _files_containing(paths: List[str], marker: str) -> List[str]:
    """Return the paths whose file name (spaces removed, lower-cased) contains ``marker``."""
    return [path for path in paths if marker in path.split("/")[-1].replace(" ", "").lower()]


def _collect_scenario_documents(bucket: str, assessment: str) -> List[dict]:
    """List the site notes and draft EPC files of every scenario sub-folder of an assessment folder."""
    folders = list_files_and_subfolders_in_s3_folder(bucket_name=bucket, folder_name=assessment)

    scenario_documents = []
    for folder in folders:
        # Only folders whose own name contains the word "scenario" are scenario folders
        if "scenario" not in _last_path_component(folder).lower():
            continue

        scenario_files = list_files_in_s3_folder(bucket_name=bucket, folder_name=folder)
        notes = _files_containing(scenario_files, "sitenotes")
        # Everything that is not a site-notes file is treated as the draft EPC
        draft_epc = [file for file in scenario_files if file not in notes]

        scenario_documents.append(
            {
                "identifier": _last_path_component(folder),
                "Scenario Site Notes": notes,
                "Scenario Draft EPC": draft_epc
            }
        )
    return scenario_documents


def _collect_assessment_files(bucket: str, assessment: str) -> dict:
    """Group the S3 files of one assessment folder by document type (plus its XMLs and scenarios)."""
    docs_folder = os.path.join(assessment, "docs & plans")
    uploaded_xmls = list_xmls_in_s3_folder(bucket_name=bucket, folder_name=docs_folder)
    all_files = list_files_in_s3_folder(bucket_name=bucket, folder_name=docs_folder)

    # Remove xmls from the list of files
    other_files = [file for file in all_files if file not in uploaded_xmls]

    # The key order matters: it is the order in which document rows are later generated
    return {
        "xmls": uploaded_xmls,
        "EPR": _files_containing(other_files, "epr.pdf"),
        "Condition Report": _files_containing(other_files, "cr.pdf"),
        "Evidence Report": _files_containing(other_files, "evidence.pdf"),
        "Summary Information": _files_containing(other_files, "sn.pdf"),
        # Floor plans are just the jpgs; matched case-insensitively like every other document type
        "Floor Plan": [file for file in other_files if file.lower().endswith(".jpg")],
        "scenario_documents": _collect_scenario_documents(bucket, assessment)
    }


def _build_document_rows(uprn: int, files: dict) -> List[dict]:
    """Turn the grouped files of one property into the document rows to be stored."""
    rows = []
    for doc_type, doc_files in files.items():
        if doc_type == "xmls":
            continue

        if doc_type == "scenario_documents":
            for scenario in doc_files:
                # The folder name is used as a placeholder scenario id: a means of associating the
                # scenario documents with the correct scenario
                scenario_id = scenario["identifier"]
                for scenario_doc_type in ("Scenario Site Notes", "Scenario Draft EPC"):
                    for location in scenario[scenario_doc_type]:
                        rows.append(
                            {
                                "uprn": uprn,
                                "document_type": scenario_doc_type,
                                "document_location": location,
                                "scenario_id": scenario_id
                            }
                        )
            continue

        for location in doc_files:
            rows.append(
                {
                    "uprn": uprn,
                    "document_type": doc_type,
                    "document_location": location,
                    "scenario_id": None
                }
            )
    return rows


def _extract_xml_data(bucket: str, uprn: int, xmls: List[str], surveyor: str) -> dict:
    """Parse the XMLs of one property and merge their extracted fields into a single dictionary."""
    extracted_data = {}
    for xml in xmls:
        xml_data = read_from_s3(bucket_name=bucket, s3_file_name=xml)
        xml_parser = XmlParser(
            file=BytesIO(xml_data),
            filekey=os.path.join(f"s3://{bucket}", xml),
            uprn=uprn,
            surveyor_company=surveyor,
        )
        xml_parser.run()

        # NOTE(review): the original indentation was lost; the extraction below is assumed to apply
        # only to files the parser flags as `is_lig` - confirm against the parser's contract.
        if not xml_parser.is_lig:
            continue

        logger.info(f"Extracted data from {xml}")
        data_to_update = {
            **xml_parser.epc, **xml_parser.additional_data
        }
        # We need to update the keys to match the database schema - i.e. we should replace all hyphens
        # with underscores
        extracted_data.update({k.replace("-", "_"): v for k, v in data_to_update.items()})
    return extracted_data


@router.post("/upload")
async def upload(body: EnergyAssessmentUploadPayload):
    """
    Given a location in S3, this service will retrieve the data in S3 and perform the following:
    1) Extract the data and store it to the database
    2) Extract the links to other artefacts collected during the energy assessment, such as EPRs, floor
    plans and condition reports

    This will allow us to do the following:
    1) Present the findings of the energy assessment to the client
    2) Allow the end user to download the artefacts collected during the energy assessment

    Eventually, we will want this service to collect the key documents from the service where they're
    uploaded (e.g. Onedrive) and store them to S3, but for the moment, this is sufficient

    :param body: Upload payload; its ``surveyor`` and ``project_code`` select the S3 folder to scan.
    :return: A 200 response on success, 400 when a ValueError is raised (e.g. an assessment folder name
        that is not an integer UPRN), or 500 on database or unexpected errors.
    """
    # NOTE(review): the S3 and database helpers below are called without `await`, so they appear to be
    # synchronous and will hold the event loop for the duration of the upload - confirm this is acceptable.
    logger.info("Connecting to db")
    session = sessionmaker(bind=db_engine)()

    try:
        bucket = get_settings().ENERGY_ASSESSMENTS_BUCKET

        logger.info("Extracting energy assessment data")
        energy_assessments = list_files_and_subfolders_in_s3_folder(
            bucket_name=bucket, folder_name=f"{body.surveyor}/{body.project_code}/"
        )
        logger.info(
            f"Found {len(energy_assessments)} energy assessments for {body.surveyor} and {body.project_code}"
        )

        assessments_map = {}
        for assessment in energy_assessments:
            files = _collect_assessment_files(bucket, assessment)
            # Each assessment folder is named after the UPRN; a non-numeric name raises ValueError (-> 400)
            uprn = int(_last_path_component(assessment))
            assessments_map[uprn] = files

        logger.info("Extracted energy assessment data and storing file locations to database")
        xml_data_to_store = []
        energy_assessment_documents = []
        for uprn, files in assessments_map.items():
            # Create the rows of data to insert into the energy assessment documents
            energy_assessment_documents.extend(_build_document_rows(uprn, files))

            extracted_data = _extract_xml_data(bucket, uprn, files["xmls"], body.surveyor)
            if not extracted_data:
                # An empty row carries nothing to insert, so keep it out of the bulk insert payload
                logger.info(f"No energy assessment data extracted for UPRN: {uprn}. Skipping insert.")
                continue
            xml_data_to_store.append(extracted_data)

        logger.info("Storing energy assessment xml data to database")
        uprn_to_assessment_id = bulk_insert_energy_assessments(session, xml_data_to_store)

        # Insert energy assessment id into the documents data
        insert_energy_assessment_documents(energy_assessment_documents, uprn_to_assessment_id)

        create_scenarios_for_documents(session, energy_assessment_documents, uprn_to_assessment_id)
        create_documents(session, energy_assessment_documents)

    except IntegrityError:
        logger.error("Database integrity error occurred", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="Database integrity error.")
    except OperationalError:
        logger.error("Database operational error occurred", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="Database operational error.")
    except ValueError:
        logger.error("Value error - possibly due to malformed data", exc_info=True)
        session.rollback()
        return Response(status_code=400, content="Bad request: malformed data.")
    except Exception as e:  # General exception handling
        # Include the traceback, as the other handlers do, so unexpected failures can be diagnosed
        logger.error(f"An error occurred: {e}", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="An unexpected error occurred.")
    finally:
        # Single place where the session is released, on both the success and the error paths
        session.close()

    return Response(status_code=200)