Model/backend/app/energy_assessments/router.py
2024-09-04 19:39:31 +01:00

273 lines
11 KiB
Python

import os
from io import BytesIO
from typing import List
from fastapi import APIRouter, Depends
from starlette.responses import Response
from backend.app.config import get_settings
from backend.app.dependencies import validate_token
from backend.app.energy_assessments.schemas import EnergyAssessmentUploadPayload
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import IntegrityError, OperationalError
from backend.app.db.connection import db_engine
from backend.app.db.functions.energy_assessment_functions import (
bulk_insert_energy_assessments, create_scenarios_for_documents, create_documents
)
from etl.xml_survey_extraction.XmlParser import XmlParser
from utils.s3 import (
read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder, save_csv_to_s3,
list_files_in_s3_folder
)
from utils.logger import setup_logger
# Module-level logger shared by the helper function and the route handler defined below.
logger = setup_logger()
def insert_energy_assessment_documents(document_list: List[dict], uprn_to_assessment_id: dict):
    """
    Tags each document dictionary, in place, with the energy_assessment_id registered for its UPRN.

    Nothing is written to the database here and nothing is returned: the dictionaries in
    ``document_list`` are mutated directly. A document whose UPRN has no (truthy) entry in the
    mapping is left untouched and a log line is emitted for it.

    :param document_list: A list of dictionaries containing document data; each must have a 'uprn' key.
    :param uprn_to_assessment_id: A dictionary mapping UPRN to energy_assessment_id.
    """
    for entry in document_list:
        property_uprn = entry['uprn']
        # Look up the assessment that was stored for this property
        assessment_id = uprn_to_assessment_id.get(property_uprn)
        if assessment_id:
            entry['energy_assessment_id'] = assessment_id
        else:
            # No assessment for this property: leave the document without an id
            logger.info(f"No energy_assessment_id found for UPRN: {property_uprn}. Skipping document.")

    logger.info("Energy Assessment IDs assigned to documents.")
# Router for the energy assessment endpoints. Every route registered on it is served under the
# "/energy-assessments" prefix, is tagged "energy-assessments", and declares validate_token as a
# dependency. A 404 "Not found" response is documented for all of its routes.
router = APIRouter(
    prefix="/energy-assessments",
    tags=["energy-assessments"],
    dependencies=[Depends(validate_token)],
    responses={404: {"description": "Not found"}}
)
def _normalised_basename(s3_key: str) -> str:
    """Return the last path segment of an S3 key, lower-cased and with spaces removed."""
    return s3_key.split("/")[-1].replace(" ", "").lower()


def _files_with_marker(files: List[str], marker: str) -> List[str]:
    """Return the files whose normalised basename contains ``marker``."""
    return [file for file in files if marker in _normalised_basename(file)]


def _collect_assessment_files(bucket_name: str, assessment: str) -> dict:
    """List the XMLs, reports, floor plans and scenario documents stored under one assessment folder."""
    docs_folder = os.path.join(assessment, "docs & plans")
    uploaded_xmls = list_xmls_in_s3_folder(bucket_name=bucket_name, folder_name=docs_folder)
    all_files = list_files_in_s3_folder(bucket_name=bucket_name, folder_name=docs_folder)
    # XMLs are parsed separately, so they are removed from the list of documents
    energy_assessment_files = [file for file in all_files if file not in uploaded_xmls]

    # We now retrieve scenarios: only the sub-folders whose own name contains the word "scenario"
    scenario_folders = list_files_and_subfolders_in_s3_folder(bucket_name=bucket_name, folder_name=assessment)
    scenario_folders = [
        folder for folder in scenario_folders if "scenario" in folder.rstrip("/").split("/")[-1].lower()
    ]
    scenario_documents = []
    for sf in scenario_folders:
        scenario_files = list_files_in_s3_folder(bucket_name=bucket_name, folder_name=sf)
        notes = _files_with_marker(scenario_files, "sitenotes")
        scenario_documents.append(
            {
                "identifier": sf.rstrip("/").split("/")[-1],
                "Scenario Site Notes": notes,
                # Whatever is not a site note is treated as a draft EPC
                "Scenario Draft EPC": [file for file in scenario_files if file not in notes],
            }
        )

    # Key order matters: the document rows are later generated by iterating this dict
    return {
        "xmls": uploaded_xmls,
        "EPR": _files_with_marker(energy_assessment_files, "epr.pdf"),
        "Condition Report": _files_with_marker(energy_assessment_files, "cr.pdf"),
        "Evidence Report": _files_with_marker(energy_assessment_files, "evidence.pdf"),
        "Summary Information": _files_with_marker(energy_assessment_files, "sn.pdf"),
        # Floor plans - these are just the jpgs
        # NOTE(review): this match is case-sensitive, unlike the PDF checks above - confirm that is intended
        "Floor Plan": [file for file in energy_assessment_files if file.endswith(".jpg")],
        "scenario_documents": scenario_documents,
    }


def _build_document_rows(uprn: int, files: dict) -> List[dict]:
    """Turn the per-assessment file listing into the document rows to be stored for one UPRN."""
    rows = []
    for doc_type, doc_files in files.items():
        if doc_type == "xmls":
            continue
        if doc_type == "scenario_documents":
            for scenario in doc_files:
                # The folder name is used as a placeholder scenario id, as a means of associating the
                # scenario documents with the correct scenario
                scenario_id = scenario["identifier"]
                for scenario_doc_type in ("Scenario Site Notes", "Scenario Draft EPC"):
                    for location in scenario[scenario_doc_type]:
                        rows.append(
                            {
                                "uprn": uprn,
                                "document_type": scenario_doc_type,
                                "document_location": location,
                                "scenario_id": scenario_id
                            }
                        )
            continue
        for location in doc_files:
            rows.append(
                {
                    "uprn": uprn,
                    "document_type": doc_type,
                    "document_location": location,
                    "scenario_id": None
                }
            )
    return rows


def _extract_xml_data(bucket_name: str, uprn: int, xmls: List[str], surveyor: str) -> dict:
    """Parse the XMLs of one assessment and merge the extracted fields into a single dictionary."""
    extracted_data = {}
    for xml in xmls:
        xml_data = read_from_s3(bucket_name=bucket_name, s3_file_name=xml)
        xml_parser = XmlParser(
            file=BytesIO(xml_data),
            filekey=os.path.join(f"s3://{bucket_name}", xml),
            uprn=uprn,
            surveyor_company=surveyor,
        )
        xml_parser.run()
        # NOTE(review): assumes only XMLs flagged is_lig carry extractable data - confirm against XmlParser
        if xml_parser.is_lig:
            logger.info(f"Extracted data from {xml}")
            data_to_update = {**xml_parser.epc, **xml_parser.additional_data}
            # We need to update the keys to match the database schema - i.e. we should replace all hyphens
            # with underscores
            extracted_data.update({k.replace("-", "_"): v for k, v in data_to_update.items()})
    # NOTE(review): this is an empty dict when no XML contributed data - confirm the bulk insert copes with that
    return extracted_data


@router.post("/upload")
async def upload(body: EnergyAssessmentUploadPayload):
    """
    Given a location in S3, this service will retrieve the data in S3 and perform the following:
    1) Extract the data and store it to the database
    2) Extract the links to other artefacts collected during the energy assessment, such as EPRs, floor plans and
    condition reports
    This will allow us to do the following:
    1) Present the findings of the energy assessment to the client
    2) Allow the end user to download the artefacts collected during the energy assessment
    Eventually, we will want this service to collect the key documents from the service where they're uploaded
    (e.g. Onedrive) and store them to S3, but for the moment, this is sufficient

    The folder scanned is "<surveyor>/<project_code>/" in the energy assessments bucket; the name of each
    sub-folder found there is converted to an integer and used as the UPRN.

    :param body: Upload payload; its surveyor and project_code attributes are read.
    :return: An empty 200 response on success; 400 when a ValueError is raised (e.g. a folder name that is not
    an integer); 500 for database integrity/operational errors and for any other exception. The session is
    rolled back on every error path and always closed.

    NOTE(review): no commit is issued here - presumably the db helper functions commit; verify.
    NOTE(review): the S3 and database calls made here are synchronous although the handler is async - confirm
    that blocking the event loop is acceptable.
    """
    logger.info("Connecting to db")
    session = sessionmaker(bind=db_engine)()
    try:
        logger.info("Extracting energy assessment data")
        # Read the bucket name once instead of on every S3 call
        bucket_name = get_settings().ENERGY_ASSESSMENTS_BUCKET
        energy_assessments = list_files_and_subfolders_in_s3_folder(
            bucket_name=bucket_name,
            folder_name=f"{body.surveyor}/{body.project_code}/"
        )
        logger.info(
            f"Found {len(energy_assessments)} energy assessments for {body.surveyor} and {body.project_code}"
        )

        assessments_map = {}
        for assessment in energy_assessments:
            files = _collect_assessment_files(bucket_name, assessment)
            # The assessment folder is named after the property's UPRN
            uprn = int(assessment.rstrip("/").split("/")[-1])
            assessments_map[uprn] = files

        logger.info("Extracted energy assessment data and storing file locations to database")
        xml_data_to_store = []
        energy_assessment_documents = []
        for uprn, files in assessments_map.items():
            energy_assessment_documents.extend(_build_document_rows(uprn, files))
            xml_data_to_store.append(_extract_xml_data(bucket_name, uprn, files["xmls"], body.surveyor))

        logger.info("Storing energy assessment xml data to database")
        uprn_to_assessment_id = bulk_insert_energy_assessments(session, xml_data_to_store)
        # Insert energy assessment id into the documents data
        insert_energy_assessment_documents(energy_assessment_documents, uprn_to_assessment_id)
        create_scenarios_for_documents(session, energy_assessment_documents, uprn_to_assessment_id)
        create_documents(session, energy_assessment_documents)
    except IntegrityError:
        logger.error("Database integrity error occurred", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="Database integrity error.")
    except OperationalError:
        logger.error("Database operational error occurred", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="Database operational error.")
    except ValueError:
        logger.error("Value error - possibly due to malformed data", exc_info=True)
        session.rollback()
        return Response(status_code=400, content="Bad request: malformed data.")
    except Exception as e:  # General exception handling
        # Keep the traceback, as the more specific handlers above do
        logger.error(f"An error occurred: {e}", exc_info=True)
        session.rollback()
        return Response(status_code=500, content="An unexpected error occurred.")
    finally:
        # Single place where the session is released, on success and on every error path
        session.close()
    return Response(status_code=200)