From 92564be655263f74a8ea9c5cc5d0309edce265ee Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 4 Sep 2024 10:26:39 +0100 Subject: [PATCH] migrating xml extraction to new router --- backend/app/energy_assessments/router.py | 32 +++++++++++++++++++++-- backend/app/energy_assessments/schemas.py | 7 +++-- etl/xml_survey_extraction/app.py | 2 +- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/backend/app/energy_assessments/router.py b/backend/app/energy_assessments/router.py index ec49c1c1..c4e0308b 100644 --- a/backend/app/energy_assessments/router.py +++ b/backend/app/energy_assessments/router.py @@ -10,6 +10,16 @@ from backend.app.energy_assessments.schemas import EnergyAssessmentUploadPayload from sqlalchemy.exc import IntegrityError, OperationalError from sqlalchemy.orm import sessionmaker +from backend.app.db.functions.energy_assessment_functions import bulk_insert_energy_assessments +from sqlalchemy.orm import sessionmaker +from backend.app.db.connection import db_engine +from utils.s3 import read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder, save_csv_to_s3 +from utils.logger import setup_logger +from etl.xml_survey_extraction.XmlParser import XmlParser +import os +import pandas as pd +from io import BytesIO + from utils.logger import setup_logger logger = setup_logger() @@ -37,14 +47,32 @@ async def upload(body: EnergyAssessmentUploadPayload): Eventually, we will this service to collect the key documents from the service where they're uploaded (e.g. Onedrive) and store them to S3, but for the moment, this is sufficient - # TODO - Holding up on implementing this """ logger.info("Connecting to db") session = sessionmaker(bind=db_engine)() try: - logger.info("Uploading energy assessment data") + logger.info("Extracting energy assessment data") + energy_assessments = list_files_and_subfolders_in_s3_folder( + bucket_name=get_settings().ENERGY_ASSESSMENTS_BUCKET, + folder_name=f"{body.surveyor}/{body.project_code}/" + ) + + logger.info( + f"Found {len(energy_assessments)} energy assessments for {body.surveyor} and {body.project_code}" + ) + assessments_map = {} + for assessment in energy_assessments: + uploaded_xmls = list_xmls_in_s3_folder( + bucket_name=get_settings().ENERGY_ASSESSMENTS_BUCKET, + folder_name=os.path.join(assessment, "docs & plans") + ) + uprn = int(assessment.rstrip("/").split("/")[-1]) + assessments_map[uprn] = uploaded_xmls + + logger.info(f"Exatracted XMLS for the energy assessments") + except IntegrityError: logger.error("Database integrity error occurred", exc_info=True) session.rollback() diff --git a/backend/app/energy_assessments/schemas.py b/backend/app/energy_assessments/schemas.py index 83a9a44e..cfee76ff 100644 --- a/backend/app/energy_assessments/schemas.py +++ b/backend/app/energy_assessments/schemas.py @@ -3,5 +3,8 @@ from pydantic import BaseModel class EnergyAssessmentUploadPayload(BaseModel): portfolio_id: int - # This is the s3 location, where the informaton collected during the energy assessment is stored - s3_filepath: str + # This is the energy assessment company/individual that conducted the energy assessment, where the data is uploaded + # against + surveyor: str + # is a code, like VEC001, which is used to identify the project and also where the data is uploaded against + project_code: str diff --git a/etl/xml_survey_extraction/app.py b/etl/xml_survey_extraction/app.py index 5c09b7bf..ffe6274c 100644 --- a/etl/xml_survey_extraction/app.py +++ b/etl/xml_survey_extraction/app.py @@ -15,7 +15,7 @@ USER_ID = 8 non_invasive_recommendations_filepath = "{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" SCENARIOS = { 101: { - "project_code": "VDE001", + "project_code": "VEC001", "surveyor": "JAFFERSONS ENERGY CONSULTANTS", "bodies": [ # Scenario A: Cavity wall insulation