# Model/etl/xml_survey_extraction/app.py

import os
import posixpath
from io import BytesIO

from etl.xml_survey_extraction.XmlParser import XmlParser
from utils.logger import setup_logger
from utils.s3 import read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder

# Module-level logger obtained from the project's logging helper.
logger = setup_logger()

# Surveyor name and project code; main() joins them into the S3 folder prefix
# "{SURVEYORS}/{PROJECT_CODE}/" that it scans for energy assessments.
SURVEYORS = "JAFFERSONS ENERGY CONSULTANTS"
PROJECT_CODE = "VDE001"
# S3 bucket passed to every S3 helper call in main(); the TODO in main() describes the
# naming scheme as retrofit-energy-assessments-{stage}.
BUCKET = "retrofit-energy-assessments-dev"
# Not read by any code visible in this file; main() ends with a TODO to set a portfolio ID.
PORTFOLIO_ID = None


def main():
    """
    Run the XML survey extraction for the configured surveyor and project.

    The process lists the entries under ``{SURVEYORS}/{PROJECT_CODE}/`` in ``BUCKET``, treats the last path segment
    of each entry as the property's UPRN, collects the XML files found in that entry's ``docs & plans`` sub-folder,
    downloads each XML and hands it to ``XmlParser.run()``. Field extraction and any persistence happen inside
    ``XmlParser`` (not visible from this module).

    Entries whose last path segment is not an integer (for example loose files sitting next to the UPRN folders)
    are logged and skipped rather than aborting the whole run.

    :return: None
    """

    # TODO: Build solution to get this data from Onedrive and store what we need in S3
    #       In s3, we have a bucket called retrofit-energy-assessments-{stage} which contains the data we need
    #       The data is stored in a folder called {surveyors}/{project_code}/{uprn}
    #       We'll need to get the uprn from the folder name, which we can do with EpcSearcher class

    energy_assessments = list_files_and_subfolders_in_s3_folder(
        bucket_name=BUCKET, folder_name=f"{SURVEYORS}/{PROJECT_CODE}/"
    )

    logger.info(f"Found {len(energy_assessments)} energy assessments for {SURVEYORS} and {PROJECT_CODE}")

    # Map each UPRN to the XML keys uploaded for that property.
    assessments_map = {}
    for assessment in energy_assessments:
        # Parse the UPRN before touching S3 again so invalid entries cost no extra request.
        folder_label = assessment.rstrip("/").split("/")[-1]
        try:
            uprn = int(folder_label)
        except ValueError:
            logger.warning(f"Skipping {assessment!r}: last path segment is not a numeric UPRN")
            continue

        # S3 keys always use "/" as the separator, so join with posixpath instead of the
        # OS-dependent os.path (which would insert "\\" on Windows).
        assessments_map[uprn] = list_xmls_in_s3_folder(
            bucket_name=BUCKET, folder_name=posixpath.join(assessment, "docs & plans")
        )

    logger.info("Extracted XMLs for the energy assessments")

    # For each property, we download the xmls and extract the data
    for uprn, xmls in assessments_map.items():
        for xml in xmls:
            xml_data = read_from_s3(bucket_name=BUCKET, s3_file_name=xml)
            # The parser is given a file-like object, so wrap the downloaded payload in a BytesIO.
            xml_parser = XmlParser(file=BytesIO(xml_data), filekey=xml, uprn=uprn)
            xml_parser.run()
            logger.info(f"Extracted data from {xml}")

    # TODO: Set a portfolio ID, Target and Automatically upload the asset list and create the event for the portfolio