Model/etl/xml_survey_extraction/app.py
2024-07-25 14:46:11 +01:00

53 lines
2.1 KiB
Python

from utils.s3 import read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder
from utils.logger import setup_logger
from etl.xml_survey_extraction.XmlParser import XmlParser
import os
from io import BytesIO
# --- Run configuration -------------------------------------------------------
# First path segment of the S3 folder that main() lists.
SURVEYORS = "JAFFERSONS ENERGY CONSULTANTS"
# Second path segment of the S3 folder that main() lists.
PROJECT_CODE = "VDE001"
# S3 bucket passed as bucket_name to every S3 helper call in this module.
BUCKET = "retrofit-energy-assessments-dev"
# Not set yet; nothing in the visible code reads it (see the portfolio TODO in main()).
PORTFOLIO_ID = None

# Shared module-level logger.
logger = setup_logger()
def main():
    """
    Run the XML survey extraction for the configured surveyor and project.

    The process has two phases:

    1. List every entry under ``{SURVEYORS}/{PROJECT_CODE}/`` in the ``BUCKET`` S3 bucket and, for each
       entry whose last path segment is an integer UPRN, collect the XML files found in its
       "docs & plans" subfolder.
    2. Download each collected XML and run ``XmlParser`` on it.

    Entries whose last path segment is not an integer (for example a stray file sitting next to the
    UPRN folders) are skipped with a warning instead of aborting the whole run.

    NOTE(review): extracting the fields and persisting them is presumably done inside
    ``XmlParser.run()``; nothing in this function stores data itself - confirm.

    :return: None
    """
    # TODO: Build solution to get this data from OneDrive and store what we need in S3
    # In S3, we have a bucket called retrofit-energy-assessments-{stage} which contains the data we need
    # The data is stored in a folder called {surveyors}/{project_code}/{uprn}
    # We'll need to get the uprn from the folder name, which we can do with EpcSearcher class
    energy_assessments = list_files_and_subfolders_in_s3_folder(
        bucket_name=BUCKET, folder_name=f"{SURVEYORS}/{PROJECT_CODE}/"
    )
    logger.info(f"Found {len(energy_assessments)} energy assessments for {SURVEYORS} and {PROJECT_CODE}")

    assessments_map = {}
    for assessment in energy_assessments:
        assessment_folder = assessment.rstrip("/")
        folder_name = assessment_folder.split("/")[-1]

        # Parse the UPRN first so we don't issue an S3 listing call for entries we are going to skip.
        try:
            uprn = int(folder_name)
        except ValueError:
            logger.warning(f"Skipping {assessment}: {folder_name!r} is not a valid UPRN")
            continue

        # S3 keys always use "/" as the separator, so build the key explicitly rather than with
        # os.path.join, which would insert "\\" when run on Windows.
        assessments_map[uprn] = list_xmls_in_s3_folder(
            bucket_name=BUCKET, folder_name=f"{assessment_folder}/docs & plans"
        )
    logger.info("Extracted XMLs for the energy assessments")

    # For each property, we download the xmls and extract the data
    for uprn, xmls in assessments_map.items():
        for xml in xmls:
            xml_data = read_from_s3(bucket_name=BUCKET, s3_file_name=xml)
            # XmlParser is handed a file-like object, so wrap the downloaded payload in BytesIO.
            xml_parser = XmlParser(file=BytesIO(xml_data), filekey=xml, uprn=uprn)
            xml_parser.run()
            logger.info(f"Extracted data from {xml}")
    # TODO: Set a portfolio ID, Target and Automatically upload the asset list and create the event for the portfolio