Model/etl/xml_survey_extraction/app.py

from backend.app.db.functions.energy_assessment_functions import bulk_insert_energy_assessments
from sqlalchemy.orm import sessionmaker
from backend.app.db.connection import db_engine
from utils.s3 import read_from_s3, list_files_and_subfolders_in_s3_folder, list_xmls_in_s3_folder, save_csv_to_s3
from utils.logger import setup_logger
from etl.xml_survey_extraction.XmlParser import XmlParser
import os
import pandas as pd
from io import BytesIO

# Module-level logger obtained from the project's logging helper.
logger = setup_logger()

# S3 bucket that main() lists and reads the uploaded energy assessment files from.
BUCKET = "retrofit-energy-assessments-dev"
# User id used as the leading path segment of the asset list key that main() writes to S3.
USER_ID = 8
# Key template for the non-invasive recommendations CSV. This is a plain string, not an f-string, so the
# {USER_ID} and {PORTFOLIO_ID} placeholders stay literal until they are filled in
# (presumably via str.format - TODO confirm).
# NOTE(review): not referenced by the code visible here.
non_invasive_recommendations_filepath = "{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
# Scenario configuration. The outer key appears to be the portfolio id (it matches the "portfolio_id" of the
# bodies below). Each entry carries the surveyor and project code that main() uses to build the S3 folder of the
# energy assessments, plus one request body per scenario. main() overwrites "trigger_file_path" on every body
# once the asset list has been written.
SCENARIOS = {
    101: {
        "project_code": "VEC001",
        "surveyor": "JAFFERSONS ENERGY CONSULTANTS",
        "bodies": [
            # Scenario A: quick wins - draught proofing, secondary glazing, trickle vents and low energy lighting
            {
                "portfolio_id": "101",
                "housing_type": "Private",
                "goal": "Increasing EPC",
                "goal_value": "A",
                "trigger_file_path": "",
                "already_installed_file_path": "",
                "patches_file_path": "",
                "non_invasive_recommendations_file_path": "",
                "inclusions": [
                    "draught_proofing",
                    "secondary_glazing",
                    "trickle_vents",
                    "low_energy_lighting",
                ],
                "budget": None,
                "scenario_name": "Quick wins - do now while tenanted",
                "multi_plan": True,
            },
            # Scenario B: the quick wins plus suspended floor and internal wall insulation
            {
                "portfolio_id": "101",
                "housing_type": "Private",
                "goal": "Increasing EPC",
                "goal_value": "A",
                "trigger_file_path": "",
                "already_installed_file_path": "",
                "patches_file_path": "",
                "non_invasive_recommendations_file_path": "",
                "inclusions": [
                    "draught_proofing",
                    "secondary_glazing",
                    "trickle_vents",
                    "low_energy_lighting",
                    "suspended_floor_insulation",
                    "internal_wall_insulation",
                ],
                "budget": None,
                "scenario_name": "Do when void",
                "multi_plan": True,
            },
        ],
    },
}

# TODO: These non-intrusive recommendations should be detected from the EPRs, the scenarios and the condition report?
#       For recommendations like trickle vents, we can deduce this from the condition report, depending on the
#       ventilation of the room and the presence of trickle vents.
# Hand-maintained list of recommendations, one entry per property. Each entry holds the property's "uprn" and a
# list of "recommendations"; every recommendation has a "type", a "cost" (None for all entries here) and,
# in all but one case, a "survey" flag (presumably marking that the measure came from the survey - TODO confirm).
# NOTE(review): not referenced by the code visible here.
NON_INTRUSITVE_RECOMMENDATIONS = [
    {
        # 2 Grove Mansions
        "uprn": 121016121,
        "recommendations": [
            {
                "type": "draught_proofing",
                "cost": None,
                "survey": True
            },
            {"type": "secondary_glazing", "cost": None, "survey": True},
            {"type": "trickle_vents", "cost": None, "survey": True},
            {"type": "suspended_floor_insulation", "cost": None, "survey": True},
            {"type": "internal_wall_insulation", "cost": None, "survey": True},
        ]
    },
    {
        # 8 Grove Mansions
        "uprn": 10024087855,
        "recommendations": [
            {"type": "draught_proofing", "cost": None, "survey": True},
            {"type": "secondary_glazing", "cost": None, "survey": True},
            {"type": "trickle_vents", "cost": None, "survey": True},
            {"type": "low_energy_lighting", "cost": None, "survey": True},
            {"type": "internal_wall_insulation", "cost": None, "survey": True},
        ]
    },
    {
        # 9 Grove Mansions
        "uprn": 121016128,
        "recommendations": [
            {"type": "draught_proofing", "cost": None, "survey": True},
            {"type": "secondary_glazing", "cost": None, "survey": True},
            {"type": "trickle_vents", "cost": None, "survey": True},
            {"type": "low_energy_lighting", "cost": None, "survey": True},
            # NOTE(review): unlike every other recommendation in this list, this one has no "survey" key -
            # confirm whether that is intentional.
            {"type": "suspended_floor_insulation", "cost": None},
            {"type": "internal_wall_insulation", "cost": None, "survey": True},
        ]
    },
    {
        # 5 Grove Mansions
        "uprn": 121016124,
        "recommendations": [
            {"type": "secondary_glazing", "cost": None, "survey": True},
            {"type": "trickle_vents", "cost": None, "survey": True},
            {"type": "low_energy_lighting", "cost": None, "survey": True},
            {"type": "internal_wall_insulation", "cost": None, "survey": True},
        ]
    },
    {
        # 14 Grove Mansions
        "uprn": 121016117,
        "recommendations": [
            {"type": "draught_proofing", "cost": None, "survey": True},
            {"type": "secondary_glazing", "cost": None, "survey": True},
            {"type": "trickle_vents", "cost": None, "survey": True},
            {"type": "low_energy_lighting", "cost": None, "survey": True},
            {"type": "internal_wall_insulation", "cost": None, "survey": True},
        ]
    },
    {
        # 19 Grove Mansions
        # NOTE(review): this UPRN is identical to the one used for "14 Grove Mansions" above, which looks like
        # a copy-paste slip - confirm the correct UPRN for 19 Grove Mansions.
        "uprn": 121016117,
        "recommendations": [
            {"type": "low_energy_lighting", "cost": None, "survey": True},
            {"type": "secondary_glazing", "cost": None, "survey": True},
            {"type": "internal_wall_insulation", "cost": None, "survey": True},
            {"type": "room_roof_insulation", "cost": None, "survey": True},
        ]
    },
]


def main():
    """
    Run the XML survey extraction for every scenario configured in SCENARIOS.

    For each scenario configuration this function:
      1. lists the energy assessment folders in S3 under {surveyor}/{project_code}/ and the XML files inside each
         folder's "docs & plans" subfolder, parsing the last segment of the folder name as the integer UPRN;
      2. parses every XML with XmlParser and merges the EPC and additional data into a single record per
         property, with hyphens in the keys replaced by underscores to match the database schema;
      3. bulk inserts the records into the database (the session is always closed, even if the insert fails);
      4. writes an asset list (uprn, address, postcode) to S3 and stores its key as the "trigger_file_path" of
         every body of the scenario, printing each body.

    Properties for which nothing could be extracted are skipped with a warning, and a scenario with no extracted
    records at all is skipped entirely, so that no empty record or empty asset list is uploaded.

    :return: None
    :raises ValueError: if the last segment of an assessment folder name is not an integer UPRN
    """

    # TODO: Build solution to get this data from Onedrive and store what we need in S3
    #       In s3, we have a bucket called retrofit-energy-assessments-{stage} which contains the data we need
    #       The data is stored in a folder called {surveyors}/{project_code}/{uprn}
    #       We'll need to get the uprn from the folder name, which we can do with EpcSearcher class

    # TODO: Pull out county, as in create_epc_records in the router, we pull it from the latest EPC, but we should
    #       be able to deduce it from just the address. Same for constituency and constituency_label

    # TODO: Store the project code in the database

    for scenario_config in SCENARIOS.values():
        surveyor = scenario_config["surveyor"]
        project_code = scenario_config["project_code"]

        energy_assessments = list_files_and_subfolders_in_s3_folder(
            bucket_name=BUCKET, folder_name=f"{surveyor}/{project_code}/"
        )

        logger.info(
            f"Found {len(energy_assessments)} energy assessments for {surveyor} and "
            f"{project_code}"
        )

        # Map each UPRN (the last segment of the assessment folder) to the XML keys uploaded for it
        assessments_map = {}
        for assessment in energy_assessments:
            assessment_folder = assessment.rstrip("/")
            # S3 keys always use "/" as the separator, so the key is built explicitly rather than with
            # os.path.join, which would use the separator of the local platform
            uploaded_xmls = list_xmls_in_s3_folder(
                bucket_name=BUCKET, folder_name=f"{assessment_folder}/docs & plans"
            )
            uprn = int(assessment_folder.split("/")[-1])
            assessments_map[uprn] = uploaded_xmls

        logger.info("Exatracted XMLS for the energy assessments")

        # TODO: IF we have many uploads, we can do them in a batch so we don't try and upload huge amounts of data to
        #       the database at once

        # TODO: We now have detailed information about primary and secondary walls, so we should use this information
        #       in our recommendations when we have it
        #       For example, for 77 Peryn Road, W3 7LT, the energy assessment has a main dwelling and two extensions,
        #       where the physical dimensions and the fabric of each building is constructed in a way as if each
        #       building is separate. We should use this information to make recommendations that are specific to
        #       each building part, though the problem here is that while the fabric and dimensions are separate,
        #       the actual SAP, CO2, etc figures span across the entire property.
        #       Idea: We can collect all of this information by building part and store it separately in the database
        #             against the uprn. We can have key data for the EPC, but then also additional data for each
        #             building part. We can then use this data to make recommendations that are specific to each
        #             building part
        #       We should probably re-think this data model, so we break up the data in a more considered fashion and
        #       produce the underlying EPC data as a summary of the building parts. Not only do we have data against
        #       the main dwelling and extensions, but we also have multiple windows with individual pieces of
        #       information that we can use to make recommendations. We should store this data in a way that we can
        #       easily access it and use it to make recommendations (e.g. we should have a Windows table)

        # For each property, we download the xmls and extract the data
        database_data = []
        for uprn, xmls in assessments_map.items():

            extracted_data = {}
            for xml in xmls:
                xml_data = read_from_s3(bucket_name=BUCKET, s3_file_name=xml)
                xml_parser = XmlParser(
                    file=BytesIO(xml_data),
                    filekey=f"s3://{BUCKET}/{xml}",
                    uprn=uprn,
                    surveyor_company=surveyor,
                )
                xml_parser.run()
                # Only files the parser flags as is_lig are logged (the meaning of the flag is defined by XmlParser)
                if xml_parser.is_lig:
                    logger.info(f"Extracted data from {xml}")

                data_to_update = {**xml_parser.epc, **xml_parser.additional_data}

                # We need to update the keys to match the database schema - i.e. we should replace all hyphens with
                # underscores. Later XMLs overwrite the values of earlier ones for the same key.
                extracted_data.update({key.replace("-", "_"): value for key, value in data_to_update.items()})

            if not extracted_data:
                # Without this guard an empty record would be inserted into the database and the asset list
                # below would fail on the missing "uprn" key after the insert had already happened
                logger.warning(f"No data could be extracted for UPRN {uprn}, skipping this property")
                continue

            database_data.append(extracted_data)

        if not database_data:
            logger.warning(f"No data extracted for {surveyor} and {project_code}, skipping the upload")
            continue

        logger.info("Uploading data to the database")
        session = sessionmaker(bind=db_engine)()
        try:
            bulk_insert_energy_assessments(session, database_data)
        finally:
            # Always release the connection, even when the insert raises
            session.close()

        # Create the asset list
        asset_list = pd.DataFrame(
            [{"uprn": x["uprn"], "address": x["address1"], "postcode": x["postcode"]} for x in database_data]
        )

        # Store the asset list in s3, under the portfolio of the first body of the scenario
        filename = f"{USER_ID}/{scenario_config['bodies'][0]['portfolio_id']}/non_intrusives.csv"
        save_csv_to_s3(
            dataframe=asset_list,
            bucket_name="retrofit-plan-inputs-dev",
            file_name=filename
        )

        # Point every body of the scenario at the asset list that was just written
        for body in scenario_config["bodies"]:
            body["trigger_file_path"] = filename
            print(body)

    # TODO: In order to get the full data associated to the heating system, we need to download and parse the pcdb which
    #       can be found here: https://www.ncm-pcdb.org.uk/pcdb/pcdb10.dat
    #                          https://www.ncm-pcdb.org.uk/sap/download
    #       However retrieving this data is not a priority, so we can leave this for now as parsing the database
    #       is a non-trivial task

    # TODO: The condition report contains additional data such as the number of bedrooms and the number of bathrooms
    #       We can extract this data and store it in the database as well. We can then update our kwargs methodology
    #       that is passed to the property class, where instead we store this additional data in our database (it could
    #       be stored in the energy assessment table, or in a separate table) and then when we're passed additional data
    #       we can query the database for this data and use it to update the property object, instead of storing it
    #       in the asset list and pulling it out of the asset list
    #       1) Bathrooms
    #       2) Bedrooms