# Model/model_data/app.py

from tqdm import tqdm
import os
import pandas as pd

from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from pathlib import Path

# Absolute prefix (as a plain string) of the folder holding the land registry price-paid extracts
_LOCAL_DATA_PREFIX = os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/"

# Land registry price-paid CSVs, listed from the monthly update back to the 2017 parts
LAND_REGISTRY_PATHS = [
    _LOCAL_DATA_PREFIX + file_name
    for file_name in (
        "pp-monthly-update-new-version.csv",
        "pp-2022 (1).csv",
        "pp-2021.csv",
        "pp-2020.csv",
        "pp-2019.csv",
        "pp-2018.csv",
        "pp-2017-part1.csv",
        "pp-2017-part2.csv",
    )
]

# Folder whose sub-directories each hold a certificates.csv (see app())
EPC_DIRECTORY = Path(__file__).parent.joinpath(
    "model_data", "simulation_system", "data", "all-domestic-certificates"
)


def _load_epc_records(certificate_dir):
    """
    Read ``certificates.csv`` from one directory and return its rows as a list of dictionaries,
    shaped like the records the (currently disabled) EPC API download produced.
    :param certificate_dir: directory containing a certificates.csv file
    :return: list of dictionaries, one per certificate lodged on or after EARLIEST_EPC_DATE
    """
    certificates = pd.read_csv(certificate_dir / "certificates.csv", low_memory=False)

    # Match the api's field naming: underscores become hyphens and everything is lower-cased
    certificates.columns = [column.replace("_", "-").lower() for column in certificates.columns]

    # Keep only certificates lodged on or after the earliest date we accept
    is_recent_enough = certificates["lodgement-date"] >= EARLIEST_EPC_DATE

    # List of dictionaries, the same container type the api download code built
    return certificates[is_recent_enough].to_dict(orient="records")


def app():
    """
    Run the EPC cleaning and u-value estimation steps over every directory found in EPC_DIRECTORY.

    Each sub-directory of EPC_DIRECTORY is read via its certificates.csv, the rows are reshaped to
    look like api records and filtered by lodgement date, and the result is passed through
    EpcClean and UvalueEstimations. None of the outputs are stored yet - see the TODOs below.
    The api based download this replaced is kept, commented out, for reference.
    :return:
    """

    # --- Disabled: download of the EPC data from the api ---
    # epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)
    #
    # constituencies = {'E14000555', 'E14000726', 'E14000720', 'E14000721', 'E14000553', 'E14000752'}
    # property_types = ["bungalow", "flat", "house", "maisonette", "park home"]
    # floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"]
    #
    # # We pull properties from local authorities, by property type. This will allow us to build
    # # a dataset of up to 10k properties per local authority/property type combination
    # # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
    # # conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
    # # and Wales from 31 July 2014
    # # Download data from August 2014 onwards
    # data = []
    # for c in tqdm(constituencies):
    #     for pt in property_types:
    #         for fa in floor_areas:
    #             data.extend(
    #                 pagenated_epc_download(
    #                     client=epc_client,
    #                     params={
    #                         "constituency": c,
    #                         "property-type": pt,
    #                         "from-month": 8,
    #                         "from-year": 2014,
    #                         "floor-area": fa,
    #                     },
    #                     page_size=5000,
    #                     n_pages=10,
    #                 )
    #             )

    # --- Disabled: production of sample data for land registry ---
    # address_meta = [
    #     {
    #         "postcode": x["postcode"].upper(),
    #         "address1": x["address1"].upper(),
    #         "address2": x["address2"].upper(),
    #         "address3": x["address3"].upper(),
    #         "address": x["address"],
    #         "uprn": x["uprn"]
    #     } for x in data
    # ]
    #
    # import pickle
    # with open("sample_addresses.pkl", "wb") as f:
    #     pickle.dump(address_meta, f)

    # The full directory listing is collected before any csv is opened
    certificate_dirs = list(filter(Path.is_dir, EPC_DIRECTORY.iterdir()))

    for certificate_dir in certificate_dirs:
        records = _load_epc_records(certificate_dir)

        # Incorporate input data into cleaning
        cleaner = EpcClean(records)
        # Read before clean() is called, as in the original ordering; not used further yet
        lighting_averages = cleaner.lighting_averages
        #
        # TODO: All of these outputs can be stored by constituency so we can reduce the amount
        #       of data we fetch
        #
        # TODO: We need to store lighting_averages to s3
        #       We should also extend these averages so they're by more variables (property type,
        #       age band, constituency, etc)
        cleaner.clean()
        # TODO: cleaner.cleaned datasets to s3

        # TODO: Add property age band into this
        uvalue_estimates = UvalueEstimations(data=records)
        uvalue_estimates.get_estimates(cleaner=cleaner)
        # TODO: Store these to s3