# Model/model_data/cleaner_app.py

from tqdm import tqdm
import os
import pandas as pd

from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from pathlib import Path

# Absolute paths (as plain strings) to the Land Registry CSV files, each formed by
# appending a fixed relative suffix to the absolute directory containing this module.
LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(__file__)) + relative_csv
    for relative_csv in (
        "/model_data/local_data/pp-monthly-update-new-version.csv",
        "/model_data/local_data/pp-2022 (1).csv",
        "/model_data/local_data/pp-2021.csv",
        "/model_data/local_data/pp-2020.csv",
        "/model_data/local_data/pp-2019.csv",
        "/model_data/local_data/pp-2018.csv",
        "/model_data/local_data/pp-2017-part1.csv",
        "/model_data/local_data/pp-2017-part2.csv",
    )
]

# Root folder of the local EPC certificate data, resolved relative to this module's directory.
EPC_DIRECTORY = Path(__file__).parent.joinpath(
    "model_data", "simulation_system", "data", "all-domestic-certificates"
)


def _merge_cleaned_fields(cleaned_data, new_fields):
    """
    Merge one cleaner's output into the running ``cleaned_data`` mapping, in place.

    :param cleaned_data: mapping of field key -> list of cleaned entries accumulated so far
    :param new_fields: mapping of field key -> list of cleaned entries from one cleaner;
        every entry must expose an ``"original_description"`` item
    :return: None
    """
    for field, entries in new_fields.items():
        if field not in cleaned_data:
            # Copy so that later extends never mutate the cleaner's own list
            cleaned_data[field] = list(entries)
            continue

        # A set gives constant-time membership checks instead of rescanning a list for
        # every new entry. Assumes descriptions are hashable (e.g. strings).
        known_descriptions = {entry["original_description"] for entry in cleaned_data[field]}
        cleaned_data[field].extend(
            entry for entry in entries if entry["original_description"] not in known_descriptions
        )


def app():
    """
    Read the locally stored EPC certificate files found under EPC_DIRECTORY and build a
    combined dataset of cleaned fields, so that when we get new properties, we can quickly
    sanitise any description data.

    Each sub-directory of EPC_DIRECTORY must contain a ``certificates.csv`` file. Only
    certificates lodged on or after EARLIEST_EPC_DATE are passed to the cleaner. Cleaned
    entries are merged per field key, skipping entries whose ``original_description`` has
    already been collected from an earlier directory.
    :return:
    """

    cleaned_data = {}
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
    for directory in tqdm(epc_directories):
        # Directory names are read as "<prefix>-<gss code>-<local authority>". Limiting the
        # split to two separators keeps hyphenated authority names intact, and Path.name
        # avoids depending on "/" being the path separator.
        directory_destructured = directory.name.split("-", 2)
        # NOTE: neither value is consumed yet in the code shown here
        gss_code = directory_destructured[1]
        local_authority = directory_destructured[2]

        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
        data.columns = [c.replace("_", "-").lower() for c in data.columns]
        # Keep only certificates lodged on or after the earliest allowed date
        data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

        # Convert to list of dictionaries as returned by the api
        data = data.to_dict("records")

        # Incorporate input data into cleaning
        cleaner = EpcClean(data)

        cleaner.clean()
        # Extend cleaned_data without rebinding `data`, so it still refers to this
        # directory's certificate records afterwards
        _merge_cleaned_fields(cleaned_data, cleaner.cleaned)

        # TODO: Add property age band into this
        # uvalue_estimates = UvalueEstimations(data=data)
        # uvalue_estimates.get_estimates(cleaner=cleaner)
        # # TODO: Store these to a s3
        # uvalue_estimates.walls
        # uvalue_estimates.floors
        # uvalue_estimates.roofs