mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
66 lines
2.9 KiB
Python
66 lines
2.9 KiB
Python
from tqdm import tqdm
|
|
import os
|
|
import pandas as pd
|
|
|
|
from model_data.config import EPC_AUTH_TOKEN
|
|
from epc_api.client import EpcClient
|
|
from model_data.downloader import pagenated_epc_download
|
|
from model_data.EpcClean import EpcClean
|
|
from model_data.analysis.UvalueEstimations import UvalueEstimations
|
|
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
|
|
from pathlib import Path
|
|
|
|
# Absolute directory containing this module; every Land Registry file path
# below is anchored to it.
_MODULE_DIR = os.path.abspath(os.path.dirname(__file__))

# File names of the Land Registry price-paid CSVs, kept in their original order.
_LAND_REGISTRY_FILES = (
    "pp-monthly-update-new-version.csv",
    "pp-2022 (1).csv",
    "pp-2021.csv",
    "pp-2020.csv",
    "pp-2019.csv",
    "pp-2018.csv",
    "pp-2017-part1.csv",
    "pp-2017-part2.csv",
)

# Absolute string paths (always "/"-separated below the module directory) to
# the price-paid CSVs stored under model_data/local_data.
LAND_REGISTRY_PATHS = [
    _MODULE_DIR + "/model_data/local_data/" + file_name
    for file_name in _LAND_REGISTRY_FILES
]
|
|
|
|
# Directory, relative to this module, that app() scans for per-area
# sub-directories each containing a certificates.csv file.
EPC_DIRECTORY = Path(__file__).parent.joinpath(
    "model_data", "simulation_system", "data", "all-domestic-certificates"
)
|
|
|
|
|
|
def _load_certificate_records(directory):
    """
    Read ``certificates.csv`` from ``directory`` and return its rows as a list of dicts.

    Column names are lower-cased with underscores replaced by hyphens, and only
    rows whose "lodgement-date" is on or after EARLIEST_EPC_DATE are kept.
    :param directory: path of a folder that contains a certificates.csv file
    :return: list of per-row dictionaries
    """
    certificates = pd.read_csv(directory / "certificates.csv", low_memory=False)

    # Rename the columns to the same format as the api returns
    certificates.columns = [
        column.replace("_", "-").lower() for column in certificates.columns
    ]

    # Keep only certificates lodged on or after the date threshold
    is_recent = certificates["lodgement-date"] >= EARLIEST_EPC_DATE

    # Convert to a list of dictionaries, as returned by the api
    return certificates[is_recent].to_dict("records")


def app():
    """
    Run the EPC cleaning step over every locally stored certificate export.

    Each sub-directory of EPC_DIRECTORY is loaded from its certificates.csv,
    filtered to certificates lodged on or after EARLIEST_EPC_DATE and passed
    to EpcClean, so that description data on new properties can be sanitised
    quickly. Nothing is persisted yet (see the TODOs below).

    NOTE(review): the previous docstring described downloading from the EPC
    API for a list of constituencies; the code reads local CSV files instead.
    :return: None
    """
    area_directories = [
        candidate for candidate in EPC_DIRECTORY.iterdir() if candidate.is_dir()
    ]

    for area_directory in tqdm(area_directories):
        records = _load_certificate_records(area_directory)

        # Incorporate input data into cleaning
        cleaner = EpcClean(records)
        # Read before clean() is called, exactly as the original ordering did;
        # the value is not used further yet (see the TODO about s3 below).
        lighting_averages = cleaner.lighting_averages

        # TODO: All of these outputs can be stored by constituency so we can
        #   reduce the amount of data we fetch
        #
        # TODO: We need to store lighting_averages to s3
        #   We should also extend these averages so they're by more variables
        #   (property type, age band, constituency, etc)
        cleaner.clean()
        # TODO: cleaner.cleaned datasets to s3

        # TODO: Add property age band into this
        # uvalue_estimates = UvalueEstimations(data=data)
        # uvalue_estimates.get_estimates(cleaner=cleaner)
        # # TODO: Store these to s3
        # uvalue_estimates.walls
        # uvalue_estimates.floors
        # uvalue_estimates.roofs
|