Model/model_data/app.py
2023-09-07 15:21:12 +03:00

113 lines
4.9 KiB
Python

from tqdm import tqdm
import os
import pandas as pd
from model_data.config import EPC_AUTH_TOKEN
from epc_api.client import EpcClient
from model_data.downloader import pagenated_epc_download
from model_data.EpcClean import EpcClean
from model_data.analysis.UvalueEstimations import UvalueEstimations
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
from pathlib import Path
# Absolute paths (as strings) of the Land Registry price-paid CSV extracts, newest first.
# Each path is resolved against the directory that contains this module.
LAND_REGISTRY_PATHS = [
    os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/" + file_name
    for file_name in (
        "pp-monthly-update-new-version.csv",
        "pp-2022 (1).csv",
        "pp-2021.csv",
        "pp-2020.csv",
        "pp-2019.csv",
        "pp-2018.csv",
        "pp-2017-part1.csv",
        "pp-2017-part2.csv",
    )
]

# Directory whose sub-directories each hold a certificates.csv file of EPC data.
EPC_DIRECTORY = Path(__file__).parent.joinpath(
    "model_data", "simulation_system", "data", "all-domestic-certificates"
)
def _load_epc_records(directory):
    """
    Load one directory's certificates.csv and return its rows in the shape the EPC API returns.

    :param directory: path of a directory containing a certificates.csv file
    :return: list of dictionaries (one per certificate) keyed by lower-case, hyphenated column names,
        restricted to rows whose "lodgement-date" is on or after EARLIEST_EPC_DATE
    """
    frame = pd.read_csv(directory / "certificates.csv", low_memory=False)
    # Rename the columns to the same format as the api returns (hyphens, lower case)
    frame.columns = [name.replace("_", "-").lower() for name in frame.columns]
    # Keep only certificates lodged on or after the date threshold
    # NOTE(review): relies on "lodgement-date" and EARLIEST_EPC_DATE being directly comparable - confirm
    is_recent = frame["lodgement-date"] >= EARLIEST_EPC_DATE
    frame = frame[is_recent]
    # Convert to a list of dictionaries, as returned by the api
    return frame.to_dict("records")


def app():
    """
    Clean the locally stored EPC certificate data and derive u-value estimates from it.

    Every sub-directory of EPC_DIRECTORY is read via its certificates.csv file. The rows are
    reshaped to match the API format, cleaned with EpcClean and then passed to UvalueEstimations.
    The intent is to produce a dataset of cleaned fields so that when we get new properties, we
    can quickly sanitise any description data. Nothing is persisted yet - see the TODOs below.

    The earlier approach, which downloaded the data from the EPC API for a pre-defined list of
    constituencies and property types, is kept below as commented-out reference code.
    :return:
    """
    # epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)
    #
    # constituencies = {'E14000555', 'E14000726', 'E14000720', 'E14000721', 'E14000553', 'E14000752'}
    # property_types = ["bungalow", "flat", "house", "maisonette", "park home"]
    # floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"]
    #
    # # We pull properties from local authorities, by property type. This will allow us to build
    # # a dataset of up to 10k properties per local authority/property type combination
    # # For particularly old EPC data, we have inconsistent records so we'll only include EPCs that were
    # # conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
    # # and Wales from 31 July 2014
    # # Download data from August 2014 onwards
    # data = []
    # for c in tqdm(constituencies):
    #     for pt in property_types:
    #         for fa in floor_areas:
    #             data.extend(
    #                 pagenated_epc_download(
    #                     client=epc_client,
    #                     params={
    #                         "constituency": c,
    #                         "property-type": pt,
    #                         "from-month": 8,
    #                         "from-year": 2014,
    #                         "floor-area": fa,
    #                     },
    #                     page_size=5000,
    #                     n_pages=10,
    #                 )
    #             )

    # Production of sample data for land registry
    # address_meta = [
    #     {
    #         "postcode": x["postcode"].upper(),
    #         "address1": x["address1"].upper(),
    #         "address2": x["address2"].upper(),
    #         "address3": x["address3"].upper(),
    #         "address": x["address"],
    #         "uprn": x["uprn"]
    #     } for x in data
    # ]
    #
    # import pickle
    # with open("sample_addresses.pkl", "wb") as f:
    #     pickle.dump(address_meta, f)

    certificate_dirs = [path for path in EPC_DIRECTORY.iterdir() if path.is_dir()]

    # NOTE(review): the whole pipeline below is treated as the per-directory loop body - confirm
    for certificate_dir in tqdm(certificate_dirs):
        records = _load_epc_records(certificate_dir)

        # Incorporate input data into cleaning
        epc_cleaner = EpcClean(records)
        lighting_averages = epc_cleaner.lighting_averages

        #
        # TODO: All of these outputs can be stored by constituency so we can reduce the amount
        #  of data we fetch
        #
        # TODO: We need to store lighting_averages to s3
        #  We should also extend these averages so they're by more variables (property type, age band,
        #  constituency, etc)
        epc_cleaner.clean()

        # TODO: cleaner.cleaned datasets to s3
        # TODO: Add property age band into this
        estimator = UvalueEstimations(data=records)
        estimator.get_estimates(cleaner=epc_cleaner)

        # TODO: Store these to s3
        # The bare attribute accesses below are placeholders until the results are persisted
        estimator.walls
        estimator.floors
        estimator.roofs