mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added days elapsed calculations
This commit is contained in:
parent
235d85d5bd
commit
1b84033d0b
3 changed files with 74 additions and 52 deletions
|
|
@ -1,12 +1,14 @@
|
|||
from tqdm import tqdm
|
||||
import os
|
||||
import pandas as pd
|
||||
|
||||
from model_data.config import EPC_AUTH_TOKEN
|
||||
from epc_api.client import EpcClient
|
||||
from model_data.downloader import pagenated_epc_download
|
||||
from model_data.EpcClean import EpcClean
|
||||
from model_data.analysis.UvalueEstimations import UvalueEstimations
|
||||
from model_data.analysis.SapModel import SapModel
|
||||
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
|
||||
LAND_REGISTRY_PATHS = [
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
|
||||
|
|
@ -19,6 +21,8 @@ LAND_REGISTRY_PATHS = [
|
|||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv",
|
||||
]
|
||||
|
||||
EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
||||
|
||||
def app():
|
||||
"""
|
||||
|
|
@ -28,36 +32,36 @@ def app():
|
|||
:return:
|
||||
"""
|
||||
|
||||
epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)
|
||||
|
||||
constituencies = {'E14000555', 'E14000726', 'E14000720', 'E14000721', 'E14000553', 'E14000752'}
|
||||
property_types = ["bungalow", "flat", "house", "maisonette", "park home"]
|
||||
floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"]
|
||||
|
||||
# We pull properties from local authorities, by property type. This will allow us to build
|
||||
# a dataset of up to 10k properties per local authority/property type combination
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
# conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
|
||||
# and Wales from 31 July 2014
|
||||
# Download data from August 2014 onwards
|
||||
data = []
|
||||
for c in tqdm(constituencies):
|
||||
for pt in property_types:
|
||||
for fa in floor_areas:
|
||||
data.extend(
|
||||
pagenated_epc_download(
|
||||
client=epc_client,
|
||||
params={
|
||||
"constituency": c,
|
||||
"property-type": pt,
|
||||
"from-month": 8,
|
||||
"from-year": 2014,
|
||||
"floor-area": fa,
|
||||
},
|
||||
page_size=5000,
|
||||
n_pages=10,
|
||||
)
|
||||
)
|
||||
# epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN)
|
||||
#
|
||||
# constituencies = {'E14000555', 'E14000726', 'E14000720', 'E14000721', 'E14000553', 'E14000752'}
|
||||
# property_types = ["bungalow", "flat", "house", "maisonette", "park home"]
|
||||
# floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"]
|
||||
#
|
||||
# # We pull properties from local authorities, by property type. This will allow us to build
|
||||
# # a dataset of up to 10k properties per local authority/property type combination
|
||||
# # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
# # conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
|
||||
# # and Wales from 31 July 2014
|
||||
# # Download data from August 2014 onwards
|
||||
# data = []
|
||||
# for c in tqdm(constituencies):
|
||||
# for pt in property_types:
|
||||
# for fa in floor_areas:
|
||||
# data.extend(
|
||||
# pagenated_epc_download(
|
||||
# client=epc_client,
|
||||
# params={
|
||||
# "constituency": c,
|
||||
# "property-type": pt,
|
||||
# "from-month": 8,
|
||||
# "from-year": 2014,
|
||||
# "floor-area": fa,
|
||||
# },
|
||||
# page_size=5000,
|
||||
# n_pages=10,
|
||||
# )
|
||||
# )
|
||||
|
||||
# Production of sample data for land registry
|
||||
# address_meta = [
|
||||
|
|
@ -75,20 +79,32 @@ def app():
|
|||
# with open("sample_addresses.pkl", "wb") as f:
|
||||
# pickle.dump(address_meta, f)
|
||||
|
||||
# Incorporate input data into cleaning
|
||||
cleaner = EpcClean(data)
|
||||
lighting_averages = cleaner.lighting_averages
|
||||
# TODO: We need to store lighting_averages to a db
|
||||
# We should also extend these averages so they're by more variables (property type, age band, constituency,
|
||||
# etc)
|
||||
cleaner.clean()
|
||||
# TODO: Store the cleaner.cleaned datasets to a db
|
||||
epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
for directory in epc_directories:
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
# Rename the columns to the same format as the api returns
|
||||
data.columns = [c.replace("_", "-").lower() for c in data.columns]
|
||||
# Keep only the records lodged on or after the earliest-date threshold
|
||||
data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
# TODO: Add property age band into this
|
||||
uvalue_estimates = UvalueEstimations(data=data)
|
||||
uvalue_estimates.get_estimates(cleaner=cleaner)
|
||||
# TODO: Store these to a db
|
||||
# Convert to a list of dictionaries, as returned by the api
|
||||
data = data.to_dict("records")
|
||||
|
||||
sap_model = SapModel(data=data, cleaner=cleaner)
|
||||
sap_model.run()
|
||||
# TODO: Store outputs to db
|
||||
# Incorporate input data into cleaning
|
||||
cleaner = EpcClean(data)
|
||||
lighting_averages = cleaner.lighting_averages
|
||||
#
|
||||
# TODO: All of these outputs can be stored by constituency so we can reduce the amount
|
||||
# of data we fetch
|
||||
#
|
||||
# TODO: We need to store lighting_averages to S3
|
||||
# We should also extend these averages so they're by more variables (property type, age band,
|
||||
# constituency,
|
||||
# etc)
|
||||
cleaner.clean()
|
||||
# TODO: cleaner.cleaned datasets to s3
|
||||
|
||||
# TODO: Add property age band into this
|
||||
uvalue_estimates = UvalueEstimations(data=data)
|
||||
uvalue_estimates.get_estimates(cleaner=cleaner)
|
||||
# TODO: Store these to a s3
|
||||
|
|
|
|||
|
|
@ -53,6 +53,11 @@ DEPLOYMENT_FOLDER = "deployment"
|
|||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
|
||||
AVERAGE_FIXED_FEATURES = [
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT"
|
||||
]
|
||||
|
||||
COLUMNS_TO_MERGE_ON = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ from simulation_system.core.Settings import (
|
|||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
EARLIEST_EPC_DATE
|
||||
)
|
||||
from simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils import save_dataframe_to_s3_parquet
|
||||
|
|
@ -69,9 +70,6 @@ def app():
|
|||
property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
|
||||
)
|
||||
|
||||
# Taking just the last row, which is the percentage change from the latest to previous one only
|
||||
# property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
|
||||
|
||||
# Extract the columns that are not all None
|
||||
modified_property_data = DataProcessor.apply_averages_cleaning(
|
||||
data_to_clean=property_data,
|
||||
|
|
@ -143,9 +141,12 @@ def app():
|
|||
data_by_urpn_df = pd.DataFrame(data_by_urpn)
|
||||
# Add some temporal features - we look at the days from the standard starting point in time
|
||||
# for the starting and ending date so all records are from a fixed point
|
||||
# TODO: implement me
|
||||
data_by_urpn_df["DAYS_TO_STARTING"] = None
|
||||
data_by_urpn_df["DAYS_TO_ENDING"] = None
|
||||
data_by_urpn_df["DAYS_TO_STARTING"] = (
|
||||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).dt.days
|
||||
data_by_urpn_df["DAYS_TO_ENDING"] = (
|
||||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).dt.days
|
||||
|
||||
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
|
||||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue