mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added new epc-api-python version and increased amount of data being used
This commit is contained in:
parent
8c55df82fa
commit
7724c216a8
3 changed files with 75 additions and 11 deletions
|
|
@ -77,12 +77,12 @@ def handler():
|
|||
# We pull properties from local authorities, by property type. This will allow us to build
|
||||
# a dataset of up to 10k properties per local authority/property type combination
|
||||
data = []
|
||||
for la in tqdm(local_authorities):
|
||||
for c in tqdm(constituencies):
|
||||
for pt in property_types:
|
||||
data.extend(
|
||||
pagenated_epc_download(
|
||||
client=epc_client,
|
||||
params={"local-authority": la, "property-type": pt},
|
||||
params={"constituency": c, "property-type": pt},
|
||||
page_size=5000,
|
||||
n_pages=10,
|
||||
)
|
||||
|
|
@ -240,11 +240,17 @@ def handler():
|
|||
# "construction-age-band",
|
||||
"number-habitable-rooms",
|
||||
"constituency",
|
||||
"number-heated-rooms",
|
||||
]
|
||||
|
||||
component_features = [
|
||||
"walls-description",
|
||||
"floor-description",
|
||||
"lighting-description",
|
||||
"windows-description",
|
||||
"roof-description",
|
||||
"mainheat-description",
|
||||
"main-fuel"
|
||||
]
|
||||
|
||||
model_data = df[[response] + component_features + base_features]
|
||||
|
|
@ -253,18 +259,37 @@ def handler():
|
|||
|
||||
# Append on u-value estimates
|
||||
model_data = model_data.merge(
|
||||
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]],
|
||||
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
|
||||
columns={"thermal_transmittance": "walls_u_value", }
|
||||
),
|
||||
how="left",
|
||||
left_on="walls-description",
|
||||
right_on="original_description"
|
||||
) \
|
||||
.drop(columns=["original_description"]) \
|
||||
.merge(
|
||||
pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
|
||||
columns={"thermal_transmittance": "floor_u_value", }
|
||||
),
|
||||
how="left",
|
||||
left_on="floor-description",
|
||||
right_on="original_description"
|
||||
)
|
||||
# Take just entries with U-values
|
||||
model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
|
||||
model_data = model_data[base_features + ["thermal_transmittance", response]]
|
||||
model_data = model_data[
|
||||
~pd.isnull(model_data["walls_u_value"]) &
|
||||
~pd.isnull(model_data["floor_u_value"])
|
||||
]
|
||||
model_data = model_data[
|
||||
base_features + [c for c in component_features if c not in [
|
||||
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
|
||||
]
|
||||
|
||||
# We need to split the data into a train and test set for model build
|
||||
categorical_cols = [
|
||||
"property-type", "built-form", "number-habitable-rooms", "constituency",
|
||||
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
|
||||
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
|
||||
|
||||
]
|
||||
|
||||
# If these categorical variables are not of type 'category', convert them
|
||||
|
|
@ -325,7 +350,7 @@ def handler():
|
|||
|
||||
import numpy as np
|
||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||
median_absolute_error
|
||||
median_absolute_error, mean_absolute_percentage_error
|
||||
|
||||
def calculate_regression_metrics(y_true, y_pred, n=20):
|
||||
"""
|
||||
|
|
@ -340,11 +365,14 @@ def handler():
|
|||
"""
|
||||
metrics = {}
|
||||
|
||||
metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
|
||||
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
|
||||
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
|
||||
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
||||
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
||||
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
||||
metrics['Mean True Value'] = y_true.mean()
|
||||
metrics['Mean Predicted Value'] = y_pred.mean()
|
||||
|
||||
errors = pd.DataFrame()
|
||||
errors['Fit'] = y_true
|
||||
|
|
@ -358,8 +386,44 @@ def handler():
|
|||
|
||||
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
|
||||
|
||||
worst_x = model_data[model_data.index.isin(worst_errors.index)]
|
||||
model_data['fit'] = results.fittedvalues
|
||||
# The worst errors over-index heavily for flats
|
||||
worst_x = model_data[model_data.index.isin(worst_errors.index)]
|
||||
|
||||
# Notes
|
||||
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
|
||||
# so that we have more data.
|
||||
# TODO: Add in the u-values for roofs rather than the description
|
||||
# TODO: Add in the actual property features for walls, floors, roof, not just the u-value
|
||||
# TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
|
||||
#
|
||||
# property type looks okay - we're definitely low on the number of bungalows
|
||||
# number-habitable-rooms & number-heated-rooms are unpopulated so pretty useless atm
|
||||
# **** constituency should be looked at - potentially modelled individually as some constituencies
|
||||
# perform much worse than others despite enough data.
|
||||
# **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type
|
||||
# and a few of the categories just have barely any data and poor scores
|
||||
# **** windows-description again most of the properties are of the same type, need more samples
|
||||
# for the smaller groups
|
||||
# **** Turn roof into U-value
|
||||
# **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
|
||||
# MAPE though.
|
||||
|
||||
grouped_error = []
|
||||
groupby = ["mainheat-description"]
|
||||
for group, data in model_data.groupby(groupby, observed=True):
|
||||
group_fit_error, _ = calculate_regression_metrics(y_true=data[response].astype(float), y_pred=data["fit"])
|
||||
# plot_regression(pd.DataFrame({"fit": data["fit"].values, "actual": data[response].astype(float).values}))
|
||||
grouped_error.append(
|
||||
{
|
||||
**dict(zip(groupby, group)),
|
||||
"n_samples": data.shape[0],
|
||||
**group_fit_error,
|
||||
}
|
||||
)
|
||||
|
||||
grouped_error = pd.DataFrame(grouped_error)
|
||||
grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
|
||||
|
||||
fit_df = pd.DataFrame(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -15,9 +15,9 @@ def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdo
|
|||
# Note: We can only make 10k queries for a single set of search queries.
|
||||
# It might make sense to download data via zip for machine learning since we don't need this
|
||||
# data to be perfectly up to date
|
||||
if search_resp is None:
|
||||
if not search_resp:
|
||||
break
|
||||
|
||||
|
||||
n_completed += 1
|
||||
|
||||
results.extend(search_resp["rows"])
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
epc-api-python
|
||||
epc-api-python==1.0.2
|
||||
python-dotenv
|
||||
tqdm
|
||||
pandas
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue