mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Added new epc-api-python version and increased amount of data being used
This commit is contained in:
parent
8c55df82fa
commit
7724c216a8
3 changed files with 75 additions and 11 deletions
|
|
@ -77,12 +77,12 @@ def handler():
|
||||||
# We pull properties from local authorities, by property type. This will allow us to build
|
# We pull properties from local authorities, by property type. This will allow us to build
|
||||||
# a dataset of up to 10k properties per local authority/property type combination
|
# a dataset of up to 10k properties per local authority/property type combination
|
||||||
data = []
|
data = []
|
||||||
for la in tqdm(local_authorities):
|
for c in tqdm(constituencies):
|
||||||
for pt in property_types:
|
for pt in property_types:
|
||||||
data.extend(
|
data.extend(
|
||||||
pagenated_epc_download(
|
pagenated_epc_download(
|
||||||
client=epc_client,
|
client=epc_client,
|
||||||
params={"local-authority": la, "property-type": pt},
|
params={"constituency": c, "property-type": pt},
|
||||||
page_size=5000,
|
page_size=5000,
|
||||||
n_pages=10,
|
n_pages=10,
|
||||||
)
|
)
|
||||||
|
|
@ -240,11 +240,17 @@ def handler():
|
||||||
# "construction-age-band",
|
# "construction-age-band",
|
||||||
"number-habitable-rooms",
|
"number-habitable-rooms",
|
||||||
"constituency",
|
"constituency",
|
||||||
|
"number-heated-rooms",
|
||||||
]
|
]
|
||||||
|
|
||||||
component_features = [
|
component_features = [
|
||||||
"walls-description",
|
"walls-description",
|
||||||
"floor-description",
|
"floor-description",
|
||||||
|
"lighting-description",
|
||||||
|
"windows-description",
|
||||||
|
"roof-description",
|
||||||
|
"mainheat-description",
|
||||||
|
"main-fuel"
|
||||||
]
|
]
|
||||||
|
|
||||||
model_data = df[[response] + component_features + base_features]
|
model_data = df[[response] + component_features + base_features]
|
||||||
|
|
@ -253,18 +259,37 @@ def handler():
|
||||||
|
|
||||||
# Append on u-value estimates
|
# Append on u-value estimates
|
||||||
model_data = model_data.merge(
|
model_data = model_data.merge(
|
||||||
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]],
|
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
|
||||||
|
columns={"thermal_transmittance": "walls_u_value", }
|
||||||
|
),
|
||||||
how="left",
|
how="left",
|
||||||
left_on="walls-description",
|
left_on="walls-description",
|
||||||
right_on="original_description"
|
right_on="original_description"
|
||||||
|
) \
|
||||||
|
.drop(columns=["original_description"]) \
|
||||||
|
.merge(
|
||||||
|
pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
|
||||||
|
columns={"thermal_transmittance": "floor_u_value", }
|
||||||
|
),
|
||||||
|
how="left",
|
||||||
|
left_on="floor-description",
|
||||||
|
right_on="original_description"
|
||||||
)
|
)
|
||||||
# Take just entries with U-values
|
# Take just entries with U-values
|
||||||
model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
|
model_data = model_data[
|
||||||
model_data = model_data[base_features + ["thermal_transmittance", response]]
|
~pd.isnull(model_data["walls_u_value"]) &
|
||||||
|
~pd.isnull(model_data["floor_u_value"])
|
||||||
|
]
|
||||||
|
model_data = model_data[
|
||||||
|
base_features + [c for c in component_features if c not in [
|
||||||
|
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
|
||||||
|
]
|
||||||
|
|
||||||
# We need to split the data into a train and test set for model build
|
# We need to split the data into a train and test set for model build
|
||||||
categorical_cols = [
|
categorical_cols = [
|
||||||
"property-type", "built-form", "number-habitable-rooms", "constituency",
|
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
|
||||||
|
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# If these categorical variables are not of type 'category', convert them
|
# If these categorical variables are not of type 'category', convert them
|
||||||
|
|
@ -325,7 +350,7 @@ def handler():
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||||
median_absolute_error
|
median_absolute_error, mean_absolute_percentage_error
|
||||||
|
|
||||||
def calculate_regression_metrics(y_true, y_pred, n=20):
|
def calculate_regression_metrics(y_true, y_pred, n=20):
|
||||||
"""
|
"""
|
||||||
|
|
@ -340,11 +365,14 @@ def handler():
|
||||||
"""
|
"""
|
||||||
metrics = {}
|
metrics = {}
|
||||||
|
|
||||||
|
metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
|
||||||
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
|
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
|
||||||
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
|
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
|
||||||
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
metrics['R2 Score'] = r2_score(y_true, y_pred)
|
||||||
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
|
||||||
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
|
||||||
|
metrics['Mean True Value'] = y_true.mean()
|
||||||
|
metrics['Mean Predicted Value'] = y_pred.mean()
|
||||||
|
|
||||||
errors = pd.DataFrame()
|
errors = pd.DataFrame()
|
||||||
errors['Fit'] = y_true
|
errors['Fit'] = y_true
|
||||||
|
|
@ -358,8 +386,44 @@ def handler():
|
||||||
|
|
||||||
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
|
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
|
||||||
|
|
||||||
worst_x = model_data[model_data.index.isin(worst_errors.index)]
|
model_data['fit'] = results.fittedvalues
|
||||||
# The worst errors over-index heavily for flats
|
# The worst errors over-index heavily for flats
|
||||||
|
worst_x = model_data[model_data.index.isin(worst_errors.index)]
|
||||||
|
|
||||||
|
# Notes
|
||||||
|
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
|
||||||
|
# so that we have more data.
|
||||||
|
# TODO: Add in the u-values for roofs rather than the description
|
||||||
|
# TODO: Add in the actual property features for walls, floors, roof, not just the u-value
|
||||||
|
# TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
|
||||||
|
#
|
||||||
|
# property type looks okay - we're definitely low on the number of bungalows
|
||||||
|
# number-habitable-rooms & number-heated-rooms are unpopulated so pretty useless atm
|
||||||
|
# **** constituency should be looked at - potentially modelled individually as some constituencies
|
||||||
|
# perform much worse than others despite enough data.
|
||||||
|
# **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type
|
||||||
|
# and a few of the categories just have barely any data and poor scores
|
||||||
|
# **** windows-description again most of the properties are of the same type, need more samples
|
||||||
|
# for the smaller groups
|
||||||
|
# **** Turn roof into U-value
|
||||||
|
# **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
|
||||||
|
# MAPE though.
|
||||||
|
|
||||||
|
grouped_error = []
|
||||||
|
groupby = ["mainheat-description"]
|
||||||
|
for group, data in model_data.groupby(groupby, observed=True):
|
||||||
|
group_fit_error, _ = calculate_regression_metrics(y_true=data[response].astype(float), y_pred=data["fit"])
|
||||||
|
# plot_regression(pd.DataFrame({"fit": data["fit"].values, "actual": data[response].astype(float).values}))
|
||||||
|
grouped_error.append(
|
||||||
|
{
|
||||||
|
**dict(zip(groupby, group)),
|
||||||
|
"n_samples": data.shape[0],
|
||||||
|
**group_fit_error,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
grouped_error = pd.DataFrame(grouped_error)
|
||||||
|
grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
|
||||||
|
|
||||||
fit_df = pd.DataFrame(
|
fit_df = pd.DataFrame(
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -15,9 +15,9 @@ def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdo
|
||||||
# Note: We can only make 10k queries for a single set of search queries.
|
# Note: We can only make 10k queries for a single set of search queries.
|
||||||
# It might make sense to download data via zip for machine learning since we don't need this
|
# It might make sense to download data via zip for machine learning since we don't need this
|
||||||
# data to be perfectly up to date
|
# data to be perfectly up to date
|
||||||
if search_resp is None:
|
if not search_resp:
|
||||||
break
|
break
|
||||||
|
|
||||||
n_completed += 1
|
n_completed += 1
|
||||||
|
|
||||||
results.extend(search_resp["rows"])
|
results.extend(search_resp["rows"])
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
epc-api-python
|
epc-api-python==1.0.2
|
||||||
python-dotenv
|
python-dotenv
|
||||||
tqdm
|
tqdm
|
||||||
pandas
|
pandas
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue