Added new epc-api-python version and increased amount of data being used

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-01 16:02:35 +01:00
parent 8c55df82fa
commit 7724c216a8
3 changed files with 75 additions and 11 deletions

View file

@ -77,12 +77,12 @@ def handler():
# We pull properties from constituencies, by property type. This will allow us to build
# a dataset of up to 10k properties per constituency/property type combination
data = []
for la in tqdm(local_authorities):
for c in tqdm(constituencies):
for pt in property_types:
data.extend(
pagenated_epc_download(
client=epc_client,
params={"local-authority": la, "property-type": pt},
params={"constituency": c, "property-type": pt},
page_size=5000,
n_pages=10,
)
@ -240,11 +240,17 @@ def handler():
# "construction-age-band",
"number-habitable-rooms",
"constituency",
"number-heated-rooms",
]
component_features = [
"walls-description",
"floor-description",
"lighting-description",
"windows-description",
"roof-description",
"mainheat-description",
"main-fuel"
]
model_data = df[[response] + component_features + base_features]
@ -253,18 +259,37 @@ def handler():
# Append on u-value estimates
model_data = model_data.merge(
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]],
pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "walls_u_value", }
),
how="left",
left_on="walls-description",
right_on="original_description"
) \
.drop(columns=["original_description"]) \
.merge(
pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "floor_u_value", }
),
how="left",
left_on="floor-description",
right_on="original_description"
)
# Take just entries with U-values
model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
model_data = model_data[base_features + ["thermal_transmittance", response]]
model_data = model_data[
~pd.isnull(model_data["walls_u_value"]) &
~pd.isnull(model_data["floor_u_value"])
]
model_data = model_data[
base_features + [c for c in component_features if c not in [
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
]
# We need to split the data into a train and test set for model build
categorical_cols = [
"property-type", "built-form", "number-habitable-rooms", "constituency",
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
]
# If these categorical variables are not of type 'category', convert them
@ -325,7 +350,7 @@ def handler():
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error
median_absolute_error, mean_absolute_percentage_error
def calculate_regression_metrics(y_true, y_pred, n=20):
"""
@ -340,11 +365,14 @@ def handler():
"""
metrics = {}
metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
metrics['R2 Score'] = r2_score(y_true, y_pred)
metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
metrics['Mean True Value'] = y_true.mean()
metrics['Mean Predicted Value'] = y_pred.mean()
errors = pd.DataFrame()
errors['Fit'] = y_true
@ -358,8 +386,44 @@ def handler():
fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)
worst_x = model_data[model_data.index.isin(worst_errors.index)]
model_data['fit'] = results.fittedvalues
# The worst errors over-index heavily for flats
worst_x = model_data[model_data.index.isin(worst_errors.index)]
# Notes
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
# so that we have more data.
# TODO: Add in the u-values for roofs rather than the description
# TODO: Add in the actual property features for walls, floors, roof, not just the u-value
# TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
#
# property type looks okay - we're definitely low on the number of bungalows
# number-habitable-rooms & number-heated-rooms are unpopulated so pretty useless atm
# **** constituency should be looked at - potentially modelled individually as some constituencies
# perform much worse than others despite enough data.
# **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type
# and a few of the categories just have barely any data and poor scores
# **** windows-description again most of the properties are of the same type, need more samples
# for the smaller groups
# **** Turn roof into U-value
# **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
# MAPE though.
grouped_error = []
groupby = ["mainheat-description"]
for group, data in model_data.groupby(groupby, observed=True):
group_fit_error, _ = calculate_regression_metrics(y_true=data[response].astype(float), y_pred=data["fit"])
# plot_regression(pd.DataFrame({"fit": data["fit"].values, "actual": data[response].astype(float).values}))
grouped_error.append(
{
**dict(zip(groupby, group)),
"n_samples": data.shape[0],
**group_fit_error,
}
)
grouped_error = pd.DataFrame(grouped_error)
grouped_error = grouped_error.sort_values("R2 Score", ascending=True)
fit_df = pd.DataFrame(
{

View file

@ -15,9 +15,9 @@ def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdo
# Note: We can only make 10k queries for a single set of search queries.
# It might make sense to download data via zip for machine learning since we don't need this
# data to be perfectly up to date
if search_resp is None:
if not search_resp:
break
n_completed += 1
results.extend(search_resp["rows"])

View file

@ -1,4 +1,4 @@
epc-api-python
epc-api-python==1.0.2
python-dotenv
tqdm
pandas