diff --git a/model_data/app.py b/model_data/app.py index 235ebfc1..0a6055f1 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -77,12 +77,12 @@ def handler(): # We pull properties from local authorities, by property type. This will allow us to build # a dataset of up to 10k properties per local authority/property type combination data = [] - for la in tqdm(local_authorities): + for c in tqdm(constituencies): for pt in property_types: data.extend( pagenated_epc_download( client=epc_client, - params={"local-authority": la, "property-type": pt}, + params={"constituency": c, "property-type": pt}, page_size=5000, n_pages=10, ) @@ -240,11 +240,17 @@ def handler(): # "construction-age-band", "number-habitable-rooms", "constituency", + "number-heated-rooms", ] component_features = [ "walls-description", "floor-description", + "lighting-description", + "windows-description", + "roof-description", + "mainheat-description", + "main-fuel" ] model_data = df[[response] + component_features + base_features] @@ -253,18 +259,37 @@ def handler(): # Append on u-value estimates model_data = model_data.merge( - pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]], + pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename( + columns={"thermal_transmittance": "walls_u_value", } + ), how="left", left_on="walls-description", right_on="original_description" + ) \ + .drop(columns=["original_description"]) \ + .merge( + pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename( + columns={"thermal_transmittance": "floor_u_value", } + ), + how="left", + left_on="floor-description", + right_on="original_description" ) # Take just entries with U-values - model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])] - model_data = model_data[base_features + ["thermal_transmittance", response]] + model_data = model_data[ + 
~pd.isnull(model_data["walls_u_value"]) & + ~pd.isnull(model_data["floor_u_value"]) + ] + model_data = model_data[ + base_features + [c for c in component_features if c not in [ + "walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response] + ] # We need to split the data into a train and test set for model build categorical_cols = [ - "property-type", "built-form", "number-habitable-rooms", "constituency", + "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms", + "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel", + ] # If these categorical variables are not of type 'category', convert them @@ -325,7 +350,7 @@ def handler(): import numpy as np from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ - median_absolute_error + median_absolute_error, mean_absolute_percentage_error def calculate_regression_metrics(y_true, y_pred, n=20): """ @@ -340,11 +365,14 @@ def handler(): """ metrics = {} + metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred) metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred) metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred) metrics['R2 Score'] = r2_score(y_true, y_pred) metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred) metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred) + metrics['Mean True Value'] = y_true.mean() + metrics['Mean Predicted Value'] = y_pred.mean() errors = pd.DataFrame() errors['Fit'] = y_true @@ -358,8 +386,44 @@ def handler(): fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues) - worst_x = model_data[model_data.index.isin(worst_errors.index)] + model_data['fit'] = results.fittedvalues # The worst errors over index heavily for flats + worst_x = model_data[model_data.index.isin(worst_errors.index)] + + # Notes + # TODO: We 
might want to look at adding in the u-value estimates for the properties that do not have them + # so that we have more data. + # TODO: Add in the u-values for roofs rather than the description + # TODO: Add in the actual property features for walls, floors, roof, not just the u-value + # TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type? + # + # property type looks okay - we're definitely low on the number of bungalows + # number-habitable-rooms & number-heated-rooms are unpopulated so pretty useless atm + # **** constituency should be looked at - potentially modelled individually as some constituencies + # perform much worse than others despite enough data. + # **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type + # and a few of the categories just have barely any data and poor scores + # **** windows-description again most of the properties are of the same type, need more samples + # for the smaller groups + # **** Turn roof into U-value + # **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for + # MAPE though. 
+ + grouped_error = [] + groupby = ["mainheat-description"] + for group, data in model_data.groupby(groupby, observed=True): + group_fit_error, _ = calculate_regression_metrics(y_true=data[response].astype(float), y_pred=data["fit"]) + # plot_regression(pd.DataFrame({"fit": data["fit"].values, "actual": data[response].astype(float).values})) + grouped_error.append( + { + **dict(zip(groupby, group)), + "n_samples": data.shape[0], + **group_fit_error, + } + ) + + grouped_error = pd.DataFrame(grouped_error) + grouped_error = grouped_error.sort_values("R2 Score", ascending=True) fit_df = pd.DataFrame( { diff --git a/model_data/downloader.py b/model_data/downloader.py index 7dcc84ba..5355367b 100644 --- a/model_data/downloader.py +++ b/model_data/downloader.py @@ -15,9 +15,9 @@ def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdo # Note: We can only make 10k queries for a single set of search queries. # It might make sense to download data via zip for machine learning since we don't need this # data to be perfectly up to date - if search_resp is None: + if not search_resp: break - + n_completed += 1 results.extend(search_resp["rows"]) diff --git a/model_data/requirements.txt b/model_data/requirements.txt index d1dfdd73..126c63ed 100644 --- a/model_data/requirements.txt +++ b/model_data/requirements.txt @@ -1,4 +1,4 @@ -epc-api-python +epc-api-python==1.0.2 python-dotenv tqdm pandas