Added new epc-api-python version and increased amount of data being used

2026-07-27 23:35:01 +00:00 · 2023-07-01 16:02:35 +01:00 · 2023-07-01 16:02:35 +01:00 · 7724c216a8
commit 7724c216a8
parent 8c55df82fa
3 changed files with 75 additions and 11 deletions
--- a/model_data/app.py
+++ b/model_data/app.py
@ -77,12 +77,12 @@ def handler():
    # We pull properties from local authorities, by property type. This will allow us to build
    # a dataset of up to 10k properties per local authority/property type combination
    data = []
-    for la in tqdm(local_authorities):
+    for c in tqdm(constituencies):
        for pt in property_types:
            data.extend(
                pagenated_epc_download(
                    client=epc_client,
-                    params={"local-authority": la, "property-type": pt},
+                    params={"constituency": c, "property-type": pt},
                    page_size=5000,
                    n_pages=10,
                )
@ -240,11 +240,17 @@ def handler():
        # "construction-age-band",
        "number-habitable-rooms",
        "constituency",
+        "number-heated-rooms",
    ]

    component_features = [
        "walls-description",
        "floor-description",
+        "lighting-description",
+        "windows-description",
+        "roof-description",
+        "mainheat-description",
+        "main-fuel"
    ]

    model_data = df[[response] + component_features + base_features]
@ -253,18 +259,37 @@ def handler():

    # Append on u-value estimates
    model_data = model_data.merge(
-        pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]],
+        pd.DataFrame(cleaner.cleaned["walls-description"])[["original_description", "thermal_transmittance"]].rename(
+            columns={"thermal_transmittance": "walls_u_value", }
+        ),
        how="left",
        left_on="walls-description",
        right_on="original_description"
+    ) \
+        .drop(columns=["original_description"]) \
+        .merge(
+        pd.DataFrame(cleaner.cleaned["floor-description"])[["original_description", "thermal_transmittance"]].rename(
+            columns={"thermal_transmittance": "floor_u_value", }
+        ),
+        how="left",
+        left_on="floor-description",
+        right_on="original_description"
    )
    # Take just entries with U-values
-    model_data = model_data[~pd.isnull(model_data["thermal_transmittance"])]
-    model_data = model_data[base_features + ["thermal_transmittance", response]]
+    model_data = model_data[
+        ~pd.isnull(model_data["walls_u_value"]) &
+        ~pd.isnull(model_data["floor_u_value"])
+        ]
+    model_data = model_data[
+        base_features + [c for c in component_features if c not in [
+            "walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", response]
+        ]

    # We need to split the data into a train and test set for model build
    categorical_cols = [
-        "property-type", "built-form", "number-habitable-rooms", "constituency",
+        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
+        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
+
    ]

    # If these categorical variables are not of type 'category', convert them
@ -325,7 +350,7 @@ def handler():

    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
-        median_absolute_error
+        median_absolute_error, mean_absolute_percentage_error

    def calculate_regression_metrics(y_true, y_pred, n=20):
        """
@ -340,11 +365,14 @@ def handler():
        """
        metrics = {}

+        metrics['MAPE'] = mean_absolute_percentage_error(y_true, y_pred)
        metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
        metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
        metrics['R2 Score'] = r2_score(y_true, y_pred)
        metrics['Explained Variance Score'] = explained_variance_score(y_true, y_pred)
        metrics['Median Absolute Error'] = median_absolute_error(y_true, y_pred)
+        metrics['Mean True Value'] = y_true.mean()
+        metrics['Mean Predicted Value'] = y_pred.mean()

        errors = pd.DataFrame()
        errors['Fit'] = y_true
@ -358,8 +386,44 @@ def handler():

    fit_error, worst_errors = calculate_regression_metrics(y_true=Y, y_pred=results.fittedvalues)

-    worst_x = model_data[model_data.index.isin(worst_errors.index)]
+    model_data['fit'] = results.fittedvalues
    # The worst errors over index heavily for flats
+    worst_x = model_data[model_data.index.isin(worst_errors.index)]
+
+    # Notes
+    # TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
+    # so that we have more data.
+    # TODO: Add in the u-values for roofs rather than the description
+    # TODO: Add in the actual property features for walls, floors, roof, not just the u-value
+    # TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
+    #
+    # property type looks okay - we're definitely low on the number of bungalows
+    # number-habitable-rooms & number-heated-rooms are unpopulated, so pretty useless atm
+    # **** constituency should be looked at - potentially modelled individually as some constituencies
+    # perform much worse than others despite enough data.
+    # **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type
+    # and a few of the categories just have barely any data and poor scores
+    # **** windows-description again most of the properties are of the same type, need more samples
+    # for the smaller groups
+    # **** Turn roof into U-value
+    # **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
+    #      MAPE though.
+
+    grouped_error = []
+    groupby = ["mainheat-description"]
+    for group, data in model_data.groupby(groupby, observed=True):
+        group_fit_error, _ = calculate_regression_metrics(y_true=data[response].astype(float), y_pred=data["fit"])
+        # plot_regression(pd.DataFrame({"fit": data["fit"].values, "actual": data[response].astype(float).values}))
+        grouped_error.append(
+            {
+                **dict(zip(groupby, group)),
+                "n_samples": data.shape[0],
+                **group_fit_error,
+            }
+        )
+
+    grouped_error = pd.DataFrame(grouped_error)
+    grouped_error = grouped_error.sort_values("R2 Score", ascending=True)

    fit_df = pd.DataFrame(
        {
--- a/model_data/downloader.py
+++ b/model_data/downloader.py
@ -15,9 +15,9 @@ def pagenated_epc_download(client, params, page_size, n_pages, verbose=0, slowdo
        # Note: We can only make 10k queries for a single set of search queries.
        # It might make sense to download data via zip for machine learning since we don't need this
        # data to be perfectly up to date
-        if search_resp is None:
+        if not search_resp:
            break
-            
+
        n_completed += 1

        results.extend(search_resp["rows"])
--- a/model_data/requirements.txt
+++ b/model_data/requirements.txt
@ -1,4 +1,4 @@
-epc-api-python
+epc-api-python==1.0.2
 python-dotenv
 tqdm
 pandas