From ba201c8b6a67d8786d4b51b1ae4c77d5dee35eef Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Mon, 3 Jul 2023 14:39:46 +0100
Subject: [PATCH] working on sap model but need to clean lighting

---
Review notes (ignored by `git am`):
* _clean_numericals: the np.where() fallback now reads model_data[col]; it was
  hard-coded to model_data["photo-supply"], so "multi-glaze-proportion" and
  "low-energy-lighting" were overwritten with photo-supply values.
* Line breaks and indentation were restored from a whitespace-collapsed copy
  of this patch; hunk line counts match the headers, but context whitespace is
  a best-effort reconstruction (use `git apply --ignore-whitespace` if needed).
* Still open: "is_rdsap" is created in _convert_transaction_type but is not in
  the `features` list built in create_dataset, so it is dropped again;
  "mainheat-description" is listed twice in CATEGORICAL_COLS; the module-level
  `self = SalModel(...)` at the end of SapModel.py runs at import time.

 model_data/analysis/SapModel.py | 158 +++++++++++++++++++++++++-------
 model_data/app.py               |  20 ----
 2 files changed, 127 insertions(+), 51 deletions(-)

diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py
index 3d84d193..4832a2e8 100644
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@@ -6,7 +6,7 @@ import pickle
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
     median_absolute_error, mean_absolute_percentage_error
 
-with open("all_data.pkl", "wb") as f:
+with open("all_data.pkl", "rb") as f:
     all_data = pickle.load(f)
 
 
@@ -18,25 +18,53 @@ class SalModel:
     BASE_FEATURES = [
         "property-type",
         "built-form",
-        # "construction-age-band",
+        "construction-age-band",
         "number-habitable-rooms",
         "constituency",
         "number-heated-rooms",
+        "transaction-type"
     ]
 
     COMPONENT_FEATURES = [
         "walls-description",
         "floor-description",
         "lighting-description",
-        "windows-description",
         "roof-description",
         "mainheat-description",
-        "main-fuel"
+        "hotwater-description",
+        "main-fuel",
+        "mechanical-ventilation",
+        "secondheat-description",
+        "energy-tariff",
+        "solar-water-heating-flag",
+        "photo-supply",
+        "windows-description",
+        "glazed-type",
+        "glazed-area",
+        "multi-glaze-proportion",
+        # "lighting-description" # Might not need to use this
+        "low-energy-lighting",
+        "number-open-fireplaces",
     ]
 
     CATEGORICAL_COLS = [
-        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
-        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
+        "property-type",
+        "built-form",
+        "number-habitable-rooms",
+        "constituency",
+        "number-heated-rooms",
+        "lighting-description",
+        "mainheat-description",
+        "hotwater-description",
+        "main-fuel",
+        "mechanical-ventilation",
+        "secondheat-description",
+        "energy-tariff",
+        "solar-water-heating-flag",
+        "windows-description",
+        "glazed-type",
+        "glazed-area",
+        "mainheat-description",
     ]
 
 
@@ -51,41 +79,103 @@ class SalModel:
         self.fit_error = None
         self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
 
+    def _append_extracted_u_values(self, model_data):
+        """
+        We need to estimate the u-value impact for:
+        1) Walls
+        2) Roof
+        3) Floors
+        """
+
+        wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[
+            ["original_description", "thermal_transmittance"]].rename(
+            columns={"thermal_transmittance": "walls_u_value"}
+        )
+
+        floor_u_values = pd.DataFrame(self.cleaner.cleaned["floor-description"])[
+            ["original_description", "thermal_transmittance"]].rename(
+            columns={"thermal_transmittance": "floor_u_value"}
+        )
+
+        roof_u_values = pd.DataFrame(self.cleaner.cleaned["roof-description"])[
+            ["original_description", "thermal_transmittance"]].rename(
+            columns={"thermal_transmittance": "roof_u_value", }
+        )
+
+        model_data = model_data.merge(
+            wall_u_values,
+            how="left",
+            left_on="walls-description",
+            right_on="original_description"
+        ).drop(
+            columns=["original_description"]
+        ).merge(
+            floor_u_values,
+            how="left",
+            left_on="floor-description",
+            right_on="original_description"
+        ).drop(
+            columns=["original_description"]
+        ).merge(
+            roof_u_values,
+            how="left",
+            left_on="roof-description",
+            right_on="original_description"
+        )
+
+        return model_data
+
+    @staticmethod
+    def _convert_transaction_type(model_data):
+        model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
+        model_data = model_data.drop(columns=["transaction-type"])
+        return model_data
+
+    @staticmethod
+    def _clean_numericals(model_data):
+
+        for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting"]:
+            model_data[col] = np.where(
+                model_data[col] == "", "0", model_data[col]
+            ).astype(float)
+
+        # We need to clean lighting
+
+        return model_data
+
     def create_dataset(self):
         model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
         model_data = model_data.reset_index(drop=True)
         model_data["idx"] = model_data.index.copy()
 
-        # Append on u-value estimates
-        model_data = model_data.merge(
-            pd.DataFrame(self.cleaner.cleaned["walls-description"])[
-                ["original_description", "thermal_transmittance"]].rename(
-                columns={"thermal_transmittance": "walls_u_value", }
-            ),
-            how="left",
-            left_on="walls-description",
-            right_on="original_description"
-        ) \
-            .drop(columns=["original_description"]) \
-            .merge(
-            pd.DataFrame(self.cleaner.cleaned["floor-description"])[
-                ["original_description", "thermal_transmittance"]].rename(
-                columns={"thermal_transmittance": "floor_u_value", }
-            ),
-            how="left",
-            left_on="floor-description",
-            right_on="original_description"
-        )
+        # Append on u-values
+        model_data = self._append_extracted_u_values(model_data)
+
+        # Convert transaction_type
+        model_data = self._convert_transaction_type(model_data)
+
+        # Clean numerical columns
+        model_data = self._clean_numericals(model_data)
 
+        # Take just entries with U-values
+        # TODO: Rather than doing this, do we want to include the estimated u-values?
+        # Since this ends up with just 2k entries
         model_data = model_data[
             ~pd.isnull(model_data["walls_u_value"]) &
-            ~pd.isnull(model_data["floor_u_value"])
-        ]
-        model_data = model_data[
-            self.BASE_FEATURES + [c for c in self.COMPONENT_FEATURES if c not in [
-                "walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", self.RESPONSE]
+            ~pd.isnull(model_data["floor_u_value"]) &
+            ~pd.isnull(model_data["roof_u_value"])
         ]
 
+        exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"]
+
+        features = [
+            x for x in self.BASE_FEATURES +
+            self.COMPONENT_FEATURES +
+            ["walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE] if x not in exclude_features
+        ]
+
+        model_data = model_data[features]
+
         for col in self.CATEGORICAL_COLS:
             model_data[col] = model_data[col].astype('category')
 
@@ -168,3 +258,9 @@ class SalModel:
         worst_errors = errors.nlargest(n, 'Absolute Residual')
 
         return metrics, worst_errors
+
+
+self = SalModel(
+    data=all_data["data"],
+    cleaner=all_data["cleaner"]
+)
diff --git a/model_data/app.py b/model_data/app.py
index c0159e1d..2fcf48a9 100644
--- a/model_data/app.py
+++ b/model_data/app.py
@@ -265,26 +265,6 @@ def handler():
 
     import numpy as np
 
-    # Notes
-    # TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
-    #   so that we have move data.
-    # TODO: Add in the u-values for roofs rather than the description
-    # TODO: Add in the actual property features for walls, floors, roof, not just the u-value
-    # TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
-    # TODO: Remove cases where descriptions have no data or are error cases
-    #
-    # property type looks okay - we're definitely low on the number of bungalows
-    # number-habitable-rooms & number-heated-rooms is unpopulated so pretty useless atm
-    # **** constituency should be looked at - potentially modelled individually as some constituencies
-    #   peform much worse that others despite enough data.
-    # **** Lighting is a bit of mess - needs to be looked at. Most properties are of the same type
-    #   and a few of the categories just have barely any data and poor scores
-    # **** windows-description again most of the properties are of the same type, need more samples
-    #   for thge smaller groups
-    # **** Turn roof into U-value
-    # **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
-    #   MAPE though.
-
     grouped_error = []
     groupby = ["mainheat-description"]
     for group, data in model_data.groupby(groupby, observed=True):