From ba201c8b6a67d8786d4b51b1ae4c77d5dee35eef Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 3 Jul 2023 14:39:46 +0100
Subject: [PATCH] working on sap model but need to clean lighting

---
 model_data/analysis/SapModel.py | 158 +++++++++++++++++++++++++-------
 model_data/app.py               |  20 ----
 2 files changed, 127 insertions(+), 51 deletions(-)

diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py
index 3d84d193..4832a2e8 100644
--- a/model_data/analysis/SapModel.py
+++ b/model_data/analysis/SapModel.py
@@ -6,7 +6,7 @@ import pickle
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
     median_absolute_error, mean_absolute_percentage_error
 
-with open("all_data.pkl", "wb") as f:
+with open("all_data.pkl", "rb") as f:
     all_data = pickle.load(f)
 
 
@@ -18,25 +18,53 @@ class SalModel:
     BASE_FEATURES = [
         "property-type",
         "built-form",
-        # "construction-age-band",
+        "construction-age-band",
         "number-habitable-rooms",
         "constituency",
         "number-heated-rooms",
+        "transaction-type"
     ]
 
     COMPONENT_FEATURES = [
         "walls-description",
         "floor-description",
         "lighting-description",
-        "windows-description",
         "roof-description",
         "mainheat-description",
-        "main-fuel"
+        "hotwater-description",
+        "main-fuel",
+        "mechanical-ventilation",
+        "secondheat-description",
+        "energy-tariff",
+        "solar-water-heating-flag",
+        "photo-supply",
+        "windows-description",
+        "glazed-type",
+        "glazed-area",
+        "multi-glaze-proportion",
+        # "lighting-description"  # Might not need to use this
+        "low-energy-lighting",
+        "number-open-fireplaces",
     ]
 
     CATEGORICAL_COLS = [
-        "property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
-        "lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
+        "property-type",
+        "built-form",
+        "number-habitable-rooms",
+        "constituency",
+        "number-heated-rooms",
+        "lighting-description",
+        "mainheat-description",
+        "hotwater-description",
+        "main-fuel",
+        "mechanical-ventilation",
+        "secondheat-description",
+        "energy-tariff",
+        "solar-water-heating-flag",
+        "windows-description",
+        "glazed-type",
+        "glazed-area",
+        "mainheat-description",
 
     ]
 
@@ -51,41 +79,103 @@ class SalModel:
         self.fit_error = None
         self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
 
+    def _append_extracted_u_values(self, model_data):
+        """
+        Append the extracted u-value (thermal transmittance) for each of:
+        1) Walls
+        2) Roof
+        3) Floors
+
+        The cleaned "<element>-description" records held by self.cleaner are
+        left-merged onto model_data by description text, adding one
+        "<element>_u_value" column per element. Descriptions with no cleaned
+        record end up with NaN in that column.
+
+        NOTE(review): assumes each original_description is unique within the
+        cleaned records; duplicates would multiply rows in the merge - confirm.
+
+        :param model_data: DataFrame containing the "walls-description",
+            "floor-description" and "roof-description" columns.
+        :return: a new DataFrame with "walls_u_value", "floor_u_value" and
+            "roof_u_value" appended; the description columns are kept.
+        """
+
+        # Merged in this order, so the new columns appear as walls, floor, roof
+        for element in ["walls", "floor", "roof"]:
+            description_col = f"{element}-description"
+
+            # Lookup table: original description -> extracted thermal transmittance
+            u_values = pd.DataFrame(self.cleaner.cleaned[description_col])[
+                ["original_description", "thermal_transmittance"]
+            ].rename(
+                columns={"thermal_transmittance": f"{element}_u_value"}
+            )
+
+            model_data = model_data.merge(
+                u_values,
+                how="left",
+                left_on=description_col,
+                right_on="original_description"
+            ).drop(
+                # The right-hand join key is redundant once merged. Drop it
+                # after every merge: the roof merge used to leave it behind,
+                # unlike the walls and floor merges.
+                columns=["original_description"]
+            )
+
+        return model_data
+
+    @staticmethod
+    def _convert_transaction_type(model_data):
+        """Swap "transaction-type" for a boolean "is_rdsap" column (True unless "new dwelling")."""
+        model_data["is_rdsap"] = model_data["transaction-type"].ne("new dwelling")  # added on the input frame
+        return model_data.drop(columns=["transaction-type"])
+
+    @staticmethod
+    def _clean_numericals(model_data):
+        """Cast the numeric columns to float on model_data, treating empty strings as 0; returns model_data."""
+        for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting"]:
+            model_data[col] = np.where(
+                model_data[col] == "", "0", model_data[col]  # fall back to `col` itself, not always "photo-supply"
+            ).astype(float)
+
+        # TODO: We need to clean lighting
+
+        return model_data
+
     def create_dataset(self):
         model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
         model_data = model_data.reset_index(drop=True)
         model_data["idx"] = model_data.index.copy()
 
-        # Append on u-value estimates
-        model_data = model_data.merge(
-            pd.DataFrame(self.cleaner.cleaned["walls-description"])[
-                ["original_description", "thermal_transmittance"]].rename(
-                columns={"thermal_transmittance": "walls_u_value", }
-            ),
-            how="left",
-            left_on="walls-description",
-            right_on="original_description"
-        ) \
-            .drop(columns=["original_description"]) \
-            .merge(
-            pd.DataFrame(self.cleaner.cleaned["floor-description"])[
-                ["original_description", "thermal_transmittance"]].rename(
-                columns={"thermal_transmittance": "floor_u_value", }
-            ),
-            how="left",
-            left_on="floor-description",
-            right_on="original_description"
-        )
+        # Append on u-values
+        model_data = self._append_extracted_u_values(model_data)
+
+        # Convert transaction_type
+        model_data = self._convert_transaction_type(model_data)
+
+        # Clean numerical columns
+        model_data = self._clean_numericals(model_data)
+
         # Take just entries with U-values
+        # TODO: Rather than doing this, do we want to include the estimated u-values?
+        #       Since this ends up with just 2k entries
         model_data = model_data[
             ~pd.isnull(model_data["walls_u_value"]) &
-            ~pd.isnull(model_data["floor_u_value"])
-            ]
-        model_data = model_data[
-            self.BASE_FEATURES + [c for c in self.COMPONENT_FEATURES if c not in [
-                "walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", self.RESPONSE]
+            ~pd.isnull(model_data["floor_u_value"]) &
+            ~pd.isnull(model_data["roof_u_value"])
             ]
 
+        exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"]
+
+        features = [
+            x for x in self.BASE_FEATURES +
+                       self.COMPONENT_FEATURES +
+                       ["walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE] if x not in exclude_features
+        ]
+
+        model_data = model_data[features]
+
         for col in self.CATEGORICAL_COLS:
             model_data[col] = model_data[col].astype('category')
 
@@ -168,3 +258,9 @@ class SalModel:
         worst_errors = errors.nlargest(n, 'Absolute Residual')
 
         return metrics, worst_errors
+
+
+self = SalModel(  # NOTE(review): module-level instance named `self`, built whenever this module is imported/run - looks like debugging scaffolding, confirm it should be committed
+    data=all_data["data"],
+    cleaner=all_data["cleaner"]
+)
diff --git a/model_data/app.py b/model_data/app.py
index c0159e1d..2fcf48a9 100644
--- a/model_data/app.py
+++ b/model_data/app.py
@@ -265,26 +265,6 @@ def handler():
 
     import numpy as np
 
-    # Notes
-    # TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
-    # so that we have move data.
-    # TODO: Add in the u-values for roofs rather than the description
-    # TODO: Add in the actual property features for walls, floors, roof, not just the u-value
-    # TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
-    # TODO: Remove cases where descriptions have no data or are error cases
-    #
-    # property type looks okay - we're definitely low on the number of bungalows
-    # number-habitable-rooms & number-heated-rooms is unpopulated so pretty useless atm
-    # **** constituency should be looked at - potentially modelled individually as some constituencies
-    # peform much worse that others despite enough data.
-    # **** Lighting is a bit of mess - needs to be looked at. Most properties are of the same type
-    # and a few of the categories just have barely any data and poor scores
-    # **** windows-description again most of the properties are of the same type, need more samples
-    # for thge smaller groups
-    # **** Turn roof into U-value
-    # **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
-    #      MAPE though.
-
     grouped_error = []
     groupby = ["mainheat-description"]
     for group, data in model_data.groupby(groupby, observed=True):