From 44099f8d8362847a56ea418d6a4b41ca8739876c Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Thu, 3 Aug 2023 12:09:37 +0100
Subject: [PATCH] Working on simulation system data

---
 .idea/Model.iml                     |   2 +-
 .idea/misc.xml                      |   2 +-
 model_data/simulation_system/app.py | 137 ++++++++++++++++++++++++++--
 3 files changed, 130 insertions(+), 11 deletions(-)
diff --git a/.idea/Model.iml b/.idea/Model.iml
index 05b9012b..b03b31b1 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 3b05c6ac..ca0e1cd9 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>
diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index 15902d19..d846dcf7 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -49,10 +49,56 @@ COMPONENT_FEATURES = [
     'EXTENSION_COUNT'
 ]
 
+# For these fields, we take an average if we have multiple values
 AVERAGE_FIXED_FEATURES = [
-    "TOTAL_FLOOR_AREA"
+    "TOTAL_FLOOR_AREA",
+    "FLOOR_HEIGHT"
 ]
 
+# For these fields, we take the latest value if we have multiple values
+# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
+# the most accurate
+LATEST_FIELD = [
+    "NUMBER_HABITABLE_ROOMS",
+    "NUMBER_HEATED_ROOMS",
+    "FIXED_LIGHTING_OUTLETS_COUNT",
+    "CONSTRUCTION_AGE_BAND"
+]
+
+# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
+MANDATORY_FIXED_FEATURES = [
+    "PROPERTY_TYPE",
+]
+
+# For particularly old EPC data, we have inconsistent records, so we only include EPCs lodged on
+# or after EARLIEST_EPC_DATE. SAP09 was introduced in 2009 and SAP12 was later introduced in
+# England and Wales from 31 July 2014
+EARLIEST_EPC_DATE = "2014-08-01"
+
+RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
+HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
+
+
+def make_cleaning_averages(df):
+    cleaning_averages = df.groupby(
+        ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]
+    )[AVERAGE_FIXED_FEATURES].mean().reset_index()
+
+    return cleaning_averages
+
+
+FLOOR_LEVEL_MAP = {
+    "00": 0,
+    "01": 1,
+    "02": 2,
+    "03": 3,
+    "Basement": -1,
+    "Ground": 0,
+    "1st": 1,
+    "2nd": 2,
+    "3rd": 3,
+}
+
 
 def app():
     # Get all the files in the directory
@@ -63,6 +109,15 @@ def app():
         filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
         df = pd.read_csv(filepath, low_memory=False)
         df = df[~pd.isnull(df["UPRN"])]
+
+        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
+
+        cleaning_averages = make_cleaning_averages(df)
+
+        # We remove EPCS that were conducted for a new build, since these are performed with
+        # full SAP, which produces different results to the RdSAP methodology
+        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
+
         df["UPRN"] = df["UPRN"].astype(int).astype(str)
         counts = df.groupby("UPRN").size().reset_index()
         counts.columns = ["UPRN", "count"]
@@ -73,32 +128,70 @@ def app():
         df = df[df["UPRN"].isin(counts["UPRN"])]
         df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
 
-        for uprn, property_data in df.groupby("UPRN"):
+        results = []
+        for uprn, property_data in df.groupby("UPRN", observed=True):
 
             # Fixed features - these are property attributes that shouldn't change over time
 
+            ignore_epc = False
             fixed_data = {}
             for field in FIXED_FEATURES:
                 vals = property_data[field].dropna().unique()
                 # Remove invalid values
                 vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
 
-                if len(vals) > 1:
-                    raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
+                if field == "FLOOR_LEVEL":
+                    vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
 
                 if field in AVERAGE_FIXED_FEATURES:
-                    # Check the values are too far apart
-                    if abs(vals[0] - vals[1]) / vals[0] > 0.1:
-                        raise ValueError("Large deviation in fixed feature {} - fix me".format(field))
 
-                    field_value = np.mean(vals)
+                    if len(vals) > 1:
+                        # Check whether the first two values differ by more than 10% of the first
+                        if abs(vals[0] - vals[1]) / vals[0] > 0.1:
+                            # Take the more recent value since it's likely to be more accurate
+                            vals = [vals[-1]]
+
+                    if vals:
+                        field_value = np.mean(vals)
+                    else:
+                        # Clean using averages
+                        avgs = cleaning_averages[
+                            (cleaning_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
+                            (cleaning_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) &
+                            (cleaning_averages["CONSTRUCTION_AGE_BAND"] == property_data["CONSTRUCTION_AGE_BAND"].iloc[
+                                0]) &
+                            (cleaning_averages["NUMBER_HABITABLE_ROOMS"] ==
+                             property_data["NUMBER_HABITABLE_ROOMS"].iloc[0]) &
+                            (cleaning_averages["NUMBER_HEATED_ROOMS"] == property_data["NUMBER_HEATED_ROOMS"].iloc[0])
+                            ]
+
+                        field_value = avgs[field].iloc[0]
+
+                elif field in LATEST_FIELD:
+                    field_value = vals[-1] if vals else None
                 else:
+                    if len(vals) > 1:
+                        if field in MANDATORY_FIXED_FEATURES:
+                            ignore_epc = True
+                        else:
+                            raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
+
                     field_value = vals[0] if vals else None
 
                 fixed_data[field] = field_value
 
-            variable_data = property_data[COMPONENT_FEATURES]
+            if ignore_epc:
+                continue
 
+            # We include the lodgement date here as we probably need to factor time into the
+            # model, since EPC standards and rigour have changed over time
+            variable_data = property_data[
+                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
+                ]
+
+            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
+            # e.g. first vs second, second vs third and also first vs third
+            property_model_data = []
             for idx in range(0, property_data.shape[0] - 1):
 
                 if idx >= property_data.shape[0] - 1:
@@ -106,3 +199,29 @@ def app():
 
                 starting_record = variable_data.iloc[idx]
                 ending_record = variable_data.iloc[idx + 1]
+                rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
+                heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
+
+                if rdsap_change == 0:
+                    # Assumption: We aren't interested in records that exhibit no change
+                    continue
+
+                # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
+                #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
+                #       within descriptions
+
+                starting_record = starting_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+                ending_record = ending_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+
+                features = pd.concat([starting_record, ending_record])
+
+                property_model_data.append(
+                    {
+                        "UPRN": uprn,
+                        "RDSAP_CHANGE": rdsap_change,
+                        "HEAT_DEMAND_CHANGE": heat_demand_change,
+                        **features.to_dict()
+                    }
+                )
+
+            results.extend(property_model_data)