From e516a6ac4151a24f77432473a7310442762a0309 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 6 Sep 2023 17:08:18 +0100
Subject: [PATCH] make sure all responses are positive, began adding temporal
 features

---
 model_data/simulation_system/core/Settings.py |  11 +-
 .../generate_rdsap_change.py                  | 156 ++++++------------
 2 files changed, 51 insertions(+), 116 deletions(-)

diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py
index 259acddd..030747ee 100644
--- a/model_data/simulation_system/core/Settings.py
+++ b/model_data/simulation_system/core/Settings.py
@@ -53,12 +53,6 @@ DEPLOYMENT_FOLDER = "deployment"
 TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
 
-# If we have multiple records for a numerical field, such as floor area,
-# we check the margine for error between the biggest and lowest values. If we see large
-# swings in measured values, we take the most recent value for this field as we interpret this
-# as inaccurate measurements in the past and use the most recent value
-MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1
-
 COLUMNS_TO_MERGE_ON = [
     "PROPERTY_TYPE",
     "BUILT_FORM",
@@ -109,12 +103,11 @@ COMPONENT_FEATURES = [
     "NUMBER_OPEN_FIREPLACES",
     "MAINHEATCONT_DESCRIPTION",
     "EXTENSION_COUNT",
+    "TOTAL_FLOOR_AREA",
+    "FLOOR_HEIGHT",
     # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
 ]
 
-# For these fields, we take an average if we have multiple values
-AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
-
 # For these fields, we take the latest value if we have multiple values
 # Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
 # the most accurate
diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py
index 80991e82..2d691d62 100644
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@@ -5,13 +5,11 @@ from tqdm import tqdm
 from pathlib import Path
 from simulation_system.core.Settings import (
     MANDATORY_FIXED_FEATURES,
-    AVERAGE_FIXED_FEATURES,
     LATEST_FIELD,
     COMPONENT_FEATURES,
     RDSAP_RESPONSE,
     HEAT_DEMAND_RESPONSE,
     COLUMNS_TO_MERGE_ON,
-    MULTIPLE_VALUES_MARGIN_FOR_ERROR,
 )
 from simulation_system.core.DataProcessor import DataProcessor
 from utils import save_dataframe_to_s3_parquet
@@ -19,9 +17,6 @@ from utils import save_dataframe_to_s3_parquet
 DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
 
 
-# TODO: Have a look at temporal features
-
-
 def app():
     # Get all the files in the directory
 
@@ -34,58 +29,20 @@ def app():
     dataset = []
     cleaning_dataset = []
 
-    # TODO: Does energy tariff make a difference
-    # TODO: If SAP hasn't changed, we don't include the record
-    # TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
-    # TODO: Same as floor area for floor height
-    # TODO: If fundamental building fabric changes, we should proabably discard the record
-    # TODO: Should we prune records that have an exceptionally large amount of time between them?
-    # TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
+    # TODO [x] : Does energy tariff make a difference
+    #           - leave for now but it may not
+    # TODO [x] : If SAP hasn't changed, we don't include the record
+    # TODO [x]: If SAP gets worse, in the vast majority of cases the building genuinely does look worse in
+    #           the newer EPC, so we can swap the order of the two records
+    # TODO [] : Have a look at temporal features
+    # TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
+    # TODO [x]: Same as floor area for floor height
+    # TODO []: If fundamental building fabric changes, we should probably discard the record
+    # TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
+    #           - leave for now and check performance after temporal features
+    # TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
+    #           - Leave for now
     #
-    # TODO: REMOVE ME
-    dodgy_uprns = []
-    observed_uprns = [
-        "10002082244",  # Doesn't really make sense, house no longer has lel and not has more insulation but lower score
-        "10002082259",
-        # Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the
-        # floor assessment is now assumed whereas before it wasnt
-        "10002082418",  # Walls went from insulated to not...
-        "10002082640",  # Property identical besides different energy taffiff
-        "10002082830",  # Lots of records going from not insulated to insulated but some parts of
-        # the property has gotten better
-        "10002083244",  # latest epc indicates the property is worse
-        "10002083592",  # lastest epc doesn't have a fuel system present, but has slightly more insulation. Also the
-        # floor type has changed from solid to syspended. lel has decreased
-        "100030533576",  # property slightly worse, has less lels and the floor description has changed type
-        "100030533668",  # has slightly less lels. Glazed type is now missing
-        "100030533803",  # Not super clea why this is lower, newer epc has more lel but is using second heating
-        "100030534016",  # Property has less lel but more roof insulation. Floor type has changed
-        "100030534040",  # property has less lel and the floor type has changed
-        "100030534041",  # property has less insulation and less lel
-        "100030534243",  # Cavity wall has gone from filled to unfilled
-        "100030534294",  # less roof insulation but now has an air source heat pump
-        "100030534322",  # identical between records but now with higher lel but no change recorded
-        "100030534413",  # identical between records but different energy tariff, no sap change
-        "100030534437",  # property has less lel and the mainheating no longer has a programmer and trvs
-        "100030534569",  # Cavity wall no longer filled, 30mm more roof insulation in newest epc
-        "100030534676",  # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but
-        # the wall cavity is no longer filled
-        "100030534732",  # property has higher lel %. Not clear why this is worse, glazing type has changed.
-        # This looks dodgy has the UPRN_SOURCE is address matched also the floor area has increased from the first to
-        # the later epc
-        "100030534791",  # Property has started using secondary heating - the EPCs are taken on the same day so maybe we
-        # should discard
-        "100030534795",  # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66
-        # The newer epc indicates the property now has 40% photo supply so this doesn't make much sense
-        "100030534897",  # Roof has gone from thatched with additional insulation to pitched with insulation,
-        # sap score hasn't changed
-        "100030534986",  # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and
-        # slightly better main heating setup
-        "100030535043",  # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and
-        # wall height
-        "100030535173",  # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation
-        "100030535244",  # lel gone from 100% to 0%, sap is the same
-    ]
 
     for directory in tqdm(directories):
 
@@ -121,18 +78,6 @@ def app():
                 cols_to_merge_on=COLUMNS_TO_MERGE_ON
             )
 
-            for field in AVERAGE_FIXED_FEATURES:
-
-                vals = list(modified_property_data[field].dropna().unique())
-                if len(vals) > 1:
-                    lowest_value = min(vals)
-                    largest_value = max(vals)
-                    if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR:
-                        # Take the more recent value since it's likely to be more accurate
-                        vals = [vals[-1]]
-
-                fixed_data[field] = np.mean(vals)
-
             # Combine all fields together
             fixed_data.update(mandatory_field_data)
             fixed_data.update(latest_field_data)
@@ -152,46 +97,28 @@ def app():
                 if idx >= modified_property_data.shape[0] - 1:
                     break
 
-                starting_record = variable_data.iloc[idx]
-                ending_record = variable_data.iloc[idx + 1]
-                rdsap_change = (
-                    ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
-                )
-                heat_demand_change = (
-                    ending_record[HEAT_DEMAND_RESPONSE]
-                    - starting_record[HEAT_DEMAND_RESPONSE]
-                )
+                earliest_record = variable_data.iloc[idx]
+                latest_record = variable_data.iloc[idx + 1]
 
-                # Check for a change in the starting and ending record
-                check_cols = [
-                    col for col in starting_record.index if col not in [
-                        "LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF"
-                    ]
-                ]
-                all_same = True
-                for col in check_cols:
-                    if starting_record[col] != ending_record[col]:
-                        all_same = False
-                        break
+                # Check if the sap gets better or worse
+                gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
 
-                if rdsap_change <= 0:
-                    if all_same | (uprn in observed_uprns):
-                        if uprn not in observed_uprns:
-                            dodgy_uprns.append(uprn)
-                    else:
-                        compare = pd.concat([starting_record, ending_record], axis=1)
-                        bljd
+                if gets_better:
+                    rdsap_change = latest_record[RDSAP_RESPONSE] - earliest_record[RDSAP_RESPONSE]
+                    heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - earliest_record[HEAT_DEMAND_RESPONSE]
+                else:
+                    rdsap_change = earliest_record[RDSAP_RESPONSE] - latest_record[RDSAP_RESPONSE]
+                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - latest_record[HEAT_DEMAND_RESPONSE]
 
-                # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
-                #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
-                #       within descriptions
+                if rdsap_change == 0:
+                    continue
 
-                starting_record = starting_record[
-                    COMPONENT_FEATURES + ["LODGEMENT_DATE"]
-                    ].add_suffix("_STARTING")
-                ending_record = ending_record[
-                    COMPONENT_FEATURES + ["LODGEMENT_DATE"]
-                    ].add_suffix("_ENDING")
+                if gets_better:
+                    starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+                    ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+                else:
+                    starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+                    ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
 
                 features = pd.concat([starting_record, ending_record])
 
@@ -205,7 +132,18 @@ def app():
                     }
                 )
 
-            dataset.append(property_model_data)
+            property_model_df = pd.DataFrame(property_model_data)
+            # Add some temporal features - we count the days from a fixed reference point in time to the
+            # starting and ending dates, so that all records are measured from the same fixed point
+            # TODO: implement me
+            property_model_df["DAYS_TO_STARTING"] = None
+            property_model_df["DAYS_TO_ENDING"] = None
+
+            dataset.append(property_model_df)
+
+        # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
+        #       floors, we may want to use the U-value. We may also want to handle the (assumed) tags
+        #       within descriptions
 
         cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
         cleaning_dataset.append(cleaning_averages)
@@ -218,8 +156,12 @@ def app():
         file_key="sap_change_model/cleaning_dataset.parquet",
     )
 
-    output = pd.DataFrame(dataset)
-    output.to_parquet("./dataset.parquet")
+    output = pd.concat(dataset)
+    save_dataframe_to_s3_parquet(
+        df=output,
+        bucket_name="retrofit-data-dev",
+        file_key="sap_change_model/dataset.parquet",
+    )
 
 
 if __name__ == "__main__":