Added potential variables to dataset

2026-07-27 23:35:01 +00:00 · 2023-10-17 15:38:17 +11:00 · 2023-10-17 15:38:17 +11:00 · 40a6d2041e
commit 40a6d2041e
parent 2ead4906be
5 changed files with 108 additions and 14 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@ -569,3 +569,20 @@ class DataProcessor:
                df[col] = df[col].fillna("Unknown")

        return df
+
+    @staticmethod
+    def clean_efficiency_variables(df):
+        """
+        Replace missing efficiency ratings with the "NO_RATING" placeholder
+        Only columns whose name contains "ENERGY_EFF" are allowed to hold missing values. The fill
+        is written back onto the dataframe that was passed in
+        :param df: dataframe to clean
+        :return: the same dataframe, with missing efficiency values set to "NO_RATING"
+        :raises ValueError: if a column without "ENERGY_EFF" in its name has missing values
+        """
+        null_counts = df.isnull().sum()
+        incomplete_columns = null_counts[null_counts > 0].index
+
+        # Nothing is missing, so there is nothing to fill
+        if incomplete_columns.empty:
+            return df
+
+        # Gaps are only tolerated in the efficiency columns
+        is_efficiency_column = incomplete_columns.str.contains("ENERGY_EFF")
+        if (~is_efficiency_column).any():
+            raise ValueError("Non efficiency columns are missing")
+
+        for column in incomplete_columns:
+            df[column] = df[column].fillna("NO_RATING")
+
+        return df
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@ -12,6 +12,10 @@ from etl.epc.settings import (
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    CARBON_RESPONSE,
+    CORE_COMPONENT_FEATURES,
+    EFFICIENCY_FEATURES,
+    POTENTIAL_COLUMNS,
+    MINIMUM_FLOOR_HEIGHT
 )
 from etl.epc.DataProcessor import DataProcessor
 from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
@ -363,6 +367,25 @@ def make_uvalues(df):
    return df


+def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list) -> bool:
+    """
+    For a list of columns, check if the earliest and latest record are the same
+    If they are the same, we indicate this, because we have examples of SAP scores changing
+    without any feature changes
+    A value that is missing (NaN/None) in both records counts as unchanged; a value that is
+    missing in only one of them counts as a change
+    :param earliest_record: pd.Series
+    :param latest_record: pd.Series
+    :param columns: list of columns to compare
+    :return: True if every listed column is unchanged (also when columns is empty), else False
+    """
+
+    for col in columns:
+        earliest_value = earliest_record[col]
+        latest_value = latest_record[col]
+
+        # NaN != NaN evaluates to True, so missing values are compared explicitly first
+        earliest_missing = pd.isna(earliest_value)
+        latest_missing = pd.isna(latest_value)
+        if earliest_missing or latest_missing:
+            if earliest_missing and latest_missing:
+                continue
+            return False
+
+        if earliest_value != latest_value:
+            return False
+
+    return True
+
+
 def app():
    # Get all the files in the directory

@ -376,6 +399,8 @@ def app():

    dataset = []
    cleaning_dataset = []
+    # Keep track of the number of all equals
+    all_equal_count = 0

    for directory in tqdm(directories):

@ -422,7 +447,9 @@ def app():
            # We include the lodgement date here as we probably need to factor time into the
            # model, since EPC standards and rigour have changed over time
            variable_data = property_data[
-                COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
+                COMPONENT_FEATURES + EFFICIENCY_FEATURES + POTENTIAL_COLUMNS + [
+                    "LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE
+                ]
                ]

            # Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -439,6 +466,8 @@ def app():
                # Check if the sap gets better or worse
                gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]

+                component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
+
                if gets_better:
                    starting_sap = earliest_record[RDSAP_RESPONSE]
                    starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
@ -452,8 +481,8 @@ def app():
                    heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
                    carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon

-                    starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
-                    ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+                    starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+                    ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
                else:
                    starting_sap = latest_record[RDSAP_RESPONSE]
                    starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
@ -467,12 +496,23 @@ def app():
                    heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
                    carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon

-                    starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
-                    ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
+                    starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
+                    ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")

                if rdsap_change == 0:
                    continue

+                all_equal = compare_records(
+                    earliest_record=earliest_record,
+                    latest_record=latest_record,
+                    columns=CORE_COMPONENT_FEATURES
+                )
+
+                if all_equal:
+                    # Keep track of this for the moment
+                    all_equal_count += 1
+                    continue
+
                features = pd.concat([starting_record, ending_record])

                property_model_data.append(
@ -487,6 +527,10 @@ def app():
                        "HEAT_DEMAND_ENDING": ending_heat_demand,
                        "CARBON_STARTING": starting_carbon,
                        "CARBON_ENDING": ending_carbon,
+                        "POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
+                        "ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
+                        "ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
+                        "CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
                        **fixed_data,
                        **features.to_dict(),
                    }
@ -496,8 +540,6 @@ def app():

        data_by_urpn_df = pd.DataFrame(data_by_urpn)

-        # Add some temporal features - we look at the days from the standard starting point in time
-        # for the starting and ending date so all records are from a fixed point
        data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
            data_by_urpn_df["LODGEMENT_DATE_STARTING"]
        )
@ -508,6 +550,8 @@ def app():

        data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

+        data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
+
        # We look for key building fabric features that have changed from one EPC to the next.
        # if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
        # remove this record, as it indicates that the quality of the EPC conducted in the first instance
@ -541,6 +585,8 @@ def app():
        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
        cleaning_dataset.append(cleaning_averages)

+    print("Final all equal count: %s" % str(all_equal_count))
+
    # Store cleaning dataset in s3 as a parquet file
    cleaning_dataset = pd.concat(cleaning_dataset)
    save_dataframe_to_s3_parquet(
--- a/etl/epc/settings.py
+++ b/etl/epc/settings.py
@ -85,8 +85,7 @@ FIXED_FEATURES = [
    "FIXED_LIGHTING_OUTLETS_COUNT",
 ]

-COMPONENT_FEATURES = [
-    "TRANSACTION_TYPE",
+CORE_COMPONENT_FEATURES = [
    "WALLS_DESCRIPTION",
    "FLOOR_DESCRIPTION",
    "LIGHTING_DESCRIPTION",
@ -96,21 +95,49 @@ COMPONENT_FEATURES = [
    "MAIN_FUEL",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
-    "ENERGY_TARIFF",  # Not sure if this is relevant
-    "SOLAR_WATER_HEATING_FLAG",
-    "PHOTO_SUPPLY",
    "WINDOWS_DESCRIPTION",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "MAINHEATCONT_DESCRIPTION",
+    "SOLAR_WATER_HEATING_FLAG",
+    "PHOTO_SUPPLY",
+]
+
+# Efficiency columns; every name in this list ends in ENERGY_EFF
+EFFICIENCY_FEATURES = [
+    "HOT_WATER_ENERGY_EFF",
+    "FLOOR_ENERGY_EFF",
+    "WINDOWS_ENERGY_EFF",
+    "WALLS_ENERGY_EFF",
+    "SHEATING_ENERGY_EFF",
+    "ROOF_ENERGY_EFF",
+    "MAINHEAT_ENERGY_EFF",
+    "MAINHEATC_ENERGY_EFF",
+    "LIGHTING_ENERGY_EFF",
+]
+
+# Full component column list: everything in CORE_COMPONENT_FEATURES followed by the extra
+# columns listed here
+COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
+    "TRANSACTION_TYPE",
+    "ENERGY_TARIFF",  # Not sure if this is relevant
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
 ]

+# Names of the "potential" columns
+POTENTIAL_COLUMNS = [
+    "POTENTIAL_ENERGY_RATING",
+    "POTENTIAL_ENERGY_EFFICIENCY",
+    "ENVIRONMENT_IMPACT_POTENTIAL",
+    "ENERGY_CONSUMPTION_POTENTIAL",
+    "CO2_EMISSIONS_POTENTIAL",
+    # Cost features are left out for the moment:
+    # "LIGHTING_COST_POTENTIAL",
+    # "HEATING_COST_POTENTIAL",
+    # "HOT_WATER_COST_POTENTIAL",
+]
+
 # For these fields, we take the latest value if we have multiple values
 # Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
 # the most accurate
@ -253,3 +280,7 @@ ENDING_SUFFIX_COMPONENT_COLS = [
    'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
    'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
 ]
+
+# Lower bound, in metres, for a plausible floor height.
+# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore
+# filter out any homes with a floor height below this
+MINIMUM_FLOOR_HEIGHT = 1.65