finished data differencing poc

2026-07-27 23:35:01 +00:00 · 2023-09-21 18:48:33 +01:00 · 2023-09-21 18:48:33 +01:00 · 7956a4adc4
commit 7956a4adc4
parent 00328d461b
2 changed files with 110 additions and 83 deletions
--- a/model_data/simulation_system/core/DataProcessor.py
+++ b/model_data/simulation_system/core/DataProcessor.py
@ -7,8 +7,6 @@ from model_data.simulation_system.core.Settings import (
    EARLIEST_EPC_DATE,
    FULLY_GLAZED_DESCRIPTIONS,
    AVERAGE_FIXED_FEATURES,
-    FLOOR_HEIGHT_NATIONAL_AVERAGE,
-    TOTAL_FLOOR_AREA_NATIONAL_AVERAGE,
    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP,
    COLUMNS_TO_MERGE_ON,
@ -17,8 +15,10 @@ from model_data.simulation_system.core.Settings import (
    COLUMNTYPES,
    RDSAP_RESPONSE,
    MAX_SAP_SCORE,
-    fill_na_map
+    fill_na_map,
+    FIXED_DESCRIPTON_MAPPED_FEATURES
 )
+
 from typing import List


@ -502,3 +502,108 @@ class DataProcessor:
        :return: Pandas dataframe containing the columns defined in FIXED_FEATURES
        """
        return self.data[FIXED_FEATURES]
+
+    @staticmethod
+    def difference_data(df):
+        """
+        Replace paired starting/ending features with a single *_DIFF feature (ending minus starting).
+
+        Numeric pairs are subtracted directly. Non-numeric pairs are one-hot encoded with pd.get_dummies and
+        differenced level by level, so each <base>_<level>_DIFF is 1 (level gained), -1 (level lost) or 0.
+        Fixed features, description-mapped features and the response/identifier columns are not differenced.
+
+        :param df: Pandas dataframe holding a starting and an ending version of every feature to difference
+        :return: Pandas dataframe with the paired columns dropped and the *_DIFF columns appended
+        :raises Exception: if a column selected for differencing has no matching *_ENDING column
+        """
+
+        # Columns that must never be differenced; built once as a set so each membership test is O(1)
+        excluded = set(FIXED_FEATURES) | set(FIXED_DESCRIPTON_MAPPED_FEATURES) | {
+            "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
+            "CARBON_STARTING", "UPRN", "CONSTITUENCY",
+        }
+        # A list in dataframe order (rather than a set) keeps the output column order stable between runs
+        columns = [x for x in df.columns if x not in excluded]
+        column_set = set(columns)
+
+        non_numerical_columns = [
+            col for col in df.select_dtypes(exclude=['number']).columns.tolist() if col in column_set
+        ]
+        levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
+
+        df = pd.get_dummies(df, columns=non_numerical_columns)
+
+        # We make sure there is a starting and ending version of the column
+        diff_columns = []
+        no_diff_columns = []  # Store for debugging
+        for col in columns:
+            if "_ENDING" in col:
+                # Don't keep the endings
+                continue
+            # We have a starting column so check if we have an ending
+            if col.replace("_STARTING", "") + "_ENDING" in column_set:
+                diff_columns.append(col)
+            else:
+                no_diff_columns.append(col)
+
+        if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
+            raise Exception("Something went wrong, potentially missed a differencing colunn")
+
+        # Do the differencing
+        cols_to_append = {}
+        cols_to_drop = []  # Dropped in one go at the end instead of copying the dataframe per feature
+        non_numerical = set(non_numerical_columns)
+        for starting_col in diff_columns:
+            base_col = starting_col.replace("_STARTING", "")
+            if "_STARTING" in starting_col:
+                ending_col = starting_col.replace("_STARTING", "_ENDING")
+            else:
+                ending_col = starting_col + "_ENDING"
+
+            if starting_col not in non_numerical:
+                cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
+                cols_to_drop.extend([starting_col, ending_col])
+                continue
+
+            # dict.fromkeys de-duplicates the levels while keeping a deterministic (first-seen) order
+            level_values = list(dict.fromkeys(levels[starting_col] + levels[ending_col]))
+            for level in level_values:
+                starting_level_col = "_".join([starting_col, str(level)])
+                ending_level_col = "_".join([ending_col, str(level)])
+                has_starting = starting_level_col in df.columns
+                has_ending = ending_level_col in df.columns
+                if not has_starting and not has_ending:
+                    # No indicator column on either side (e.g. a NaN level, which get_dummies skips)
+                    continue
+
+                # Dummies are always cast to int before subtracting: numpy rejects bool - bool, and unsigned
+                # dummies (uint8 on older pandas) would wrap around to 255 instead of giving -1
+                diff = 0
+                if has_ending:
+                    diff = df[ending_level_col].astype(int)
+                    cols_to_drop.append(ending_level_col)
+                if has_starting:
+                    # A level seen only in the starting data yields -1 wherever it was set
+                    diff = diff - df[starting_level_col].astype(int)
+                    cols_to_drop.append(starting_level_col)
+                cols_to_append[f"{base_col}_{level}_DIFF"] = diff
+
+        # Drop the columns that have been replaced by their differenced version
+        df = df.drop(columns=cols_to_drop)
+        cols_to_append = pd.DataFrame(cols_to_append)
+        df = pd.concat([df, cols_to_append], axis=1)
+
+        return df
--- a/model_data/simulation_system/generate_rdsap_change.py
+++ b/model_data/simulation_system/generate_rdsap_change.py
@ -19,7 +19,7 @@ from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_
 from recommendations.rdsap_tables import england_wales_age_band_lookup
 from recommendations.recommendation_utils import (
    get_wall_u_value, get_roof_u_value, get_floor_u_value, estimate_perimeter_2_rooms, estimate_perimeter,
-    extract_insulation_thickness, get_wall_type
+    get_wall_type
 )

 DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
@ -539,85 +539,7 @@ def app():
        if pd.isnull(data_by_urpn_df).sum().sum():
            raise ValueError("Null values found in dataset after process_and_prune_desriptions")

-        # TODO: Move to dataprocesser
-        def difference_data(df):
-
-            from model_data.simulation_system.core.Settings import FIXED_FEATURES, FIXED_DESCRIPTON_MAPPED_FEATURES
-
-            columns = {
-                x for x in df.columns if x not in FIXED_FEATURES + FIXED_DESCRIPTON_MAPPED_FEATURES + [
-                    "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
-                    "CARBON_STARTING", "UPRN", "CONSTITUENCY",
-                ]
-            }
-
-            non_numerical_columns = df.select_dtypes(exclude=['number']).columns.tolist()
-            non_numerical_columns = [col for col in non_numerical_columns if col in columns]
-            levels = {col: df[col].unique().tolist() for col in non_numerical_columns}
-
-            df = pd.get_dummies(df, columns=non_numerical_columns)
-
-            # We make sure there is a starting and ending version of the column
-            diff_columns = []
-            no_diff_columns = []  # Store for debugging
-            for col in columns:
-                if "_ENDING" in col:
-                    # Don't keep the endings
-                    continue
-                else:
-                    # We have a starting column so check if we have an ending
-                    if col.replace("_STARTING", "") + "_ENDING" in columns:
-                        diff_columns.append(col)
-                    else:
-                        no_diff_columns.append(col)
-
-            if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
-                raise Exception("Something went wrong, potentially missed a differencing colunn")
-
-            datatypes = df.dtypes
-
-            # Do the differencing
-            cols_to_append = {}
-            for starting_col in diff_columns:
-
-                base_col = starting_col.replace("_STARTING", "")
-
-                if "_STARTING" in starting_col:
-                    ending_col = starting_col.replace("_STARTING", "_ENDING")
-                else:
-                    ending_col = starting_col + "_ENDING"
-
-                if starting_col not in non_numerical_columns:
-                    cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
-                    df = df.drop(columns=[starting_col, ending_col])
-                    continue
-
-                level_values = list(set(levels[starting_col] + levels[ending_col]))
-
-                level_cols = []
-                for level in level_values:
-                    starting_level_col = "_".join([starting_col, str(level)])
-                    ending_level_col = "_".join([ending_col, str(level)])
-
-                    col_type = datatypes[starting_level_col].name
-
-                    if starting_level_col not in df.columns:
-                        df[starting_level_col] = 0
-
-                    if ending_level_col not in df.columns:
-                        df[ending_level_col] = 0
-
-                    if col_type == "bool":
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = (
-                            df[ending_level_col].astype(int) - df[starting_level_col].astype(int)
-                        )
-                    else:
-                        cols_to_append[f"{base_col}_{level}_DIFF"] = df[ending_level_col] - df[starting_level_col]
-
-                    level_cols.extend([starting_level_col, ending_level_col])
-
-                # Drop the columns
-                df = df.drop(columns=level_cols)
+        data_by_urpn_df = DataProcessor.difference_data(data_by_urpn_df)

        dataset.append(data_by_urpn_df)