From ed407bc98b453bedf41a152b567c7e619da96750 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Thu, 22 Feb 2024 20:22:11 +0000
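Subject: [PATCH] Tolerate null roof/floor u-values; add SAP to record dataset

Temporary workaround for edge cases hit while building the record dataset:

- Dataset.py: only raise "Should have 0 u-value" for a roof with a dwelling
  above / a floor with a property below when the recorded thermal
  transmittance is both non-null and non-zero. The rest of the Dataset.py
  changes are formatting only.
- Pipeline.py: include RDSAP_RESPONSE (renamed to "sap") in the record
  dataset, and point the default parquet keys at the *_record files instead
  of *_rooms.
- property_change_app.py: run the pipeline with run_mode="record".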

---
 etl/epc/Dataset.py             | 215 ++++++++++++++++++++-------------
 etl/epc/Pipeline.py            |  19 ++-
 etl/epc/property_change_app.py |   3 +-
 3 files changed, 145 insertions(+), 92 deletions(-)

diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py
index 5efcae23..3228668e 100644
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@@ -809,6 +809,7 @@ class TrainingDataset(BaseDataset):
     #     else:
     #         return self.__add__(other)
 
+
 class RecordDataset(BaseDataset):
     """
     A collection of EPCRecrods can be combined into a Dataset.
@@ -824,25 +825,25 @@ class RecordDataset(BaseDataset):
         self._expand_description_to_features(cleaned_lookup)
         self._adjust_assumed_values_in_wall_descriptions()
         self._generate_u_values_from_features()
-        # # TODO: For some of the features that we clean, we have either a true, false or possibly null value
-        # #       Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
-        # #       need to
+        # TODO: For some of the features that we clean, we have either a true, false or possibly null value
+        #       Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
+        #       need to
         self._clean_missing_values()
         self._null_validation(information="Clean Missing Values")
-        # self._remove_abnormal_change_in_floor_area()
+        # # self._remove_abnormal_change_in_floor_area()
         self._ensure_numeric()
 
-
     def _ensure_numeric(self):
         """
         Ensure that all columns are numeric
         """
         # TODO: move into EPCRecord record
-        uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col]
+        uvalue_columns = [
+            col for col in self.df.columns if "thermal_transmittance" in col
+        ]
         for uvalue_col in uvalue_columns:
             self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col])
 
-    
     def _clean_missing_values(self, ignore_cols=None):
         missings = pd.isnull(self.df).sum()
         missings = missings[missings > 0]
@@ -859,17 +860,22 @@ class RecordDataset(BaseDataset):
             else:
                 self.df[col] = self.df[col].fillna("Unknown")
 
-    
     @staticmethod
     def _lambda_function_to_generate_roof_uvalue(row, is_end=False):
         """
         Using the apply method, use the get_roof_u_value method to generate the u-value
         """
 
-        col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending"
+        col_name = (
+            "roof_insulation_thickness"
+            if not is_end
+            else "roof_insulation_thickness_ending"
+        )
 
         if row["has_dwelling_above"]:
-            if row["roof_thermal_transmittance"] != 0:
+            if (row["roof_thermal_transmittance"] != 0) & (
+                not pd.isnull(row["roof_thermal_transmittance"])
+            ):
                 raise ValueError("Should have 0 u-value for roof")
 
         return get_roof_u_value(
@@ -881,16 +887,24 @@ class RecordDataset(BaseDataset):
             is_flat=row["is_flat"],
             is_pitched=row["is_pitched"],
             is_at_rafters=row["is_at_rafters"],
-            age_band=england_wales_age_band_lookup[row["construction_age_band"]]
-        )   
-    
+            age_band=england_wales_age_band_lookup[row["construction_age_band"]],
+        )
+
     @staticmethod
     def _lambda_function_to_generate_wall_uvalue(row, is_end=False):
         """
         Using the apply method, use the get_wall_u_value method to generate the u-value
         """
-        description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending"
-        thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending"
+        description_col_name = (
+            "walls_clean_description"
+            if not is_end
+            else "walls_clean_description_ending"
+        )
+        thermal_transistance_col_name = (
+            "walls_thermal_transmittance"
+            if not is_end
+            else "walls_thermal_transmittance_ending"
+        )
 
         if pd.isnull(row[thermal_transistance_col_name]):
             output = get_wall_u_value(
@@ -903,17 +917,23 @@ class RecordDataset(BaseDataset):
             output = row[thermal_transistance_col_name]
 
         return output
-    
+
     @staticmethod
     def _lambda_function_to_generate_floor_uvalue(row, is_end=False):
         """
         Using the apply method, use the get_floor_u_value method to generate the u-value
         """
 
-        floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending"
+        floor_thermal_col_name = (
+            "floor_thermal_transmittance"
+            if not is_end
+            else "floor_thermal_transmittance_ending"
+        )
 
         if row["another_property_below"]:
-            if row["floor_thermal_transmittance"] != 0:
+            if (row["floor_thermal_transmittance"] != 0) & (
+                not pd.isnull(row["floor_thermal_transmittance"])
+            ):
                 raise ValueError("Should have 0 u-value for floor")
 
             return 0
@@ -922,19 +942,27 @@ class RecordDataset(BaseDataset):
 
         if pd.isnull(uvalue):
 
-            insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending"
-            floor_area_col_name = "estimated_perimeter" if not is_end else "estimated_perimeter_ending"
-            perimeter_col_name = "total_floor_area" if not is_end else "total_floor_area_ending"
+            insulation_col_name = (
+                "floor_insulation_thickness"
+                if not is_end
+                else "floor_insulation_thickness_ending"
+            )
+            floor_area_col_name = (
+                "estimated_perimeter" if not is_end else "estimated_perimeter_ending"
+            )
+            perimeter_col_name = (
+                "total_floor_area" if not is_end else "total_floor_area_ending"
+            )
 
             uvalue = get_floor_u_value(
-                    floor_type=row["floor_type"],
-                    perimeter=row[floor_area_col_name],
-                    area=row[perimeter_col_name],
-                    insulation_thickness=row[insulation_col_name],
-                    wall_type=row["wall_type"],
-                    age_band=england_wales_age_band_lookup[row["construction_age_band"]]
-                )
-        
+                floor_type=row["floor_type"],
+                perimeter=row[floor_area_col_name],
+                area=row[perimeter_col_name],
+                insulation_thickness=row[insulation_col_name],
+                wall_type=row["wall_type"],
+                age_band=england_wales_age_band_lookup[row["construction_age_band"]],
+            )
+
         return uvalue
 
     def _generate_u_values_from_features(self):
@@ -947,58 +975,63 @@ class RecordDataset(BaseDataset):
         # ~~~~~~~~~~~~~~~~~~
 
         walls_uvalue = self.df.apply(
-            lambda row: self._lambda_function_to_generate_wall_uvalue(row),
-            axis=1
+            lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1
         )
 
-        walls_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_uvalue)
-     
+        walls_uvalue = self.df["walls_thermal_transmittance"].fillna(walls_uvalue)
+
         # ~~~~~~~~~~~~~~~~~~
         # Roof
         # ~~~~~~~~~~~~~~~~~~
-            
+
         roof_uvalue = self.df.apply(
-            lambda row: self._lambda_function_to_generate_roof_uvalue(row),
-            axis=1
+            lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1
         )
 
-        roof_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_uvalue)
+        roof_uvalue = self.df["roof_thermal_transmittance"].fillna(roof_uvalue)
 
         # ~~~~~~~~~~~~~~~~~~
         # Floor
         # ~~~~~~~~~~~~~~~~~~
-        
-        self.df['estimated_perimeter'] = self.df.apply(
-            lambda row: estimate_perimeter(row["total_floor_area"], row["number_habitable_rooms"]),
-            axis=1
+
+        self.df["estimated_perimeter"] = self.df.apply(
+            lambda row: estimate_perimeter(
+                row["total_floor_area"], row["number_habitable_rooms"]
+            ),
+            axis=1,
         )
 
-        self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"})
+        self.df["floor_type"] = self.df["is_suspended"].replace(
+            {True: "suspended", False: "solid"}
+        )
         self.df["wall_type"] = self.df.apply(
             lambda row: get_wall_type(
-                is_cavity_wall=row["is_cavity_wall"], 
-                is_solid_brick=row["is_solid_brick"], 
-                is_timber_frame=row["is_timber_frame"], 
-                is_granite_or_whinstone=row["is_granite_or_whinstone"], 
-                is_cob=row["is_cob"], 
+                is_cavity_wall=row["is_cavity_wall"],
+                is_solid_brick=row["is_solid_brick"],
+                is_timber_frame=row["is_timber_frame"],
+                is_granite_or_whinstone=row["is_granite_or_whinstone"],
+                is_cob=row["is_cob"],
                 is_sandstone_or_limestone=row["is_sandstone_or_limestone"],
                 is_system_built=row["is_system_built"],
-                is_park_home=row["is_park_home"]
-                ),
-            axis=1
-        )
-        
-        floor_uvalue = self.df.apply(
-            lambda row: self._lambda_function_to_generate_floor_uvalue(row),
-            axis=1
+                is_park_home=row["is_park_home"],
+            ),
+            axis=1,
         )
 
-        floor_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_uvalue)
+        floor_uvalue = self.df.apply(
+            lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1
+        )
+
+        floor_uvalue = self.df["floor_thermal_transmittance"].fillna(floor_uvalue)
 
         for component in ["walls", "roof", "floor"]:
-            self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_uvalue"))
+            self.df[f"{component}_thermal_transmittance"] = self.df[
+                f"{component}_thermal_transmittance"
+            ].fillna(eval(f"{component}_uvalue"))
 
-        self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description"])
+        self.df = self.df.drop(
+            columns=["floor_type", "wall_type", "walls_clean_description"]
+        )
 
     def _adjust_assumed_values_in_wall_descriptions(self):
         """
@@ -1007,7 +1040,6 @@ class RecordDataset(BaseDataset):
         for col in ["walls_clean_description"]:
             self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip()
 
-
     def _clean_efficiency_variables(self):
         """
         These is scope to clean this by the model per corresponding description.
@@ -1023,7 +1055,7 @@ class RecordDataset(BaseDataset):
         missings = missings[missings >= 1]
 
         if len(missings) == 0:
-            return 
+            return
 
         # Make sure they are all efficiency columns
         if any(~missings.index.str.contains("energy_eff")):
@@ -1033,13 +1065,11 @@ class RecordDataset(BaseDataset):
             column_index = self.df[m].isna()
             self.df.loc[column_index, m] = "NO_RATING"
 
-
     def _null_validation(self, information: str):
         print(f"Null validation after {information}")
         if pd.isnull(self.df).sum().sum():
             raise ValueError(f"Null values found in dataset, after step {information}")
 
-    
     def _expand_description_to_features(self, cleaned_lookup: dict):
         """
         This method will merge on the cleaned lookup table and ensure that the building fabric in the
@@ -1050,49 +1080,63 @@ class RecordDataset(BaseDataset):
         # remove this record, as it indicates that the quality of the EPC conducted in the first instance
         # is low
         # We also replace descriptions with their cleaned variants
-        """ 
+        """
 
         cols_to_drop = {
             "walls": [
                 # We need to cleaned descriptions for pulling out u-values
-                'original_description', 'thermal_transmittance_unit',
+                "original_description",
+                "thermal_transmittance_unit",
                 # Re remove the is_assumed columns
-                "is_assumed"
+                "is_assumed",
             ],
             "floor": [
-                "original_description", "clean_description", "thermal_transmittance_unit",
-                "no_data", 
-                "is_assumed"
+                "original_description",
+                "clean_description",
+                "thermal_transmittance_unit",
+                "no_data",
+                "is_assumed",
             ],
             "roof": [
-                "original_description", "clean_description", "thermal_transmittance_unit",
-                "is_assumed", "is_valid"
+                "original_description",
+                "clean_description",
+                "thermal_transmittance_unit",
+                "is_assumed",
+                "is_valid",
             ],
             "hotwater": [
-                "original_description", "clean_description", "assumed",
+                "original_description",
+                "clean_description",
+                "assumed",
             ],
             "mainheat": [
-                "original_description", "clean_description",
+                "original_description",
+                "clean_description",
                 "has_assumed",
             ],
             "mainheatcont": [
-                "original_description", "clean_description",
+                "original_description",
+                "clean_description",
             ],
             "windows": [
-                "original_description", "clean_description",
+                "original_description",
+                "clean_description",
                 # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature
-                "has_glazing", "glazing_coverage", "no_data", 
+                "has_glazing",
+                "glazing_coverage",
+                "no_data",
             ],
             "main-fuel": [
-                "original_description", "clean_description",
+                "original_description",
+                "clean_description",
             ],
         }
 
         components_to_expand = cols_to_drop.keys()
-        
+
         for component in components_to_expand:
-            
-            # TODO: change cleaned dataframe to have underscores instead of dashes     
+
+            # TODO: change cleaned dataframe to have underscores instead of dashes
             if component == "main-fuel":
                 cleaned_key = "main-fuel"
                 left_on_key = "main_fuel"
@@ -1108,11 +1152,13 @@ class RecordDataset(BaseDataset):
                 cleaned_lookup_df_for_key,
                 how="left",
                 left_on=left_on_key,
-                right_on="original_description"
+                right_on="original_description",
             )
 
             # Drop original cols and cols to drop
-            expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols)
+            expanded_df = expanded_df.drop(
+                columns=cols_to_drop[component] + original_cols
+            )
 
             # Rename columns to component specific names, if they have not been dropped
             expanded_df = expanded_df.rename(
@@ -1124,17 +1170,16 @@ class RecordDataset(BaseDataset):
                 }
             )
             self.df = expanded_df
-            
+
         # We don't need any lighting specific cleaning, we just drop the original description as we use
         # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING
         self.df = self.df.drop(columns=["lighting_description"])
-    
 
     # def __add__(self, other) -> "NewDataset":
     #     if not isinstance(other, NewDataset):
     #         raise TypeError("Addition can only be performed with another instance of ScoringDataset")
     #     return NewDataset(self.datasets + other.datasets)
-        
+
     # def __radd__(self, other):
     #     """
     #     Required for sum() to work
@@ -1142,4 +1187,4 @@ class RecordDataset(BaseDataset):
     #     if isinstance(other, int):
     #         return self
     #     else:
-    #         return self.__add__(other)
\ No newline at end of file
+    #         return self.__add__(other)
diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py
index f0be3c2f..f8be16b4 100644
--- a/etl/epc/Pipeline.py
+++ b/etl/epc/Pipeline.py
@@ -87,9 +87,9 @@ class EPCPipeline:
         run_mode="training",
         epc_local_file="certificates.csv",
         epc_bucket_name="retrofit-data-dev",
-        epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet",
-        epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet",
-        epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet",
+        epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_record.parquet",
+        epc_all_equal_rows_key="sap_change_model/all_equal_rows_record.parquet",
+        epc_compiled_dataset_key="sap_change_model/dataset_record.parquet",
     ):
         """
         :param directories: List of directories to process
@@ -127,7 +127,6 @@ class EPCPipeline:
             self.run_record_dataset_pipeline()
         else:
             raise ValueError("Run mode defined needs to be in 'training' or 'newdata'")
-        
 
     def run_record_dataset_pipeline(self):
         """
@@ -150,9 +149,17 @@ class EPCPipeline:
             )
 
             # TODO: integrate with EPCRecord
-            record_dataset = constituency_data[['uprn'] + VARIABLE_DATA_FEATURES + MANDATORY_FIXED_FEATURES + LATEST_FIELD]
+            record_dataset = constituency_data[
+                ["uprn"]
+                + [RDSAP_RESPONSE]
+                + VARIABLE_DATA_FEATURES
+                + MANDATORY_FIXED_FEATURES
+                + LATEST_FIELD
+            ].rename(columns={RDSAP_RESPONSE: "sap"})
 
-            constituency_dataset = RecordDataset(datasets=record_dataset, cleaned_lookup=clean_lookup)
+            constituency_dataset = RecordDataset(
+                datasets=record_dataset, cleaned_lookup=clean_lookup
+            )
 
             self.compiled_dataset = pd.concat(
                 [self.compiled_dataset, constituency_dataset.df]
diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py
index c8923d6d..8c97bff4 100644
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@@ -12,10 +12,11 @@ def main():
     """
 
     directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
-    # directories = directories[0:3]
+    # directories = directories[202:203]
 
     epc_pipeline = EPCPipeline(
         directories=directories,
+        run_mode="record",
         epc_data_processor=EPCDataProcessor(run_mode="training"),
     )