From ed407bc98b453bedf41a152b567c7e619da96750 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Thu, 22 Feb 2024 20:22:11 +0000 Subject: [PATCH] fix weird cases for now --- etl/epc/Dataset.py | 215 ++++++++++++++++++++------------- etl/epc/Pipeline.py | 19 ++- etl/epc/property_change_app.py | 3 +- 3 files changed, 145 insertions(+), 92 deletions(-) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 5efcae23..3228668e 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -809,6 +809,7 @@ class TrainingDataset(BaseDataset): # else: # return self.__add__(other) + class RecordDataset(BaseDataset): """ A collection of EPCRecrods can be combined into a Dataset. @@ -824,25 +825,25 @@ class RecordDataset(BaseDataset): self._expand_description_to_features(cleaned_lookup) self._adjust_assumed_values_in_wall_descriptions() self._generate_u_values_from_features() - # # TODO: For some of the features that we clean, we have either a true, false or possibly null value - # # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't - # # need to + # # # TODO: For some of the features that we clean, we have either a true, false or possibly null value + # # # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't + # # # need to self._clean_missing_values() self._null_validation(information="Clean Missing Values") - # self._remove_abnormal_change_in_floor_area() + # # self._remove_abnormal_change_in_floor_area() self._ensure_numeric() - def _ensure_numeric(self): """ Ensure that all columns are numeric """ # TODO: move into EPCRecord record - uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col] + uvalue_columns = [ + col for col in self.df.columns if "thermal_transmittance" in col + ] for uvalue_col in uvalue_columns: self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col]) - def _clean_missing_values(self, ignore_cols=None): missings = pd.isnull(self.df).sum() missings = missings[missings > 0] @@ -859,17 +860,22 @@ class RecordDataset(BaseDataset): else: self.df[col] = self.df[col].fillna("Unknown") - @staticmethod def _lambda_function_to_generate_roof_uvalue(row, is_end=False): """ Using the apply method, use the get_roof_u_value method to generate the u-value """ - col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending" + col_name = ( + "roof_insulation_thickness" + if not is_end + else "roof_insulation_thickness_ending" + ) if row["has_dwelling_above"]: - if row["roof_thermal_transmittance"] != 0: + if (row["roof_thermal_transmittance"] != 0) & ( + not pd.isnull(row["roof_thermal_transmittance"]) + ): raise ValueError("Should have 0 u-value for roof") return get_roof_u_value( @@ -881,16 +887,24 @@ class RecordDataset(BaseDataset): is_flat=row["is_flat"], is_pitched=row["is_pitched"], is_at_rafters=row["is_at_rafters"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]] - ) - + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + ) + @staticmethod def _lambda_function_to_generate_wall_uvalue(row, is_end=False): """ Using the apply method, use the get_wall_u_value method to generate the u-value """ - description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending" - thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending" + description_col_name = ( + "walls_clean_description" + if not is_end + else "walls_clean_description_ending" + ) + thermal_transistance_col_name = ( + "walls_thermal_transmittance" + if not is_end + else "walls_thermal_transmittance_ending" + ) if pd.isnull(row[thermal_transistance_col_name]): output = get_wall_u_value( @@ -903,17 +917,23 @@ class RecordDataset(BaseDataset): output = row[thermal_transistance_col_name] return output - + @staticmethod def _lambda_function_to_generate_floor_uvalue(row, is_end=False): """ Using the apply method, use the get_floor_u_value method to generate the u-value """ - floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending" + floor_thermal_col_name = ( + "floor_thermal_transmittance" + if not is_end + else "floor_thermal_transmittance_ending" + ) if row["another_property_below"]: - if row["floor_thermal_transmittance"] != 0: + if (row["floor_thermal_transmittance"] != 0) & ( + not pd.isnull(row["floor_thermal_transmittance"]) + ): raise ValueError("Should have 0 u-value for floor") return 0 @@ -922,19 +942,27 @@ class RecordDataset(BaseDataset): if pd.isnull(uvalue): - insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending" - floor_area_col_name = "estimated_perimeter" if not is_end else "estimated_perimeter_ending" - perimeter_col_name = "total_floor_area" if not is_end else "total_floor_area_ending" + insulation_col_name = ( + "floor_insulation_thickness" + if not is_end + else "floor_insulation_thickness_ending" + ) + floor_area_col_name = ( + "estimated_perimeter" if not is_end else "estimated_perimeter_ending" + ) + perimeter_col_name = ( + "total_floor_area" if not is_end else "total_floor_area_ending" + ) uvalue = get_floor_u_value( - floor_type=row["floor_type"], - perimeter=row[floor_area_col_name], - area=row[perimeter_col_name], - insulation_thickness=row[insulation_col_name], - wall_type=row["wall_type"], - age_band=england_wales_age_band_lookup[row["construction_age_band"]] - ) - + floor_type=row["floor_type"], + perimeter=row[floor_area_col_name], + area=row[perimeter_col_name], + insulation_thickness=row[insulation_col_name], + wall_type=row["wall_type"], + age_band=england_wales_age_band_lookup[row["construction_age_band"]], + ) + return uvalue def _generate_u_values_from_features(self): @@ -947,58 +975,63 @@ class RecordDataset(BaseDataset): # ~~~~~~~~~~~~~~~~~~ walls_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_wall_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1 ) - walls_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_uvalue) - + walls_uvalue = self.df["walls_thermal_transmittance"].fillna(walls_uvalue) + # ~~~~~~~~~~~~~~~~~~ # Roof # ~~~~~~~~~~~~~~~~~~ - + roof_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_roof_uvalue(row), - axis=1 + lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1 ) - roof_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_uvalue) + roof_uvalue = self.df["roof_thermal_transmittance"].fillna(roof_uvalue) # ~~~~~~~~~~~~~~~~~~ # Floor # ~~~~~~~~~~~~~~~~~~ - - self.df['estimated_perimeter'] = self.df.apply( - lambda row: estimate_perimeter(row["total_floor_area"], row["number_habitable_rooms"]), - axis=1 + + self.df["estimated_perimeter"] = self.df.apply( + lambda row: estimate_perimeter( + row["total_floor_area"], row["number_habitable_rooms"] + ), + axis=1, ) - self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"}) + self.df["floor_type"] = self.df["is_suspended"].replace( + {True: "suspended", False: "solid"} + ) self.df["wall_type"] = self.df.apply( lambda row: get_wall_type( - is_cavity_wall=row["is_cavity_wall"], - is_solid_brick=row["is_solid_brick"], - is_timber_frame=row["is_timber_frame"], - is_granite_or_whinstone=row["is_granite_or_whinstone"], - is_cob=row["is_cob"], + is_cavity_wall=row["is_cavity_wall"], + is_solid_brick=row["is_solid_brick"], + is_timber_frame=row["is_timber_frame"], + is_granite_or_whinstone=row["is_granite_or_whinstone"], + is_cob=row["is_cob"], is_sandstone_or_limestone=row["is_sandstone_or_limestone"], is_system_built=row["is_system_built"], - is_park_home=row["is_park_home"] - ), - axis=1 - ) - - floor_uvalue = self.df.apply( - lambda row: self._lambda_function_to_generate_floor_uvalue(row), - axis=1 + is_park_home=row["is_park_home"], + ), + axis=1, ) - floor_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_uvalue) + floor_uvalue = self.df.apply( + lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1 + ) + + floor_uvalue = self.df["floor_thermal_transmittance"].fillna(floor_uvalue) for component in ["walls", "roof", "floor"]: - self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_uvalue")) + self.df[f"{component}_thermal_transmittance"] = self.df[ + f"{component}_thermal_transmittance" + ].fillna(eval(f"{component}_uvalue")) - self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description"]) + self.df = self.df.drop( + columns=["floor_type", "wall_type", "walls_clean_description"] + ) def _adjust_assumed_values_in_wall_descriptions(self): """ @@ -1007,7 +1040,6 @@ class RecordDataset(BaseDataset): for col in ["walls_clean_description"]: self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip() - def _clean_efficiency_variables(self): """ These is scope to clean this by the model per corresponding description. @@ -1023,7 +1055,7 @@ class RecordDataset(BaseDataset): missings = missings[missings >= 1] if len(missings) == 0: - return + return # Make sure they are all efficiency columns if any(~missings.index.str.contains("energy_eff")): @@ -1033,13 +1065,11 @@ class RecordDataset(BaseDataset): column_index = self.df[m].isna() self.df.loc[column_index, m] = "NO_RATING" - def _null_validation(self, information: str): print(f"Null validation after {information}") if pd.isnull(self.df).sum().sum(): raise ValueError(f"Null values found in dataset, after step {information}") - def _expand_description_to_features(self, cleaned_lookup: dict): """ This method will merge on the cleaned lookup table and ensure that the building fabric in the @@ -1050,49 +1080,63 @@ class RecordDataset(BaseDataset): # remove this record, as it indicates that the quality of the EPC conducted in the first instance # is low # We also replace descriptions with their cleaned variants - """ + """ cols_to_drop = { "walls": [ # We need to cleaned descriptions for pulling out u-values - 'original_description', 'thermal_transmittance_unit', + "original_description", + "thermal_transmittance_unit", # Re remove the is_assumed columns - "is_assumed" + "is_assumed", ], "floor": [ - "original_description", "clean_description", "thermal_transmittance_unit", - "no_data", - "is_assumed" + "original_description", + "clean_description", + "thermal_transmittance_unit", + "no_data", + "is_assumed", ], "roof": [ - "original_description", "clean_description", "thermal_transmittance_unit", - "is_assumed", "is_valid" + "original_description", + "clean_description", + "thermal_transmittance_unit", + "is_assumed", + "is_valid", ], "hotwater": [ - "original_description", "clean_description", "assumed", + "original_description", + "clean_description", + "assumed", ], "mainheat": [ - "original_description", "clean_description", + "original_description", + "clean_description", "has_assumed", ], "mainheatcont": [ - "original_description", "clean_description", + "original_description", + "clean_description", ], "windows": [ - "original_description", "clean_description", + "original_description", + "clean_description", # We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature - "has_glazing", "glazing_coverage", "no_data", + "has_glazing", + "glazing_coverage", + "no_data", ], "main-fuel": [ - "original_description", "clean_description", + "original_description", + "clean_description", ], } components_to_expand = cols_to_drop.keys() - + for component in components_to_expand: - - # TODO: change cleaned dataframe to have underscores instead of dashes + + # TODO: change cleaned dataframe to have underscores instead of dashes if component == "main-fuel": cleaned_key = "main-fuel" left_on_key = "main_fuel" @@ -1108,11 +1152,13 @@ class RecordDataset(BaseDataset): cleaned_lookup_df_for_key, how="left", left_on=left_on_key, - right_on="original_description" + right_on="original_description", ) # Drop original cols and cols to drop - expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols) + expanded_df = expanded_df.drop( + columns=cols_to_drop[component] + original_cols + ) # Rename columns to component specific names, if they have not been dropped expanded_df = expanded_df.rename( @@ -1124,17 +1170,16 @@ class RecordDataset(BaseDataset): } ) self.df = expanded_df - + # We don't need any lighting specific cleaning, we just drop the original description as we use # LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING self.df = self.df.drop(columns=["lighting_description"]) - # def __add__(self, other) -> "NewDataset": # if not isinstance(other, NewDataset): # raise TypeError("Addition can only be performed with another instance of ScoringDataset") # return NewDataset(self.datasets + other.datasets) - + # def __radd__(self, other): # """ # Required for sum() to work @@ -1142,4 +1187,4 @@ class RecordDataset(BaseDataset): # if isinstance(other, int): # return self # else: - # return self.__add__(other) \ No newline at end of file + # return self.__add__(other) diff --git a/etl/epc/Pipeline.py b/etl/epc/Pipeline.py index f0be3c2f..f8be16b4 100644 --- a/etl/epc/Pipeline.py +++ b/etl/epc/Pipeline.py @@ -87,9 +87,9 @@ class EPCPipeline: run_mode="training", epc_local_file="certificates.csv", epc_bucket_name="retrofit-data-dev", - epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet", - epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet", - epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet", + epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_record.parquet", + epc_all_equal_rows_key="sap_change_model/all_equal_rows_record.parquet", + epc_compiled_dataset_key="sap_change_model/dataset_record.parquet", ): """ :param directories: List of directories to process @@ -127,7 +127,6 @@ class EPCPipeline: self.run_record_dataset_pipeline() else: raise ValueError("Run mode defined needs to be in 'training' or 'newdata'") - def run_record_dataset_pipeline(self): """ @@ -150,9 +149,17 @@ class EPCPipeline: ) # TODO: integrate with EPCRecord - record_dataset = constituency_data[['uprn'] + VARIABLE_DATA_FEATURES + MANDATORY_FIXED_FEATURES + LATEST_FIELD] + record_dataset = constituency_data[ + ["uprn"] + + [RDSAP_RESPONSE] + + VARIABLE_DATA_FEATURES + + MANDATORY_FIXED_FEATURES + + LATEST_FIELD + ].rename(columns={RDSAP_RESPONSE: "sap"}) - constituency_dataset = RecordDataset(datasets=record_dataset, cleaned_lookup=clean_lookup) + constituency_dataset = RecordDataset( + datasets=record_dataset, cleaned_lookup=clean_lookup + ) self.compiled_dataset = pd.concat( [self.compiled_dataset, constituency_dataset.df] diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index c8923d6d..8c97bff4 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -12,10 +12,11 @@ def main(): """ directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] - # directories = directories[0:3] + # directories = directories[202:203] epc_pipeline = EPCPipeline( directories=directories, + run_mode="record", epc_data_processor=EPCDataProcessor(run_mode="training"), )