NOTE(review): this patch arrived with its line breaks collapsed. The layout
below, including the position of blank context lines, was rebuilt from the
hunk counts, and a few added lines were amended in review, so the index hash
is stale. Verify against etl/epc/DataProcessor.py before applying.
diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py
index 4ad854c1..92d5511b 100644
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@@ -100,8 +100,8 @@ class EPCDataProcessor:
 
         # FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
         self.violation_mode = violation_mode
-        if run_mode not in ["training", "newdata"]:
-            raise ValueError("Run mode must be either training or newdata")
+        if run_mode not in ["training", "newdata", "kwhdata"]:
+            raise ValueError("Run mode must be either training, newdata or kwhdata")
         self.run_mode = run_mode if not violation_mode else "newdata"
 
     def prepare_data(self, filepath: Path | str | None = None) -> None:
@@ -110,7 +110,11 @@ class EPCDataProcessor:
 
         Ignore step is used to highlight which steps are not needed in newdata
         """
-        ignore_step = True if self.run_mode == "newdata" else False
+        ignore_step = self.run_mode == "newdata"
+        if self.run_mode == "kwhdata":
+            # NOTE(review): this runs before load_data() below; confirm self.data
+            # is already populated at this point when a filepath is supplied.
+            self.rename_kwhdata_columns()
 
         if filepath is not None:
             self.load_data(
@@ -126,18 +130,21 @@ class EPCDataProcessor:
         self.remap_build_form()
         self.cast_data_column_values_to_lower()
         self.standardise_construction_age_band(ignore_step=ignore_step)
-        self.clean_missing_rooms(ignore_step=ignore_step)
+        if self.run_mode != "kwhdata":
+            self.clean_missing_rooms(ignore_step=ignore_step)
         self.recast_df_columns(
             column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
         )
         self.clean_multi_glaze_proportion(ignore_step=ignore_step)
         self.clean_photo_supply()
-        self.retain_multiple_epc_properties(
-            epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
-            ignore_step=ignore_step,
-        )
+        if self.run_mode != "kwhdata":
+            self.retain_multiple_epc_properties(
+                epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
+                ignore_step=ignore_step,
+            )
 
-        self.fill_na_fields()
+        if self.run_mode != "kwhdata":
+            self.fill_na_fields()
 
         self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)
 
@@ -148,8 +155,9 @@ class EPCDataProcessor:
 
         self.fill_invalid_constituency_fields(ignore_step=ignore_step)
 
-        self.make_cleaning_averages(ignore_step=ignore_step)
-        self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
+        if self.run_mode != "kwhdata":
+            self.make_cleaning_averages(ignore_step=ignore_step)
+            self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
 
         # TODO: check if this has impact on training dataset
         # cleaned_data = self.apply_averages_cleaning(
@@ -160,21 +168,37 @@ class EPCDataProcessor:
         # )
 
         # When running in newdata mode, cleaning_averages has lower cases so we co-erce back to upper
+
         cleaning_averages = self.cleaning_averages.copy()
         if self.run_mode == "newdata":
             cleaning_averages.columns = cleaning_averages.columns.str.upper()
 
-        cleaned_data = self.apply_averages_cleaning(
-            data_to_clean=self.data,
-            cleaning_data=cleaning_averages,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON,
-        )
+        if self.run_mode == "kwhdata":
+            cleaned_data = self.data
+        else:
+            cleaned_data = self.apply_averages_cleaning(
+                data_to_clean=self.data,
+                cleaning_data=cleaning_averages,
+                cols_to_merge_on=COLUMNS_TO_MERGE_ON,
+            )
 
         self.data = self.data if cleaned_data is None else cleaned_data
 
-        self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
+        if self.run_mode != "kwhdata":
+            self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
+
         self.cast_data_columns_to_lower()
 
+    def rename_kwhdata_columns(self) -> None:
+        """
+        Rename the kwh data columns to match the EPC API naming: column
+        names are upper-cased and hyphens are replaced with underscores.
+        """
+        # regex=False keeps "-" a literal match on every pandas version
+        self.data.columns = self.data.columns.str.upper().str.replace(
+            "-", "_", regex=False
+        )
+
     def cast_data_columns_to_lower(self):
         """
         Convert all columns names to lower