potential cleaning for kwh data

This commit is contained in:
Michael Duong 2024-08-12 22:05:43 +01:00
parent 19c471f614
commit 4a49278eb2

View file

@ -100,8 +100,8 @@ class EPCDataProcessor:
# FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
self.violation_mode = violation_mode
if run_mode not in ["training", "newdata"]:
raise ValueError("Run mode must be either training or newdata")
if run_mode not in ["training", "newdata", "kwhdata"]:
raise ValueError("Run mode must be either training, newdata or kwhdata")
self.run_mode = run_mode if not violation_mode else "newdata"
def prepare_data(self, filepath: Path | str | None = None) -> None:
@ -110,7 +110,9 @@ class EPCDataProcessor:
Ignore step is used to highlight which steps are not needed in newdata
"""
ignore_step = True if self.run_mode == "newdata" else False
ignore_step = True if self.run_mode in ["newdata"] else False
if self.run_mode == "kwhdata":
self.rename_kwhdata_columns()
if filepath is not None:
self.load_data(
@ -126,18 +128,21 @@ class EPCDataProcessor:
self.remap_build_form()
self.cast_data_column_values_to_lower()
self.standardise_construction_age_band(ignore_step=ignore_step)
self.clean_missing_rooms(ignore_step=ignore_step)
if self.run_mode != "kwhdata":
self.clean_missing_rooms(ignore_step=ignore_step)
self.recast_df_columns(
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
)
self.clean_multi_glaze_proportion(ignore_step=ignore_step)
self.clean_photo_supply()
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
ignore_step=ignore_step,
)
if self.run_mode != "kwhdata":
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
ignore_step=ignore_step,
)
self.fill_na_fields()
if self.run_mode != "kwhdata":
self.fill_na_fields()
self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)
@ -148,8 +153,9 @@ class EPCDataProcessor:
self.fill_invalid_constituency_fields(ignore_step=ignore_step)
self.make_cleaning_averages(ignore_step=ignore_step)
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
if self.run_mode != "kwhdata":
self.make_cleaning_averages(ignore_step=ignore_step)
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
# TODO: check if this has impact on training dataset
# cleaned_data = self.apply_averages_cleaning(
@ -160,21 +166,33 @@ class EPCDataProcessor:
# )
# When running in newdata mode, cleaning_averages has lower-case column names, so we coerce them back to upper case
cleaning_averages = self.cleaning_averages.copy()
if self.run_mode == "newdata":
cleaning_averages.columns = cleaning_averages.columns.str.upper()
cleaned_data = self.apply_averages_cleaning(
data_to_clean=self.data,
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON,
)
if self.run_mode == "kwhdata":
cleaned_data = self.data
else:
cleaned_data = self.apply_averages_cleaning(
data_to_clean=self.data,
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON,
)
self.data = self.data if cleaned_data is None else cleaned_data
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
if self.run_mode != "kwhdata":
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
self.cast_data_columns_to_lower()
def rename_kwhdata_columns(self) -> None:
    """
    Rename the kWh data columns to the EPC API naming style.

    Every column label of ``self.data`` is upper-cased and each hyphen is
    replaced with an underscore. The column index of ``self.data`` is
    reassigned in place; nothing is returned.

    ``self.data`` must already be populated with string column labels
    before this is called, because the pandas ``.str`` accessor is used.
    """
    upper_cased = self.data.columns.str.upper()
    # regex=False pins a literal hyphen replacement: the pandas default for
    # ``regex`` changed between 1.x and 2.0, so we do not rely on it.
    self.data.columns = upper_cased.str.replace("-", "_", regex=False)
def cast_data_columns_to_lower(self):
"""
Convert all columns names to lower