mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
potential cleaning for kwh data
This commit is contained in:
parent
19c471f614
commit
4a49278eb2
1 changed file with 35 additions and 17 deletions
|
|
@ -100,8 +100,8 @@ class EPCDataProcessor:
|
|||
|
||||
# FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
|
||||
self.violation_mode = violation_mode
|
||||
if run_mode not in ["training", "newdata"]:
|
||||
raise ValueError("Run mode must be either training or newdata")
|
||||
if run_mode not in ["training", "newdata", "kwhdata"]:
|
||||
raise ValueError("Run mode must be either training, newdata or kwhdata")
|
||||
self.run_mode = run_mode if not violation_mode else "newdata"
|
||||
|
||||
def prepare_data(self, filepath: Path | str | None = None) -> None:
|
||||
|
|
@ -110,7 +110,9 @@ class EPCDataProcessor:
|
|||
Ignore step is used to highlight which steps are not needed in newdata
|
||||
"""
|
||||
|
||||
ignore_step = True if self.run_mode == "newdata" else False
|
||||
ignore_step = True if self.run_mode in ["newdata"] else False
|
||||
if self.run_mode == "kwhdata":
|
||||
self.rename_kwhdata_columns()
|
||||
|
||||
if filepath is not None:
|
||||
self.load_data(
|
||||
|
|
@ -126,18 +128,21 @@ class EPCDataProcessor:
|
|||
self.remap_build_form()
|
||||
self.cast_data_column_values_to_lower()
|
||||
self.standardise_construction_age_band(ignore_step=ignore_step)
|
||||
self.clean_missing_rooms(ignore_step=ignore_step)
|
||||
if self.run_mode != "kwhdata":
|
||||
self.clean_missing_rooms(ignore_step=ignore_step)
|
||||
self.recast_df_columns(
|
||||
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
||||
)
|
||||
self.clean_multi_glaze_proportion(ignore_step=ignore_step)
|
||||
self.clean_photo_supply()
|
||||
self.retain_multiple_epc_properties(
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
|
||||
ignore_step=ignore_step,
|
||||
)
|
||||
if self.run_mode != "kwhdata":
|
||||
self.retain_multiple_epc_properties(
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
|
||||
ignore_step=ignore_step,
|
||||
)
|
||||
|
||||
self.fill_na_fields()
|
||||
if self.run_mode != "kwhdata":
|
||||
self.fill_na_fields()
|
||||
|
||||
self.sort_data_by_uprn_lodgement_date(ignore_step=ignore_step)
|
||||
|
||||
|
|
@ -148,8 +153,9 @@ class EPCDataProcessor:
|
|||
|
||||
self.fill_invalid_constituency_fields(ignore_step=ignore_step)
|
||||
|
||||
self.make_cleaning_averages(ignore_step=ignore_step)
|
||||
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
|
||||
if self.run_mode != "kwhdata":
|
||||
self.make_cleaning_averages(ignore_step=ignore_step)
|
||||
self.add_local_authority_to_cleaning_average(ignore_step=ignore_step)
|
||||
|
||||
# TODO: check if this has impact on training dataset
|
||||
# cleaned_data = self.apply_averages_cleaning(
|
||||
|
|
@ -160,21 +166,33 @@ class EPCDataProcessor:
|
|||
# )
|
||||
|
||||
# When running in newdata mode, cleaning_averages has lower-case column names, so we coerce them back to upper case
|
||||
|
||||
cleaning_averages = self.cleaning_averages.copy()
|
||||
if self.run_mode == "newdata":
|
||||
cleaning_averages.columns = cleaning_averages.columns.str.upper()
|
||||
|
||||
cleaned_data = self.apply_averages_cleaning(
|
||||
data_to_clean=self.data,
|
||||
cleaning_data=cleaning_averages,
|
||||
cols_to_merge_on=COLUMNS_TO_MERGE_ON,
|
||||
)
|
||||
if self.run_mode == "kwhdata":
|
||||
cleaned_data = self.data
|
||||
else:
|
||||
cleaned_data = self.apply_averages_cleaning(
|
||||
data_to_clean=self.data,
|
||||
cleaning_data=cleaning_averages,
|
||||
cols_to_merge_on=COLUMNS_TO_MERGE_ON,
|
||||
)
|
||||
|
||||
self.data = self.data if cleaned_data is None else cleaned_data
|
||||
|
||||
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
||||
if self.run_mode != "kwhdata":
|
||||
self.cast_cleaning_averages_columns_to_lower(ignore_step=ignore_step)
|
||||
|
||||
self.cast_data_columns_to_lower()
|
||||
|
||||
def rename_kwhdata_columns(self):
    """
    Rename the kwh data columns to the EPC API naming style.

    Every column name of ``self.data`` is upper-cased and each hyphen is
    replaced by an underscore (e.g. ``"lmk-key"`` becomes ``"LMK_KEY"``).
    The column index of ``self.data`` is reassigned in place; nothing is
    returned.
    """
    # regex=False pins literal matching of "-" so the result does not depend
    # on the default value of ``regex``, which differs between pandas versions.
    self.data.columns = self.data.columns.str.upper().str.replace(
        "-", "_", regex=False
    )
|
||||
|
||||
def cast_data_columns_to_lower(self):
|
||||
"""
|
||||
Convert all columns names to lower
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue