NOTE(review): the added lines in etl/epc/Dataset.py differ from the first draft of this patch:
  * _clean_efficiency_variables no longer declares a `df` parameter; the new call site passes
    no argument and the body only reads self.df, so the old signature raised TypeError.
  * _clean_missing_values picks one fill value per column (if / elif / else).
  * the post-image blob id on the Dataset.py index line predates these corrections.
NOTE(review): etl/epc/property_change_app.py still reads data_by_urpn_df in
process_and_prune_desriptions(...) and dataset.append(...) after its assignment is commented
out - confirm it is assigned somewhere outside the hunks shown here.

diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py
index 8379546f..66715083 100644
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@@ -14,7 +14,43 @@ class TrainingDataset:
         self._feature_generation()
         self._drop_features()
         self._clean_dataframe()
-        self._clean_efficiency_variables(self.df)
+        self._clean_efficiency_variables()
+        self._null_validation(information="Clean Efficiency Variables")
+        self._process_and_prune()
+        self._clean_missing_values()
+        self._null_validation(information="Clean Missing Values")
+
+    def _clean_missing_values(self, ignore_cols=None):
+        """
+        Fill the nulls that remain in self.df, column by column.
+
+        :param ignore_cols: optional collection of column names to leave untouched
+        """
+        missings = pd.isnull(self.df).sum()
+        missings = missings[missings > 0]
+
+        if ignore_cols:
+            missings = missings[~missings.index.isin(ignore_cols)]
+
+        for col in missings.index:
+            unique_values = self.df[col].unique()
+            # Exactly one fill value per column: boolean flags take False,
+            # columns that already use "none" take "none", the rest "Unknown".
+            if True in unique_values or False in unique_values:
+                self.df[col] = self.df[col].fillna(False)
+            elif "none" in unique_values:
+                self.df[col] = self.df[col].fillna("none")
+            else:
+                self.df[col] = self.df[col].fillna("Unknown")
+
+    def _null_validation(self, information: str = ""):
+        """
+        Raise a ValueError if self.df still contains any null value.
+
+        :param information: name of the pipeline step that has just run
+        """
+        if pd.isnull(self.df).sum().sum():
+            raise ValueError(f"Null values found in dataset, after step {information}")
 
     def _drop_features(self):
         """
@@ -30,8 +66,7 @@ class TrainingDataset:
         self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
         self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])
 
-    @staticmethod
-    def _clean_efficiency_variables(df):
+    def _clean_efficiency_variables(self):
         """
         These is scope to clean this by the model per corresponding description.
 
@@ -43,20 +78,19 @@ class TrainingDataset:
         :return:
         """
 
-        missings = pd.isnull(df).sum()
+        missings = pd.isnull(self.df).sum()
         missings = missings[missings >= 1]
 
         if len(missings) == 0:
-            return df
+            return
 
         # Make sure they are all efficiency columns
         if any(~missings.index.str.contains("ENERGY_EFF")):
             raise ValueError("Non efficiency columns are missing")
 
         for m in missings.index:
-            df[m] = df[m].fillna("NO_RATING")
+            self.df[m] = self.df[m].fillna("NO_RATING")
 
-        return df
 
     @staticmethod
     def _calculate_days_to(lodgement_date):
diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py
index da17fe05..14a1bfd2 100644
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@@ -579,7 +579,7 @@ def app():
         from etl.epc.Dataset import TrainingDataset
         constituency_data = TrainingDataset(datasets=data_by_uprn)
 
-        data_by_urpn_df = pd.DataFrame(data_by_urpn)
+        # data_by_urpn_df = pd.DataFrame(data_by_urpn)
 
         # # TODO: can we move this into the epc record?
         # data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
@@ -592,7 +592,7 @@ def app():
 
         # data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
 
-        data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
+        # data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
 
         # We look for key building fabric features that have changed from one EPC to the next.
         # if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
@@ -600,8 +600,8 @@ def app():
         # is low
 
         # We also replace descriptions with their cleaned variants
-        if pd.isnull(data_by_urpn_df).sum().sum():
-            raise ValueError("Null values found in dataset")
+        # if pd.isnull(data_by_urpn_df).sum().sum():
+        #     raise ValueError("Null values found in dataset")
 
         data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
 
@@ -617,10 +617,10 @@ def app():
         # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
         # need to
 
-        data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
+        # data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
 
-        if pd.isnull(data_by_urpn_df).sum().sum():
-            raise ValueError("Null values found in dataset after process_and_prune_desriptions")
+        # if pd.isnull(data_by_urpn_df).sum().sum():
+        #     raise ValueError("Null values found in dataset after process_and_prune_desriptions")
 
         dataset.append(data_by_urpn_df)
 