mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
few more steps
This commit is contained in:
parent
353e8a90db
commit
7a2c2fff15
2 changed files with 38 additions and 14 deletions
|
|
@ -14,7 +14,33 @@ class TrainingDataset:
|
||||||
self._feature_generation()
|
self._feature_generation()
|
||||||
self._drop_features()
|
self._drop_features()
|
||||||
self._clean_dataframe()
|
self._clean_dataframe()
|
||||||
self._clean_efficiency_variables(self.df)
|
self._clean_efficiency_variables()
|
||||||
|
self._null_validation(information="Clean Efficiency Variables")
|
||||||
|
self._process_and_prune()
|
||||||
|
self._clean_missing_values()
|
||||||
|
self._null_validation(information="Clean Missing Values")
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_missing_values(self, ignore_cols=None):
    """
    Fill the nulls that remain in ``self.df``, one column at a time.

    The fill value is picked from the non-null values already present in the column:

    * columns holding real boolean values are filled with ``False``
    * columns that already contain the string "none" are filled with "none"
    * every other column is filled with "Unknown"

    :param ignore_cols: optional list-like of column names whose nulls are left untouched
    :return: None, ``self.df`` is updated in place
    """
    missings = pd.isnull(self.df).sum()
    missings = missings[missings > 0]

    # `is not None` rather than truthiness, so array-like arguments do not raise on bool()
    if ignore_cols is not None:
        missings = missings[~missings.index.isin(ignore_cols)]

    for col in missings.index:
        present = self.df[col].dropna().unique()

        # Detect booleans by type. `True in values` compares with ==, which also
        # matches 1 / 0 and would treat a numeric column as a boolean flag.
        if any(pd.api.types.is_bool(value) for value in present):
            fill_value = False
        elif any(isinstance(value, str) and value == "none" for value in present):
            fill_value = "none"
        else:
            fill_value = "Unknown"

        self.df[col] = self.df[col].fillna(fill_value)
|
||||||
|
|
||||||
|
|
||||||
|
def _null_validation(self, information: str = ""):
    """
    Raise if ``self.df`` still contains any null value.

    :param information: name of the pipeline step that has just run; it is
        appended to the error message when provided
    :raises ValueError: when at least one null value is present in ``self.df``
    """
    if not pd.isnull(self.df).sum().sum():
        return

    message = "Null values found in dataset"
    if information:
        # Only name the step when one was given, so the default call does not
        # end in a dangling "after step ".
        message = f"{message}, after step {information}"
    raise ValueError(message)
|
||||||
|
|
||||||
def _drop_features(self):
|
def _drop_features(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -30,8 +56,7 @@ class TrainingDataset:
|
||||||
self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
|
self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
|
||||||
self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])
|
self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])
|
||||||
|
|
||||||
@staticmethod
|
def _clean_efficiency_variables(self, df):
|
||||||
def _clean_efficiency_variables(df):
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
There is scope to clean this by the model per corresponding description.
|
There is scope to clean this by the model per corresponding description.
|
||||||
|
|
@ -43,20 +68,19 @@ class TrainingDataset:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
missings = pd.isnull(df).sum()
|
missings = pd.isnull(self.df).sum()
|
||||||
missings = missings[missings >= 1]
|
missings = missings[missings >= 1]
|
||||||
|
|
||||||
if len(missings) == 0:
|
if len(missings) == 0:
|
||||||
return df
|
return
|
||||||
|
|
||||||
# Make sure they are all efficiency columns
|
# Make sure they are all efficiency columns
|
||||||
if any(~missings.index.str.contains("ENERGY_EFF")):
|
if any(~missings.index.str.contains("ENERGY_EFF")):
|
||||||
raise ValueError("Non efficiency columns are missing")
|
raise ValueError("Non efficiency columns are missing")
|
||||||
|
|
||||||
for m in missings.index:
|
for m in missings.index:
|
||||||
df[m] = df[m].fillna("NO_RATING")
|
self.df[m] = self.df[m].fillna("NO_RATING")
|
||||||
|
|
||||||
return df
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _calculate_days_to(lodgement_date):
|
def _calculate_days_to(lodgement_date):
|
||||||
|
|
|
||||||
|
|
@ -579,7 +579,7 @@ def app():
|
||||||
from etl.epc.Dataset import TrainingDataset
|
from etl.epc.Dataset import TrainingDataset
|
||||||
constituency_data = TrainingDataset(datasets=data_by_uprn)
|
constituency_data = TrainingDataset(datasets=data_by_uprn)
|
||||||
|
|
||||||
data_by_urpn_df = pd.DataFrame(data_by_urpn)
|
# data_by_urpn_df = pd.DataFrame(data_by_urpn)
|
||||||
|
|
||||||
# # TODO: can we move this into the epc record?
|
# # TODO: can we move this into the epc record?
|
||||||
# data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
# data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
||||||
|
|
@ -592,7 +592,7 @@ def app():
|
||||||
|
|
||||||
# data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
# data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||||
|
|
||||||
data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
|
# data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
|
||||||
|
|
||||||
# We look for key building fabric features that have changed from one EPC to the next.
|
# We look for key building fabric features that have changed from one EPC to the next.
|
||||||
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
|
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
|
||||||
|
|
@ -600,8 +600,8 @@ def app():
|
||||||
# is low
|
# is low
|
||||||
# We also replace descriptions with their cleaned variants
|
# We also replace descriptions with their cleaned variants
|
||||||
|
|
||||||
if pd.isnull(data_by_urpn_df).sum().sum():
|
# if pd.isnull(data_by_urpn_df).sum().sum():
|
||||||
raise ValueError("Null values found in dataset")
|
# raise ValueError("Null values found in dataset")
|
||||||
|
|
||||||
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
|
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
|
||||||
|
|
||||||
|
|
@ -617,10 +617,10 @@ def app():
|
||||||
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
|
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
|
||||||
# need to
|
# need to
|
||||||
|
|
||||||
data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
|
# data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
|
||||||
|
|
||||||
if pd.isnull(data_by_urpn_df).sum().sum():
|
# if pd.isnull(data_by_urpn_df).sum().sum():
|
||||||
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
# raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
||||||
|
|
||||||
dataset.append(data_by_urpn_df)
|
dataset.append(data_by_urpn_df)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue