mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
131 lines
No EOL
4.4 KiB
Python
131 lines
No EOL
4.4 KiB
Python
import pandas as pd
|
|
from typing import List
|
|
from etl.epc.EPCRecord import EPCDifferenceRecord
|
|
|
|
class TrainingDataset:
    """
    A collection of EPCDifferenceRecords can be combined into a TrainingDataset.

    On construction the ``difference_record`` of every record becomes one row
    of a pandas DataFrame (``self.df``). That frame is then passed through a
    fixed sequence of feature-generation and cleaning steps, with a null
    check after each of the two fill steps.

    Instances support ``+`` and ``sum()``; the result is a new
    TrainingDataset built from the concatenated record lists.

    NOTE(review): ``__init__`` calls ``self._clean_dataframe()`` and
    ``self._process_and_prune()``, and ``_calculate_days_to`` reads the
    module-level name ``EARLIEST_EPC_DATE``. None of these is defined or
    imported in the part of the file visible here - confirm they are
    provided elsewhere, otherwise construction fails.
    """

    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
        """
        :param datasets: records whose ``difference_record`` attribute each
            supply one row of ``self.df``
        """
        self.datasets = datasets
        self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])

        # Order matters: the day-count features are derived from the
        # lodgement date columns before _drop_features removes them.
        self._feature_generation()
        self._drop_features()
        self._clean_dataframe()
        self._clean_efficiency_variables()
        self._null_validation(information="Clean Efficiency Variables")
        self._process_and_prune()
        self._clean_missing_values()
        self._null_validation(information="Clean Missing Values")

    def _clean_missing_values(self, ignore_cols=None):
        """
        Fill the remaining nulls in ``self.df`` column by column.

        The fill value is chosen from the values already present in the
        column: ``False`` if the column contains ``True`` or ``False``,
        otherwise ``"none"`` if it contains ``"none"``, otherwise
        ``"Unknown"``.

        :param ignore_cols: optional collection of column names to leave
            untouched
        """
        null_counts = self.df.isnull().sum()
        columns_with_nulls = null_counts[null_counts > 0].index

        if ignore_cols:
            columns_with_nulls = columns_with_nulls[~columns_with_nulls.isin(ignore_cols)]

        for col in columns_with_nulls:
            # Plain Python values, so the membership tests below behave the
            # same whatever array type backs the column.
            observed = self.df[col].dropna().unique().tolist()

            # Membership uses ==, so 1 and 0 also count as True and False.
            if True in observed or False in observed:
                fill_value = False
            elif "none" in observed:
                fill_value = "none"
            else:
                fill_value = "Unknown"

            self.df[col] = self.df[col].fillna(fill_value)

    def _null_validation(self, information: str = ""):
        """
        Raise if ``self.df`` still contains any null value.

        :param information: name of the step that has just run; it is
            included in the error message
        :raises ValueError: if at least one null is present
        """
        if self.df.isnull().values.any():
            raise ValueError(f"Null values found in dataset, after step {information}")

    def _drop_features(self):
        """
        Drop features that are not needed for modelling
        """
        self.df = self.df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

    def _feature_generation(self):
        """
        Generate features for modelling
        """
        self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
        self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])

    def _clean_efficiency_variables(self, df=None):
        """
        Fill missing energy efficiency values in ``self.df`` with "NO_RATING".

        There is scope to clean this by the model per corresponding description.
        E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
        fill in the missing values with this.
        When looking at this initially, there are a large volume of records with missing energy efficiency
        values and therefore a simpler approach was taken just to test including these variables

        :param df: unused; the method always works on ``self.df``. Kept, now
            with a default, so that both the argument-less call in
            ``__init__`` and any caller passing a frame keep working.
        :return: None
        :raises ValueError: if a column whose name does not contain
            "ENERGY_EFF" has missing values
        """
        null_counts = self.df.isnull().sum()
        columns_with_nulls = null_counts[null_counts >= 1].index

        if columns_with_nulls.empty:
            return

        # Make sure they are all efficiency columns
        if any("ENERGY_EFF" not in str(col) for col in columns_with_nulls):
            raise ValueError("Non efficiency columns are missing")

        for col in columns_with_nulls:
            self.df[col] = self.df[col].fillna("NO_RATING")

    @staticmethod
    def _calculate_days_to(lodgement_date):
        """
        Return the days between ``EARLIEST_EPC_DATE`` and ``lodgement_date``.

        :param lodgement_date: a single date (for example a string) or a
            pandas Series of dates, in any form ``pd.to_datetime`` accepts
        :return: the ``days`` component of the difference; a Series of day
            counts when a Series is passed in
        """
        elapsed = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)

        # A Series exposes its timedelta fields through the .dt accessor; a
        # scalar Timedelta carries .days directly.
        if isinstance(elapsed, pd.Series):
            return elapsed.dt.days
        return elapsed.days

    def __add__(self, other) -> "TrainingDataset":
        """
        Combine two TrainingDatasets into a new one built from both record lists.

        :raises TypeError: if ``other`` is not a TrainingDataset
        """
        if not isinstance(other, TrainingDataset):
            raise TypeError("Addition can only be performed with another instance of TrainingDataset")
        return TrainingDataset(self.datasets + other.datasets)

    def __radd__(self, other):
        """
        Required for sum() to work
        """
        # sum() starts from the int 0, so an int on the left-hand side just
        # yields this instance unchanged.
        if isinstance(other, int):
            return self
        return self.__add__(other)
|
|
|
|
class ScoringDataset:
    """
    Groups EPCDifferenceRecords into a single dataset for scoring.

    Instances can be concatenated with ``+`` and reduced with ``sum()``.
    """

    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
        # The records are stored exactly as given; nothing is processed here.
        self.datasets = datasets

    def __add__(self, other) -> "ScoringDataset":
        """
        Return a new ScoringDataset holding this instance's records followed
        by those of ``other``. Any operand that is not a ScoringDataset
        raises TypeError.
        """
        if isinstance(other, ScoringDataset):
            combined = self.datasets + other.datasets
            return ScoringDataset(combined)
        raise TypeError("Addition can only be performed with another instance of ScoringDataset")

    def __radd__(self, other):
        """
        Reflected addition, needed so that sum() works.

        An int on the left-hand side (such as the 0 that sum() starts from)
        yields this instance unchanged; anything else is passed to __add__.
        """
        return self if isinstance(other, int) else self.__add__(other)