# Model/etl/epc/Dataset.py

import pandas as pd
from typing import List
from etl.epc.EPCRecord import EPCDifferenceRecord

class TrainingDataset:
    """
    A collection of EPCDifferenceRecords can be combined into a TrainingDataset.

    On construction the ``difference_record`` of every supplied record becomes one
    row of ``self.df``, which is then run through the preparation steps listed in
    ``__init__``. Two datasets can be combined with ``+`` (and therefore ``sum()``),
    which rebuilds the dataframe from the concatenated record lists.

    NOTE(review): ``_clean_dataframe`` and ``_process_and_prune`` are called by
    ``__init__`` but are not defined on this class as shown here, and
    ``EARLIEST_EPC_DATE`` is read by ``_calculate_days_to`` but is neither defined
    nor imported in the visible module - confirm where these are provided.
    """

    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
        """
        Build the modelling dataframe and run every preparation step in order.

        :param datasets: records whose ``difference_record`` each become one row of ``self.df``
        :raises ValueError: if null values remain after either cleaning stage
        """
        self.datasets = datasets
        self.df = pd.DataFrame([dataset.difference_record for dataset in datasets])

        # The generated features read the lodgement date columns, so they must be
        # created before those columns are dropped.
        self._feature_generation()
        self._drop_features()
        self._clean_dataframe()
        self._clean_efficiency_variables()
        self._null_validation(information="Clean Efficiency Variables")
        self._process_and_prune()
        self._clean_missing_values()
        self._null_validation(information="Clean Missing Values")

    def _clean_missing_values(self, ignore_cols=None):
        """
        Fill the remaining null values with a per-column default.

        For every column that still contains nulls, exactly one fill value is chosen:

        * ``False`` if the column holds boolean values,
        * ``"none"`` if the column already uses the string ``"none"``,
        * ``"Unknown"`` otherwise.

        :param ignore_cols: optional collection of column names to leave untouched
        """
        null_counts = pd.isnull(self.df).sum()
        null_counts = null_counts[null_counts > 0]

        if ignore_cols:
            null_counts = null_counts[~null_counts.index.isin(ignore_cols)]

        for col in null_counts.index:
            unique_values = self.df[col].unique()
            # A strict boolean test is used on purpose: ``True in values`` would
            # also match the numbers 1 and 0, because 1 == True and 0 == False.
            if any(pd.api.types.is_bool(value) for value in unique_values):
                fill_value = False
            elif any(isinstance(value, str) and value == "none" for value in unique_values):
                fill_value = "none"
            else:
                fill_value = "Unknown"
            self.df[col] = self.df[col].fillna(fill_value)

    def _null_validation(self, information: str = ""):
        """
        Raise if ``self.df`` contains any null value.

        :param information: name of the step that has just run, included in the error message
        :raises ValueError: if at least one null value is present
        """
        if pd.isnull(self.df).sum().sum():
            raise ValueError(f"Null values found in dataset, after step {information}")

    def _drop_features(self):
        """
        Drop features that are not needed for modelling
        """
        self.df = self.df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])

    def _feature_generation(self):
        """
        Generate features for modelling
        """
        self.df["DAYS_TO_STARTING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_STARTING"])
        self.df["DAYS_TO_ENDING"] = self._calculate_days_to(self.df["LODGEMENT_DATE_ENDING"])

    def _clean_efficiency_variables(self, df=None):
        """
        Fill missing energy efficiency ratings in ``self.df`` with ``"NO_RATING"``.

        There is scope to clean this per corresponding description, e.g. for an
        efficiency column we could look at the mode efficiency rating by description
        and fill in the missing values with this.
        When looking at this initially, there was a large volume of records with
        missing energy efficiency values and therefore a simpler approach was taken
        just to test including these variables.

        :param df: unused; kept (now optional) so existing callers that pass a
            dataframe keep working. The method always operates on ``self.df``.
        :raises ValueError: if a column whose name does not contain ``ENERGY_EFF``
            has missing values
        :return: None
        """
        null_counts = pd.isnull(self.df).sum()
        null_counts = null_counts[null_counts >= 1]

        if null_counts.empty:
            return

        # Make sure they are all efficiency columns
        if not null_counts.index.str.contains("ENERGY_EFF").all():
            raise ValueError("Non efficiency columns are missing")

        for col in null_counts.index:
            self.df[col] = self.df[col].fillna("NO_RATING")

    @staticmethod
    def _calculate_days_to(lodgement_date):
        """
        Return the number of whole days between ``EARLIEST_EPC_DATE`` and ``lodgement_date``.

        :param lodgement_date: a single date (e.g. a string) or a pandas Series of dates
        :return: an integer day count for a single date, or a Series of day counts
            for a Series input
        """
        elapsed = pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)

        # A Series exposes the day component through the ``.dt`` accessor, whereas
        # a scalar Timedelta exposes ``.days`` directly.
        if isinstance(elapsed, pd.Series):
            return elapsed.dt.days
        return elapsed.days

    def __add__(self, other) -> "TrainingDataset":
        """
        Combine two TrainingDatasets into a new one built from both record lists.

        :raises TypeError: if ``other`` is not a TrainingDataset
        """
        if not isinstance(other, TrainingDataset):
            raise TypeError("Addition can only be performed with another instance of TrainingDataset")
        return TrainingDataset(self.datasets + other.datasets)

    def __radd__(self, other):
        """
        Required for sum() to work
        """
        # sum() starts from the integer 0, so an integer on the left-hand side
        # is treated as the identity.
        if isinstance(other, int):
            return self
        else:
            return self.__add__(other)

class ScoringDataset:
    """
    A ScoringDataset wraps a list of EPCDifferenceRecords.

    Instances can be merged with ``+`` and therefore also with ``sum()``.
    """

    def __init__(self, datasets: List[EPCDifferenceRecord]) -> None:
        # The list is stored as given; no copy is made.
        self.datasets = datasets

    def __add__(self, other) -> "ScoringDataset":
        """
        Return a new ScoringDataset holding the records of both operands.
        Anything other than a ScoringDataset on the right-hand side is rejected.
        """
        if isinstance(other, ScoringDataset):
            combined = self.datasets + other.datasets
            return ScoringDataset(combined)
        raise TypeError("Addition can only be performed with another instance of ScoringDataset")

    def __radd__(self, other):
        """
        Reflected addition, needed because sum() begins with the integer 0.
        """
        # An integer on the left-hand side acts as the identity element.
        return self if isinstance(other, int) else self.__add__(other)