""" Create additional features from the dataset """ import pandas as pd from typing import List from core.Logger import logger RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"] HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"] RANDOM_SEED = 0 class FeatureProcessor: """ Handle all feature manipulation before modelling """ @staticmethod def drop_unused_columns( df: pd.DataFrame, target_column: str = "RDSAP_CHANGE" ) -> pd.DataFrame: """ Remove the unused columns for RDS """ if target_column == "RDSAP_CHANGE": df = df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS) elif target_column == "HEAT_DEMAND_CHANGE": df = df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS) return df @staticmethod def retain_features(df: pd.DataFrame, features: List[str] | None = None): """ Determine which columns to keep for modelling """ if features is None: features = df.columns.to_list() else: if not set(features).issubset(df.columns): logger.error("Features defined is not contained in data") exit(1) df = df[features] return df @staticmethod def subsample_data( df: pd.DataFrame, subsample_amount: int | None = None ) -> pd.DataFrame: """ Sample data to reduce number of rows for model building if needed """ if subsample_amount: df = df.sample(subsample_amount, random_state=RANDOM_SEED) return df def process( self, df: pd.DataFrame, target_column: str = "RDSAP_CHANGE", features: List[str] | None = None, subsample_amount: int | None = None, ) -> pd.DataFrame: """ Pipeline to get data ready for building a model """ df = self.subsample_data(df, subsample_amount=subsample_amount) df = self.drop_unused_columns(df, target_column=target_column) df = self.retain_features(df, features=features) return df