Model/etl/epc/FeatureProcessor.py
2023-10-05 18:20:52 +01:00

74 lines
2.1 KiB
Python

"""
Prepare the dataset for modelling: drop unused columns, select features and subsample rows
"""
import pandas as pd
from typing import List
from core.Logger import logger
# Columns removed by FeatureProcessor.drop_unused_columns when the target
# column is "RDSAP_CHANGE".
RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"]
# Columns removed by FeatureProcessor.drop_unused_columns when the target
# column is "HEAT_DEMAND_CHANGE".
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"]
# Fixed random_state handed to DataFrame.sample in
# FeatureProcessor.subsample_data so repeated runs draw the same rows.
RANDOM_SEED = 0
class FeatureProcessor:
"""
Handle all feature manipulation before modelling
"""
@staticmethod
def drop_unused_columns(
df: pd.DataFrame, target_column: str = "RDSAP_CHANGE"
) -> pd.DataFrame:
"""
Remove the unused columns for RDS
"""
if target_column == "RDSAP_CHANGE":
df = df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
elif target_column == "HEAT_DEMAND_CHANGE":
df = df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
return df
@staticmethod
def retain_features(df: pd.DataFrame, features: List[str] | None = None):
"""
Determine which columns to keep for modelling
"""
if features is None:
features = df.columns.to_list()
else:
if not set(features).issubset(df.columns):
logger.error("Features defined is not contained in data")
exit(1)
df = df[features]
return df
@staticmethod
def subsample_data(
df: pd.DataFrame, subsample_amount: int | None = None
) -> pd.DataFrame:
"""
Sample data to reduce number of rows for model building if needed
"""
if subsample_amount:
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
return df
def process(
self,
df: pd.DataFrame,
target_column: str = "RDSAP_CHANGE",
features: List[str] | None = None,
subsample_amount: int | None = None,
) -> pd.DataFrame:
"""
Pipeline to get data ready for building a model
"""
df = self.subsample_data(df, subsample_amount=subsample_amount)
df = self.drop_unused_columns(df, target_column=target_column)
df = self.retain_features(df, features=features)
return df