mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
74 lines
2.1 KiB
Python
74 lines
2.1 KiB
Python
"""
|
|
Prepare the dataset's features for modelling: drop unused columns, select the features to keep and optionally subsample rows
|
|
"""
|
|
|
|
import pandas as pd
|
|
from typing import List
|
|
from core.Logger import logger
|
|
|
|
# Columns dropped by FeatureProcessor.drop_unused_columns when the target column
# is "RDSAP_CHANGE": the UPRN column and the other target (HEAT_DEMAND_CHANGE).
RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"]
|
|
# Columns dropped by FeatureProcessor.drop_unused_columns when the target column
# is "HEAT_DEMAND_CHANGE": the UPRN column and the other target (RDSAP_CHANGE).
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"]
|
|
|
|
# Passed as random_state to DataFrame.sample in FeatureProcessor.subsample_data
# so that repeated runs draw the same rows.
RANDOM_SEED = 0
|
|
|
|
|
|
class FeatureProcessor:
|
|
"""
|
|
Handle all feature manipulation before modelling
|
|
"""
|
|
|
|
@staticmethod
|
|
def drop_unused_columns(
|
|
df: pd.DataFrame, target_column: str = "RDSAP_CHANGE"
|
|
) -> pd.DataFrame:
|
|
"""
|
|
Remove the unused columns for RDS
|
|
"""
|
|
if target_column == "RDSAP_CHANGE":
|
|
df = df.drop(columns=RDSAP_CHANGE_DROP_COLUMNS)
|
|
elif target_column == "HEAT_DEMAND_CHANGE":
|
|
df = df.drop(columns=HEAT_DEMAND_CHANGE_DROP_COLUMNS)
|
|
return df
|
|
|
|
@staticmethod
|
|
def retain_features(df: pd.DataFrame, features: List[str] | None = None):
|
|
"""
|
|
Determine which columns to keep for modelling
|
|
"""
|
|
if features is None:
|
|
features = df.columns.to_list()
|
|
else:
|
|
if not set(features).issubset(df.columns):
|
|
logger.error("Features defined is not contained in data")
|
|
exit(1)
|
|
|
|
df = df[features]
|
|
|
|
return df
|
|
|
|
@staticmethod
|
|
def subsample_data(
|
|
df: pd.DataFrame, subsample_amount: int | None = None
|
|
) -> pd.DataFrame:
|
|
"""
|
|
Sample data to reduce number of rows for model building if needed
|
|
"""
|
|
|
|
if subsample_amount:
|
|
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
|
|
return df
|
|
|
|
def process(
|
|
self,
|
|
df: pd.DataFrame,
|
|
target_column: str = "RDSAP_CHANGE",
|
|
features: List[str] | None = None,
|
|
subsample_amount: int | None = None,
|
|
) -> pd.DataFrame:
|
|
"""
|
|
Pipeline to get data ready for building a model
|
|
"""
|
|
df = self.subsample_data(df, subsample_amount=subsample_amount)
|
|
df = self.drop_unused_columns(df, target_column=target_column)
|
|
df = self.retain_features(df, features=features)
|
|
return df
|