# Model/model_data/simulation_system/core/Metrics.py

"""
Generate metrics and enable regeneration of metrics if new metrics are generated
Key tasks:
- Specify metric functions that take in prediction vs actual to generate a metric value
- Given a model and test data, produce a suite of all metrics
"""

import pandas as pd
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from core.Settings import (
    RESIDUAL_TRUE_LABEL,
    RESIDUAL_PREDICTION_LABEL,
    SEABORN_RESIDUAL_AXIS_FONTSIZE,
    SEABORN_RESIDUAL_TITLE_FONTSIZE,
    SEABORN_RESIDUAL_STYLE,
    SEABORN_RESIDUAL_ASPECT_RATIO,
    SEABORN_RESIDUAL_PLOT_DPI,
    SEABORN_RESIDUAL_RANGE,
    SEABORN_RESIDUAL_LINE_COLOUR,
    SEABORN_RESIDUAL_LINE_WIDTH,
)
from sklearn.metrics import (
    mean_absolute_error,
    median_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
)


# Dummy example of a new metric that can be added - it must take the true and predicted values as its arguments
def max_error(y_true: pd.Series, y_pred: pd.Series) -> float:
    """
    Return the largest absolute difference between true and predicted values.

    Values are paired by position (not by pandas index), which matches how the
    scikit-learn metric functions used alongside this one treat their inputs.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, same length as ``y_true``.

    Returns:
        The maximum of ``|y_true - y_pred|`` as a float.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Drop any pandas index so the subtraction cannot re-align rows and
    # introduce NaNs when the two inputs carry different indices.
    true_values = pd.Series(y_true).to_numpy()
    predicted_values = pd.Series(y_pred).to_numpy()

    if len(true_values) != len(predicted_values):
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{len(true_values)} and {len(predicted_values)}"
        )
    if len(true_values) == 0:
        raise ValueError("max_error requires at least one observation")

    # Absolute residuals: an over-prediction is as much an error as an
    # under-prediction, and the result must never be negative.
    return float(abs(true_values - predicted_values).max())


# Metric functions run by Metrics.generate_metric_suite. Each entry is called
# as metric(actuals, predictions) and its result is stored under the
# function's __name__, so every entry must be a named callable with that
# two-argument signature.
METRIC_TO_APPLY = [
    mean_absolute_error,
    median_absolute_error,
    mean_squared_error,
    mean_absolute_percentage_error,
    # max_error  (custom example metric defined in this module; currently disabled)
]


def sort_by_metric(
    data: pd.DataFrame, optimse_metric: str, best_model_column_name: str
) -> pd.DataFrame:
    """
    Helper function to sort a data frame by a metric and append a best model flag

    Args:
        data: Frame of results to rank; must contain the ``optimse_metric`` column.
        optimse_metric: Name of the column to sort by (lower values rank first).
        best_model_column_name: Name of the boolean flag column to add. An
            existing column with this name is overwritten.

    Returns:
        A new frame, sorted ascending by ``optimse_metric`` with a fresh
        0..n-1 index, where the flag column is True for the first row only.
        An empty input yields an empty frame with the flag column added.
        The input frame is not modified.

    Raises:
        KeyError: If ``optimse_metric`` is not a column of ``data``.
    """
    # Ascending as we want lowest error values
    ranked = data.sort_values(optimse_metric, ascending=True).reset_index(drop=True)

    # After the index reset the best row sits at label 0. Comparing the index
    # flags it in one step and, unlike `ranked.loc[0, col] = True`, does not
    # append a spurious row when the frame is empty.
    ranked[best_model_column_name] = ranked.index == 0

    return ranked


class Metrics:
    """
    Namespace of static helpers that compute the metric suite and produce
    diagnostic plots for a set of actual vs predicted values
    """

    @staticmethod
    def list_metric_functions() -> list[str]:
        """
        Return the names of all metric functions in METRIC_TO_APPLY, in order
        """
        return [metric_to_apply.__name__ for metric_to_apply in METRIC_TO_APPLY]

    @staticmethod
    def generate_metric_suite(actuals: pd.Series, predictions: pd.Series) -> pd.Series:
        """
        Evaluate every metric in METRIC_TO_APPLY on the given actual and
        predicted values

        Args:
            actuals: Observed target values.
            predictions: Predicted target values, passed to each metric as the
                second argument.

        Returns:
            A Series indexed by metric function name holding each metric value.
        """
        return pd.Series(
            {
                metric_function.__name__: metric_function(actuals, predictions)
                for metric_function in METRIC_TO_APPLY
            }
        )

    @staticmethod
    def generate_plot_suite():
        """
        Can do all metric plotting (placeholder - not implemented yet)
        """

    @staticmethod
    def generate_residual_plot(
        actuals: pd.Series,
        predictions: pd.Series,
        target_column: str,
        output_filepath: Path | str,
    ) -> None:
        """
        Save a scatter plot of predicted against true values to disk

        The plot is drawn on its own figure, which is closed once the file has
        been written, so repeated calls neither overlay earlier data nor
        accumulate open figures.

        Args:
            actuals: Observed target values (x axis).
            predictions: Predicted target values (y axis). Values are paired
                with ``actuals`` by position; extra values in the longer input
                are ignored.
            target_column: Name of the target, used in the axis labels.
            output_filepath: Destination handed to ``Figure.savefig``.
        """
        # TODO: can have a model.metric_outputs method
        # For now just do it here
        residual_df = pd.DataFrame(
            list(zip(actuals, predictions)),
            columns=[RESIDUAL_TRUE_LABEL, RESIDUAL_PREDICTION_LABEL],
        )

        # Apply the seaborn style before the axes exist so it takes effect on them.
        sns.set(style=SEABORN_RESIDUAL_STYLE)

        # Use a dedicated figure instead of the implicit "current" one, which
        # may still hold artists from a previous plot.
        fig, ax = plt.subplots()
        try:
            sns.scatterplot(
                x=RESIDUAL_TRUE_LABEL,
                y=RESIDUAL_PREDICTION_LABEL,
                data=residual_df,
                ax=ax,
            )

            # image formatting
            ax.set_aspect(SEABORN_RESIDUAL_ASPECT_RATIO)
            ax.set_xlabel(
                f"True {target_column}", fontsize=SEABORN_RESIDUAL_AXIS_FONTSIZE
            )
            ax.set_ylabel(
                f"Predicted {target_column}", fontsize=SEABORN_RESIDUAL_AXIS_FONTSIZE
            )
            ax.set_title("Residuals", fontsize=SEABORN_RESIDUAL_TITLE_FONTSIZE)

            # Reference line: the configured range plotted against itself,
            # i.e. the y = x line on which prediction equals truth.
            ax.plot(
                SEABORN_RESIDUAL_RANGE,
                SEABORN_RESIDUAL_RANGE,
                SEABORN_RESIDUAL_LINE_COLOUR,
                linewidth=SEABORN_RESIDUAL_LINE_WIDTH,
            )

            fig.tight_layout()
            fig.savefig(output_filepath, dpi=SEABORN_RESIDUAL_PLOT_DPI)
        finally:
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)