# Model/etl/epc/PredictionMatrix.py

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import pandas as pd

@dataclass
class RecommendationPrediction:
    """A single prediction tied to one measure.

    Instances are plain records: the dataclass machinery supplies
    ``__init__``, ``__repr__`` and ``__eq__``.
    """

    # Identifier of the measure this prediction belongs to.
    measure_id: str
    # The predicted value; no particular type is enforced here.
    prediction: Any
    # Free-form extra information; each instance gets its own fresh dict.
    metadata: Dict[str, Any] = field(default_factory=dict)

@dataclass
class PredictionEntry:
    """Everything recorded for one property, identified by its ``uprn``.

    Only ``uprn`` is required; every other field starts out empty and can be
    filled in later.
    """

    # Key identifying the property this entry describes.
    uprn: int
    # Prediction for the rebaselined property; None until one is set.
    rebaselined_prediction: Any = None
    # Per-measure predictions; each entry gets its own fresh list.
    recommendation_predictions: List[RecommendationPrediction] = field(default_factory=list)
    # Original EPC values keyed by field name; None until provided.
    original_epc: Optional[Dict[str, Any]] = None
    # Landlord-supplied values keyed by field name; None until provided.
    landlord_differences: Optional[Dict[str, Any]] = None
    # Lodgement date of the EPC; the type is not constrained here.
    lodgement_date: Optional[Any] = None

class PredictionMatrix:
    """Collect prediction results per ``uprn`` and flatten them into a DataFrame.

    Entries live in ``self.entries`` (``uprn`` -> ``PredictionEntry``). The
    ``add_*`` / ``set_*`` methods create a missing entry on demand, so they can
    be called in any order.
    """

    # Column-name suffixes used by to_dataframe() and summarise_differences().
    _ORIGINAL_SUFFIX = "_ori"
    _LANDLORD_SUFFIX = "_ll"

    def __init__(self):
        self.entries: Dict[int, PredictionEntry] = {}

    def _entry_for(self, uprn: int) -> "PredictionEntry":
        """Return the entry stored for ``uprn``, creating an empty one if absent."""
        if uprn not in self.entries:
            self.entries[uprn] = PredictionEntry(uprn=uprn)
        return self.entries[uprn]

    def add_entry(self, entry: "PredictionEntry"):
        """Store ``entry`` under its ``uprn``, replacing any existing entry."""
        self.entries[entry.uprn] = entry

    def add_recommendation(self, uprn: int, measure_id: str, prediction: Any, metadata: Optional[Dict[str, Any]] = None):
        """Append a measure-level prediction to the entry for ``uprn``.

        Args:
            uprn: Key of the entry to extend (created if missing).
            measure_id: Identifier of the measure the prediction is for.
            prediction: The predicted value.
            metadata: Optional extra information; a falsy value (``None`` or
                an empty dict) is stored as a new empty dict.
        """
        rec = RecommendationPrediction(measure_id=measure_id, prediction=prediction, metadata=metadata or {})
        self._entry_for(uprn).recommendation_predictions.append(rec)

    def set_rebaselined_prediction(self, uprn: int, prediction: Any):
        """Set the rebaselined prediction on the entry for ``uprn`` (created if missing)."""
        self._entry_for(uprn).rebaselined_prediction = prediction

    def set_original_epc(self, uprn: int, original_epc: Dict[str, Any], landlord_differences: Dict[str, Any], lodgement_date: Any = None):
        """Record the original EPC, landlord differences and lodgement date for ``uprn``.

        All three attributes are overwritten, so omitting ``lodgement_date``
        resets a previously stored date to ``None``.
        """
        entry = self._entry_for(uprn)
        entry.original_epc = original_epc
        entry.landlord_differences = landlord_differences
        entry.lodgement_date = lodgement_date

    def to_dataframe(self) -> pd.DataFrame:
        """Flatten all entries into one DataFrame.

        Each entry yields one row per recommendation prediction, or a single
        row without measure columns when it has none. When both
        ``original_epc`` and ``landlord_differences`` are non-empty, every key
        ``k`` of ``landlord_differences`` adds a ``{k}_ori`` / ``{k}_ll``
        column pair.

        Returns:
            A new DataFrame; it has no rows or columns when the matrix is empty.
        """
        rows = []
        for entry in self.entries.values():
            base = {
                "uprn": entry.uprn,
                "rebaselined_prediction": entry.rebaselined_prediction,
                "lodgement_date": entry.lodgement_date,
                "landlord_differences": entry.landlord_differences,
            }
            # Add original EPC fields if present
            if entry.original_epc and entry.landlord_differences:
                for k in entry.landlord_differences:
                    base[f"{k}{self._ORIGINAL_SUFFIX}"] = entry.original_epc.get(k)
                    base[f"{k}{self._LANDLORD_SUFFIX}"] = entry.landlord_differences.get(k)
            # Add measure-level predictions
            for rec in entry.recommendation_predictions:
                row = base.copy()
                row["measure_id"] = rec.measure_id
                row["measure_prediction"] = rec.prediction
                row["measure_metadata"] = rec.metadata
                rows.append(row)
            if not entry.recommendation_predictions:
                rows.append(base)
        return pd.DataFrame(rows)

    def summarise_differences(self, df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Blank out ``*_ori`` / ``*_ll`` pairs whose values agree.

        For every column ending in ``_ori`` that has a matching ``_ll``
        column, rows where the two values are equal (or both missing) get
        both cells set to ``None``, leaving only genuine differences.

        Args:
            df: Frame to process; defaults to ``self.to_dataframe()``. A frame
                passed in is modified in place.

        Returns:
            The same DataFrame object that was processed.
        """
        if df is None:
            df = self.to_dataframe()
        ori_suffix = self._ORIGINAL_SUFFIX
        # Skip non-string labels: they cannot carry the suffix and have no .endswith.
        ori_cols = [c for c in df.columns if isinstance(c, str) and c.endswith(ori_suffix)]
        for ori_col in ori_cols:
            # Swap only the trailing suffix. str.replace would also rewrite an
            # "_ori" inside the field name (e.g. "floor_origin_ori"), so the
            # landlord column would never be found.
            ll_col = ori_col[: -len(ori_suffix)] + self._LANDLORD_SUFFIX
            if ll_col in df.columns:
                # Missing values compare unequal to themselves; the "NULL"
                # sentinel makes two missing cells count as "same".
                same = df[ori_col].fillna("NULL") == df[ll_col].fillna("NULL")
                df.loc[same, [ori_col, ll_col]] = None
        return df