# Model/etl/epc/PredictionMatrix.py
#
# 80 lines
# 3.4 KiB
# Python

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import pandas as pd
@dataclass
class RecommendationPrediction:
    """A single prediction associated with one measure.

    Attributes:
        measure_id: Identifier of the measure the prediction belongs to.
        prediction: The predicted value; its type is not constrained here.
        metadata: Free-form extra information about the prediction. Each
            instance gets its own empty dict when none is supplied.
    """

    measure_id: str
    prediction: Any
    # default_factory ensures instances never share one mutable dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class PredictionEntry:
    """All predictions and reference data recorded for one ``uprn``.

    Attributes:
        uprn: Integer identifier of the entry; ``PredictionMatrix`` uses it
            as the dictionary key. (Presumably a Unique Property Reference
            Number - confirm with the data owner.)
        rebaselined_prediction: Entry-level prediction, ``None`` until set.
        recommendation_predictions: Per-measure predictions, in insertion
            order. Each instance gets its own empty list by default.
        original_epc: Mapping of original field values, or ``None``.
        landlord_differences: Mapping of differing field values, or ``None``.
            ``PredictionMatrix.to_dataframe`` pairs its keys with the same
            keys in ``original_epc``.
        lodgement_date: Optional date value; its type is not constrained here.
    """

    uprn: int
    rebaselined_prediction: Any = None
    # String forward reference keeps this class independent of definition order.
    recommendation_predictions: List["RecommendationPrediction"] = field(default_factory=list)
    original_epc: Optional[Dict[str, Any]] = None
    landlord_differences: Optional[Dict[str, Any]] = None
    lodgement_date: Optional[Any] = None
class PredictionMatrix:
    """In-memory collection of ``PredictionEntry`` objects keyed by ``uprn``.

    Entries are created lazily: every mutator that takes a ``uprn`` creates
    an empty ``PredictionEntry`` for it when none exists yet.
    ``to_dataframe`` flattens the collection into one row per
    (entry, recommendation) pair, and ``summarise_differences`` blanks out
    the ``*_ori`` / ``*_ll`` column pairs whose values agree.
    """

    # Column-name suffixes shared by to_dataframe and summarise_differences.
    _ORI_SUFFIX = "_ori"
    _LL_SUFFIX = "_ll"

    def __init__(self):
        """Create an empty matrix."""
        self.entries: Dict[int, "PredictionEntry"] = {}

    def _get_or_create(self, uprn: int) -> "PredictionEntry":
        """Return the entry for ``uprn``, creating an empty one if missing."""
        if uprn not in self.entries:
            self.entries[uprn] = PredictionEntry(uprn=uprn)
        return self.entries[uprn]

    def add_entry(self, entry: "PredictionEntry") -> None:
        """Store ``entry`` under its ``uprn``, replacing any existing entry."""
        self.entries[entry.uprn] = entry

    def add_recommendation(
        self,
        uprn: int,
        measure_id: str,
        prediction: Any,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Append a measure-level prediction to the entry for ``uprn``.

        Args:
            uprn: Key of the entry to extend (created if absent).
            measure_id: Identifier of the measure.
            prediction: Predicted value for that measure.
            metadata: Optional extra information; ``None`` or an empty dict
                results in a fresh empty dict being stored.
        """
        rec = RecommendationPrediction(
            measure_id=measure_id,
            prediction=prediction,
            metadata=metadata or {},
        )
        self._get_or_create(uprn).recommendation_predictions.append(rec)

    def set_rebaselined_prediction(self, uprn: int, prediction: Any) -> None:
        """Set the entry-level prediction for ``uprn`` (entry created if absent)."""
        self._get_or_create(uprn).rebaselined_prediction = prediction

    def set_original_epc(
        self,
        uprn: int,
        original_epc: Dict[str, Any],
        landlord_differences: Dict[str, Any],
        lodgement_date: Any = None,
    ) -> None:
        """Attach reference data to the entry for ``uprn`` (created if absent).

        Args:
            uprn: Key of the entry to update.
            original_epc: Mapping of original field values.
            landlord_differences: Mapping of differing field values; its keys
                decide which ``*_ori`` / ``*_ll`` columns ``to_dataframe`` emits.
            lodgement_date: Optional date value stored as-is.
        """
        entry = self._get_or_create(uprn)
        entry.original_epc = original_epc
        entry.landlord_differences = landlord_differences
        entry.lodgement_date = lodgement_date

    def to_dataframe(self) -> pd.DataFrame:
        """Flatten all entries into a ``DataFrame``.

        Each entry yields one row per recommendation, or a single row without
        the ``measure_*`` keys when it has no recommendations. When both
        ``original_epc`` and ``landlord_differences`` are non-empty, every key
        ``k`` of ``landlord_differences`` adds ``k_ori`` (value from
        ``original_epc``, ``None`` if absent) and ``k_ll`` columns.

        Returns:
            A new ``DataFrame``; empty (no columns) when there are no entries.
        """
        rows = []
        for entry in self.entries.values():
            base = {
                "uprn": entry.uprn,
                "rebaselined_prediction": entry.rebaselined_prediction,
                "lodgement_date": entry.lodgement_date,
                "landlord_differences": entry.landlord_differences,
            }

            # Add original EPC fields if present.
            if entry.original_epc and entry.landlord_differences:
                for key, ll_value in entry.landlord_differences.items():
                    base[f"{key}{self._ORI_SUFFIX}"] = entry.original_epc.get(key)
                    base[f"{key}{self._LL_SUFFIX}"] = ll_value

            if not entry.recommendation_predictions:
                rows.append(base)
                continue

            # Add measure-level predictions: one row per recommendation.
            for rec in entry.recommendation_predictions:
                rows.append(
                    {
                        **base,
                        "measure_id": rec.measure_id,
                        "measure_prediction": rec.prediction,
                        "measure_metadata": rec.metadata,
                    }
                )
        return pd.DataFrame(rows)

    def summarise_differences(self, df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """Blank out ``*_ori`` / ``*_ll`` pairs whose values are identical.

        For every column ending in ``_ori`` that has a matching ``_ll``
        column, rows where both values are equal (or both missing) get
        ``None`` written into both columns, leaving only real differences.

        Args:
            df: Frame to process. It is modified in place. When ``None``,
                the result of ``to_dataframe()`` is used.

        Returns:
            The same frame object that was processed.
        """
        if df is None:
            df = self.to_dataframe()

        ori_suffix, ll_suffix = self._ORI_SUFFIX, self._LL_SUFFIX
        ori_cols = [
            c for c in df.columns if isinstance(c, str) and c.endswith(ori_suffix)
        ]
        for ori_col in ori_cols:
            # Swap only the trailing suffix: str.replace would also rewrite
            # an "_ori" inside the field name (e.g. "wall_orientation_ori").
            ll_col = ori_col[: -len(ori_suffix)] + ll_suffix
            if ll_col not in df.columns:
                continue

            ori_values, ll_values = df[ori_col], df[ll_col]
            # Missing values never compare equal, so handle them explicitly
            # rather than filling with a sentinel that could collide with
            # real data or be invalid for the column dtype.
            both_missing = ori_values.isna() & ll_values.isna()
            same = ((ori_values == ll_values) | both_missing).fillna(False)
            df.loc[same, [ori_col, ll_col]] = None
        return df