From 8ed1d3b9bd0f8d765415310a3229103be1e99c5f Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Tue, 9 Dec 2025 23:17:36 +0000
Subject: [PATCH] implemented simple scoring model for battery SAP improvement

---
Review notes (not part of the commit message):
- BatterySAPScorer.score: the docstring described a heating_system argument
  that the method does not take; it now documents starting_sap and pv_size.
- etl/battery_model/train.py: the training run is now behind a __main__ guard
  so importing SAPUpliftTrainer no longer reads a CSV from a personal path.
- 2_export_data.py: `if dupe_df.shape:` and `if "domna_full_address":` were
  always true; both now test the condition they appear to have been meant to.
- NOTE(review): indentation and blank context lines in the 2_export_data.py
  hunks were reconstructed from the hunk line counts; verify against the
  working tree before applying. The index blob hashes are those of the
  pre-review patch.

 backend/app/BatterySapScorer.py      | 34 ++++++++++++++
 etl/battery_model/train.py           | 76 ++++++++++++++++++++++++++++++
 sfr/principal_pitch/2_export_data.py | 34 ++++++++++---
 3 files changed, 138 insertions(+), 6 deletions(-)
 create mode 100644 backend/app/BatterySapScorer.py
 create mode 100644 etl/battery_model/train.py

diff --git a/backend/app/BatterySapScorer.py b/backend/app/BatterySapScorer.py
new file mode 100644
index 00000000..59462677
--- /dev/null
+++ b/backend/app/BatterySapScorer.py
@@ -0,0 +1,34 @@
+import numpy as np
+
+
+class BatterySAPScorer:
+    """
+    Lightweight production scorer — no sklearn dependency.
+    Uses hard-coded coefficients discovered offline. The code for discovering the coefficients
+    can be found in etl/battery_model/train.py
+    """
+
+    INTERCEPT = 10.310168559226678
+    COEF_STARTING_SAP = -0.16120648633993315
+    COEF_PV_SIZE = 1.0500492005420736
+
+    @classmethod
+    def score(cls, starting_sap, pv_size):
+        """
+        Predict the SAP point uplift with the hard-coded linear model.
+
+        starting_sap: the property's starting SAP score (the "starting SAP" training feature)
+        pv_size: PV array size (the "PV Array size" training feature; units as in the training data)
+
+        Returns the linear prediction clamped to [0, 5] and rounded to an int.
+        """
+
+        sap_uplift = (
+            cls.INTERCEPT
+            + cls.COEF_STARTING_SAP * starting_sap
+            + cls.COEF_PV_SIZE * pv_size
+        )
+
+        # Clamp to [0, 5], then round to the nearest integer
+        sap_uplift = int(np.round(np.clip(sap_uplift, 0, 5)))
+        return sap_uplift
diff --git a/etl/battery_model/train.py b/etl/battery_model/train.py
new file mode 100644
index 00000000..086f68cb
--- /dev/null
+++ b/etl/battery_model/train.py
@@ -0,0 +1,76 @@
+import pandas as pd
+from sklearn.linear_model import Ridge
+
+
+class SAPUpliftTrainer:
+    """
+    Offline training class — discovers SAP uplift model coefficients.
+    """
+
+    def __init__(self, alpha=1.0):
+        self.alpha = alpha
+        self.model = Ridge(alpha=self.alpha)
+        self.feature_names = ["starting SAP", "PV Array size"]
+
+    def prepare_data(self, df):
+        """
+        Splits df into the feature matrix X and the "SAP points" target y.
+        """
+        df = df.copy()
+        # df["is_electric"] = df["heating"].str.contains(
+        #     "Electric", case=False, na=False
+        # ).astype(int)
+        X = df[self.feature_names]
+        y = df["SAP points"]
+        return X, y
+
+    def fit(self, df):
+        """
+        Fits the Ridge model to the features and target taken from df.
+        """
+        X, y = self.prepare_data(df)
+        self.model.fit(X, y)
+
+    def coefficients(self):
+        """
+        Returns the fitted intercept and one coefficient per feature name.
+        """
+        return {
+            "intercept": float(self.model.intercept_),
+            **{
+                name: float(coef)
+                for name, coef in zip(self.feature_names, self.model.coef_)
+            },
+        }
+
+    def export_runtime_config(self):
+        """
+        Returns a dict suitable for copy-pasting into the runtime scoring class.
+        """
+        coefs = self.coefficients()
+        return {
+            "intercept": coefs["intercept"],
+            "coef_starting_sap": coefs["starting SAP"],
+            "coef_pv_size": coefs["PV Array size"],
+            # "coef_is_electric": coefs["is_electric"],
+        }
+
+
+# The training data can be found in the Domna sharepoint in Product Development > Solar Battery Recommendations
+TRAINING_DATA_PATH = "/Users/khalimconn-kowlessar/Downloads/SAP Movement data(Sheet1).csv"
+
+
+if __name__ == "__main__":
+    # Guarded so that importing SAPUpliftTrainer does not read the CSV or fit a model
+    df = pd.read_csv(TRAINING_DATA_PATH)
+
+    trainer = SAPUpliftTrainer(alpha=1.0)
+    trainer.fit(df)
+
+    print(trainer.coefficients())
+    print(trainer.export_runtime_config())
+
+# Last updated: 9th December 2025
+# Coefficients:
+# {'intercept': 10.310168559226678, 'starting SAP': -0.16120648633993315, 'PV Array size': 1.0500492005420736}
+# The code for scoring with this model can be found in backend/app/BatterySapScorer.py
diff --git a/sfr/principal_pitch/2_export_data.py b/sfr/principal_pitch/2_export_data.py
index 79238273..f6618f22 100644
--- a/sfr/principal_pitch/2_export_data.py
+++ b/sfr/principal_pitch/2_export_data.py
@@ -11,8 +11,8 @@ from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcMod
 
 # PORTFOLIO_ID = 206
 # SCENARIOS = [389]
-PORTFOLIO_ID = 221
-SCENARIOS = [427]
+PORTFOLIO_ID = 388
+SCENARIOS = [803]
 
 
 def get_data(portfolio_id, scenario_ids):
@@ -95,6 +95,18 @@ post_install_sap = post_install_sap[post_install_sap["default"]]
 # Sum up the sap points by property id
 post_install_sap = post_install_sap.groupby("property_id")[["sap_points"]].sum().reset_index()
 
+# Find dupes by property id and measure type
+dupes = recommended_measures_df.duplicated(
+    subset=["property_id", "measure_type"], keep=False
+)
+dupe_df = recommended_measures_df[dupes]
+
+if not dupe_df.empty:
+    # Drop dupes - happened due to a funny bug
+    recommended_measures_df = recommended_measures_df.drop_duplicates(
+        subset=["property_id", "measure_type"], keep='first'
+    )
+
 recommendations_measures_pivot = recommended_measures_df.pivot(
     index='property_id',
     columns='measure_type',
@@ -131,10 +143,19 @@ from utils.s3 import read_csv_from_s3, read_excel_from_s3
 
 # asset_list = read_csv_from_s3(bucket_name="retrofit-plan-inputs-dev", filepath='8/206/asset_list.csv')
 asset_list = read_excel_from_s3(
-    bucket_name="retrofit-plan-inputs-dev", file_key='8/221/20250722T202328736Z/asset_list.xlsx',
-    header_row=0, sheet_name="320 - edited"
+    bucket_name="retrofit-plan-inputs-dev", file_key='2/388/20251208T203603925Z/asset_list.xlsx',
+    header_row=0, sheet_name="Standardised Asset List"
 )
 asset_list = pd.DataFrame(asset_list)
+asset_list = asset_list.rename(
+    columns={
+        "postcode": "domna_postcode"
+    }
+)
+if "domna_full_address" not in asset_list.columns:
+    # For Peabody
+    asset_list["domna_full_address"] = asset_list["domna_address_1"]
+
 asset_list = asset_list[["domna_full_address", "domna_postcode", "epc_os_uprn", ]].copy()
 asset_list = asset_list.rename(columns={"epc_os_uprn": "uprn"})
 df["uprn"] = df["uprn"].astype(str)
@@ -179,9 +200,10 @@ asset_list = asset_list.merge(
     on="uprn"
 )
 
-# For exporting NCHA
+# For exporting
 asset_list.to_excel(
-    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/NCHA/320 Portfolio/asset_list_epc_b.xlsx",
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Peabody/Nov 2025 Consulting "
+    "Project/20251209_sample_package_data.xlsx",
     index=False
 )
 