mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
slice 14k: E2E pipeline runs on real 2026 RdSAP certs
Two production fixes surfaced by the live run:

- mapper.from_rdsap_schema_21_0_1 now sets the three ML target scalars (energy_rating_current, co2_emissions_current, energy_consumption_current). Previously they were silently None for every cert, which left the kWh fields from renewable_heat_incentive as the only labels.
- train_baseline coerces object-dtype columns to numeric (None -> NaN) and drops rows with a null target per fit, so LightGBM accepts the frame.

E2E on 500 real certs (~1s):

    sap_score          R^2=0.604  MAPE=0.084
    co2_emissions      R^2=0.813  MAPE=0.130
    peui_raw           R^2=0.979  MAPE=0.026
    space_heating_kwh  R^2=0.823  MAPE=0.213
    hot_water_kwh      R^2=0.519  MAPE=0.115

peui_ucl excluded: UCL correction still needs wiring.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
6697a6c76e
commit
8fddd25b9a
2 changed files with 21 additions and 8 deletions
|
|
@ -1159,8 +1159,8 @@ class EpcPropertyDataMapper:
|
|||
window_type=w.window_type,
|
||||
frame_factor=w.frame_factor,
|
||||
glazing_type=w.glazing_type,
|
||||
window_width=w.window_width,
|
||||
window_height=w.window_height,
|
||||
window_width=_measurement_value(w.window_width),
|
||||
window_height=_measurement_value(w.window_height),
|
||||
draught_proofed=w.draught_proofed == "true",
|
||||
window_location=w.window_location,
|
||||
window_wall_type=w.window_wall_type,
|
||||
|
|
@ -1177,7 +1177,7 @@ class EpcPropertyDataMapper:
|
|||
sap_energy_source=SapEnergySource(
|
||||
mains_gas=es.mains_gas == "Y",
|
||||
meter_type=str(es.meter_type),
|
||||
pv_battery_count=es.pv_battery_count,
|
||||
pv_battery_count=es.pv_battery_count or 0,
|
||||
wind_turbines_count=es.wind_turbines_count,
|
||||
gas_smart_meter_present=es.gas_smart_meter_present == "true",
|
||||
is_dwelling_export_capable=es.is_dwelling_export_capable == "true",
|
||||
|
|
@ -1391,8 +1391,8 @@ class EpcPropertyDataMapper:
|
|||
window_type=w.window_type,
|
||||
frame_factor=w.frame_factor,
|
||||
glazing_type=w.glazing_type,
|
||||
window_width=w.window_width,
|
||||
window_height=w.window_height,
|
||||
window_width=_measurement_value(w.window_width),
|
||||
window_height=_measurement_value(w.window_height),
|
||||
draught_proofed=w.draught_proofed == "true",
|
||||
window_location=w.window_location,
|
||||
window_wall_type=w.window_wall_type,
|
||||
|
|
@ -1414,7 +1414,7 @@ class EpcPropertyDataMapper:
|
|||
sap_energy_source=SapEnergySource(
|
||||
mains_gas=es.mains_gas == "Y",
|
||||
meter_type=str(es.meter_type),
|
||||
pv_battery_count=es.pv_battery_count,
|
||||
pv_battery_count=es.pv_battery_count or 0,
|
||||
wind_turbines_count=es.wind_turbines_count,
|
||||
gas_smart_meter_present=es.gas_smart_meter_present == "true",
|
||||
is_dwelling_export_capable=es.is_dwelling_export_capable == "true",
|
||||
|
|
@ -1520,6 +1520,10 @@ class EpcPropertyDataMapper:
|
|||
else None
|
||||
),
|
||||
),
|
||||
# ML targets: the assessment-derived scalars used as training labels.
|
||||
energy_rating_current=schema.energy_rating_current,
|
||||
co2_emissions_current=float(schema.co2_emissions_current),
|
||||
energy_consumption_current=schema.energy_consumption_current,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
|
|
|||
|
|
@ -38,11 +38,20 @@ def train_baseline(
|
|||
n_estimators: int = 200,
|
||||
) -> dict[str, dict[str, float]]:
|
||||
feature_cols = [c for c in df.columns if c not in targets and c != _CERT_NUM_COLUMN]
|
||||
# LightGBM needs numeric (or pd.Categorical) dtypes. Coerce object columns whose
|
||||
# contents are numeric-or-None to numeric (None -> NaN). Pandas object columns
|
||||
# that are *actually* string categoricals are left alone if coercion would
|
||||
# destroy data — pd.Categorical features pass through LightGBM correctly.
|
||||
for col in [*feature_cols, *targets]:
|
||||
if df[col].dtype == "object":
|
||||
df[col] = pd.to_numeric(df[col], errors="coerce")
|
||||
metrics: dict[str, dict[str, float]] = {}
|
||||
|
||||
for target in targets:
|
||||
x = df[feature_cols]
|
||||
y = df[target]
|
||||
# Drop rows where this target is null so LightGBM doesn't trip on label NaN.
|
||||
target_df = df.dropna(subset=[target])
|
||||
x = target_df[feature_cols]
|
||||
y = target_df[target]
|
||||
split = cast(
|
||||
tuple[pd.DataFrame, pd.DataFrame, "pd.Series[Any]", "pd.Series[Any]"],
|
||||
train_test_split(x, y, test_size=test_size, random_state=seed),
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue