Fixed silly error in all_cleaners: the 'lighting-description' key had a stray trailing colon

This commit is contained in:
Khalim Conn-Kowlessar 2025-08-28 23:07:48 +01:00
parent 95226a73ff
commit 2acf5c3534
2 changed files with 31 additions and 2 deletions

View file

@ -75,6 +75,20 @@ epc_data = pd.read_csv(
low_memory=False
)
# TODO: Store this for cleaning
# Build a lookup of the mean lighting / heating / hot-water cost per unit of
# floor area, keyed by energy-efficiency rating, using only certificates lodged
# on or after 2024-01-01.
costs_by_floor_area = epc_data[
    pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2024-01-01"
][
    [
        "TOTAL_FLOOR_AREA",
        "CURRENT_ENERGY_EFFICIENCY",
        "LIGHTING_COST_CURRENT",
        "HEATING_COST_CURRENT",
        "HOT_WATER_COST_CURRENT",
    ]
].copy()
# Normalise the column names to lower-case, hyphen-separated form.
costs_by_floor_area.columns = [c.lower().replace("_", "-") for c in costs_by_floor_area.columns]
# A zero floor area would make the ratio below infinite, and a single inf turns
# the whole group mean into inf. Treat non-positive areas as missing so that
# .mean() skips those rows instead.
costs_by_floor_area["total-floor-area"] = costs_by_floor_area["total-floor-area"].where(
    costs_by_floor_area["total-floor-area"] > 0
)
for c in ["lighting-cost-current", "heating-cost-current", "hot-water-cost-current"]:
    costs_by_floor_area[c + "_scaled"] = costs_by_floor_area[c] / costs_by_floor_area["total-floor-area"]
# One row per energy-efficiency rating, holding the mean of each scaled cost.
costs_by_floor_area = costs_by_floor_area.groupby("current-energy-efficiency")[
    ["lighting-cost-current_scaled", "heating-cost-current_scaled", "hot-water-cost-current_scaled"]
].mean().reset_index()
sample_epc_data = epc_data.drop_duplicates("UPRN").sample(1000).reset_index(drop=True)
# Load the input properties
@ -140,12 +154,27 @@ for p in tqdm(input_properties):
# Replace the heating and hot-water kWh prediction entries with DataFrames
# built from their current values.
mocked_kwh_predictions["heating_kwh_predictions"] = pd.DataFrame(mocked_kwh_predictions["heating_kwh_predictions"])
mocked_kwh_predictions["hotwater_kwh_predictions"] = pd.DataFrame(mocked_kwh_predictions["hotwater_kwh_predictions"])
# TODO: We might want to implement this generally, via an ETL process
# Fill in missing cost fields on each input property. The value used is the
# mean cost per unit of floor area for the closest energy-efficiency rating in
# costs_by_floor_area, multiplied by the property's own floor area.
for p in input_properties:
    missing_cols = [
        col
        for col in ["lighting-cost-current", "heating-cost-current", "hot-water-cost-current"]
        if pd.isnull(p.data[col])
    ]
    rating = p.data["current-energy-efficiency"]
    # Nothing to fill, or nothing to match against: without a rating (or with
    # an empty lookup table) there is no nearest row, so leave the fields as-is
    # rather than failing on an empty selection.
    if not missing_cols or pd.isnull(rating) or costs_by_floor_area.empty:
        continue
    # The nearest rating is the same for every column, so find it once per property.
    rating_gap = (costs_by_floor_area["current-energy-efficiency"] - rating).abs()
    # idxmin returns the first row at the minimum distance, so ties resolve to
    # the first matching row.
    nearest = costs_by_floor_area.loc[rating_gap.idxmin()]
    for col in missing_cols:
        p.data[col] = nearest[col + "_scaled"] * p.data["total-floor-area"]
# Call set_features on every input property. The return values are not used,
# so a plain loop avoids building a throwaway list.
for p in input_properties:
    p.set_features(cleaned=cleaned, kwh_client=kwh_client, kwh_predictions=mocked_kwh_predictions)
for p in input_properties:
for p in tqdm(input_properties):
# TEMP
p.DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES
p.set_features(cleaned=cleaned, kwh_client=kwh_client, kwh_predictions=mocked_kwh_predictions)

View file

@ -17,5 +17,5 @@ all_cleaner_map = {
'roof-description': RoofAttributes,
'walls-description': WallAttributes,
'windows-description': WindowAttributes,
'lighting-description:': LightingAttributes,
'lighting-description': LightingAttributes,
}