diff --git a/backend/tests/test_integration.py b/backend/tests/test_integration.py
index cd41d5c0..f0c25ac0 100644
--- a/backend/tests/test_integration.py
+++ b/backend/tests/test_integration.py
@@ -75,6 +75,29 @@ epc_data = pd.read_csv(
     low_memory=False
 )
 
+# Average running costs per unit of floor area, keyed by current energy-efficiency
+# score, built from EPCs lodged on or after 2024-01-01 and used further down to
+# fill in missing cost fields on the input properties.
+# TODO: Store this for cleaning
+cost_columns = ["lighting-cost-current", "heating-cost-current", "hot-water-cost-current"]
+scaled_cost_columns = [c + "_scaled" for c in cost_columns]
+
+costs_by_floor_area = epc_data.loc[
+    pd.to_datetime(epc_data["LODGEMENT_DATE"]) >= "2024-01-01",
+    ["TOTAL_FLOOR_AREA", "CURRENT_ENERGY_EFFICIENCY", "LIGHTING_COST_CURRENT", "HEATING_COST_CURRENT",
+     "HOT_WATER_COST_CURRENT"],
+].copy()
+costs_by_floor_area.columns = [c.lower().replace("_", "-") for c in costs_by_floor_area.columns]
+
+# A zero floor area would turn the ratio into inf/NaN and distort the group mean.
+costs_by_floor_area = costs_by_floor_area[costs_by_floor_area["total-floor-area"] > 0].copy()
+for c in cost_columns:
+    costs_by_floor_area[c + "_scaled"] = costs_by_floor_area[c] / costs_by_floor_area["total-floor-area"]
+
+costs_by_floor_area = costs_by_floor_area.groupby("current-energy-efficiency")[
+    scaled_cost_columns
+].mean().reset_index()
+
 sample_epc_data = epc_data.drop_duplicates("UPRN").sample(1000).reset_index(drop=True)
 
 # Load the input properties
@@ -140,12 +163,30 @@ for p in tqdm(input_properties):
 mocked_kwh_predictions["heating_kwh_predictions"] = pd.DataFrame(mocked_kwh_predictions["heating_kwh_predictions"])
 mocked_kwh_predictions["hotwater_kwh_predictions"] = pd.DataFrame(mocked_kwh_predictions["hotwater_kwh_predictions"])
 
+# Fill in missing cost fields from the lookup row whose energy-efficiency score is
+# closest to the property's, scaled back up by the property's floor area.
+# TODO: We might want to implement this generally, via an ETL process
+for p in input_properties:
+    missing_cost_columns = [col for col in cost_columns if pd.isnull(p.data[col])]
+    efficiency = p.data["current-energy-efficiency"]
+    floor_area = p.data["total-floor-area"]
+    # Without both a score and a floor area there is nothing to match or scale by;
+    # leave the costs missing rather than raise on an empty match.
+    if not missing_cost_columns or pd.isnull(efficiency) or pd.isnull(floor_area):
+        continue
+    # idxmin returns the first minimum, so ties resolve to the lower-indexed row.
+    nearest = costs_by_floor_area.loc[
+        (costs_by_floor_area["current-energy-efficiency"] - efficiency).abs().idxmin()
+    ]
+    for col in missing_cost_columns:
+        p.data[col] = nearest[col + "_scaled"] * floor_area
+
 [
     p.set_features(cleaned=cleaned, kwh_client=kwh_client, kwh_predictions=mocked_kwh_predictions)
     for p in input_properties
 ]
 
-for p in input_properties:
+for p in tqdm(input_properties):
     # TEMP
     p.DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES
     p.set_features(cleaned=cleaned, kwh_client=kwh_client, kwh_predictions=mocked_kwh_predictions)
diff --git a/etl/epc_clean/epc_attributes/all_cleaners.py b/etl/epc_clean/epc_attributes/all_cleaners.py
index e4e0a0ba..cb9b2b24 100644
--- a/etl/epc_clean/epc_attributes/all_cleaners.py
+++ b/etl/epc_clean/epc_attributes/all_cleaners.py
@@ -17,5 +17,5 @@ all_cleaner_map = {
     'roof-description': RoofAttributes,
     'walls-description': WallAttributes,
     'windows-description': WindowAttributes,
-    'lighting-description:': LightingAttributes,
+    'lighting-description': LightingAttributes,
 }