# We use some sample properties from Newhaven as a testing dataset for implementing the model fixes.
#
# NOTE(review): the original line structure/indentation of this file was lost. The layout below
# assumes that every statement after `def app():` belongs to the function body - confirm against
# the version-controlled copy before relying on it.
import inspect

import pandas as pd
from etl.epc.settings import EARLIEST_EPC_DATE
from pathlib import Path
from utils.s3 import save_csv_to_s3

# inspect.getfile on a lambda defined here yields the path of this source file, so the EPC
# directory is resolved relative to this file's location rather than the working directory.
src_file_path = inspect.getfile(lambda: None)
EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates"
# Used only to build the S3 key and the request body below.
USER_ID = 8
PORTFOLIO_ID = -1


def app():
    """
    Build a small pilot asset list from the local Lewes EPC extract, upload it to S3, print an
    example request body, and then run exploratory diagnostics on the heating-cost training data.

    Steps, in order:
      1. Read ``domestic-E07000063-Lewes/certificates.csv`` from ``EPC_DIRECTORY``.
      2. Keep rows lodged on or after ``EARLIEST_EPC_DATE``, with a UPRN, and with
         ``current-energy-efficiency`` below 52, then take a random sample of 10 rows.
      3. Save ``uprn`` / ``address`` / ``postcode`` to the ``retrofit-plan-inputs-dev`` bucket.
      4. Print the request body that references the uploaded file.
      5. Run ad-hoc analysis and model-scoring experiments (temporary diagnostic code).

    NOTE(review): the diagnostic section uses several names that are not defined or imported
    anywhere in the visible source (``model_api``, ``get_prediction_buckets``, ``p``, ``cleaned``,
    ``property_recommendations``, ``epcs_for_scoring``, ``get_settings``,
    ``property_scoring_epcs``, ``add_features_from_code``, ``add_estimate_annual_kwh``). It looks
    like console scratch code and would raise ``NameError`` if executed as-is - confirm.

    :return: nothing is returned explicitly
    """
    lewes_directory = EPC_DIRECTORY / "domestic-E07000063-Lewes/certificates.csv"
    data = pd.read_csv(lewes_directory, low_memory=False)
    # Rename the columns to the same format as the api returns (lower case, hyphen separated)
    data.columns = [c.replace("_", "-").lower() for c in data.columns]
    # Keep only certificates lodged on or after the date threshold
    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
    # Drop rows without a UPRN
    data = data[~pd.isnull(data["uprn"])]
    # Keep only rows whose current energy efficiency score is below 52
    data = data[data["current-energy-efficiency"].astype(float) < 52]
    # Random sample of 10 rows; no random_state is passed, so the selection differs between runs
    data = data.sample(10)

    # Create an asset list
    asset_list = data[["uprn", "address1", "postcode"]].copy().rename(columns={"address1": "address"})
    asset_list["uprn"] = asset_list["uprn"].astype(str)

    # The S3 key is namespaced by user id and portfolio id
    filename = f"{USER_ID}/{PORTFOLIO_ID}/pilot.csv"
    save_csv_to_s3(
        dataframe=asset_list,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    # Request body pointing at the uploaded file; it is only printed here, not sent anywhere
    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Private",
        "goal": "Increasing EPC",
        "goal_value": "B",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": "",
        "non_invasive_recommendations_file_path": "",
        "budget": None,
    }
    print(body)

    # This is some temp code, which is for diagnosing the issues with the bills models
    heating_training_data_filepath = "sap_change_model/2024-08-06-11-19-49/dataset_rooms.parquet"
    # For the heating model:
    # NOTE(review): the next commented-out line is assumed to cover both "days_to_ending" and
    # "days_to_starting" (i.e. neither is dropped); the original line breaks were lost - confirm.
    heating_drop_columns = [
        "sap_ending", "heat_demand_change", "carbon_change", "rdsap_change",
        "heat_demand_ending", "carbon_ending", "lighting_cost_ending", "hot_water_cost_ending",
        # "days_to_ending", "days_to_starting",  # TODO This is in the live version
        'number_habitable_rooms_starting', 'number_habitable_rooms_ending',
        'number_heated_rooms_starting', 'number_heated_rooms_ending',
        'number_habitable_rooms', 'number_heated_rooms'
    ]
    # NOTE(review): heating_response is assigned but not used in the visible code
    heating_response = "heating_cost_ending"

    # For the hot water model (older dataset)
    # NOTE(review): neither hot-water variable below is used later in the visible code
    hot_water_training_data_filepath = "sap_change_model/2024-07-10-20-28-54/dataset_rooms.parquet"
    hot_water_drop_columns = [
        "sap_ending", "heat_demand_change", "carbon_change", "rdsap_change",
        "heat_demand_ending", "carbon_ending", "lighting_cost_ending", "heating_cost_ending",
        "days_to_starting", "days_to_ending",
        'number_habitable_rooms_starting', 'number_habitable_rooms_ending',
        'number_heated_rooms_starting', 'number_heated_rooms_ending',
        'number_habitable_rooms', 'number_heated_rooms'
    ]

    # Diagnose heating
    from utils.s3 import read_dataframe_from_s3_parquet
    train = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key=heating_training_data_filepath
    )
    # Drop the columns that aren't used
    train = train.drop(columns=heating_drop_columns)

    # If the value is positive, it means the ending cost is bigger than the starting
    # (which means it got more expensive)
    train["cost_diference"] = (train["heating_cost_ending"] - train["heating_cost_starting"])
    change_direction = train["cost_diference"] > 0
    # The result is not stored or printed; it is only visible when run interactively
    change_direction.value_counts(normalize=True)

    # Mean starting / ending heating cost per lodgement year and month, in chronological order
    average_costs_by_time_starting = train.groupby(
        ["lodgement_year_starting", "lodgement_month_starting"]
    )["heating_cost_starting"].mean().reset_index().sort_values(["lodgement_year_starting", "lodgement_month_starting"])

    average_costs_by_time_ending = train.groupby(
        ["lodgement_year_ending", "lodgement_month_ending"]
    )["heating_cost_ending"].mean().reset_index().sort_values(["lodgement_year_ending", "lodgement_month_ending"])

    # Check by photo supply values - if the property is gas, solar panels won't have an effect on
    # the heating or hot water, so let's look for electric homes
    # Across the entire dataset, there is no correlation
    # Even for electric properties, there is no correlation
    photo_supply_averages = train[
        train["fuel_type_ending"] == "electricity"
    ].groupby(["photo_supply_ending"])["heating_cost_ending"].mean().reset_index()

    photo_supply_to_size = train.groupby("photo_supply_ending")["total_floor_area_ending"].mean().reset_index()
    # The two correlation matrices below are computed and discarded (interactive inspection only)
    photo_supply_to_size[["photo_supply_ending", "total_floor_area_ending"]].corr()
    train[["total_floor_area_ending", "heating_cost_ending"]].corr()
    # Bigger properties end up with smaller photo_supply values. This will be because the array
    # size likely remains fairly consistent but takes up a smaller proportion of the roof.
    # Typically, the bigger the floor area, the higher the heating costs, but bigger units also
    # have smaller photo_supply

    # Rows where photo supply went from zero at the start to a positive value at the end
    adding_solar = train[
        (train["photo_supply_ending"] > 0) & (train["photo_supply_starting"] == 0)
    ]
    is_positive = (adding_solar["cost_diference"] > 0)
    is_positive.value_counts(normalize=True)

    # Mean ending heating cost for electric properties, per ending year and photo supply value
    photo_supply_by_time = (
        train[
            train["fuel_type_ending"] == "electricity"
        ].groupby(
            ["lodgement_year_ending", "photo_supply_ending"]
        )["heating_cost_ending"].mean().reset_index().sort_values(
            ["lodgement_year_ending", "photo_supply_ending"], ascending=True)
    )
    # Plot
    photo_supply_by_time[["photo_supply_ending", "heating_cost_ending"]].corr()
    photo_supply_by_time.plot()

    # Observations
    # 1) We retain all of the potential columns, however they are just based on the starting EPC
    # 2) 21% of the time, the ending heating cost is more than the starting, but this is clearly
    #    a minority
    # 3) Let's get rid of estimated perimeter starting and ending

    # Things I should check
    # 1) Do we update the lodgement_year_ending and lodgement_month_ending?
    # 2) Should we adjust costs to now, as well as lodgement_dates to today? Since 2023, costs
    #    have increased a lot so any savings should be benchmarked against what a customer is
    #    paying now
    # 3) It might make sense to create a feature between floor area and photo supply, to give a
    #    more consistent estimate of a panel size for the property

    # Get an example and score with the models: one random row where solar was added and the
    # heating cost went down
    example = train[
        (train["photo_supply_starting"] == 0) &
        (train["photo_supply_ending"] > 0) &
        (train["heating_cost_starting"] > train["heating_cost_ending"])
    ].sample(1)

    # Scratch inspection / overrides that were tried on the example row:
    # example["lodgement_month_starting"]
    # example["lodgement_year_starting"]
    # example["lodgement_month_ending"]
    # example["lodgement_year_ending"].values[0]
    #
    # example["lodgement_year_ending"] = 2023
    # example["days_to_ending"] = 3500
    # example["days_to_starting"]

    # Recorded output from a previous run:
    # {'heating_cost_predictions':    predictions
    # 0        378.5}
    # NOTE(review): model_api and get_prediction_buckets are not defined in the visible source
    resp = model_api.predict_all(
        df=example,
        bucket="retrofit-data-dev",
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_cost_predictions"],
        extract_ids=False
    )

    # Step 1: get a cost for today
    # NOTE(review): p, cleaned and property_recommendations are not defined in the visible source
    p.create_base_difference_epc_record(cleaned)

    # Copy the base record and overwrite it with every key of the first recommendation's
    # simulation config
    cwi_impact = p.base_difference_record.df.copy()
    for k in property_recommendations[0][0]["simulation_config"]:
        cwi_impact[k] = property_recommendations[0][0]["simulation_config"][k]

    # 2212.4 - Baseline
    today = model_api.predict_all(
        df=p.base_difference_record.df.copy(),
        bucket="retrofit-data-dev",
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_cost_predictions"],
        extract_ids=False
    )

    # impact of CWI - 1908
    cwi_response = model_api.predict_all(
        df=cwi_impact,
        bucket="retrofit-data-dev",
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_cost_predictions"],
        extract_ids=False
    )

    # Same record as the CWI scenario, with photo supply set to 50 and the starting heating cost
    # pinned to the hard-coded baseline figure recorded above
    pv_impact = cwi_impact.copy()
    pv_impact["photo_supply_ending"] = 50
    pv_impact["heating_cost_starting"] = 2212.4
    pv_response = model_api.predict_all(
        df=pv_impact,
        bucket="retrofit-data-dev",
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_cost_predictions"],
        extract_ids=False
    )

    # Testing kwh for vde
    # NOTE(review): epcs_for_scoring and get_settings are not defined in the visible source
    base_prediction = model_api.predict_all(
        df=epcs_for_scoring,
        bucket=get_settings().DATA_BUCKET,
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_kwh_predictions"],
        extract_ids=False
    )

    # Build a single-row frame from the second scoring EPC and add the derived features
    # NOTE(review): property_scoring_epcs, add_features_from_code and add_estimate_annual_kwh are
    # not defined in the visible source
    cwi_epc = pd.DataFrame([property_scoring_epcs[1].copy()])
    cwi_epc = add_features_from_code(cwi_epc)
    cwi_epc = add_estimate_annual_kwh(cwi_epc)

    # Candidate manual overrides that were tried on the EPC (currently all disabled):
    # cwi_epc["walls-description"] = "Cavity wall, filled cavity"
    # cwi_epc["walls-energy-eff"] = "Good"
    # cwi_epc["heating-cost-current"] = 1650
    # cwi_epc["current-energy-efficiency"] = 72
    # cwi_epc["current-energy-rating"] = "C"
    # cwi_epc["co2-emissions-current"] = 3.7
    # cwi_epc["energy-consumption-current"] = 121
    # cwi_epc["co2-emiss-curr-per-floor-area"] = 19
    # cwi_epc["photo-supply"] = 0

    # cwi_epc["energy-consumption-current"] =
    # cwi_epc["roof-description"] = "Pitched, 300 mm loft insulation"
    # cwi_epc["roof-energy-eff"] = "Very Good"
    # cwi_epc["heating-cost-current"] = 1264

    # Field mapping kept for reference (disabled):
    # "heating-cost-current": rec_impact["epc_heating_cost"],
    # "hot-water-cost-current": rec_impact["epc_hot_water_cost"],
    # # CO₂ emissions per square metre floor area per year in kg/m². Since CO₂ emissions are in tonnes
    # # per year, we multiply by 1000 to get kg/m²
    # "co2-emiss-curr-per-floor-area": round(
    #     1000 * (rec_impact["carbon"] / self.data["total-floor-area"])
    # ),
    # "co2-emissions-current": rec_impact["carbon"],
    # "current-energy-rating": sap_to_epc(rec_impact["sap"]),
    # "current-energy-efficiency": int(np.floor(rec_impact["sap"])),
    # "energy-consumption-current": rec_impact["heat_demand"],
    # "lighting-cost-current": rec_impact["epc_lighting_cost"],
    # "id": "+".join([str(self.id), rec_id])

    cwi_prediction = model_api.predict_all(
        df=cwi_epc,
        bucket=get_settings().DATA_BUCKET,
        prediction_buckets=get_prediction_buckets(),
        model_prefixes=["heating_kwh_predictions", "hotwater_kwh_predictions"],
        extract_ids=False
    )

    # Hard-coded figures for two properties. Each assignment overwrites the previous one, so only
    # the last value assigned to each name feeds the impact calculation at the bottom.
    # 77 perryn
    starting_heating = 19837.2
    starting_hot_water = 2974.1
    ending_heating = 17041.1
    ending_hot_water = 2735.3

    # 44 lindlings
    starting_heating = 13327.1
    starting_hot_water = 2349.5
    ending_heating = 9672.3
    ending_hot_water = 2030.2

    # Second pair of ending values for the same property; these replace the pair just above
    ending_heating = 8695.1
    ending_hot_water = 2437.0

    # Impact = starting minus ending, so a positive number is a reduction
    heating_impact = starting_heating - ending_heating
    hot_water_impact = starting_hot_water - ending_hot_water
    total_impact = heating_impact + hot_water_impact