From bd0fb8c2c17aa4b0572bf38d1c328bf156ed63cc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 12 Feb 2024 18:11:31 +0000 Subject: [PATCH] adding unique ids and config hash to prevent duplications --- etl/testing_data/sap_model_simulation.py | 62 +++++++++++++++++------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/etl/testing_data/sap_model_simulation.py b/etl/testing_data/sap_model_simulation.py index 0434ffd5..0a044201 100644 --- a/etl/testing_data/sap_model_simulation.py +++ b/etl/testing_data/sap_model_simulation.py @@ -104,6 +104,9 @@ def app(): pitched_roof_solar = [] flat_roof_solar = [] for property_config in tqdm(property_types.itertuples(), total=property_types.shape[0]): + + config_hash = hash(str(property_config)) + # Take a sample row population = dataset[ (dataset["property_type"] == property_config.property_type) & @@ -144,7 +147,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "loft_insulation_270mm", + "recommendation_id": f"loft_insulation_{insulation_thickness}_270mm_{config_hash}", "type": "loft_insulation", "new_u_value": best_270mm_uvalue, "parts": [ @@ -157,7 +160,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "loft_insulation_300mm", + "recommendation_id": f"loft_insulation_{insulation_thickness}_300mm_{config_hash}", "type": "loft_insulation", "new_u_value": best_300mm_uvalue, "parts": [ @@ -198,7 +201,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "internal_wall_insulation", + "recommendation_id": f"internal_wall_insulation_uvalue_{uvalue}_{config_hash}", "type": "internal_wall_insulation", "new_u_value": best_internal_wall_uvalue, "parts": [] @@ -210,7 +213,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "external_wall_insulation", + "recommendation_id": f"external_wall_insulation_uvalue_{uvalue}_{config_hash}", "type": "external_wall_insulation", "new_u_value": best_external_wall_uvalue, "parts": [] @@ -239,7 +242,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "cavity_wall_insulation", + "recommendation_id": f"cavity_wall_insulation_uvalue_{uvalue}_{config_hash}", "type": "cavity_wall_insulation", "new_u_value": best_cavity_wall_uvalue, "parts": [] @@ -268,7 +271,7 @@ def app(): property_id=nearest_row["uprn"].values[0], recommendation_record=nearest_row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "solid_floor_insulation", + "recommendation_id": f"solid_floor_insulation_uvalue_{uvalue}_{config_hash}", "type": "solid_floor_insulation", "new_u_value": None, # This doesn't matter at the moment "parts": [] @@ -297,7 +300,7 @@ def app(): property_id=nearest_row["uprn"].values[0], recommendation_record=nearest_row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "suspended_floor_insulation", + "recommendation_id": f"suspended_floor_insulation_uvalue_{uvalue}_{config_hash}", "type": "suspended_floor_insulation", "new_u_value": None, # This doesn't matter at the moment "parts": [] @@ -321,7 +324,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "windows_glazing", + "recommendation_id": f"windows_glazing_single_to_double_{config_hash}", "type": "windows_glazing", "new_u_value": None, # This doesn't matter at the moment "parts": [], @@ -334,7 +337,7 @@ def app(): property_id=row["uprn"].values[0], recommendation_record=row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "windows_glazing", + "recommendation_id": f"windows_glazing_single_to_secondary_{config_hash}", "type": "windows_glazing", "new_u_value": None, # This doesn't matter at the moment "parts": [], @@ -379,7 +382,7 @@ def app(): property_id=nearest_row["uprn"].values[0], recommendation_record=nearest_row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "windows_glazing", + "recommendation_id": f"windows_glazing_partial_double_to_double_{value}_{config_hash}", "type": "windows_glazing", "new_u_value": None, # This doesn't matter at the moment "parts": [], @@ -414,7 +417,7 @@ def app(): property_id=nearest_row["uprn"].values[0], recommendation_record=nearest_row.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "windows_glazing", + "recommendation_id": f"windows_glazing_partial_secondary_to_secondary_{value}_{config_hash}", "type": "windows_glazing", "new_u_value": None, # This doesn't matter at the moment "parts": [], @@ -449,7 +452,7 @@ def app(): property_id=pitched_roof_no_solar["uprn"].values[0], recommendation_record=pitched_roof_no_solar.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "solar_pv", + "recommendation_id": f"pitched_solar_pv_coverage_{coverage}_percent_{config_hash}", "type": "solar_pv", "new_u_value": None, # This doesn't matter at the moment "parts": [], @@ -463,7 +466,7 @@ def app(): property_id=flat_roof_no_solar["uprn"].values[0], recommendation_record=flat_roof_no_solar.copy().to_dict("records")[0], recommendation={ - "recommendation_id": "solar_pv", + "recommendation_id": f"flat_solar_pv_coverage_{coverage}_percent_{config_hash}", "type": "solar_pv", "new_u_value": None, # This doesn't matter at the moment "parts": [], @@ -521,18 +524,43 @@ def app(): # Store final parquet in s3 save_dataframe_to_s3_parquet( df=loft_insulation_predictions, - bucket_name="retrofit-data-dev", + bucket_name="retrofit-datalake-dev", file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet" ) # We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation # is worth - loft_insulation_comparison_df = loft_insulation_testing_df[ - ["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""] + loft_insulation_comparison_matrix = loft_insulation_testing_df[ + ["simulation_starting_insulation_thickness", "simulation_ending_insulation_thickness", "uprn", "id", + "sap_starting"] ].merge( - loft_insulation_predictions, + loft_insulation_predictions.drop(columns=["recommendation_id"]), left_on="id", right_on="id", how="left" ) + + loft_insulation_comparison_matrix["measure_impact"] = loft_insulation_comparison_matrix["predictions"] - \ + loft_insulation_comparison_matrix["sap_starting"] + # Perform a group by describe + loft_insulation_describe = loft_insulation_comparison_matrix.groupby( + ["simulation_starting_insulation_thickness", "simulation_ending_insulation_thickness"] + )[["measure_impact"]].describe().reset_index() + + z = loft_insulation_comparison_matrix[loft_insulation_comparison_matrix["measure_impact"] < 0] + z.head(1)[["uprn", "id"]] + error_row = loft_insulation_testing_df[ + (loft_insulation_testing_df["id"] == "100090292333+loft_insulation_150_270mm") + ] + + error_dataset = dataset[ + (dataset["uprn"] == "10070401239") & (dataset["roof_insulation_thickness"] == "250") + ] + + changed_from_dataset = [] + for c in column_config: + ending_value = error_row[column_config[c]].values[0] + starting_value = error_row[column_config[c]].values[0] + error_dataset["roof_insulation_thickness"] + error_dataset["roof_insulation_thickness_ending"]