Add unique IDs and a config hash to recommendation IDs to prevent duplicates

This commit is contained in:
Khalim Conn-Kowlessar 2024-02-12 18:11:31 +00:00
parent 1028861a1b
commit bd0fb8c2c1

View file

@@ -104,6 +104,9 @@ def app():
pitched_roof_solar = []
flat_roof_solar = []
for property_config in tqdm(property_types.itertuples(), total=property_types.shape[0]):
config_hash = hash(str(property_config))
# Take a sample row
population = dataset[
(dataset["property_type"] == property_config.property_type) &
@@ -144,7 +147,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "loft_insulation_270mm",
"recommendation_id": f"loft_insulation_{insulation_thickness}_270mm_{config_hash}",
"type": "loft_insulation",
"new_u_value": best_270mm_uvalue,
"parts": [
@@ -157,7 +160,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "loft_insulation_300mm",
"recommendation_id": f"loft_insulation_{insulation_thickness}_300mm_{config_hash}",
"type": "loft_insulation",
"new_u_value": best_300mm_uvalue,
"parts": [
@@ -198,7 +201,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "internal_wall_insulation",
"recommendation_id": f"internal_wall_insulation_uvalue_{uvalue}_{config_hash}",
"type": "internal_wall_insulation",
"new_u_value": best_internal_wall_uvalue,
"parts": []
@@ -210,7 +213,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "external_wall_insulation",
"recommendation_id": f"external_wall_insulation_uvalue_{uvalue}_{config_hash}",
"type": "external_wall_insulation",
"new_u_value": best_external_wall_uvalue,
"parts": []
@@ -239,7 +242,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "cavity_wall_insulation",
"recommendation_id": f"cavity_wall_insulation_uvalue_{uvalue}_{config_hash}",
"type": "cavity_wall_insulation",
"new_u_value": best_cavity_wall_uvalue,
"parts": []
@@ -268,7 +271,7 @@ def app():
property_id=nearest_row["uprn"].values[0],
recommendation_record=nearest_row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "solid_floor_insulation",
"recommendation_id": f"solid_floor_insulation_uvalue_{uvalue}_{config_hash}",
"type": "solid_floor_insulation",
"new_u_value": None, # This doesn't matter at the moment
"parts": []
@@ -297,7 +300,7 @@ def app():
property_id=nearest_row["uprn"].values[0],
recommendation_record=nearest_row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "suspended_floor_insulation",
"recommendation_id": f"suspended_floor_insulation_uvalue_{uvalue}_{config_hash}",
"type": "suspended_floor_insulation",
"new_u_value": None, # This doesn't matter at the moment
"parts": []
@@ -321,7 +324,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "windows_glazing",
"recommendation_id": f"windows_glazing_single_to_double_{config_hash}",
"type": "windows_glazing",
"new_u_value": None, # This doesn't matter at the moment
"parts": [],
@@ -334,7 +337,7 @@ def app():
property_id=row["uprn"].values[0],
recommendation_record=row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "windows_glazing",
"recommendation_id": f"windows_glazing_single_to_secondary_{config_hash}",
"type": "windows_glazing",
"new_u_value": None, # This doesn't matter at the moment
"parts": [],
@@ -379,7 +382,7 @@ def app():
property_id=nearest_row["uprn"].values[0],
recommendation_record=nearest_row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "windows_glazing",
"recommendation_id": f"windows_glazing_partial_double_to_double_{value}_{config_hash}",
"type": "windows_glazing",
"new_u_value": None, # This doesn't matter at the moment
"parts": [],
@@ -414,7 +417,7 @@ def app():
property_id=nearest_row["uprn"].values[0],
recommendation_record=nearest_row.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "windows_glazing",
"recommendation_id": f"windows_glazing_partial_secondary_to_secondary_{value}_{config_hash}",
"type": "windows_glazing",
"new_u_value": None, # This doesn't matter at the moment
"parts": [],
@@ -449,7 +452,7 @@ def app():
property_id=pitched_roof_no_solar["uprn"].values[0],
recommendation_record=pitched_roof_no_solar.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "solar_pv",
"recommendation_id": f"pitched_solar_pv_coverage_{coverage}_percent_{config_hash}",
"type": "solar_pv",
"new_u_value": None, # This doesn't matter at the moment
"parts": [],
@@ -463,7 +466,7 @@ def app():
property_id=flat_roof_no_solar["uprn"].values[0],
recommendation_record=flat_roof_no_solar.copy().to_dict("records")[0],
recommendation={
"recommendation_id": "solar_pv",
"recommendation_id": f"flat_solar_pv_coverage_{coverage}_percent_{config_hash}",
"type": "solar_pv",
"new_u_value": None, # This doesn't matter at the moment
"parts": [],
@@ -521,18 +524,43 @@ def app():
# Store final parquet in s3
save_dataframe_to_s3_parquet(
df=loft_insulation_predictions,
bucket_name="retrofit-data-dev",
bucket_name="retrofit-datalake-dev",
file_key=f"sap_change_model/simulation-pipeline-loft-insulation-predictions_{MODEL_VERSION}.parquet"
)
# We now merge the loft insulation predictions onto the scoring data and calculate exactly how much the insulation
# is worth
loft_insulation_comparison_df = loft_insulation_testing_df[
["simulation_ending_insulation_thickness", "simulation_starting_insulation_thickness", "uprn", "id", ""]
loft_insulation_comparison_matrix = loft_insulation_testing_df[
["simulation_starting_insulation_thickness", "simulation_ending_insulation_thickness", "uprn", "id",
"sap_starting"]
].merge(
loft_insulation_predictions,
loft_insulation_predictions.drop(columns=["recommendation_id"]),
left_on="id",
right_on="id",
how="left"
)
loft_insulation_comparison_matrix["measure_impact"] = loft_insulation_comparison_matrix["predictions"] - \
loft_insulation_comparison_matrix["sap_starting"]
# Perform a group by describe
loft_insulation_describe = loft_insulation_comparison_matrix.groupby(
["simulation_starting_insulation_thickness", "simulation_ending_insulation_thickness"]
)[["measure_impact"]].describe().reset_index()
z = loft_insulation_comparison_matrix[loft_insulation_comparison_matrix["measure_impact"] < 0]
z.head(1)[["uprn", "id"]]
error_row = loft_insulation_testing_df[
(loft_insulation_testing_df["id"] == "100090292333+loft_insulation_150_270mm")
]
error_dataset = dataset[
(dataset["uprn"] == "10070401239") & (dataset["roof_insulation_thickness"] == "250")
]
changed_from_dataset = []
for c in column_config:
ending_value = error_row[column_config[c]].values[0]
starting_value = error_row[column_config[c]].values[0]
error_dataset["roof_insulation_thickness"]
error_dataset["roof_insulation_thickness_ending"]