added de-duping

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-18 22:47:39 +00:00
parent ac9b7b3730
commit 5d5001fec3
3 changed files with 85 additions and 149 deletions

View file

@ -1803,21 +1803,26 @@ def propsed_wave_3_sample():
def match_property_to_surveyed(property, survey_results_with_original_features):
surveyed = survey_results_with_original_features[
(
survey_results_with_original_features["Postal Region"] ==
property["Postal Region"]
) &
(
survey_results_with_original_features["Property Type"] ==
property["Property Type"]
)
&
(
survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
property["Wall Type"].split(":")[0]
) &
(
survey_results_with_original_features["Wall Type"] ==
property["Wall Type"]
survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
property["Roof Type"].split(":")[0]
) &
(
survey_results_with_original_features["Roof Type"] ==
property["Roof Type"]
) &
(
survey_results_with_original_features["Heating"] ==
property["Heating"]
survey_results_with_original_features["Heating"].str.split(":").str[0] ==
property["Heating"].split(":")[0]
)
].copy()
@ -1826,23 +1831,47 @@ def propsed_wave_3_sample():
surveyed = survey_results_with_original_features[
(
survey_results_with_original_features["Property Type"] ==
property["Property Type"]
survey_results_with_original_features["Postal Region"] ==
property["Postal Region"]
) &
(
survey_results_with_original_features["Wall Type"] ==
property["Wall Type"]
survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
property["Property Type"].split(":")[0]
)
&
(
survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
property["Wall Type"].split(":")[0]
) &
(
survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
property["Roof Type"].split(":")[0]
) &
(
survey_results_with_original_features["Heating"] ==
property["Heating"]
survey_results_with_original_features["Heating"].str.split(":").str[0] ==
property["Heating"].split(":")[0]
)
].copy()
# surveyed = survey_results_with_original_features[
# (
# survey_results_with_original_features["Property Type"] ==
# property["Property Type"]
# ) &
# (
# survey_results_with_original_features["Wall Type"] ==
# property["Wall Type"]
# ) &
# (
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Heating"] ==
# property["Heating"]
# )
# ].copy()
if not surveyed.empty:
return surveyed
@ -1906,7 +1935,12 @@ def propsed_wave_3_sample():
on="Address ID",
how="left"
)
region_assets['Distance to Closest Match (m)'] = 0
region_assets['Distance to Closest Match (m)'] = None
region_assets["Distance to Closest Match (m)"] = np.where(
~pd.isnull(region_assets["Current EPC Band"]),
0,
region_assets["Distance to Closest Match (m)"]
)
# Label the tier 1 properties
region_assets["Confidence Tier"] = None
@ -2016,7 +2050,7 @@ def propsed_wave_3_sample():
missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
archetype_surveyed = []
# archetype_surveyed = []
for arch_id in missed_archetypes:
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
archetype_data = survey_results_with_original_features[
@ -2175,7 +2209,14 @@ def propsed_wave_3_sample():
{
"Address ID": a_id,
"Confidence Tier": "4 - no similar property, needs survey to confirm",
"Current EPC Band": "Needs Survey"
"Current EPC Band": "Needs Survey",
"Current SAP Rating": "Needs Survey",
'Survey: Main Wall Type': "Not Surveyed",
"Survey: Main Alternative Wall": "Not Surveyed",
"Survey: Main Roof Type": "Not Surveyed",
"Survey: Primary Heating System": "Not Surveyed",
"Survey: Matching Address ID": "Not Surveyed",
'Distance to Closest Match (m)': 9999999,
}
)
continue
@ -2197,18 +2238,6 @@ def propsed_wave_3_sample():
# Take the 3 nearest
surveyed = surveyed.head(3)
# # We allow a max distance of 10km
# surveyed = surveyed[surveyed["distance_meters"] < 10000]
# if surveyed.empty:
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": "4 - no similar property, needs survey to confirm",
# "Current EPC Band": "Needs Survey"
# }
# )
# continue
# Perform an inverse-distance-weighted mean of SAP rating - the closer the match, the larger its weight
expected_sap = np.average(
surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1)
@ -2218,129 +2247,24 @@ def propsed_wave_3_sample():
if expected_epc in ["C", "B", "A"]:
match_type = "5 - EPC C or above"
closest_match = surveyed.iloc[0]
final_missed_matches.append(
{
"Address ID": a_id,
"Confidence Tier": match_type,
"Current EPC Band": expected_epc
"Current EPC Band": expected_epc,
"Current SAP Rating": expected_sap,
'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
"Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"],
"Survey: Main Roof Type": closest_match["Survey: Main Roof Type"],
"Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
"Survey: Matching Address ID": closest_match["Address ID"],
'Distance to Closest Match (m)': closest_match["distance_meters"],
}
)
continue
# if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
# filter_property_types = ["House", "Bungalow"]
# else:
# filter_property_types = ["Flat"]
#
# surveyed_similar = survey_results_with_original_features[
# (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
# (
# survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
# filter_property_types
# )
# ) &
# (
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
# property["Wall Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
# property["Heating"].split(":")[0]
# )
# ]
# if surveyed_similar.empty:
# surveyed_similar = survey_results_with_original_features[
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
# filter_property_types
# )) &
# (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
# property["Wall Type"].split(":")[0]) &
# (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]) &
# (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
# property["Heating"].split(":")[0])
# ]
#
# if surveyed_similar.empty:
#
# # We get an average based on the postcode
# surveyed_similar = survey_results_with_original_features[
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
# filter_property_types
# ))
# ]
# if surveyed_similar.empty:
# surveyed_similar_entire_population = survey_results_with_original_features[
# (
# survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
# "Property Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
# property["Wall Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
# property["Heating"].split(":")[0]
# )
# ]
#
# # We order them by distance on postcode
#
# # Average
# expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
# expected_epc = sap_to_epc(expected_sap)
#
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": "3 - similar property, all areas searched",
# "Current EPC Band": expected_epc
# }
#
# )
# else:
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
# expected_epc = sap_to_epc(expected_sap)
# if expected_epc in ["C", "B", "A"]:
# tier = "5 - EPC C or above"
# else:
# tier = "3 - similar property, relaxed conditions"
#
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": tier,
# "Current EPC Band": expected_epc
# }
# )
# continue
# # We take an average
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
# expected_epc = sap_to_epc(expected_sap)
# if expected_epc in ["C", "B", "A"]:
# tier = "5 - EPC C or above"
# else:
# tier = "3 - similar property"
#
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": tier,
# "Current EPC Band": expected_epc
# }
# )
final_missed_matches = pd.DataFrame(final_missed_matches)
region_assets = region_assets.merge(
@ -2353,12 +2277,11 @@ def propsed_wave_3_sample():
region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna(
region_assets["Confidence Tier_method3"]
)
region_assets["Current EPC Band"] = np.where(
pd.isnull(region_assets["Current EPC Band"]),
region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"]
)
region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"])
region_assets = fill_survey_columns(region_assets, suffix="_method3")
method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")]
region_assets = region_assets.drop(columns=method_3_columns)
if pd.isnull(region_assets["Current EPC Band"]).sum():
raise Exception("Something went wrong")

View file

@ -289,6 +289,12 @@ class RetrieveFindMyEpc:
"Fuel change recommendation": [],
"PV Cells recommendation": [],
"Replacement glazing units": ["double_glazing"],
"Heating controls (time and temperature zone control)": ["time_temperature_zone_control"],
"High heat retention storage heaters": ["high_heat_retention_storage_heaters"],
"Gas condensing boiler": ["boiler_upgrade"],
"Change room heaters to condensing boiler": ["boiler_upgrade"],
"Cylinder thermostat": ["cylinder_thermostat"],
"Heat recovery system for mixer showers": ["heat_recovery_shower"],
}
survey = True

View file

@ -150,6 +150,13 @@ def app():
# We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
# We check for duplicated addresses (same full address and postcode)
asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
if asset_list["deduper"].duplicated().sum():
# Drop the duplicates, keeping the first occurrence of each address
print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
asset_list = asset_list[~asset_list["deduper"].duplicated()]
epc_data, errors, no_epc = get_data(
asset_list=asset_list,
fulladdress_column=FULLADDRESS_COLUMN,