mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added de-duping
This commit is contained in:
parent
ac9b7b3730
commit
5d5001fec3
3 changed files with 85 additions and 149 deletions
|
|
@ -1803,21 +1803,26 @@ def propsed_wave_3_sample():
|
|||
|
||||
def match_property_to_surveyed(property, survey_results_with_original_features):
|
||||
surveyed = survey_results_with_original_features[
|
||||
(
|
||||
survey_results_with_original_features["Postal Region"] ==
|
||||
property["Postal Region"]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Property Type"] ==
|
||||
property["Property Type"]
|
||||
)
|
||||
&
|
||||
(
|
||||
survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
property["Wall Type"].split(":")[0]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Wall Type"] ==
|
||||
property["Wall Type"]
|
||||
survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
property["Roof Type"].split(":")[0]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Roof Type"] ==
|
||||
property["Roof Type"]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Heating"] ==
|
||||
property["Heating"]
|
||||
survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
property["Heating"].split(":")[0]
|
||||
)
|
||||
].copy()
|
||||
|
||||
|
|
@ -1826,23 +1831,47 @@ def propsed_wave_3_sample():
|
|||
|
||||
surveyed = survey_results_with_original_features[
|
||||
(
|
||||
survey_results_with_original_features["Property Type"] ==
|
||||
property["Property Type"]
|
||||
survey_results_with_original_features["Postal Region"] ==
|
||||
property["Postal Region"]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Wall Type"] ==
|
||||
property["Wall Type"]
|
||||
survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
|
||||
property["Property Type"].split(":")[0]
|
||||
)
|
||||
&
|
||||
(
|
||||
survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
property["Wall Type"].split(":")[0]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
property["Roof Type"].split(":")[0]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Heating"] ==
|
||||
property["Heating"]
|
||||
survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
property["Heating"].split(":")[0]
|
||||
)
|
||||
].copy()
|
||||
|
||||
# surveyed = survey_results_with_original_features[
|
||||
# (
|
||||
# survey_results_with_original_features["Property Type"] ==
|
||||
# property["Property Type"]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Wall Type"] ==
|
||||
# property["Wall Type"]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Heating"] ==
|
||||
# property["Heating"]
|
||||
# )
|
||||
# ].copy()
|
||||
|
||||
if not surveyed.empty:
|
||||
return surveyed
|
||||
|
||||
|
|
@ -1906,7 +1935,12 @@ def propsed_wave_3_sample():
|
|||
on="Address ID",
|
||||
how="left"
|
||||
)
|
||||
region_assets['Distance to Closest Match (m)'] = 0
|
||||
region_assets['Distance to Closest Match (m)'] = None
|
||||
region_assets["Distance to Closest Match (m)"] = np.where(
|
||||
~pd.isnull(region_assets["Current EPC Band"]),
|
||||
0,
|
||||
region_assets["Distance to Closest Match (m)"]
|
||||
)
|
||||
|
||||
# Label the tier 1 properties
|
||||
region_assets["Confidence Tier"] = None
|
||||
|
|
@ -2016,7 +2050,7 @@ def propsed_wave_3_sample():
|
|||
|
||||
missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
|
||||
|
||||
archetype_surveyed = []
|
||||
# archetype_surveyed = []
|
||||
for arch_id in missed_archetypes:
|
||||
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
|
||||
archetype_data = survey_results_with_original_features[
|
||||
|
|
@ -2175,7 +2209,14 @@ def propsed_wave_3_sample():
|
|||
{
|
||||
"Address ID": a_id,
|
||||
"Confidence Tier": "4 - no similar property, needs survey to confirm",
|
||||
"Current EPC Band": "Needs Survey"
|
||||
"Current EPC Band": "Needs Survey",
|
||||
"Current SAP Rating": "Needs Survey",
|
||||
'Survey: Main Wall Type': "Not Surveyed",
|
||||
"Survey: Main Alternative Wall": "Not Surveyed",
|
||||
"Survey: Main Roof Type": "Not Surveyed",
|
||||
"Survey: Primary Heating System": "Not Surveyed",
|
||||
"Survey: Matching Address ID": "Not Surveyed",
|
||||
'Distance to Closest Match (m)': 9999999,
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
|
@ -2197,18 +2238,6 @@ def propsed_wave_3_sample():
|
|||
# Take the 3 nearest
|
||||
surveyed = surveyed.head(3)
|
||||
|
||||
# # We allow a max distance of 10km
|
||||
# surveyed = surveyed[surveyed["distance_meters"] < 10000]
|
||||
# if surveyed.empty:
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": "4 - no similar property, needs survey to confirm",
|
||||
# "Current EPC Band": "Needs Survey"
|
||||
# }
|
||||
# )
|
||||
# continue
|
||||
|
||||
# perform a weighted mean of SAP rating - the closer the better
|
||||
expected_sap = np.average(
|
||||
surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1)
|
||||
|
|
@ -2218,129 +2247,24 @@ def propsed_wave_3_sample():
|
|||
if expected_epc in ["C", "B", "A"]:
|
||||
match_type = "5 - EPC C or above"
|
||||
|
||||
closest_match = surveyed.iloc[0]
|
||||
|
||||
final_missed_matches.append(
|
||||
{
|
||||
"Address ID": a_id,
|
||||
"Confidence Tier": match_type,
|
||||
"Current EPC Band": expected_epc
|
||||
"Current EPC Band": expected_epc,
|
||||
"Current SAP Rating": expected_sap,
|
||||
'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
|
||||
"Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"],
|
||||
"Survey: Main Roof Type": closest_match["Survey: Main Roof Type"],
|
||||
"Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
|
||||
"Survey: Matching Address ID": closest_match["Address ID"],
|
||||
'Distance to Closest Match (m)': closest_match["distance_meters"],
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
|
||||
# filter_property_types = ["House", "Bungalow"]
|
||||
# else:
|
||||
# filter_property_types = ["Flat"]
|
||||
#
|
||||
# surveyed_similar = survey_results_with_original_features[
|
||||
# (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
|
||||
# (
|
||||
# survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
# filter_property_types
|
||||
# )
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
# property["Wall Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
# property["Heating"].split(":")[0]
|
||||
# )
|
||||
# ]
|
||||
# if surveyed_similar.empty:
|
||||
# surveyed_similar = survey_results_with_original_features[
|
||||
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
|
||||
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
# filter_property_types
|
||||
# )) &
|
||||
# (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
# property["Wall Type"].split(":")[0]) &
|
||||
# (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]) &
|
||||
# (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
# property["Heating"].split(":")[0])
|
||||
# ]
|
||||
#
|
||||
# if surveyed_similar.empty:
|
||||
#
|
||||
# # We get an average based on the postcode
|
||||
# surveyed_similar = survey_results_with_original_features[
|
||||
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
|
||||
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
# filter_property_types
|
||||
# ))
|
||||
# ]
|
||||
# if surveyed_similar.empty:
|
||||
# surveyed_similar_entire_population = survey_results_with_original_features[
|
||||
# (
|
||||
# survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
|
||||
# "Property Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
# property["Wall Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
# property["Heating"].split(":")[0]
|
||||
# )
|
||||
# ]
|
||||
#
|
||||
# # We order them by distance on postcode
|
||||
#
|
||||
# # Average
|
||||
# expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
#
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": "3 - similar property, all areas searched",
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
#
|
||||
# )
|
||||
# else:
|
||||
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
# if expected_epc in ["C", "B", "A"]:
|
||||
# tier = "5 - EPC C or above"
|
||||
# else:
|
||||
# tier = "3 - similar property, relaxed conditions"
|
||||
#
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": tier,
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
# )
|
||||
# continue
|
||||
# # We take an average
|
||||
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
# if expected_epc in ["C", "B", "A"]:
|
||||
# tier = "5 - EPC C or above"
|
||||
# else:
|
||||
# tier = "3 - similar property"
|
||||
#
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": tier,
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
# )
|
||||
|
||||
final_missed_matches = pd.DataFrame(final_missed_matches)
|
||||
|
||||
region_assets = region_assets.merge(
|
||||
|
|
@ -2353,12 +2277,11 @@ def propsed_wave_3_sample():
|
|||
region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna(
|
||||
region_assets["Confidence Tier_method3"]
|
||||
)
|
||||
region_assets["Current EPC Band"] = np.where(
|
||||
pd.isnull(region_assets["Current EPC Band"]),
|
||||
region_assets["Current EPC Band_method3"], region_assets["Current EPC Band"]
|
||||
)
|
||||
|
||||
region_assets = region_assets.drop(columns=["Confidence Tier_method3", "Current EPC Band_method3"])
|
||||
region_assets = fill_survey_columns(region_assets, suffix="_method3")
|
||||
|
||||
method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")]
|
||||
region_assets = region_assets.drop(columns=method_3_columns)
|
||||
|
||||
if pd.isnull(region_assets["Current EPC Band"]).sum():
|
||||
raise Exception("Something went wrong")
|
||||
|
|
|
|||
|
|
@ -289,6 +289,12 @@ class RetrieveFindMyEpc:
|
|||
"Fuel change recommendation": [],
|
||||
"PV Cells recommendation": [],
|
||||
"Replacement glazing units": ["double_glazing"],
|
||||
"Heating controls (time and temperature zone control)": ["time_temperature_zone_control"],
|
||||
"High heat retention storage heaters": ["high_heat_retention_storage_heaters"],
|
||||
"Gas condensing boiler": ["boiler_upgrade"],
|
||||
"Change room heaters to condensing boiler": ["boiler_upgrade"],
|
||||
"Cylinder thermostat": ["cylinder_thermostat"],
|
||||
"Heat recovery system for mixer showers": ["heat_recovery_shower"],
|
||||
}
|
||||
|
||||
survey = True
|
||||
|
|
|
|||
|
|
@ -150,6 +150,13 @@ def app():
|
|||
# We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
|
||||
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
|
||||
|
||||
# We check for duplicated addresses
|
||||
asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
|
||||
if asset_list["deduper"].duplicated().sum():
|
||||
# Drop the dupes
|
||||
print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
|
||||
asset_list = asset_list[~asset_list["deduper"].duplicated()]
|
||||
|
||||
epc_data, errors, no_epc = get_data(
|
||||
asset_list=asset_list,
|
||||
fulladdress_column=FULLADDRESS_COLUMN,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue