implementing distance weighting

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-17 19:10:23 +00:00
parent 7d63c16404
commit eff80e637f

View file

@ -1635,8 +1635,9 @@ def propsed_wave_3_sample():
header=4
)
# TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"]
# TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
# UPRN
asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
# Clean address ids
asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
@ -1648,7 +1649,7 @@ def propsed_wave_3_sample():
# Keep just the columns we need
asset_list = asset_list[
["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
"Heating"]
]
@ -1665,7 +1666,7 @@ def propsed_wave_3_sample():
survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
survey_results_with_original_features = survey_results.merge(
asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
on="Address ID",
how="left"
)
@ -1673,6 +1674,45 @@ def propsed_wave_3_sample():
if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
raise ValueError("Something went wrong")
# We get longitude & Latitude
from utils.s3 import read_pickle_from_s3
archetyping_spatial_features = read_pickle_from_s3(
bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
)
archetyping_spatial_features = pd.concat(archetyping_spatial_features)
archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename(
columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"}
)
# Merge them onto both datasets
asset_list = asset_list.merge(
archetyping_spatial_features, how="left", on="UPRN"
)
if pd.isnull(asset_list["longitude"]).sum():
raise ValueError("Something went wrong")
survey_results_with_original_features = survey_results_with_original_features.merge(
archetyping_spatial_features, how="left", on="UPRN"
)
if pd.isnull(survey_results_with_original_features["longitude"]).sum():
raise ValueError("Something went wrong")
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in meters between two points.

    All four arguments are given in decimal degrees. Each may be a scalar
    or an array-like; the NumPy operations broadcast, so a single point can
    be compared against arrays of candidate latitudes/longitudes in one call.

    The Earth is modelled as a sphere of radius 6,371,000 m.
    """
    # Radius of Earth in meters
    earth_radius_m = 6371000
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    # Differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine formula
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Rounding error can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) return NaN; clamp before the roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return earth_radius_m * c
# Tier definitions
# Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D
# Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
@ -1716,6 +1756,7 @@ def propsed_wave_3_sample():
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
if region_surveyed["Archetype ID"].duplicated().sum():
blah1
region_surveyed = survey_results[
survey_results["Archetype ID"].isin(archetypes) &
(survey_results["Postal Region"] == region)
@ -1755,23 +1796,46 @@ def propsed_wave_3_sample():
survey_results["Archetype ID"].isin(missed_archetypes)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
# TODO - We could average the property?? And call it borderline, call out it was averaged!!!
# We could also find the nearest property to it, with similar wall, roof, heating?
# Can use long/lat for the distance calc. We have this data from previous work
if archetype_surveyed["Archetype ID"].duplicated().sum():
archetype_surveyed = survey_results[
survey_results["Archetype ID"].isin(missed_archetypes)
].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])
region_assets = region_assets.merge(
archetype_surveyed,
on="Archetype ID",
how="left",
suffixes=("", "_method2")
)
archetype_surveyed = []
for arch_id in missed_archetypes:
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
archetype_data = survey_results_with_original_features[
survey_results["Archetype ID"] == arch_id
].copy()
if archetype_data.empty:
continue
archetype_data["distance_meters"] = haversine(
lat1=property.latitude, lon1=property.longitude,
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
)
expected_sap = np.average(
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
)
expected_epc = sap_to_epc(expected_sap)
archetype_surveyed.append(
{
"Archetype ID": arch_id,
"Address ID": property["Address ID"],
"Current EPC Band": expected_epc
}
)
archetype_surveyed = pd.DataFrame(archetype_surveyed)
region_assets = region_assets.merge(
archetype_surveyed,
on=["Archetype ID", "Address ID"],
how="left",
suffixes=("", "_method2")
)
else:
region_assets = region_assets.merge(
archetype_surveyed,
on="Archetype ID",
how="left",
suffixes=("", "_method2")
)
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
@ -1792,6 +1856,16 @@ def propsed_wave_3_sample():
"5 - EPC C or above", region_assets["Confidence Tier"]
)
region_assets["Confidence Tier"] = np.where(
region_assets["Archetype ID"] == "EPC C OR ABOVE",
"5 - EPC C or above", region_assets["Confidence Tier"]
)
region_assets["Current EPC Band"] = np.where(
region_assets["Archetype ID"] == "EPC C OR ABOVE",
"C", region_assets["Current EPC Band"]
)
missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
if not missed_addressids:
@ -1803,17 +1877,10 @@ def propsed_wave_3_sample():
for a_id in missed_addressids:
property = asset_list[asset_list["Address ID"] == a_id].squeeze()
if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
filter_property_types = ["House", "Bungalow"]
else:
filter_property_types = ["Flat"]
surveyed_similar = survey_results_with_original_features[
(survey_results_with_original_features["Postcode"] == property["Postcode"]) &
surveyed = survey_results_with_original_features[
(
survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
filter_property_types
)
survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
property["Property Type"].split(":")[0]
) &
(
survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
@ -1827,62 +1894,38 @@ def propsed_wave_3_sample():
survey_results_with_original_features["Heating"].str.split(":").str[0] ==
property["Heating"].split(":")[0]
)
]
if surveyed_similar.empty:
surveyed_similar = survey_results_with_original_features[
(survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
(survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
filter_property_types
)) &
(survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
property["Wall Type"].split(":")[0]) &
(survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
property["Roof Type"].split(":")[0]) &
(survey_results_with_original_features["Heating"].str.split(":").str[0] ==
property["Heating"].split(":")[0])
]
].copy()
if surveyed_similar.empty:
if surveyed.empty:
blah3
# We get an average based on the postcode
surveyed_similar = survey_results_with_original_features[
(survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
(survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
filter_property_types
))
]
if surveyed_similar.empty:
final_missed_matches.append(
{
"Address ID": a_id,
"Confidence Tier": "4 - no similar property, needs survey to confirm",
"Current EPC Band": "Unknown"
}
# Calculate distance
surveyed["distance_meters"] = haversine(
lat1=property["latitude"], lon1=property["longitude"],
lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values
)
surveyed = surveyed.sort_values("distance_meters", ascending=True)
)
else:
expected_sap = surveyed_similar["Current SAP Rating"].mean()
expected_epc = sap_to_epc(expected_sap)
if expected_epc in ["C", "B", "A"]:
tier = "5 - EPC C or above"
else:
tier = "3 - similar property, relaxed conditions"
# Check if we have a postcode match, i.e. a surveyed property whose postcode is the same as this property's
if any(surveyed["Postcode"] == property["Postcode"]):
surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]]
final_missed_matches.append(
{
"Address ID": a_id,
"Confidence Tier": tier,
"Current EPC Band": expected_epc
}
)
continue
# We take an average
expected_sap = surveyed_similar["Current SAP Rating"].mean()
if any(surveyed["Postal Region"] == property["Postal Region"]):
surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
# Take the 5 nearest
surveyed_similar = surveyed_similar.head(5)
# perform a weighted mean of SAP rating - the closer the better
expected_sap = np.average(
surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1)
)
expected_epc = sap_to_epc(expected_sap)
if expected_epc in ["C", "B", "A"]:
tier = "5 - EPC C or above"
else:
tier = "3 - similar property"
tier = "3 - similar property, weighted on distance"
final_missed_matches.append(
{
@ -1891,6 +1934,121 @@ def propsed_wave_3_sample():
"Current EPC Band": expected_epc
}
)
continue
# if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
# filter_property_types = ["House", "Bungalow"]
# else:
# filter_property_types = ["Flat"]
#
# surveyed_similar = survey_results_with_original_features[
# (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
# (
# survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
# filter_property_types
# )
# ) &
# (
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
# property["Wall Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
# property["Heating"].split(":")[0]
# )
# ]
# if surveyed_similar.empty:
# surveyed_similar = survey_results_with_original_features[
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
# filter_property_types
# )) &
# (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
# property["Wall Type"].split(":")[0]) &
# (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]) &
# (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
# property["Heating"].split(":")[0])
# ]
#
# if surveyed_similar.empty:
#
# # We get an average based on the postcode
# surveyed_similar = survey_results_with_original_features[
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
# filter_property_types
# ))
# ]
# if surveyed_similar.empty:
# surveyed_similar_entire_population = survey_results_with_original_features[
# (
# survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
# "Property Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
# property["Wall Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
# property["Roof Type"].split(":")[0]
# ) &
# (
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
# property["Heating"].split(":")[0]
# )
# ]
#
# # We order them by distance on postcode
#
# # Average
# expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
# expected_epc = sap_to_epc(expected_sap)
#
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": "3 - similar property, all areas searched",
# "Current EPC Band": expected_epc
# }
#
# )
# else:
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
# expected_epc = sap_to_epc(expected_sap)
# if expected_epc in ["C", "B", "A"]:
# tier = "5 - EPC C or above"
# else:
# tier = "3 - similar property, relaxed conditions"
#
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": tier,
# "Current EPC Band": expected_epc
# }
# )
# continue
# # We take an average
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
# expected_epc = sap_to_epc(expected_sap)
# if expected_epc in ["C", "B", "A"]:
# tier = "5 - EPC C or above"
# else:
# tier = "3 - similar property"
#
# final_missed_matches.append(
# {
# "Address ID": a_id,
# "Confidence Tier": tier,
# "Current EPC Band": expected_epc
# }
# )
final_missed_matches = pd.DataFrame(final_missed_matches)
@ -1928,27 +2086,33 @@ def propsed_wave_3_sample():
# We create the gain and loss columns
# Gain is the sum of these columns:
# '1 - Archetype surveyed', '1 - property was surveyed',
# '2 - same archetype', '3 - similar property',
# '1 - Archetype surveyed',
# '1 - property was surveyed',
# '2 - same archetype',
# '3 - similar property',
# '3 - similar property, all areas searched',
# '3 - similar property, relaxed conditions'
#
# Loss is the sum of these columns:
# '4 - no similar property, needs survey to confirm',
# '5 - EPC C or above', '5 - property was surveyed'
geographic_summary["Gain"] = geographic_summary[
['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property']
[
'1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property',
'3 - similar property, all areas searched', '3 - similar property, relaxed conditions'
]
].sum(axis=1)
geographic_summary["Loss"] = geographic_summary[
['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed']
['5 - EPC C or above', '5 - property was surveyed']
].sum(axis=1)
geographic_summary.sum()
print(geographic_summary.sum())
geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
geographic_summary[["Loss", "Gain"]].head()
loss = geographic_summary["Loss"].values
gain = geographic_summary["Gain"].values