mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
implementing distance weighting
This commit is contained in:
parent
7d63c16404
commit
eff80e637f
1 changed files with 248 additions and 84 deletions
|
|
@ -1635,8 +1635,9 @@ def propsed_wave_3_sample():
|
|||
header=4
|
||||
)
|
||||
|
||||
# TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater
|
||||
asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"]
|
||||
# TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing
|
||||
# UPRN
|
||||
asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])]
|
||||
# Clean address ids
|
||||
asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
|
||||
asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
|
||||
|
|
@ -1648,7 +1649,7 @@ def propsed_wave_3_sample():
|
|||
|
||||
# Keep just the columns we need
|
||||
asset_list = asset_list[
|
||||
["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
|
||||
["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
|
||||
"Heating"]
|
||||
]
|
||||
|
||||
|
|
@ -1665,7 +1666,7 @@ def propsed_wave_3_sample():
|
|||
survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
|
||||
|
||||
survey_results_with_original_features = survey_results.merge(
|
||||
asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
|
||||
asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
|
||||
on="Address ID",
|
||||
how="left"
|
||||
)
|
||||
|
|
@ -1673,6 +1674,45 @@ def propsed_wave_3_sample():
|
|||
if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
|
||||
raise ValueError("Something went wrong")
|
||||
|
||||
# We get longitude & Latitude
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
archetyping_spatial_features = read_pickle_from_s3(
|
||||
bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
||||
)
|
||||
archetyping_spatial_features = pd.concat(archetyping_spatial_features)
|
||||
archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename(
|
||||
columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"}
|
||||
)
|
||||
# Merge them onto both datasets
|
||||
asset_list = asset_list.merge(
|
||||
archetyping_spatial_features, how="left", on="UPRN"
|
||||
)
|
||||
if pd.isnull(asset_list["longitude"]).sum():
|
||||
raise ValueError("Something went wrong")
|
||||
|
||||
survey_results_with_original_features = survey_results_with_original_features.merge(
|
||||
archetyping_spatial_features, how="left", on="UPRN"
|
||||
)
|
||||
if pd.isnull(survey_results_with_original_features["longitude"]).sum():
|
||||
raise ValueError("Something went wrong")
|
||||
|
||||
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in meters between two points.

    Coordinates are given in decimal degrees. Each argument may be a scalar
    or a numpy array; array arguments are combined element-wise following
    numpy broadcasting, so one point can be compared against many.

    Args:
        lat1: Latitude of the first point(s), in degrees.
        lon1: Longitude of the first point(s), in degrees.
        lat2: Latitude of the second point(s), in degrees.
        lon2: Longitude of the second point(s), in degrees.

    Returns:
        The haversine distance in meters, as a numpy scalar or array
        matching the broadcast shape of the inputs.
    """
    # Radius of Earth in meters
    R = 6371000

    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Differences
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2

    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt(1 - a) return NaN. Clamp it
    # so the result is always a finite distance for finite inputs.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c
    return distance
|
||||
|
||||
# Tier definitions
|
||||
# Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D
|
||||
# Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
|
||||
|
|
@ -1716,6 +1756,7 @@ def propsed_wave_3_sample():
|
|||
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
|
||||
|
||||
if region_surveyed["Archetype ID"].duplicated().sum():
|
||||
blah1
|
||||
region_surveyed = survey_results[
|
||||
survey_results["Archetype ID"].isin(archetypes) &
|
||||
(survey_results["Postal Region"] == region)
|
||||
|
|
@ -1755,23 +1796,46 @@ def propsed_wave_3_sample():
|
|||
survey_results["Archetype ID"].isin(missed_archetypes)
|
||||
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
|
||||
|
||||
# TODO - We could average the property?? And call it borderline, call out it was averaged!!!
|
||||
# We could also find the nearest property to it, with similar wall, roof, heating?
|
||||
# Can use long/lat for the distance calc. We already have this data from earlier.
|
||||
|
||||
if archetype_surveyed["Archetype ID"].duplicated().sum():
|
||||
archetype_surveyed = survey_results[
|
||||
survey_results["Archetype ID"].isin(missed_archetypes)
|
||||
].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index()
|
||||
archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc)
|
||||
archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"])
|
||||
|
||||
region_assets = region_assets.merge(
|
||||
archetype_surveyed,
|
||||
on="Archetype ID",
|
||||
how="left",
|
||||
suffixes=("", "_method2")
|
||||
)
|
||||
archetype_surveyed = []
|
||||
for arch_id in missed_archetypes:
|
||||
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
|
||||
archetype_data = survey_results_with_original_features[
|
||||
survey_results["Archetype ID"] == arch_id
|
||||
].copy()
|
||||
if archetype_data.empty:
|
||||
continue
|
||||
archetype_data["distance_meters"] = haversine(
|
||||
lat1=property.latitude, lon1=property.longitude,
|
||||
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
|
||||
)
|
||||
expected_sap = np.average(
|
||||
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
|
||||
)
|
||||
expected_epc = sap_to_epc(expected_sap)
|
||||
archetype_surveyed.append(
|
||||
{
|
||||
"Archetype ID": arch_id,
|
||||
"Address ID": property["Address ID"],
|
||||
"Current EPC Band": expected_epc
|
||||
}
|
||||
)
|
||||
archetype_surveyed = pd.DataFrame(archetype_surveyed)
|
||||
region_assets = region_assets.merge(
|
||||
archetype_surveyed,
|
||||
on=["Archetype ID", "Address ID"],
|
||||
how="left",
|
||||
suffixes=("", "_method2")
|
||||
)
|
||||
else:
|
||||
|
||||
region_assets = region_assets.merge(
|
||||
archetype_surveyed,
|
||||
on="Archetype ID",
|
||||
how="left",
|
||||
suffixes=("", "_method2")
|
||||
)
|
||||
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
|
||||
|
|
@ -1792,6 +1856,16 @@ def propsed_wave_3_sample():
|
|||
"5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Archetype ID"] == "EPC C OR ABOVE",
|
||||
"5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets["Current EPC Band"] = np.where(
|
||||
region_assets["Archetype ID"] == "EPC C OR ABOVE",
|
||||
"C", region_assets["Current EPC Band"]
|
||||
)
|
||||
|
||||
missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
|
||||
|
||||
if not missed_addressids:
|
||||
|
|
@ -1803,17 +1877,10 @@ def propsed_wave_3_sample():
|
|||
for a_id in missed_addressids:
|
||||
property = asset_list[asset_list["Address ID"] == a_id].squeeze()
|
||||
|
||||
if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
|
||||
filter_property_types = ["House", "Bungalow"]
|
||||
else:
|
||||
filter_property_types = ["Flat"]
|
||||
|
||||
surveyed_similar = survey_results_with_original_features[
|
||||
(survey_results_with_original_features["Postcode"] == property["Postcode"]) &
|
||||
surveyed = survey_results_with_original_features[
|
||||
(
|
||||
survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
filter_property_types
|
||||
)
|
||||
survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
|
||||
property["Property Type"].split(":")[0]
|
||||
) &
|
||||
(
|
||||
survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
|
|
@ -1827,62 +1894,38 @@ def propsed_wave_3_sample():
|
|||
survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
property["Heating"].split(":")[0]
|
||||
)
|
||||
]
|
||||
if surveyed_similar.empty:
|
||||
surveyed_similar = survey_results_with_original_features[
|
||||
(survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
|
||||
(survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
filter_property_types
|
||||
)) &
|
||||
(survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
property["Wall Type"].split(":")[0]) &
|
||||
(survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
property["Roof Type"].split(":")[0]) &
|
||||
(survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
property["Heating"].split(":")[0])
|
||||
]
|
||||
].copy()
|
||||
|
||||
if surveyed_similar.empty:
|
||||
if surveyed.empty:
|
||||
blah3
|
||||
|
||||
# We get an average based on the postcode
|
||||
surveyed_similar = survey_results_with_original_features[
|
||||
(survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
|
||||
(survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
filter_property_types
|
||||
))
|
||||
]
|
||||
if surveyed_similar.empty:
|
||||
final_missed_matches.append(
|
||||
{
|
||||
"Address ID": a_id,
|
||||
"Confidence Tier": "4 - no similar property, needs survey to confirm",
|
||||
"Current EPC Band": "Unknown"
|
||||
}
|
||||
# Calculate distance
|
||||
surveyed["distance_meters"] = haversine(
|
||||
lat1=property["latitude"], lon1=property["longitude"],
|
||||
lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values
|
||||
)
|
||||
surveyed = surveyed.sort_values("distance_meters", ascending=True)
|
||||
|
||||
)
|
||||
else:
|
||||
expected_sap = surveyed_similar["Current SAP Rating"].mean()
|
||||
expected_epc = sap_to_epc(expected_sap)
|
||||
if expected_epc in ["C", "B", "A"]:
|
||||
tier = "5 - EPC C or above"
|
||||
else:
|
||||
tier = "3 - similar property, relaxed conditions"
|
||||
# Check for a postcode match, i.e. whether any surveyed property shares this property's postcode
|
||||
if any(surveyed["Postcode"] == property["Postcode"]):
|
||||
surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]]
|
||||
|
||||
final_missed_matches.append(
|
||||
{
|
||||
"Address ID": a_id,
|
||||
"Confidence Tier": tier,
|
||||
"Current EPC Band": expected_epc
|
||||
}
|
||||
)
|
||||
continue
|
||||
# We take an average
|
||||
expected_sap = surveyed_similar["Current SAP Rating"].mean()
|
||||
if any(surveyed["Postal Region"] == property["Postal Region"]):
|
||||
surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
|
||||
|
||||
# Take the 5 nearest
|
||||
surveyed_similar = surveyed_similar.head(5)
|
||||
|
||||
# perform a weighted mean of SAP rating - the closer the better
|
||||
expected_sap = np.average(
|
||||
surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1)
|
||||
)
|
||||
expected_epc = sap_to_epc(expected_sap)
|
||||
|
||||
if expected_epc in ["C", "B", "A"]:
|
||||
tier = "5 - EPC C or above"
|
||||
else:
|
||||
tier = "3 - similar property"
|
||||
tier = "3 - similar property, weighted on distance"
|
||||
|
||||
final_missed_matches.append(
|
||||
{
|
||||
|
|
@ -1891,6 +1934,121 @@ def propsed_wave_3_sample():
|
|||
"Current EPC Band": expected_epc
|
||||
}
|
||||
)
|
||||
continue
|
||||
|
||||
# if property["Property Type"].split(":")[0] in ["House", "Bungalow"]:
|
||||
# filter_property_types = ["House", "Bungalow"]
|
||||
# else:
|
||||
# filter_property_types = ["Flat"]
|
||||
#
|
||||
# surveyed_similar = survey_results_with_original_features[
|
||||
# (survey_results_with_original_features["Postcode"] == property["Postcode"]) &
|
||||
# (
|
||||
# survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
# filter_property_types
|
||||
# )
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
# property["Wall Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
# property["Heating"].split(":")[0]
|
||||
# )
|
||||
# ]
|
||||
# if surveyed_similar.empty:
|
||||
# surveyed_similar = survey_results_with_original_features[
|
||||
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
|
||||
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
# filter_property_types
|
||||
# )) &
|
||||
# (survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
# property["Wall Type"].split(":")[0]) &
|
||||
# (survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]) &
|
||||
# (survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
# property["Heating"].split(":")[0])
|
||||
# ]
|
||||
#
|
||||
# if surveyed_similar.empty:
|
||||
#
|
||||
# # We get an average based on the postcode
|
||||
# surveyed_similar = survey_results_with_original_features[
|
||||
# (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) &
|
||||
# (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
|
||||
# filter_property_types
|
||||
# ))
|
||||
# ]
|
||||
# if surveyed_similar.empty:
|
||||
# surveyed_similar_entire_population = survey_results_with_original_features[
|
||||
# (
|
||||
# survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[
|
||||
# "Property Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
|
||||
# property["Wall Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
|
||||
# property["Roof Type"].split(":")[0]
|
||||
# ) &
|
||||
# (
|
||||
# survey_results_with_original_features["Heating"].str.split(":").str[0] ==
|
||||
# property["Heating"].split(":")[0]
|
||||
# )
|
||||
# ]
|
||||
#
|
||||
# # We order them by distance on postcode
|
||||
#
|
||||
# # Average
|
||||
# expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean()
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
#
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": "3 - similar property, all areas searched",
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
#
|
||||
# )
|
||||
# else:
|
||||
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
# if expected_epc in ["C", "B", "A"]:
|
||||
# tier = "5 - EPC C or above"
|
||||
# else:
|
||||
# tier = "3 - similar property, relaxed conditions"
|
||||
#
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": tier,
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
# )
|
||||
# continue
|
||||
# # We take an average
|
||||
# expected_sap = surveyed_similar["Current SAP Rating"].mean()
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
# if expected_epc in ["C", "B", "A"]:
|
||||
# tier = "5 - EPC C or above"
|
||||
# else:
|
||||
# tier = "3 - similar property"
|
||||
#
|
||||
# final_missed_matches.append(
|
||||
# {
|
||||
# "Address ID": a_id,
|
||||
# "Confidence Tier": tier,
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
# )
|
||||
|
||||
final_missed_matches = pd.DataFrame(final_missed_matches)
|
||||
|
||||
|
|
@ -1928,27 +2086,33 @@ def propsed_wave_3_sample():
|
|||
|
||||
# We create the gain and loss columns
|
||||
# Gain is the sum of these columns:
|
||||
# '1 - Archetype surveyed', '1 - property was surveyed',
|
||||
# '2 - same archetype', '3 - similar property',
|
||||
# '1 - Archetype surveyed',
|
||||
# '1 - property was surveyed',
|
||||
# '2 - same archetype',
|
||||
# '3 - similar property',
|
||||
# '3 - similar property, all areas searched',
|
||||
# '3 - similar property, relaxed conditions'
|
||||
#
|
||||
# Loss is the sum of these columns:
|
||||
# '4 - no similar property, needs survey to confirm',
|
||||
# '5 - EPC C or above', '5 - property was surveyed'
|
||||
geographic_summary["Gain"] = geographic_summary[
|
||||
['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property']
|
||||
[
|
||||
'1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property',
|
||||
'3 - similar property, all areas searched', '3 - similar property, relaxed conditions'
|
||||
]
|
||||
].sum(axis=1)
|
||||
|
||||
geographic_summary["Loss"] = geographic_summary[
|
||||
['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed']
|
||||
['5 - EPC C or above', '5 - property was surveyed']
|
||||
].sum(axis=1)
|
||||
|
||||
geographic_summary.sum()
|
||||
print(geographic_summary.sum())
|
||||
|
||||
geographic_summary = geographic_summary.sort_values("Loss", ascending=True)
|
||||
geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum()
|
||||
geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum()
|
||||
|
||||
geographic_summary[["Loss", "Gain"]].head()
|
||||
|
||||
loss = geographic_summary["Loss"].values
|
||||
gain = geographic_summary["Gain"].values
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue