updating methodology for matching

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-18 22:08:10 +00:00
parent 6eb52a509e
commit ac9b7b3730

View file

@ -1867,6 +1867,19 @@ def propsed_wave_3_sample():
return surveyed
def fill_survey_columns(region_assets, suffix):
    """Backfill empty survey columns from their suffixed counterparts.

    For every survey column listed below, a row takes the value found in
    the column named ``<column><suffix>`` only when the base value is null
    and the suffixed value is not null. All other rows keep their current
    value.

    Args:
        region_assets: DataFrame containing each base survey column and the
            matching ``<column><suffix>`` column. It is updated in place.
        suffix: Text appended to a base column name to locate the column
            that supplies the replacement values.

    Returns:
        The same ``region_assets`` object, with the base columns updated.
    """
    survey_columns = (
        'Current EPC Band',
        'Current SAP Rating',
        'Survey: Main Wall Type',
        'Survey: Main Alternative Wall',
        'Survey: Main Roof Type',
        'Survey: Primary Heating System',
        'Survey: Matching Address ID',
        'Distance to Closest Match (m)',
    )

    for base_name in survey_columns:
        existing = region_assets[base_name]
        replacement = region_assets[base_name + suffix]

        # Only fill gaps: never overwrite a value that is already present,
        # and never copy a null over a null.
        should_fill = pd.isnull(existing) & pd.notnull(replacement)
        region_assets[base_name] = np.where(should_fill, replacement, existing)

    return region_assets
survey_attribute_columns = [
"Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
'Survey: Primary Heating System'
@ -1920,6 +1933,14 @@ def propsed_wave_3_sample():
].copy()
if archetype_data.empty:
continue
match_type = "2 - same archetype"
if any(archetype_data["Postal Region"] == property["Postal Region"]):
match_type = "1 - same archetype, same postal region"
archetype_data = archetype_data[
archetype_data["Postal Region"] == property["Postal Region"]
]
if archetype_data.shape[0] > 1:
# Look for an exact match, or as close as possible
archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
@ -1949,11 +1970,21 @@ def propsed_wave_3_sample():
'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
"Survey: Matching Address ID": closest_match["Address ID"],
'Distance to Closest Match (m)': closest_match["distance_meters"]
'Distance to Closest Match (m)': closest_match["distance_meters"],
"Match Type": match_type
}
)
region_surveyed = pd.DataFrame(region_surveyed)
if region_surveyed.empty:
region_surveyed = pd.DataFrame(
columns=[
"Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
]
)
starting_shape = region_assets.shape[0]
region_assets = region_assets.merge(
region_surveyed,
@ -1968,95 +1999,99 @@ def propsed_wave_3_sample():
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
pd.isnull(region_assets["Confidence Tier"]),
"1 - Archetype surveyed", region_assets["Confidence Tier"]
"1 - Archetype surveyed in region", region_assets["Confidence Tier"]
)
region_assets["Current EPC Band"] = np.where(
pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]),
region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"]
)
# Handle EPC C
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) &
pd.isnull(region_assets["Confidence Tier"]),
"5 - EPC C or above", region_assets["Confidence Tier"]
)
region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
# TODO: Turn into a function
missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
region_assets = fill_survey_columns(region_assets, suffix="_method1")
archetype_surveyed = survey_results[
survey_results["Archetype ID"].isin(missed_archetypes)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")]
region_assets = region_assets.drop(columns=method_1_columns)
if archetype_surveyed["Archetype ID"].duplicated().sum():
missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
archetype_surveyed = []
for arch_id in missed_archetypes:
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
archetype_data = survey_results_with_original_features[
survey_results["Archetype ID"] == arch_id
].copy()
if archetype_data.empty:
continue
archetype_data["distance_meters"] = haversine(
lat1=property.latitude, lon1=property.longitude,
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
)
expected_sap = np.average(
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
)
expected_epc = sap_to_epc(expected_sap)
archetype_surveyed.append(
{
"Archetype ID": arch_id,
"Address ID": property["Address ID"],
"Current EPC Band": expected_epc
}
)
archetype_surveyed = pd.DataFrame(archetype_surveyed)
region_assets = region_assets.merge(
archetype_surveyed,
on=["Archetype ID", "Address ID"],
how="left",
suffixes=("", "_method2")
)
else:
region_assets = region_assets.merge(
archetype_surveyed,
on="Archetype ID",
how="left",
suffixes=("", "_method2")
)
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
region_assets["Confidence Tier"]),
"2 - same archetype", region_assets["Confidence Tier"]
)
region_assets["Current EPC Band"] = np.where(
pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]),
region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"]
)
region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
archetype_surveyed = []
for arch_id in missed_archetypes:
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
archetype_data = survey_results_with_original_features[
survey_results["Archetype ID"] == arch_id
].copy()
if archetype_data.empty:
continue
raise Exception("IMPLEMENT ME")
# archetype_data["distance_meters"] = haversine(
# lat1=property.latitude, lon1=property.longitude,
# lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
# )
# expected_sap = np.average(
# archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
# )
# expected_epc = sap_to_epc(expected_sap)
# archetype_surveyed.append(
# {
# "Archetype ID": arch_id,
# "Address ID": property["Address ID"],
# "Current EPC Band": expected_epc
# }
# )
# archetype_surveyed = pd.DataFrame(archetype_surveyed)
# if archetype_surveyed.empty:
# archetype_surveyed = pd.DataFrame(
# columns=[
# "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
# 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
# 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
# ]
# )
#
# region_assets = region_assets.merge(
# archetype_surveyed,
# on=["Archetype ID", "Address ID"],
# how="left",
# suffixes=("", "_method2")
# )
#
# region_assets["Confidence Tier"] = np.where(
# region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
# region_assets["Confidence Tier"]),
# "2 - same archetype", region_assets["Confidence Tier"]
# )
#
# for col in [
# 'Current EPC Band', 'Current SAP Rating',
# 'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
# 'Survey: Main Roof Type', 'Survey: Primary Heating System',
# 'Survey: Matching Address ID', 'Distance to Closest Match (m)'
# ]:
# region_assets[col] = np.where(
# pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]),
# region_assets[col + "_method2"], region_assets[col]
# )
#
# method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")]
# region_assets = region_assets.drop(columns=method_2_columns)
# We label EPC C properties
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
"5 - EPC C or above", region_assets["Confidence Tier"]
)
region_assets["Confidence Tier"] = np.where(
region_assets["Archetype ID"] == "EPC C OR ABOVE",
"5 - EPC C or above", region_assets["Confidence Tier"]
)
region_assets["Current EPC Band"] = np.where(
region_assets["Archetype ID"] == "EPC C OR ABOVE",
"C", region_assets["Current EPC Band"]
)
# region_assets["Confidence Tier"] = np.where(
# region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
# "5 - EPC C or above", region_assets["Confidence Tier"]
# )
#
# region_assets["Confidence Tier"] = np.where(
# region_assets["Archetype ID"] == "EPC C OR ABOVE",
# "5 - EPC C or above", region_assets["Confidence Tier"]
# )
#
# region_assets["Current EPC Band"] = np.where(
# region_assets["Archetype ID"] == "EPC C OR ABOVE",
# "C", region_assets["Current EPC Band"]
# )
missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()