mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
updating methodology for matching
This commit is contained in:
parent
6eb52a509e
commit
ac9b7b3730
1 changed files with 114 additions and 79 deletions
|
|
@ -1867,6 +1867,19 @@ def propsed_wave_3_sample():
|
|||
|
||||
return surveyed
|
||||
|
||||
def fill_survey_columns(region_assets, suffix):
    """Backfill missing survey values from their suffixed fallback columns.

    For each survey column listed below, a row whose base value is null
    takes the value of the column named ``<column><suffix>`` when that
    fallback value is not null. Rows that already hold a base value, and
    rows where both values are null, are left as they are. The suffixed
    columns themselves are not modified or dropped.

    Args:
        region_assets: DataFrame containing every listed column together
            with its suffixed counterpart. It is modified in place.
        suffix: Text appended to each base column name to locate the
            fallback column (for example ``"_method1"``).

    Returns:
        The same ``region_assets`` object that was passed in.

    Raises:
        KeyError: If a listed column or its suffixed counterpart is absent.
    """
    survey_columns = (
        'Current EPC Band', 'Current SAP Rating',
        'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
        'Survey: Main Roof Type', 'Survey: Primary Heating System',
        'Survey: Matching Address ID', 'Distance to Closest Match (m)',
    )
    for column in survey_columns:
        current = region_assets[column]
        fallback = region_assets[column + suffix]
        # Only fill genuine gaps: never overwrite an existing base value.
        use_fallback = pd.isnull(current) & pd.notnull(fallback)
        region_assets[column] = np.where(use_fallback, fallback, current)
    return region_assets
|
||||
|
||||
survey_attribute_columns = [
|
||||
"Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
|
||||
'Survey: Primary Heating System'
|
||||
|
|
@ -1920,6 +1933,14 @@ def propsed_wave_3_sample():
|
|||
].copy()
|
||||
if archetype_data.empty:
|
||||
continue
|
||||
|
||||
match_type = "2 - same archetype"
|
||||
if any(archetype_data["Postal Region"] == property["Postal Region"]):
|
||||
match_type = "1 - same archetype, same postal region"
|
||||
archetype_data = archetype_data[
|
||||
archetype_data["Postal Region"] == property["Postal Region"]
|
||||
]
|
||||
|
||||
if archetype_data.shape[0] > 1:
|
||||
# Look for an exact match, or as close as possible
|
||||
archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
|
||||
|
|
@ -1949,11 +1970,21 @@ def propsed_wave_3_sample():
|
|||
'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
|
||||
'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
|
||||
"Survey: Matching Address ID": closest_match["Address ID"],
|
||||
'Distance to Closest Match (m)': closest_match["distance_meters"]
|
||||
'Distance to Closest Match (m)': closest_match["distance_meters"],
|
||||
"Match Type": match_type
|
||||
}
|
||||
)
|
||||
|
||||
region_surveyed = pd.DataFrame(region_surveyed)
|
||||
|
||||
if region_surveyed.empty:
|
||||
region_surveyed = pd.DataFrame(
|
||||
columns=[
|
||||
"Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
|
||||
'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
|
||||
'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
|
||||
]
|
||||
)
|
||||
|
||||
starting_shape = region_assets.shape[0]
|
||||
region_assets = region_assets.merge(
|
||||
region_surveyed,
|
||||
|
|
@ -1968,95 +1999,99 @@ def propsed_wave_3_sample():
|
|||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
|
||||
pd.isnull(region_assets["Confidence Tier"]),
|
||||
"1 - Archetype surveyed", region_assets["Confidence Tier"]
|
||||
"1 - Archetype surveyed in region", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets["Current EPC Band"] = np.where(
|
||||
pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method1"]),
|
||||
region_assets["Current EPC Band_method1"], region_assets["Current EPC Band"]
|
||||
)
|
||||
# Handle EPC C
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
|
||||
region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) &
|
||||
pd.isnull(region_assets["Confidence Tier"]),
|
||||
"5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
|
||||
# TODO: Turn into a function
|
||||
missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
|
||||
region_assets = fill_survey_columns(region_assets, suffix="_method1")
|
||||
|
||||
archetype_surveyed = survey_results[
|
||||
survey_results["Archetype ID"].isin(missed_archetypes)
|
||||
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
|
||||
method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")]
|
||||
region_assets = region_assets.drop(columns=method_1_columns)
|
||||
|
||||
if archetype_surveyed["Archetype ID"].duplicated().sum():
|
||||
missed_archetypes = set(archetype_ids) - set(region_surveyed["Archetype ID"])
|
||||
|
||||
archetype_surveyed = []
|
||||
for arch_id in missed_archetypes:
|
||||
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
|
||||
archetype_data = survey_results_with_original_features[
|
||||
survey_results["Archetype ID"] == arch_id
|
||||
].copy()
|
||||
if archetype_data.empty:
|
||||
continue
|
||||
archetype_data["distance_meters"] = haversine(
|
||||
lat1=property.latitude, lon1=property.longitude,
|
||||
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
|
||||
)
|
||||
expected_sap = np.average(
|
||||
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
|
||||
)
|
||||
expected_epc = sap_to_epc(expected_sap)
|
||||
archetype_surveyed.append(
|
||||
{
|
||||
"Archetype ID": arch_id,
|
||||
"Address ID": property["Address ID"],
|
||||
"Current EPC Band": expected_epc
|
||||
}
|
||||
)
|
||||
archetype_surveyed = pd.DataFrame(archetype_surveyed)
|
||||
region_assets = region_assets.merge(
|
||||
archetype_surveyed,
|
||||
on=["Archetype ID", "Address ID"],
|
||||
how="left",
|
||||
suffixes=("", "_method2")
|
||||
)
|
||||
else:
|
||||
region_assets = region_assets.merge(
|
||||
archetype_surveyed,
|
||||
on="Archetype ID",
|
||||
how="left",
|
||||
suffixes=("", "_method2")
|
||||
)
|
||||
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
|
||||
region_assets["Confidence Tier"]),
|
||||
"2 - same archetype", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets["Current EPC Band"] = np.where(
|
||||
pd.isnull(region_assets["Current EPC Band"]) & pd.notnull(region_assets["Current EPC Band_method2"]),
|
||||
region_assets["Current EPC Band_method2"], region_assets["Current EPC Band"]
|
||||
)
|
||||
|
||||
region_assets = region_assets.drop(columns=["Current EPC Band_method2"])
|
||||
archetype_surveyed = []
|
||||
for arch_id in missed_archetypes:
|
||||
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
|
||||
archetype_data = survey_results_with_original_features[
|
||||
survey_results["Archetype ID"] == arch_id
|
||||
].copy()
|
||||
if archetype_data.empty:
|
||||
continue
|
||||
raise Exception("IMPLEMENT ME")
|
||||
# archetype_data["distance_meters"] = haversine(
|
||||
# lat1=property.latitude, lon1=property.longitude,
|
||||
# lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
|
||||
# )
|
||||
# expected_sap = np.average(
|
||||
# archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
|
||||
# )
|
||||
# expected_epc = sap_to_epc(expected_sap)
|
||||
# archetype_surveyed.append(
|
||||
# {
|
||||
# "Archetype ID": arch_id,
|
||||
# "Address ID": property["Address ID"],
|
||||
# "Current EPC Band": expected_epc
|
||||
# }
|
||||
# )
|
||||
# archetype_surveyed = pd.DataFrame(archetype_surveyed)
|
||||
# if archetype_surveyed.empty:
|
||||
# archetype_surveyed = pd.DataFrame(
|
||||
# columns=[
|
||||
# "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
|
||||
# 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
|
||||
# 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)'
|
||||
# ]
|
||||
# )
|
||||
#
|
||||
# region_assets = region_assets.merge(
|
||||
# archetype_surveyed,
|
||||
# on=["Archetype ID", "Address ID"],
|
||||
# how="left",
|
||||
# suffixes=("", "_method2")
|
||||
# )
|
||||
#
|
||||
# region_assets["Confidence Tier"] = np.where(
|
||||
# region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull(
|
||||
# region_assets["Confidence Tier"]),
|
||||
# "2 - same archetype", region_assets["Confidence Tier"]
|
||||
# )
|
||||
#
|
||||
# for col in [
|
||||
# 'Current EPC Band', 'Current SAP Rating',
|
||||
# 'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
|
||||
# 'Survey: Main Roof Type', 'Survey: Primary Heating System',
|
||||
# 'Survey: Matching Address ID', 'Distance to Closest Match (m)'
|
||||
# ]:
|
||||
# region_assets[col] = np.where(
|
||||
# pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + "_method2"]),
|
||||
# region_assets[col + "_method2"], region_assets[col]
|
||||
# )
|
||||
#
|
||||
# method_2_columns = [c for c in region_assets.columns if c.endswith("_method2")]
|
||||
# region_assets = region_assets.drop(columns=method_2_columns)
|
||||
|
||||
# We label EPC C properties
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
|
||||
"5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
region_assets["Archetype ID"] == "EPC C OR ABOVE",
|
||||
"5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
region_assets["Current EPC Band"] = np.where(
|
||||
region_assets["Archetype ID"] == "EPC C OR ABOVE",
|
||||
"C", region_assets["Current EPC Band"]
|
||||
)
|
||||
# region_assets["Confidence Tier"] = np.where(
|
||||
# region_assets["Current EPC Band"].isin(["C", "B", "A"]) & pd.isnull(region_assets["Confidence Tier"]),
|
||||
# "5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
# )
|
||||
#
|
||||
# region_assets["Confidence Tier"] = np.where(
|
||||
# region_assets["Archetype ID"] == "EPC C OR ABOVE",
|
||||
# "5 - EPC C or above", region_assets["Confidence Tier"]
|
||||
# )
|
||||
#
|
||||
# region_assets["Current EPC Band"] = np.where(
|
||||
# region_assets["Archetype ID"] == "EPC C OR ABOVE",
|
||||
# "C", region_assets["Current EPC Band"]
|
||||
# )
|
||||
|
||||
missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue