pulling out data from best match

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-18 20:30:57 +00:00
parent 377d9929e4
commit a7857c0375
3 changed files with 83 additions and 94 deletions

View file

@ -1727,7 +1727,7 @@ def propsed_wave_3_sample():
"Existing Primary Heating System": "Survey: Primary Heating System"
}
)
survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
# Concatenate from the wall information
survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
"Main Wall Insulation Type"].astype(str)
@ -1872,6 +1872,8 @@ def propsed_wave_3_sample():
'Survey: Primary Heating System'
]
survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy()
results = []
for region in tqdm(unique_postal_regions):
# Take all of the properties in that region
@ -1884,10 +1886,14 @@ def propsed_wave_3_sample():
region_assets = region_assets.merge(
exact_surveyed[
["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns],
["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
"Survey: Matching Address ID"
]
],
on="Address ID",
how="left"
)
region_assets['Distance to Closest Match (m)'] = 0
# Label the tier 1 properties
region_assets["Confidence Tier"] = None
@ -1901,61 +1907,62 @@ def propsed_wave_3_sample():
"5 - property was surveyed", region_assets["Confidence Tier"]
)
archetypes = region_assets[
archetype_ids = region_assets[
pd.isnull(region_assets["Confidence Tier"])
]["Archetype ID"].unique()
# We get the properties that have been surveyed
region_surveyed = survey_results[
survey_results["Archetype ID"].isin(archetypes) &
(survey_results["Postal Region"] == region)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
if region_surveyed["Archetype ID"].duplicated().sum():
region_surveyed = []
for arch_id in archetype_ids:
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
archetype_data = survey_results_with_original_features[
survey_results["Archetype ID"] == arch_id
].copy()
if archetype_data.empty:
continue
if archetype_data.shape[0] > 1:
# Look for an exact match, or as close as possible
archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
if not archetype_data_filtered.empty:
archetype_data = archetype_data_filtered
region_surveyed = []
for arch_id in archetypes:
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
archetype_data = survey_results_with_original_features[
survey_results["Archetype ID"] == arch_id
].copy()
if archetype_data.empty:
continue
if archetype_data.shape[0] > 1:
# Look for an exact match, or as close as possible
archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
if not archetype_data_filtered.empty:
archetype_data = archetype_data_filtered
archetype_data["distance_meters"] = haversine(
lat1=property.latitude, lon1=property.longitude,
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
)
expected_sap = np.average(
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
)
expected_epc = sap_to_epc(expected_sap)
archetype_data["distance_meters"] = haversine(
lat1=property.latitude, lon1=property.longitude,
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
)
expected_sap = np.average(
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
)
expected_epc = sap_to_epc(expected_sap)
region_surveyed.append(
{
"Archetype ID": arch_id,
"Address ID": property["Address ID"],
"Current EPC Band": expected_epc
}
)
# We take the features of the closest matching property
closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0]
region_surveyed = pd.DataFrame(region_surveyed)
region_assets = region_assets.merge(
region_surveyed,
on=["Archetype ID", "Address ID"],
how="left",
suffixes=("", "_method1")
)
else:
region_assets = region_assets.merge(
region_surveyed,
on="Archetype ID",
how="left",
suffixes=("", "_method1")
)
region_surveyed.append(
{
"Archetype ID": arch_id,
"Address ID": property["Address ID"],
"Current EPC Band": expected_epc,
"Current SAP Rating": expected_sap,
'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"],
'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
"Survey: Matching Address ID": closest_match["Address ID"],
'Distance to Closest Match (m)': closest_match["distance_meters"]
}
)
region_surveyed = pd.DataFrame(region_surveyed)
starting_shape = region_assets.shape[0]
region_assets = region_assets.merge(
region_surveyed,
on=["Archetype ID", "Address ID"],
how="left",
suffixes=("", "_method1")
)
if region_assets.shape[0] != starting_shape:
raise ValueError("Something went wrong")
# Label the tier 1 properties
region_assets["Confidence Tier"] = np.where(
@ -2326,7 +2333,9 @@ def propsed_wave_3_sample():
results = pd.concat(results)
# Check whether there are missing values in Current EPC Band, Current SAP Rating or any of the survey attributes
for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns:
for c in (
["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
survey_attribute_columns):
if pd.isnull(results[c]).sum():
raise Exception("Something went wrong")

View file

@ -269,6 +269,7 @@ class RetrieveFindMyEpc:
"Loft insulation": ["loft_insulation"],
"Solar photovoltaic (PV) panels": ["solar_pv"],
"Party wall insulation": ["party_wall_insulation"],
'Draught proofing': ["draught_proofing"],
}
survey = True

View file

@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
epc_data = []
errors = []
no_epc = []
for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
postcode = home[postcode_column]
house_number = home[address1_column]
full_address = home[fulladdress_column]
searcher = SearchEpc(
address1=str(house_number),
postcode=postcode,
auth_token=EPC_AUTH_TOKEN,
os_api_key="",
property_type=None,
fast=True,
full_address=full_address,
max_retries=5
)
# Force the skipping of estimating the EPC
searcher.ordnance_survey_client.property_type = None
searcher.ordnance_survey_client.built_form = None
searcher.find_property(skip_os=True)
if searcher.newest_epc is None:
continue
# Look for EPC recommendations
try:
property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
except:
property_recommendations = {"rows": []}
# Retrieve data from FindMyEPC
find_epc_searcher = RetrieveFindMyEpc(
address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
)
find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
time.sleep(np.random.uniform(0.1, 1))
try:
postcode = home[postcode_column]
house_number = home[address1_column]
@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
searcher.find_property(skip_os=True)
if searcher.newest_epc is None:
no_epc.append(home["row_id"])
continue
# Look for EPC recommendations
@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
errors.append(home["row_id"])
time.sleep(5)
return epc_data, errors
return epc_data, errors, no_epc
def extract_address1(asset_list, full_address_col, method="first_two_words"):
@ -140,26 +108,37 @@ def app():
Property UPRN
"""
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
DATA_FILENAME = "Bromford programme review.xlsx"
SHEET_NAME = "Bromford"
POSTCODE_COLUMN = "Postcode"
FULLADDRESS_COLUMN = "Address"
ADDRESS1_COLUMN = None
FULLADDRESS_COLUMN = None
ADDRESS1_COLUMN = "No."
ADDRESS1_METHOD = "first_two_words"
ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
asset_list["row_id"] = asset_list.index
# We clean up potential non-breaking spaces and double spaces
for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
asset_list[col] = asset_list[col].astype(str)
asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False)
if ADDRESS1_COLUMN is None:
ADDRESS1_COLUMN = "address1_extracted"
asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)
asset_list = extract_address1(
asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
)
epc_data, errors = get_data(
if FULLADDRESS_COLUMN is None:
FULLADDRESS_COLUMN = "fulladdress_extracted"
# We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
epc_data, errors, no_epc = get_data(
asset_list=asset_list,
fulladdress_column=FULLADDRESS_COLUMN,
address1_column=ADDRESS1_COLUMN,
@ -168,7 +147,7 @@ def app():
# We now retrieve any failed properties
asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
epc_data_failed, _ = get_data(
epc_data_failed, _, _ = get_data(
asset_list=asset_list_failed,
fulladdress_column=FULLADDRESS_COLUMN,
address1_column=ADDRESS1_COLUMN,