mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
pulling out data from best match
This commit is contained in:
parent
377d9929e4
commit
a7857c0375
3 changed files with 83 additions and 94 deletions
|
|
@ -1727,7 +1727,7 @@ def propsed_wave_3_sample():
|
|||
"Existing Primary Heating System": "Survey: Primary Heating System"
|
||||
}
|
||||
)
|
||||
|
||||
survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
|
||||
# Concatenate from the wall information
|
||||
survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
|
||||
"Main Wall Insulation Type"].astype(str)
|
||||
|
|
@ -1872,6 +1872,8 @@ def propsed_wave_3_sample():
|
|||
'Survey: Primary Heating System'
|
||||
]
|
||||
|
||||
survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy()
|
||||
|
||||
results = []
|
||||
for region in tqdm(unique_postal_regions):
|
||||
# Take all of the properties in that region
|
||||
|
|
@ -1884,10 +1886,14 @@ def propsed_wave_3_sample():
|
|||
|
||||
region_assets = region_assets.merge(
|
||||
exact_surveyed[
|
||||
["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns],
|
||||
["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
|
||||
"Survey: Matching Address ID"
|
||||
]
|
||||
],
|
||||
on="Address ID",
|
||||
how="left"
|
||||
)
|
||||
region_assets['Distance to Closest Match (m)'] = 0
|
||||
|
||||
# Label the tier 1 properties
|
||||
region_assets["Confidence Tier"] = None
|
||||
|
|
@ -1901,61 +1907,62 @@ def propsed_wave_3_sample():
|
|||
"5 - property was surveyed", region_assets["Confidence Tier"]
|
||||
)
|
||||
|
||||
archetypes = region_assets[
|
||||
archetype_ids = region_assets[
|
||||
pd.isnull(region_assets["Confidence Tier"])
|
||||
]["Archetype ID"].unique()
|
||||
# We get the properties that have been surveyed
|
||||
region_surveyed = survey_results[
|
||||
survey_results["Archetype ID"].isin(archetypes) &
|
||||
(survey_results["Postal Region"] == region)
|
||||
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
|
||||
|
||||
if region_surveyed["Archetype ID"].duplicated().sum():
|
||||
region_surveyed = []
|
||||
for arch_id in archetype_ids:
|
||||
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
|
||||
archetype_data = survey_results_with_original_features[
|
||||
survey_results["Archetype ID"] == arch_id
|
||||
].copy()
|
||||
if archetype_data.empty:
|
||||
continue
|
||||
if archetype_data.shape[0] > 1:
|
||||
# Look for an exact match, or as close as possible
|
||||
archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
|
||||
if not archetype_data_filtered.empty:
|
||||
archetype_data = archetype_data_filtered
|
||||
|
||||
region_surveyed = []
|
||||
for arch_id in archetypes:
|
||||
for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
|
||||
archetype_data = survey_results_with_original_features[
|
||||
survey_results["Archetype ID"] == arch_id
|
||||
].copy()
|
||||
if archetype_data.empty:
|
||||
continue
|
||||
if archetype_data.shape[0] > 1:
|
||||
# Look for an exact match, or as close as possible
|
||||
archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
|
||||
if not archetype_data_filtered.empty:
|
||||
archetype_data = archetype_data_filtered
|
||||
archetype_data["distance_meters"] = haversine(
|
||||
lat1=property.latitude, lon1=property.longitude,
|
||||
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
|
||||
)
|
||||
expected_sap = np.average(
|
||||
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
|
||||
)
|
||||
expected_epc = sap_to_epc(expected_sap)
|
||||
|
||||
archetype_data["distance_meters"] = haversine(
|
||||
lat1=property.latitude, lon1=property.longitude,
|
||||
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
|
||||
)
|
||||
expected_sap = np.average(
|
||||
archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
|
||||
)
|
||||
expected_epc = sap_to_epc(expected_sap)
|
||||
region_surveyed.append(
|
||||
{
|
||||
"Archetype ID": arch_id,
|
||||
"Address ID": property["Address ID"],
|
||||
"Current EPC Band": expected_epc
|
||||
}
|
||||
)
|
||||
# We take the features of the closest matching property
|
||||
closest_match = archetype_data.sort_values("distance_meters", ascending=True).iloc[0]
|
||||
|
||||
region_surveyed = pd.DataFrame(region_surveyed)
|
||||
region_assets = region_assets.merge(
|
||||
region_surveyed,
|
||||
on=["Archetype ID", "Address ID"],
|
||||
how="left",
|
||||
suffixes=("", "_method1")
|
||||
)
|
||||
else:
|
||||
region_assets = region_assets.merge(
|
||||
region_surveyed,
|
||||
on="Archetype ID",
|
||||
how="left",
|
||||
suffixes=("", "_method1")
|
||||
)
|
||||
region_surveyed.append(
|
||||
{
|
||||
"Archetype ID": arch_id,
|
||||
"Address ID": property["Address ID"],
|
||||
"Current EPC Band": expected_epc,
|
||||
"Current SAP Rating": expected_sap,
|
||||
'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
|
||||
'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"],
|
||||
'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
|
||||
'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
|
||||
"Survey: Matching Address ID": closest_match["Address ID"],
|
||||
'Distance to Closest Match (m)': closest_match["distance_meters"]
|
||||
}
|
||||
)
|
||||
|
||||
region_surveyed = pd.DataFrame(region_surveyed)
|
||||
starting_shape = region_assets.shape[0]
|
||||
region_assets = region_assets.merge(
|
||||
region_surveyed,
|
||||
on=["Archetype ID", "Address ID"],
|
||||
how="left",
|
||||
suffixes=("", "_method1")
|
||||
)
|
||||
if region_assets.shape[0] != starting_shape:
|
||||
raise ValueError("Something went wrong")
|
||||
|
||||
# Label the tier 1 properties
|
||||
region_assets["Confidence Tier"] = np.where(
|
||||
|
|
@ -2326,7 +2333,9 @@ def propsed_wave_3_sample():
|
|||
results = pd.concat(results)
|
||||
|
||||
# Check for missing values in current EPC band, current SAP rating, or any of the survey attributes
|
||||
for c in ["Current EPC Band", "Current SAP Rating"] + survey_attribute_columns:
|
||||
for c in (
|
||||
["Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
|
||||
survey_attribute_columns):
|
||||
if pd.isnull(results[c]).sum():
|
||||
raise Exception("Something went wrong")
|
||||
|
||||
|
|
|
|||
|
|
@ -269,6 +269,7 @@ class RetrieveFindMyEpc:
|
|||
"Loft insulation": ["loft_insulation"],
|
||||
"Solar photovoltaic (PV) panels": ["solar_pv"],
|
||||
"Party wall insulation": ["party_wall_insulation"],
|
||||
'Draught proofing': ["draught_proofing"],
|
||||
}
|
||||
|
||||
survey = True
|
||||
|
|
|
|||
|
|
@ -23,41 +23,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
|||
def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
|
||||
epc_data = []
|
||||
errors = []
|
||||
no_epc = []
|
||||
for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
|
||||
postcode = home[postcode_column]
|
||||
house_number = home[address1_column]
|
||||
full_address = home[fulladdress_column]
|
||||
|
||||
searcher = SearchEpc(
|
||||
address1=str(house_number),
|
||||
postcode=postcode,
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key="",
|
||||
property_type=None,
|
||||
fast=True,
|
||||
full_address=full_address,
|
||||
max_retries=5
|
||||
)
|
||||
# Force the skipping of estimating the EPC
|
||||
searcher.ordnance_survey_client.property_type = None
|
||||
searcher.ordnance_survey_client.built_form = None
|
||||
|
||||
searcher.find_property(skip_os=True)
|
||||
if searcher.newest_epc is None:
|
||||
continue
|
||||
|
||||
# Look for EPC recommendations
|
||||
try:
|
||||
property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
|
||||
except:
|
||||
property_recommendations = {"rows": []}
|
||||
|
||||
# Retrieve data from FindMyEPC
|
||||
find_epc_searcher = RetrieveFindMyEpc(
|
||||
address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
|
||||
)
|
||||
find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
|
||||
time.sleep(np.random.uniform(0.1, 1))
|
||||
try:
|
||||
postcode = home[postcode_column]
|
||||
house_number = home[address1_column]
|
||||
|
|
@ -79,6 +46,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
|
|||
|
||||
searcher.find_property(skip_os=True)
|
||||
if searcher.newest_epc is None:
|
||||
no_epc.append(home["row_id"])
|
||||
continue
|
||||
|
||||
# Look for EPC recommendations
|
||||
|
|
@ -106,7 +74,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column):
|
|||
errors.append(home["row_id"])
|
||||
time.sleep(5)
|
||||
|
||||
return epc_data, errors
|
||||
return epc_data, errors, no_epc
|
||||
|
||||
|
||||
def extract_address1(asset_list, full_address_col, method="first_two_words"):
|
||||
|
|
@ -140,26 +108,37 @@ def app():
|
|||
Property UPRN
|
||||
|
||||
"""
|
||||
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/P&F/"
|
||||
DATA_FILENAME = "BELOW C - WFT FINDINGS ON INSPECTION PLUS SUGGESTED ACTION.xlsx"
|
||||
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
|
||||
DATA_FILENAME = "Bromford programme review.xlsx"
|
||||
SHEET_NAME = "Bromford"
|
||||
POSTCODE_COLUMN = "Postcode"
|
||||
FULLADDRESS_COLUMN = "Address"
|
||||
ADDRESS1_COLUMN = None
|
||||
FULLADDRESS_COLUMN = None
|
||||
ADDRESS1_COLUMN = "No."
|
||||
ADDRESS1_METHOD = "first_two_words"
|
||||
ADDRESS_COLS_TO_CONCAT = ["No.", "Address"]
|
||||
|
||||
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0)
|
||||
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
|
||||
asset_list = asset_list[~pd.isnull(asset_list["Postcode"])]
|
||||
asset_list["row_id"] = asset_list.index
|
||||
|
||||
# We clean up potential non-breaking spaces and double spaces
|
||||
for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
|
||||
asset_list[col] = asset_list[col].astype(str)
|
||||
asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
|
||||
asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False)
|
||||
|
||||
if ADDRESS1_COLUMN is None:
|
||||
ADDRESS1_COLUMN = "address1_extracted"
|
||||
asset_list = extract_address1(asset_list, FULLADDRESS_COLUMN, ADDRESS1_METHOD)
|
||||
asset_list = extract_address1(
|
||||
asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
|
||||
)
|
||||
|
||||
epc_data, errors = get_data(
|
||||
if FULLADDRESS_COLUMN is None:
|
||||
FULLADDRESS_COLUMN = "fulladdress_extracted"
|
||||
# We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
|
||||
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
|
||||
|
||||
epc_data, errors, no_epc = get_data(
|
||||
asset_list=asset_list,
|
||||
fulladdress_column=FULLADDRESS_COLUMN,
|
||||
address1_column=ADDRESS1_COLUMN,
|
||||
|
|
@ -168,7 +147,7 @@ def app():
|
|||
|
||||
# We now retrieve any failed properties
|
||||
asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
|
||||
epc_data_failed, _ = get_data(
|
||||
epc_data_failed, _, _ = get_data(
|
||||
asset_list=asset_list_failed,
|
||||
fulladdress_column=FULLADDRESS_COLUMN,
|
||||
address1_column=ADDRESS1_COLUMN,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue