working on stonewater matching algorithm

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-16 15:49:08 +00:00
parent 31c5935577
commit dc1cf6d604
3 changed files with 171 additions and 9 deletions

View file

@ -229,7 +229,3 @@ def app():
filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov "
"2024.xlsx")
asset_list.to_excel(filename, index=False)
asset_list["% of the Roof with PV"].value_counts()
asset_list[asset_list["% of the Roof with PV"] == "50.0"][["Address", "Postcode"]]

View file

@ -117,7 +117,7 @@ def extract_summary_report(pdf_path):
- Fuel Bill
- Address
"""
data = {
"Address": None,
"Postcode": None,
@ -1618,5 +1618,136 @@ def append_stonewater_id():
index=False
)
# EPC bands that the tier labelling treats as treatable within the bid.
_TREATABLE_EPC_BANDS = ["D", "E", "F", "G"]


def _label_confidence_tier(region_assets, surveyed, tier_label, conflict_message):
    """
    Attach surveyed EPC bands to the assets of one region and label the treatable ones.

    :param region_assets: Assets of a single postal region. Must already contain the "Archetype ID",
        "Current EPC Band" and "Confidence Tier" columns.
    :param surveyed: Survey rows to match against (needs "Archetype ID" and "Current EPC Band").
    :param tier_label: Value written to "Confidence Tier" for assets whose matched band is treatable.
    :param conflict_message: Message of the NotImplementedError raised when one archetype was surveyed
        with more than one distinct EPC band (no tie-break rule exists yet).
    :return: A new DataFrame; "Confidence Tier" and "Current EPC Band" are only filled in where they
        were previously empty or where the matched band is treatable.
    """
    bands = surveyed[["Archetype ID", "Current EPC Band"]].drop_duplicates()
    # After de-duplicating, a repeated archetype means conflicting bands for the same archetype
    if bands["Archetype ID"].duplicated().any():
        raise NotImplementedError(conflict_message)

    labelled = region_assets.merge(
        bands.rename(columns={"Current EPC Band": "_matched_band"}),
        on="Archetype ID",
        how="left",
    )
    labelled["Confidence Tier"] = np.where(
        labelled["_matched_band"].isin(_TREATABLE_EPC_BANDS),
        tier_label,
        labelled["Confidence Tier"],
    )
    # Keep a band found by an earlier (higher confidence) pass, otherwise take the new match
    labelled["Current EPC Band"] = labelled["Current EPC Band"].fillna(labelled["_matched_band"])
    return labelled.drop(columns=["_matched_band"])


def propsed_wave_3_sample():
    """
    Label every Stonewater asset with how confident we are that it is treatable within the wave 3 bid.

    Stonewater want to ensure that, when selecting properties for wave 3, they choose properties such
    that most of the properties within a geographical area are treatable within the bid. Namely, if we
    take a geographical area (here the postal region, i.e. the first part of the postcode) they want
    most, and ideally all, of the properties within that area to be included within the bid.

    Tier definitions:
        Tier 1: a property in the same postal region and same archetype was surveyed and is EPC D-G
        Tier 2: a property in the same archetype (any region) was surveyed and is EPC D-G
    Properties for which no surveyed EPC band could be found are left without a tier; matching them to
    a sufficiently similar surveyed property is not implemented yet.

    :return: DataFrame with one row per asset, carrying the asset columns plus "Current EPC Band" and
        "Confidence Tier". Assets without a postcode are not included.
    :raises ValueError: if joining the asset features onto the survey results duplicates survey rows
    :raises NotImplementedError: if a single archetype was surveyed with conflicting EPC bands
    """
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
        "- Archetyped V3.1.xlsx",
        header=4
    )
    # Clean address ids: drop blank rows and repeated header rows. Copy so that the column assignments
    # below act on an independent frame rather than on a slice of the raw sheet.
    has_real_id = asset_list["Address ID"].notna() & (asset_list["Address ID"] != "Address ID")
    asset_list = asset_list[has_real_id].copy()
    asset_list["Address ID"] = asset_list["Address ID"].astype(int)
    # Create the postal region, taking the first part of the postcode
    asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
    # Keep just the columns we need
    asset_list = asset_list[
        ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type",
         "Heating"]
    ]
    # A missing postcode gives a NaN region which can never match anything, so skip it
    unique_postal_regions = asset_list["Postal Region"].dropna().unique()

    survey_results = pd.read_excel(
        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.24.xlsx"),
        header=13,
        sheet_name="Modelled Packages"
    )
    # TODO: We probably want the actual surveyed wall, roof, heating type
    survey_results = survey_results[
        ["Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode"]
    ].copy()
    survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]

    survey_results_with_original_features = survey_results.merge(
        asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
        on="Address ID",
        how="left"
    )
    # A left join only grows when "Address ID" is duplicated in the asset list
    if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
        raise ValueError("Something went wrong")

    labelled_regions = []
    for region in unique_postal_regions:
        # Take all of the properties in that region
        region_assets = asset_list[asset_list["Postal Region"] == region].copy()
        region_assets["Current EPC Band"] = None
        region_assets["Confidence Tier"] = None
        archetypes = region_assets["Archetype ID"].unique()

        # Tier 1: surveyed properties of the same archetype within this region
        surveyed_in_region = survey_results[
            survey_results["Archetype ID"].isin(archetypes) &
            (survey_results["Postal Region"] == region)
        ]
        region_assets = _label_confidence_tier(region_assets, surveyed_in_region, "1", "Fix me")

        # Tier 2: archetypes with no survey in this region, matched against surveys from anywhere
        missed_archetypes = set(archetypes) - set(surveyed_in_region["Archetype ID"])
        surveyed_elsewhere = survey_results[survey_results["Archetype ID"].isin(list(missed_archetypes))]
        region_assets = _label_confidence_tier(
            region_assets, surveyed_elsewhere, "2 - same archetype", "Fix me 2"
        )

        # No EPC band at all means that this archetype was never surveyed, so we need to find a
        # sufficiently similar property. (Properties whose archetype was surveyed at band A-C have a
        # band but no tier, and are deliberately not revisited here.)
        missed_addressids = region_assets.loc[
            region_assets["Current EPC Band"].isna(), "Address ID"
        ].unique().tolist()
        for a_id in missed_addressids:
            # .iloc[0] always yields a single row, even if an Address ID were duplicated
            target = asset_list[asset_list["Address ID"] == a_id].iloc[0]
            same_property_type = (
                survey_results_with_original_features["Property Type"] == target["Property Type"]
            )
            surveyed_same_postcode = survey_results_with_original_features[
                same_property_type &
                (survey_results_with_original_features["Postcode"] == target["Postcode"])
            ]
            surveyed_same_region = survey_results_with_original_features[
                same_property_type &
                (survey_results_with_original_features["Postal Region"] == target["Postal Region"])
            ]
            # TODO: pick the best candidate from surveyed_same_postcode / surveyed_same_region and use
            #  it to assign a lower confidence tier; the candidates are not used yet

        labelled_regions.append(region_assets)

    if not labelled_regions:
        # Nothing to label: return an empty frame with the same columns as the normal result
        return asset_list.iloc[0:0].assign(**{"Current EPC Band": None, "Confidence Tier": None})
    return pd.concat(labelled_regions, ignore_index=True)
# if __name__ == "__main__":
# main()

View file

@ -206,6 +206,14 @@ def app():
# Drop the column that is ""
transformed_df = transformed_df.drop(columns=[""])
# Get the find my epc data
find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
pd.json_normalize(epc_df["find_my_epc_data"])
)
# We check if we get the solar pv column:
if "Solar photovoltaics" not in find_my_epc_data.columns:
find_my_epc_data["Solar photovoltaics"] = False
# Retrieve just the data we need
epc_df = epc_df[
[
@ -228,6 +236,7 @@ def app():
"mainheat-description",
#
"energy-consumption-current", # kwh/m2
"photo-supply",
]
]
@ -236,12 +245,25 @@ def app():
how="left",
on="row_id"
).merge(
transformed_df,
find_my_epc_data[
[
"row_id", "heating_text", "hot_water_text", 'Assessors name',
"Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
"Assessors ID", "Solar photovoltaics"
]
].rename(
columns={
"Solar photovoltaics": "Has Solar PV",
"heating_text": "Heating Estimated kWh",
"hot_water_text": "Hot Water Estimated kWh",
}
),
how="left",
on="row_id"
)
asset_list = asset_list.drop(columns=["row_id"])
asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
asset_list = asset_list.drop(columns=["photo-supply"])
# Rename the columns
asset_list = asset_list.rename(columns={
@ -259,7 +281,7 @@ def app():
"mainheat-description": "Heating Type",
"secondheat-description": "Secondary Heating",
"transaction-type": "Reason for last EPC",
"energy-consumption-current": "Heat Demand (kWh/m2)"
"energy-consumption-current": "Heat Demand (kWh/m2)",
})
asset_list["Estimated Number of Floors"] = asset_list.apply(
@ -295,6 +317,19 @@ def app():
axis=1
)
# For all of the columns in transformed_df, prefix with "Recommendation: "
for col in transformed_df.columns:
if col == "row_id":
continue
transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})
asset_list = asset_list.merge(
transformed_df,
how="left",
on="row_id"
)
asset_list = asset_list.drop(columns=["row_id"])
# Store as an excel
filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
asset_list.to_excel(filename, index=False)