This commit is contained in:
Khalim Conn-Kowlessar 2024-07-01 13:35:00 +01:00
parent c5693289c3
commit 51333ff31a
3 changed files with 93 additions and 10 deletions

3
.idea/misc.xml generated
View file

@ -4,6 +4,9 @@
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PyCharmProfessionalAdvertiser">
<option name="shown" value="true" />
</component>
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -9,14 +9,16 @@ In this script, we do the following things:
3) Mapping of the archetypes
"""
import pandas as pd
import json
from utils.s3 import read_pickle_from_s3
archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
archetyped_asset_list = archetyped_asset_list[
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
archetyped_asset_list = stonewater_asset_list[
[
"internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
"internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
"archetype_representative", "rank"
]
]
].copy()
archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
# Sort
@ -28,12 +30,38 @@ clustering_features = read_pickle_from_s3(
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
)
# Reorder clustering_features so that property-type and built-form are the first two columns;
# every other column keeps its existing relative order.
columns_to_move = ["property-type", "built-form"]
remaining_columns = [
    column for column in clustering_features.columns if column not in columns_to_move
]
new_column_order = [*columns_to_move, *remaining_columns]
clustering_features = clustering_features.loc[:, new_column_order]
# Attach the clustering features to every archetyped asset. This is an inner join on internal_id,
# so assets without a row in clustering_features are dropped.
archetyped_asset_list = pd.merge(
    archetyped_asset_list, clustering_features, how="inner", on="internal_id"
)
# Rename the identifier and archetype columns to human-readable headings
archetyped_asset_list = archetyped_asset_list.rename(
    columns=dict(
        internal_id="Osm. ID",
        customer_asset_id="Org. ref.",
        external_address_id="Address ID",
        cluster="Archetype ID",
        archetype_representative="Archetype Representative",
        rank="Archetype Group Rank",
    )
)
# Cast uprn to pandas' nullable integer dtype
# NOTE(review): presumably so missing UPRNs stay as <NA> instead of forcing floats - confirm
archetyped_asset_list = archetyped_asset_list.astype({"uprn": "Int64"})
# Create an extract of the features
# Look at number of combinations
# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
# - If we look at the number of combinations of property type, built form, and walls description, this jumps
@ -50,6 +78,55 @@ archetyped_asset_list = archetyped_asset_list.merge(
# ["property-type", "built-form", "walls-description", "roof-description",
# "floor-description"]].drop_duplicates().shape
# Extract the archetype id and rank alongside the main fabric descriptors.
# "cluster" and "rank" were renamed to "Archetype ID" and "Archetype Group Rank" earlier in this
# script, so they must be selected by their new labels (the old labels raise a KeyError).
property_type_archetypes = archetyped_asset_list[
    ["Archetype ID", "Archetype Group Rank", "property-type", "built-form", "walls-description"]
]
# Save this as an Excel file
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
# Location data used for the mapping: one record per archetype representative, holding its UPRN,
# standardised address and postcode plus the longitude/latitude from the clustering features.
representative_assets = stonewater_asset_list[stonewater_asset_list["archetype_representative"]]
mapping_data = (
    representative_assets[["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
    # internal_id is the only column shared by both frames, so it is the implicit join key
    .merge(clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]])
    # internal_id is only needed for the join and is left out of the output
    .drop(columns=["internal_id"])
)
with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as mapping_file:
    mapping_records = mapping_data.to_dict(orient="records")
    mapping_file.write(json.dumps(mapping_records))
# We also include some data for visualising the breakdown of EPCs.
# Count how many rows are estimated vs. not. The index and count columns are named explicitly
# because the labels produced by value_counts().to_frame().reset_index() differ between
# pandas 1.x ("index" / "estimated") and pandas 2.x ("estimated" / "count"); relying on them
# would make the inversion below act on the counts under pandas 1.x.
proportion_of_real_epcs = (
    clustering_features["estimated"]
    .value_counts()
    .rename_axis("estimated")
    .reset_index(name="count")
)
# Invert the flag so that True means the EPC is real (i.e. not estimated)
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
proportion_of_real_epcs = proportion_of_real_epcs.rename(
    columns={"estimated": "is_real_epc"}
)
with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
    f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
# Produce the breakdown of EPC ratings, counting only rows whose EPC is not estimated.
# The rating column is labelled "EPC" and the tally "count" explicitly, so the output does not
# depend on how the installed pandas version names the result of value_counts().
epc_rating_breakdown = (
    clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
    .value_counts()
    .rename_axis("EPC")
    .reset_index(name="count")
)
with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
    f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
# Properties whose non-estimated EPC rating is "A", joined back to the full asset list on
# internal_id (inner join, so only assets present in both frames are kept).
is_rated_a = clustering_features["current-energy-rating"] == "A"
is_not_estimated = ~clustering_features["estimated"]
epc_a_properties = pd.merge(
    clustering_features[is_rated_a & is_not_estimated],
    stonewater_asset_list,
    how="inner",
    on="internal_id",
)

View file

@ -678,7 +678,8 @@ def compile_data():
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
header=4
)
udprn_data = pd.read_excel(
@ -1128,7 +1129,8 @@ def compile_data_final():
########################################################################
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
header=4
)
udprn_data = pd.read_excel(
@ -1788,12 +1790,13 @@ def compile_data_final():
property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
# Drop the description columns that are the keys in cleaned
print("PUT ME BACK!!??")
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
# Perform the mapping
# CLUSTERING!!
grouping_columns = [
'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type'
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
]
# Define the preprocessing for numerical and categorical features