mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
minor
This commit is contained in:
parent
c5693289c3
commit
51333ff31a
3 changed files with 93 additions and 10 deletions
3
.idea/misc.xml
generated
3
.idea/misc.xml
generated
|
|
@ -4,6 +4,9 @@
|
|||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PyCharmProfessionalAdvertiser">
|
||||
<option name="shown" value="true" />
|
||||
</component>
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -9,14 +9,16 @@ In this script, we do the following things:
|
|||
3) Mapping of the archetypes
|
||||
"""
|
||||
import pandas as pd
|
||||
import json
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
|
||||
archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
|
||||
archetyped_asset_list = archetyped_asset_list[
|
||||
stonewater_asset_list = pd.read_csv("Stonewater asset list with archetypes V2.csv")
|
||||
archetyped_asset_list = stonewater_asset_list[
|
||||
[
|
||||
"internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
|
||||
"internal_id", "customer_asset_id", "external_address_id", "udprn", "uprn", "cluster",
|
||||
"archetype_representative", "rank"
|
||||
]
|
||||
]
|
||||
].copy()
|
||||
archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
|
||||
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
|
||||
# Sort
|
||||
|
|
@ -28,12 +30,38 @@ clustering_features = read_pickle_from_s3(
|
|||
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
)
|
||||
|
||||
# Move property-type and built-form to the first two columns
|
||||
columns_to_move = ['property-type', 'built-form']
|
||||
|
||||
# Get the remaining columns
|
||||
remaining_columns = [col for col in clustering_features.columns if col not in columns_to_move]
|
||||
|
||||
# Create the new column order
|
||||
new_column_order = columns_to_move + remaining_columns
|
||||
|
||||
# Reorder the DataFrame
|
||||
clustering_features = clustering_features[new_column_order]
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.merge(
|
||||
clustering_features,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.rename(
|
||||
columns={
|
||||
"internal_id": "Osm. ID",
|
||||
"customer_asset_id": "Org. ref.",
|
||||
"external_address_id": "Address ID",
|
||||
"cluster": "Archetype ID",
|
||||
"archetype_representative": "Archetype Representative",
|
||||
"rank": "Archetype Group Rank",
|
||||
}
|
||||
)
|
||||
archetyped_asset_list["uprn"] = archetyped_asset_list["uprn"].astype('Int64')
|
||||
# Create an extract of the features
|
||||
|
||||
|
||||
# Look at number of combinations
|
||||
# - If we look at the number of combinations of property type & built form, we have 25 unique combinations
|
||||
# - If we look at the number of combinations of property type, built form, and walls description, this jumps
|
||||
|
|
@ -50,6 +78,55 @@ archetyped_asset_list = archetyped_asset_list.merge(
|
|||
# ["property-type", "built-form", "walls-description", "roof-description",
|
||||
# "floor-description"]].drop_duplicates().shape
|
||||
|
||||
property_type_archetypes = archetyped_asset_list[
|
||||
["cluster", "rank", "property-type", "built-form", "walls-description"]
|
||||
]
|
||||
# Save this as an Excel file
|
||||
# archetyped_asset_list.to_excel("Stonewater Archetyping Features.xlsx", index=False)
|
||||
|
||||
# We store the location data, which will be used for the mapping. We just need the longitude and latitude
|
||||
mapping_data = stonewater_asset_list[
|
||||
stonewater_asset_list["archetype_representative"]
|
||||
][["internal_id", "uprn", "standardised_address", "standardised_postcode"]]
|
||||
|
||||
mapping_data = mapping_data.merge(
|
||||
clustering_features[["internal_id", "LONGITUDE", "LATITUDE"]],
|
||||
)
|
||||
mapping_data = mapping_data.drop(columns=["internal_id"])
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater Mapping Data.json", "w") as f:
|
||||
f.write(json.dumps(mapping_data.to_dict(orient="records")))
|
||||
|
||||
# We also include some data for visualising the breakdown of EPCs
|
||||
proportion_of_real_epcs = clustering_features["estimated"].value_counts().to_frame().reset_index()
|
||||
# Invert the boolean so that it reads as "is a real EPC" rather than "is estimated"
|
||||
proportion_of_real_epcs["estimated"] = ~proportion_of_real_epcs["estimated"]
|
||||
proportion_of_real_epcs = proportion_of_real_epcs.rename(
|
||||
columns={"estimated": "is_real_epc"}
|
||||
)
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater real EPC breakdown.json", "w") as f:
|
||||
f.write(json.dumps(proportion_of_real_epcs.to_dict(orient="records")))
|
||||
|
||||
# Produce the breakdown of EPC ratings
|
||||
epc_rating_breakdown = (
|
||||
clustering_features[~clustering_features["estimated"]]["current-energy-rating"]
|
||||
.value_counts()
|
||||
.to_frame()
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
epc_rating_breakdown = epc_rating_breakdown.rename(
|
||||
columns={"current-energy-rating": "EPC"}
|
||||
)
|
||||
|
||||
with open("etl/customers/stonewater/map_app/Stonewater EPC rating breakdown.json", "w") as f:
|
||||
f.write(json.dumps(epc_rating_breakdown.to_dict(orient="records")))
|
||||
|
||||
epc_a_properties = clustering_features[
|
||||
(clustering_features["current-energy-rating"] == "A")
|
||||
& (~clustering_features["estimated"])
|
||||
]
|
||||
|
||||
epc_a_properties = epc_a_properties.merge(
|
||||
stonewater_asset_list,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -678,7 +678,8 @@ def compile_data():
|
|||
# )[["AddressId", "UDPRN"]].rename(columns={"AddressId": "external_address_id"})
|
||||
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
|
|
@ -1128,7 +1129,8 @@ def compile_data_final():
|
|||
########################################################################
|
||||
|
||||
asset_list = pd.read_excel(
|
||||
"/Users/khalimconn-kowlessar/Downloads/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx", header=4
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
|
||||
header=4
|
||||
)
|
||||
|
||||
udprn_data = pd.read_excel(
|
||||
|
|
@ -1788,12 +1790,13 @@ def compile_data_final():
|
|||
property_attributes[to_col] = property_attributes[to_col].fillna("unknown")
|
||||
|
||||
# Drop the description columns that are the keys in cleaned
|
||||
print("PUT ME BACK!!??")
|
||||
property_attributes = property_attributes.drop(columns=list(cleaned.keys()))
|
||||
# Perform the mapping
|
||||
|
||||
# CLUSTERING!!
|
||||
grouping_columns = [
|
||||
'is_cavity_wall', 'is_solid_brick', 'built-form', 'property-type'
|
||||
'is_cavity_wall', 'is_solid_brick', 'property-type', 'is_pitched', 'is_flat', 'has_dwelling_above'
|
||||
]
|
||||
|
||||
# Define the preprocessing for numerical and categorical features
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue