mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
finished stonewater
This commit is contained in:
parent
496ae8c969
commit
5b9a36d6d2
2 changed files with 89 additions and 29 deletions
|
|
@ -482,15 +482,22 @@ class SearchEpc:
|
|||
if lmks_to_drop is not None:
|
||||
epc_data = epc_data[~epc_data["lmk-key"].isin(lmks_to_drop)]
|
||||
|
||||
if not epc_data.empty:
|
||||
# Further processing of the EPC data
|
||||
try:
|
||||
epc_data['lodgement-datetime'] = pd.to_datetime(
|
||||
epc_data['lodgement-datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Problem formatting lodgement-datime, appling fallback: " + str(e))
|
||||
epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce')
|
||||
|
||||
if exclude_old:
|
||||
# Exclude EPC data older than 10 years
|
||||
epc_data = epc_data[
|
||||
epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
|
||||
]
|
||||
if exclude_old:
|
||||
# Exclude EPC data older than 10 years
|
||||
epc_data = epc_data[
|
||||
epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
|
||||
]
|
||||
|
||||
if not epc_data.empty:
|
||||
# Further processing of the EPC data
|
||||
|
||||
epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
|
||||
epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
|
||||
|
|
|
|||
|
|
@ -11,7 +11,8 @@ from fuzzywuzzy import fuzz
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
import time
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet
|
||||
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
|
||||
save_dataframe_to_s3_parquet, save_pickle_to_s3
|
||||
|
||||
load_dotenv(dotenv_path="backend/.env")
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
|
|
@ -1360,7 +1361,10 @@ def compile_data_final():
|
|||
)
|
||||
p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
|
||||
|
||||
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
|
||||
if not p_os_df.empty:
|
||||
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
|
||||
else:
|
||||
searcher.ordnance_survey_client.property_type = ""
|
||||
# Now we estimate
|
||||
searcher.newest_epc = searcher.estimate_epc(
|
||||
property_type=searcher.ordnance_survey_client.property_type,
|
||||
|
|
@ -1395,20 +1399,19 @@ def compile_data_final():
|
|||
|
||||
if searcher.older_epcs is not None:
|
||||
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
|
||||
|
||||
# Store in S3
|
||||
# TODO - read in instead of running
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(epc_data_batch_2),
|
||||
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
#
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(older_epcs_batch_2),
|
||||
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.json",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
save_pickle_to_s3(
|
||||
data=epc_data_batch_2,
|
||||
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
save_pickle_to_s3(
|
||||
data=older_epcs_batch_2,
|
||||
s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
|
||||
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
|
||||
|
|
@ -1439,15 +1442,15 @@ def compile_data_final():
|
|||
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
|
||||
spatial_data_to_uprn.append(spatial_df)
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
|
||||
# TODO: Let's store this in s3
|
||||
# save_data_to_s3(
|
||||
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
|
||||
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.json",
|
||||
# save_pickle_to_s3(
|
||||
# data=spatial_data_to_uprn,
|
||||
# s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
||||
# bucket_name="retrofit-data-dev"
|
||||
# )
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
|
||||
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
||||
columns=["partition", "filename"]
|
||||
).rename(columns={"UPRN": "uprn"})
|
||||
|
|
@ -1455,10 +1458,16 @@ def compile_data_final():
|
|||
|
||||
property_attributes = complete_epcs.merge(
|
||||
spatial_data_to_uprn,
|
||||
how="left",
|
||||
how="inner",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
property_attributes = property_attributes.merge(
|
||||
asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
|
||||
)
|
||||
|
||||
# TODO: Add on data from the asset list such as ownership
|
||||
|
||||
# We drop the columns we don't care about for clustering
|
||||
property_attributes = property_attributes.drop(
|
||||
columns=[
|
||||
|
|
@ -1502,7 +1511,7 @@ def compile_data_final():
|
|||
|
||||
# Fields to transform: lodgement-datetime
|
||||
property_attributes["days_since_last_epc"] = (
|
||||
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"])
|
||||
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
|
||||
).dt.days
|
||||
|
||||
property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
|
||||
|
|
@ -1561,6 +1570,7 @@ def compile_data_final():
|
|||
"mainheatc-env-eff": "N",
|
||||
"floor-level": "NODATA!",
|
||||
"hot-water-energy-eff": "N/A",
|
||||
"glazed-type": "unknown"
|
||||
}
|
||||
|
||||
# Consolidation columns to single value
|
||||
|
|
@ -1608,6 +1618,19 @@ def compile_data_final():
|
|||
|
||||
property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
|
||||
property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
|
||||
property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
|
||||
property_attributes["days_since_last_epc"].mean()
|
||||
)
|
||||
|
||||
missings = pd.isnull(property_attributes).sum()
|
||||
missings = missings[missings > 0]
|
||||
|
||||
# Save this
|
||||
# save_pickle_to_s3(
|
||||
# data=property_attributes,
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
# )
|
||||
|
||||
# CLUSTERING!!
|
||||
|
||||
|
|
@ -1680,7 +1703,7 @@ def compile_data_final():
|
|||
)
|
||||
|
||||
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
|
||||
('kmeans', KMeans(n_clusters=10, random_state=0))])
|
||||
('kmeans', KMeans(n_clusters=450, random_state=0))])
|
||||
|
||||
# Fit the pipeline to the data
|
||||
pipeline.fit(property_attributes)
|
||||
|
|
@ -1718,6 +1741,36 @@ def compile_data_final():
|
|||
# Sorting to verify
|
||||
property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
|
||||
################################################
|
||||
# Prepare outputs!!!!
|
||||
################################################
|
||||
property_attributes.reset_index(inplace=True)
|
||||
property_attributes["archetype_representative"] = property_attributes["rank"] == 1
|
||||
|
||||
asset_list_with_archetypes = asset_list.merge(
|
||||
property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
|
||||
on="internal_id"
|
||||
)
|
||||
|
||||
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
|
||||
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
|
||||
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")
|
||||
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
|
||||
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
|
||||
|
||||
asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
|
||||
"archetype_representative"].fillna(False)
|
||||
|
||||
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
|
||||
|
||||
stonewater_uprn_lookup = asset_list_with_archetypes[
|
||||
["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
|
||||
]
|
||||
|
||||
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
|
||||
|
||||
################################################
|
||||
# Agglomerative Clustering
|
||||
################################################
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue