finished stonewater

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-13 11:28:12 +01:00
parent 496ae8c969
commit 5b9a36d6d2
2 changed files with 89 additions and 29 deletions

View file

@ -482,15 +482,22 @@ class SearchEpc:
if lmks_to_drop is not None:
epc_data = epc_data[~epc_data["lmk-key"].isin(lmks_to_drop)]
if not epc_data.empty:
# Further processing of the EPC data
try:
epc_data['lodgement-datetime'] = pd.to_datetime(
epc_data['lodgement-datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)
except Exception as e:
logger.error("Problem formatting lodgement-datime, appling fallback: " + str(e))
epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce')
if exclude_old:
# Exclude EPC data older than 10 years
epc_data = epc_data[
epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
]
if exclude_old:
# Exclude EPC data older than 10 years
epc_data = epc_data[
epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
]
if not epc_data.empty:
# Further processing of the EPC data
epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))

View file

@ -11,7 +11,8 @@ from fuzzywuzzy import fuzz
import numpy as np
import pandas as pd
import time
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet
from utils.s3 import save_data_to_s3, read_excel_from_s3, read_from_s3, read_dataframe_from_s3_parquet, \
save_dataframe_to_s3_parquet, save_pickle_to_s3
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@ -1360,7 +1361,10 @@ def compile_data_final():
)
p_os_df = p_os_df[p_os_df["UPRN"] == property["uprn"]]
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
if not p_os_df.empty:
searcher.ordnance_survey_client.parse_classification_code(p_os_df["CLASSIFICATION_CODE"].values[0])
else:
searcher.ordnance_survey_client.property_type = ""
# Now we estimate
searcher.newest_epc = searcher.estimate_epc(
property_type=searcher.ordnance_survey_client.property_type,
@ -1395,20 +1399,19 @@ def compile_data_final():
if searcher.older_epcs is not None:
older_epcs_batch_2[property["internal_id"]] = searcher.older_epcs
# Store in S3
# TODO - read in instead of running
# save_data_to_s3(
# data=json.dumps(epc_data_batch_2),
# s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.json",
# bucket_name="retrofit-data-dev"
# )
#
# save_data_to_s3(
# data=json.dumps(older_epcs_batch_2),
# s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.json",
# bucket_name="retrofit-data-dev"
# )
save_pickle_to_s3(
data=epc_data_batch_2,
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
save_pickle_to_s3(
data=older_epcs_batch_2,
s3_file_name="customers/Stonewater/clustering/older_epcs_batch_2.pkl",
bucket_name="retrofit-data-dev"
)
epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
complete_epcs = pd.concat([epc_data, epc_data_batch_2])
@ -1439,15 +1442,15 @@ def compile_data_final():
spatial_df = spatial_data[spatial_data["UPRN"].isin(associated_uprn)]
spatial_data_to_uprn.append(spatial_df)
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
# TODO: Let's store this in s3
# save_data_to_s3(
# data=json.dumps(spatial_data_to_uprn.to_dict("records")),
# s3_file_name="customers/Stonewater/clustering/spatial_data_to_uprn.json",
# save_pickle_to_s3(
# data=spatial_data_to_uprn,
# s3_file_name="customers/Stonewater/clustering/spatial_data_to_uprn.pkl",
# bucket_name="retrofit-data-dev"
# )
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
spatial_data_to_uprn = spatial_data_to_uprn.drop(
columns=["partition", "filename"]
).rename(columns={"UPRN": "uprn"})
@ -1455,10 +1458,16 @@ def compile_data_final():
property_attributes = complete_epcs.merge(
spatial_data_to_uprn,
how="left",
how="inner",
on="uprn"
)
property_attributes = property_attributes.merge(
asset_list[["internal_id", "owner", "match_type"]], how="left", on="internal_id"
)
# TODO: Add on data from the asset list such as ownership
# We drop the columns we don't care about for clustering
property_attributes = property_attributes.drop(
columns=[
@ -1502,7 +1511,7 @@ def compile_data_final():
# Fields to transform: lodgement-datetime
property_attributes["days_since_last_epc"] = (
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"])
datetime.now() - pd.to_datetime(property_attributes["lodgement-datetime"], errors="coerce")
).dt.days
property_attributes = property_attributes.drop(columns=["lodgement-datetime"])
@ -1561,6 +1570,7 @@ def compile_data_final():
"mainheatc-env-eff": "N",
"floor-level": "NODATA!",
"hot-water-energy-eff": "N/A",
"glazed-type": "unknown"
}
# Consolidate columns to a single value
@ -1608,6 +1618,19 @@ def compile_data_final():
property_attributes["estimated"] = property_attributes["estimated"].fillna(False)
property_attributes["conservation_status"] = property_attributes["conservation_status"].fillna(False)
property_attributes["days_since_last_epc"] = property_attributes["days_since_last_epc"].fillna(
property_attributes["days_since_last_epc"].mean()
)
missings = pd.isnull(property_attributes).sum()
missings = missings[missings > 0]
# Save this
# save_pickle_to_s3(
# data=property_attributes,
# bucket_name="retrofit-data-dev",
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
# )
# CLUSTERING!!
@ -1680,7 +1703,7 @@ def compile_data_final():
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('kmeans', KMeans(n_clusters=10, random_state=0))])
('kmeans', KMeans(n_clusters=450, random_state=0))])
# Fit the pipeline to the data
pipeline.fit(property_attributes)
@ -1718,6 +1741,36 @@ def compile_data_final():
# Sorting to verify
property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
################################################
# Prepare outputs!!!!
################################################
property_attributes.reset_index(inplace=True)
property_attributes["archetype_representative"] = property_attributes["rank"] == 1
asset_list_with_archetypes = asset_list.merge(
property_attributes[["internal_id", "cluster", "archetype_representative", "rank"]], how="left",
on="internal_id"
)
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].fillna(-999)
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].astype(int).astype(str)
asset_list_with_archetypes["cluster"] = asset_list_with_archetypes["cluster"].replace("-999", "NO ARCHETYPE")
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].fillna(-999)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].astype(int).astype(str)
asset_list_with_archetypes["rank"] = asset_list_with_archetypes["rank"].replace("-999", "NO ARCHETYPE")
asset_list_with_archetypes["archetype_representative"] = asset_list_with_archetypes[
"archetype_representative"].fillna(False)
asset_list_with_archetypes.to_csv("Stonewater asset list with archetypes.csv", index=False)
stonewater_uprn_lookup = asset_list_with_archetypes[
["external_address_id", "udprn", "uprn", "match_type", "standardised_address", "standardised_postcode"]
]
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
################################################
# Agglomerative Clustering
################################################