mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added handling of some additional cases in sap description cleaning
This commit is contained in:
parent
0875213779
commit
e9366c72e8
10 changed files with 126 additions and 164 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,7 +3,7 @@
|
|||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (backend)" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
48
etl/customers/stonewater/outputs 27th June 2024.py
Normal file
48
etl/customers/stonewater/outputs 27th June 2024.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
"""
|
||||
This script prepares some outputs for the stonewater project, 27th June 2024
|
||||
|
||||
The work done so far has been data cleaning and clustering.
|
||||
In this script, we do the following things:
|
||||
|
||||
1) Match the clustering data to the archetypes
|
||||
2) Do some basic analysis on the data
|
||||
3) Mapping of the archetypes
|
||||
"""
|
||||
import pandas as pd
|
||||
from utils.s3 import read_pickle_from_s3
|
||||
|
||||
archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv")
|
||||
archetyped_asset_list = archetyped_asset_list[
|
||||
[
|
||||
"internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
|
||||
]
|
||||
]
|
||||
archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
|
||||
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
|
||||
# Sort
|
||||
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
|
||||
|
||||
# Read in and merge on clustering features
|
||||
clustering_features = read_pickle_from_s3(
|
||||
bucket_name="retrofit-data-dev",
|
||||
s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
)
|
||||
|
||||
archetyped_asset_list = archetyped_asset_list.merge(
|
||||
clustering_features,
|
||||
on="internal_id",
|
||||
how="inner"
|
||||
)
|
||||
|
||||
property_type_archetypes = archetyped_asset_list[
|
||||
["cluster", "rank", "property-type", "built-form", "walls-description"]]
|
||||
|
||||
# Key variables for separation:
|
||||
# - property-type
|
||||
# - built-form
|
||||
# - walls-description
|
||||
# - roof-description
|
||||
|
||||
clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape
|
||||
|
||||
clustering_features["walls-description"].value_counts()
|
||||
|
|
@ -1633,58 +1633,60 @@ def compile_data_final():
|
|||
# )
|
||||
|
||||
# from utils.s3 import read_pickle_from_s3
|
||||
# data = read_pickle_from_s3(
|
||||
# property_attributes = read_pickle_from_s3(
|
||||
# bucket_name="retrofit-data-dev",
|
||||
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
|
||||
# )
|
||||
|
||||
# CLUSTERING!!
|
||||
# We perform some additional cleaning on the data
|
||||
import msgpack
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
# from sklearn.cluster import KMeans
|
||||
# from sklearn.preprocessing import OneHotEncoder
|
||||
# from scipy.spatial.distance import cdist
|
||||
#
|
||||
# property_attributes.set_index('internal_id', inplace=True)
|
||||
#
|
||||
# # Step 1: Prepare the data
|
||||
# # Identify categorical columns (you might need to adjust this)
|
||||
# categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
# for col in categorical_cols:
|
||||
# property_attributes[col] = property_attributes[col].astype(str)
|
||||
#
|
||||
# # Applying OneHotEncoder
|
||||
# encoder = OneHotEncoder(sparse=False)
|
||||
# encoded_cats = encoder.fit_transform(property_attributes[categorical_cols])
|
||||
#
|
||||
# # Creating a new DataFrame with encoded categorical data and original numerical data
|
||||
# numerical_data = property_attributes.select_dtypes(include=[np.number])
|
||||
# data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1)
|
||||
#
|
||||
# # Convert all column names to strings to satisfy KMeans requirements
|
||||
# data_for_clustering.columns = data_for_clustering.columns.astype(str)
|
||||
#
|
||||
# # Step 2: K-Means Clustering
|
||||
# k = 450 # number of clusters
|
||||
# kmeans = KMeans(n_clusters=k, random_state=0)
|
||||
# property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering)
|
||||
#
|
||||
# # Extracting centroids
|
||||
# centroids = kmeans.cluster_centers_
|
||||
#
|
||||
# # Step 3: Assign clusters and rank rows
|
||||
# # Calculating distances from each point to its cluster's centroid
|
||||
# distances = cdist(data_for_clustering, centroids, 'euclidean')
|
||||
# min_distances = distances.min(axis=1)
|
||||
# property_attributes['distance_to_centroid'] = min_distances
|
||||
#
|
||||
# # Ranking rows by distance within each cluster
|
||||
# property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first')
|
||||
#
|
||||
# # Sorting to verify
|
||||
# property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
|
||||
#
|
||||
# # Optional: Displaying the dataframe
|
||||
# print(property_attributes.head())
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
|
||||
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
|
||||
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
|
||||
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
|
||||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||||
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
|
||||
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
|
||||
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
|
||||
|
||||
cleaners = {
|
||||
"floor-description": FloorAttributes,
|
||||
'hotwater-description': HotWaterAttributes,
|
||||
'main-fuel': MainFuelAttributes,
|
||||
'mainheat-description': MainHeatAttributes,
|
||||
'mainheatcont-description': MainheatControlAttributes,
|
||||
'roof-description': RoofAttributes,
|
||||
'walls-description': WallAttributes,
|
||||
'windows-description': WindowAttributes,
|
||||
'lighting-description': LightingAttributes
|
||||
}
|
||||
for variable_to_clean in cleaned.keys():
|
||||
unique_descriptions = property_attributes[variable_to_clean].unique()
|
||||
clean_df = pd.DataFrame(cleaned[variable_to_clean])
|
||||
# Check if we have any descriptions that are missing from the cleaned lookup
|
||||
missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
|
||||
if missed:
|
||||
descriptions_to_append = []
|
||||
for description in missed:
|
||||
if variable_to_clean == "lighting-description":
|
||||
cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
|
||||
else:
|
||||
cln = cleaners[variable_to_clean](description)
|
||||
to_append = {
|
||||
"original_description": description,
|
||||
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
|
||||
**cln.process()
|
||||
}
|
||||
descriptions_to_append.append(to_append)
|
||||
|
||||
# CLUSTERING!!
|
||||
|
||||
from sklearn.cluster import KMeans
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
|
|
@ -1777,110 +1779,6 @@ def compile_data_final():
|
|||
|
||||
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
|
||||
|
||||
################################################
|
||||
# Agglomerative Clustering
|
||||
################################################
|
||||
|
||||
# from sklearn.cluster import KMeans, AgglomerativeClustering
|
||||
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
# from sklearn.compose import ColumnTransformer
|
||||
# from sklearn.pipeline import Pipeline
|
||||
# from scipy.spatial.distance import cdist
|
||||
# import numpy as np
|
||||
# from collections import Counter
|
||||
#
|
||||
# id_column = 'internal_id'
|
||||
# property_attributes.set_index(id_column, inplace=True)
|
||||
#
|
||||
# # Define the preprocessing for numerical and categorical features
|
||||
# numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
|
||||
# categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
|
||||
#
|
||||
# for col in categorical_features:
|
||||
# property_attributes[col] = property_attributes[col].astype(str)
|
||||
#
|
||||
# preprocessor = ColumnTransformer(
|
||||
# transformers=[
|
||||
# ('num', StandardScaler(), numerical_features),
|
||||
# ('cat', OneHotEncoder(sparse_output=False), categorical_features)
|
||||
# ]
|
||||
# )
|
||||
#
|
||||
# # Function to perform clustering and merge small clusters
|
||||
# def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5):
|
||||
# while True:
|
||||
# # Preprocess the data
|
||||
# processed_data = preprocessor.fit_transform(data)
|
||||
#
|
||||
# # Initial clustering
|
||||
# clustering = AgglomerativeClustering(n_clusters=n_clusters)
|
||||
# labels = clustering.fit_predict(processed_data)
|
||||
#
|
||||
# # Check cluster sizes
|
||||
# cluster_counts = Counter(labels)
|
||||
#
|
||||
# # Find clusters smaller than min_size
|
||||
# small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size}
|
||||
#
|
||||
# if not small_clusters:
|
||||
# break
|
||||
#
|
||||
# # Merge small clusters
|
||||
# for cluster in small_clusters:
|
||||
# # Find the nearest cluster to merge with
|
||||
# cluster_data = processed_data[labels == cluster]
|
||||
# other_clusters = [i for i in range(n_clusters) if i not in small_clusters]
|
||||
# other_cluster_data = [processed_data[labels == i] for i in other_clusters]
|
||||
# other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data])
|
||||
#
|
||||
# distances = cdist(cluster_data, other_centroids).mean(axis=0)
|
||||
# closest_cluster = other_clusters[np.argmin(distances)]
|
||||
#
|
||||
# labels[labels == cluster] = closest_cluster
|
||||
#
|
||||
# n_clusters -= len(small_clusters)
|
||||
#
|
||||
# return labels
|
||||
#
|
||||
# # Perform clustering with minimum size constraint
|
||||
# n_clusters = 10
|
||||
# min_size = 5
|
||||
# property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size)
|
||||
#
|
||||
# # Filter out empty clusters
|
||||
# valid_clusters = property_attributes['cluster'].unique()
|
||||
#
|
||||
# # Get centroids for the resulting clusters
|
||||
# processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"]))
|
||||
# centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters])
|
||||
#
|
||||
# # Calculate distances from each point to the centroid of its cluster
|
||||
# distances_to_centroids = [
|
||||
# cdist(processed_data[i].reshape(1, -1),
|
||||
# centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0]
|
||||
# for i, label in enumerate(property_attributes['cluster'])
|
||||
# ]
|
||||
#
|
||||
# property_attributes['distance_to_centroid'] = distances_to_centroids
|
||||
#
|
||||
# # Verify that at least one point in each cluster has zero distance to the centroid
|
||||
# for cluster_id in valid_clusters:
|
||||
# cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
|
||||
# min_distance = cluster_data['distance_to_centroid'].min()
|
||||
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
|
||||
# if min_distance != 0:
|
||||
# print(f"No point with zero distance found in cluster {cluster_id}")
|
||||
#
|
||||
# # Rank the distances within each cluster
|
||||
# property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \
|
||||
# .rank(method='first')
|
||||
#
|
||||
# # Reset index to get 'internal_id' back
|
||||
# property_attributes.reset_index(inplace=True)
|
||||
#
|
||||
# # Display the DataFrame
|
||||
# print(property_attributes)
|
||||
|
||||
|
||||
def pull_ideal_postcodes(missing_uprn_with_udprn):
|
||||
api_key = "" # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class FloorAttributes(Definitions):
|
|||
self.description: str = description.lower()
|
||||
|
||||
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
|
||||
description in self.OBSERVED_ERRORS)
|
||||
description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor")
|
||||
|
||||
# Try to perform a translation, in case it's in Welsh
|
||||
self.translate_welsh_text()
|
||||
|
|
|
|||
|
|
@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions):
|
|||
def __init__(self, description: str):
|
||||
self.description: str = clean_description(description.lower()).strip()
|
||||
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
|
||||
self.description == "sap05 hot-water"
|
||||
)
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
import re
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
|
||||
from etl.epc_clean.utils import correct_spelling
|
||||
|
||||
|
||||
class LightingAttributes:
|
||||
class LightingAttributes(Definitions):
|
||||
WELSH_TEXT = {
|
||||
"goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets",
|
||||
"dim goleuadau ynni-isel": "no low energy lighting",
|
||||
"goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets'
|
||||
}
|
||||
|
||||
OBSERVED_ERRORS = []
|
||||
|
||||
def __init__(self, description, averages):
|
||||
self.description: str = clean_description(description.lower())
|
||||
|
||||
|
|
@ -18,6 +21,9 @@ class LightingAttributes:
|
|||
self.description = correct_spelling(self.description)
|
||||
self.averages = averages
|
||||
|
||||
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
|
||||
description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting")
|
||||
|
||||
def welsh_translation_search(self):
|
||||
"""
|
||||
For welsh text describing the percentage of low energy lighting, we match the regular
|
||||
|
|
@ -40,6 +46,9 @@ class LightingAttributes:
|
|||
|
||||
description = self.description
|
||||
|
||||
if self.nodata:
|
||||
return {"low_energy_proportion": None}
|
||||
|
||||
if 'no low energy lighting' in description:
|
||||
return {"low_energy_proportion": 0}
|
||||
|
||||
|
|
|
|||
|
|
@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions):
|
|||
|
||||
self.description: str = clean_description(self.description).strip()
|
||||
# Remove special characters
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or (
|
||||
description == "SAP05:Main-Heating"
|
||||
)
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
|
|
@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions):
|
|||
|
||||
self.process_edge_cases()
|
||||
|
||||
if (not description or not any(
|
||||
rt in self.description for rt in
|
||||
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
|
||||
) and not self.is_edge_case):
|
||||
raise ValueError('Invalid description')
|
||||
if not self.nodata:
|
||||
if (not description or not any(
|
||||
rt in self.description for rt in
|
||||
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
|
||||
) and not self.is_edge_case):
|
||||
raise ValueError('Invalid description')
|
||||
|
||||
def process_edge_cases(self) -> (dict, bool):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions):
|
|||
|
||||
def __init__(self, description: str):
|
||||
self.description: str = clean_description(description.lower()).strip()
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
|
||||
description == "SAP05:Main-Heating-Controls"
|
||||
)
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ class WindowAttributes(Definitions):
|
|||
|
||||
# In the case of an empty description, we want to return a dictionary with all values set to False
|
||||
# and indicate there was no data
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
|
||||
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows"
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue