diff --git a/.idea/Model.iml b/.idea/Model.iml index 4413bb06..b0f9c00d 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 6f308057..1122b380 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/customers/stonewater/outputs 27th June 2024.py b/etl/customers/stonewater/outputs 27th June 2024.py new file mode 100644 index 00000000..ebb6fc5b --- /dev/null +++ b/etl/customers/stonewater/outputs 27th June 2024.py @@ -0,0 +1,48 @@ +""" +This script prepares some outputs for the stonewater project, 27th June 2024 + +The work done so far has been data cleaning and clustering. +In this script, we do the following things: + +1) Match the clustering data to the archetypes +2) Do some basic analysis on the data +3) Mapping of the archetypes +""" +import pandas as pd +from utils.s3 import read_pickle_from_s3 + +archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv") +archetyped_asset_list = archetyped_asset_list[ + [ + "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank" + ] +] +archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"] +archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int) +# Sort +archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"]) + +# Read in and merge on clustering features +clustering_features = read_pickle_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl" +) + +archetyped_asset_list = archetyped_asset_list.merge( + clustering_features, + on="internal_id", + how="inner" +) + +property_type_archetypes = archetyped_asset_list[ + ["cluster", "rank", "property-type", "built-form", "walls-description"]] + +# Key variables for separation: +# - property-type +# - built-form +# - walls-description +# - roof-description + +clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape + +clustering_features["walls-description"].value_counts() diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 6c7a0fc6..b8e71ae7 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -1633,58 +1633,60 @@ def compile_data_final(): # ) # from utils.s3 import read_pickle_from_s3 - # data = read_pickle_from_s3( + # property_attributes = read_pickle_from_s3( # bucket_name="retrofit-data-dev", # s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl" # ) - # CLUSTERING!! + # We perform some additional cleaning on the data + import msgpack + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) - # from sklearn.cluster import KMeans - # from sklearn.preprocessing import OneHotEncoder - # from scipy.spatial.distance import cdist - # - # property_attributes.set_index('internal_id', inplace=True) - # - # # Step 1: Prepare the data - # # Identify categorical columns (you might need to adjust this) - # categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist() - # for col in categorical_cols: - # property_attributes[col] = property_attributes[col].astype(str) - # - # # Applying OneHotEncoder - # encoder = OneHotEncoder(sparse=False) - # encoded_cats = encoder.fit_transform(property_attributes[categorical_cols]) - # - # # Creating a new DataFrame with encoded categorical data and original numerical data - # numerical_data = property_attributes.select_dtypes(include=[np.number]) - # data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1) - # - # # Convert all column names to strings to satisfy KMeans requirements - # data_for_clustering.columns = data_for_clustering.columns.astype(str) - # - # # Step 2: K-Means Clustering - # k = 450 # number of clusters - # kmeans = KMeans(n_clusters=k, random_state=0) - # property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering) - # - # # Extracting centroids - # centroids = kmeans.cluster_centers_ - # - # # Step 3: Assign clusters and rank rows - # # Calculating distances from each point to its cluster's centroid - # distances = cdist(data_for_clustering, centroids, 'euclidean') - # min_distances = distances.min(axis=1) - # property_attributes['distance_to_centroid'] = min_distances - # - # # Ranking rows by distance within each cluster - # property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first') - # - # # Sorting to verify - # property_attributes.sort_values(by=['cluster', 'rank'], inplace=True) - # - # # Optional: Displaying the dataframe - # print(property_attributes.head()) + cleaned = msgpack.unpackb(cleaned, raw=False) + from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes + from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes + from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes + from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes + from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes + from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes + from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes + from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes + + cleaners = { + "floor-description": FloorAttributes, + 'hotwater-description': HotWaterAttributes, + 'main-fuel': MainFuelAttributes, + 'mainheat-description': MainHeatAttributes, + 'mainheatcont-description': MainheatControlAttributes, + 'roof-description': RoofAttributes, + 'walls-description': WallAttributes, + 'windows-description': WindowAttributes, + 'lighting-description': LightingAttributes + } + for variable_to_clean in cleaned.keys(): + unique_descriptions = property_attributes[variable_to_clean].unique() + clean_df = pd.DataFrame(cleaned[variable_to_clean]) + # Check if we have any + missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values] + if missed: + descriptions_to_append = [] + for description in missed: + if variable_to_clean == "lighting-description": + cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()}) + else: + cln = cleaners[variable_to_clean](description) + to_append = { + "original_description": description, + "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(), + **cln.process() + } + descriptions_to_append.append(to_append) + + # CLUSTERING!! from sklearn.cluster import KMeans from sklearn.preprocessing import StandardScaler, OneHotEncoder @@ -1777,110 +1779,6 @@ def compile_data_final(): stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx") - ################################################ - # Agglomertive Clustering - ################################################ - - # from sklearn.cluster import KMeans, AgglomerativeClustering - # from sklearn.preprocessing import StandardScaler, OneHotEncoder - # from sklearn.compose import ColumnTransformer - # from sklearn.pipeline import Pipeline - # from scipy.spatial.distance import cdist - # import numpy as np - # from collections import Counter - # - # id_column = 'internal_id' - # property_attributes.set_index(id_column, inplace=True) - # - # # Define the preprocessing for numerical and categorical features - # numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist() - # categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist() - # - # for col in categorical_features: - # property_attributes[col] = property_attributes[col].astype(str) - # - # preprocessor = ColumnTransformer( - # transformers=[ - # ('num', StandardScaler(), numerical_features), - # ('cat', OneHotEncoder(sparse_output=False), categorical_features) - # ] - # ) - # - # # Function to perform clustering and merge small clusters - # def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5): - # while True: - # # Preprocess the data - # processed_data = preprocessor.fit_transform(data) - # - # # Initial clustering - # clustering = AgglomerativeClustering(n_clusters=n_clusters) - # labels = clustering.fit_predict(processed_data) - # - # # Check cluster sizes - # cluster_counts = Counter(labels) - # - # # Find clusters smaller than min_size - # small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size} - # - # if not small_clusters: - # break - # - # # Merge small clusters - # for cluster in small_clusters: - # # Find the nearest cluster to merge with - # cluster_data = processed_data[labels == cluster] - # other_clusters = [i for i in range(n_clusters) if i not in small_clusters] - # other_cluster_data = [processed_data[labels == i] for i in other_clusters] - # other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data]) - # - # distances = cdist(cluster_data, other_centroids).mean(axis=0) - # closest_cluster = other_clusters[np.argmin(distances)] - # - # labels[labels == cluster] = closest_cluster - # - # n_clusters -= len(small_clusters) - # - # return labels - # - # # Perform clustering with minimum size constraint - # n_clusters = 10 - # min_size = 5 - # property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size) - # - # # Filter out empty clusters - # valid_clusters = property_attributes['cluster'].unique() - # - # # Get centroids for the resulting clusters - # processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"])) - # centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters]) - # - # # Calculate distances from each point to the centroid of its cluster - # distances_to_centroids = [ - # cdist(processed_data[i].reshape(1, -1), - # centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0] - # for i, label in enumerate(property_attributes['cluster']) - # ] - # - # property_attributes['distance_to_centroid'] = distances_to_centroids - # - # # Verify that at least one point in each cluster has zero distance to the centroid - # for cluster_id in valid_clusters: - # cluster_data = property_attributes[property_attributes['cluster'] == cluster_id] - # min_distance = cluster_data['distance_to_centroid'].min() - # print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}") - # if min_distance != 0: - # print(f"No point with zero distance found in cluster {cluster_id}") - # - # # Rank the distances within each cluster - # property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \ - # .rank(method='first') - # - # # Reset index to get 'internal_id' back - # property_attributes.reset_index(inplace=True) - # - # # Display the DataFrame - # print(property_attributes) - def pull_ideal_postcodes(missing_uprn_with_udprn): api_key = "" # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/ diff --git a/etl/epc_clean/epc_attributes/FloorAttributes.py b/etl/epc_clean/epc_attributes/FloorAttributes.py index 245a91bc..817c2b43 100644 --- a/etl/epc_clean/epc_attributes/FloorAttributes.py +++ b/etl/epc_clean/epc_attributes/FloorAttributes.py @@ -38,7 +38,7 @@ class FloorAttributes(Definitions): self.description: str = description.lower() self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or ( - description in self.OBSERVED_ERRORS) + description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor") # Try and perform a translation, incase it's in welsh self.translate_welsh_text() diff --git a/etl/epc_clean/epc_attributes/HotWaterAttributes.py b/etl/epc_clean/epc_attributes/HotWaterAttributes.py index 54deaa09..f9cec48b 100644 --- a/etl/epc_clean/epc_attributes/HotWaterAttributes.py +++ b/etl/epc_clean/epc_attributes/HotWaterAttributes.py @@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions): def __init__(self, description: str): self.description: str = clean_description(description.lower()).strip() - self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES + self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or ( + self.description == "sap05 hot-water" + ) translation = self.WELSH_TEXT.get(self.description) diff --git a/etl/epc_clean/epc_attributes/LightingAttributes.py b/etl/epc_clean/epc_attributes/LightingAttributes.py index 0fe3db16..18475b2d 100644 --- a/etl/epc_clean/epc_attributes/LightingAttributes.py +++ b/etl/epc_clean/epc_attributes/LightingAttributes.py @@ -1,15 +1,18 @@ import re +from BaseUtility import Definitions from etl.epc_clean.epc_attributes.attribute_utils import clean_description from etl.epc_clean.utils import correct_spelling -class LightingAttributes: +class LightingAttributes(Definitions): WELSH_TEXT = { "goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets", "dim goleuadau ynni-isel": "no low energy lighting", "goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets' } + OBSERVED_ERRORS = [] + def __init__(self, description, averages): self.description: str = clean_description(description.lower()) @@ -18,6 +21,9 @@ class LightingAttributes: self.description = correct_spelling(self.description) self.averages = averages + self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or ( + description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting") + def welsh_translation_search(self): """ For welsh text describing the percentage of low energy lighting, we match the regular @@ -40,6 +46,9 @@ class LightingAttributes: description = self.description + if self.nodata: + return {"low_energy_proportion": None} + if 'no low energy lighting' in description: return {"low_energy_proportion": 0} diff --git a/etl/epc_clean/epc_attributes/MainheatAttributes.py b/etl/epc_clean/epc_attributes/MainheatAttributes.py index 9f0931a3..56115dca 100644 --- a/etl/epc_clean/epc_attributes/MainheatAttributes.py +++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py @@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions): self.description: str = clean_description(self.description).strip() # Remove special characters - self.nodata = not description or description in self.DATA_ANOMALY_MATCHES + self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or ( + description == "SAP05:Main-Heating" + ) translation = self.WELSH_TEXT.get(self.description) if translation: @@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions): self.process_edge_cases() - if (not description or not any( - rt in self.description for rt in - self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS - ) and not self.is_edge_case): - raise ValueError('Invalid description') + if not self.nodata: + if (not description or not any( + rt in self.description for rt in + self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS + ) and not self.is_edge_case): + raise ValueError('Invalid description') def process_edge_cases(self) -> (dict, bool): """ diff --git a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py index 887bdda7..46fff6d8 100644 --- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py +++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py @@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions): def __init__(self, description: str): self.description: str = clean_description(description.lower()).strip() - self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES + self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or ( + description == "SAP05:Main-Heating-Controls" + ) translation = self.WELSH_TEXT.get(self.description) if translation: diff --git a/etl/epc_clean/epc_attributes/WindowAttributes.py b/etl/epc_clean/epc_attributes/WindowAttributes.py index 5286fc5a..e9139510 100644 --- a/etl/epc_clean/epc_attributes/WindowAttributes.py +++ b/etl/epc_clean/epc_attributes/WindowAttributes.py @@ -38,7 +38,7 @@ class WindowAttributes(Definitions): # In the case of an empty description, we want to return a dictionary with all values set to False # and indicate there was no data - self.nodata = not description or description in self.DATA_ANOMALY_MATCHES + self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows" translation = self.WELSH_TEXT.get(self.description) if translation: