added handling of some additional cases in sap description cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2024-06-27 12:41:56 +01:00
parent 0875213779
commit e9366c72e8
10 changed files with 126 additions and 164 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -0,0 +1,48 @@
"""
This script prepares some outputs for the stonewater project, 27th June 2024
The work done so far has been data cleaning and clustering.
In this script, we do the following things:
1) Match the clustering data to the archetypes
2) Do some basic analysis on the data
3) Mapping of the archetypes
"""
import pandas as pd

from utils.s3 import read_pickle_from_s3

# Load the asset list that already carries the archetype assignment and keep only
# the identifier and archetype columns used by the rest of the script.
archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv")
archetyped_asset_list = archetyped_asset_list[
    [
        "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
    ]
]

# Drop the rows whose rank is the "NO ARCHETYPE" placeholder so that the remaining
# ranks can be cast to integers. The explicit .copy() makes the filtered frame
# independent of the frame it was sliced from, so the column assignment below is
# unambiguous and does not trigger pandas' SettingWithCopyWarning.
archetyped_asset_list = archetyped_asset_list[
    archetyped_asset_list["rank"] != "NO ARCHETYPE"
].copy()
archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)

# Sort
archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])

# Read in and merge on clustering features
clustering_features = read_pickle_from_s3(
    bucket_name="retrofit-data-dev",
    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
)

# Inner join: only assets present in both the archetyped list and the clustering
# features are kept.
# NOTE(review): if clustering_features also carries "cluster" or "rank" columns the
# merge will suffix them (_x/_y) and the selection below will fail - confirm the
# pickle's columns.
archetyped_asset_list = archetyped_asset_list.merge(
    clustering_features,
    on="internal_id",
    how="inner"
)

property_type_archetypes = archetyped_asset_list[
    ["cluster", "rank", "property-type", "built-form", "walls-description"]
]

# Key variables for separation:
# - property-type
# - built-form
# - walls-description
# - roof-description

# Exploratory summaries. A bare expression is evaluated and thrown away when this
# file is run as a script, so the results are printed explicitly.
print(clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape)
print(clustering_features["walls-description"].value_counts())

View file

@ -1633,58 +1633,60 @@ def compile_data_final():
# )
# from utils.s3 import read_pickle_from_s3
# data = read_pickle_from_s3(
# property_attributes = read_pickle_from_s3(
# bucket_name="retrofit-data-dev",
# s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
# )
# CLUSTERING!!
# We perform some additional cleaning on the data
import msgpack
cleaned = read_from_s3(
s3_file_name="cleaned_epc_data/cleaned.bson",
bucket_name="retrofit-data-dev"
)
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import OneHotEncoder
# from scipy.spatial.distance import cdist
#
# property_attributes.set_index('internal_id', inplace=True)
#
# # Step 1: Prepare the data
# # Identify categorical columns (you might need to adjust this)
# categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
# for col in categorical_cols:
# property_attributes[col] = property_attributes[col].astype(str)
#
# # Applying OneHotEncoder
# encoder = OneHotEncoder(sparse=False)
# encoded_cats = encoder.fit_transform(property_attributes[categorical_cols])
#
# # Creating a new DataFrame with encoded categorical data and original numerical data
# numerical_data = property_attributes.select_dtypes(include=[np.number])
# data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1)
#
# # Convert all column names to strings to satisfy KMeans requirements
# data_for_clustering.columns = data_for_clustering.columns.astype(str)
#
# # Step 2: K-Means Clustering
# k = 450 # number of clusters
# kmeans = KMeans(n_clusters=k, random_state=0)
# property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering)
#
# # Extracting centroids
# centroids = kmeans.cluster_centers_
#
# # Step 3: Assign clusters and rank rows
# # Calculating distances from each point to its cluster's centroid
# distances = cdist(data_for_clustering, centroids, 'euclidean')
# min_distances = distances.min(axis=1)
# property_attributes['distance_to_centroid'] = min_distances
#
# # Ranking rows by distance within each cluster
# property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first')
#
# # Sorting to verify
# property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
#
# # Optional: Displaying the dataframe
# print(property_attributes.head())
cleaned = msgpack.unpackb(cleaned, raw=False)
from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
cleaners = {
"floor-description": FloorAttributes,
'hotwater-description': HotWaterAttributes,
'main-fuel': MainFuelAttributes,
'mainheat-description': MainHeatAttributes,
'mainheatcont-description': MainheatControlAttributes,
'roof-description': RoofAttributes,
'walls-description': WallAttributes,
'windows-description': WindowAttributes,
'lighting-description': LightingAttributes
}
for variable_to_clean in cleaned.keys():
unique_descriptions = property_attributes[variable_to_clean].unique()
clean_df = pd.DataFrame(cleaned[variable_to_clean])
# Check whether any descriptions in the data are missing from the cleaned lookup
missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
if missed:
descriptions_to_append = []
for description in missed:
if variable_to_clean == "lighting-description":
cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
else:
cln = cleaners[variable_to_clean](description)
to_append = {
"original_description": description,
"clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
**cln.process()
}
descriptions_to_append.append(to_append)
# CLUSTERING!!
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, OneHotEncoder
@ -1777,110 +1779,6 @@ def compile_data_final():
stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")
################################################
# Agglomerative Clustering
################################################
# from sklearn.cluster import KMeans, AgglomerativeClustering
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from scipy.spatial.distance import cdist
# import numpy as np
# from collections import Counter
#
# id_column = 'internal_id'
# property_attributes.set_index(id_column, inplace=True)
#
# # Define the preprocessing for numerical and categorical features
# numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
# categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
#
# for col in categorical_features:
# property_attributes[col] = property_attributes[col].astype(str)
#
# preprocessor = ColumnTransformer(
# transformers=[
# ('num', StandardScaler(), numerical_features),
# ('cat', OneHotEncoder(sparse_output=False), categorical_features)
# ]
# )
#
# # Function to perform clustering and merge small clusters
# def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5):
# while True:
# # Preprocess the data
# processed_data = preprocessor.fit_transform(data)
#
# # Initial clustering
# clustering = AgglomerativeClustering(n_clusters=n_clusters)
# labels = clustering.fit_predict(processed_data)
#
# # Check cluster sizes
# cluster_counts = Counter(labels)
#
# # Find clusters smaller than min_size
# small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size}
#
# if not small_clusters:
# break
#
# # Merge small clusters
# for cluster in small_clusters:
# # Find the nearest cluster to merge with
# cluster_data = processed_data[labels == cluster]
# other_clusters = [i for i in range(n_clusters) if i not in small_clusters]
# other_cluster_data = [processed_data[labels == i] for i in other_clusters]
# other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data])
#
# distances = cdist(cluster_data, other_centroids).mean(axis=0)
# closest_cluster = other_clusters[np.argmin(distances)]
#
# labels[labels == cluster] = closest_cluster
#
# n_clusters -= len(small_clusters)
#
# return labels
#
# # Perform clustering with minimum size constraint
# n_clusters = 10
# min_size = 5
# property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size)
#
# # Filter out empty clusters
# valid_clusters = property_attributes['cluster'].unique()
#
# # Get centroids for the resulting clusters
# processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"]))
# centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters])
#
# # Calculate distances from each point to the centroid of its cluster
# distances_to_centroids = [
# cdist(processed_data[i].reshape(1, -1),
# centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0]
# for i, label in enumerate(property_attributes['cluster'])
# ]
#
# property_attributes['distance_to_centroid'] = distances_to_centroids
#
# # Verify that at least one point in each cluster has zero distance to the centroid
# for cluster_id in valid_clusters:
# cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
# min_distance = cluster_data['distance_to_centroid'].min()
# print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
# if min_distance != 0:
# print(f"No point with zero distance found in cluster {cluster_id}")
#
# # Rank the distances within each cluster
# property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \
# .rank(method='first')
#
# # Reset index to get 'internal_id' back
# property_attributes.reset_index(inplace=True)
#
# # Display the DataFrame
# print(property_attributes)
def pull_ideal_postcodes(missing_uprn_with_udprn):
api_key = "" # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/

View file

@ -38,7 +38,7 @@ class FloorAttributes(Definitions):
self.description: str = description.lower()
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
description in self.OBSERVED_ERRORS)
description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor")
# Try to perform a translation, in case it's in Welsh
self.translate_welsh_text()

View file

@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions):
def __init__(self, description: str):
self.description: str = clean_description(description.lower()).strip()
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
self.description == "sap05 hot-water"
)
translation = self.WELSH_TEXT.get(self.description)

View file

@ -1,15 +1,18 @@
import re
from BaseUtility import Definitions
from etl.epc_clean.epc_attributes.attribute_utils import clean_description
from etl.epc_clean.utils import correct_spelling
class LightingAttributes:
class LightingAttributes(Definitions):
WELSH_TEXT = {
"goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets",
"dim goleuadau ynni-isel": "no low energy lighting",
"goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets'
}
OBSERVED_ERRORS = []
def __init__(self, description, averages):
self.description: str = clean_description(description.lower())
@ -18,6 +21,9 @@ class LightingAttributes:
self.description = correct_spelling(self.description)
self.averages = averages
self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting")
def welsh_translation_search(self):
"""
For welsh text describing the percentage of low energy lighting, we match the regular
@ -40,6 +46,9 @@ class LightingAttributes:
description = self.description
if self.nodata:
return {"low_energy_proportion": None}
if 'no low energy lighting' in description:
return {"low_energy_proportion": 0}

View file

@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions):
self.description: str = clean_description(self.description).strip()
# Remove special characters
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or (
description == "SAP05:Main-Heating"
)
translation = self.WELSH_TEXT.get(self.description)
if translation:
@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions):
self.process_edge_cases()
if (not description or not any(
rt in self.description for rt in
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
) and not self.is_edge_case):
raise ValueError('Invalid description')
if not self.nodata:
if (not description or not any(
rt in self.description for rt in
self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
) and not self.is_edge_case):
raise ValueError('Invalid description')
def process_edge_cases(self) -> (dict, bool):
"""

View file

@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions):
def __init__(self, description: str):
self.description: str = clean_description(description.lower()).strip()
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
description == "SAP05:Main-Heating-Controls"
)
translation = self.WELSH_TEXT.get(self.description)
if translation:

View file

@ -38,7 +38,7 @@ class WindowAttributes(Definitions):
# In the case of an empty description, we want to return a dictionary with all values set to False
# and indicate there was no data
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows"
translation = self.WELSH_TEXT.get(self.description)
if translation: