added handling of some additional cases in sap description cleaning

2026-07-27 23:35:01 +00:00 · 2024-06-27 12:41:56 +01:00 · 2024-06-27 12:41:56 +01:00 · e9366c72e8
commit e9366c72e8
parent 0875213779
10 changed files with 126 additions and 164 deletions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/etl/customers/stonewater/outputs
+++ b/etl/customers/stonewater/outputs
@ -0,0 +1,48 @@
+"""
+This script prepares some outputs for the Stonewater project (27th June 2024).
+
+The work done so far has been data cleaning and clustering.
+In this script, we do the following things:
+
+1) Match the clustering data to the archetypes
+2) Do some basic analysis on the data
+3) Map the archetypes
+"""
+import pandas as pd
+from utils.s3 import read_pickle_from_s3
+
+archetyped_asset_list = pd.read_csv("Stonewater asset list with archetypes.csv")
+archetyped_asset_list = archetyped_asset_list[
+    [
+        "internal_id", "customer_asset_id", "udprn", "uprn", "cluster", "archetype_representative", "rank"
+    ]
+]
+archetyped_asset_list = archetyped_asset_list[archetyped_asset_list["rank"] != "NO ARCHETYPE"]
+archetyped_asset_list["rank"] = archetyped_asset_list["rank"].astype(int)
+# Sort by cluster, then by rank within each cluster
+archetyped_asset_list = archetyped_asset_list.sort_values(by=["cluster", "rank"])
+
+# Read in and merge on clustering features
+clustering_features = read_pickle_from_s3(
+    bucket_name="retrofit-data-dev",
+    s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
+)
+
+archetyped_asset_list = archetyped_asset_list.merge(
+    clustering_features,
+    on="internal_id",
+    how="inner"
+)
+
+property_type_archetypes = archetyped_asset_list[
+    ["cluster", "rank", "property-type", "built-form", "walls-description"]]
+
+# Key variables for separation:
+# - property-type
+# - built-form
+# - walls-description
+# - roof-description
+
+clustering_features[["property-type", "built-form", "walls-description"]].drop_duplicates().shape
+
+clustering_features["walls-description"].value_counts()
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@ -1633,58 +1633,60 @@ def compile_data_final():
    # )

    # from utils.s3 import read_pickle_from_s3
-    # data = read_pickle_from_s3(
+    # property_attributes = read_pickle_from_s3(
    #     bucket_name="retrofit-data-dev",
    #     s3_file_name="customers/Stonewater/clustering/clustering_dataframe.pkl"
    # )

-    # CLUSTERING!!
+    # Load the cleaned EPC description lookup so descriptions it does not cover can be cleaned below
+    import msgpack
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )

-    # from sklearn.cluster import KMeans
-    # from sklearn.preprocessing import OneHotEncoder
-    # from scipy.spatial.distance import cdist
-    #
-    # property_attributes.set_index('internal_id', inplace=True)
-    #
-    # # Step 1: Prepare the data
-    # # Identify categorical columns (you might need to adjust this)
-    # categorical_cols = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
-    # for col in categorical_cols:
-    #     property_attributes[col] = property_attributes[col].astype(str)
-    #
-    # # Applying OneHotEncoder
-    # encoder = OneHotEncoder(sparse=False)
-    # encoded_cats = encoder.fit_transform(property_attributes[categorical_cols])
-    #
-    # # Creating a new DataFrame with encoded categorical data and original numerical data
-    # numerical_data = property_attributes.select_dtypes(include=[np.number])
-    # data_for_clustering = pd.concat([numerical_data, pd.DataFrame(encoded_cats, index=numerical_data.index)], axis=1)
-    #
-    # # Convert all column names to strings to satisfy KMeans requirements
-    # data_for_clustering.columns = data_for_clustering.columns.astype(str)
-    #
-    # # Step 2: K-Means Clustering
-    # k = 450  # number of clusters
-    # kmeans = KMeans(n_clusters=k, random_state=0)
-    # property_attributes['cluster'] = kmeans.fit_predict(data_for_clustering)
-    #
-    # # Extracting centroids
-    # centroids = kmeans.cluster_centers_
-    #
-    # # Step 3: Assign clusters and rank rows
-    # # Calculating distances from each point to its cluster's centroid
-    # distances = cdist(data_for_clustering, centroids, 'euclidean')
-    # min_distances = distances.min(axis=1)
-    # property_attributes['distance_to_centroid'] = min_distances
-    #
-    # # Ranking rows by distance within each cluster
-    # property_attributes['rank'] = property_attributes.groupby('cluster')['distance_to_centroid'].rank(method='first')
-    #
-    # # Sorting to verify
-    # property_attributes.sort_values(by=['cluster', 'rank'], inplace=True)
-    #
-    # # Optional: Displaying the dataframe
-    # print(property_attributes.head())
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+    from etl.epc_clean.epc_attributes.FloorAttributes import FloorAttributes
+    from etl.epc_clean.epc_attributes.HotWaterAttributes import HotWaterAttributes
+    from etl.epc_clean.epc_attributes.MainFuelAttributes import MainFuelAttributes
+    from etl.epc_clean.epc_attributes.MainheatAttributes import MainHeatAttributes
+    from etl.epc_clean.epc_attributes.MainheatControlAttributes import MainheatControlAttributes
+    from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+    from etl.epc_clean.epc_attributes.WallAttributes import WallAttributes
+    from etl.epc_clean.epc_attributes.WindowAttributes import WindowAttributes
+    from etl.epc_clean.epc_attributes.LightingAttributes import LightingAttributes
+
+    cleaners = {
+        "floor-description": FloorAttributes,
+        'hotwater-description': HotWaterAttributes,
+        'main-fuel': MainFuelAttributes,
+        'mainheat-description': MainHeatAttributes,
+        'mainheatcont-description': MainheatControlAttributes,
+        'roof-description': RoofAttributes,
+        'walls-description': WallAttributes,
+        'windows-description': WindowAttributes,
+        'lighting-description': LightingAttributes
+    }
+    for variable_to_clean in cleaned.keys():
+        unique_descriptions = property_attributes[variable_to_clean].unique()
+        clean_df = pd.DataFrame(cleaned[variable_to_clean])
+        # Check whether any descriptions in our data are missing from the cleaned lookup
+        missed = [x for x in unique_descriptions if x not in clean_df["original_description"].values]
+        if missed:
+            descriptions_to_append = []
+            for description in missed:
+                if variable_to_clean == "lighting-description":
+                    cln = cleaners[variable_to_clean](description, **{"averages": pd.DataFrame()})
+                else:
+                    cln = cleaners[variable_to_clean](description)
+                to_append = {
+                    "original_description": description,
+                    "clean_description": cln.description.replace("(assumed)", "").rstrip().capitalize(),
+                    **cln.process()
+                }
+                descriptions_to_append.append(to_append)
+
+    # CLUSTERING!!

    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
@ -1777,110 +1779,6 @@ def compile_data_final():

    stonewater_uprn_lookup.to_excel("Stonewater uprn lookup table.xlsx")

-    ################################################
-    # Agglomertive Clustering
-    ################################################
-
-    # from sklearn.cluster import KMeans, AgglomerativeClustering
-    # from sklearn.preprocessing import StandardScaler, OneHotEncoder
-    # from sklearn.compose import ColumnTransformer
-    # from sklearn.pipeline import Pipeline
-    # from scipy.spatial.distance import cdist
-    # import numpy as np
-    # from collections import Counter
-    #
-    # id_column = 'internal_id'
-    # property_attributes.set_index(id_column, inplace=True)
-    #
-    # # Define the preprocessing for numerical and categorical features
-    # numerical_features = property_attributes.select_dtypes(include=['int64', 'float64']).columns.tolist()
-    # categorical_features = property_attributes.select_dtypes(include=['object', 'category']).columns.tolist()
-    #
-    # for col in categorical_features:
-    #     property_attributes[col] = property_attributes[col].astype(str)
-    #
-    # preprocessor = ColumnTransformer(
-    #     transformers=[
-    #         ('num', StandardScaler(), numerical_features),
-    #         ('cat', OneHotEncoder(sparse_output=False), categorical_features)
-    #     ]
-    # )
-    #
-    # # Function to perform clustering and merge small clusters
-    # def cluster_with_min_size(data, preprocessor, n_clusters=10, min_size=5):
-    #     while True:
-    #         # Preprocess the data
-    #         processed_data = preprocessor.fit_transform(data)
-    #
-    #         # Initial clustering
-    #         clustering = AgglomerativeClustering(n_clusters=n_clusters)
-    #         labels = clustering.fit_predict(processed_data)
-    #
-    #         # Check cluster sizes
-    #         cluster_counts = Counter(labels)
-    #
-    #         # Find clusters smaller than min_size
-    #         small_clusters = {cluster for cluster, count in cluster_counts.items() if count < min_size}
-    #
-    #         if not small_clusters:
-    #             break
-    #
-    #         # Merge small clusters
-    #         for cluster in small_clusters:
-    #             # Find the nearest cluster to merge with
-    #             cluster_data = processed_data[labels == cluster]
-    #             other_clusters = [i for i in range(n_clusters) if i not in small_clusters]
-    #             other_cluster_data = [processed_data[labels == i] for i in other_clusters]
-    #             other_centroids = np.vstack([data.mean(axis=0) for data in other_cluster_data])
-    #
-    #             distances = cdist(cluster_data, other_centroids).mean(axis=0)
-    #             closest_cluster = other_clusters[np.argmin(distances)]
-    #
-    #             labels[labels == cluster] = closest_cluster
-    #
-    #         n_clusters -= len(small_clusters)
-    #
-    #     return labels
-    #
-    # # Perform clustering with minimum size constraint
-    # n_clusters = 10
-    # min_size = 5
-    # property_attributes['cluster'] = cluster_with_min_size(property_attributes, preprocessor, n_clusters, min_size)
-    #
-    # # Filter out empty clusters
-    # valid_clusters = property_attributes['cluster'].unique()
-    #
-    # # Get centroids for the resulting clusters
-    # processed_data = preprocessor.transform(property_attributes.drop(columns=["cluster"]))
-    # centroids = np.vstack([processed_data[property_attributes['cluster'] == i].mean(axis=0) for i in valid_clusters])
-    #
-    # # Calculate distances from each point to the centroid of its cluster
-    # distances_to_centroids = [
-    #     cdist(processed_data[i].reshape(1, -1),
-    #           centroids[valid_clusters.tolist().index(label)].reshape(1, -1)).flatten()[0]
-    #     for i, label in enumerate(property_attributes['cluster'])
-    # ]
-    #
-    # property_attributes['distance_to_centroid'] = distances_to_centroids
-    #
-    # # Verify that at least one point in each cluster has zero distance to the centroid
-    # for cluster_id in valid_clusters:
-    #     cluster_data = property_attributes[property_attributes['cluster'] == cluster_id]
-    #     min_distance = cluster_data['distance_to_centroid'].min()
-    #     print(f"Cluster {cluster_id} minimum distance to centroid: {min_distance}")
-    #     if min_distance != 0:
-    #         print(f"No point with zero distance found in cluster {cluster_id}")
-    #
-    # # Rank the distances within each cluster
-    # property_attributes['rank_within_cluster'] = property_attributes.groupby('cluster')['distance_to_centroid'] \
-    #     .rank(method='first')
-    #
-    # # Reset index to get 'internal_id' back
-    # property_attributes.reset_index(inplace=True)
-    #
-    # # Display the DataFrame
-    # print(property_attributes)
-

 def pull_ideal_postcodes(missing_uprn_with_udprn):
    api_key = ""  # Log into the platform the get the API key: https://account.ideal-postcodes.co.uk/
--- a/etl/epc_clean/epc_attributes/FloorAttributes.py
+++ b/etl/epc_clean/epc_attributes/FloorAttributes.py
@ -38,7 +38,7 @@ class FloorAttributes(Definitions):
        self.description: str = description.lower()

        self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
-            description in self.OBSERVED_ERRORS)
+            description in self.OBSERVED_ERRORS) or (self.description == "sap05:floor")

        # Try and perform a translation, incase it's in welsh
        self.translate_welsh_text()
--- a/etl/epc_clean/epc_attributes/HotWaterAttributes.py
+++ b/etl/epc_clean/epc_attributes/HotWaterAttributes.py
@ -129,7 +129,9 @@ class HotWaterAttributes(Definitions):
    def __init__(self, description: str):
        self.description: str = clean_description(description.lower()).strip()

-        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
+            self.description == "sap05 hot-water"
+        )

        translation = self.WELSH_TEXT.get(self.description)

--- a/etl/epc_clean/epc_attributes/LightingAttributes.py
+++ b/etl/epc_clean/epc_attributes/LightingAttributes.py
@ -1,15 +1,18 @@
 import re
+from BaseUtility import Definitions
 from etl.epc_clean.epc_attributes.attribute_utils import clean_description
 from etl.epc_clean.utils import correct_spelling


-class LightingAttributes:
+class LightingAttributes(Definitions):
    WELSH_TEXT = {
        "goleuadau ynni-isel ym mhob un ogçör mannau gosod": "low energy lighting in all fixed outlets",
        "dim goleuadau ynni-isel": "no low energy lighting",
        "goleuadau ynni-isel ym mhob un o'r mannau gosod": 'Low energy lighting in all fixed outlets'
    }

+    OBSERVED_ERRORS = []
+
    def __init__(self, description, averages):
        self.description: str = clean_description(description.lower())

@ -18,6 +21,9 @@ class LightingAttributes:
        self.description = correct_spelling(self.description)
        self.averages = averages

+        self.nodata = (not description) or (description in self.DATA_ANOMALY_MATCHES) or (
+            description in self.OBSERVED_ERRORS) or (description == "SAP05:Lighting")
+
    def welsh_translation_search(self):
        """
        For welsh text describing the percentage of low energy lighting, we match the regular
@ -40,6 +46,9 @@ class LightingAttributes:

        description = self.description

+        if self.nodata:
+            return {"low_energy_proportion": None}
+
        if 'no low energy lighting' in description:
            return {"low_energy_proportion": 0}

--- a/etl/epc_clean/epc_attributes/MainheatAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatAttributes.py
@ -77,7 +77,9 @@ class MainHeatAttributes(Definitions):

        self.description: str = clean_description(self.description).strip()
        # Remove special characters
-        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or (
+            description == "SAP05:Main-Heating"
+        )

        translation = self.WELSH_TEXT.get(self.description)
        if translation:
@ -97,11 +99,12 @@ class MainHeatAttributes(Definitions):

        self.process_edge_cases()

-        if (not description or not any(
-            rt in self.description for rt in
-            self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
-        ) and not self.is_edge_case):
-            raise ValueError('Invalid description')
+        if not self.nodata:
+            if (not description or not any(
+                rt in self.description for rt in
+                self.HEAT_SYSTEMS + self.FUEL_TYPES + self.DISTRIBUTION_SYSTEMS + self.OTHERS
+            ) and not self.is_edge_case):
+                raise ValueError('Invalid description')

    def process_edge_cases(self) -> (dict, bool):
        """
--- a/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
+++ b/etl/epc_clean/epc_attributes/MainheatControlAttributes.py
@ -117,7 +117,9 @@ class MainheatControlAttributes(Definitions):

    def __init__(self, description: str):
        self.description: str = clean_description(description.lower()).strip()
-        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not self.description or description in self.DATA_ANOMALY_MATCHES or (
+            description == "SAP05:Main-Heating-Controls"
+        )

        translation = self.WELSH_TEXT.get(self.description)
        if translation:
--- a/etl/epc_clean/epc_attributes/WindowAttributes.py
+++ b/etl/epc_clean/epc_attributes/WindowAttributes.py
@ -38,7 +38,7 @@ class WindowAttributes(Definitions):

        # In the case of an empty description, we want to return a dictionary with all values set to False
        # and indicate there was no data
-        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES
+        self.nodata = not description or description in self.DATA_ANOMALY_MATCHES or description == "SAP05:Windows"

        translation = self.WELSH_TEXT.get(self.description)
        if translation: