From 4d021f0ba6a5894659275d8090e1f65be6ca68f6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Nov 2024 17:12:55 +0000 Subject: [PATCH] working on stonewater alg --- .../stonewater/Wave 3 Preparation.py | 102 +++++++++++++++--- 1 file changed, 86 insertions(+), 16 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index ef7dd414..40dfd38e 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -3,6 +3,7 @@ import PyPDF2 import re import pandas as pd import numpy as np +from docutils.utils.math.tex2mathml_extern import blahtexml from tqdm import tqdm from collections import Counter @@ -1681,19 +1682,15 @@ def propsed_wave_3_sample(): for region in tqdm(unique_postal_regions): # Take all of the properties in that region region_assets = asset_list[asset_list["Postal Region"] == region].copy() - archetypes = region_assets["Archetype ID"].unique() - # We get the properties that have been surveyed - region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(archetypes) & - (survey_results["Postal Region"] == region) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - if region_surveyed["Archetype ID"].duplicated().sum(): - raise NotImplementedError("Fix me") + # We have a tier 1 match if the property itself was surveyed + exact_surveyed = survey_results[ + survey_results["Address ID"].isin(region_assets["Address ID"]) + ] region_assets = region_assets.merge( - region_surveyed, - on="Archetype ID", + exact_surveyed[["Address ID", "Current EPC Band"]], + on="Address ID", how="left" ) @@ -1701,22 +1698,95 @@ def propsed_wave_3_sample(): region_assets["Confidence Tier"] = None region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), - "1", region_assets["Confidence Tier"] + "1 - property was surveyed", region_assets["Confidence Tier"] ) - # TODO: Turn into a function - missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]), + "6 - property was surveyed", region_assets["Confidence Tier"] + ) + + archetypes = region_assets[ + pd.isnull(region_assets["Confidence Tier"]) + ]["Archetype ID"].unique() + # We get the properties that have been surveyed region_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + survey_results["Archetype ID"].isin(archetypes) & + (survey_results["Postal Region"] == region) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): - raise NotImplementedError("Fix me 2") + # Take the duplicated archetypes + duplicated_archetypes = region_surveyed[ + region_surveyed["Archetype ID"].duplicated() + ]["Archetype ID"].unique() + duplicated_archetypes = region_surveyed[ + region_surveyed["Archetype ID"].isin(duplicated_archetypes) + ] + + # We need to select which one is the most relevant to these properties + survey_data = survey_results_with_original_features[ + survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values) + ] + + raise NotImplementedError("Fix me") region_assets = region_assets.merge( region_surveyed, on="Archetype ID", how="left", + suffixes=("", "_method1") + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), + "1 - Archetype surveyed", region_assets["Confidence Tier"] + ) + region_assets = region_assets.drop(columns=["Current EPC Band_method1"]) + # TODO: Turn into a function + missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"]) + + archetype_surveyed = survey_results[ + survey_results["Archetype ID"].isin(missed_archetypes) + ][["Archetype ID", "Current EPC Band"]].drop_duplicates() + + if archetype_surveyed["Archetype ID"].duplicated().sum(): + # We need to select which one is the most relevant to these properties + duplicated_archetypes = archetype_surveyed[ + archetype_surveyed["Archetype ID"].duplicated() + ]["Archetype ID"].unique() + + survey_data = survey_results_with_original_features[ + survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes) + ] + + homes_with_these_archetypes = region_assets[ + region_assets["Archetype ID"].isin(duplicated_archetypes) + ] + + for _, home in homes_with_these_archetypes.iterrows(): + first_filter = survey_data[ + (survey_data["Postal Region"] == home["Postal Region"]) & + (survey_data["Property Type"] == home["Property Type"]) & + (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) + ] + + if not first_filter.empty: + NotImplementedError("Fix me 0") + + second_filter = survey_data[ + (survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) & + (survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0]) + ] + + raise NotImplementedError("Fix me 2") + + region_assets = region_assets.merge( + archetype_surveyed, + on="Archetype ID", + how="left", suffixes=("", "_method2") )