working on stonewater alg

This commit is contained in:
Khalim Conn-Kowlessar 2024-11-16 17:12:55 +00:00
parent 2158ab2cd5
commit 4d021f0ba6

View file

@ -3,6 +3,7 @@ import PyPDF2
import re
import pandas as pd
import numpy as np
from docutils.utils.math.tex2mathml_extern import blahtexml
from tqdm import tqdm
from collections import Counter
@ -1681,19 +1682,15 @@ def propsed_wave_3_sample():
for region in tqdm(unique_postal_regions):
# Take all of the properties in that region
region_assets = asset_list[asset_list["Postal Region"] == region].copy()
archetypes = region_assets["Archetype ID"].unique()
# We get the properties that have been surveyed
region_surveyed = survey_results[
survey_results["Archetype ID"].isin(archetypes) &
(survey_results["Postal Region"] == region)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
if region_surveyed["Archetype ID"].duplicated().sum():
raise NotImplementedError("Fix me")
# We have a tier 1 match if the property itself was surveyed
exact_surveyed = survey_results[
survey_results["Address ID"].isin(region_assets["Address ID"])
]
region_assets = region_assets.merge(
region_surveyed,
on="Archetype ID",
exact_surveyed[["Address ID", "Current EPC Band"]],
on="Address ID",
how="left"
)
@ -1701,22 +1698,95 @@ def propsed_wave_3_sample():
region_assets["Confidence Tier"] = None
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]),
"1", region_assets["Confidence Tier"]
"1 - property was surveyed", region_assets["Confidence Tier"]
)
# TODO: Turn into a function
missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band"].isin(["C", "B", "A"]),
"6 - property was surveyed", region_assets["Confidence Tier"]
)
archetypes = region_assets[
pd.isnull(region_assets["Confidence Tier"])
]["Archetype ID"].unique()
# We get the properties that have been surveyed
region_surveyed = survey_results[
survey_results["Archetype ID"].isin(missed_archetypes)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
survey_results["Archetype ID"].isin(archetypes) &
(survey_results["Postal Region"] == region)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
if region_surveyed["Archetype ID"].duplicated().sum():
raise NotImplementedError("Fix me 2")
# Take the duplicated archetypes
duplicated_archetypes = region_surveyed[
region_surveyed["Archetype ID"].duplicated()
]["Archetype ID"].unique()
duplicated_archetypes = region_surveyed[
region_surveyed["Archetype ID"].isin(duplicated_archetypes)
]
# We need to select which one is the most relevant to these properties
survey_data = survey_results_with_original_features[
survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes["Archetype ID"].values)
]
raise NotImplementedError("Fix me")
region_assets = region_assets.merge(
region_surveyed,
on="Archetype ID",
how="left",
suffixes=("", "_method1")
)
# Label the tier 1 properties
region_assets["Confidence Tier"] = np.where(
region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
pd.isnull(region_assets["Confidence Tier"]),
"1 - Archetype surveyed", region_assets["Confidence Tier"]
)
region_assets = region_assets.drop(columns=["Current EPC Band_method1"])
# TODO: Turn into a function
missed_archetypes = set(archetypes) - set(region_surveyed["Archetype ID"])
archetype_surveyed = survey_results[
survey_results["Archetype ID"].isin(missed_archetypes)
][["Archetype ID", "Current EPC Band"]].drop_duplicates()
if archetype_surveyed["Archetype ID"].duplicated().sum():
# We need to select which one is the most relevant to these properties
duplicated_archetypes = archetype_surveyed[
archetype_surveyed["Archetype ID"].duplicated()
]["Archetype ID"].unique()
survey_data = survey_results_with_original_features[
survey_results_with_original_features["Archetype ID"].isin(duplicated_archetypes)
]
homes_with_these_archetypes = region_assets[
region_assets["Archetype ID"].isin(duplicated_archetypes)
]
for _, home in homes_with_these_archetypes.iterrows():
first_filter = survey_data[
(survey_data["Postal Region"] == home["Postal Region"]) &
(survey_data["Property Type"] == home["Property Type"]) &
(survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
]
if not first_filter.empty:
NotImplementedError("Fix me 0")
second_filter = survey_data[
(survey_data["Property Type"].str.split(":").str[0] == home["Property Type"].split(":")[0]) &
(survey_data["Wall Type"].str.split(":").str[0] == home["Wall Type"].split(":")[0])
]
raise NotImplementedError("Fix me 2")
region_assets = region_assets.merge(
archetype_surveyed,
on="Archetype ID",
how="left",
suffixes=("", "_method2")
)