mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on new forecast approach for warmfront remaining sales
This commit is contained in:
parent
6ae21bbcb0
commit
8b8e2bf902
2 changed files with 768 additions and 45 deletions
|
|
@ -17,6 +17,7 @@ from etl.eligibility.ha_15_32.app import prepare_model_data_row
|
||||||
from backend.ml_models.api import ModelApi
|
from backend.ml_models.api import ModelApi
|
||||||
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
|
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
|
||||||
from recommendations.recommendation_utils import calculate_cavity_age
|
from recommendations.recommendation_utils import calculate_cavity_age
|
||||||
|
from etl.epc.Record import EPCRecord
|
||||||
|
|
||||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||||
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
|
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
|
||||||
|
|
@ -181,25 +182,25 @@ class DataLoader:
|
||||||
if ha_name in ["HA1", "HA6", "HA16", "HA24"]:
|
if ha_name in ["HA1", "HA6", "HA16", "HA24"]:
|
||||||
asset_list["matching_address"] = asset_list[
|
asset_list["matching_address"] = asset_list[
|
||||||
self.COLUMN_CONFIG[ha_name]["address"]
|
self.COLUMN_CONFIG[ha_name]["address"]
|
||||||
].str.lower().str.strip()
|
].astype(str).str.lower().str.strip()
|
||||||
asset_list["matching_postcode"] = asset_list[
|
asset_list["matching_postcode"] = asset_list[
|
||||||
self.COLUMN_CONFIG[ha_name]["postcode"]
|
self.COLUMN_CONFIG[ha_name]["postcode"]
|
||||||
].str.lower().str.strip()
|
].astype(str).str.lower().str.strip()
|
||||||
elif ha_name == "HA7":
|
elif ha_name == "HA7":
|
||||||
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
||||||
asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip() + ", " + \
|
asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Address2"].str.lower().str.strip() + ", " + \
|
asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Address3"].str.lower().str.strip() + ", " + \
|
asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Postcode"].str.lower().str.strip()
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
||||||
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
||||||
elif ha_name == "HA14":
|
elif ha_name == "HA14":
|
||||||
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
# Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode
|
||||||
asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \
|
asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Address 2"].str.lower().str.strip() + ", " + \
|
asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Address 3"].str.lower().str.strip() + ", " + \
|
asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Address 4"].str.lower().str.strip() + ", " + \
|
asset_list["Address 4"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["Postcode"].str.lower().str.strip()
|
asset_list["Postcode"].astype(str).str.lower().str.strip()
|
||||||
asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip()
|
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
|
||||||
|
|
||||||
elif ha_name == "HA39":
|
elif ha_name == "HA39":
|
||||||
# Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
|
# Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code
|
||||||
|
|
@ -209,7 +210,7 @@ class DataLoader:
|
||||||
asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \
|
asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \
|
asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
asset_list["post_code"].astype(str).str.lower().str.strip()
|
asset_list["post_code"].astype(str).str.lower().str.strip()
|
||||||
asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip()
|
asset_list["matching_postcode"] = asset_list["post_code"].astype(str).str.lower().str.strip()
|
||||||
elif ha_name == "HA107":
|
elif ha_name == "HA107":
|
||||||
# Create matching_address by concatenating House No, Street, Town, District, Postcode
|
# Create matching_address by concatenating House No, Street, Town, District, Postcode
|
||||||
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
|
asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \
|
||||||
|
|
@ -1098,8 +1099,8 @@ class DataLoader:
|
||||||
self.december_figures = pd.read_csv(self.december_figures_filepath)
|
self.december_figures = pd.read_csv(self.december_figures_filepath)
|
||||||
# Remove the spaces in HA Name
|
# Remove the spaces in HA Name
|
||||||
self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "")
|
self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "")
|
||||||
self.december_figures["ECO4"] = self.december_figures["ECO4"].astype("Int64")
|
for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]:
|
||||||
self.december_figures["GBIS"] = self.december_figures["GBIS"].astype("Int64")
|
self.december_figures[col] = self.december_figures[col].astype("Int64")
|
||||||
|
|
||||||
if self.use_cache:
|
if self.use_cache:
|
||||||
self.data = read_pickle_from_s3(
|
self.data = read_pickle_from_s3(
|
||||||
|
|
@ -1203,7 +1204,6 @@ class DataLoader:
|
||||||
# Update the asset list with the categorisations and rename changes
|
# Update the asset list with the categorisations and rename changes
|
||||||
if asset_list.shape[0] != asset_list_starting_size:
|
if asset_list.shape[0] != asset_list_starting_size:
|
||||||
raise ValueError("The asset list has changed in size")
|
raise ValueError("The asset list has changed in size")
|
||||||
self.data[ha_name]["asset_list"] = asset_list
|
|
||||||
|
|
||||||
# Report on sales
|
# Report on sales
|
||||||
sales_report = {}
|
sales_report = {}
|
||||||
|
|
@ -1259,7 +1259,31 @@ class DataLoader:
|
||||||
)
|
)
|
||||||
|
|
||||||
# We get the sales
|
# We get the sales
|
||||||
sales_report = survey_list["installation_status"].value_counts().to_dict()
|
sales_report = {
|
||||||
|
"ECO4 - surveys sold": survey_list.shape[0],
|
||||||
|
**survey_list["installation_status"].value_counts().to_dict()
|
||||||
|
}
|
||||||
|
|
||||||
|
# We find some cases where properties have sold but are missing CIGA checks
|
||||||
|
survey_list_to_merge = survey_list[["asset_list_row_id"]].copy()
|
||||||
|
survey_list_to_merge["has_a_survey_record"] = True
|
||||||
|
survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])]
|
||||||
|
|
||||||
|
asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id")
|
||||||
|
asset_list["ECO Eligibility"] = np.where(
|
||||||
|
(asset_list["ECO Eligibility"] == "eco4 (subject to ciga)") & (
|
||||||
|
asset_list["has_a_survey_record"] == True
|
||||||
|
),
|
||||||
|
"eco4 - passed ciga",
|
||||||
|
asset_list["ECO Eligibility"]
|
||||||
|
)
|
||||||
|
asset_list = asset_list.drop(columns=["has_a_survey_record"])
|
||||||
|
|
||||||
|
# Update the survey list with installation status
|
||||||
|
self.data[ha_name]["survey_list"] = survey_list
|
||||||
|
|
||||||
|
# Insert updated asset list
|
||||||
|
self.data[ha_name]["asset_list"] = asset_list
|
||||||
|
|
||||||
ha_facts_and_figures.append(
|
ha_facts_and_figures.append(
|
||||||
{
|
{
|
||||||
|
|
@ -1687,7 +1711,21 @@ def analyse_ha_data(outputs, loader):
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
eco4_rate = 1710
|
||||||
|
gbis_rate = 600
|
||||||
|
old_eco4_rate = 1456
|
||||||
|
old_gbis_rate = 432
|
||||||
|
|
||||||
|
epc_c_threshold = 80
|
||||||
|
scheme_map = {
|
||||||
|
"ECO4": "ECO4",
|
||||||
|
"AFFORDABLE WARMTH": "ECO4",
|
||||||
|
"ECO4 A/W": "ECO4",
|
||||||
|
"ECO4 GBIS (ECO+)": "GBIS"
|
||||||
|
}
|
||||||
|
|
||||||
ha_analysis_results = []
|
ha_analysis_results = []
|
||||||
|
total_revenue_results = []
|
||||||
for ha_name, datasets in outputs.items():
|
for ha_name, datasets in outputs.items():
|
||||||
inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
||||||
|
|
||||||
|
|
@ -1702,6 +1740,88 @@ def analyse_ha_data(outputs, loader):
|
||||||
left_on="asset_list_row_id"
|
left_on="asset_list_row_id"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
analysis_data["is_remaining"] = True
|
||||||
|
|
||||||
|
n_sold_eco4 = 0
|
||||||
|
n_sold_gbis = 0
|
||||||
|
if not inputs["survey_list"].empty:
|
||||||
|
# Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had
|
||||||
|
# a survey)
|
||||||
|
survey_list = inputs["survey_list"].copy()
|
||||||
|
|
||||||
|
# TODO: TEMP
|
||||||
|
scheme_column = survey_list.columns[0]
|
||||||
|
# We clean up the survey list installation or cancelled
|
||||||
|
survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
|
||||||
|
# Remove all punctuation
|
||||||
|
survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
|
||||||
|
r'[^\w\s]', '', regex=True
|
||||||
|
)
|
||||||
|
# Remove double spaces
|
||||||
|
survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
|
||||||
|
r'\s+', ' ', regex=True
|
||||||
|
)
|
||||||
|
# Remove trailing spaces
|
||||||
|
survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip()
|
||||||
|
|
||||||
|
# Remap the values in the scheme column
|
||||||
|
survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map)
|
||||||
|
|
||||||
|
survey_list["installation_status"] = None
|
||||||
|
survey_list["installation_status"] = np.where(
|
||||||
|
survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
|
||||||
|
"installed",
|
||||||
|
survey_list["installation_status"]
|
||||||
|
)
|
||||||
|
survey_list["installation_status"] = np.where(
|
||||||
|
survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
|
||||||
|
"cancelled",
|
||||||
|
survey_list["installation_status"]
|
||||||
|
)
|
||||||
|
# Find partial installations
|
||||||
|
survey_list["installation_status"] = np.where(
|
||||||
|
survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
|
||||||
|
"partially installed",
|
||||||
|
survey_list["installation_status"]
|
||||||
|
)
|
||||||
|
# Find partial cancellations
|
||||||
|
# TODO: We might have more indications of partial cancellations
|
||||||
|
survey_list["installation_status"] = np.where(
|
||||||
|
survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
|
||||||
|
"partially cancelled",
|
||||||
|
survey_list["installation_status"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Finally, for other cases, we set the status to "in progress"
|
||||||
|
survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")
|
||||||
|
|
||||||
|
# We concatenate the scheme name with the installation status
|
||||||
|
survey_list["installation_status"] = (
|
||||||
|
survey_list[scheme_column] + " - " + survey_list["installation_status"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO: END TEMP
|
||||||
|
|
||||||
|
survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy()
|
||||||
|
survey_list_to_merge["is_remaining"] = False
|
||||||
|
analysis_data = analysis_data.drop(columns="is_remaining").merge(
|
||||||
|
survey_list_to_merge,
|
||||||
|
how="left", on="asset_list_row_id"
|
||||||
|
)
|
||||||
|
analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True)
|
||||||
|
|
||||||
|
n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0]
|
||||||
|
n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0]
|
||||||
|
|
||||||
|
# Take just remaining
|
||||||
|
analysis_data = analysis_data[analysis_data["is_remaining"]]
|
||||||
|
|
||||||
|
# Also, if the HA has started selling, we remove any that are still subject to ciga
|
||||||
|
n_eco4_missed_subject_to_ciga = 0
|
||||||
|
if not inputs["survey_list"].empty:
|
||||||
|
n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum()
|
||||||
|
analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"]
|
||||||
|
|
||||||
################################################################################################
|
################################################################################################
|
||||||
# We take the properties that strictly qualified under eco
|
# We take the properties that strictly qualified under eco
|
||||||
################################################################################################
|
################################################################################################
|
||||||
|
|
@ -1714,8 +1834,11 @@ def analyse_ha_data(outputs, loader):
|
||||||
eco4_identified["identification_type"]
|
eco4_identified["identification_type"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# For expansive, the property can be no higher than an EPC C
|
||||||
eco4_identified["identification_type"] = np.where(
|
eco4_identified["identification_type"] = np.where(
|
||||||
(eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False),
|
(eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & (
|
||||||
|
eco4_identified["sap"] <= epc_c_threshold
|
||||||
|
),
|
||||||
"expansive",
|
"expansive",
|
||||||
eco4_identified["identification_type"]
|
eco4_identified["identification_type"]
|
||||||
)
|
)
|
||||||
|
|
@ -1743,21 +1866,17 @@ def analyse_ha_data(outputs, loader):
|
||||||
"Meets fabric, fails SAP check",
|
"Meets fabric, fails SAP check",
|
||||||
"Meets cavity, loft borderline, meets sap",
|
"Meets cavity, loft borderline, meets sap",
|
||||||
]
|
]
|
||||||
),
|
) & (ciga_dependent_identified["sap"] <= epc_c_threshold),
|
||||||
"strict",
|
"strict",
|
||||||
ciga_dependent_identified["identification_type"]
|
ciga_dependent_identified["identification_type"]
|
||||||
)
|
)
|
||||||
|
|
||||||
ciga_dependent_identified["identification_type"] = np.where(
|
ciga_dependent_identified["identification_type"] = np.where(
|
||||||
(ciga_dependent_identified["eco4_message"].isin(["All conditions fail", "failed fabric check"])) &
|
((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | (
|
||||||
(ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])),
|
|
||||||
"expansive",
|
|
||||||
ciga_dependent_identified["identification_type"]
|
|
||||||
)
|
|
||||||
|
|
||||||
ciga_dependent_identified["identification_type"] = np.where(
|
|
||||||
(ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | (
|
|
||||||
ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])
|
ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])
|
||||||
|
)) & (
|
||||||
|
(ciga_dependent_identified["sap"] <= epc_c_threshold) &
|
||||||
|
pd.isnull(ciga_dependent_identified["identification_type"])
|
||||||
),
|
),
|
||||||
"expansive",
|
"expansive",
|
||||||
ciga_dependent_identified["identification_type"]
|
ciga_dependent_identified["identification_type"]
|
||||||
|
|
@ -1775,7 +1894,9 @@ def analyse_ha_data(outputs, loader):
|
||||||
)
|
)
|
||||||
|
|
||||||
gbis_identified["identification_type"] = np.where(
|
gbis_identified["identification_type"] = np.where(
|
||||||
(gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] >= 69),
|
(gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & (
|
||||||
|
pd.isnull(gbis_identified["identification_type"])
|
||||||
|
),
|
||||||
"expansive",
|
"expansive",
|
||||||
gbis_identified["identification_type"]
|
gbis_identified["identification_type"]
|
||||||
)
|
)
|
||||||
|
|
@ -1806,9 +1927,16 @@ def analyse_ha_data(outputs, loader):
|
||||||
]
|
]
|
||||||
surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False]
|
surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False]
|
||||||
|
|
||||||
# Output variables
|
# Output variables - the data was sent to us in December, but the remaining figures are
|
||||||
|
# what was in November
|
||||||
|
november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name]
|
||||||
|
|
||||||
# ECO4
|
# ECO4
|
||||||
n_properties_in_asset_list = inputs["asset_list"].shape[0]
|
n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0]
|
||||||
|
november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0)
|
||||||
|
november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0]
|
||||||
|
eco4_sales_since_november = n_sold_eco4 - november_eco4_sold
|
||||||
|
|
||||||
n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0]
|
n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0]
|
||||||
eco4_of_which_identified_strict = (
|
eco4_of_which_identified_strict = (
|
||||||
eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] +
|
eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] +
|
||||||
|
|
@ -1820,26 +1948,37 @@ def analyse_ha_data(outputs, loader):
|
||||||
)
|
)
|
||||||
# GBIS
|
# GBIS
|
||||||
n_warmfront_identified_gbis = gbis_identified.shape[0]
|
n_warmfront_identified_gbis = gbis_identified.shape[0]
|
||||||
|
november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0)
|
||||||
|
november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0]
|
||||||
|
gbis_sales_since_november = n_sold_gbis - november_gbis_sold
|
||||||
gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0]
|
gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0]
|
||||||
gbis_of_which_identified_expansive = \
|
gbis_of_which_identified_expansive = \
|
||||||
gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0]
|
gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0]
|
||||||
|
|
||||||
to_append = {
|
to_append = {
|
||||||
("", "HA Name"): ha_name,
|
("", "HA Name"): ha_name,
|
||||||
("", "# Properties in asset list"): n_properties_in_asset_list,
|
("", "# properties in asset list"): n_properties_remaining_in_asset_list,
|
||||||
############
|
############
|
||||||
# ECO4
|
# ECO4
|
||||||
############
|
############
|
||||||
("ECO4", "# Properties identieid by Warmfront"): n_warmfront_identified_eco4,
|
("ECO4", "# remaining November file"): november_eco4_remaining,
|
||||||
|
("ECO4", "# sold in November file"): november_eco4_sold,
|
||||||
|
("ECO4", "# sold (survey list)"): n_sold_eco4,
|
||||||
|
("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga,
|
||||||
|
("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4,
|
||||||
("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict,
|
("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict,
|
||||||
("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive,
|
("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive,
|
||||||
("ECO4", "Of which identified by model - total"): (
|
("ECO4", "Of which identified by model - total"): (
|
||||||
eco4_of_which_identified_strict + eco4_of_which_identified_expansive),
|
eco4_of_which_identified_strict + eco4_of_which_identified_expansive
|
||||||
|
),
|
||||||
("ECO4", "Additional properties"): surplus_eco4.shape[0],
|
("ECO4", "Additional properties"): surplus_eco4.shape[0],
|
||||||
############
|
############
|
||||||
# GBIS
|
# GBIS
|
||||||
############
|
############
|
||||||
("GBIS", "# Properties identieid by Warmfront"): n_warmfront_identified_gbis,
|
("GBIS", "# remaining November file"): november_gbis_remaining,
|
||||||
|
("GBIS", "# sold in November file"): november_gbis_sold,
|
||||||
|
("GBIS", "# sold (survey list)"): n_sold_gbis,
|
||||||
|
("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis,
|
||||||
("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict,
|
("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict,
|
||||||
("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive,
|
("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive,
|
||||||
("GBIS", "Of which identified by model - total"): (
|
("GBIS", "Of which identified by model - total"): (
|
||||||
|
|
@ -1850,6 +1989,24 @@ def analyse_ha_data(outputs, loader):
|
||||||
|
|
||||||
ha_analysis_results.append(to_append)
|
ha_analysis_results.append(to_append)
|
||||||
|
|
||||||
|
# Calculate the revenue results
|
||||||
|
to_append_revenue = {
|
||||||
|
("", "HA Name"): ha_name,
|
||||||
|
# Eco4 revenue
|
||||||
|
("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate,
|
||||||
|
("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate,
|
||||||
|
("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate,
|
||||||
|
("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate,
|
||||||
|
("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate,
|
||||||
|
("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate,
|
||||||
|
("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate,
|
||||||
|
("ECO4", "Of which identified by model - total"): eco4_rate * (
|
||||||
|
eco4_of_which_identified_strict + eco4_of_which_identified_expansive
|
||||||
|
),
|
||||||
|
("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0],
|
||||||
|
}
|
||||||
|
total_revenue_results.append(to_append_revenue)
|
||||||
|
|
||||||
ha_analysis_results = pd.DataFrame(ha_analysis_results)
|
ha_analysis_results = pd.DataFrame(ha_analysis_results)
|
||||||
ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns)
|
ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns)
|
||||||
|
|
||||||
|
|
@ -1862,8 +2019,8 @@ def analyse_ha_data(outputs, loader):
|
||||||
facts_and_figures = facts_and_figures.rename(
|
facts_and_figures = facts_and_figures.rename(
|
||||||
columns={
|
columns={
|
||||||
# ECO4 cols
|
# ECO4 cols
|
||||||
"ECO4": "ECO4 - December",
|
"ECO4": "ECO4 - November",
|
||||||
"GBIS": "GBIS - December",
|
"GBIS": "GBIS - November",
|
||||||
"eco4 (subject to ciga)": "ECO4 - subject to ciga",
|
"eco4 (subject to ciga)": "ECO4 - subject to ciga",
|
||||||
"eco4": "ECO4 - doesn't need CIGA",
|
"eco4": "ECO4 - doesn't need CIGA",
|
||||||
"eco4 - passed ciga": "ECO4 - passed CIGA",
|
"eco4 - passed ciga": "ECO4 - passed CIGA",
|
||||||
|
|
@ -1880,19 +2037,27 @@ def analyse_ha_data(outputs, loader):
|
||||||
# ECO4 - doesn't need CIGA + ECO4 - passed CIGA
|
# ECO4 - doesn't need CIGA + ECO4 - passed CIGA
|
||||||
# 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is
|
# 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is
|
||||||
# ECO4 - doesn't need CIGA + ECO4 - subject to ciga
|
# ECO4 - doesn't need CIGA + ECO4 - subject to ciga
|
||||||
facts_and_figures["ECO4 total (asset list)"] = np.where(
|
facts_and_figures["ECO4 total (asset list - pre ciga)"] = (
|
||||||
|
facts_and_figures["ECO4 - doesn't need CIGA"] +
|
||||||
|
facts_and_figures["ECO4 - subject to ciga"] +
|
||||||
|
facts_and_figures["ECO4 - passed CIGA"]
|
||||||
|
)
|
||||||
|
|
||||||
|
facts_and_figures["ECO4 total (asset list - post ciga)"] = None
|
||||||
|
facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where(
|
||||||
facts_and_figures["ECO4 - passed CIGA"] > 0,
|
facts_and_figures["ECO4 - passed CIGA"] > 0,
|
||||||
facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"],
|
facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"],
|
||||||
facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - subject to ciga"]
|
facts_and_figures["ECO4 total (asset list - post ciga)"]
|
||||||
)
|
)
|
||||||
|
|
||||||
# Re-arrange the columns
|
# Re-arrange the columns
|
||||||
facts_and_figures = facts_and_figures[
|
facts_and_figures = facts_and_figures[
|
||||||
[
|
[
|
||||||
'HA Name',
|
'HA Name',
|
||||||
'ECO4 - December',
|
'ECO4 - November',
|
||||||
'GBIS - December',
|
'GBIS - November',
|
||||||
'ECO4 total (asset list)',
|
'ECO4 total (asset list - pre ciga)',
|
||||||
|
'ECO4 total (asset list - post ciga)',
|
||||||
'GBIS total (asset list)',
|
'GBIS total (asset list)',
|
||||||
'ECO4 - subject to ciga',
|
'ECO4 - subject to ciga',
|
||||||
"ECO4 - doesn't need CIGA",
|
"ECO4 - doesn't need CIGA",
|
||||||
|
|
@ -1916,6 +2081,8 @@ def analyse_ha_data(outputs, loader):
|
||||||
facts_and_figures["Missed CIGA checks opportunity"]
|
facts_and_figures["Missed CIGA checks opportunity"]
|
||||||
)
|
)
|
||||||
|
|
||||||
|
facts_and_figures.to_csv("Facts and figures sample.csv")
|
||||||
|
|
||||||
# Re arrage the columns
|
# Re arrage the columns
|
||||||
|
|
||||||
# Also sort ha_analysis_results by ha number
|
# Also sort ha_analysis_results by ha number
|
||||||
|
|
@ -1937,6 +2104,333 @@ def analyse_ha_data(outputs, loader):
|
||||||
for i, width in enumerate(get_col_widths(df)):
|
for i, width in enumerate(get_col_widths(df)):
|
||||||
writer.sheets[sheet].set_column(i, i, width)
|
writer.sheets[sheet].set_column(i, i, width)
|
||||||
|
|
||||||
|
# Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their
|
||||||
|
# description, and what proportion of time they get identified via non-invasive surveys
|
||||||
|
|
||||||
|
# true_eco4_assets = []
|
||||||
|
# ciga_dependent_assets = []
|
||||||
|
# not_eligible = []
|
||||||
|
# as_built_insulated = []
|
||||||
|
# date_cols = {
|
||||||
|
# "HA39": "date_built",
|
||||||
|
# "HA14": "Built In Year",
|
||||||
|
# "HA6": "Construction Year",
|
||||||
|
# "HA1": "Build Date",
|
||||||
|
# "HA107": "YEAR BUILT"
|
||||||
|
# }
|
||||||
|
# for ha_name, data_objects in outputs.items():
|
||||||
|
# inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
||||||
|
#
|
||||||
|
# date_col = date_cols[ha_name]
|
||||||
|
# results_df = data_objects["results_df"].copy()
|
||||||
|
# df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename(
|
||||||
|
# columns={"row_meaning": "asset_identification_status", date_col: "date_built"}
|
||||||
|
# ).merge(
|
||||||
|
# results_df,
|
||||||
|
# how="left",
|
||||||
|
# right_on="row_id",
|
||||||
|
# left_on="asset_list_row_id"
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# # take the true ECO4
|
||||||
|
# true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy()
|
||||||
|
# ciga_dependent = df[
|
||||||
|
# df["ECO Eligibility"].isin(
|
||||||
|
# [
|
||||||
|
# "eco4 (subject to ciga)",
|
||||||
|
# "failed ciga",
|
||||||
|
# "eco4 - passed ciga"
|
||||||
|
# ]
|
||||||
|
# )
|
||||||
|
# ]
|
||||||
|
# insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy()
|
||||||
|
# # We convert date built to datetime
|
||||||
|
# try:
|
||||||
|
# insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])]
|
||||||
|
# insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year
|
||||||
|
# as_built_insulated.append(insulated_assumed)
|
||||||
|
# except Exception as e:
|
||||||
|
# print("oh well")
|
||||||
|
#
|
||||||
|
# true_eco4_assets.append(true_eco4)
|
||||||
|
# ciga_dependent_assets.append(ciga_dependent)
|
||||||
|
#
|
||||||
|
# true_eco4_assets = pd.concat(true_eco4_assets)
|
||||||
|
# ciga_dependent_assets = pd.concat(ciga_dependent_assets)
|
||||||
|
# as_built_insulated = pd.concat(as_built_insulated)
|
||||||
|
#
|
||||||
|
# true_eco4_assets["walls"].value_counts(normalize=True)
|
||||||
|
# ciga_dependent_assets["walls"].value_counts(normalize=True)
|
||||||
|
#
|
||||||
|
# from recommendations.recommendation_utils import extract_insulation_thickness
|
||||||
|
#
|
||||||
|
# true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply(
|
||||||
|
# lambda x: extract_insulation_thickness(x)
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# true_eco4_assets["e"] = true_eco4_assets.merge(
|
||||||
|
# pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]],
|
||||||
|
# how="left",
|
||||||
|
# left_on="roof",
|
||||||
|
# right_on="original_description"
|
||||||
|
# )
|
||||||
|
#
|
||||||
|
# true_eco4_assets["sap"].mean()
|
||||||
|
#
|
||||||
|
# true_eco4_assets["insulation_thickness"].isin(
|
||||||
|
# ["250", "150", "200", "100", "75", "50"]
|
||||||
|
# ).sum() / true_eco4_assets.shape[0]
|
||||||
|
#
|
||||||
|
# true_eco4_assets["insulation_thickness"].isin(
|
||||||
|
# ["100"]
|
||||||
|
# ).sum() / true_eco4_assets.shape[0]
|
||||||
|
#
|
||||||
|
# as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True)
|
||||||
|
|
||||||
|
|
||||||
|
def get_propensity_model_data(
|
||||||
|
loader, cleaned, cleaning_data, created_at, photo_supply_lookup,
|
||||||
|
floor_area_decile_thresholds, pull_data=True
|
||||||
|
):
|
||||||
|
# TODO: Set a seed!
|
||||||
|
model_data = []
|
||||||
|
for ha_name, data_assets in loader.data.items():
|
||||||
|
|
||||||
|
logger.info("Processing HA: %s", ha_name)
|
||||||
|
if data_assets["survey_list"].empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
number_sold = data_assets["survey_list"].shape[0]
|
||||||
|
|
||||||
|
# For each HA, we read pull in the data required, and store in S3
|
||||||
|
asset_list = data_assets["asset_list"].copy()
|
||||||
|
# We determine the number of properties that we should select that are eligible
|
||||||
|
asset_list_size = asset_list.shape[0]
|
||||||
|
# Number eligible
|
||||||
|
n_eligibile = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
|
||||||
|
success_rate = n_eligibile / asset_list_size
|
||||||
|
needed_sample_size = np.ceil(number_sold / success_rate)
|
||||||
|
number_negative_samples = int(needed_sample_size - number_sold)
|
||||||
|
|
||||||
|
sold_asset_list_ids = data_assets["survey_list"]["asset_list_row_id"].tolist()
|
||||||
|
negative_sample_asset_list_ids = asset_list["asset_list_row_id"].sample(number_negative_samples).tolist()
|
||||||
|
sample_ids = sold_asset_list_ids + negative_sample_asset_list_ids
|
||||||
|
|
||||||
|
sample_asset_list = asset_list[asset_list["asset_list_row_id"].isin(sample_ids)]
|
||||||
|
|
||||||
|
# In order to have the most confidence, we should take just properties that have 1 EPC. We might need to
|
||||||
|
# cut down the number of properties that we include because of this
|
||||||
|
# Note: This is an imbalanced problem so we will need to build a model accomadating of that
|
||||||
|
|
||||||
|
data = []
|
||||||
|
errors = []
|
||||||
|
for index, property_meta in tqdm(sample_asset_list.iterrows(), total=len(sample_asset_list)):
|
||||||
|
|
||||||
|
if property_meta["matching_postcode"] is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
property_type, built_form = get_property_type_and_built_form(
|
||||||
|
property_meta=property_meta, ha_name=ha_name
|
||||||
|
)
|
||||||
|
|
||||||
|
searcher = SearchEpc(
|
||||||
|
address1=str(property_meta["HouseNo"]),
|
||||||
|
postcode=property_meta["matching_postcode"],
|
||||||
|
auth_token=EPC_AUTH_TOKEN,
|
||||||
|
os_api_key="",
|
||||||
|
full_address=property_meta["matching_address"]
|
||||||
|
)
|
||||||
|
searcher.ordnance_survey_client.property_type = property_type
|
||||||
|
searcher.ordnance_survey_client.built_form = built_form
|
||||||
|
searcher.find_property(skip_os=True)
|
||||||
|
|
||||||
|
if searcher.newest_epc is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if searcher.newest_epc.get("estimated"):
|
||||||
|
# We insert the row ID as our proxy for UPRN
|
||||||
|
searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
|
||||||
|
|
||||||
|
newest_epc = searcher.newest_epc
|
||||||
|
older_epcs = searcher.older_epcs
|
||||||
|
full_sap_epc = searcher.full_sap_epc
|
||||||
|
|
||||||
|
# If we have more than 1 EPC for the moment we just continue
|
||||||
|
if older_epcs or full_sap_epc:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
|
||||||
|
# We clean up the data
|
||||||
|
epc_records = {
|
||||||
|
'original_epc': newest_epc.copy(),
|
||||||
|
'full_sap_epc': full_sap_epc.copy(),
|
||||||
|
'old_data': older_epcs.copy(),
|
||||||
|
}
|
||||||
|
|
||||||
|
epc_record = EPCRecord(
|
||||||
|
epc_records=epc_records,
|
||||||
|
run_mode="newdata",
|
||||||
|
cleaning_data=cleaning_data
|
||||||
|
)
|
||||||
|
|
||||||
|
# If we have some data, continue
|
||||||
|
data.append(
|
||||||
|
{
|
||||||
|
"ECO Eligibility": property_meta["ECO Eligibility"],
|
||||||
|
"asset_list_row_id": property_meta["asset_list_row_id"],
|
||||||
|
**epc_record.get("prepared_epc")
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(
|
||||||
|
{
|
||||||
|
"error": str(e),
|
||||||
|
"asset_list_row_id": property_meta["asset_list_row_id"],
|
||||||
|
"matching_postcode": property_meta["matching_postcode"],
|
||||||
|
"matching_address": property_meta["matching_address"]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
data = pd.DataFrame(data)
|
||||||
|
# We store the results in S3 as a pickle
|
||||||
|
save_pickle_to_s3(
|
||||||
|
data=data,
|
||||||
|
bucket_name="retrofit-datalake-dev",
|
||||||
|
s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Store the errors
|
||||||
|
if errors:
|
||||||
|
save_pickle_to_s3(
|
||||||
|
data=errors,
|
||||||
|
bucket_name="retrofit-datalake-dev",
|
||||||
|
s3_file_name=f"propensity_model_data/{ha_name}/errors.pickle"
|
||||||
|
)
|
||||||
|
|
||||||
|
model_data.append(data)
|
||||||
|
|
||||||
|
return model_data
|
||||||
|
|
||||||
|
|
||||||
|
def conversion_model(loader):
|
||||||
|
# Read in the model data
|
||||||
|
|
||||||
|
model_data = []
|
||||||
|
for ha_name in loader.data.keys():
|
||||||
|
try:
|
||||||
|
picked = read_pickle_from_s3(
|
||||||
|
bucket_name="retrofit-datalake-dev",
|
||||||
|
s3_file_name=f"propensity_model_data/{ha_name}/train.pickle"
|
||||||
|
)
|
||||||
|
data = pd.DataFrame(picked)
|
||||||
|
|
||||||
|
# We merge on the sales data
|
||||||
|
sales_data = loader.data[ha_name]["survey_list"].copy()
|
||||||
|
data = data.merge(
|
||||||
|
sales_data[["asset_list_row_id", "installation_status"]],
|
||||||
|
how="left",
|
||||||
|
on="asset_list_row_id"
|
||||||
|
)
|
||||||
|
data["ha_name"] = ha_name
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Error reading in the data for %s", ha_name)
|
||||||
|
continue
|
||||||
|
|
||||||
|
model_data.append(data)
|
||||||
|
|
||||||
|
model_data = pd.concat(model_data)
|
||||||
|
|
||||||
|
model_data["response"] = model_data["installation_status"].isin(
|
||||||
|
[
|
||||||
|
"ECO4 - in progress",
|
||||||
|
"ECO4 - installed"
|
||||||
|
]
|
||||||
|
).astype(int)
|
||||||
|
|
||||||
|
# Because of how we pulled the data, we need to re-balance the sample
|
||||||
|
ha_names = model_data["ha_name"].unique()
|
||||||
|
|
||||||
|
balanced_sample = []
|
||||||
|
for ha_name in ha_names:
|
||||||
|
df = model_data[model_data["ha_name"] == ha_name]
|
||||||
|
positive_samples = df[df["response"] == 1]
|
||||||
|
negative_samples = df[df["response"] != 1]
|
||||||
|
|
||||||
|
inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
||||||
|
asset_list = inputs["asset_list"].copy()
|
||||||
|
asset_list_size = asset_list.shape[0]
|
||||||
|
n_eligibile = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0]
|
||||||
|
success_rate = n_eligibile / asset_list_size
|
||||||
|
needed_sample_size = np.ceil(positive_samples.shape[0] / success_rate)
|
||||||
|
number_negative_samples = int(needed_sample_size - positive_samples.shape[0])
|
||||||
|
negative_samples_subset = negative_samples.sample(number_negative_samples)
|
||||||
|
|
||||||
|
output = pd.concat([positive_samples, negative_samples_subset])
|
||||||
|
|
||||||
|
balanced_sample.append(output)
|
||||||
|
|
||||||
|
balanced_sample = pd.concat(balanced_sample)
|
||||||
|
|
||||||
|
# We work with a small sample
|
||||||
|
# Drop the ECO Eligibility column and installation_status column
|
||||||
|
# We keep the ID column
|
||||||
|
balanced_sample = balanced_sample.drop(
|
||||||
|
columns=['ECO Eligibility', 'asset_list_row_id', 'address', 'uprn_source', 'address3', 'local_authority_label',
|
||||||
|
'county', 'postcode', 'constituency', 'local_authority', 'inspection_date', 'address1',
|
||||||
|
'constituency_label', 'building_reference_number', 'address2', 'posttown', 'lodgement_datetime',
|
||||||
|
'uprn', 'lodgement_date', 'lmk_key', 'installation_status', 'ha_name']
|
||||||
|
)
|
||||||
|
|
||||||
|
# POC model
|
||||||
|
df = balanced_sample.copy()
|
||||||
|
# FIll missings with means, if they exist
|
||||||
|
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
|
||||||
|
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
|
||||||
|
|
||||||
|
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
|
||||||
|
df[categorical_cols] = df[categorical_cols].fillna("other")
|
||||||
|
|
||||||
|
# Reduce the number of categories to a specific number and the rest to other
|
||||||
|
max_n_categories = 10
|
||||||
|
for col in categorical_cols:
|
||||||
|
top_categories = df[col].value_counts().nlargest(max_n_categories).index
|
||||||
|
df[col] = df[col].where(df[col].isin(top_categories), other="other")
|
||||||
|
|
||||||
|
# Use a model based approach to feature selection
|
||||||
|
import xgboost as xgb
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
# Assuming your outcome column is named 'target'
|
||||||
|
X = df.drop(columns=['response'])
|
||||||
|
y = df['response']
|
||||||
|
df["low_energy_fixed_light_count"].va
|
||||||
|
|
||||||
|
# Encoding categorical variables if not already done
|
||||||
|
X = pd.get_dummies(X, drop_first=True)
|
||||||
|
|
||||||
|
# Splitting the data into train and test sets
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
|
||||||
|
|
||||||
|
# Initialize an XGBoost classifier
|
||||||
|
model = xgb.XGBClassifier()
|
||||||
|
|
||||||
|
# Fit the model
|
||||||
|
model.fit(X_train, y_train)
|
||||||
|
|
||||||
|
# Get feature importances
|
||||||
|
feature_importances = model.feature_importances_
|
||||||
|
|
||||||
|
# Map feature importances to their corresponding column names
|
||||||
|
feature_importance_dict = {feature: importance for feature, importance in zip(X.columns, feature_importances)}
|
||||||
|
|
||||||
|
# Sort features by importance
|
||||||
|
sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)
|
||||||
|
|
||||||
|
# Display sorted features
|
||||||
|
for feature, importance in sorted_features:
|
||||||
|
print(f"{feature}: {importance}")
|
||||||
|
|
||||||
|
|
||||||
def patch_cleaned(cleaned):
|
def patch_cleaned(cleaned):
|
||||||
# Patch to handle the a missing description
|
# Patch to handle the a missing description
|
||||||
|
|
@ -2054,6 +2548,218 @@ def patch_cleaned(cleaned):
|
||||||
return cleaned
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def forecast_remaining_sales(loader):
|
||||||
|
# Assumptions:
|
||||||
|
# We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate
|
||||||
|
# and I don't want the numbers to change too much, depenent on the CIGA conversation rate
|
||||||
|
maximum_ciga_conversion = 0.75
|
||||||
|
|
||||||
|
gbis_rate = 600
|
||||||
|
eco4_rate = 1710
|
||||||
|
old_gbis_rate = 432
|
||||||
|
old_eco4_rate = 1456
|
||||||
|
|
||||||
|
# 1) Calculate the conversion rate from passed CIGA to actual sale
|
||||||
|
converted_ciga_jobs = []
|
||||||
|
for ha_name, input_data in loader.data.items():
|
||||||
|
asset_list = input_data["asset_list"].copy()
|
||||||
|
survey_list = input_data["survey_list"].copy()
|
||||||
|
|
||||||
|
if survey_list.empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
ciga_dependent_assets = asset_list[
|
||||||
|
asset_list["ECO Eligibility"] == "eco4 - passed ciga"
|
||||||
|
]
|
||||||
|
|
||||||
|
# These are now the ciga dependent assets at installation
|
||||||
|
ciga_dependent_assets_at_installation = ciga_dependent_assets.merge(
|
||||||
|
survey_list[["asset_list_row_id", "installation_status"]],
|
||||||
|
how="inner",
|
||||||
|
on="asset_list_row_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
# We then calculate how many get cancelled
|
||||||
|
ciga_dependent_assets_sold = ciga_dependent_assets_at_installation[
|
||||||
|
ciga_dependent_assets_at_installation["installation_status"].isin(
|
||||||
|
[
|
||||||
|
"ECO4 - installed", "ECO4 - in progress"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
ciga_dependent_assets_failed = ciga_dependent_assets_at_installation[
|
||||||
|
~ciga_dependent_assets_at_installation["installation_status"].isin(
|
||||||
|
[
|
||||||
|
"ECO4 - installed", "ECO4 - in progress"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
converted_ciga_jobs.append(
|
||||||
|
{
|
||||||
|
"HA Name": ha_name,
|
||||||
|
"# Ciga dependent at installation": ciga_dependent_assets_at_installation.shape[0],
|
||||||
|
"# Ciga dependent successfully installed": ciga_dependent_assets_sold.shape[0],
|
||||||
|
"# Ciga dependent failed install": ciga_dependent_assets_failed.shape[0]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
converted_ciga_jobs = pd.DataFrame(converted_ciga_jobs)
|
||||||
|
|
||||||
|
# We calculate a ciga pass to install conversaion rate
|
||||||
|
median_ciga_pass_to_install = (
|
||||||
|
converted_ciga_jobs["# Ciga dependent successfully installed"].sum() /
|
||||||
|
converted_ciga_jobs["# Ciga dependent at installation"].sum()
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2) Calculate the conversion rate from CIGA dependent ciga passed
|
||||||
|
ciga_passrates = []
|
||||||
|
for ha_name, input_data in loader.data.items():
|
||||||
|
|
||||||
|
# If we don't have a ciga list, we can't do anything
|
||||||
|
if input_data["ciga_list"].empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# 1) Calculate the conversion rate for CIGA to actual sale
|
||||||
|
asset_list = input_data["asset_list"].copy()
|
||||||
|
|
||||||
|
ciga_completed_assets = asset_list[
|
||||||
|
asset_list["ECO Eligibility"].isin(
|
||||||
|
[
|
||||||
|
"eco4 - passed ciga",
|
||||||
|
"failed ciga"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
ciga_passed = ciga_completed_assets[
|
||||||
|
ciga_completed_assets["ECO Eligibility"].isin(
|
||||||
|
[
|
||||||
|
"eco4 - passed ciga"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
ciga_passrates.append(
|
||||||
|
{
|
||||||
|
"Ha Name": ha_name,
|
||||||
|
"# CIGA dependent": ciga_completed_assets.shape[0],
|
||||||
|
"# CIGA passed": ciga_passed.shape[0],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
ciga_passrates = pd.DataFrame(ciga_passrates)
|
||||||
|
|
||||||
|
median_ciga_pass_to_install = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum()
|
||||||
|
|
||||||
|
# 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install
|
||||||
|
eco4_ciga_independent_passrates = []
|
||||||
|
gbis_ciga_independent_passrates = []
|
||||||
|
for ha_name, input_data in loader.data.items():
|
||||||
|
asset_list = input_data["asset_list"].copy()
|
||||||
|
survey_list = input_data["survey_list"].copy()
|
||||||
|
|
||||||
|
if survey_list.empty:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# For properties that were identified as a typical ECO4 job, we calculate the number of properties that
|
||||||
|
# installed
|
||||||
|
# vs cancelled
|
||||||
|
|
||||||
|
typical_eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"]
|
||||||
|
typical_gbis = asset_list[asset_list["ECO Eligibility"] == "gbis"]
|
||||||
|
|
||||||
|
# Merge on the surveys
|
||||||
|
typical_eco4_installed = typical_eco4.merge(
|
||||||
|
survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not typical_eco4_installed.empty:
|
||||||
|
typical_eco4_sold = typical_eco4_installed[
|
||||||
|
typical_eco4_installed["installation_status"].isin(
|
||||||
|
[
|
||||||
|
"ECO4 - installed", "ECO4 - in progress"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
eco4_ciga_independent_passrates.append(
|
||||||
|
{
|
||||||
|
"Ha Name": ha_name,
|
||||||
|
"# ECO4 at install stage": typical_eco4_installed.shape[0],
|
||||||
|
"# ECO4 successfully installed": typical_eco4_sold.shape[0]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
typical_gbis_installed = typical_gbis.merge(
|
||||||
|
survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id"
|
||||||
|
)
|
||||||
|
if not typical_gbis_installed.empty:
|
||||||
|
typical_gbis_sold = typical_gbis_installed[
|
||||||
|
typical_gbis_installed["installation_status"].isin(
|
||||||
|
[
|
||||||
|
"GBIS - in progress", "GBIS - installed"
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
|
||||||
|
gbis_ciga_independent_passrates.append(
|
||||||
|
{
|
||||||
|
"Ha Name": ha_name,
|
||||||
|
"# GBIS at install stage": typical_gbis_installed.shape[0],
|
||||||
|
"# GBIS successfully installed": typical_gbis_sold.shape[0]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates)
|
||||||
|
gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates)
|
||||||
|
|
||||||
|
median_eco4_to_install = (
|
||||||
|
eco4_ciga_independent_passrates["# ECO4 successfully installed"].sum() /
|
||||||
|
eco4_ciga_independent_passrates["# ECO4 at install stage"].sum()
|
||||||
|
)
|
||||||
|
|
||||||
|
median_gbis_to_install = (
|
||||||
|
gbis_ciga_independent_passrates["# GBIS successfully installed"].sum() /
|
||||||
|
gbis_ciga_independent_passrates["# GBIS at install stage"].sum()
|
||||||
|
)
|
||||||
|
|
||||||
|
# Produce the final output
|
||||||
|
december_figures = loader.december_figures.copy()
|
||||||
|
december_figures = december_figures.fillna(0)
|
||||||
|
results = []
|
||||||
|
for ha_name, input_data in loader.data.items():
|
||||||
|
# Original warmfront figures
|
||||||
|
original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name]
|
||||||
|
|
||||||
|
original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0]
|
||||||
|
original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0]
|
||||||
|
original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0]
|
||||||
|
original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0]
|
||||||
|
|
||||||
|
original_warmfront_eco4_revenue = (
|
||||||
|
original_warmfront_remaining_eco4 * eco4_rate +
|
||||||
|
(original_warmfront_eco4 - original_warmfront_remaining_eco4) * old_eco4_rate
|
||||||
|
)
|
||||||
|
original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate
|
||||||
|
|
||||||
|
original_warmfront_gbis_revenue = (
|
||||||
|
original_warmfront_remaining_gbis * gbis_rate +
|
||||||
|
(original_warmfront_gbis - original_warmfront_remaining_gbis) * old_gbis_rate
|
||||||
|
)
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
{
|
||||||
|
("", "", "HA Name"): ha_name,
|
||||||
|
("Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4,
|
||||||
|
("", "Remaining - #", ""): original_warmfront_remaining_eco4,
|
||||||
|
("", "Total - £", ""): original_warmfront_eco4_revenue,
|
||||||
|
("", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def app():
|
def app():
|
||||||
"""
|
"""
|
||||||
This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107.
|
This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107.
|
||||||
|
|
@ -2067,11 +2773,14 @@ def app():
|
||||||
pull_data = False
|
pull_data = False
|
||||||
|
|
||||||
# List all of the data in the folder
|
# List all of the data in the folder
|
||||||
directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()]
|
|
||||||
|
directories = [str(file) for entry in DATA_FOLDER.iterdir() if entry.is_dir()
|
||||||
|
for file in entry.iterdir() if file.suffix == '.xlsx']
|
||||||
# Grab the December HA figures filepath
|
# Grab the December HA figures filepath
|
||||||
december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"
|
december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"
|
||||||
|
|
||||||
priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"]
|
# priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"]
|
||||||
|
priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"]
|
||||||
# Filter down the directories to only the priority HAs
|
# Filter down the directories to only the priority HAs
|
||||||
directories = [d for d in directories if d.split("/")[2] in priority_has]
|
directories = [d for d in directories if d.split("/")[2] in priority_has]
|
||||||
|
|
||||||
|
|
@ -2103,3 +2812,17 @@ def app():
|
||||||
floor_area_decile_thresholds=floor_area_decile_thresholds,
|
floor_area_decile_thresholds=floor_area_decile_thresholds,
|
||||||
pull_data=pull_data
|
pull_data=pull_data
|
||||||
)
|
)
|
||||||
|
|
||||||
|
analyse_ha_data(outputs, loader)
|
||||||
|
|
||||||
|
# import pickle
|
||||||
|
# with open("ha_analysis.pickle", "wb") as f:
|
||||||
|
# pickle.dump({"outputs": outputs, "loader": loader}, f)
|
||||||
|
|
||||||
|
# To read:
|
||||||
|
# import pickle
|
||||||
|
# with open("ha_analysis.pickle", "rb") as f:
|
||||||
|
# outputs = pickle.load(f)["outputs"]
|
||||||
|
#
|
||||||
|
# with open("loader.pickle", "rb") as f:
|
||||||
|
# loader = pickle.load(f)
|
||||||
|
|
|
||||||
|
|
@ -184,7 +184,7 @@ def read_pickle_from_s3(bucket_name, s3_file_name):
|
||||||
logger.errpr("Incomplete credentials provided.")
|
logger.errpr("Incomplete credentials provided.")
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.errpr(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}')
|
logger.error(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Deserialize data from pickle format
|
# Deserialize data from pickle format
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue