From 615f2289e758c136e73dfaac88d0ff906785f03a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 12:39:46 +0000 Subject: [PATCH 001/155] Debugging list loading --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- .../ha_15_32/ha_analysis_batch_3.py | 81 +++++++------------ 3 files changed, 29 insertions(+), 56 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index 4413bb06..b0f9c00d 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 6f308057..1122b380 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 92956337..7bb8b40c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -131,9 +131,17 @@ class DataLoader: return ciga_list + @staticmethod + def get_sheetname(workbook): + if "Asset List" in workbook.sheetnames: + return "Asset List" + else: + return "Assets" + def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) - asset_sheet = workbook["Assets"] + sheetname = self.get_sheetname(workbook) + asset_sheet = workbook[sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] rows_data = [] @@ -170,8 +178,10 @@ class DataLoader: # Remove columns that are None survey_list = survey_list.loc[:, survey_list.columns.notnull()] survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))] + # Perform survey list merge - survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name) + if not survey_list.empty: + survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name) # We check if there are CIGA checks ciga_list = pd.DataFrame() @@ -185,9 +195,10 @@ class DataLoader: ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) # Remove columns that are None ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] - ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) # Perform ciga list merge - ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) + if not ciga_list.empty: + ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) + ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) return asset_list, survey_list, ciga_list @@ -208,6 +219,10 @@ class DataLoader: return asset_list + @staticmethod + def correct_ha39_asset_list(asset_list): + return asset_list + @staticmethod def correct_ha6_survey_list(survey_list): @@ -337,6 +352,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha39_survey_list(survey_list): + return survey_list + def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): # Correct the asset list @@ -491,23 +510,10 @@ class DataLoader: ha_name=ha_name, ) - if file_config.get("survey_list"): - # TODO: Delete this - logger.info("Loading survey list for {}".format(ha_name)) - survey_list, matched_lookup = self.load_survey_list( - asset_list=asset_list, - file_path=file_config["survey_list"]["filepath"], - ha_name=ha_name, - sheet_name=file_config["survey_list"]["sheetname"] - ) - else: - survey_list = None - matched_lookup = None - data[ha_name] = { "asset_list": asset_list, "survey_list": survey_list, - "matched_lookup": matched_lookup + "ciga_list": ciga_list } self.data = data @@ -1288,42 +1294,9 @@ def app(): # List all of the data in the folder directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()] - files = { - "ha_1": { - "asset_list": { - "filepath": "local_data/ha_data/HA1/ACCENT GROUP.xlsx", - "sheetname": "Energy data" - } - }, - "ha_6": { - "asset_list": { - "filepath": "etl/eligibility/ha_15_32/HA 6 - ASSET LIST.xlsx", - "sheetname": "HA 6" - }, - "survey_list": { - "filepath": "etl/eligibility/ha_15_32/HA 6 - SURVEY LIST.xlsx", - "sheetname": "HA 6" - } - }, - "ha_14": { - "asset_list": { - "filepath": "etl/eligibility/ha_15_32/HA 14 - ASSET LIST.xlsx", - "sheetname": "HA 14" - } - }, - "ha_39": { - "asset_list": { - "filepath": "etl/eligibility/ha_15_32/HA 39 - ASSET LIST.xlsx", - "sheetname": "Sheet1" - } - }, - "ha_107": { - "asset_list": { - "filepath": "etl/eligibility/ha_15_32/HA 107 - ASSET LIST.xlsx", - "sheetname": "HA 107" - } - } - } + priority_has = ["HA1", "HA6", "HA14", "HA39", "HA107"] + # Filter down the directories to only the priority HAs + directories = [d for d in directories if d.split("/")[2] in priority_has] loader = DataLoader(directories, use_cache) loader.load() From a1b2f9bf5bdd2d059c6327612fe2cb83c5be1687 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 12:42:04 +0000 Subject: [PATCH 002/155] Added ciga list id --- .../ha_15_32/ha_analysis_batch_3.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7bb8b40c..fffc9daf 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -195,6 +195,7 @@ class DataLoader: ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) # Remove columns that are None ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] + survey_list["survey_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(survey_list))] # Perform ciga list merge if not ciga_list.empty: ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) @@ -440,14 +441,14 @@ class DataLoader: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())] if df.shape[0] != 1: postcode_lower = row["Post Code"].lower() - if postcode_lower in missed_postcodes: - matching_lookup.append( - { - "survey_list_row_id": row["survey_list_row_id"], - "asset_list_row_id": None, - } - ) - continue + # if postcode_lower in missed_postcodes: + # matching_lookup.append( + # { + # "survey_list_row_id": row["survey_list_row_id"], + # "asset_list_row_id": None, + # } + # ) + # continue print(row["Street / Block Name"]) print(house_number) @@ -456,13 +457,18 @@ class DataLoader: matching_lookup.append( { - "survey_list_row_id": row["survey_list_row_id"], + "ciga_list_row_id": row["ciga_list_row_id"], "asset_list_row_id": df["asset_list_row_id"].values[0], } ) matching_lookup = pd.DataFrame(matching_lookup) + # Merge onto the ciga list + ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id") + + return ciga_list + @staticmethod def identify_built_form_ha6(property_string): """ From d3bff08df8a4ce0d786acc10f9ab605abc938131 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 12:53:01 +0000 Subject: [PATCH 003/155] debugging survey matching for ha14 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index fffc9daf..d27bf8e8 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -221,7 +221,7 @@ class DataLoader: return asset_list @staticmethod - def correct_ha39_asset_list(asset_list): + def correct_ha14_asset_list(asset_list): return asset_list @staticmethod @@ -354,7 +354,15 @@ class DataLoader: return survey_list @staticmethod - def correct_ha39_survey_list(survey_list): + def correct_ha14_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Godfrey Road", "Godfrey Drive" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Oiliver Road", "Oliver Road" + ) + return survey_list def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): @@ -389,7 +397,7 @@ class DataLoader: if df.shape[0] != 1: df = df[df["HouseNo"] == str(house_number)] if df.shape[0] != 1: - df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())] + df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] if df.shape[0] != 1: postcode_lower = row["Post Code"].lower() if postcode_lower in missed_postcodes: From c6daf520467b0c994a67f7746b51450f36b6bea7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 16:00:23 +0000 Subject: [PATCH 004/155] Trying to handle streetname extraction and edge case in ciga matching --- .../ha_15_32/ha_analysis_batch_3.py | 192 +++++++++++++----- 1 file changed, 143 insertions(+), 49 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d27bf8e8..cb4b9885 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,4 +1,5 @@ import os +import re import openpyxl from pathlib import Path import msgpack @@ -36,6 +37,10 @@ class DataLoader: } } + UNMATCHED_CIGA = { + "HA14": 6 + } + def __init__(self, directories, use_cache): self.directories = directories self.use_cache = use_cache @@ -101,6 +106,9 @@ class DataLoader: else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) + # If we have "flat" or valley" as the house number, then the house number is actually in the second column + house_numbers[0] = np.where(house_numbers[0].isin(["flat", "valley"]), house_numbers[1], house_numbers[0]) + # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how # many columns there might be house_numbers = house_numbers.iloc[:, 0:1] @@ -117,7 +125,7 @@ class DataLoader: :return: """ - if ha_name in ["HA6"]: + if ha_name in ["HA6", "HA14"]: split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how @@ -132,16 +140,23 @@ class DataLoader: return ciga_list @staticmethod - def get_sheetname(workbook): + def get_asset_sheetname(workbook): if "Asset List" in workbook.sheetnames: return "Asset List" else: return "Assets" + @staticmethod + def get_ciga_sheetname(workbook): + if "CIGA Checks" in workbook.sheetnames: + return "CIGA Checks" + else: + return "CIGA" + def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) - sheetname = self.get_sheetname(workbook) - asset_sheet = workbook[sheetname] + asset_sheetname = self.get_asset_sheetname(workbook) + asset_sheet = workbook[asset_sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] rows_data = [] @@ -165,41 +180,46 @@ class DataLoader: asset_list = self.append_asset_list_built_form(ha_name=ha_name, asset_list=asset_list) + # We correct the asset list if it needs it + # Correct the asset list + correction_function_name = f"correct_{ha_name.lower()}_asset_list" + if hasattr(self, correction_function_name): + asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list") + asset_list = asset_list_correction_function(asset_list) + # We check if there is a survey list - survey_list = pd.DataFrame() - if "ECO Surveys" in workbook.sheetnames: - survey_sheet = workbook["ECO Surveys"] - survey_rows = [] - for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers - row_data = [cell.value for cell in row] # This will get you the cell values - survey_rows.append(row_data) + survey_sheetname = "ECO Surveys" + survey_sheet = workbook[survey_sheetname] + survey_rows = [] + for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + survey_rows.append(row_data) - survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]]) - # Remove columns that are None - survey_list = survey_list.loc[:, survey_list.columns.notnull()] - survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))] + survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]]) + # Remove columns that are None + survey_list = survey_list.loc[:, survey_list.columns.notnull()] + survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))] - # Perform survey list merge - if not survey_list.empty: - survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name) + # Perform survey list merge + if not survey_list.empty: + survey_list = self.merge_surveys_to_assets(asset_list, survey_list, ha_name) # We check if there are CIGA checks - ciga_list = pd.DataFrame() - if "CIGA Checks" in workbook.sheetnames: - ciga_sheet = workbook["CIGA Checks"] - ciga_rows = [] - for row in ciga_sheet.iter_rows(min_row=2, values_only=False): - row_data = [cell.value for cell in row] # This will get you the cell values - ciga_rows.append(row_data) + ciga_sheetname = self.get_ciga_sheetname(workbook) + ciga_sheet = workbook[ciga_sheetname] + ciga_rows = [] + for row in ciga_sheet.iter_rows(min_row=2, values_only=False): + row_data = [cell.value for cell in row] # This will get you the cell values + ciga_rows.append(row_data) - ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) - # Remove columns that are None - ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] - survey_list["survey_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(survey_list))] - # Perform ciga list merge - if not ciga_list.empty: - ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) - ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) + ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) + # Remove columns that are None + ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] + ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] + # Perform ciga list merge + if not ciga_list.empty: + ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) + ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) return asset_list, survey_list, ciga_list @@ -222,6 +242,21 @@ class DataLoader: @staticmethod def correct_ha14_asset_list(asset_list): + + # For 5 Queens Court, DE72 3NP, the postcode is actually DE72 3QZ + asset_list.loc[ + (asset_list["Address 1"] == "5 Queens Court") & + (asset_list["Postcode"].str.strip() == "DE72 3NP"), + "matching_postcode" + ] = "DE72 3QZ" + + # We then correct the matching_address + asset_list.loc[ + (asset_list["Address 1"] == "5 Queens Court") & + (asset_list["Postcode"].str.strip() == "DE72 3NP"), + "matching_address" + ] = "5 queens court, garfield avenue, draycott, derby, de72 3qz" + return asset_list @staticmethod @@ -363,13 +398,22 @@ class DataLoader: "Oiliver Road", "Oliver Road" ) + # For postodes DE7 4FB, DE7 4EZ, it's actually spelled WINDERMERE AVENUE, not WINDEREMERE AVENUE (without the + # extra e) + survey_list.loc[ + (survey_list["Street / Block Name"] == "WINDEREMERE AVENUE") & + (survey_list["Post Code"].isin(["DE7 4FB", "DE7 4EZ"])), + "Street / Block Name" + ] = "WINDERMERE AVENUE" + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "MACDONALD SQAURE", "MACDONALD SQUARE" + ) + return survey_list def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): - # Correct the asset list - asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list") - asset_list = asset_list_correction_function(asset_list) # Correct the survey list survey_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_survey_list") survey_list = survey_list_correction_function(survey_list) @@ -411,7 +455,7 @@ class DataLoader: print(row["Street / Block Name"]) print(house_number) - print(row["Post Code"].lower()) + print(row["Post Code"]) raise ValueError("Investigate") matching_lookup.append( @@ -428,8 +472,38 @@ class DataLoader: return survey_list + @staticmethod + def extract_streetname(address, house_number=None, postcode=None): + """ + Cleans an address by removing the house number and postcode, and converts everything to lower case. + + :param address: The full address as a string. + :param house_number: The house number to remove, as a string or integer. + :param postcode: The postcode to remove, as a string. + :return: The cleaned address. + """ + # Convert everything to lower case + address = address.lower() + + if house_number is not None: + # Remove the house number + address = re.sub(r'\b{}\b'.format(house_number), '', address, flags=re.IGNORECASE).strip() + + if postcode is not None: + # Remove the postcode + address = re.sub(r'\b{}\b'.format(re.escape(postcode)), '', address, flags=re.IGNORECASE).strip() + + # Get first section before a comma + address = address.split(",")[0] + # Additional cleaning to remove extra spaces and commas left over + address = re.sub(r'\s+', ' ', address) # Replace multiple spaces with a single space + address = re.sub(r'\s*,\s*', ', ', address) # Clean up space around commas + + return address + def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name): matching_lookup = [] + unmatched_addresses = [] for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)): house_number = row["HouseNo"] @@ -442,22 +516,35 @@ class DataLoader: ].copy() df = df[df["HouseNo"] == str(house_number)] + # For ciga, we skip + if df.empty: + if row["Matched Postcode"] == "LE3 3EE": + dew + unmatched_addresses.append( + { + "ciga_list_row_id": row["ciga_list_row_id"], + "HouseNo": house_number, + "Matched Postcode": row["Matched Postcode"] + } + ) + continue # TODO: Might need to consider street name at some point if df.shape[0] != 1: - if df.shape[0] != 1: - df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower())] - if df.shape[0] != 1: - postcode_lower = row["Post Code"].lower() - # if postcode_lower in missed_postcodes: - # matching_lookup.append( - # { - # "survey_list_row_id": row["survey_list_row_id"], - # "asset_list_row_id": None, - # } - # ) - # continue + # We split house number and postcode out of the matched address for ciga + street_name = self.extract_streetname( + address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"] + ) + df = df[df["matching_address"].str.contains(street_name)] + if df.shape[0] != 1: + # The final check we do here is to check for the presence of flat in the address + if "flat" in row["Matched Address"]: + df = df[df["matching_address"].str.contains("flat")] + else: + df = df[df["matching_address"].str.contains("flat") == False] + + if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) print(row["Post Code"].lower()) @@ -470,6 +557,13 @@ class DataLoader: } ) + # We have an acceptable number of ciga failures for each HA + if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]: + raise ValueError(f"Unmatched addresses for {ha_name} is not as expected") + + # In ciga: 35 Valley Drive, Leicester, LE3 3EE + # + matching_lookup = pd.DataFrame(matching_lookup) # Merge onto the ciga list From 75102704cdfeacaac68194c9646e23f208e48baf Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 16:05:31 +0000 Subject: [PATCH 005/155] ciga matching for ha14 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index cb4b9885..1a28500b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -38,7 +38,9 @@ class DataLoader: } UNMATCHED_CIGA = { - "HA14": 6 + # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not + # the asset list + "HA14": 4 } def __init__(self, directories, use_cache): @@ -518,8 +520,6 @@ class DataLoader: df = df[df["HouseNo"] == str(house_number)] # For ciga, we skip if df.empty: - if row["Matched Postcode"] == "LE3 3EE": - dew unmatched_addresses.append( { "ciga_list_row_id": row["ciga_list_row_id"], @@ -528,18 +528,18 @@ class DataLoader: } ) continue - # TODO: Might need to consider street name at some point + if df.shape[0] != 1: # We split house number and postcode out of the matched address for ciga street_name = self.extract_streetname( address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"] ) - df = df[df["matching_address"].str.contains(street_name)] + df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)] if df.shape[0] != 1: # The final check we do here is to check for the presence of flat in the address - if "flat" in row["Matched Address"]: + if "flat" in row["Matched Address"].lower(): df = df[df["matching_address"].str.contains("flat")] else: df = df[df["matching_address"].str.contains("flat") == False] From 32352bbde145c6a0c76f503c766e7fca80c2af99 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 17:46:11 +0000 Subject: [PATCH 006/155] working on survey match for ha107 --- .../ha_15_32/ha_analysis_batch_3.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1a28500b..9e850c0e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -40,7 +40,9 @@ class DataLoader: UNMATCHED_CIGA = { # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not # the asset list - "HA14": 4 + "HA14": 4, + # There's just too many unmatched here - if we identify some homes that + "HA6": 117 } def __init__(self, directories, use_cache): @@ -78,11 +80,11 @@ class DataLoader: elif ha_name == "HA107": # Create matching_address by concatenating House No, Street, Town, District, Postcode asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ - asset_list["Street"].str.lower().str.strip() + ", " + \ - asset_list["Town"].str.lower().str.strip() + ", " + \ - asset_list["District"].str.lower().str.strip() + ", " + \ - asset_list["Postcode"].str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["District"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() else: raise NotImplementedError("implement me") @@ -155,6 +157,13 @@ class DataLoader: else: return "CIGA" + @staticmethod + def get_survey_sheetname(workbook): + if "ECO Surveys" in workbook.sheetnames: + return "ECO Surveys" + else: + return "ECO surveys" + def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) asset_sheetname = self.get_asset_sheetname(workbook) @@ -189,8 +198,13 @@ class DataLoader: asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list") asset_list = asset_list_correction_function(asset_list) + # For HA1, there is an exception in the structure of the data. We don't have any survey or ciga lists, and so + # we can return the asset list now + if ha_name == "HA1": + return asset_list, pd.DataFrame(), pd.DataFrame() + # We check if there is a survey list - survey_sheetname = "ECO Surveys" + survey_sheetname = self.get_survey_sheetname(workbook) survey_sheet = workbook[survey_sheetname] survey_rows = [] for row in survey_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers @@ -217,6 +231,9 @@ class DataLoader: ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) # Remove columns that are None ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] + # Remove rows with missing postcode which happens in a small number of cases + ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] + ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] # Perform ciga list merge if not ciga_list.empty: @@ -414,6 +431,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha107_survey_list(survey_list): + return survey_list + def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): # Correct the survey list @@ -441,7 +462,7 @@ class DataLoader: df = df[df["matching_address"].str.contains(str(house_number))] if df.shape[0] != 1: - df = df[df["HouseNo"] == str(house_number)] + df = df[df["HouseNo"].astype(str) == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] if df.shape[0] != 1: @@ -506,6 +527,7 @@ class DataLoader: def merge_ciga_to_assets(self, asset_list, ciga_list, ha_name): matching_lookup = [] unmatched_addresses = [] + for _, row in tqdm(ciga_list.iterrows(), total=len(ciga_list)): house_number = row["HouseNo"] @@ -528,7 +550,7 @@ class DataLoader: } ) continue - + if df.shape[0] != 1: # We split house number and postcode out of the matched address for ciga @@ -561,9 +583,6 @@ class DataLoader: if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]: raise ValueError(f"Unmatched addresses for {ha_name} is not as expected") - # In ciga: 35 Valley Drive, Leicester, LE3 3EE - # - matching_lookup = pd.DataFrame(matching_lookup) # Merge onto the ciga list @@ -612,7 +631,7 @@ class DataLoader: for filepath in self.directories: ha_name = filepath.split("/")[2] # Load asset list - logger.info("Loading asset list for {}".format(ha_name)) + logger.info("Loading data for {}".format(ha_name)) asset_list, survey_list, ciga_list = self.load_asset_list( filepath=filepath, ha_name=ha_name, From d038d668b8fa8360577ef0f83403e3d4cb6e854e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 17:52:20 +0000 Subject: [PATCH 007/155] ha107 matching 73% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9e850c0e..46581eca 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -433,6 +433,16 @@ class DataLoader: @staticmethod def correct_ha107_survey_list(survey_list): + # Replace Front Street, East Stockham with Front Street, East Stockwith + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Front Street, East Stockham", "Front Street, East Stockwith" + ) + + # Replace "HONEYHOLE L;ANE" with "HONEYHOLES LANE" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "HONEYHOLE L;ANE", "HONEYHOLES LANE" + ) + return survey_list def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): From ccb764d4a968efeaef67a068f1cc21f92dfe7000 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 22 Feb 2024 18:01:24 +0000 Subject: [PATCH 008/155] ha107 matching 74% done --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 46581eca..60ef485a 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -443,6 +443,16 @@ class DataLoader: "HONEYHOLE L;ANE", "HONEYHOLES LANE" ) + # Replace "Croft Lane Cherry Willingham, Lincoln" with "Croft Lane, Cherry Willingham, Lincoln" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Croft Lane Cherry Willingham, Lincoln", "Croft Lane, Cherry Willingham, Lincoln" + ) + + # Replace "Snelland Road Wickenby, Lincoln" with "Snelland Road, Wickenby, Lincoln" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln" + ) + return survey_list def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): From cef20c6e2cf97275146f36f97349f4d0a46d2410 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 23 Feb 2024 12:08:44 +0000 Subject: [PATCH 009/155] completed matching for ha107, added levenstein method --- .../ha_15_32/ha_analysis_batch_3.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 60ef485a..bf3e6d31 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,6 +1,7 @@ import os import re import openpyxl +import Levenshtein from pathlib import Path import msgpack from datetime import datetime @@ -453,6 +454,41 @@ class DataLoader: "Snelland Road Wickenby, Lincoln", "Snelland Road, Wickenby, Lincoln" ) + # Replace Reasby Road Snelland, Lincoln with Reasby Road, Snelland, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Reasby Road Snelland, Lincoln", "Reasby Road, Snelland, Lincoln" + ) + + # Replace Silver Street Bardney, Lincoln with Silver Street, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Silver Street Bardney, Lincoln", "Silver Street, Bardney, Lincoln" + ) + + # Replace Manor Close Bardney, Lincoln with Manor Close, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Manor Close Bardney, Lincoln", "Manor Close, Bardney, Lincoln" + ) + + # Replace Ferry Road Southrey, Lincoln with Ferry Road, Southrey, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Ferry Road Southrey, Lincoln", "Ferry Road, Southrey, Lincoln" + ) + + # Replace Harvey Kent Gardens Bardney, Lincoln with Harvey Kent Gardens, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Harvey Kent Gardens Bardney, Lincoln", "Harvey Kent Gardens, Bardney, Lincoln" + ) + + # Replace Wragby Road Bardney, Lincoln with Wragby Road, Bardney, Lincoln + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Wragby Road Bardney, Lincoln", "Wragby Road, Bardney, Lincoln" + ) + + # Replace SPRINKHILL ROAD with SPINKHILL ROAD + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "SPRINKHILL ROAD", "SPINKHILL ROAD" + ) + return survey_list def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): @@ -481,10 +517,35 @@ class DataLoader: ].copy() df = df[df["matching_address"].str.contains(str(house_number))] + + if df.empty: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"]) + raise ValueError("Investigate") + if df.shape[0] != 1: df = df[df["HouseNo"].astype(str) == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] + + full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + row[ + "Town/Area"].lower().strip() + row["Post Code"].lower().strip() + # Remove any spaces from the full key + full_key = full_key.replace(" ", "") + + match_to = df["matching_address"].tolist() + # Strip out punctuation and spaces + match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to] + match_to = [x.replace(" ", "") for x in match_to] + + # Perform matching between full key and match_to + distances = [Levenshtein.distance(full_key, s) for s in match_to] + best_match_index = distances.index(min(distances)) + # We might want to consider a threshold for the distance, however for the momeny, + # we don't consider this for the moment + df = df.iloc[best_match_index:best_match_index + 1] + if df.shape[0] != 1: postcode_lower = row["Post Code"].lower() if postcode_lower in missed_postcodes: @@ -510,6 +571,9 @@ class DataLoader: matching_lookup = pd.DataFrame(matching_lookup) + if matching_lookup.shape[0] != survey_list.shape[0]: + raise ValueError("Mismatch in the number of survey rows and matching lookup rows") + # Merge onto the survey list survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id") From bc0a2b8e37eab7dcfc4130b18b5c3ebe1c0953cc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 23 Feb 2024 12:11:00 +0000 Subject: [PATCH 010/155] debygging location of dropping nulls from ciga list --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bf3e6d31..f1709d6e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -232,12 +232,11 @@ class DataLoader: ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) # Remove columns that are None ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] - # Remove rows with missing postcode which happens in a small number of cases - ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] - - ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] # Perform ciga list merge if not ciga_list.empty: + # Remove rows with missing postcode which happens in a small number of cases + ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] + ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) From 5a451f2f8239aaac05237c93b99c435de83a8652 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 23 Feb 2024 12:20:46 +0000 Subject: [PATCH 011/155] fixed logic for missed postcodes for ha6 --- .../ha_15_32/ha_analysis_batch_3.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index f1709d6e..95ca3901 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -518,6 +518,17 @@ class DataLoader: df = df[df["matching_address"].str.contains(str(house_number))] if df.empty: + + postcode_lower = row["Post Code"].lower() + if postcode_lower in missed_postcodes: + matching_lookup.append( + { + "survey_list_row_id": row["survey_list_row_id"], + "asset_list_row_id": None, + } + ) + continue + print(row["Street / Block Name"]) print(house_number) print(row["Post Code"]) @@ -546,16 +557,6 @@ class DataLoader: df = df.iloc[best_match_index:best_match_index + 1] if df.shape[0] != 1: - postcode_lower = row["Post Code"].lower() - if postcode_lower in missed_postcodes: - matching_lookup.append( - { - "survey_list_row_id": row["survey_list_row_id"], - "asset_list_row_id": None, - } - ) - continue - print(row["Street / Block Name"]) print(house_number) print(row["Post Code"]) From 75183902c193a8c5634b8cbc9c7bf045dd5a0898 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 23 Feb 2024 15:54:28 +0000 Subject: [PATCH 012/155] completed creationg of matching tables --- .../ha_15_32/ha_analysis_batch_3.py | 63 ++++++++++++++----- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 95ca3901..2d95a946 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -43,7 +43,8 @@ class DataLoader: # the asset list "HA14": 4, # There's just too many unmatched here - if we identify some homes that - "HA6": 117 + "HA6": 117, + "HA107": 52 } def __init__(self, directories, use_cache): @@ -130,7 +131,7 @@ class DataLoader: :return: """ - if ha_name in ["HA6", "HA14"]: + if ha_name in ["HA6", "HA14", "HA107"]: split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how @@ -153,8 +154,11 @@ class DataLoader: @staticmethod def get_ciga_sheetname(workbook): + if "CIGA Checks" in workbook.sheetnames: return "CIGA Checks" + elif "CIGA checks" in workbook.sheetnames: + return "CIGA checks" else: return "CIGA" @@ -490,6 +494,22 @@ class DataLoader: return survey_list + @staticmethod + def levenstein_match(matching_string, df): + match_to = df["matching_address"].tolist() + # Strip out punctuation and spaces + match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to] + match_to = [x.replace(" ", "") for x in match_to] + + # Perform matching between full key and match_to + distances = [Levenshtein.distance(matching_string, s) for s in match_to] + best_match_index = distances.index(min(distances)) + # We might want to consider a threshold for the distance, however for the momeny, + # we don't consider this for the moment + df = df.iloc[best_match_index:best_match_index + 1] + + return df + def merge_surveys_to_assets(self, asset_list, survey_list, ha_name): # Correct the survey list @@ -544,17 +564,7 @@ class DataLoader: # Remove any spaces from the full key full_key = full_key.replace(" ", "") - match_to = df["matching_address"].tolist() - # Strip out punctuation and spaces - match_to = [re.sub(r'[^\w\s]', '', x) for x in match_to] - match_to = [x.replace(" ", "") for x in match_to] - - # Perform matching between full key and match_to - distances = [Levenshtein.distance(full_key, s) for s in match_to] - best_match_index = distances.index(min(distances)) - # We might want to consider a threshold for the distance, however for the momeny, - # we don't consider this for the moment - df = df.iloc[best_match_index:best_match_index + 1] + df = self.levenstein_match(full_key, df) if df.shape[0] != 1: print(row["Street / Block Name"]) @@ -623,7 +633,7 @@ class DataLoader: asset_list["matching_address"].str.contains(row["Matched Postcode"].lower().strip()) ].copy() - df = df[df["HouseNo"] == str(house_number)] + df = df[df["HouseNo"].astype(str) == str(house_number)] # For ciga, we skip if df.empty: unmatched_addresses.append( @@ -641,7 +651,9 @@ class DataLoader: street_name = self.extract_streetname( address=row["Matched Address"], house_number=house_number, postcode=row["Matched Postcode"] ) - df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)] + # We check if any of the rows contains the street name and if they do, filter + if any(df["matching_address"].str.replace(",", "").str.contains(street_name)): + df = df[df["matching_address"].str.replace(",", "").str.contains(street_name)] if df.shape[0] != 1: # The final check we do here is to check for the presence of flat in the address @@ -650,6 +662,13 @@ class DataLoader: else: df = df[df["matching_address"].str.contains("flat") == False] + if df.shape[0] != 1: + full_key = str(row["HouseNo"]).lower().strip() + row["Matched Address"].lower().strip() + row[ + "Matched Postcode"].lower().strip() + # Remove any spaces from the full key + full_key = full_key.replace(" ", "") + df = self.levenstein_match(full_key, df) + if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) @@ -737,6 +756,19 @@ class DataLoader: s3_file_name="ha-analysis/batch3-inputs.pickle", ) + def ha_facts_and_figures(self): + """ + This function will return a dictionary of facts and figures for each HA + :return: + """ + ha_facts_and_figures = [] + for ha_name, data_assets in self.data.items(): + asset_list = data_assets["asset_list"] + survey_list = data_assets["survey_list"] + ciga_list = data_assets["ciga_list"] + + return ha_facts_and_figures + def get_epc_data( loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True @@ -1511,6 +1543,7 @@ def app(): loader = DataLoader(directories, use_cache) loader.load() + loader.ha_facts_and_figures() # TODO: We probably need to make sure that we have all of the columns that we need From 6693ab4ca6e12a6b9da112e8c8a3d48b1fe6ad87 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 23 Feb 2024 17:13:18 +0000 Subject: [PATCH 013/155] Added in read of december figures --- .../ha_15_32/ha_analysis_batch_3.py | 55 +++++++++++++++++-- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 2d95a946..dbe12e92 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -47,11 +47,13 @@ class DataLoader: "HA107": 52 } - def __init__(self, directories, use_cache): + def __init__(self, directories, december_figures_filepath, use_cache): self.directories = directories self.use_cache = use_cache + self.december_figures_filepath = december_figures_filepath self.data = {} + self.december_figures = None def create_asset_list_matching_address(self, ha_name, asset_list): @@ -730,6 +732,11 @@ class DataLoader: ) return + # Get the december figures, which is just a csv + self.december_figures = pd.read_csv(self.december_figures_filepath) + # Remove the spaces in HA Name + self.december_figures["HA Name"] = december_figures["HA Name"].str.replace(" ", "") + data = {} for filepath in self.directories: ha_name = filepath.split("/")[2] @@ -763,9 +770,43 @@ class DataLoader: """ ha_facts_and_figures = [] for ha_name, data_assets in self.data.items(): - asset_list = data_assets["asset_list"] - survey_list = data_assets["survey_list"] - ciga_list = data_assets["ciga_list"] + asset_list = data_assets["asset_list"].copy() + survey_list = data_assets["survey_list"].copy() + ciga_list = data_assets["ciga_list"].copy() + + asset_list["ECO Eligibility"].value_counts() + + # We merge on ciga and update the status to reflect if it has failed ciga or not + # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA + # check + asset_list = asset_list.merge( + ciga_list[["asset_list_row_id", "Guarantee"]], + how='left', + on="asset_list_row_id" + ) + + asset_list["ECO Eligibility"].value_counts() + + asset_list["ECO Eligibility"] = np.where( + ( + asset_list["ECO Eligibility"].str.contains("(Subject to CIGA)", regex=False) & + (asset_list["Guarantee"] == "Yes") + ), + "Failed CIGA", + asset_list["ECO Eligibility"] + ) + + # We replace any remaining "Subject to CIGA" with pass Ciga + asset_list["ECO Eligibility"] = np.where( + asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False), + "Pass CIGA", + asset_list["ECO Eligibility"] + ) + + asset_list = asset_list.drop(columns=["Guarantee"]) + + # Update the asset list with the categorisations + self.data[ha_name]["asset_list"] = asset_list return ha_facts_and_figures @@ -1532,16 +1573,18 @@ def app(): :return: """ - use_cache = False + use_cache = True # List all of the data in the folder directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()] + # Grab the December HA figures filepath + december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" priority_has = ["HA1", "HA6", "HA14", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] - loader = DataLoader(directories, use_cache) + loader = DataLoader(directories, december_figures_filepath, use_cache) loader.load() loader.ha_facts_and_figures() From 8b48dbac9e5e9f25e3c738c1322b1f3a9fbb11db Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 13:37:50 +0000 Subject: [PATCH 014/155] working on eco eligibility code --- .../ha_15_32/ha_analysis_batch_3.py | 153 ++++++++++++++---- 1 file changed, 122 insertions(+), 31 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index dbe12e92..fdc00876 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -725,6 +725,13 @@ class DataLoader: def load(self): + # Get the december figures, which is just a csv + self.december_figures = pd.read_csv(self.december_figures_filepath) + # Remove the spaces in HA Name + self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "") + self.december_figures["ECO4"] = self.december_figures["ECO4"].astype("Int64") + self.december_figures["GBIS"] = self.december_figures["GBIS"].astype("Int64") + if self.use_cache: self.data = read_pickle_from_s3( bucket_name="retrofit-datalake-dev", @@ -732,11 +739,6 @@ class DataLoader: ) return - # Get the december figures, which is just a csv - self.december_figures = pd.read_csv(self.december_figures_filepath) - # Remove the spaces in HA Name - self.december_figures["HA Name"] = december_figures["HA Name"].str.replace(" ", "") - data = {} for filepath in self.directories: ha_name = filepath.split("/")[2] @@ -768,46 +770,135 @@ class DataLoader: This function will return a dictionary of facts and figures for each HA :return: """ + + scheme_map = { + "ECO4": "ECO4", + "AFFORDABLE WARMTH": "ECO4", + } + + eco_eligibility_map = { + "not eligble": "not eligible" + } + ha_facts_and_figures = [] for ha_name, data_assets in self.data.items(): asset_list = data_assets["asset_list"].copy() survey_list = data_assets["survey_list"].copy() ciga_list = data_assets["ciga_list"].copy() - asset_list["ECO Eligibility"].value_counts() + # Change the column name if it's ECO eligibility + asset_list = asset_list.rename(columns={"ECO eligibility": "ECO Eligibility"}) + # Remove surplus whitespace from the ECO Eligibility column + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip() + # Push to lower case + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower() + # Remap + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].map(eco_eligibility_map) - # We merge on ciga and update the status to reflect if it has failed ciga or not - # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA - # check - asset_list = asset_list.merge( - ciga_list[["asset_list_row_id", "Guarantee"]], - how='left', - on="asset_list_row_id" - ) + if not ciga_list.empty: + # We merge on ciga and update the status to reflect if it has failed ciga or not + # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA + # check + asset_list = asset_list.merge( + ciga_list[["asset_list_row_id", "Guarantee"]], + how='left', + on="asset_list_row_id" + ) - asset_list["ECO Eligibility"].value_counts() + asset_list["ECO Eligibility"].value_counts() - asset_list["ECO Eligibility"] = np.where( - ( - asset_list["ECO Eligibility"].str.contains("(Subject to CIGA)", regex=False) & - (asset_list["Guarantee"] == "Yes") - ), - "Failed CIGA", - asset_list["ECO Eligibility"] - ) + asset_list["ECO Eligibility"] = np.where( + ( + asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) & + (asset_list["Guarantee"] == "Yes") + ), + "failed ciga", + asset_list["ECO Eligibility"] + ) - # We replace any remaining "Subject to CIGA" with pass Ciga - asset_list["ECO Eligibility"] = np.where( - asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False), - "Pass CIGA", - asset_list["ECO Eligibility"] - ) + # We replace any remaining "Subject to CIGA" with pass Ciga + asset_list["ECO Eligibility"] = np.where( + asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False), + "eco4 - passed ciga", + asset_list["ECO Eligibility"] + ) - asset_list = asset_list.drop(columns=["Guarantee"]) + asset_list = asset_list.drop(columns=["Guarantee"]) - # Update the asset list with the categorisations + # Update the asset list with the categorisations and rename changes self.data[ha_name]["asset_list"] = asset_list + # Report on sales + sales_report = {} + if not survey_list.empty: + scheme_column = survey_list.columns[0] + # We clean up the survey list installation or cancelled + survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() + # Remove all punctuation + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( + r'[^\w\s]', '', regex=True + ) + # Remove double spaces + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( + r'\s+', ' ', regex=True + ) + # Remove trailing spaces + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() + + # Remap the values in the scheme column + survey_list[scheme_column] = survey_list[scheme_column].map(scheme_map) + + survey_list["installation_status"] = None + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), + "installed", + survey_list["installation_status"] + ) + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), + "cancelled", + survey_list["installation_status"] + ) + # Find partial installations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), + "partially installed", + survey_list["installation_status"] + ) + # Find partial cancellations + # TODO: We might have more indications of partial cancellations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), + "partially cancelled", + survey_list["installation_status"] + ) + + # Finally, for other cases, we set the status to "in progress" + survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") + + # We concatenate the scheme name with the installation status + survey_list["installation_status"] = ( + survey_list[scheme_column] + " - " + survey_list["installation_status"] + ) + + # We get the sales + sales_report = survey_list["installation_status"].value_counts().to_dict() + + ha_facts_and_figures.append( + { + "HA Name": ha_name, + **asset_list["ECO Eligibility"].value_counts().to_dict(), + **sales_report + } + ) + + ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures) + ha_facts_and_figures = ha_facts_and_figures.drop( + columns=["not eligible"] + ) + + ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name") + return ha_facts_and_figures From ae2cc3fab57687bdc83d4aef4d60c23bd3a3b5e8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 14:14:19 +0000 Subject: [PATCH 015/155] working on ha facts and figures --- .../ha_15_32/ha_analysis_batch_3.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index fdc00876..d75a9f34 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -42,7 +42,7 @@ class DataLoader: # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not # the asset list "HA14": 4, - # There's just too many unmatched here - if we identify some homes that + # There's just too many unmatched here "HA6": 117, "HA107": 52 } @@ -786,6 +786,8 @@ class DataLoader: survey_list = data_assets["survey_list"].copy() ciga_list = data_assets["ciga_list"].copy() + asset_list_starting_size = asset_list.shape[0] + # Change the column name if it's ECO eligibility asset_list = asset_list.rename(columns={"ECO eligibility": "ECO Eligibility"}) # Remove surplus whitespace from the ECO Eligibility column @@ -793,19 +795,17 @@ class DataLoader: # Push to lower case asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower() # Remap - asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].map(eco_eligibility_map) + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].replace(eco_eligibility_map) if not ciga_list.empty: # We merge on ciga and update the status to reflect if it has failed ciga or not # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA # check - asset_list = asset_list.merge( - ciga_list[["asset_list_row_id", "Guarantee"]], - how='left', - on="asset_list_row_id" - ) - asset_list["ECO Eligibility"].value_counts() + ciga_list_to_merge = ciga_list[["asset_list_row_id", "Guarantee"]].copy() + ciga_list_to_merge = ciga_list_to_merge[~pd.isnull(ciga_list_to_merge["asset_list_row_id"])] + + asset_list = asset_list.merge(ciga_list_to_merge, how='left', on="asset_list_row_id") asset_list["ECO Eligibility"] = np.where( ( @@ -818,7 +818,10 @@ class DataLoader: # We replace any remaining "Subject to CIGA" with pass Ciga asset_list["ECO Eligibility"] = np.where( - asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False), + ( + asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) & + (asset_list["Guarantee"] == "No") + ), "eco4 - passed ciga", asset_list["ECO Eligibility"] ) @@ -826,6 +829,8 @@ class DataLoader: asset_list = asset_list.drop(columns=["Guarantee"]) # Update the asset list with the categorisations and rename changes + if asset_list.shape[0] != asset_list_starting_size: + raise ValueError("The asset list has changed in size") self.data[ha_name]["asset_list"] = asset_list # Report on sales @@ -846,7 +851,7 @@ class DataLoader: survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() # Remap the values in the scheme column - survey_list[scheme_column] = survey_list[scheme_column].map(scheme_map) + survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) survey_list["installation_status"] = None survey_list["installation_status"] = np.where( From 8ef0198606486cf3eee9abf84723181ef221ea6b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 16:22:50 +0000 Subject: [PATCH 016/155] handling deduping ciga match --- .../ha_15_32/ha_analysis_batch_3.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d75a9f34..6ffe50e3 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -41,7 +41,7 @@ class DataLoader: UNMATCHED_CIGA = { # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not # the asset list - "HA14": 4, + "HA14": 3, # There's just too many unmatched here "HA6": 117, "HA107": 52 @@ -147,6 +147,17 @@ class DataLoader: return ciga_list + @staticmethod + def dedupe_ciga_list(ciga_list): + ciga_list["unique_key"] = ciga_list["Matched Address"] + ciga_list["Matched Postcode"] + # Remove spaces from the unique key + ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(" ", "") + # Remove punctuation from the unique key + ciga_list["unique_key"] = ciga_list["unique_key"].str.replace(r'[^\w\s]', '') + # Drop duplicated keys + ciga_list = ciga_list[~ciga_list["unique_key"].duplicated()] + return ciga_list + @staticmethod def get_asset_sheetname(workbook): if "Asset List" in workbook.sheetnames: @@ -244,6 +255,7 @@ class DataLoader: ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) + ciga_list = self.dedupe_ciga_list(ciga_list) ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) return asset_list, survey_list, ciga_list @@ -686,10 +698,15 @@ class DataLoader: # We have an acceptable number of ciga failures for each HA if len(unmatched_addresses) != self.UNMATCHED_CIGA[ha_name]: - raise ValueError(f"Unmatched addresses for {ha_name} is not as expected") + raise ValueError( + f"Unmatched addresses for {ha_name} is not as expected, got {len(unmatched_addresses)} unmatched") matching_lookup = pd.DataFrame(matching_lookup) + # Check dupes as this will cause problems later on + if matching_lookup["asset_list_row_id"].duplicated().any(): + raise ValueError("Duplicated asset list row ids") + # Merge onto the ciga list ciga_list = ciga_list.merge(matching_lookup, how='left', on="ciga_list_row_id") From 78f5226ad7a5ec81e4da1ca6f9e78565146e0457 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 16:38:14 +0000 Subject: [PATCH 017/155] put together ha facts and figures --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 6ffe50e3..bd4d5128 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -44,7 +44,7 @@ class DataLoader: "HA14": 3, # There's just too many unmatched here "HA6": 117, - "HA107": 52 + "HA107": 51 } def __init__(self, directories, december_figures_filepath, use_cache): @@ -54,6 +54,7 @@ class DataLoader: self.data = {} self.december_figures = None + self.ha_facts_and_figures = None def create_asset_list_matching_address(self, ha_name, asset_list): @@ -794,7 +795,8 @@ class DataLoader: } eco_eligibility_map = { - "not eligble": "not eligible" + "not eligble": "not eligible", + "eco 4(subject to ciga)": "eco4 (subject to ciga)", } ha_facts_and_figures = [] @@ -919,9 +921,15 @@ class DataLoader: columns=["not eligible"] ) - ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name") + ha_facts_and_figures = ha_facts_and_figures.fillna(0) + # Make all columns apart from HA NAme integers + for col in ha_facts_and_figures.columns[1:]: + ha_facts_and_figures[col] = ha_facts_and_figures[col].astype(int) - return ha_facts_and_figures + ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name") + ha_facts_and_figures = ha_facts_and_figures.fillna(0) + + self.ha_facts_and_figures = ha_facts_and_figures def get_epc_data( From c18740eebda1a2b307a91e215f78fdeafcad8402 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 18:44:11 +0000 Subject: [PATCH 018/155] updating eligibility detection --- etl/eligibility/Eligibility.py | 57 +-- .../ha_15_32/ha_analysis_batch_3.py | 402 ++++++++++-------- 2 files changed, 249 insertions(+), 210 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 906ff594..b09d2df5 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -340,7 +340,6 @@ class Eligibility: # Check if the property is suitable for cavity wall self.cavity_insulation() - self.loft_insulation() self.gbis_warmfront = (self.cavity["suitability"]) and ( int(self.epc["current-energy-efficiency"]) <= 68 @@ -384,43 +383,49 @@ class Eligibility: if current_sap >= 69: self.eco4_warmfront = { "eligible": False, - "message": "sap too high", + "message": "SAP too high", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } return - if post_retrofit_sap is None: - - if current_sap >= 55: - message = "Possibly eligible but property currently EPC D" - else: - message = "subject to post retrofit sap" if is_eligible else "not eligible" - - # Update the message to flag properties that failed just because of a full cavity. - # We need to double check that the wall is a cavity, that the loft is suitable and that the - # sap is within reason - # We can then estimate the age of the cavity fill - if not is_eligible and (current_sap < 69) and self.loft["suitability"] and self.walls["is_cavity_wall"]: - message = "Failed due to full cavity - check cavity age" - + if not is_eligible and current_sap >= 55: self.eco4_warmfront = { - "eligible": is_eligible, - "message": message, + "eligible": False, + "message": "failed fabric and SAP check", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } return - is_eligible = is_eligible & (post_retrofit_sap >= 69) + if not is_eligible and current_sap < 55: + self.eco4_warmfront = { + "eligible": False, + "message": "failed fabric check", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return - self.eco4_warmfront = { - "eligible": is_eligible, - "message": None, - "cavity_type": self.cavity["type"], - "loft_type": self.loft["thickness_classification"] - } - return + if is_eligible and current_sap >= 55: + self.eco4_warmfront = { + "eligible": True, + "message": "Meets fabric, fails SAP check", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + + if is_eligible and current_sap < 55: + self.eco4_warmfront = { + "eligible": True, + "message": "Meets fabric and SAP check", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + + raise ValueError("Implement me") def check_gbis(self): diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bd4d5128..5dd9b6e1 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -25,6 +25,84 @@ DATA_FOLDER = Path(__file__).parent / "local_data" / "ha_data" logger = setup_logger() load_dotenv(ENV_FILE) +PROPERTY_TYPE_LOOKUP = { + "HA1": { + "built_form": { + 'Mid Terrace': 'Mid-Terrace', + 'Semi-Detached': 'Semi-Detached', + 'End Terrace': 'End-Terrace', + 'Detached': 'Detached', + 'Enclosed Mid': 'Mid-Terrace', + 'Detached Local Connect': 'Detached', + } + }, + "HA6": { + "property_type": { + 'HOUSE': "House", + 'GROUND FLOOR FLAT': "Flat", + 'UPPER FLOOR FLAT': "Flat", + 'MAISONETTE': "Maisonette", + 'BUNGALOW': "Bungalow", + 'WARDEN BUNGALOW': "Bungalow", + 'WARDEN FLAT': "Flat", + 'EXTRACARE SCHEME': "Flat", + } + }, + "HA14": { + "property_type": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + } + }, + "HA39": { + "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, + "1st floor flat": {"property_type": "Flat", "built_form": None}, + "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"}, + "Ground floor flat": {"property_type": "Flat", "built_form": None}, + "End terrace house": {"property_type": "House", "built_form": "End-Terrace"}, + "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"}, + "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"}, + "2nd floor flat": {"property_type": "Flat", "built_form": None}, + "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"}, + "3rd floor flat": {"property_type": "Flat", "built_form": None}, + "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"}, + "Maisonette": {"property_type": "Maisonette", "built_form": None}, + "Detached house": {"property_type": "House", "built_form": "Detached"}, + "Lower ground floor flat": {"property_type": "Flat", "built_form": None}, + "Dormer bungalow": {"property_type": "Bungalow", "built_form": None}, + "Basement flat": {"property_type": "Flat", "built_form": None}, + "Cluster House": {"property_type": "House", "built_form": "Detached"}, + "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None}, + "Ground floor flat with study": {"property_type": "Flat", "built_form": None}, + "4th floor flat": {"property_type": "Flat", "built_form": None}, + "1st floor flat with study room": {"property_type": "Flat", "built_form": None}, + "2nd floor flat with study": {"property_type": "Flat", "built_form": None}, + }, + "HA107": { + "property_type": { + "HOUSE": "House", + "BUNGALOW": "Bungalow", + "GRD FLOOR FLAT": "Flat", + "FIRST FLOOR FLAT": "Flat", + "SHELTERED BUNGALOW": "Bungalow", + "MAISONETTE": "Maisonette", + "SECOND FLOOR FLAT": "Flat", + "SHELTERED FIRST FLR": "Flat", + "SHELTERED GROUND FLR": "Flat", + "GRD FLOOR BED SIT": "House" + }, + "built_form": { + "Semi Detached": "Semi-Detached", + "Mid Terrace": "Mid-Terrace", + "End Terrace": "End-Terrace", + "Detached": "Detached", + "Detatched": "Detached", + } + } +} + class DataLoader: COLUMN_CONFIG = { @@ -54,7 +132,7 @@ class DataLoader: self.data = {} self.december_figures = None - self.ha_facts_and_figures = None + self.facts_and_figures = None def create_asset_list_matching_address(self, ha_name, asset_list): @@ -929,7 +1007,77 @@ class DataLoader: ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name") ha_facts_and_figures = ha_facts_and_figures.fillna(0) - self.ha_facts_and_figures = ha_facts_and_figures + self.facts_and_figures = ha_facts_and_figures + + +def get_property_type_and_built_form(property_meta, ha_name): + if ha_name == "HA1": + property_type = property_meta["Asset Type"] + # We correct a small error + if property_type == "a": + property_type = "House" + + # Remap bedsits to flats + if property_type in ["Bedsit", "Room"]: + property_type = "Flat" + + built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"], None) + elif ha_name == "HA6": + property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]] + built_form = property_meta["built_form"] + elif ha_name == "HA14": + if property_meta["Asset Type Description"] == "Block - Repair": + # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address + if "room" in property_meta["Address 1"].lower(): + property_type = "House" + else: + property_type = "Flat" + + else: + property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][ + property_meta["Asset Type Description"] + ] + + built_form = None + elif ha_name == "HA39": + + property_type_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {}) + property_type = property_type_config.get("property_type", None) + built_form = property_type_config.get("built_form", None) + + if property_type is None: + # We check for the presence of room or flat + if "flat" in property_meta["matching_address"]: + property_type = "Flat" + else: + property_type = "House" + elif ha_name == "HA107": + + dwelling_style = property_meta["Dwelling Style"] + if isinstance(dwelling_style, str): + dwelling_style = dwelling_style.strip() + + property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["DwellingType"]) + built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(dwelling_style, None) + + if property_type is None: + if built_form in ["Semi-Detached", "Mid-Terrace", "End-Terrace", "Detached"]: + property_type = "House" + + if "flat" in property_meta["Wall Construction"].lower(): + property_type = "Flat" + + if (property_meta["DwellingType"] == "UNKNOWN") & (property_meta["Dwelling Style"] == 0): + # Hand a few specific cases + property_type = "Bungalow" + + if property_meta["Street"] == "School View": + property_type = "Bungalow" + + else: + raise NotImplementedError("Implement me") + + return property_type, built_form def get_epc_data( @@ -938,84 +1086,6 @@ def get_epc_data( if not loader.data: raise ValueError("Data not found - please run loader.load() first") - property_type_lookup = { - "ha_1": { - "built_form": { - 'Mid Terrace': 'Mid-Terrace', - 'Semi-Detached': 'Semi-Detached', - 'End Terrace': 'End-Terrace', - 'Detached': 'Detached', - 'Enclosed Mid': 'Mid-Terrace', - 'Detached Local Connect': 'Detached', - } - }, - "ha_6": { - "property_type": { - 'HOUSE': "House", - 'GROUND FLOOR FLAT': "Flat", - 'UPPER FLOOR FLAT': "Flat", - 'MAISONETTE': "Maisonette", - 'BUNGALOW': "Bungalow", - 'WARDEN BUNGALOW': "Bungalow", - 'WARDEN FLAT': "Flat", - 'EXTRACARE SCHEME': "Flat", - } - }, - "ha_14": { - "property_type": { - "House": "House", - "Flat": "Flat", - "Bungalow": "Bungalow", - "Maisonette": "Maisonette", - } - }, - "ha_39": { - "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, - "1st floor flat": {"property_type": "Flat", "built_form": None}, - "Mid terrace house": {"property_type": "House", "built_form": "Mid-Terrace"}, - "Ground floor flat": {"property_type": "Flat", "built_form": None}, - "End terrace house": {"property_type": "House", "built_form": "End-Terrace"}, - "Semi bungalow": {"property_type": "Bungalow", "built_form": "Semi-Detached"}, - "End terrace bungalow": {"property_type": "Bungalow", "built_form": "End-Terrace"}, - "2nd floor flat": {"property_type": "Flat", "built_form": None}, - "Mid terrace bungalow": {"property_type": "Bungalow", "built_form": "Mid-Terrace"}, - "3rd floor flat": {"property_type": "Flat", "built_form": None}, - "Detached bungalow": {"property_type": "Bungalow", "built_form": "Detached"}, - "Maisonette": {"property_type": "Maisonette", "built_form": None}, - "Detached house": {"property_type": "House", "built_form": "Detached"}, - "Lower ground floor flat": {"property_type": "Flat", "built_form": None}, - "Dormer bungalow": {"property_type": "Bungalow", "built_form": None}, - "Basement flat": {"property_type": "Flat", "built_form": None}, - "Cluster House": {"property_type": "House", "built_form": "Detached"}, - "2nd/3rd floor duplex flat": {"property_type": "Flat", "built_form": None}, - "Ground floor flat with study": {"property_type": "Flat", "built_form": None}, - "4th floor flat": {"property_type": "Flat", "built_form": None}, - "1st floor flat with study room": {"property_type": "Flat", "built_form": None}, - "2nd floor flat with study": {"property_type": "Flat", "built_form": None}, - }, - "ha_107": { - "property_type": { - "HOUSE": "House", - "BUNGALOW": "Bungalow", - "GRD FLOOR FLAT": "Flat", - "FIRST FLOOR FLAT": "Flat", - "SHELTERED BUNGALOW": "Bungalow", - "MAISONETTE": "Maisonette", - "SECOND FLOOR FLAT": "Flat", - "SHELTERED FIRST FLR": "Flat", - "SHELTERED GROUND FLR": "Flat", - "GRD FLOOR BED SIT": "House" - }, - "built_form": { - "Semi Detached": "Semi-Detached", - "Mid Terrace": "Mid-Terrace", - "End Terrace": "End-Terrace", - "Detached": "Detached", - "Detatched": "Detached", - } - } - } - outputs = {} for ha_name, data_assets in loader.data.items(): @@ -1049,77 +1119,15 @@ def get_epc_data( if property_meta["matching_postcode"] is None: continue - if ha_name == "ha_1": - property_type = property_meta["Asset Type"] - # We correct a small error - if property_type == "a": - property_type = "House" - - # Remap bedsits to flats - if property_type in ["Bedsit", "Room"]: - property_type = "Flat" - - built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None) - elif ha_name == "ha_6": - property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]] - built_form = property_meta["built_form"] - elif ha_name == "ha_14": - if property_meta["Asset Type Description"] == "Block - Repair": - # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address - if "room" in property_meta["Address 1"].lower(): - property_type = "House" - else: - property_type = "Flat" - - else: - property_type = property_type_lookup[ha_name]["property_type"][ - property_meta["Asset Type Description"] - ] - - built_form = None - elif ha_name == "ha_39": - - property_type_config = property_type_lookup[ha_name].get(property_meta["ConstructionStyle"], {}) - property_type = property_type_config.get("property_type", None) - built_form = property_type_config.get("built_form", None) - - if property_type is None: - # We check for the presence of room or flat - if "flat" in property_meta["matching_address"]: - property_type = "Flat" - else: - property_type = "House" - elif ha_name == "ha_107": - - dwelling_style = property_meta["Dwelling Style"] - if isinstance(dwelling_style, str): - dwelling_style = dwelling_style.strip() - - property_type = property_type_lookup[ha_name]["property_type"].get(property_meta["DwellingType"]) - built_form = property_type_lookup[ha_name]["built_form"].get(dwelling_style, None) - - if property_type is None: - if built_form in ["Semi-Detached", "Mid-Terrace", "End-Terrace", "Detached"]: - property_type = "House" - - if "flat" in property_meta["Wall Construction"].lower(): - property_type = "Flat" - - if (property_meta["DwellingType"] == "UNKNOWN") & (property_meta["Dwelling Style"] == 0): - # Hand a few specific cases - property_type = "Bungalow" - - if property_meta["Street"] == "School View": - property_type = "Bungalow" - - else: - raise NotImplementedError("Implement me") + property_type, built_form = get_property_type_and_built_form( + property_meta=property_meta, ha_name=ha_name + ) searcher = SearchEpc( address1=str(property_meta["HouseNo"]), postcode=property_meta["matching_postcode"], auth_token=EPC_AUTH_TOKEN, - os_api_key=None, + os_api_key="", full_address=property_meta["matching_address"] ) searcher.ordnance_survey_client.property_type = property_type @@ -1150,9 +1158,21 @@ def get_epc_data( eligibility.check_gbis_warmfront() eligibility.check_eco4_warmfront() - if (not eligibility.eco4_warmfront["eligible"]) and ( - not eligibility.gbis_warmfront - ) and consider_penultimate_epc: + # We check the conditions for checking the penultimate epc + identified_for_gbis = property_meta["ECO Eligibility"] == "gbis" + identified_for_eco4 = property_meta["ECO Eligibility"] in ["eco4"] + + # condition 1 - identified for gbis and not eligible + condition_1 = ( + identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront["eligible"] + ) & consider_penultimate_epc + + # condition 2 - identified for eco4 and not eligible + condition_2 = ( + identified_for_eco4 and not eligibility.eco4_warmfront["eligible"] + ) & consider_penultimate_epc + + if identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront["eligible"]: # We check the penultimate epc eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) eligibility.check_gbis_warmfront() @@ -1161,6 +1181,10 @@ def get_epc_data( # We don't update just to make data cleaning easier if penultimate_epc.get("estimated") is None: older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] + elif identified_for_eco4 and not eligibility.eco4_warmfront["eligible"]: + + else: + blah # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity # Loft MUST be suitable @@ -1199,6 +1223,7 @@ def get_epc_data( { "row_id": property_meta["asset_list_row_id"], "uprn": eligibility.epc["uprn"], + "is_estimated": searcher.newest_epc.get("estimated") is not None, "property_type": eligibility.epc["property-type"], "gbis_eligible": eligibility.gbis_warmfront, "eco4_eligible": eligibility.eco4_warmfront["eligible"], @@ -1219,7 +1244,6 @@ def get_epc_data( "cavity_age": cavity_age, **eligibility.walls, **eligibility.roof, - "is_estimated": searcher.newest_epc.get("estimated") is not None, "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] } @@ -1687,38 +1711,7 @@ def analyse_ha_data(outputs, loader): writer.sheets[sheet].set_column(i, i, width) -def app(): - """ - This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107. - Only HA 6 has surveys - :return: - """ - - use_cache = True - - # List all of the data in the folder - directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()] - # Grab the December HA figures filepath - december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - - priority_has = ["HA1", "HA6", "HA14", "HA39", "HA107"] - # Filter down the directories to only the priority HAs - directories = [d for d in directories if d.split("/")[2] in priority_has] - - loader = DataLoader(directories, december_figures_filepath, use_cache) - loader.load() - loader.ha_facts_and_figures() - - # TODO: We probably need to make sure that we have all of the columns that we need - - # We load in the additional data required to perform the analysis - - cleaned = read_from_s3( - s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-dev" - ) - cleaned = msgpack.unpackb(cleaned, raw=False) - +def patch_cleaned(cleaned): # Patch to handle the a missing description cleaned["floor-description"].extend( [ @@ -1762,16 +1755,57 @@ def app(): x["another_property_below"] = True x["thermal_transmittance"] = 0 + return cleaned + + +def app(): + """ + This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107. + Only HA 6 has surveys + :return: + """ + + # Determines if we want to use the cached data in s3 + use_cache = True + # Determines if we want to perform the data pull + pull_data = True + + # List all of the data in the folder + directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()] + # Grab the December HA figures filepath + december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" + + priority_has = ["HA1", "HA6", "HA14", "HA39", "HA107"] + # Filter down the directories to only the priority HAs + directories = [d for d in directories if d.split("/")[2] in priority_has] + + loader = DataLoader(directories, december_figures_filepath, use_cache) + loader.load() + loader.ha_facts_and_figures() + + # We load in the additional data required to perform the analysis + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + cleaned = patch_cleaned(cleaned) + cleaning_data = read_dataframe_from_s3_parquet( bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", ) - created_at = datetime.now().isoformat() photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") outputs = get_epc_data( - loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=False + loader=loader, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds, + pull_data=pull_data ) # for ha_name, datasets in outputs.items(): From 807ce14790600dce8a810847f47bc216bcddf6b3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 19:09:19 +0000 Subject: [PATCH 019/155] updating the code to do eligibility --- .../ha_15_32/ha_analysis_batch_3.py | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 5dd9b6e1..3d0964c6 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1164,15 +1164,33 @@ def get_epc_data( # condition 1 - identified for gbis and not eligible condition_1 = ( - identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront["eligible"] - ) & consider_penultimate_epc + identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront[ + "eligible"] + ) & consider_penultimate_epc # condition 2 - identified for eco4 and not eligible - condition_2 = ( - identified_for_eco4 and not eligibility.eco4_warmfront["eligible"] - ) & consider_penultimate_epc + condition_2 = (identified_for_eco4 and not eligibility.eco4_warmfront[ + "eligible"]) & consider_penultimate_epc - if identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront["eligible"]: + # successfully identigied gbis + condition_3 = ( + identified_for_gbis and (eligibility.gbis_warmfront or eligibility.eco4_warmfront["eligible"]) + ) + + # Nothing identified + condition_4 = ( + not identified_for_gbis and not identified_for_eco4 and not eligibility.gbis_warmfront and not + eligibility.eco4_warmfront["eligible"] + ) + + # Not identified but seemingly eligible for eco4 or gbis + condition_5 = ( + not identified_for_gbis and not identified_for_eco4 and ( + eligibility.eco4_warmfront["eligible"] or eligibility.gbis_warmfront + ) + ) + + if condition_1 or condition_2: # We check the penultimate epc eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) eligibility.check_gbis_warmfront() @@ -1181,10 +1199,11 @@ def get_epc_data( # We don't update just to make data cleaning easier if penultimate_epc.get("estimated") is None: older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] - elif identified_for_eco4 and not eligibility.eco4_warmfront["eligible"]: - + elif condition_3 or condition_4 or condition_5: + # If we have successfully identified for gbis, we don't need to check the penultimate epc + pass else: - blah + NotImplementedError("Implement me") # If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity # Loft MUST be suitable @@ -1229,10 +1248,7 @@ def get_epc_data( "eco4_eligible": eligibility.eco4_warmfront["eligible"], "eco4_message": eligibility.eco4_warmfront["message"], "sap": float(eligibility.epc["current-energy-efficiency"]), - "gbis_eligible_future": eligibility.gbis["eligible"], - "gbis_eligible_future_message": eligibility.gbis["message"], - "eco4_eligible_future": eligibility.eco4["eligible"], - "eco4_eligible_future_message": eligibility.eco4["message"], + # Property components "roof": eligibility.roof["clean_description"], "walls": eligibility.walls["clean_description"], From 69dcc73363c43d12076b887707db802384046e07 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 19:18:58 +0000 Subject: [PATCH 020/155] deugging null lodgement-date --- backend/SearchEpc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 4f6fd33d..4a3f371a 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -609,7 +609,11 @@ class SearchEpc: # Insert an estimated lodgement datetime, with a weighted average estimated_epc["lodgement-datetime"] = self.calculate_weighted_lodgement_datetime(epc_data=epc_data) # Extract logement date - estimated_epc["lodgement-date"] = estimated_epc["lodgement-datetime"].strftime("%Y-%m-%d") + # It is possible that there is still no lodgement date, so we need to handle this + if pd.isnull(estimated_epc["lodgement-datetime"]): + estimated_epc["lodgement-date"] = None + else: + estimated_epc["lodgement-date"] = estimated_epc["lodgement-datetime"].strftime("%Y-%m-%d") estimated_epc["postcode"] = self.postcode estimated_epc["uprn"] = self.uprn From b80ffda392e0601f08dd376cfaacba73e733fc9c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 19:29:46 +0000 Subject: [PATCH 021/155] updating eligibility pipeline to factor in ciga --- .../ha_15_32/ha_analysis_batch_3.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3d0964c6..ecbb4e0a 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1159,8 +1159,11 @@ def get_epc_data( eligibility.check_eco4_warmfront() # We check the conditions for checking the penultimate epc - identified_for_gbis = property_meta["ECO Eligibility"] == "gbis" + identified_for_gbis = property_meta["ECO Eligibility"] in ["gbis"] identified_for_eco4 = property_meta["ECO Eligibility"] in ["eco4"] + subject_to_ciga = property_meta["ECO Eligibility"] in [ + "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga" + ] # condition 1 - identified for gbis and not eligible condition_1 = ( @@ -1179,8 +1182,11 @@ def get_epc_data( # Nothing identified condition_4 = ( - not identified_for_gbis and not identified_for_eco4 and not eligibility.gbis_warmfront and not - eligibility.eco4_warmfront["eligible"] + not identified_for_gbis + and not identified_for_eco4 + and not eligibility.gbis_warmfront + and not subject_to_ciga + and not eligibility.eco4_warmfront["eligible"] ) # Not identified but seemingly eligible for eco4 or gbis @@ -1190,6 +1196,10 @@ def get_epc_data( ) ) + condition_6 = ( + subject_to_ciga and not eligibility.eco4_warmfront["eligible"] + ) + if condition_1 or condition_2: # We check the penultimate epc eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned) @@ -1199,8 +1209,7 @@ def get_epc_data( # We don't update just to make data cleaning easier if penultimate_epc.get("estimated") is None: older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]] - elif condition_3 or condition_4 or condition_5: - # If we have successfully identified for gbis, we don't need to check the penultimate epc + elif condition_3 or condition_4 or condition_5 or condition_6: pass else: NotImplementedError("Implement me") From 281c6f626c833a482a199ba120e1b0e8b1869cf1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 23:23:29 +0000 Subject: [PATCH 022/155] working on eligibility --- backend/Property.py | 3 +- etl/eligibility/Eligibility.py | 90 ++++++++-- etl/eligibility/ha_15_32/app.py | 18 +- .../ha_15_32/ha_analysis_batch_3.py | 156 +++++++++--------- 4 files changed, 167 insertions(+), 100 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 4a55e504..f86e33dc 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -147,7 +147,8 @@ class Property: # self.base_difference_record.df def adjust_difference_record_with_recommendations( - self, property_recommendations, + self, + property_recommendations, property_representative_recommendations ): """ diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index b09d2df5..bda34923 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -145,6 +145,7 @@ class Eligibility: "reason": None, "thickness_classification": thickness_classification } + return # Insulation is already thick enough self.loft = { @@ -164,8 +165,10 @@ class Eligibility: """ is_cavity = self.walls["is_cavity_wall"] - is_empty = (not self.walls["is_filled_cavity"]) or ( + is_empty = (not self.walls["is_filled_cavity"]) + is_as_built = ( self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["average", "above average"] + and self.walls["is_assumed"] ) is_partial_filled = "partial" in self.walls["clean_description"].lower() # We look for potentially under performing cavities - anything that is assumed, as built and insulated @@ -175,6 +178,7 @@ class Eligibility: is_unfilled_cavity = is_cavity and (is_empty and not is_partial_filled) is_partial_filled_cavity = is_cavity and is_partial_filled + is_assumed_filled_cavity = is_cavity and is_as_built is_underperforming_cavity = is_cavity and is_underperforming # Check if it has internal or external wall insulation @@ -195,6 +199,13 @@ class Eligibility: } return + if is_assumed_filled_cavity: + self.cavity = { + "suitability": True, + "type": "as built assumed", + } + return + if is_partial_filled_cavity: self.cavity = { "suitability": True, @@ -345,7 +356,7 @@ class Eligibility: int(self.epc["current-energy-efficiency"]) <= 68 ) - def check_eco4_warmfront(self, post_retrofit_sap=None): + def check_eco4_warmfront(self): """ This funciton will check if the property is eligible for funding under the ECO4 scheme @@ -377,49 +388,100 @@ class Eligibility: self.cavity_insulation() self.loft_insulation() - # make sure conditions 2 and 3 are true - is_eligible = self.cavity["suitability"] & self.loft["suitability"] - - if current_sap >= 69: + # Case 1: No conditions meet + if not self.cavity["suitability"] and (self.loft["thickness"] > 100) and current_sap >= 55: self.eco4_warmfront = { "eligible": False, - "message": "SAP too high", + "strict": False, + "message": "All conditions fail", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } return - if not is_eligible and current_sap >= 55: + # Case 2 - perfect match + if (self.cavity["type"] == "empty") and (self.loft["thickness"] <= 100) and (current_sap < 55): self.eco4_warmfront = { - "eligible": False, - "message": "failed fabric and SAP check", + "eligible": True, + "strict": True, + "message": "Perfect suitability", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } return - if not is_eligible and current_sap < 55: + # Case 2.5 - near perfect match - but we would not recommend this using the model + if self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap < 55): + self.eco4_warmfront = { + "eligible": True, + "strict": True, + "message": "Perfect suitability", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + + # Case 3 - cavity is suitable, loft is not, sap is good + if self.cavity["suitability"] and (self.loft["thickness"] > 100) and (current_sap < 55): + self.eco4_warmfront = { + "eligible": True, + "strict": False, + "message": "Meets cavity and sap", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + + # Case 4 - cavity is not suitable, loft is, sap is not - we say this is not elifible + if not self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap < 55): self.eco4_warmfront = { "eligible": False, + "strict": False, "message": "failed fabric check", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } return - if is_eligible and current_sap >= 55: + # Case 5 - cavity and loft suitable, sap too high + if self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap >= 55): self.eco4_warmfront = { "eligible": True, + "strict": False, "message": "Meets fabric, fails SAP check", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } return - if is_eligible and current_sap < 55: + # Case 6 - meets just cavity + if self.cavity["suitability"] and (self.loft["thickness"] > 100) and (current_sap >= 55): self.eco4_warmfront = { "eligible": True, - "message": "Meets fabric and SAP check", + "strict": False, + "message": "Meets just cavity", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + + # Case 7 - fails cavity, loft but meets sap + if not self.cavity["suitability"] and (self.loft["thickness"] > 100) and (current_sap < 55): + self.eco4_warmfront = { + "eligible": False, + "strict": False, + "message": "Fails cavity nd lodt, meets SAP", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + + # Case 8 - fails cavity, meets loft, fails sap + if not self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap >= 55): + self.eco4_warmfront = { + "eligible": False, + "strict": False, + "message": "Fails cavity, meets loft, fails SAP", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index a68bf272..378a0e83 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -387,17 +387,19 @@ def prepare_model_data_row( } simulations = [ - [cavity_simulation], - [loft_simulation] + cavity_simulation, + loft_simulation ] - p.adjust_difference_record_with_recommendations(simulations) + recommendation_record = p.base_difference_record.df.to_dict("records")[0].copy() + scoring_dict = p.create_recommendation_scoring_data( + property_id=p.id, + recommendation_record=recommendation_record, + recommendations=simulations, + primary_recommendation_id=cavity_simulation["recommendation_id"] + ) - # Make sure we definitely have the correct data - cavity_scoring = [x for x in p.recommendations_scoring_data if "cavity" in x["id"]][0] - loft_scoring = [x for x in p.recommendations_scoring_data if "loft" in x["id"]][0] - - return [cavity_scoring, loft_scoring] + return [scoring_dict] def get_ha_32data(ha_data, cleaned, cleaning_data, created_at): diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ecbb4e0a..239fce65 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1114,7 +1114,7 @@ def get_epc_data( results = [] scoring_data = [] nodata = [] - for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): + for index, property_meta in tqdm(eco4.iterrows(), total=len(eco4)): if property_meta["matching_postcode"] is None: continue @@ -1226,10 +1226,6 @@ def get_epc_data( # We check the age of the cavity and if it's particularly old, we flag it cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) - # Full checks - eligibility.check_gbis() - eligibility.check_eco4() - if eligibility.eco4_warmfront["eligible"]: if eligibility.epc["uprn"] == "": eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) @@ -1256,8 +1252,8 @@ def get_epc_data( "gbis_eligible": eligibility.gbis_warmfront, "eco4_eligible": eligibility.eco4_warmfront["eligible"], "eco4_message": eligibility.eco4_warmfront["message"], + "eco4_strict": eligibility.eco4_warmfront["strict"], "sap": float(eligibility.epc["current-energy-efficiency"]), - # Property components "roof": eligibility.roof["clean_description"], "walls": eligibility.walls["clean_description"], @@ -1267,91 +1263,97 @@ def get_epc_data( "date_epc": eligibility.epc["lodgement-date"], "loft_thickness": eligibility.roof["insulation_thickness"], "cavity_age": cavity_age, - **eligibility.walls, - **eligibility.roof, "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] } ) - scoring_df = pd.DataFrame(scoring_data) - scoring_df = scoring_df.drop( - columns=[ - "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", - "carbon_ending" - ] - ) - - model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) - - # scoring_df["is_community"].value_counts() - # scoring_df[scoring_df["is_community"] == "Unknown"] - # property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze() - - all_predictions = model_api.predict_all( - df=scoring_df, - bucket="retrofit-data-dev", - prediction_buckets={ - "sap_change_predictions": "retrofit-sap-predictions-dev", - "heat_demand_predictions": "retrofit-heat-predictions-dev", - "carbon_change_predictions": "retrofit-carbon-predictions-dev" - } - ) - results_df = pd.DataFrame(results) + scoring_df = pd.DataFrame(scoring_data) + results_df["post_install_sap"] = None + results_df["eligibility_classification"] = None - predictions = all_predictions["sap_change_predictions"].copy() + eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"] + z = results_df[results_df["row_id"].isin(eco4["asset_list_row_id"])] + z["walls"].value_counts() + z1 = z[z["walls"] == "Cavity wall, as built, no insulation"] + k = z1[z1["roof"] == "Pitched, 100 mm loft insulation"] + property_meta = asset_list[asset_list["asset_list_row_id"] == k["row_id"].values[0]].squeeze() + z[z["walls"] == "Cavity wall, as built, insulated"]["roof"].value_counts() + z[z["walls"] == "Cavity wall, as built, insulated"]["roof"].value_counts() - predictions = predictions.rename(columns={"property_id": "row_id"}).merge( - results_df[["row_id", "sap"]], how="left", on="row_id" - ) - predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] - predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + if not scoring_df.empty: + scoring_df = scoring_df.drop( + columns=[ + "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", + "carbon_ending" + ] + ) - results_df = results_df.merge( - predictions[["sap_uplift", "row_id"]], - how="left", - on="row_id" - ) - results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) - eligibility_assessment = [] - for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): - # The upgrade requirements are dependent on the current SAP - - # If the property is an F or G, it only needs to upgrade to an % - if row["sap"] <= 38: - if row["post_install_sap"] >= 57: - eligibility_classification = "highest confidence" - elif row["post_install_sap"] >= 55: - eligibility_classification = "high confidence" - elif row["post_install_sap"] >= 53: - eligibility_classification = "medium confidence" - else: - eligibility_classification = "unlikely" - else: - - if row["post_install_sap"] >= 71: - eligibility_classification = "highest confidence" - elif row["post_install_sap"] >= 69: - eligibility_classification = "high confidence" - elif row["post_install_sap"] >= 67: - eligibility_classification = "medium confidence" - else: - eligibility_classification = "unlikely" - - eligibility_assessment.append( - { - "row_id": row["row_id"], - "eligibility_classification": eligibility_classification + all_predictions = model_api.predict_all( + df=scoring_df, + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + "heat_demand_predictions": "retrofit-heat-predictions-dev", + "carbon_change_predictions": "retrofit-carbon-predictions-dev" } ) - eligibility_assessment = pd.DataFrame(eligibility_assessment) + predictions = all_predictions["sap_change_predictions"].copy() - results_df = results_df.merge( - eligibility_assessment, how="left", on="row_id" - ) + predictions = predictions.rename(columns={"property_id": "row_id"}).merge( + results_df[["row_id", "sap"]], how="left", on="row_id" + ) + predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] + predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() + + results_df = results_df.merge( + predictions[["sap_uplift", "row_id"]], + how="left", + on="row_id" + ) + results_df["post_install_sap"] = results_df["sap"] + results_df["sap_uplift"] + + eligibility_assessment = [] + for _, row in results_df[results_df["eco4_eligible"] == True].iterrows(): + # The upgrade requirements are dependent on the current SAP + + # If the property is an F or G, it only needs to upgrade to an % + if row["sap"] <= 38: + if row["post_install_sap"] >= 57: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 55: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 53: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + else: + + if row["post_install_sap"] >= 71: + eligibility_classification = "highest confidence" + elif row["post_install_sap"] >= 69: + eligibility_classification = "high confidence" + elif row["post_install_sap"] >= 67: + eligibility_classification = "medium confidence" + else: + eligibility_classification = "unlikely" + + eligibility_assessment.append( + { + "row_id": row["row_id"], + "eligibility_classification": eligibility_classification + } + ) + + eligibility_assessment = pd.DataFrame(eligibility_assessment) + + results_df = results_df.merge( + eligibility_assessment, how="left", on="row_id" + ) # We store the results in S3 as a pickle save_pickle_to_s3( From f4d27aa68dea5595037d55e7ad8c54cc9d7967ad Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 23:30:06 +0000 Subject: [PATCH 023/155] fixing eligibility --- etl/eligibility/Eligibility.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index bda34923..15e3158f 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -421,8 +421,19 @@ class Eligibility: } return + # Case 3 - cavity is suitable, loft is within 150mm, sap is good + if self.cavity["suitability"] and (self.loft["thickness"] <= 150) and (current_sap < 55): + self.eco4_warmfront = { + "eligible": True, + "strict": False, + "message": "Meets cavity, loft borderline, meets sap", + "cavity_type": self.cavity["type"], + "loft_type": self.loft["thickness_classification"] + } + return + # Case 3 - cavity is suitable, loft is not, sap is good - if self.cavity["suitability"] and (self.loft["thickness"] > 100) and (current_sap < 55): + if self.cavity["suitability"] and (self.loft["thickness"] > 150) and (current_sap < 55): self.eco4_warmfront = { "eligible": True, "strict": False, @@ -444,7 +455,7 @@ class Eligibility: return # Case 5 - cavity and loft suitable, sap too high - if self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap >= 55): + if self.cavity["suitability"] and (self.loft["thickness"] <= 150) and (current_sap >= 55): self.eco4_warmfront = { "eligible": True, "strict": False, @@ -470,7 +481,7 @@ class Eligibility: self.eco4_warmfront = { "eligible": False, "strict": False, - "message": "Fails cavity nd lodt, meets SAP", + "message": "Fails cavity and loft, meets SAP", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } From 97ce8dc32ea0edd3d24ecefe942a0eb4e8df418e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 23:36:45 +0000 Subject: [PATCH 024/155] fixing eligibility --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 239fce65..1ba75e2b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1114,7 +1114,7 @@ def get_epc_data( results = [] scoring_data = [] nodata = [] - for index, property_meta in tqdm(eco4.iterrows(), total=len(eco4)): + for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): if property_meta["matching_postcode"] is None: continue @@ -1218,10 +1218,7 @@ def get_epc_data( # Loft MUST be suitable cavity_age = None if ( - eligibility.walls["is_cavity_wall"] and - eligibility.walls["is_filled_cavity"] and - eligibility.loft["suitability"] and - eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age" + identified_for_eco4 and not eligibility.eco4_warmfront["eligible"] ): # We check the age of the cavity and if it's particularly old, we flag it cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned) From 0fbf00451291a09349c0bdeeb67bbc80bd4dc9bc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 10:20:55 +0000 Subject: [PATCH 025/155] Expanding gbis eligibiity checks --- etl/eligibility/Eligibility.py | 44 +++++++++++++++++-- .../ha_15_32/ha_analysis_batch_3.py | 20 +++++---- etl/epc/Dataset.py | 16 +++---- 3 files changed, 59 insertions(+), 21 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 15e3158f..f7a5ed98 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -352,9 +352,41 @@ class Eligibility: # Check if the property is suitable for cavity wall self.cavity_insulation() - self.gbis_warmfront = (self.cavity["suitability"]) and ( - int(self.epc["current-energy-efficiency"]) <= 68 - ) + current_sap = int(self.epc["current-energy-efficiency"]) + # We have a strict suitability check and a non-strict check + + # Perfect strictness + if (self.cavity["type"] == "empty") and (current_sap < 69): + self.gbis_warmfront = { + "eligible": True, + "strict": True, + "message": "Perfect suitability", + } + return + + # Near perfect + if self.cavity["suitability"] and (current_sap < 55): + self.gbis_warmfront = { + "eligible": True, + "strict": True, + "message": "Near perfect suitability", + } + return + + # Suitable cavity, but high sap + if self.cavity["suitability"] and (current_sap >= 55): + self.gbis_warmfront = { + "eligible": True, + "strict": False, + "message": "Meets cavity, fails SAP check", + } + return + + self.gbis_warmfront = { + "eligible": False, + "strict": False, + "message": "All conditions fail", + } def check_eco4_warmfront(self): """ @@ -388,6 +420,10 @@ class Eligibility: self.cavity_insulation() self.loft_insulation() + # We put in a placeholder when the roof is not a loft + if self.loft["reason"] == "roof not loft": + self.loft["thickness"] = 999 + # Case 1: No conditions meet if not self.cavity["suitability"] and (self.loft["thickness"] > 100) and current_sap >= 55: self.eco4_warmfront = { @@ -415,7 +451,7 @@ class Eligibility: self.eco4_warmfront = { "eligible": True, "strict": True, - "message": "Perfect suitability", + "message": "Near perfect suitability", "cavity_type": self.cavity["type"], "loft_type": self.loft["thickness_classification"] } diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1ba75e2b..28efadd0 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1270,15 +1270,6 @@ def get_epc_data( results_df["post_install_sap"] = None results_df["eligibility_classification"] = None - eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"] - z = results_df[results_df["row_id"].isin(eco4["asset_list_row_id"])] - z["walls"].value_counts() - z1 = z[z["walls"] == "Cavity wall, as built, no insulation"] - k = z1[z1["roof"] == "Pitched, 100 mm loft insulation"] - property_meta = asset_list[asset_list["asset_list_row_id"] == k["row_id"].values[0]].squeeze() - z[z["walls"] == "Cavity wall, as built, insulated"]["roof"].value_counts() - z[z["walls"] == "Cavity wall, as built, insulated"]["roof"].value_counts() - if not scoring_df.empty: scoring_df = scoring_df.drop( columns=[ @@ -1763,6 +1754,17 @@ def patch_cleaned(cleaned): ] ) + cleaned["roof-description"].extend( + [ + {'original_description': 'Pitched, 300+mm loft insulation', + 'clean_description': 'Pitched, 300+ mm loft insulation', 'thermal_transmittance': None, + 'thermal_transmittance_unit': None, 'is_pitched': True, 'is_roof_room': False, 'is_loft': True, + 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False, + 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': '300+' + } + ] + ) + # Patch mainheatcont-description cleaned["mainheatcont-description"].extend( [ diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index dac829e2..7040d66c 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -203,11 +203,11 @@ class TrainingDataset(BaseDataset): common_cols = [[col + "_starting", col + "_ending"] for col in common_cols] self.df = self.df.loc[ - :, - no_suffix_cols - + only_ending_cols - + [col for cols in common_cols for col in cols], - ] + :, + no_suffix_cols + + only_ending_cols + + [col for cols in common_cols for col in cols], + ] def _remove_abnormal_change_in_floor_area(self): """ @@ -509,7 +509,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"] ) - ] + ] elif component == "floor": expanded_df = expanded_df[ (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) @@ -526,7 +526,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"] ) - ] + ] elif component == "roof": expanded_df = expanded_df[ (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) @@ -539,7 +539,7 @@ class TrainingDataset(BaseDataset): expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"] ) - ] + ] return expanded_df From 7b080094fdf08daf720ac01c10bfad380a917062 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 11:02:12 +0000 Subject: [PATCH 026/155] created distributed scoring for prediction --- .../ha_15_32/ha_analysis_batch_3.py | 46 ++++++++++++------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 28efadd0..3dc4d45f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1166,10 +1166,9 @@ def get_epc_data( ] # condition 1 - identified for gbis and not eligible - condition_1 = ( - identified_for_gbis and not eligibility.gbis_warmfront and not eligibility.eco4_warmfront[ - "eligible"] - ) & consider_penultimate_epc + condition_1 = (identified_for_gbis and not eligibility.gbis_warmfront + and not eligibility.eco4_warmfront["eligible"] + ) & consider_penultimate_epc # condition 2 - identified for eco4 and not eligible condition_2 = (identified_for_eco4 and not eligibility.eco4_warmfront[ @@ -1246,10 +1245,12 @@ def get_epc_data( "uprn": eligibility.epc["uprn"], "is_estimated": searcher.newest_epc.get("estimated") is not None, "property_type": eligibility.epc["property-type"], - "gbis_eligible": eligibility.gbis_warmfront, "eco4_eligible": eligibility.eco4_warmfront["eligible"], "eco4_message": eligibility.eco4_warmfront["message"], "eco4_strict": eligibility.eco4_warmfront["strict"], + "gbis_eligible": eligibility.gbis_warmfront["eligible"], + "gbis_message": eligibility.gbis_warmfront["message"], + "gbis_strict": eligibility.gbis_warmfront["strict"], "sap": float(eligibility.epc["current-energy-efficiency"]), # Property components "roof": eligibility.roof["clean_description"], @@ -1279,24 +1280,32 @@ def get_epc_data( ) model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) + model_api.MODEL_PREFIXES = ["sap_change_predictions"] - all_predictions = model_api.predict_all( - df=scoring_df, - bucket="retrofit-data-dev", - prediction_buckets={ - "sap_change_predictions": "retrofit-sap-predictions-dev", - "heat_demand_predictions": "retrofit-heat-predictions-dev", - "carbon_change_predictions": "retrofit-carbon-predictions-dev" - } - ) + scoring_df["id"] = scoring_df["id"] + "phase=0" + # We split up the scoring_df and score + predictions = [] + to_loop_over = range(0, scoring_df.shape[0], 400) + for chunk in tqdm(to_loop_over, total=len(to_loop_over)): + predictions_dict = model_api.predict_all( + df=scoring_df.iloc[chunk:chunk + 400], + bucket="retrofit-data-dev", + prediction_buckets={ + "sap_change_predictions": "retrofit-sap-predictions-dev", + } + ) - predictions = all_predictions["sap_change_predictions"].copy() + predictions.append(predictions_dict["sap_change_predictions"]) + + predictions = pd.concat(predictions) + predictions_size = predictions.shape[0] predictions = predictions.rename(columns={"property_id": "row_id"}).merge( results_df[["row_id", "sap"]], how="left", on="row_id" ) + if predictions.shape[0] != predictions_size: + raise ValueError("Predictions size has changed") predictions["sap_uplift"] = predictions["predictions"] - predictions["sap"] - predictions = predictions.groupby("row_id")["sap_uplift"].sum().reset_index() results_df = results_df.merge( predictions[["sap_uplift", "row_id"]], @@ -1339,9 +1348,12 @@ def get_epc_data( eligibility_assessment = pd.DataFrame(eligibility_assessment) + # Make sure the results haven't changed in size results_df = results_df.merge( eligibility_assessment, how="left", on="row_id" ) + if results_df.shape[0] != len(results): + raise ValueError("results has changed size") # We store the results in S3 as a pickle save_pickle_to_s3( @@ -1809,6 +1821,8 @@ def app(): loader.load() loader.ha_facts_and_figures() + loader.facts_and_figures.to_csv("facts_and_figures.csv", index=False) + # We load in the additional data required to perform the analysis cleaned = read_from_s3( s3_file_name="cleaned_epc_data/cleaned.bson", From 3ef346b248ed89e04a08d07a0231db987809521b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 13:12:54 +0000 Subject: [PATCH 027/155] patching roof description in cleaned further --- .../ha_15_32/ha_analysis_batch_3.py | 60 ++++++++++++++++++- etl/epc/Dataset.py | 28 +++++++++ 2 files changed, 87 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3dc4d45f..e261710e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1755,7 +1755,16 @@ def patch_cleaned(cleaned): ] ) - # We treat unknown loft insulation as no insulation + cleaned["roof-description"].extend( + [ + {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation', + 'thermal_transmittance': None, 'thermal_transmittance_unit': None, 'is_pitched': True, + 'is_roof_room': False, + 'is_loft': False, 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': True, + 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': 'none'} + ] + ) + cleaned["roof-description"].extend( [ {'original_description': 'Pitched, Unknown loft insulation', 'clean_description': 'Pitched, no insulation', @@ -1777,6 +1786,55 @@ def patch_cleaned(cleaned): ] ) + thermal_transmittance_values = list(np.arange(0, 2, 0.01)) + for ttv in thermal_transmittance_values: + ttv_roundeded = round(ttv, 2) + # We look for an instance of that thermal transmittance value + rec = [ + x for x in cleaned["roof-description"] if + (x["thermal_transmittance"] == ttv_roundeded) and "Average thermal transmittance" in x["clean_description"] + ] + + if rec: + continue + else: + # We patch the record + cleaned["roof-description"].extend( + [{'original_description': f'Average thermal transmittance {ttv_roundeded} W/m-¦K', + 'clean_description': f'Average thermal transmittance {ttv_roundeded} w/m-¦k', + 'thermal_transmittance': ttv_roundeded, + 'thermal_transmittance_unit': 'w/m-¦k', 'is_pitched': False, 'is_roof_room': False, 'is_loft': False, + 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False, + 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None}] + ) + + # We also patch a funny unit value we found + for ttv in thermal_transmittance_values: + ttv_rounded = round(ttv, 2) + # We look for an instance of that thermal transmittance value + rec = [ + x for x in cleaned["roof-description"] if + (x["thermal_transmittance"] == ttv_rounded) and "Average thermal transmittance" in x["clean_description"] + and x["thermal_transmittance_unit"] == "w/m?K" + ] + + if rec: + continue + else: + # We patch the record + ttv_string = str(ttv_rounded) + if len(ttv_string) == 3: + ttv_string = f"{ttv_string}0" + + cleaned["roof-description"].extend( + [{'original_description': f'Average thermal transmittance {ttv_string} W/m?K', + 'clean_description': f'Average thermal transmittance {ttv_string} w/m-¦k', + 'thermal_transmittance': ttv_rounded, + 'thermal_transmittance_unit': 'w/m-¦k', 'is_pitched': False, 'is_roof_room': False, 'is_loft': False, + 'is_flat': False, 'is_thatched': False, 'is_at_rafters': False, 'is_assumed': False, + 'has_dwelling_above': False, 'is_valid': True, 'insulation_thickness': None}] + ) + # Patch mainheatcont-description cleaned["mainheatcont-description"].extend( [ diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 7040d66c..cf241747 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -658,6 +658,34 @@ class TrainingDataset(BaseDataset): components_to_expand = cols_to_drop.keys() + for comp in list(components_to_expand): + if comp == "main-fuel": + cleaned_key = "main-fuel" + left_on_starting = "main_fuel_starting" + left_on_ending = "main_fuel_ending" + original_cols = ["main_fuel_starting", "main_fuel_ending"] + else: + cleaned_key = f"{comp}-description" + left_on_starting = f"{comp}_description_starting" + left_on_ending = f"{comp}_description_ending" + original_cols = [ + f"{comp}_description_starting", + f"{comp}_description_ending", + ] + df = pd.DataFrame(cleaned_lookup[cleaned_key]) + # Check for the existence + filtered_1 = df[df["original_description"] == self.df[left_on_starting].values[0]] + filtered_2 = df[df["original_description"] == self.df[left_on_ending].values[0]] + if filtered_1.empty: + print(comp) + print(self.df[left_on_starting].values[0]) + + if filtered_2.empty: + print(f"Original description {self.df[left_on_ending].values[0]} not found in lookup") + + z = pd.DataFrame(cleaned_lookup["roof-description"]) + z[z["original_description"] == "Average thermal transmittance 0.20 W/m?K"] + for component in components_to_expand: # TODO: change cleaned dataframe to have underscores instead of dashes if component == "main-fuel": From 730ad0fd7144b2b5e86d98b8c3ef4e5d71ccd0cb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 13:13:28 +0000 Subject: [PATCH 028/155] removing temp code --- etl/epc/Dataset.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index cf241747..7040d66c 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -658,34 +658,6 @@ class TrainingDataset(BaseDataset): components_to_expand = cols_to_drop.keys() - for comp in list(components_to_expand): - if comp == "main-fuel": - cleaned_key = "main-fuel" - left_on_starting = "main_fuel_starting" - left_on_ending = "main_fuel_ending" - original_cols = ["main_fuel_starting", "main_fuel_ending"] - else: - cleaned_key = f"{comp}-description" - left_on_starting = f"{comp}_description_starting" - left_on_ending = f"{comp}_description_ending" - original_cols = [ - f"{comp}_description_starting", - f"{comp}_description_ending", - ] - df = pd.DataFrame(cleaned_lookup[cleaned_key]) - # Check for the existence - filtered_1 = df[df["original_description"] == self.df[left_on_starting].values[0]] - filtered_2 = df[df["original_description"] == self.df[left_on_ending].values[0]] - if filtered_1.empty: - print(comp) - print(self.df[left_on_starting].values[0]) - - if filtered_2.empty: - print(f"Original description {self.df[left_on_ending].values[0]} not found in lookup") - - z = pd.DataFrame(cleaned_lookup["roof-description"]) - z[z["original_description"] == "Average thermal transmittance 0.20 W/m?K"] - for component in components_to_expand: # TODO: change cleaned dataframe to have underscores instead of dashes if component == "main-fuel": From d573c4d8a0ae911edd0e2f181eceb4087e3e78e4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 15:15:05 +0000 Subject: [PATCH 029/155] added try except mechanism --- .../ha_15_32/ha_analysis_batch_3.py | 35 ++++++++++++------- etl/epc/Record.py | 32 ++++++++--------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index e261710e..da484daa 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1089,6 +1089,9 @@ def get_epc_data( outputs = {} for ha_name, data_assets in loader.data.items(): + if ha_name == "HA39": + continue + if not pull_data: # Then we retrieve the data from S3 processed_ha_results = read_pickle_from_s3( @@ -1114,6 +1117,7 @@ def get_epc_data( results = [] scoring_data = [] nodata = [] + failed_model_rows = [] for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): if property_meta["matching_postcode"] is None: @@ -1225,19 +1229,24 @@ def get_epc_data( if eligibility.eco4_warmfront["eligible"]: if eligibility.epc["uprn"] == "": eligibility.epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) - - scoring_dictionary = prepare_model_data_row( - property_id=property_meta["asset_list_row_id"], - modelling_epc=eligibility.epc, - cleaned=cleaned, - cleaning_data=cleaning_data, - created_at=created_at, - old_data=older_epcs, - full_sap_epc=full_sap_epc, - photo_supply_lookup=photo_supply_lookup, - floor_area_decile_thresholds=floor_area_decile_thresholds - ) - scoring_data.extend(scoring_dictionary) + try: + scoring_dictionary = prepare_model_data_row( + property_id=property_meta["asset_list_row_id"], + modelling_epc=eligibility.epc, + cleaned=cleaned, + cleaning_data=cleaning_data, + created_at=created_at, + old_data=older_epcs, + full_sap_epc=full_sap_epc, + photo_supply_lookup=photo_supply_lookup, + floor_area_decile_thresholds=floor_area_decile_thresholds + ) + scoring_data.extend(scoring_dictionary) + except Exception as e: + # If we fail, we just keep a record of it + failed_model_rows.append( + property_meta["asset_list_row_id"] + ) results.append( { diff --git a/etl/epc/Record.py b/etl/epc/Record.py index c793716f..e74330a2 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -725,26 +725,26 @@ class EPCRecord: if self.prepared_epc["construction-age-band"] in DATA_ANOMALY_MATCHES: if self.old_data: # Take the most recent - max_datetime = max( - [ - old_record["lodgement-datetime"] - for old_record in self.old_data - if old_record["construction-age-band"] - not in DATA_ANOMALY_MATCHES - ] - ) - - most_recent = [ - old_record + old_age_bands = [ + old_record["lodgement-datetime"] for old_record in self.old_data - if old_record["lodgement-datetime"] == max_datetime + if old_record["construction-age-band"] not in DATA_ANOMALY_MATCHES ] - self.prepared_epc["construction-age-band"] = ( - EPCDataProcessor.clean_construction_age_band( - most_recent[0]["construction-age-band"] + if old_age_bands: + max_datetime = max(old_age_bands) + + most_recent = [ + old_record + for old_record in self.old_data + if old_record["lodgement-datetime"] == max_datetime + ] + + self.prepared_epc["construction-age-band"] = ( + EPCDataProcessor.clean_construction_age_band( + most_recent[0]["construction-age-band"] + ) ) - ) self.construction_age_band = self.prepared_epc["construction-age-band"] self.age_band = england_wales_age_band_lookup.get(self.construction_age_band) From b26e44b465e5c832a65b5bd09767f1015c2dfc1a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 15:45:33 +0000 Subject: [PATCH 030/155] Extending to HA 7 --- .../ha_15_32/ha_analysis_batch_3.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index da484daa..2fb26e73 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -48,6 +48,10 @@ PROPERTY_TYPE_LOOKUP = { 'EXTRACARE SCHEME': "Flat", } }, + "HA7": { + "property_type": {}, + "built_form": {} + }, "HA14": { "property_type": { "House": "House", @@ -143,6 +147,13 @@ class DataLoader: asset_list["matching_postcode"] = asset_list[ self.COLUMN_CONFIG[ha_name]["postcode"] ].str.lower().str.strip() + elif ha_name == "HA7": + # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode + asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip() + ", " + \ + asset_list["Address2"].str.lower().str.strip() + ", " + \ + asset_list["Address3"].str.lower().str.strip() + ", " + \ + asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() elif ha_name == "HA14": # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \ @@ -241,6 +252,8 @@ class DataLoader: def get_asset_sheetname(workbook): if "Asset List" in workbook.sheetnames: return "Asset List" + elif "Asset" in workbook.sheetnames and "Assets" not in workbook.sheetnames: + return "Asset" else: return "Assets" @@ -311,6 +324,8 @@ class DataLoader: survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]]) # Remove columns that are None survey_list = survey_list.loc[:, survey_list.columns.notnull()] + # Remove rows that are completely empty + survey_list = survey_list.loc[survey_list.loc[:, survey_list.columns].notnull().any(axis=1)] survey_list["survey_list_row_id"] = [ha_name + "_survey_" + str(i) for i in range(0, len(survey_list))] # Perform survey list merge @@ -328,6 +343,8 @@ class DataLoader: ciga_list = pd.DataFrame(ciga_rows, columns=[cell.value for cell in ciga_sheet[1]]) # Remove columns that are None ciga_list = ciga_list.loc[:, ciga_list.columns.notnull()] + # Remove rows that are completely None + ciga_list = ciga_list.loc[ciga_list.loc[:, ciga_list.columns].notnull().any(axis=1)] # Perform ciga list merge if not ciga_list.empty: # Remove rows with missing postcode which happens in a small number of cases @@ -1880,7 +1897,7 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - priority_has = ["HA1", "HA6", "HA14", "HA39", "HA107"] + priority_has = ["HA1", "HA6", "HA7", "HA14", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From eb216e55d39817a6d7bdd6c582c6da6826050ac9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 16:45:37 +0000 Subject: [PATCH 031/155] Handling missing dates in SearchEpc class --- backend/SearchEpc.py | 15 ++++++++++----- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 4a3f371a..3d2df9fb 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -30,7 +30,7 @@ vartypes = { 'environment-impact-potential': "Int64", 'glazed-type': 'str', 'heating-cost-current': 'float', - 'address3': 'str', + # 'address3': 'str', 'mainheatcont-description': 'str', 'sheating-energy-eff': 'str', 'property-type': 'str', @@ -40,7 +40,7 @@ vartypes = { 'mechanical-ventilation': 'str', 'hot-water-cost-current': 'str', 'county': 'str', - 'postcode': 'str', + # 'postcode': 'str', 'solar-water-heating-flag': 'str', 'constituency': 'str', 'co2-emissions-potential': 'float', @@ -55,7 +55,7 @@ vartypes = { # 'inspection-date': str, 'mains-gas-flag': 'str', 'co2-emiss-curr-per-floor-area': 'float', - 'address1': 'str', + # 'address1': 'str', 'heat-loss-corridor': 'str', 'flat-storey-count': "Int64", 'constituency-label': 'str', @@ -67,7 +67,7 @@ vartypes = { 'roof-description': 'str', 'floor-energy-eff': 'str', 'number-habitable-rooms': 'float', - 'address2': 'str', + # 'address2': 'str', 'hot-water-env-eff': 'str', 'posttown': 'str', 'mainheatc-energy-eff': 'str', @@ -98,7 +98,7 @@ vartypes = { # 'lodgement-date', 'extension-count': "Int64", 'mainheatc-env-eff': 'str', - 'lmk-key': 'str', + # 'lmk-key': 'str', 'wind-turbine-count': "Int64", 'tenure': 'str', 'floor-level': 'str', @@ -575,6 +575,11 @@ class SearchEpc: property_type=property_type ) + # If we have missing lodgment date, we fill it with inspection-date + epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["inspection-date"]) + # If we still have missing dates, we set it to the mean of the non NA dates + epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["lodgement-datetime"].mean()) + # For each attribute, we need to determine the datatype and use an appropriate method # to estimate. estimated_epc = {} diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 2fb26e73..a8f0bfa9 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1135,6 +1135,7 @@ def get_epc_data( scoring_data = [] nodata = [] failed_model_rows = [] + # Failed at index 13691 for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): if property_meta["matching_postcode"] is None: From 2a4d16162abc8bcda788950d44a0762148e8904d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 18:01:29 +0000 Subject: [PATCH 032/155] Added ha7 --- .../ha_15_32/ha_analysis_batch_3.py | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a8f0bfa9..889ae776 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -49,8 +49,19 @@ PROPERTY_TYPE_LOOKUP = { } }, "HA7": { - "property_type": {}, - "built_form": {} + "property_type": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + }, + "built_form": { + "Semi Detached": "Semi-Detached", + "Mid Terrace": "Mid-Terrace", + "End Terrace": "End-Terrace", + "Detached": "Detached", + "End Terraced": "End-Terrace", + } }, "HA14": { "property_type": { @@ -1042,6 +1053,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA6": property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]] built_form = property_meta["built_form"] + elif ha_name == "HA7": + property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Archetype"]] + built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"][property_meta["Property Type"]] elif ha_name == "HA14": if property_meta["Asset Type Description"] == "Block - Repair": # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address @@ -1106,9 +1120,6 @@ def get_epc_data( outputs = {} for ha_name, data_assets in loader.data.items(): - if ha_name == "HA39": - continue - if not pull_data: # Then we retrieve the data from S3 processed_ha_results = read_pickle_from_s3( @@ -1135,7 +1146,6 @@ def get_epc_data( scoring_data = [] nodata = [] failed_model_rows = [] - # Failed at index 13691 for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): if property_meta["matching_postcode"] is None: @@ -1906,8 +1916,6 @@ def app(): loader.load() loader.ha_facts_and_figures() - loader.facts_and_figures.to_csv("facts_and_figures.csv", index=False) - # We load in the additional data required to perform the analysis cleaned = read_from_s3( s3_file_name="cleaned_epc_data/cleaned.bson", From 9ca6c179bca70cfffd34da4e278e144ff8263e24 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 18:34:49 +0000 Subject: [PATCH 033/155] Adding HA16 --- .../ha_15_32/ha_analysis_batch_3.py | 139 +++++++++++++++++- 1 file changed, 135 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 889ae776..a707cfa5 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -128,6 +128,10 @@ class DataLoader: "HA6": { "address": "propertyaddress", "postcode": "address" # The 'address' column actually contains postcode + }, + "HA16": { + "address": "Address", + "postcode": "Postcode" } } @@ -135,9 +139,10 @@ class DataLoader: # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not # the asset list "HA14": 3, + "HA16": 7, # There's just too many unmatched here "HA6": 117, - "HA107": 51 + "HA107": 51, } def __init__(self, directories, december_figures_filepath, use_cache): @@ -151,7 +156,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6"]: + if ha_name in ["HA1", "HA6", "HA16"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].str.lower().str.strip() @@ -173,6 +178,7 @@ class DataLoader: asset_list["Address 4"].str.lower().str.strip() + ", " + \ asset_list["Postcode"].str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + elif ha_name == "HA39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -234,7 +240,7 @@ class DataLoader: :return: """ - if ha_name in ["HA6", "HA14", "HA107"]: + if ha_name in ["HA6", "HA14", "HA107", "HA16"]: split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how @@ -556,6 +562,129 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha16_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower() + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == "REEDS RD", + "Reeds ROAD", + survey_list["Street / Block Name"] + ) + # Replace " rd " with "road" + survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(r'\brd\b', 'road', + regex=True) + + # Replace " , " with ", " + survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace( + " , ", ', ', + ) + # Fix "{place} ,{place}" with "{place}, {place}" + survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.replace(r'\s*,\s*', ', ', + regex=True) + # Strip whitespace + survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.strip() + + # Correct errors + survey_list["Post Code"] = np.where( + survey_list["Post Code"] == "M38 0SA", + "M38 9SA", + survey_list["Post Code"] + ) + + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"] == "nelson drive") & (survey_list["Post Code"] == "M44 5JE"), + "M44 5JF", + survey_list["Post Code"] + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("eccels", "eccles") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("chatley, road", + "chatley road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("vaughen", "Vaughan") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("cresent", "crescent") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("plantation road", + "plantation avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("how clough drive", + "howclough drive") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brockhurst lane", + "brookhurst lane") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("biirch road", + "birch road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hadson road", + "hodson road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("harbonne avennue", + "narbonne avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "cumberland road, cadishead", + "cumberland avenue, cadishead") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("aston field drive", + "ashton field drive") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wedgewood road", + "wedgwood road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hamilton close", + "hamilton avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "lichens crescent, fitton hill", + "lichens crescent") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("south croft, fitton hill", + "south croft") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(", fitton hill", "") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("firtree dr", + "fir tree avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hawthorne road", + "hawthorn crescent") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("rein lee avenue", + "reins lee avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("westerhill road", + "wester hill road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("st martins road", + "saint martins road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("timperley avenue", + "timperley close") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("eastwood road", + "eastwood avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("new road", "new street") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("grassmere road", + "grasmere road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("hulton road", + "hulton avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("beechfield avenue", + "beechfield road") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("princess avenue", + "princes avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("edge ford crecent", + "edge fold crescent") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("conniston avenue", + "coniston avenue") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("blackthorne crescent", + "blackthorn crescent") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("wellstock road", + "wellstock lane") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brackley avenue", + "brackley street") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("brook avenue swinton", + "brook avenue, swinton") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("green avenue swinton", + "green avenue, swinton") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("grasmere avenue wardley", + "grasmere avenue, wardley") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("mardale avenue wardle", + "mardale avenue, wardle") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("carleach grove", + "cartleach Grove") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("arbour grove", + "arbor Grove") + + # Replacement for clively avenue 66-68 + survey_list["NO."] = np.where( + survey_list["NO."] == "66-68", + "66", + survey_list["NO."] + ) + + return survey_list + @staticmethod def correct_ha107_survey_list(survey_list): # Replace Front Street, East Stockham with Front Street, East Stockwith @@ -898,6 +1027,8 @@ class DataLoader: scheme_map = { "ECO4": "ECO4", "AFFORDABLE WARMTH": "ECO4", + "ECO4 A/W": "ECO4", + "ECO4 GBIS (ECO+)": "GBIS" } eco_eligibility_map = { @@ -1908,7 +2039,7 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - priority_has = ["HA1", "HA6", "HA7", "HA14", "HA39", "HA107"] + priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From 102600b19651964c4b6c7945307a8defd454f9d1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 18:40:17 +0000 Subject: [PATCH 034/155] Added HA16 --- .../ha_15_32/ha_analysis_batch_3.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a707cfa5..ee23f238 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -71,6 +71,24 @@ PROPERTY_TYPE_LOOKUP = { "Maisonette": "Maisonette", } }, + "HA16": { + 'Semi Detached Bungalow': {"property-type": "Bungalow", "built-form": "Semi-Detached"}, + 'Mid Terraced House': {"property-type": "House", "built-form": "Mid-Terrace"}, + 'End Terraced House': {"property-type": "House", "built-form": "End-Terrace"}, + 'Low Rise Flat': {"property-type": "Flat", "built-form": "Mid-Terrace"}, + 'Semi-Detached House': {"property-type": "House", "built-form": "Semi-Detached"}, + 'Detached Bungalow': {"property-type": "Bungalow", "built-form": "Detached"}, + 'End Terraced Bungalow': {"property-type": "Bungalow", "built-form": "End-Terrace"}, + 'Mid Terraced Bungalow': {"property-type": "Bungalow", "built-form": "Mid-Terrace"}, + 'Medium Rise Flat': {"property-type": "Flat", "built-form": "Mid-Terrace"}, + 'Detached House': {"property-type": "House", "built-form": "Detached"}, + 'Cottage Flat': {"property-type": "Flat", "built-form": "Semi-Detached"}, + 'Maisonette Medium Rise': {"property-type": "Flat", "built-form": "Mid-Terrace"}, + 'Maisonette Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"}, + 'End Terraced Town House': {"property-type": "House", "built-form": "End-Terrace"}, + 'Flat Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"}, + 'Mid Terraced Town House': {"property-type": "House", "built-form": "Mid-Terrace"}, + }, "HA39": { "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, "1st floor flat": {"property_type": "Flat", "built_form": None}, @@ -1201,6 +1219,10 @@ def get_property_type_and_built_form(property_meta, ha_name): ] built_form = None + elif ha_name == "HA16": + config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]] + property_type = config.get("property-type") + built_form = config.get("built-form") elif ha_name == "HA39": property_type_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {}) From a1c19b5b8883ead263880c2d589bd76da76d6403 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 19:01:32 +0000 Subject: [PATCH 035/155] Adding ha24 wip --- .../ha_15_32/ha_analysis_batch_3.py | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ee23f238..94df8ceb 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -150,6 +150,10 @@ class DataLoader: "HA16": { "address": "Address", "postcode": "Postcode" + }, + "HA24": { + "address": "Address", + "postcode": "Postcode" } } @@ -174,7 +178,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA16"]: + if ha_name in ["HA1", "HA6", "HA16", "HA24"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].str.lower().str.strip() @@ -289,6 +293,8 @@ class DataLoader: return "Asset List" elif "Asset" in workbook.sheetnames and "Assets" not in workbook.sheetnames: return "Asset" + elif "Decent Homes Stock" in workbook.sheetnames: + return "Decent Homes Stock" else: return "Assets" @@ -703,6 +709,43 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha24_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ") + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.lower() + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.strip() + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "council house, nidds lane", "nidds lane" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "wirral avenue", "wirrall avenue" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "st ives road", "st. ives crescent" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "sundringham road", "sandringham road" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "milton avenue", "milton road" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "st ives crescent", "st. ives crescent" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "council house, waterbelly lane", "waterbelly lane" + ) + # Generally remove "councile house, " from the start of the street name + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "council house, ", "" + ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "st. leodegars close", "st leodegars close" + ) + + return survey_list + @staticmethod def correct_ha107_survey_list(survey_list): # Replace Front Street, East Stockham with Front Street, East Stockwith @@ -2061,7 +2104,7 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"] + priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From e9bfd63c3588206cd9e7c79b25c6067b617bf436 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 27 Feb 2024 21:00:23 +0000 Subject: [PATCH 036/155] Fixed getting property type and built form for ha107 --- .../ha_15_32/ha_analysis_batch_3.py | 77 ++++++++++++++----- 1 file changed, 57 insertions(+), 20 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 94df8ceb..5cbfb90c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -223,12 +223,67 @@ class DataLoader: return asset_list + @staticmethod + def extract_property_info_ha107(properties): + property_types = { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + "Bedsit": None + } + + built_forms = { + "Detached": "Detached", + "Semi Detached": "Semi-Detached", + "End Terrace": "End-Terrace", + "Mid Terrace": "Mid-Terrace" + } + + # Function to extract property type and built form from a description + def extract_from_description(description): + property_type = None + built_form = None + + for key in property_types: + if key in description: + property_type = property_types[key] + break + + for key in built_forms: + if key in description: + built_form = built_forms[key] + break + + return property_type, built_form + + # Process each property in the list + results = [] + for property_description in properties: + property_type, built_form = extract_from_description(property_description) + results.append( + { + "Property type": property_description, + "property_type": property_type, + "built_form": built_form + } + ) + results = pd.DataFrame(results) + + return results + def append_asset_list_built_form(self, ha_name, asset_list): # Finally, we process property_type or built form, where needed if ha_name == "HA6": asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6) + if ha_name == "HA107": + mapped_df = self.extract_property_info_ha107(asset_list["Property type"].unique()) + asset_list = asset_list.merge( + mapped_df, how="left", on="Property type" + ) + return asset_list @staticmethod @@ -1280,26 +1335,8 @@ def get_property_type_and_built_form(property_meta, ha_name): property_type = "House" elif ha_name == "HA107": - dwelling_style = property_meta["Dwelling Style"] - if isinstance(dwelling_style, str): - dwelling_style = dwelling_style.strip() - - property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["DwellingType"]) - built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(dwelling_style, None) - - if property_type is None: - if built_form in ["Semi-Detached", "Mid-Terrace", "End-Terrace", "Detached"]: - property_type = "House" - - if "flat" in property_meta["Wall Construction"].lower(): - property_type = "Flat" - - if (property_meta["DwellingType"] == "UNKNOWN") & (property_meta["Dwelling Style"] == 0): - # Hand a few specific cases - property_type = "Bungalow" - - if property_meta["Street"] == "School View": - property_type = "Bungalow" + property_type = property_meta.get("property_type", None) + built_form = property_meta.get("built_form", None) else: raise NotImplementedError("Implement me") From 6ae21bbcb023139961eb69749ac1380a7d3ac001 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 28 Feb 2024 12:31:48 +0000 Subject: [PATCH 037/155] Creating the output structure --- etl/eligibility/Eligibility.py | 11 +- .../ha_15_32/ha_analysis_batch_3.py | 548 +++++++----------- 2 files changed, 220 insertions(+), 339 deletions(-) diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index f7a5ed98..b594579f 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -365,7 +365,7 @@ class Eligibility: return # Near perfect - if self.cavity["suitability"] and (current_sap < 55): + if self.cavity["suitability"] and (current_sap < 69): self.gbis_warmfront = { "eligible": True, "strict": True, @@ -373,15 +373,6 @@ class Eligibility: } return - # Suitable cavity, but high sap - if self.cavity["suitability"] and (current_sap >= 55): - self.gbis_warmfront = { - "eligible": True, - "strict": False, - "message": "Meets cavity, fails SAP check", - } - return - self.gbis_warmfront = { "eligible": False, "strict": False, diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 5cbfb90c..61c4a243 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1646,10 +1646,26 @@ def get_epc_data( def get_col_widths(dataframe): - # First we find the maximum length of the index column - idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))]) - # Then, we concatenate this to the max of the lengths of column name and its max value for each column, row-wise - return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(col)) for col in dataframe.columns] + # Define a maximum width for any column to prevent excessively wide columns + max_allowed_width = 25 + + # Calculate widths for columns + widths = [] + + if isinstance(dataframe.columns, pd.MultiIndex): + # For MultiIndex, calculate max width considering the header and data + header_widths = [max(len(str(item)) for item in col) + 2 for col in dataframe.columns.values] # +2 for padding + for i, column in enumerate(dataframe.columns): + max_data_width = max(dataframe[column].astype(str).apply(len).max(), header_widths[i]) + widths.append(min(max_data_width, max_allowed_width)) + else: + # For non-MultiIndex, calculate width normally + for col in dataframe.columns: + # Calculate the max length of data or column name and limit it + max_length = max(dataframe[col].astype(str).apply(len).max(), len(str(col)) + 2) # +2 for padding + widths.append(min(max_length, max_allowed_width)) + + return widths def analyse_ha_data(outputs, loader): @@ -1671,42 +1687,13 @@ def analyse_ha_data(outputs, loader): :return: """ - eco4_rate = 1710 - gbis_rate = 600 - ha_analysis_results = [] - ha_revenue_results = [] for ha_name, datasets in outputs.items(): - inputs = [x for k, x in loader.data.items() if k == ha_name][0] - # TODO: This is placeholder because we don't have the schemes that the properties have been qualified for - # yet - # - import random - randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=inputs["asset_list"].shape[0]) - inputs["asset_list"]["randomly_allocated_schemes"] = randomly_allocated_schemes - inputs["asset_list"]["funding_scheme"] = None - inputs["asset_list"]["funding_scheme"] = np.where( - inputs["asset_list"]["row_meaning"] == "identified potential eco works (CWI)", - inputs["asset_list"]["randomly_allocated_schemes"], - inputs["asset_list"]["funding_scheme"] - ) - - # TODO: Also temp, just for HA 6 - if ha_name == "ha_6": - inputs["survey_list"]["funding_scheme"] = None - inputs["survey_list"]["funding_scheme"] = np.where( - inputs["survey_list"][ - 'AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION '] == "AFFORDABLE WARMTH", - "ECO4", - "GBIS" - ) - - # End placholder results_df = datasets["results_df"].copy() - analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename( + analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename( columns={"row_meaning": "asset_identification_status"} ).merge( results_df, @@ -1715,293 +1702,236 @@ def analyse_ha_data(outputs, loader): left_on="asset_list_row_id" ) - # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is - # remaining + ################################################################################################ + # We take the properties that strictly qualified under eco + ################################################################################################ - if inputs["matched_lookup"] is not None: - analysis_data = analysis_data.merge( - inputs["matched_lookup"], how="left", on="asset_list_row_id" + eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy() + eco4_identified["identification_type"] = None + eco4_identified["identification_type"] = np.where( + (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True), + "strict", + eco4_identified["identification_type"] + ) + + eco4_identified["identification_type"] = np.where( + (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False), + "expansive", + eco4_identified["identification_type"] + ) + ################################################################################################ + # We take the properties dependent on CIGA + ################################################################################################ + + ciga_dependent_identified = analysis_data[ + analysis_data["ECO Eligibility"].isin( + [ + "eco4 (subject to ciga)", + "eco4 - passed ciga" + ] ) - # Drop any rows that have a survey_list_row_id - analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])] + ].copy() - # If we have a survey list, we merge this onto the results - n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique() - - properties_sold = ( - inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if - inputs["survey_list"] is not None else pd.DataFrame(columns=["funding_scheme"]) - ) - properties_sold_eco4 = ( - properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if - (not properties_sold.empty) and ("ECO4" in properties_sold["funding_scheme"].values) else 0 - ) - properties_sold_gbis = ( - properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if - (not properties_sold.empty) and ("GBIS" in properties_sold["funding_scheme"].values) else 0 + # These are properties that show filled cavity + ciga_dependent_identified["identification_type"] = None + ciga_dependent_identified["identification_type"] = np.where( + ciga_dependent_identified["eco4_message"].isin( + [ + "Perfect suitability", + "Meets cavity and sap", + "Fails cavity, meets loft, fails SAP", + "Meets fabric, fails SAP check", + "Meets cavity, loft borderline, meets sap", + ] + ), + "strict", + ciga_dependent_identified["identification_type"] ) - # We now calculate the number of remaining properties, by scheme - remaining_properties = analysis_data[ - analysis_data["asset_identification_status"] == "identified potential eco works (CWI)" - ].copy() - remaining_properties["prospect_type"] = None - - remaining_properties_by_scheme = ( - remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index() + ciga_dependent_identified["identification_type"] = np.where( + (ciga_dependent_identified["eco4_message"].isin(["All conditions fail", "failed fabric check"])) & + (ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])), + "expansive", + ciga_dependent_identified["identification_type"] ) - n_remaining_properties_eco4 = remaining_properties_by_scheme[ - remaining_properties_by_scheme["funding_scheme"] == "ECO4" - ]["asset_list_row_id"].values[0] + ciga_dependent_identified["identification_type"] = np.where( + (ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | ( + ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"]) + ), + "expansive", + ciga_dependent_identified["identification_type"] + ) - n_remaining_properties_gbis = remaining_properties_by_scheme[ - remaining_properties_by_scheme["funding_scheme"] == "GBIS" - ]["asset_list_row_id"].values[0] + ################################################################################################ + # We properties that qualified for gbis + ################################################################################################ + gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy() + gbis_identified["identification_type"] = None + gbis_identified["identification_type"] = np.where( + (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69), + "strict", + gbis_identified["identification_type"] + ) - # For the remaining properties, we use the results of the eligibility process to classify the property into - # one of multiple categories - # - # For properties that have been identified as ECO4 - # 1) Strict ECO4 candidate - Has required fabric and EPC is D or below. We consider D or below here, because - # Warmfront regularly re-surveys properties which then fall within the SAP requirement - # - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties - # here and re-surveying is a common practicce by Warmfront. Additionally, many of the social homes have - # very old EPCs which may score lower when re-done - # 2) Meets Fabric requirements, not SAP - # Warmfront has identified the property as eligible, but the EPC is not D or below. We consider this but - # label is separately as not a strict - # 3) Subject to CIGA check - Meets loft conditions but shows a filled cavity. - # - we don't have a SAP constraint here because the EPC is (currently) showing what the property might - # actually look like after retrofit and so the EPC currently being a C or above means little, because - # the updated EPC, showing an empty cavity, could bring the property within - # 4) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation. - # - No SAP constraint, for the same reason as in category 2) - # 5) Looks like GBIS instead - # 6) Does not look like ECO4 candidate - # - # For properties that have been identified as GBIS - # 1) Strict GBIS candidates - # 2) Properties that actually look like strict GBIS candidates - # 3) Subject to CIGA check - Filled cavity - # 4) Does not look like a GBIS candidate + gbis_identified["identification_type"] = np.where( + (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] >= 69), + "expansive", + gbis_identified["identification_type"] + ) - remaining_eco4_df = remaining_properties[ - remaining_properties["funding_scheme"] == "ECO4" - ].copy() + # Finally, we look at the properties that have not been identified by Warmfront + not_identified = analysis_data[ + analysis_data["ECO Eligibility"].isin( + [ + "not eligible" + ] + ) + ].copy() - #################################### + surplus_eco4 = not_identified[ + (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin( + ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"] + )) + ] + + surplus_gbis = not_identified[ + (not_identified["gbis_eligible"] == True) & ( + ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values) + ) & (not_identified["sap"] < 69) & ( + (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | ( + not_identified["walls"].str.contains("partial", case=False, na=False) + ) + ) + ] + surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False] + + # Output variables # ECO4 - #################################### - - # 1) We identify this if: - # - remaining_properties["eco4_eligible"] == True - - remaining_eco4_df["prospect_type"] = np.where( - (remaining_eco4_df["eco4_eligible"] == True), - "strict ECO4", - remaining_eco4_df["prospect_type"] + n_properties_in_asset_list = inputs["asset_list"].shape[0] + n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0] + eco4_of_which_identified_strict = ( + eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] + + ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0] ) - - # 2) Meets fabric requirements - remaining_eco4_df["prospect_type"] = np.where( - ( - (remaining_eco4_df["eco4_message"] == "sap too high") & - remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) & - remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) & - pd.isnull(remaining_eco4_df["prospect_type"]) - ), - "ECO4 if SAP downgrade", - remaining_eco4_df["prospect_type"] + eco4_of_which_identified_expansive = ( + eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] + + ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0] ) - - # 3) We identify this if it has a filled cavity but meets the loft conditions - # TODO: Consider if we should also allow 100-270mm or if we should add some slight tolerance (e.g. 150mm) - # to account for measurement error - remaining_eco4_df["prospect_type"] = np.where( - ( - remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) & - remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) - ), - "ECO4 - Filled cavity - subject to CIGA check", - remaining_eco4_df["prospect_type"] - ) - - # 4) We identify this by ensuring the cavity if empty or partial, and the loft has between 101 and 270mm - remaining_eco4_df["prospect_type"] = np.where( - ( - remaining_eco4_df["eligibility_cavity_type"].isin(["empty", "partial"]) & - remaining_eco4_df["eligibility_loft_type"].isin(["100-270mm"]) - ), - "ECO4 prospect - empty cavity, loft insulation below regulation", - remaining_eco4_df["prospect_type"] - ) - - # 5) Looks like GBIS instead - remaining_eco4_df["prospect_type"] = np.where( - (remaining_eco4_df["gbis_eligible"] == True) & pd.isnull(remaining_eco4_df["prospect_type"]), - "Looks like GBIS", - remaining_eco4_df["prospect_type"] - ) - - # 6) This is everything else (i.e. both the cavity is full and the loft insulation is above 100mm) - remaining_eco4_df["prospect_type"] = remaining_eco4_df["prospect_type"].fillna( - "Does not look like ECO4 candidate" - ) - - #################################### # GBIS - #################################### - - remaining_gbis = remaining_properties[ - remaining_properties["funding_scheme"] == "GBIS" - ].copy() - - # 1) Strict GBIS candidates - remaining_gbis["prospect_type"] = np.where( - ( - (remaining_gbis["gbis_eligible"] == True) & (remaining_gbis["eco4_eligible"] == False) - ), - "strict GBIS", - remaining_gbis["prospect_type"] - ) - - # 2) GBIS candidates that look like strict ECO4 candidates - remaining_gbis["prospect_type"] = np.where( - (remaining_gbis["eco4_eligible"] == True), - "GBIS - Upgradable to ECO4", - remaining_gbis["prospect_type"] - ) - - # 3) Subject to CIGA check - Filled cavity - remaining_gbis["prospect_type"] = np.where( - ( - remaining_gbis["eligibility_cavity_type"].isin(["full"]) & - pd.isnull(remaining_gbis["prospect_type"]) - ), - "GBIS - Filled cavity - subject to CIGA check", - remaining_gbis["prospect_type"] - ) - - # 4) Everything else - remaining_gbis["prospect_type"] = remaining_gbis["prospect_type"].fillna( - "Does not look like GBIS candidate" - ) - - #################################### - # Surplus properties - #################################### - - # Take properties that were not identified by Warmfront and identify those that look like they would qualify - # under the strictest criteria - surplus_df = analysis_data[ - analysis_data["asset_identification_status"] != "identified potential eco works (CWI)" - ].copy() - - eco4_surplus = surplus_df[ - ( - (surplus_df["eco4_eligible"] == True) & (surplus_df["eco4_message"] == "subject to post retrofit sap") & - ( - surplus_df["eligibility_classification"].isin( - ["high confidence", "highest confidence", "medium confidence"] - ) - ) - ) - ].copy() - - gbis_surplus = surplus_df[ - ( - (surplus_df["gbis_eligible"] == True) & (surplus_df["eco4_eligible"] == False) & ( - surplus_df["eligibility_cavity_type"].isin(["empty", "partial"]) - ) - ) - ].copy() - - # Perform some checks to make sure we have all of the values - remaining_eco4_dict = remaining_eco4_df["prospect_type"].value_counts().to_dict() - if n_remaining_properties_eco4 != sum([v for k, v in remaining_eco4_dict.items()]): - raise ValueError( - "Number of remaining properties does not match the number of properties in remaining ECO4 dict" - ) - - remaining_gbis_dict = remaining_gbis["prospect_type"].value_counts().to_dict() - if n_remaining_properties_gbis != sum([v for k, v in remaining_gbis_dict.items()]): - raise ValueError( - "Number of remaining properties does not match the number of properties in remaining GBIS dict" - ) + n_warmfront_identified_gbis = gbis_identified.shape[0] + gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0] + gbis_of_which_identified_expansive = \ + gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0] to_append = { - "ha_name": ha_name, - "n_properties_in_asset_list": n_properties_in_asset_list, + ("", "HA Name"): ha_name, + ("", "# Properties in asset list"): n_properties_in_asset_list, ############ # ECO4 ############ - "properties_sold_eco4": properties_sold_eco4, - "n_remaining_properties_eco4": n_remaining_properties_eco4, - **remaining_eco4_dict, + ("ECO4", "# Properties identieid by Warmfront"): n_warmfront_identified_eco4, + ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict, + ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive, + ("ECO4", "Of which identified by model - total"): ( + eco4_of_which_identified_strict + eco4_of_which_identified_expansive), + ("ECO4", "Additional properties"): surplus_eco4.shape[0], ############ # GBIS ############ - "properties_sold_gbis": properties_sold_gbis, - "n_remaining_properties_gbis": n_remaining_properties_gbis, - **remaining_gbis_dict, - ############ - # GBIS - ############ - "n_eco4_surplus": eco4_surplus.shape[0], - "n_gbis_surplus": gbis_surplus.shape[0], + ("GBIS", "# Properties identieid by Warmfront"): n_warmfront_identified_gbis, + ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict, + ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive, + ("GBIS", "Of which identified by model - total"): ( + gbis_of_which_identified_strict + gbis_of_which_identified_expansive + ), + ("GBIS", "Additional properties"): surplus_gbis.shape[0] } ha_analysis_results.append(to_append) - revenue_to_append = { - "ha_name": ha_name, - "£ Remaining from asset list": ( - n_remaining_properties_eco4 * eco4_rate + n_remaining_properties_gbis * gbis_rate - ), - "Of which: Strict": ( - to_append.get('strict ECO4', 0) * eco4_rate + to_append.get('strict GBIS', 0) * gbis_rate + - to_append.get('GBIS - Upgradable to ECO4', 0) * gbis_rate - ), - "Of which: Subject to CIGA": ( - to_append.get("ECO4 - Filled cavity - subject to CIGA check", 0) * eco4_rate + - to_append.get("GBIS - Filled cavity - subject to CIGA check", 0) * gbis_rate - ), - "Of which: Prospect, not perfect strict prospect": ( - to_append.get("ECO4 prospect - empty cavity, loft insulation below regulation", 0) * eco4_rate + - to_append.get("ECO4 if SAP downgrade", 0) * eco4_rate - ), - "Of which: Potential downgrade to GBIS": to_append["Looks like GBIS"] * eco4_rate, - "Of which: Does not look like prospect": ( - to_append.get("Does not look like ECO4 candidate", 0) * eco4_rate + - to_append.get("Does not look like GBIS candidate", 0) * gbis_rate - ), - "Surplus: Unidentified properties": eco4_surplus.shape[0] * eco4_rate + gbis_surplus.shape[0] * gbis_rate, - "Surplus: GBIS Updates to ECO4": to_append.get("GBIS - Upgradable to ECO4", 0) * (eco4_rate - gbis_rate) - } - - # Perform a quick check: - if revenue_to_append["£ Remaining from asset list"] - ( - revenue_to_append["Of which: Strict"] + revenue_to_append["Of which: Subject to CIGA"] + - revenue_to_append["Of which: Prospect, not perfect strict prospect"] + - revenue_to_append["Of which: Potential downgrade to GBIS"] + - revenue_to_append["Of which: Does not look like prospect"] - ) > 1: - raise ValueError("Error between top level revenue figures and breakdown - investigate me") - - ha_revenue_results.append(revenue_to_append) - ha_analysis_results = pd.DataFrame(ha_analysis_results) - ha_revenue_results = pd.DataFrame(ha_revenue_results) + ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns) + facts_and_figures = loader.facts_and_figures.copy() + facts_and_figures["ha_number"] = facts_and_figures["HA Name"].str.extract(r'(\d+)').astype(int) + facts_and_figures = facts_and_figures.sort_values("ha_number") + facts_and_figures = facts_and_figures.drop(columns=["ha_number"]) + + # Rename some of the cols + facts_and_figures = facts_and_figures.rename( + columns={ + # ECO4 cols + "ECO4": "ECO4 - December", + "GBIS": "GBIS - December", + "eco4 (subject to ciga)": "ECO4 - subject to ciga", + "eco4": "ECO4 - doesn't need CIGA", + "eco4 - passed ciga": "ECO4 - passed CIGA", + "failed ciga": "ECO4 - failed CIGA", + "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS", + "ECO4 - in progress": "ECO4 - Install in progress", + "ECO4 - cancelled": "ECO4 - Install cancelled", + # GBIS cols + "gbis": "GBIS total (asset list)" + } + ) + # We calculate the eco4 total from the asset list + # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is + # ECO4 - doesn't need CIGA + ECO4 - passed CIGA + # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is + # ECO4 - doesn't need CIGA + ECO4 - subject to ciga + facts_and_figures["ECO4 total (asset list)"] = np.where( + facts_and_figures["ECO4 - passed CIGA"] > 0, + facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"], + facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - subject to ciga"] + ) + + # Re-arrange the columns + facts_and_figures = facts_and_figures[ + [ + 'HA Name', + 'ECO4 - December', + 'GBIS - December', + 'ECO4 total (asset list)', + 'GBIS total (asset list)', + 'ECO4 - subject to ciga', + "ECO4 - doesn't need CIGA", + 'ECO4 - passed CIGA', + 'ECO4 - failed CIGA', + 'ECO4 - installed', + 'ECO4 - Install in progress', + 'ECO4 - Install cancelled', + 'ECO4 - partially installed', + 'ECO4 - Install downgrade to GBIS', + ] + ] + # Addd a note to flag any rows where ECO4 ( + # subject to ciga is greater than 0) and (ECO4 - passed ciga is greater than 0 + # ) + facts_and_figures["Missed CIGA checks opportunity"] = None + facts_and_figures["Missed CIGA checks opportunity"] = np.where( + (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0), + "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype( + str) + " ECO4 properties needing a CIGA check", + facts_and_figures["Missed CIGA checks opportunity"] + ) + + # Re arrage the columns + + # Also sort ha_analysis_results by ha number + ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int) + ha_analysis_results = ha_analysis_results.sort_values("ha_number") + ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"]) + + # We save 2 sheets # Automate creation of the excel # Create a Pandas Excel writer using XlsxWriter as the engine - with pd.ExcelWriter('HA Analysis - batch3.xlsx', engine='xlsxwriter') as writer: + with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer: # Write each dataframe to a different worksheet without the index - for df, sheet in [(ha_revenue_results, 'Total Revenue'), - (ha_analysis_results, 'By ECO4 and GBIS')]: + for df, sheet in [(facts_and_figures, 'HA Facts and Figures'), + (ha_analysis_results, 'Asset Identification')]: - df.to_excel(writer, sheet_name=sheet, index=False) + df.to_excel(writer, sheet_name=sheet) # Auto-adjust columns' width for i, width in enumerate(get_col_widths(df)): @@ -2134,7 +2064,7 @@ def app(): # Determines if we want to use the cached data in s3 use_cache = True # Determines if we want to perform the data pull - pull_data = True + pull_data = False # List all of the data in the folder directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()] @@ -2173,43 +2103,3 @@ def app(): floor_area_decile_thresholds=floor_area_decile_thresholds, pull_data=pull_data ) - - # for ha_name, datasets in outputs.items(): - # datasets["results_df"] = datasets["results_df"].drop( - # columns=["eligibility_cavity_type", "eligibility_loft_type"] - # ) - # - # # Re-do - # res = [] - # for _, row in tqdm(datasets["results_df"].iterrows(), total=datasets["results_df"].shape[0]): - # epc = { - # "walls-description": row["walls"], - # "roof-description": row["roof"], - # "floor-description": "", - # "tenure": "", - # "current-energy-efficiency": row["sap"], - # } - # eligibility = Eligibility(epc=epc, cleaned=cleaned) - # eligibility.check_eco4_warmfront() - # res.append( - # { - # "row_id": row["row_id"], - # "eligibility_cavity_type": eligibility.eco4_warmfront["cavity_type"], - # "eligibility_loft_type": eligibility.eco4_warmfront["loft_type"] - # } - # ) - # - # # Merge back on - # res = pd.DataFrame(res) - # datasets["results_df"] = datasets["results_df"].merge(res, how="left", on="row_id") - # - # # Re-save in s3 - # save_pickle_to_s3( - # data={ - # "results_df": datasets["results_df"], - # "scoring_df": datasets["scoring_df"], - # "nodata": datasets["nodata"] - # }, - # bucket_name="retrofit-datalake-dev", - # s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle" - # ) From 8b8e2bf902f8cc6c588eab8b64253580f3364694 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 16:29:19 +0000 Subject: [PATCH 038/155] working on new forecast approach for warmfront remaining sales --- .../ha_15_32/ha_analysis_batch_3.py | 811 +++++++++++++++++- utils/s3.py | 2 +- 2 files changed, 768 insertions(+), 45 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 61c4a243..bb27029e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -17,6 +17,7 @@ from etl.eligibility.ha_15_32.app import prepare_model_data_row from backend.ml_models.api import ModelApi from etl.solar.SolarPhotoSupply import SolarPhotoSupply from recommendations.recommendation_utils import calculate_cavity_age +from etl.epc.Record import EPCRecord EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" @@ -181,25 +182,25 @@ class DataLoader: if ha_name in ["HA1", "HA6", "HA16", "HA24"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] - ].str.lower().str.strip() + ].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list[ self.COLUMN_CONFIG[ha_name]["postcode"] - ].str.lower().str.strip() + ].astype(str).str.lower().str.strip() elif ha_name == "HA7": # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode - asset_list["matching_address"] = asset_list["Address"].str.lower().str.strip() + ", " + \ - asset_list["Address2"].str.lower().str.strip() + ", " + \ - asset_list["Address3"].str.lower().str.strip() + ", " + \ - asset_list["Postcode"].str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA14": # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode - asset_list["matching_address"] = asset_list["Address 1"].str.lower().str.strip() + ", " + \ - asset_list["Address 2"].str.lower().str.strip() + ", " + \ - asset_list["Address 3"].str.lower().str.strip() + ", " + \ - asset_list["Address 4"].str.lower().str.strip() + ", " + \ - asset_list["Postcode"].str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["Postcode"].str.lower().str.strip() + asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 4"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code @@ -209,7 +210,7 @@ class DataLoader: asset_list["add_4"].astype(str).str.lower().str.strip() + ", " + \ asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \ asset_list["post_code"].astype(str).str.lower().str.strip() - asset_list["matching_postcode"] = asset_list["post_code"].str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["post_code"].astype(str).str.lower().str.strip() elif ha_name == "HA107": # Create matching_address by concatenating House No, Street, Town, District, Postcode asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ @@ -1098,8 +1099,8 @@ class DataLoader: self.december_figures = pd.read_csv(self.december_figures_filepath) # Remove the spaces in HA Name self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "") - self.december_figures["ECO4"] = self.december_figures["ECO4"].astype("Int64") - self.december_figures["GBIS"] = self.december_figures["GBIS"].astype("Int64") + for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]: + self.december_figures[col] = self.december_figures[col].astype("Int64") if self.use_cache: self.data = read_pickle_from_s3( @@ -1203,7 +1204,6 @@ class DataLoader: # Update the asset list with the categorisations and rename changes if asset_list.shape[0] != asset_list_starting_size: raise ValueError("The asset list has changed in size") - self.data[ha_name]["asset_list"] = asset_list # Report on sales sales_report = {} @@ -1259,7 +1259,31 @@ class DataLoader: ) # We get the sales - sales_report = survey_list["installation_status"].value_counts().to_dict() + sales_report = { + "ECO4 - surveys sold": survey_list.shape[0], + **survey_list["installation_status"].value_counts().to_dict() + } + + # We find some cases where properties have sold but are missing CIGA checks + survey_list_to_merge = survey_list[["asset_list_row_id"]].copy() + survey_list_to_merge["has_a_survey_record"] = True + survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])] + + asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id") + asset_list["ECO Eligibility"] = np.where( + (asset_list["ECO Eligibility"] == "eco4 (subject to ciga)") & ( + asset_list["has_a_survey_record"] == True + ), + "eco4 - passed ciga", + asset_list["ECO Eligibility"] + ) + asset_list = asset_list.drop(columns=["has_a_survey_record"]) + + # Update the survey list with installation status + self.data[ha_name]["survey_list"] = survey_list + + # Insert updated asset list + self.data[ha_name]["asset_list"] = asset_list ha_facts_and_figures.append( { @@ -1687,7 +1711,21 @@ def analyse_ha_data(outputs, loader): :return: """ + eco4_rate = 1710 + gbis_rate = 600 + old_eco4_rate = 1456 + old_gbis_rate = 432 + + epc_c_threshold = 80 + scheme_map = { + "ECO4": "ECO4", + "AFFORDABLE WARMTH": "ECO4", + "ECO4 A/W": "ECO4", + "ECO4 GBIS (ECO+)": "GBIS" + } + ha_analysis_results = [] + total_revenue_results = [] for ha_name, datasets in outputs.items(): inputs = [x for k, x in loader.data.items() if k == ha_name][0] @@ -1702,6 +1740,88 @@ def analyse_ha_data(outputs, loader): left_on="asset_list_row_id" ) + analysis_data["is_remaining"] = True + + n_sold_eco4 = 0 + n_sold_gbis = 0 + if not inputs["survey_list"].empty: + # Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had + # a survey) + survey_list = inputs["survey_list"].copy() + + # TODO: TEMP + scheme_column = survey_list.columns[0] + # We clean up the survey list installation or cancelled + survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() + # Remove all punctuation + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( + r'[^\w\s]', '', regex=True + ) + # Remove double spaces + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( + r'\s+', ' ', regex=True + ) + # Remove trailing spaces + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() + + # Remap the values in the scheme column + survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) + + survey_list["installation_status"] = None + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), + "installed", + survey_list["installation_status"] + ) + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), + "cancelled", + survey_list["installation_status"] + ) + # Find partial installations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), + "partially installed", + survey_list["installation_status"] + ) + # Find partial cancellations + # TODO: We might have more indications of partial cancellations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), + "partially cancelled", + survey_list["installation_status"] + ) + + # Finally, for other cases, we set the status to "in progress" + survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") + + # We concatenate the scheme name with the installation status + survey_list["installation_status"] = ( + survey_list[scheme_column] + " - " + survey_list["installation_status"] + ) + + # TODO: END TEMP + + survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy() + survey_list_to_merge["is_remaining"] = False + analysis_data = analysis_data.drop(columns="is_remaining").merge( + survey_list_to_merge, + how="left", on="asset_list_row_id" + ) + analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True) + + n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0] + n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0] + + # Take just remaining + analysis_data = analysis_data[analysis_data["is_remaining"]] + + # Also, if the HA has started selling, we remove any that are still subject to ciga + n_eco4_missed_subject_to_ciga = 0 + if not inputs["survey_list"].empty: + n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum() + analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"] + ################################################################################################ # We take the properties that strictly qualified under eco ################################################################################################ @@ -1714,8 +1834,11 @@ def analyse_ha_data(outputs, loader): eco4_identified["identification_type"] ) + # For expansive, the property can be no higher than an EPC C eco4_identified["identification_type"] = np.where( - (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False), + (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & ( + eco4_identified["sap"] <= epc_c_threshold + ), "expansive", eco4_identified["identification_type"] ) @@ -1743,21 +1866,17 @@ def analyse_ha_data(outputs, loader): "Meets fabric, fails SAP check", "Meets cavity, loft borderline, meets sap", ] - ), + ) & (ciga_dependent_identified["sap"] <= epc_c_threshold), "strict", ciga_dependent_identified["identification_type"] ) ciga_dependent_identified["identification_type"] = np.where( - (ciga_dependent_identified["eco4_message"].isin(["All conditions fail", "failed fabric check"])) & - (ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"])), - "expansive", - ciga_dependent_identified["identification_type"] - ) - - ciga_dependent_identified["identification_type"] = np.where( - (ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | ( + ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | ( ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"]) + )) & ( + (ciga_dependent_identified["sap"] <= epc_c_threshold) & + pd.isnull(ciga_dependent_identified["identification_type"]) ), "expansive", ciga_dependent_identified["identification_type"] @@ -1775,7 +1894,9 @@ def analyse_ha_data(outputs, loader): ) gbis_identified["identification_type"] = np.where( - (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] >= 69), + (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & ( + pd.isnull(gbis_identified["identification_type"]) + ), "expansive", gbis_identified["identification_type"] ) @@ -1806,9 +1927,16 @@ def analyse_ha_data(outputs, loader): ] surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False] - # Output variables + # Output variables - the data was sent to us in December, but the remaining figures are + # what was in November + november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name] + # ECO4 - n_properties_in_asset_list = inputs["asset_list"].shape[0] + n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0] + november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0) + november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0] + eco4_sales_since_november = n_sold_eco4 - november_eco4_sold + n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0] eco4_of_which_identified_strict = ( eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] + @@ -1820,26 +1948,37 @@ def analyse_ha_data(outputs, loader): ) # GBIS n_warmfront_identified_gbis = gbis_identified.shape[0] + november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0) + november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0] + gbis_sales_since_november = n_sold_gbis - november_gbis_sold gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0] gbis_of_which_identified_expansive = \ gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0] to_append = { ("", "HA Name"): ha_name, - ("", "# Properties in asset list"): n_properties_in_asset_list, + ("", "# properties in asset list"): n_properties_remaining_in_asset_list, ############ # ECO4 ############ - ("ECO4", "# Properties identieid by Warmfront"): n_warmfront_identified_eco4, + ("ECO4", "# remaining November file"): november_eco4_remaining, + ("ECO4", "# sold in November file"): november_eco4_sold, + ("ECO4", "# sold (survey list)"): n_sold_eco4, + ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga, + ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4, ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict, ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive, ("ECO4", "Of which identified by model - total"): ( - eco4_of_which_identified_strict + eco4_of_which_identified_expansive), + eco4_of_which_identified_strict + eco4_of_which_identified_expansive + ), ("ECO4", "Additional properties"): surplus_eco4.shape[0], ############ # GBIS ############ - ("GBIS", "# Properties identieid by Warmfront"): n_warmfront_identified_gbis, + ("GBIS", "# remaining November file"): november_gbis_remaining, + ("GBIS", "# sold in November file"): november_gbis_sold, + ("GBIS", "# sold (survey list)"): n_sold_gbis, + ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis, ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict, ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive, ("GBIS", "Of which identified by model - total"): ( @@ -1850,6 +1989,24 @@ def analyse_ha_data(outputs, loader): ha_analysis_results.append(to_append) + # Calculate the revenue results + to_append_revenue = { + ("", "HA Name"): ha_name, + # Eco4 revenue + ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate, + ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate, + ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate, + ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate, + ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate, + ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate, + ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate, + ("ECO4", "Of which identified by model - total"): eco4_rate * ( + eco4_of_which_identified_strict + eco4_of_which_identified_expansive + ), + ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0], + } + total_revenue_results.append(to_append_revenue) + ha_analysis_results = pd.DataFrame(ha_analysis_results) ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns) @@ -1862,8 +2019,8 @@ def analyse_ha_data(outputs, loader): facts_and_figures = facts_and_figures.rename( columns={ # ECO4 cols - "ECO4": "ECO4 - December", - "GBIS": "GBIS - December", + "ECO4": "ECO4 - November", + "GBIS": "GBIS - November", "eco4 (subject to ciga)": "ECO4 - subject to ciga", "eco4": "ECO4 - doesn't need CIGA", "eco4 - passed ciga": "ECO4 - passed CIGA", @@ -1880,19 +2037,27 @@ def analyse_ha_data(outputs, loader): # ECO4 - doesn't need CIGA + ECO4 - passed CIGA # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is # ECO4 - doesn't need CIGA + ECO4 - subject to ciga - facts_and_figures["ECO4 total (asset list)"] = np.where( + facts_and_figures["ECO4 total (asset list - pre ciga)"] = ( + facts_and_figures["ECO4 - doesn't need CIGA"] + + facts_and_figures["ECO4 - subject to ciga"] + + facts_and_figures["ECO4 - passed CIGA"] + ) + + facts_and_figures["ECO4 total (asset list - post ciga)"] = None + facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where( facts_and_figures["ECO4 - passed CIGA"] > 0, facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"], - facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - subject to ciga"] + facts_and_figures["ECO4 total (asset list - post ciga)"] ) # Re-arrange the columns facts_and_figures = facts_and_figures[ [ 'HA Name', - 'ECO4 - December', - 'GBIS - December', - 'ECO4 total (asset list)', + 'ECO4 - November', + 'GBIS - November', + 'ECO4 total (asset list - pre ciga)', + 'ECO4 total (asset list - post ciga)', 'GBIS total (asset list)', 'ECO4 - subject to ciga', "ECO4 - doesn't need CIGA", @@ -1916,6 +2081,8 @@ def analyse_ha_data(outputs, loader): facts_and_figures["Missed CIGA checks opportunity"] ) + facts_and_figures.to_csv("Facts and figures sample.csv") + # Re arrage the columns # Also sort ha_analysis_results by ha number @@ -1937,6 +2104,333 @@ def analyse_ha_data(outputs, loader): for i, width in enumerate(get_col_widths(df)): writer.sheets[sheet].set_column(i, i, width) + # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their + # description, and what proportion of time they get identified via non-invasive surveys + + # true_eco4_assets = [] + # ciga_dependent_assets = [] + # not_eligible = [] + # as_built_insulated = [] + # date_cols = { + # "HA39": "date_built", + # "HA14": "Built In Year", + # "HA6": "Construction Year", + # "HA1": "Build Date", + # "HA107": "YEAR BUILT" + # } + # for ha_name, data_objects in outputs.items(): + # inputs = [x for k, x in loader.data.items() if k == ha_name][0] + # + # date_col = date_cols[ha_name] + # results_df = data_objects["results_df"].copy() + # df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename( + # columns={"row_meaning": "asset_identification_status", date_col: "date_built"} + # ).merge( + # results_df, + # how="left", + # right_on="row_id", + # left_on="asset_list_row_id" + # ) + # + # # take the true ECO4 + # true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy() + # ciga_dependent = df[ + # df["ECO Eligibility"].isin( + # [ + # "eco4 (subject to ciga)", + # "failed ciga", + # "eco4 - passed ciga" + # ] + # ) + # ] + # insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy() + # # We convert date built to datetime + # try: + # insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])] + # insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year + # as_built_insulated.append(insulated_assumed) + # except Exception as e: + # print("oh well") + # + # true_eco4_assets.append(true_eco4) + # ciga_dependent_assets.append(ciga_dependent) + # + # true_eco4_assets = pd.concat(true_eco4_assets) + # ciga_dependent_assets = pd.concat(ciga_dependent_assets) + # as_built_insulated = pd.concat(as_built_insulated) + # + # true_eco4_assets["walls"].value_counts(normalize=True) + # ciga_dependent_assets["walls"].value_counts(normalize=True) + # + # from recommendations.recommendation_utils import extract_insulation_thickness + # + # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply( + # lambda x: extract_insulation_thickness(x) + # ) + # + # true_eco4_assets["e"] = true_eco4_assets.merge( + # pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]], + # how="left", + # left_on="roof", + # right_on="original_description" + # ) + # + # true_eco4_assets["sap"].mean() + # + # true_eco4_assets["insulation_thickness"].isin( + # ["250", "150", "200", "100", "75", "50"] + # ).sum() / true_eco4_assets.shape[0] + # + # true_eco4_assets["insulation_thickness"].isin( + # ["100"] + # ).sum() / true_eco4_assets.shape[0] + # + # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True) + + +def get_propensity_model_data( + loader, cleaned, cleaning_data, created_at, photo_supply_lookup, + floor_area_decile_thresholds, pull_data=True +): + # TODO: Set a seed! + model_data = [] + for ha_name, data_assets in loader.data.items(): + + logger.info("Processing HA: %s", ha_name) + if data_assets["survey_list"].empty: + continue + + number_sold = data_assets["survey_list"].shape[0] + + # For each HA, we read pull in the data required, and store in S3 + asset_list = data_assets["asset_list"].copy() + # We determine the number of properties that we should select that are eligible + asset_list_size = asset_list.shape[0] + # Number eligible + n_eligibile = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0] + success_rate = n_eligibile / asset_list_size + needed_sample_size = np.ceil(number_sold / success_rate) + number_negative_samples = int(needed_sample_size - number_sold) + + sold_asset_list_ids = data_assets["survey_list"]["asset_list_row_id"].tolist() + negative_sample_asset_list_ids = asset_list["asset_list_row_id"].sample(number_negative_samples).tolist() + sample_ids = sold_asset_list_ids + negative_sample_asset_list_ids + + sample_asset_list = asset_list[asset_list["asset_list_row_id"].isin(sample_ids)] + + # In order to have the most confidence, we should take just properties that have 1 EPC. We might need to + # cut down the number of properties that we include because of this + # Note: This is an imbalanced problem so we will need to build a model accomadating of that + + data = [] + errors = [] + for index, property_meta in tqdm(sample_asset_list.iterrows(), total=len(sample_asset_list)): + + if property_meta["matching_postcode"] is None: + continue + + property_type, built_form = get_property_type_and_built_form( + property_meta=property_meta, ha_name=ha_name + ) + + searcher = SearchEpc( + address1=str(property_meta["HouseNo"]), + postcode=property_meta["matching_postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + full_address=property_meta["matching_address"] + ) + searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.built_form = built_form + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + continue + + if searcher.newest_epc.get("estimated"): + # We insert the row ID as our proxy for UPRN + searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) + + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc + + # If we have more than 1 EPC for the moment we just continue + if older_epcs or full_sap_epc: + continue + try: + + # We clean up the data + epc_records = { + 'original_epc': newest_epc.copy(), + 'full_sap_epc': full_sap_epc.copy(), + 'old_data': older_epcs.copy(), + } + + epc_record = EPCRecord( + epc_records=epc_records, + run_mode="newdata", + cleaning_data=cleaning_data + ) + + # If we have some data, continue + data.append( + { + "ECO Eligibility": property_meta["ECO Eligibility"], + "asset_list_row_id": property_meta["asset_list_row_id"], + **epc_record.get("prepared_epc") + } + ) + except Exception as e: + errors.append( + { + "error": str(e), + "asset_list_row_id": property_meta["asset_list_row_id"], + "matching_postcode": property_meta["matching_postcode"], + "matching_address": property_meta["matching_address"] + } + ) + + data = pd.DataFrame(data) + # We store the results in S3 as a pickle + save_pickle_to_s3( + data=data, + bucket_name="retrofit-datalake-dev", + s3_file_name=f"propensity_model_data/{ha_name}/train.pickle" + ) + + # Store the errors + if errors: + save_pickle_to_s3( + data=errors, + bucket_name="retrofit-datalake-dev", + s3_file_name=f"propensity_model_data/{ha_name}/errors.pickle" + ) + + model_data.append(data) + + return model_data + + +def conversion_model(loader): + # Read in the model data + + model_data = [] + for ha_name in loader.data.keys(): + try: + picked = read_pickle_from_s3( + bucket_name="retrofit-datalake-dev", + s3_file_name=f"propensity_model_data/{ha_name}/train.pickle" + ) + data = pd.DataFrame(picked) + + # We merge on the sales data + sales_data = loader.data[ha_name]["survey_list"].copy() + data = data.merge( + sales_data[["asset_list_row_id", "installation_status"]], + how="left", + on="asset_list_row_id" + ) + data["ha_name"] = ha_name + + except Exception as e: + logger.error("Error reading in the data for %s", ha_name) + continue + + model_data.append(data) + + model_data = pd.concat(model_data) + + model_data["response"] = model_data["installation_status"].isin( + [ + "ECO4 - in progress", + "ECO4 - installed" + ] + ).astype(int) + + # Because of how we pulled the data, we need to re-balance the sample + ha_names = model_data["ha_name"].unique() + + balanced_sample = [] + for ha_name in ha_names: + df = model_data[model_data["ha_name"] == ha_name] + positive_samples = df[df["response"] == 1] + negative_samples = df[df["response"] != 1] + + inputs = [x for k, x in loader.data.items() if k == ha_name][0] + asset_list = inputs["asset_list"].copy() + asset_list_size = asset_list.shape[0] + n_eligibile = asset_list[asset_list["ECO Eligibility"] != "not eligible"].shape[0] + success_rate = n_eligibile / asset_list_size + needed_sample_size = np.ceil(positive_samples.shape[0] / success_rate) + number_negative_samples = int(needed_sample_size - positive_samples.shape[0]) + negative_samples_subset = negative_samples.sample(number_negative_samples) + + output = pd.concat([positive_samples, negative_samples_subset]) + + balanced_sample.append(output) + + balanced_sample = pd.concat(balanced_sample) + + # We work with a small sample + # Drop the ECO Eligibility column and installation_status column + # We keep the ID column + balanced_sample = balanced_sample.drop( + columns=['ECO Eligibility', 'asset_list_row_id', 'address', 'uprn_source', 'address3', 'local_authority_label', + 'county', 'postcode', 'constituency', 'local_authority', 'inspection_date', 'address1', + 'constituency_label', 'building_reference_number', 'address2', 'posttown', 'lodgement_datetime', + 'uprn', 'lodgement_date', 'lmk_key', 'installation_status', 'ha_name'] + ) + + # POC model + df = balanced_sample.copy() + # FIll missings with means, if they exist + numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns + df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean()) + + categorical_cols = df.select_dtypes(include=['object', 'category']).columns + df[categorical_cols] = df[categorical_cols].fillna("other") + + # Reduce the number of categories to a specific number and the rest to other + max_n_categories = 10 + for col in categorical_cols: + top_categories = df[col].value_counts().nlargest(max_n_categories).index + df[col] = df[col].where(df[col].isin(top_categories), other="other") + + # Use a model based approach to feature selection + import xgboost as xgb + from sklearn.model_selection import train_test_split + + # Assuming your outcome column is named 'target' + X = df.drop(columns=['response']) + y = df['response'] + df["low_energy_fixed_light_count"].va + + # Encoding categorical variables if not already done + X = pd.get_dummies(X, drop_first=True) + + # Splitting the data into train and test sets + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + + # Initialize an XGBoost classifier + model = xgb.XGBClassifier() + + # Fit the model + model.fit(X_train, y_train) + + # Get feature importances + feature_importances = model.feature_importances_ + + # Map feature importances to their corresponding column names + feature_importance_dict = {feature: importance for feature, importance in zip(X.columns, feature_importances)} + + # Sort features by importance + sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True) + + # Display sorted features + for feature, importance in sorted_features: + print(f"{feature}: {importance}") + def patch_cleaned(cleaned): # Patch to handle the a missing description @@ -2054,6 +2548,218 @@ def patch_cleaned(cleaned): return cleaned +def forecast_remaining_sales(loader): + # Assumptions: + # We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate + # and I don't want the numbers to change too much, depenent on the CIGA conversation rate + maximum_ciga_conversion = 0.75 + + gbis_rate = 600 + eco4_rate = 1710 + old_gbis_rate = 432 + old_eco4_rate = 1456 + + # 1) Calculate the conversion rate from passed CIGA to actual sale + converted_ciga_jobs = [] + for ha_name, input_data in loader.data.items(): + asset_list = input_data["asset_list"].copy() + survey_list = input_data["survey_list"].copy() + + if survey_list.empty: + continue + + ciga_dependent_assets = asset_list[ + asset_list["ECO Eligibility"] == "eco4 - passed ciga" + ] + + # These are now the ciga dependent assets at installation + ciga_dependent_assets_at_installation = ciga_dependent_assets.merge( + survey_list[["asset_list_row_id", "installation_status"]], + how="inner", + on="asset_list_row_id" + ) + + # We then calculate how many get cancelled + ciga_dependent_assets_sold = ciga_dependent_assets_at_installation[ + ciga_dependent_assets_at_installation["installation_status"].isin( + [ + "ECO4 - installed", "ECO4 - in progress" + ] + ) + ] + + ciga_dependent_assets_failed = ciga_dependent_assets_at_installation[ + ~ciga_dependent_assets_at_installation["installation_status"].isin( + [ + "ECO4 - installed", "ECO4 - in progress" + ] + ) + ] + + converted_ciga_jobs.append( + { + "HA Name": ha_name, + "# Ciga dependent at installation": ciga_dependent_assets_at_installation.shape[0], + "# Ciga dependent successfully installed": ciga_dependent_assets_sold.shape[0], + "# Ciga dependent failed install": ciga_dependent_assets_failed.shape[0] + } + ) + + converted_ciga_jobs = pd.DataFrame(converted_ciga_jobs) + + # We calculate a ciga pass to install conversaion rate + median_ciga_pass_to_install = ( + converted_ciga_jobs["# Ciga dependent successfully installed"].sum() / + converted_ciga_jobs["# Ciga dependent at installation"].sum() + ) + + # 2) Calculate the conversion rate from CIGA dependent ciga passed + ciga_passrates = [] + for ha_name, input_data in loader.data.items(): + + # If we don't have a ciga list, we can't do anything + if input_data["ciga_list"].empty: + continue + + # 1) Calculate the conversion rate for CIGA to actual sale + asset_list = input_data["asset_list"].copy() + + ciga_completed_assets = asset_list[ + asset_list["ECO Eligibility"].isin( + [ + "eco4 - passed ciga", + "failed ciga" + ] + ) + ] + + ciga_passed = ciga_completed_assets[ + ciga_completed_assets["ECO Eligibility"].isin( + [ + "eco4 - passed ciga" + ] + ) + ] + + ciga_passrates.append( + { + "Ha Name": ha_name, + "# CIGA dependent": ciga_completed_assets.shape[0], + "# CIGA passed": ciga_passed.shape[0], + } + ) + + ciga_passrates = pd.DataFrame(ciga_passrates) + + median_ciga_pass_to_install = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum() + + # 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install + eco4_ciga_independent_passrates = [] + gbis_ciga_independent_passrates = [] + for ha_name, input_data in loader.data.items(): + asset_list = input_data["asset_list"].copy() + survey_list = input_data["survey_list"].copy() + + if survey_list.empty: + continue + + # For properties that were identified as a typical ECO4 job, we calculate the number of properties that + # installed + # vs cancelled + + typical_eco4 = asset_list[asset_list["ECO Eligibility"] == "eco4"] + typical_gbis = asset_list[asset_list["ECO Eligibility"] == "gbis"] + + # Merge on the surveys + typical_eco4_installed = typical_eco4.merge( + survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id" + ) + + if not typical_eco4_installed.empty: + typical_eco4_sold = typical_eco4_installed[ + typical_eco4_installed["installation_status"].isin( + [ + "ECO4 - installed", "ECO4 - in progress" + ] + ) + ] + + eco4_ciga_independent_passrates.append( + { + "Ha Name": ha_name, + "# ECO4 at install stage": typical_eco4_installed.shape[0], + "# ECO4 successfully installed": typical_eco4_sold.shape[0] + } + ) + + typical_gbis_installed = typical_gbis.merge( + survey_list[["asset_list_row_id", "installation_status"]], how="inner", on="asset_list_row_id" + ) + if not typical_gbis_installed.empty: + typical_gbis_sold = typical_gbis_installed[ + typical_gbis_installed["installation_status"].isin( + [ + "GBIS - in progress", "GBIS - installed" + ] + ) + ] + + gbis_ciga_independent_passrates.append( + { + "Ha Name": ha_name, + "# GBIS at install stage": typical_gbis_installed.shape[0], + "# GBIS successfully installed": typical_gbis_sold.shape[0] + } + ) + + eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates) + gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates) + + median_eco4_to_install = ( + eco4_ciga_independent_passrates["# ECO4 successfully installed"].sum() / + eco4_ciga_independent_passrates["# ECO4 at install stage"].sum() + ) + + median_gbis_to_install = ( + gbis_ciga_independent_passrates["# GBIS successfully installed"].sum() / + gbis_ciga_independent_passrates["# GBIS at install stage"].sum() + ) + + # Produce the final output + december_figures = loader.december_figures.copy() + december_figures = december_figures.fillna(0) + results = [] + for ha_name, input_data in loader.data.items(): + # Original warmfront figures + original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] + + original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0] + original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0] + original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0] + original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0] + + original_warmfront_eco4_revenue = ( + original_warmfront_remaining_eco4 * eco4_rate + + (original_warmfront_eco4 - original_warmfront_remaining_eco4) * old_eco4_rate + ) + original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate + + original_warmfront_gbis_revenue = ( + original_warmfront_remaining_gbis * gbis_rate + + (original_warmfront_gbis - original_warmfront_remaining_gbis) * old_gbis_rate + ) + + results.append( + { + ("", "", "HA Name"): ha_name, + ("Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, + ("", "Remaining - #", ""): original_warmfront_remaining_eco4, + ("", "Total - £", ""): original_warmfront_eco4_revenue, + ("", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, + } + ) + + def app(): """ This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107. @@ -2067,11 +2773,14 @@ def app(): pull_data = False # List all of the data in the folder - directories = [str(list(entry.iterdir())[0]) for entry in DATA_FOLDER.iterdir() if entry.is_dir()] + + directories = [str(file) for entry in DATA_FOLDER.iterdir() if entry.is_dir() + for file in entry.iterdir() if file.suffix == '.xlsx'] # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"] + # priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"] + priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] @@ -2103,3 +2812,17 @@ def app(): floor_area_decile_thresholds=floor_area_decile_thresholds, pull_data=pull_data ) + + analyse_ha_data(outputs, loader) + + # import pickle + # with open("ha_analysis.pickle", "wb") as f: + # pickle.dump({"outputs": outputs, "loader": loader}, f) + + # To read: + # import pickle + # with open("ha_analysis.pickle", "rb") as f: + # outputs = pickle.load(f)["outputs"] + # + # with open("loader.pickle", "rb") as f: + # loader = pickle.load(f) diff --git a/utils/s3.py b/utils/s3.py index cb55094a..8d36bdb3 100644 --- a/utils/s3.py +++ b/utils/s3.py @@ -184,7 +184,7 @@ def read_pickle_from_s3(bucket_name, s3_file_name): logger.errpr("Incomplete credentials provided.") return None except Exception as e: - logger.errpr(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}') + logger.error(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}') return None # Deserialize data from pickle format From 9e679bd3fdb6e38a263f804ffdb07dda3892e7b1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 16:59:22 +0000 Subject: [PATCH 039/155] working on new forecast methodology --- .../ha_15_32/ha_analysis_batch_3.py | 81 +++++++++++++++++-- 1 file changed, 73 insertions(+), 8 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bb27029e..21af73ff 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2728,15 +2728,22 @@ def forecast_remaining_sales(loader): # Produce the final output december_figures = loader.december_figures.copy() december_figures = december_figures.fillna(0) + # If we have negative remaining, it means that actually sold more gbis than they initially thought so we set + # remaining to 0 + december_figures["ECO4 remaining"] = np.where( + december_figures["ECO4 remaining"] < 0, 0, december_figures["ECO4 remaining"] + ) + december_figures["GBIS remaining"] = np.where( + december_figures["GBIS remaining"] < 0, 0, december_figures["GBIS remaining"] + ) + results = [] for ha_name, input_data in loader.data.items(): - # Original warmfront figures + # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0] original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0] - original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0] - original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0] original_warmfront_eco4_revenue = ( original_warmfront_remaining_eco4 * eco4_rate + @@ -2744,21 +2751,79 @@ def forecast_remaining_sales(loader): ) original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate + # Original warmfront figures - GBIS + + original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0] + original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0] + original_warmfront_gbis_revenue = ( original_warmfront_remaining_gbis * gbis_rate + (original_warmfront_gbis - original_warmfront_remaining_gbis) * old_gbis_rate ) + original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate + + # Asset list + asset_list = input_data["asset_list"].copy() + survey_list = input_data["survey_list"].copy() + + asset_list_remaining = asset_list.merge( + survey_list[["asset_list_row_id", "installation_status"]], + how="left", + on="asset_list_row_id" + ) + asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] + + eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index() + eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index() + + eco4_pre_ciga = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"].isin( + ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + ) + ]["count"].sum() + + eco4_pre_ciga_remaining = eligiblity_counts_remaining[ + eligiblity_counts["ECO Eligibility"].isin( + ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + ) + ]["count"].sum() + + eco4_pre_ciga_revenue = eco4_pre_ciga * eco4_rate + eco4_pre_ciga_remaining_revenue = eco4_pre_ciga_remaining * eco4_rate + + # We check if the property has done a CIGA check + has_ciga_check = not input_data["ciga_list"].empty + + if has_ciga_check: + eco4_post_ciga = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"].isin( + ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + ) + ]["count"].sum() results.append( { - ("", "", "HA Name"): ha_name, - ("Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, - ("", "Remaining - #", ""): original_warmfront_remaining_eco4, - ("", "Total - £", ""): original_warmfront_eco4_revenue, - ("", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, + ("", "", "", "HA Name"): ha_name, + # ECO4 - original warmfront figures + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, + ("ECO4", "", "Remaining - #", ""): original_warmfront_remaining_eco4, + ("ECO4", "", "Total - £", ""): original_warmfront_eco4_revenue, + ("ECO4", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, + # GBIS - original warmfront figures + ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis, + ("GBIS", "", "Remaining - #", ""): original_warmfront_gbis, + ("GBIS", "", "Total - £", ""): original_warmfront_gbis_revenue, + ("GBIS", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue, + # ECO4 - asset list + ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, + ("ECO4", "", "Remaining - #", ""): eco4_pre_ciga_remaining, + ("ECO4", "", "Total - £", ""): eco4_pre_ciga_revenue, + ("ECO4", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, } ) + results = pd.DataFrame(results) + def app(): """ From a81f1f2520479e706479bada1761aaa92bb01a44 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 17:37:57 +0000 Subject: [PATCH 040/155] Adding in eligible properties left estimation --- .../ha_15_32/ha_analysis_batch_3.py | 101 ++++++++++++------ 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 21af73ff..cf9dfa53 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2613,7 +2613,7 @@ def forecast_remaining_sales(loader): converted_ciga_jobs["# Ciga dependent at installation"].sum() ) - # 2) Calculate the conversion rate from CIGA dependent ciga passed + # 2) Calculate the conversion rate from CIGA dependent to ciga passed ciga_passrates = [] for ha_name, input_data in loader.data.items(): @@ -2651,7 +2651,7 @@ def forecast_remaining_sales(loader): ciga_passrates = pd.DataFrame(ciga_passrates) - median_ciga_pass_to_install = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum() + median_ciga_success_rate = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum() # 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install eco4_ciga_independent_passrates = [] @@ -2762,16 +2762,20 @@ def forecast_remaining_sales(loader): ) original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate - # Asset list + # Asset list - ECO4 asset_list = input_data["asset_list"].copy() survey_list = input_data["survey_list"].copy() - asset_list_remaining = asset_list.merge( - survey_list[["asset_list_row_id", "installation_status"]], - how="left", - on="asset_list_row_id" - ) - asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] + if survey_list.empty: + asset_list_remaining = asset_list.copy() + else: + asset_list_remaining = asset_list.merge( + survey_list[["asset_list_row_id", "installation_status"]], + how="left", + on="asset_list_row_id" + ) + asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] + asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"]) eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index() eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index() @@ -2791,36 +2795,69 @@ def forecast_remaining_sales(loader): eco4_pre_ciga_revenue = eco4_pre_ciga * eco4_rate eco4_pre_ciga_remaining_revenue = eco4_pre_ciga_remaining * eco4_rate - # We check if the property has done a CIGA check - has_ciga_check = not input_data["ciga_list"].empty + # Total Eligible - this is what passed ciga checks + strict. If we don't have what passed CIGA, we estimate + # We check if the HA has done a CIGA check. Also, if we have assets dormant at CIGA, we estimate what will + # convert + # We estimate a conversion for anything left post CIGA + ha_ciga_conversion = ciga_passrates[ciga_passrates["Ha Name"] == ha_name] + if not ha_ciga_conversion.empty: + ha_ciga_conversion_rate = ( + ha_ciga_conversion["# CIGA passed"].values[0] / ha_ciga_conversion["# CIGA dependent"].values[0] + ) + else: + ha_ciga_conversion_rate = ( + median_ciga_success_rate if median_ciga_success_rate <= median_ciga_success_rate else + median_ciga_success_rate + ) + remaining_needing_ciga_check = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4 (subject to ciga)" + ]["count"].sum() + + has_ciga_check = not input_data["ciga_list"].empty if has_ciga_check: eco4_post_ciga = eligiblity_counts[ eligiblity_counts["ECO Eligibility"].isin( - ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + ["eco4", "eco4 - passed ciga", "failed ciga"] ) ]["count"].sum() - results.append( - { - ("", "", "", "HA Name"): ha_name, - # ECO4 - original warmfront figures - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, - ("ECO4", "", "Remaining - #", ""): original_warmfront_remaining_eco4, - ("ECO4", "", "Total - £", ""): original_warmfront_eco4_revenue, - ("ECO4", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, - # GBIS - original warmfront figures - ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis, - ("GBIS", "", "Remaining - #", ""): original_warmfront_gbis, - ("GBIS", "", "Total - £", ""): original_warmfront_gbis_revenue, - ("GBIS", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue, - # ECO4 - asset list - ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, - ("ECO4", "", "Remaining - #", ""): eco4_pre_ciga_remaining, - ("ECO4", "", "Total - £", ""): eco4_pre_ciga_revenue, - ("ECO4", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, - } - ) + if remaining_needing_ciga_check > 0: + # We update the eco4 post ciga with the converted remaining + eco4_post_ciga += np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + else: + eco4_post_ciga = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4" + ]["count"].sum() + np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + + eco4_post_ciga = int(eco4_post_ciga) + + to_append = { + ("", "", "", "HA Name"): ha_name, + # ECO4 - original warmfront figures + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, + ("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4, + ("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue, + ("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, + # GBIS - original warmfront figures + ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis, + ("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis, + ("GBIS original", "", "Total - £", ""): original_warmfront_gbis_revenue, + ("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue, + # ECO4 - asset list, pre-ciga + ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, + ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, + ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, + ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, + # ECO4 - asset list, post ciga + ("ECO4 post-ciga", "", "Estimated total eligible - #", ""): eco4_post_ciga, + } + + # Make sure nothing is forgotten due to duplicate multi-index keys + if len(to_append) != 14: + raise ValueError("Something went wrong") + + results.append(to_append) results = pd.DataFrame(results) From 6544adc6c3c9d811f789a0372a33a19bd32beb78 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 17:47:51 +0000 Subject: [PATCH 041/155] Added eligibility calculations --- .../ha_15_32/ha_analysis_batch_3.py | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index cf9dfa53..8a46703e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2548,6 +2548,33 @@ def patch_cleaned(cleaned): return cleaned +def calculate_eco4_post_ciga(eligiblity_counts, input_data, ha_ciga_conversion_rate, eco4_rate): + remaining_needing_ciga_check = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4 (subject to ciga)" + ]["count"].sum() + + has_ciga_check = not input_data["ciga_list"].empty + if has_ciga_check: + eco4_post_ciga = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"].isin( + ["eco4", "eco4 - passed ciga", "failed ciga"] + ) + ]["count"].sum() + + if remaining_needing_ciga_check > 0: + # We update the eco4 post ciga with the converted remaining + eco4_post_ciga += np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + else: + eco4_post_ciga = ( + eligiblity_counts[eligiblity_counts["ECO Eligibility"] == "eco4"]["count"].sum() + + np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + ) + eco4_post_ciga = int(eco4_post_ciga) + eco4_post_ciga_revenue = eco4_post_ciga * eco4_rate + + return eco4_post_ciga, eco4_post_ciga_revenue + + def forecast_remaining_sales(loader): # Assumptions: # We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate @@ -2810,27 +2837,13 @@ def forecast_remaining_sales(loader): median_ciga_success_rate ) - remaining_needing_ciga_check = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"] == "eco4 (subject to ciga)" - ]["count"].sum() + eco4_post_ciga, eco4_post_ciga_revenue = calculate_eco4_post_ciga( + eligiblity_counts, input_data, ha_ciga_conversion_rate, eco4_rate + ) - has_ciga_check = not input_data["ciga_list"].empty - if has_ciga_check: - eco4_post_ciga = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"].isin( - ["eco4", "eco4 - passed ciga", "failed ciga"] - ) - ]["count"].sum() - - if remaining_needing_ciga_check > 0: - # We update the eco4 post ciga with the converted remaining - eco4_post_ciga += np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) - else: - eco4_post_ciga = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"] == "eco4" - ]["count"].sum() + np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) - - eco4_post_ciga = int(eco4_post_ciga) + eco4_post_ciga_remaining, eco4_post_ciga_remaining_revenue = calculate_eco4_post_ciga( + eligiblity_counts_remaining, input_data, ha_ciga_conversion_rate, eco4_rate + ) to_append = { ("", "", "", "HA Name"): ha_name, @@ -2851,6 +2864,8 @@ def forecast_remaining_sales(loader): ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, # ECO4 - asset list, post ciga ("ECO4 post-ciga", "", "Estimated total eligible - #", ""): eco4_post_ciga, + ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining, + ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_revenue, } # Make sure nothing is forgotten due to duplicate multi-index keys From 5c686f5ec471b3c5c84b307e0851e2a0462934c0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 17:56:45 +0000 Subject: [PATCH 042/155] working on forecast --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 8a46703e..0bf34e70 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2841,6 +2841,9 @@ def forecast_remaining_sales(loader): eligiblity_counts, input_data, ha_ciga_conversion_rate, eco4_rate ) + # Calculate the delta compared to Warmfront's original estimate + eco4_delta_vs_original_estimate = 200 * (eco4_post_ciga - original_warmfront_eco4) / original_warmfront_eco4 + eco4_post_ciga_remaining, eco4_post_ciga_remaining_revenue = calculate_eco4_post_ciga( eligiblity_counts_remaining, input_data, ha_ciga_conversion_rate, eco4_rate ) @@ -2862,14 +2865,17 @@ def forecast_remaining_sales(loader): ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, - # ECO4 - asset list, post ciga + # ECO4 - asset list, post ciga, total ("ECO4 post-ciga", "", "Estimated total eligible - #", ""): eco4_post_ciga, - ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining, ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_revenue, + ("ECO4 post-ciga", "", "Delta vs original estimate", ""): eco4_delta_vs_original_estimate, + # ECO4 - asset list, post ciga, remaining + ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining, + ("ECO4 post-ciga", "", "Estimated remaining total eligible - £", ""): eco4_post_ciga_remaining_revenue, } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 14: + if len(to_append) != 18: raise ValueError("Something went wrong") results.append(to_append) From c47af474b92282a1159c2866e8810e8e883db7bd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 18:13:11 +0000 Subject: [PATCH 043/155] Added in remaining breakdowns into forecast and confirmed --- .../ha_15_32/ha_analysis_batch_3.py | 59 ++++++++++++++----- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 0bf34e70..77c18e80 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2555,24 +2555,40 @@ def calculate_eco4_post_ciga(eligiblity_counts, input_data, ha_ciga_conversion_r has_ciga_check = not input_data["ciga_list"].empty if has_ciga_check: - eco4_post_ciga = eligiblity_counts[ + eco4_confirmed = eligiblity_counts[ eligiblity_counts["ECO Eligibility"].isin( - ["eco4", "eco4 - passed ciga", "failed ciga"] + ["eco4", "eco4 - passed ciga"] ) ]["count"].sum() if remaining_needing_ciga_check > 0: # We update the eco4 post ciga with the converted remaining - eco4_post_ciga += np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eco4_remaining_forecast = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast + else: + eco4_remaining_forecast = 0 + eco4_post_ciga = eco4_confirmed else: + eco4_confirmed = 0 + eco4_remaining_forecast = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) eco4_post_ciga = ( - eligiblity_counts[eligiblity_counts["ECO Eligibility"] == "eco4"]["count"].sum() + - np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eligiblity_counts[eligiblity_counts["ECO Eligibility"] == "eco4"]["count"].sum() + eco4_remaining_forecast ) eco4_post_ciga = int(eco4_post_ciga) - eco4_post_ciga_revenue = eco4_post_ciga * eco4_rate + eco4_remaining_forecast = int(eco4_remaining_forecast) - return eco4_post_ciga, eco4_post_ciga_revenue + results = { + # Counts + "ECO4 - post CIGA - #": eco4_post_ciga, + "Of which confirmed - #": eco4_confirmed, + "Of which forecast - #": eco4_remaining_forecast, + # Revenue + "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate, + "Of which confirmed - £": eco4_confirmed * eco4_rate, + "Of which forecast - £": eco4_remaining_forecast * eco4_rate, + } + + return results def forecast_remaining_sales(loader): @@ -2837,14 +2853,16 @@ def forecast_remaining_sales(loader): median_ciga_success_rate ) - eco4_post_ciga, eco4_post_ciga_revenue = calculate_eco4_post_ciga( + eco4_post_ciga_total_results = calculate_eco4_post_ciga( eligiblity_counts, input_data, ha_ciga_conversion_rate, eco4_rate ) # Calculate the delta compared to Warmfront's original estimate - eco4_delta_vs_original_estimate = 200 * (eco4_post_ciga - original_warmfront_eco4) / original_warmfront_eco4 + eco4_delta_vs_original_estimate = 100 * ( + eco4_post_ciga_total_results["ECO4 - post CIGA - #"] - original_warmfront_eco4 + ) / original_warmfront_eco4 - eco4_post_ciga_remaining, eco4_post_ciga_remaining_revenue = calculate_eco4_post_ciga( + eco4_post_ciga_remaining_results = calculate_eco4_post_ciga( eligiblity_counts_remaining, input_data, ha_ciga_conversion_rate, eco4_rate ) @@ -2866,12 +2884,25 @@ def forecast_remaining_sales(loader): ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, # ECO4 - asset list, post ciga, total - ("ECO4 post-ciga", "", "Estimated total eligible - #", ""): eco4_post_ciga, - ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_revenue, + ("ECO4 post-ciga", "", "Estimated total eligible - #", ""): eco4_post_ciga_total_results[ + "ECO4 - post CIGA - #"], + ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[ + "ECO4 - post CIGA - £"], ("ECO4 post-ciga", "", "Delta vs original estimate", ""): eco4_delta_vs_original_estimate, # ECO4 - asset list, post ciga, remaining - ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining, - ("ECO4 post-ciga", "", "Estimated remaining total eligible - £", ""): eco4_post_ciga_remaining_revenue, + ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[ + "ECO4 - post CIGA - #"], + ("ECO4 post-ciga", "", "Estimated remaining total eligible - £", ""): eco4_post_ciga_remaining_results[ + "ECO4 - post CIGA - £"], + ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""): + eco4_post_ciga_remaining_results["Of which confirmed - #"], + ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - £", ""): + eco4_post_ciga_remaining_results["Of which confirmed - £"], + ("ECO4 post-ciga", "", "Of which forecast - #", ""): + eco4_post_ciga_remaining_results["Of which forecast - #"], + ("ECO4 post-ciga", "", "Of which forecast - £", ""): + eco4_post_ciga_remaining_results["Of which forecast - £"], + # CIGA failures } # Make sure nothing is forgotten due to duplicate multi-index keys From 752f0b0f8384a1082161abf31c18638864c45f1e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 18:37:47 +0000 Subject: [PATCH 044/155] splitting out post ciga figures --- .../ha_15_32/ha_analysis_batch_3.py | 71 +++++++++++++++---- 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 77c18e80..4f33bf34 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2548,34 +2548,52 @@ def patch_cleaned(cleaned): return cleaned -def calculate_eco4_post_ciga(eligiblity_counts, input_data, ha_ciga_conversion_rate, eco4_rate): +def calculate_eco4_post_ciga( + eligiblity_counts, input_data, ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate, + eco4_rate +): remaining_needing_ciga_check = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "eco4 (subject to ciga)" ]["count"].sum() has_ciga_check = not input_data["ciga_list"].empty if has_ciga_check: - eco4_confirmed = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"].isin( - ["eco4", "eco4 - passed ciga"] - ) - ]["count"].sum() + + eco4_no_ciga_needed = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4" + ]["count"].sum() + + eco4_ciga_passed = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4 - passed ciga" + ]["count"].sum() + + eco4_confirmed = (eco4_no_ciga_needed * ha_eco4_to_sale_rate) + (eco4_ciga_passed * ha_ciga_pass_to_sale_rate) + eco4_confirmed = np.round(eco4_confirmed) if remaining_needing_ciga_check > 0: # We update the eco4 post ciga with the converted remaining - eco4_remaining_forecast = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eco4_remaining_forecast = np.round( + remaining_needing_ciga_check * ha_ciga_conversion_rate * ha_ciga_pass_to_sale_rate + ) eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast else: eco4_remaining_forecast = 0 eco4_post_ciga = eco4_confirmed else: - eco4_confirmed = 0 - eco4_remaining_forecast = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eco4_no_ciga_needed = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4" + ]["count"].sum() + eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate) + eco4_remaining_forecast = np.round( + remaining_needing_ciga_check * ha_ciga_conversion_rate * ha_ciga_pass_to_sale_rate + ) eco4_post_ciga = ( eligiblity_counts[eligiblity_counts["ECO Eligibility"] == "eco4"]["count"].sum() + eco4_remaining_forecast ) + eco4_post_ciga = int(eco4_post_ciga) eco4_remaining_forecast = int(eco4_remaining_forecast) + eco4_confirmed = int(eco4_confirmed) results = { # Counts @@ -2853,8 +2871,32 @@ def forecast_remaining_sales(loader): median_ciga_success_rate ) + # We also need the ha ciga passed to install success rate + ha_ciga_pass_to_sale = converted_ciga_jobs[converted_ciga_jobs["HA Name"] == ha_name] + if not ha_ciga_pass_to_sale.empty: + ha_ciga_pass_to_sale_rate = ( + ha_ciga_pass_to_sale["# Ciga dependent successfully installed"].values[0] / + ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] + ) + else: + ha_ciga_pass_to_sale_rate = median_ciga_pass_to_install + + ha_eco4_to_sale = eco4_ciga_independent_passrates[eco4_ciga_independent_passrates["Ha Name"] == ha_name] + if not ha_eco4_to_sale.empty: + ha_eco4_to_sale_rate = ( + ha_eco4_to_sale['# ECO4 successfully installed'].values[0] / + ha_eco4_to_sale['# ECO4 at install stage'].values[0] + ) + else: + ha_eco4_to_sale_rate = median_eco4_to_install + eco4_post_ciga_total_results = calculate_eco4_post_ciga( - eligiblity_counts, input_data, ha_ciga_conversion_rate, eco4_rate + eligiblity_counts=eligiblity_counts, + input_data=input_data, + ha_ciga_conversion_rate=ha_ciga_conversion_rate, + ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate, + ha_eco4_to_sale_rate=ha_eco4_to_sale_rate, + eco4_rate=eco4_rate ) # Calculate the delta compared to Warmfront's original estimate @@ -2863,7 +2905,12 @@ def forecast_remaining_sales(loader): ) / original_warmfront_eco4 eco4_post_ciga_remaining_results = calculate_eco4_post_ciga( - eligiblity_counts_remaining, input_data, ha_ciga_conversion_rate, eco4_rate + eligiblity_counts=eligiblity_counts_remaining, + input_data=input_data, + ha_ciga_conversion_rate=ha_ciga_conversion_rate, + ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate, + ha_eco4_to_sale_rate=ha_eco4_to_sale_rate, + eco4_rate=eco4_rate ) to_append = { @@ -2906,7 +2953,7 @@ def forecast_remaining_sales(loader): } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 18: + if len(to_append) != 22: raise ValueError("Something went wrong") results.append(to_append) From 56ee7224f58e7363a1732ed46aaebd29a71f7acd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 19:53:28 +0000 Subject: [PATCH 045/155] Added gbis remaining columns --- .../ha_15_32/ha_analysis_batch_3.py | 1100 +++++++++-------- 1 file changed, 592 insertions(+), 508 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 4f33bf34..191ca74c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1692,500 +1692,500 @@ def get_col_widths(dataframe): return widths -def analyse_ha_data(outputs, loader): - """ - The approach we take within this function is the following: - For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The - characterisation can be broken down as the following: - 1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria - 2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to - a CIGA check - 3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft - insulation - 4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under - any cirsumstances, given the available data - - Then, for any property that has NOT been identifid by Warmfront, we identify properties that look like they would - qualify under the strictest criteria, and mark these as potential additional opportunities. - - :return: - """ - - eco4_rate = 1710 - gbis_rate = 600 - old_eco4_rate = 1456 - old_gbis_rate = 432 - - epc_c_threshold = 80 - scheme_map = { - "ECO4": "ECO4", - "AFFORDABLE WARMTH": "ECO4", - "ECO4 A/W": "ECO4", - "ECO4 GBIS (ECO+)": "GBIS" - } - - ha_analysis_results = [] - total_revenue_results = [] - for ha_name, datasets in outputs.items(): - inputs = [x for k, x in loader.data.items() if k == ha_name][0] - - results_df = datasets["results_df"].copy() - - analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename( - columns={"row_meaning": "asset_identification_status"} - ).merge( - results_df, - how="left", - right_on="row_id", - left_on="asset_list_row_id" - ) - - analysis_data["is_remaining"] = True - - n_sold_eco4 = 0 - n_sold_gbis = 0 - if not inputs["survey_list"].empty: - # Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had - # a survey) - survey_list = inputs["survey_list"].copy() - - # TODO: TEMP - scheme_column = survey_list.columns[0] - # We clean up the survey list installation or cancelled - survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() - # Remove all punctuation - survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( - r'[^\w\s]', '', regex=True - ) - # Remove double spaces - survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( - r'\s+', ' ', regex=True - ) - # Remove trailing spaces - survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() - - # Remap the values in the scheme column - survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) - - survey_list["installation_status"] = None - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), - "installed", - survey_list["installation_status"] - ) - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), - "cancelled", - survey_list["installation_status"] - ) - # Find partial installations - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), - "partially installed", - survey_list["installation_status"] - ) - # Find partial cancellations - # TODO: We might have more indications of partial cancellations - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), - "partially cancelled", - survey_list["installation_status"] - ) - - # Finally, for other cases, we set the status to "in progress" - survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") - - # We concatenate the scheme name with the installation status - survey_list["installation_status"] = ( - survey_list[scheme_column] + " - " + survey_list["installation_status"] - ) - - # TODO: END TEMP - - survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy() - survey_list_to_merge["is_remaining"] = False - analysis_data = analysis_data.drop(columns="is_remaining").merge( - survey_list_to_merge, - how="left", on="asset_list_row_id" - ) - analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True) - - n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0] - n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0] - - # Take just remaining - analysis_data = analysis_data[analysis_data["is_remaining"]] - - # Also, if the HA has started selling, we remove any that are still subject to ciga - n_eco4_missed_subject_to_ciga = 0 - if not inputs["survey_list"].empty: - n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum() - analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"] - - ################################################################################################ - # We take the properties that strictly qualified under eco - ################################################################################################ - - eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy() - eco4_identified["identification_type"] = None - eco4_identified["identification_type"] = np.where( - (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True), - "strict", - eco4_identified["identification_type"] - ) - - # For expansive, the property can be no higher than an EPC C - eco4_identified["identification_type"] = np.where( - (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & ( - eco4_identified["sap"] <= epc_c_threshold - ), - "expansive", - eco4_identified["identification_type"] - ) - ################################################################################################ - # We take the properties dependent on CIGA - ################################################################################################ - - ciga_dependent_identified = analysis_data[ - analysis_data["ECO Eligibility"].isin( - [ - "eco4 (subject to ciga)", - "eco4 - passed ciga" - ] - ) - ].copy() - - # These are properties that show filled cavity - ciga_dependent_identified["identification_type"] = None - ciga_dependent_identified["identification_type"] = np.where( - ciga_dependent_identified["eco4_message"].isin( - [ - "Perfect suitability", - "Meets cavity and sap", - "Fails cavity, meets loft, fails SAP", - "Meets fabric, fails SAP check", - "Meets cavity, loft borderline, meets sap", - ] - ) & (ciga_dependent_identified["sap"] <= epc_c_threshold), - "strict", - ciga_dependent_identified["identification_type"] - ) - - ciga_dependent_identified["identification_type"] = np.where( - ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | ( - ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"]) - )) & ( - (ciga_dependent_identified["sap"] <= epc_c_threshold) & - pd.isnull(ciga_dependent_identified["identification_type"]) - ), - "expansive", - ciga_dependent_identified["identification_type"] - ) - - ################################################################################################ - # We properties that qualified for gbis - ################################################################################################ - gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy() - gbis_identified["identification_type"] = None - gbis_identified["identification_type"] = np.where( - (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69), - "strict", - gbis_identified["identification_type"] - ) - - gbis_identified["identification_type"] = np.where( - (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & ( - pd.isnull(gbis_identified["identification_type"]) - ), - "expansive", - gbis_identified["identification_type"] - ) - - # Finally, we look at the properties that have not been identified by Warmfront - not_identified = analysis_data[ - analysis_data["ECO Eligibility"].isin( - [ - "not eligible" - ] - ) - ].copy() - - surplus_eco4 = not_identified[ - (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin( - ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"] - )) - ] - - surplus_gbis = not_identified[ - (not_identified["gbis_eligible"] == True) & ( - ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values) - ) & (not_identified["sap"] < 69) & ( - (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | ( - not_identified["walls"].str.contains("partial", case=False, na=False) - ) - ) - ] - surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False] - - # Output variables - the data was sent to us in December, but the remaining figures are - # what was in November - november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name] - - # ECO4 - n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0] - november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0) - november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0] - eco4_sales_since_november = n_sold_eco4 - november_eco4_sold - - n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0] - eco4_of_which_identified_strict = ( - eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] + - ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0] - ) - eco4_of_which_identified_expansive = ( - eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] + - ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0] - ) - # GBIS - n_warmfront_identified_gbis = gbis_identified.shape[0] - november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0) - november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0] - gbis_sales_since_november = n_sold_gbis - november_gbis_sold - gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0] - gbis_of_which_identified_expansive = \ - gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0] - - to_append = { - ("", "HA Name"): ha_name, - ("", "# properties in asset list"): n_properties_remaining_in_asset_list, - ############ - # ECO4 - ############ - ("ECO4", "# remaining November file"): november_eco4_remaining, - ("ECO4", "# sold in November file"): november_eco4_sold, - ("ECO4", "# sold (survey list)"): n_sold_eco4, - ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga, - ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4, - ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict, - ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive, - ("ECO4", "Of which identified by model - total"): ( - eco4_of_which_identified_strict + eco4_of_which_identified_expansive - ), - ("ECO4", "Additional properties"): surplus_eco4.shape[0], - ############ - # GBIS - ############ - ("GBIS", "# remaining November file"): november_gbis_remaining, - ("GBIS", "# sold in November file"): november_gbis_sold, - ("GBIS", "# sold (survey list)"): n_sold_gbis, - ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis, - ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict, - ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive, - ("GBIS", "Of which identified by model - total"): ( - gbis_of_which_identified_strict + gbis_of_which_identified_expansive - ), - ("GBIS", "Additional properties"): surplus_gbis.shape[0] - } - - ha_analysis_results.append(to_append) - - # Calculate the revenue results - to_append_revenue = { - ("", "HA Name"): ha_name, - # Eco4 revenue - ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate, - ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate, - ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate, - ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate, - ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate, - ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate, - ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate, - ("ECO4", "Of which identified by model - total"): eco4_rate * ( - eco4_of_which_identified_strict + eco4_of_which_identified_expansive - ), - ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0], - } - total_revenue_results.append(to_append_revenue) - - ha_analysis_results = pd.DataFrame(ha_analysis_results) - ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns) - - facts_and_figures = loader.facts_and_figures.copy() - facts_and_figures["ha_number"] = facts_and_figures["HA Name"].str.extract(r'(\d+)').astype(int) - facts_and_figures = facts_and_figures.sort_values("ha_number") - facts_and_figures = facts_and_figures.drop(columns=["ha_number"]) - - # Rename some of the cols - facts_and_figures = facts_and_figures.rename( - columns={ - # ECO4 cols - "ECO4": "ECO4 - November", - "GBIS": "GBIS - November", - "eco4 (subject to ciga)": "ECO4 - subject to ciga", - "eco4": "ECO4 - doesn't need CIGA", - "eco4 - passed ciga": "ECO4 - passed CIGA", - "failed ciga": "ECO4 - failed CIGA", - "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS", - "ECO4 - in progress": "ECO4 - Install in progress", - "ECO4 - cancelled": "ECO4 - Install cancelled", - # GBIS cols - "gbis": "GBIS total (asset list)" - } - ) - # We calculate the eco4 total from the asset list - # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is - # ECO4 - doesn't need CIGA + ECO4 - passed CIGA - # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is - # ECO4 - doesn't need CIGA + ECO4 - subject to ciga - facts_and_figures["ECO4 total (asset list - pre ciga)"] = ( - facts_and_figures["ECO4 - doesn't need CIGA"] + - facts_and_figures["ECO4 - subject to ciga"] + - facts_and_figures["ECO4 - passed CIGA"] - ) - - facts_and_figures["ECO4 total (asset list - post ciga)"] = None - facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where( - facts_and_figures["ECO4 - passed CIGA"] > 0, - facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"], - facts_and_figures["ECO4 total (asset list - post ciga)"] - ) - - # Re-arrange the columns - facts_and_figures = facts_and_figures[ - [ - 'HA Name', - 'ECO4 - November', - 'GBIS - November', - 'ECO4 total (asset list - pre ciga)', - 'ECO4 total (asset list - post ciga)', - 'GBIS total (asset list)', - 'ECO4 - subject to ciga', - "ECO4 - doesn't need CIGA", - 'ECO4 - passed CIGA', - 'ECO4 - failed CIGA', - 'ECO4 - installed', - 'ECO4 - Install in progress', - 'ECO4 - Install cancelled', - 'ECO4 - partially installed', - 'ECO4 - Install downgrade to GBIS', - ] - ] - # Addd a note to flag any rows where ECO4 ( - # subject to ciga is greater than 0) and (ECO4 - passed ciga is greater than 0 - # ) - facts_and_figures["Missed CIGA checks opportunity"] = None - facts_and_figures["Missed CIGA checks opportunity"] = np.where( - (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0), - "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype( - str) + " ECO4 properties needing a CIGA check", - facts_and_figures["Missed CIGA checks opportunity"] - ) - - facts_and_figures.to_csv("Facts and figures sample.csv") - - # Re arrage the columns - - # Also sort ha_analysis_results by ha number - ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int) - ha_analysis_results = ha_analysis_results.sort_values("ha_number") - ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"]) - - # We save 2 sheets - # Automate creation of the excel - # Create a Pandas Excel writer using XlsxWriter as the engine - with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer: - # Write each dataframe to a different worksheet without the index - for df, sheet in [(facts_and_figures, 'HA Facts and Figures'), - (ha_analysis_results, 'Asset Identification')]: - - df.to_excel(writer, sheet_name=sheet) - - # Auto-adjust columns' width - for i, width in enumerate(get_col_widths(df)): - writer.sheets[sheet].set_column(i, i, width) - - # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their - # description, and what proportion of time they get identified via non-invasive surveys - - # true_eco4_assets = [] - # ciga_dependent_assets = [] - # not_eligible = [] - # as_built_insulated = [] - # date_cols = { - # "HA39": "date_built", - # "HA14": "Built In Year", - # "HA6": "Construction Year", - # "HA1": "Build Date", - # "HA107": "YEAR BUILT" - # } - # for ha_name, data_objects in outputs.items(): - # inputs = [x for k, x in loader.data.items() if k == ha_name][0] - # - # date_col = date_cols[ha_name] - # results_df = data_objects["results_df"].copy() - # df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename( - # columns={"row_meaning": "asset_identification_status", date_col: "date_built"} - # ).merge( - # results_df, - # how="left", - # right_on="row_id", - # left_on="asset_list_row_id" - # ) - # - # # take the true ECO4 - # true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy() - # ciga_dependent = df[ - # df["ECO Eligibility"].isin( - # [ - # "eco4 (subject to ciga)", - # "failed ciga", - # "eco4 - passed ciga" - # ] - # ) - # ] - # insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy() - # # We convert date built to datetime - # try: - # insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])] - # insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year - # as_built_insulated.append(insulated_assumed) - # except Exception as e: - # print("oh well") - # - # true_eco4_assets.append(true_eco4) - # ciga_dependent_assets.append(ciga_dependent) - # - # true_eco4_assets = pd.concat(true_eco4_assets) - # ciga_dependent_assets = pd.concat(ciga_dependent_assets) - # as_built_insulated = pd.concat(as_built_insulated) - # - # true_eco4_assets["walls"].value_counts(normalize=True) - # ciga_dependent_assets["walls"].value_counts(normalize=True) - # - # from recommendations.recommendation_utils import extract_insulation_thickness - # - # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply( - # lambda x: extract_insulation_thickness(x) - # ) - # - # true_eco4_assets["e"] = true_eco4_assets.merge( - # pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]], - # how="left", - # left_on="roof", - # right_on="original_description" - # ) - # - # true_eco4_assets["sap"].mean() - # - # true_eco4_assets["insulation_thickness"].isin( - # ["250", "150", "200", "100", "75", "50"] - # ).sum() / true_eco4_assets.shape[0] - # - # true_eco4_assets["insulation_thickness"].isin( - # ["100"] - # ).sum() / true_eco4_assets.shape[0] - # - # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True) +# def analyse_ha_data(outputs, loader): +# """ +# The approach we take within this function is the following: +# For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The +# characterisation can be broken down as the following: +# 1) The property has been identified by Warmfront and is eligible for ECO4/GBIS work, under the strictest criteria +# 2) The property has been identified by Warmfront, however it has a full cavity, and therefore would be subject to +# a CIGA check +# 3) The property has been identified by Warmfront, but the EPC shows that the property has more than 100mm loft +# insulation +# 4) The property has been identified by Warmfront, but doesn't look like a property that would likely qualify under +# any cirsumstances, given the available data +# +# Then, for any property that has NOT been identifid by Warmfront, we identify properties that look like they would +# qualify under the strictest criteria, and mark these as potential additional opportunities. +# +# :return: +# """ +# +# eco4_rate = 1710 +# gbis_rate = 600 +# # old_eco4_rate = 1456 +# old_gbis_rate = 432 +# +# epc_c_threshold = 80 +# scheme_map = { +# "ECO4": "ECO4", +# "AFFORDABLE WARMTH": "ECO4", +# "ECO4 A/W": "ECO4", +# "ECO4 GBIS (ECO+)": "GBIS" +# } +# +# ha_analysis_results = [] +# total_revenue_results = [] +# for ha_name, datasets in outputs.items(): +# inputs = [x for k, x in loader.data.items() if k == ha_name][0] +# +# results_df = datasets["results_df"].copy() +# +# analysis_data = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility"]].rename( +# columns={"row_meaning": "asset_identification_status"} +# ).merge( +# results_df, +# how="left", +# right_on="row_id", +# left_on="asset_list_row_id" +# ) +# +# analysis_data["is_remaining"] = True +# +# n_sold_eco4 = 0 +# n_sold_gbis = 0 +# if not inputs["survey_list"].empty: +# # Merge on the survey list and signal everything that is remaining or not (i.e. anything that hasn't had +# # a survey) +# survey_list = inputs["survey_list"].copy() +# +# # TODO: TEMP +# scheme_column = survey_list.columns[0] +# # We clean up the survey list installation or cancelled +# survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() +# # Remove all punctuation +# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( +# r'[^\w\s]', '', regex=True +# ) +# # Remove double spaces +# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( +# r'\s+', ' ', regex=True +# ) +# # Remove trailing spaces +# survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() +# +# # Remap the values in the scheme column +# survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) +# +# survey_list["installation_status"] = None +# survey_list["installation_status"] = np.where( +# survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), +# "installed", +# survey_list["installation_status"] +# ) +# survey_list["installation_status"] = np.where( +# survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), +# "cancelled", +# survey_list["installation_status"] +# ) +# # Find partial installations +# survey_list["installation_status"] = np.where( +# survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), +# "partially installed", +# survey_list["installation_status"] +# ) +# # Find partial cancellations +# # TODO: We might have more indications of partial cancellations +# survey_list["installation_status"] = np.where( +# survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), +# "partially cancelled", +# survey_list["installation_status"] +# ) +# +# # Finally, for other cases, we set the status to "in progress" +# survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") +# +# # We concatenate the scheme name with the installation status +# survey_list["installation_status"] = ( +# survey_list[scheme_column] + " - " + survey_list["installation_status"] +# ) +# +# # TODO: END TEMP +# +# survey_list_to_merge = survey_list[["asset_list_row_id", scheme_column]].copy() +# survey_list_to_merge["is_remaining"] = False +# analysis_data = analysis_data.drop(columns="is_remaining").merge( +# survey_list_to_merge, +# how="left", on="asset_list_row_id" +# ) +# analysis_data["is_remaining"] = analysis_data["is_remaining"].fillna(True) +# +# n_sold_eco4 = survey_list_to_merge[survey_list_to_merge[scheme_column] == "ECO4"].shape[0] +# n_sold_gbis = survey_list_to_merge[survey_list_to_merge[scheme_column] == "GBIS"].shape[0] +# +# # Take just remaining +# analysis_data = analysis_data[analysis_data["is_remaining"]] +# +# # Also, if the HA has started selling, we remove any that are still subject to ciga +# n_eco4_missed_subject_to_ciga = 0 +# if not inputs["survey_list"].empty: +# n_eco4_missed_subject_to_ciga = (analysis_data["ECO Eligibility"] == "eco4 (subject to ciga)").sum() +# analysis_data = analysis_data[analysis_data["ECO Eligibility"] != "eco4 (subject to ciga)"] +# +# ################################################################################################ +# # We take the properties that strictly qualified under eco +# ################################################################################################ +# +# eco4_identified = analysis_data[analysis_data["ECO Eligibility"] == "eco4"].copy() +# eco4_identified["identification_type"] = None +# eco4_identified["identification_type"] = np.where( +# (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == True), +# "strict", +# eco4_identified["identification_type"] +# ) +# +# # For expansive, the property can be no higher than an EPC C +# eco4_identified["identification_type"] = np.where( +# (eco4_identified["eco4_eligible"] == True) & (eco4_identified["eco4_strict"] == False) & ( +# eco4_identified["sap"] <= epc_c_threshold +# ), +# "expansive", +# eco4_identified["identification_type"] +# ) +# ################################################################################################ +# # We take the properties dependent on CIGA +# ################################################################################################ +# +# ciga_dependent_identified = analysis_data[ +# analysis_data["ECO Eligibility"].isin( +# [ +# "eco4 (subject to ciga)", +# "eco4 - passed ciga" +# ] +# ) +# ].copy() +# +# # These are properties that show filled cavity +# ciga_dependent_identified["identification_type"] = None +# ciga_dependent_identified["identification_type"] = np.where( +# ciga_dependent_identified["eco4_message"].isin( +# [ +# "Perfect suitability", +# "Meets cavity and sap", +# "Fails cavity, meets loft, fails SAP", +# "Meets fabric, fails SAP check", +# "Meets cavity, loft borderline, meets sap", +# ] +# ) & (ciga_dependent_identified["sap"] <= epc_c_threshold), +# "strict", +# ciga_dependent_identified["identification_type"] +# ) +# +# ciga_dependent_identified["identification_type"] = np.where( +# ((ciga_dependent_identified["eco4_message"].isin(["Meets just cavity"])) | ( +# ciga_dependent_identified["walls"].isin(["Cavity wall, filled cavity"]) +# )) & ( +# (ciga_dependent_identified["sap"] <= epc_c_threshold) & +# pd.isnull(ciga_dependent_identified["identification_type"]) +# ), +# "expansive", +# ciga_dependent_identified["identification_type"] +# ) +# +# ################################################################################################ +# # We properties that qualified for gbis +# ################################################################################################ +# gbis_identified = analysis_data[analysis_data["ECO Eligibility"] == "gbis"].copy() +# gbis_identified["identification_type"] = None +# gbis_identified["identification_type"] = np.where( +# (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] < 69), +# "strict", +# gbis_identified["identification_type"] +# ) +# +# gbis_identified["identification_type"] = np.where( +# (gbis_identified["gbis_eligible"] == True) & (gbis_identified["sap"] <= epc_c_threshold) & ( +# pd.isnull(gbis_identified["identification_type"]) +# ), +# "expansive", +# gbis_identified["identification_type"] +# ) +# +# # Finally, we look at the properties that have not been identified by Warmfront +# not_identified = analysis_data[ +# analysis_data["ECO Eligibility"].isin( +# [ +# "not eligible" +# ] +# ) +# ].copy() +# +# surplus_eco4 = not_identified[ +# (not_identified["eco4_eligible"] == True) & (not_identified["eco4_message"].isin( +# ["Perfect suitability", "Meets cavity, loft borderline, meets sap", "Near perfect suitability"] +# )) +# ] +# +# surplus_gbis = not_identified[ +# (not_identified["gbis_eligible"] == True) & ( +# ~not_identified["asset_list_row_id"].isin(surplus_eco4["asset_list_row_id"].values) +# ) & (not_identified["sap"] < 69) & ( +# (not_identified["cavity_type"].isin(["empty", "partial insulation"])) | ( +# not_identified["walls"].str.contains("partial", case=False, na=False) +# ) +# ) +# ] +# surplus_gbis = surplus_gbis[surplus_gbis["is_estimated"] == False] +# +# # Output variables - the data was sent to us in December, but the remaining figures are +# # what was in November +# november_remaining = loader.december_figures[loader.december_figures["HA Name"] == ha_name] +# +# # ECO4 +# n_properties_remaining_in_asset_list = inputs["asset_list"].shape[0] +# november_eco4_remaining = max(november_remaining["ECO4 remaining"].values[0], 0) +# november_eco4_sold = november_remaining["No. of Tech surveys complete - Eco 4"].values[0] +# eco4_sales_since_november = n_sold_eco4 - november_eco4_sold +# +# n_warmfront_identified_eco4 = eco4_identified.shape[0] + ciga_dependent_identified.shape[0] +# eco4_of_which_identified_strict = ( +# eco4_identified[eco4_identified["identification_type"] == "strict"].shape[0] + +# ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "strict"].shape[0] +# ) +# eco4_of_which_identified_expansive = ( +# eco4_identified[eco4_identified["identification_type"] == "expansive"].shape[0] + +# ciga_dependent_identified[ciga_dependent_identified["identification_type"] == "expansive"].shape[0] +# ) +# # GBIS +# n_warmfront_identified_gbis = gbis_identified.shape[0] +# november_gbis_remaining = max(november_remaining["GBIS remaining"].values[0], 0) +# november_gbis_sold = november_remaining["No. of Tech surveys complete - GBIS"].values[0] +# gbis_sales_since_november = n_sold_gbis - november_gbis_sold +# gbis_of_which_identified_strict = gbis_identified[gbis_identified["identification_type"] == "strict"].shape[0] +# gbis_of_which_identified_expansive = \ +# gbis_identified[gbis_identified["identification_type"] == "expansive"].shape[0] +# +# to_append = { +# ("", "HA Name"): ha_name, +# ("", "# properties in asset list"): n_properties_remaining_in_asset_list, +# ############ +# # ECO4 +# ############ +# ("ECO4", "# remaining November file"): november_eco4_remaining, +# ("ECO4", "# sold in November file"): november_eco4_sold, +# ("ECO4", "# sold (survey list)"): n_sold_eco4, +# ("ECO4", "# that missed CIGA check"): n_eco4_missed_subject_to_ciga, +# ("ECO4", "# Remaining properties (asset list)"): n_warmfront_identified_eco4, +# ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict, +# ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive, +# ("ECO4", "Of which identified by model - total"): ( +# eco4_of_which_identified_strict + eco4_of_which_identified_expansive +# ), +# ("ECO4", "Additional properties"): surplus_eco4.shape[0], +# ############ +# # GBIS +# ############ +# ("GBIS", "# remaining November file"): november_gbis_remaining, +# ("GBIS", "# sold in November file"): november_gbis_sold, +# ("GBIS", "# sold (survey list)"): n_sold_gbis, +# ("GBIS", "# Remaining properties (asset list)"): n_warmfront_identified_gbis, +# ("GBIS", "Of which identified by model - strict"): gbis_of_which_identified_strict, +# ("GBIS", "Of which identified by model - expansive"): gbis_of_which_identified_expansive, +# ("GBIS", "Of which identified by model - total"): ( +# gbis_of_which_identified_strict + gbis_of_which_identified_expansive +# ), +# ("GBIS", "Additional properties"): surplus_gbis.shape[0] +# } +# +# ha_analysis_results.append(to_append) +# +# # Calculate the revenue results +# to_append_revenue = { +# ("", "HA Name"): ha_name, +# # Eco4 revenue +# ("ECO4", "£ remaining November file"): november_eco4_remaining * eco4_rate, +# ("ECO4", "£ sold November file"): november_eco4_sold * old_eco4_rate, +# ("ECO4", "£ sold since November"): eco4_sales_since_november * eco4_rate, +# ("ECO4", "£ stuck at ciga check"): n_eco4_missed_subject_to_ciga * eco4_rate, +# ("ECO4", "£ remaining (asset list)"): n_warmfront_identified_eco4 * eco4_rate, +# ("ECO4", "Of which identified by model - strict"): eco4_of_which_identified_strict * eco4_rate, +# ("ECO4", "Of which identified by model - expansive"): eco4_of_which_identified_expansive * eco4_rate, +# ("ECO4", "Of which identified by model - total"): eco4_rate * ( +# eco4_of_which_identified_strict + eco4_of_which_identified_expansive +# ), +# ("ECO4", "Additional properties"): eco4_rate * surplus_eco4.shape[0], +# } +# total_revenue_results.append(to_append_revenue) +# +# ha_analysis_results = pd.DataFrame(ha_analysis_results) +# ha_analysis_results.columns = pd.MultiIndex.from_tuples(ha_analysis_results.columns) +# +# facts_and_figures = loader.facts_and_figures.copy() +# facts_and_figures["ha_number"] = facts_and_figures["HA Name"].str.extract(r'(\d+)').astype(int) +# facts_and_figures = facts_and_figures.sort_values("ha_number") +# facts_and_figures = facts_and_figures.drop(columns=["ha_number"]) +# +# # Rename some of the cols +# facts_and_figures = facts_and_figures.rename( +# columns={ +# # ECO4 cols +# "ECO4": "ECO4 - November", +# "GBIS": "GBIS - November", +# "eco4 (subject to ciga)": "ECO4 - subject to ciga", +# "eco4": "ECO4 - doesn't need CIGA", +# "eco4 - passed ciga": "ECO4 - passed CIGA", +# "failed ciga": "ECO4 - failed CIGA", +# "ECO4 - partially cancelled": "ECO4 - Install downgrade to GBIS", +# "ECO4 - in progress": "ECO4 - Install in progress", +# "ECO4 - cancelled": "ECO4 - Install cancelled", +# # GBIS cols +# "gbis": "GBIS total (asset list)" +# } +# ) +# # We calculate the eco4 total from the asset list +# # 1) If ciga checks have been completed (i.e. ECO4 - passed ciga > 0) this sum is +# # ECO4 - doesn't need CIGA + ECO4 - passed CIGA +# # 2) if ciga checks haven't been completed (i.e. ECO4 - passed ciga is missing), this sum is +# # ECO4 - doesn't need CIGA + ECO4 - subject to ciga +# facts_and_figures["ECO4 total (asset list - pre ciga)"] = ( +# facts_and_figures["ECO4 - doesn't need CIGA"] + +# facts_and_figures["ECO4 - subject to ciga"] + +# facts_and_figures["ECO4 - passed CIGA"] +# ) +# +# facts_and_figures["ECO4 total (asset list - post ciga)"] = None +# facts_and_figures["ECO4 total (asset list - post ciga)"] = np.where( +# facts_and_figures["ECO4 - passed CIGA"] > 0, +# facts_and_figures["ECO4 - doesn't need CIGA"] + facts_and_figures["ECO4 - passed CIGA"], +# facts_and_figures["ECO4 total (asset list - post ciga)"] +# ) +# +# # Re-arrange the columns +# facts_and_figures = facts_and_figures[ +# [ +# 'HA Name', +# 'ECO4 - November', +# 'GBIS - November', +# 'ECO4 total (asset list - pre ciga)', +# 'ECO4 total (asset list - post ciga)', +# 'GBIS total (asset list)', +# 'ECO4 - subject to ciga', +# "ECO4 - doesn't need CIGA", +# 'ECO4 - passed CIGA', +# 'ECO4 - failed CIGA', +# 'ECO4 - installed', +# 'ECO4 - Install in progress', +# 'ECO4 - Install cancelled', +# 'ECO4 - partially installed', +# 'ECO4 - Install downgrade to GBIS', +# ] +# ] +# # Addd a note to flag any rows where ECO4 ( +# # subject to ciga is greater than 0) and (ECO4 - passed ciga is greater than 0 +# # ) +# facts_and_figures["Missed CIGA checks opportunity"] = None +# facts_and_figures["Missed CIGA checks opportunity"] = np.where( +# (facts_and_figures["ECO4 - subject to ciga"] > 0) & (facts_and_figures["ECO4 - passed CIGA"] > 0), +# "potential opportunity of " + facts_and_figures["ECO4 - subject to ciga"].astype( +# str) + " ECO4 properties needing a CIGA check", +# facts_and_figures["Missed CIGA checks opportunity"] +# ) +# +# facts_and_figures.to_csv("Facts and figures sample.csv") +# +# # Re arrage the columns +# +# # Also sort ha_analysis_results by ha number +# ha_analysis_results["ha_number"] = ha_analysis_results[("", "HA Name")].str.extract(r'(\d+)').astype(int) +# ha_analysis_results = ha_analysis_results.sort_values("ha_number") +# ha_analysis_results = ha_analysis_results.drop(columns=["ha_number"]) +# +# # We save 2 sheets +# # Automate creation of the excel +# # Create a Pandas Excel writer using XlsxWriter as the engine +# with pd.ExcelWriter('HA Analysis Results.xlsx', engine='xlsxwriter') as writer: +# # Write each dataframe to a different worksheet without the index +# for df, sheet in [(facts_and_figures, 'HA Facts and Figures'), +# (ha_analysis_results, 'Asset Identification')]: +# +# df.to_excel(writer, sheet_name=sheet) +# +# # Auto-adjust columns' width +# for i, width in enumerate(get_col_widths(df)): +# writer.sheets[sheet].set_column(i, i, width) +# +# # Inspection: - Looking into the proportion of homes with "cavity, as built, insulated (assumed)" as their +# # description, and what proportion of time they get identified via non-invasive surveys +# +# # true_eco4_assets = [] +# # ciga_dependent_assets = [] +# # not_eligible = [] +# # as_built_insulated = [] +# # date_cols = { +# # "HA39": "date_built", +# # "HA14": "Built In Year", +# # "HA6": "Construction Year", +# # "HA1": "Build Date", +# # "HA107": "YEAR BUILT" +# # } +# # for ha_name, data_objects in outputs.items(): +# # inputs = [x for k, x in loader.data.items() if k == ha_name][0] +# # +# # date_col = date_cols[ha_name] +# # results_df = data_objects["results_df"].copy() +# # df = inputs["asset_list"][['asset_list_row_id', "ECO Eligibility", date_col]].rename( +# # columns={"row_meaning": "asset_identification_status", date_col: "date_built"} +# # ).merge( +# # results_df, +# # how="left", +# # right_on="row_id", +# # left_on="asset_list_row_id" +# # ) +# # +# # # take the true ECO4 +# # true_eco4 = df[df["ECO Eligibility"] == "eco4"].copy() +# # ciga_dependent = df[ +# # df["ECO Eligibility"].isin( +# # [ +# # "eco4 (subject to ciga)", +# # "failed ciga", +# # "eco4 - passed ciga" +# # ] +# # ) +# # ] +# # insulated_assumed = df[df["walls"] == "Cavity wall, as built, insulated"].copy() +# # # We convert date built to datetime +# # try: +# # insulated_assumed = insulated_assumed[~pd.isnull(insulated_assumed["date_built"])] +# # insulated_assumed["year_built"] = pd.to_datetime(insulated_assumed["date_built"].astype(str)).dt.year +# # as_built_insulated.append(insulated_assumed) +# # except Exception as e: +# # print("oh well") +# # +# # true_eco4_assets.append(true_eco4) +# # ciga_dependent_assets.append(ciga_dependent) +# # +# # true_eco4_assets = pd.concat(true_eco4_assets) +# # ciga_dependent_assets = pd.concat(ciga_dependent_assets) +# # as_built_insulated = pd.concat(as_built_insulated) +# # +# # true_eco4_assets["walls"].value_counts(normalize=True) +# # ciga_dependent_assets["walls"].value_counts(normalize=True) +# # +# # from recommendations.recommendation_utils import extract_insulation_thickness +# # +# # true_eco4_assets["roof_insulation_thickness"] = true_eco4_assets["roof"].apply( +# # lambda x: extract_insulation_thickness(x) +# # ) +# # +# # true_eco4_assets["e"] = true_eco4_assets.merge( +# # pd.DataFrame(cleaned["roof-description"])[["original_description", "insulation_thickness"]], +# # how="left", +# # left_on="roof", +# # right_on="original_description" +# # ) +# # +# # true_eco4_assets["sap"].mean() +# # +# # true_eco4_assets["insulation_thickness"].isin( +# # ["250", "150", "200", "100", "75", "50"] +# # ).sum() / true_eco4_assets.shape[0] +# # +# # true_eco4_assets["insulation_thickness"].isin( +# # ["100"] +# # ).sum() / true_eco4_assets.shape[0] +# # +# # as_built_insulated.groupby("property_type")["ECO Eligibility"].value_counts(normalize=True) def get_propensity_model_data( @@ -2567,29 +2567,39 @@ def calculate_eco4_post_ciga( eligiblity_counts["ECO Eligibility"] == "eco4 - passed ciga" ]["count"].sum() + eco4_confirmed_ciga_failures = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "failed ciga" + ]["count"].sum() + eco4_confirmed = (eco4_no_ciga_needed * ha_eco4_to_sale_rate) + (eco4_ciga_passed * ha_ciga_pass_to_sale_rate) eco4_confirmed = np.round(eco4_confirmed) if remaining_needing_ciga_check > 0: # We update the eco4 post ciga with the converted remaining + eco4_ciga_expected_remaining_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) eco4_remaining_forecast = np.round( - remaining_needing_ciga_check * ha_ciga_conversion_rate * ha_ciga_pass_to_sale_rate + eco4_ciga_expected_remaining_to_pass * ha_ciga_pass_to_sale_rate ) + eco4_estimated_ciga_failures = remaining_needing_ciga_check - eco4_ciga_expected_remaining_to_pass eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast else: eco4_remaining_forecast = 0 + eco4_estimated_ciga_failures = 0 eco4_post_ciga = eco4_confirmed else: eco4_no_ciga_needed = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "eco4" ]["count"].sum() + eco4_confirmed_ciga_failures = 0 + # Multiply by sale conversion eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate) + eco4_ciga_expected_remaining_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eco4_estimated_ciga_failures = remaining_needing_ciga_check - eco4_ciga_expected_remaining_to_pass + eco4_remaining_forecast = np.round( - remaining_needing_ciga_check * ha_ciga_conversion_rate * ha_ciga_pass_to_sale_rate - ) - eco4_post_ciga = ( - eligiblity_counts[eligiblity_counts["ECO Eligibility"] == "eco4"]["count"].sum() + eco4_remaining_forecast + eco4_ciga_expected_remaining_to_pass * ha_ciga_pass_to_sale_rate ) + eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast eco4_post_ciga = int(eco4_post_ciga) eco4_remaining_forecast = int(eco4_remaining_forecast) @@ -2604,6 +2614,16 @@ def calculate_eco4_post_ciga( "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate, "Of which confirmed - £": eco4_confirmed * eco4_rate, "Of which forecast - £": eco4_remaining_forecast * eco4_rate, + # Ciga failures + "Estimated total - failed CIGA": int(eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures), + "Confirmed CIGA failures": eco4_confirmed_ciga_failures, + "Estimated CIGA failures": int(eco4_estimated_ciga_failures), + # Ciga failures cost + "Estimated total - failed CIGA - £": int( + (eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures) * eco4_rate + ), + "Confirmed CIGA failures - £": int(eco4_confirmed_ciga_failures * eco4_rate), + "Estimated CIGA failures - £": int(eco4_estimated_ciga_failures * eco4_rate), } return results @@ -2617,8 +2637,8 @@ def forecast_remaining_sales(loader): gbis_rate = 600 eco4_rate = 1710 - old_gbis_rate = 432 - old_eco4_rate = 1456 + # old_gbis_rate = 432 + # old_eco4_rate = 1456 # 1) Calculate the conversion rate from passed CIGA to actual sale converted_ciga_jobs = [] @@ -2800,16 +2820,18 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): + # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0] original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0] - original_warmfront_eco4_revenue = ( - original_warmfront_remaining_eco4 * eco4_rate + - (original_warmfront_eco4 - original_warmfront_remaining_eco4) * old_eco4_rate - ) + # original_warmfront_eco4_revenue = ( + # original_warmfront_remaining_eco4 * eco4_rate + + # (original_warmfront_eco4 - original_warmfront_remaining_eco4) * old_eco4_rate + # ) + original_warmfront_eco4_revenue = original_warmfront_eco4 * eco4_rate original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate # Original warmfront figures - GBIS @@ -2817,9 +2839,12 @@ def forecast_remaining_sales(loader): original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0] original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0] + # original_warmfront_gbis_revenue = ( + # original_warmfront_remaining_gbis * gbis_rate + + # (original_warmfront_gbis - original_warmfront_remaining_gbis) * old_gbis_rate + # ) original_warmfront_gbis_revenue = ( - original_warmfront_remaining_gbis * gbis_rate + - (original_warmfront_gbis - original_warmfront_remaining_gbis) * old_gbis_rate + original_warmfront_gbis * gbis_rate ) original_warmfront_remaining_gbis_revenue = original_warmfront_remaining_gbis * gbis_rate @@ -2835,6 +2860,7 @@ def forecast_remaining_sales(loader): how="left", on="asset_list_row_id" ) + # Anything that has an installation has gone to installation, and therefore is not remaining asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"]) @@ -2913,6 +2939,32 @@ def forecast_remaining_sales(loader): eco4_rate=eco4_rate ) + # GBIS Figures + # Estimate the GBIS conversion rate + ha_gbis_sale_conversion = gbis_ciga_independent_passrates[ + gbis_ciga_independent_passrates["Ha Name"] == ha_name + ] + + if not ha_gbis_sale_conversion.empty: + ha_gbis_sale_conversion = ( + ha_gbis_sale_conversion["# GBIS successfully installed"].values[0] / + ha_gbis_sale_conversion["# GBIS at install stage"].values[0] + ) + else: + ha_gbis_sale_conversion = median_gbis_to_install + + gbis_total = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "gbis" + ]["count"].sum() + gbis_total = np.round(gbis_total * ha_gbis_sale_conversion) + gbis_total_revenue = gbis_total * gbis_rate + + gbis_remaining = eligiblity_counts_remaining[ + eligiblity_counts["ECO Eligibility"] == "gbis" + ]["count"].sum() + gbis_remaining = np.round(gbis_remaining * ha_gbis_sale_conversion) + gbis_remaining_revenue = gbis_remaining * gbis_rate + to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures @@ -2950,16 +3002,48 @@ def forecast_remaining_sales(loader): ("ECO4 post-ciga", "", "Of which forecast - £", ""): eco4_post_ciga_remaining_results["Of which forecast - £"], # CIGA failures + ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[ + 'Estimated total - failed CIGA' + ], + ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - £", ""): eco4_post_ciga_remaining_results[ + 'Estimated total - failed CIGA - £' + ], + ("ECO4 CIGA failures", "", "Confirmed failures - #", ""): eco4_post_ciga_remaining_results[ + "Confirmed CIGA failures" + ], + ("ECO4 CIGA failures", "", "Confirmed failures - £", ""): eco4_post_ciga_remaining_results[ + "Confirmed CIGA failures - £" + ], + ("ECO4 CIGA failures", "", "Estimated failures - #", ""): eco4_post_ciga_remaining_results[ + "Estimated CIGA failures" + ], + ("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[ + "Estimated CIGA failures - £" + ], + # GBIS postcode list + ("", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, + ("", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, + ("", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, + ("", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 22: + if len(to_append) != 32: raise ValueError("Something went wrong") results.append(to_append) results = pd.DataFrame(results) + # TODO: Add a blank row and then a total row + + assumptions = { + "ECO4 new rate": eco4_rate, + "GBIS new rate": gbis_rate, + # "ECO4 old rate": old_eco4_rate, + # "GBIS old rate": old_gbis_rate, + } + def app(): """ From 2ba37d55e65a746fdb58588aa2768851a83a3887 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 20:06:57 +0000 Subject: [PATCH 046/155] Added assumptions table --- .../ha_15_32/ha_analysis_batch_3.py | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 191ca74c..ac4d3a0c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2956,14 +2956,14 @@ def forecast_remaining_sales(loader): gbis_total = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "gbis" ]["count"].sum() - gbis_total = np.round(gbis_total * ha_gbis_sale_conversion) - gbis_total_revenue = gbis_total * gbis_rate + gbis_total = int(np.round(gbis_total * ha_gbis_sale_conversion)) + gbis_total_revenue = int(gbis_total * gbis_rate) gbis_remaining = eligiblity_counts_remaining[ eligiblity_counts["ECO Eligibility"] == "gbis" ]["count"].sum() - gbis_remaining = np.round(gbis_remaining * ha_gbis_sale_conversion) - gbis_remaining_revenue = gbis_remaining * gbis_rate + gbis_remaining = int(np.round(gbis_remaining * ha_gbis_sale_conversion)) + gbis_remaining_revenue = int(gbis_remaining * gbis_rate) to_append = { ("", "", "", "HA Name"): ha_name, @@ -3037,12 +3037,37 @@ def forecast_remaining_sales(loader): # TODO: Add a blank row and then a total row - assumptions = { - "ECO4 new rate": eco4_rate, - "GBIS new rate": gbis_rate, - # "ECO4 old rate": old_eco4_rate, - # "GBIS old rate": old_gbis_rate, - } + assumptions = [ + { + ("", "", "", "HA Name"): "ECO4 rate", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(eco4_rate) + }, + { + ("", "", "", "HA Name"): "GBIS rate", + ("ECO4 original", "", "Remaining - #", ""): "£" + str(gbis_rate) + }, + { + ("", "", "", "HA Name"): "Median CIGA pass rate", + ("ECO4 original", "", "Total - £", ""): str(round(median_ciga_success_rate * 100, 1)) + "%", + }, + { + ("", "", "", "HA Name"): "Maximum allowed CIGA pass rate", + ("ECO4 original", "", "Total - £", ""): str(round(maximum_ciga_conversion * 100, 1)) + "%", + ("ECO4 original", "", "Remaining - £", ""): "- Maximum allowed CIGA conversion for HAs without CIGA checks" + }, + { + ("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate", + ("ECO4 original", "", "Total - £", ""): str(round(median_eco4_to_install * 100, 1)) + "%", + ("ECO4 original", "", "Remaining - £", + ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check" + }, + { + ("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate", + ("ECO4 original", "", "Total - £", ""): str(round(median_ciga_pass_to_install * 100, 1)) + "%", + ("ECO4 original", "", "Remaining - £", + ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check" + } + ] def app(): From 57a7edf62511207f7d7af176414b5b269f3b1aa1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 20:18:44 +0000 Subject: [PATCH 047/155] collating results --- .../ha_15_32/ha_analysis_batch_3.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ac4d3a0c..7da6bb3a 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3035,9 +3035,21 @@ def forecast_remaining_sales(loader): results = pd.DataFrame(results) - # TODO: Add a blank row and then a total row + totals_row = {} + for col in results.columns: + if col == ('', '', '', 'HA Name'): + totals_row[col] = "Total" + elif col == ("ECO4 post-ciga", "", "Delta vs original estimate", ""): + totals_row[col] = results[col].mean() + else: + totals_row[col] = results[col].sum() + + blank_row = pd.DataFrame([{col: "" for col in results.columns}]) assumptions = [ + { + ("", "", "", "HA Name"): "Assumptions", + }, { ("", "", "", "HA Name"): "ECO4 rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(eco4_rate) @@ -3059,16 +3071,20 @@ def forecast_remaining_sales(loader): ("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate", ("ECO4 original", "", "Total - £", ""): str(round(median_eco4_to_install * 100, 1)) + "%", ("ECO4 original", "", "Remaining - £", - ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check" + ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Job must not cancel" }, { ("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate", ("ECO4 original", "", "Total - £", ""): str(round(median_ciga_pass_to_install * 100, 1)) + "%", ("ECO4 original", "", "Remaining - £", - ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check" + ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Job must not cancel" } ] + results = pd.concat( + [results, pd.DataFrame([totals_row]), blank_row, blank_row, pd.DataFrame(assumptions)] + ) + def app(): """ From 028c2edce7ab951987379a7c653324e5863426ae Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 20:48:43 +0000 Subject: [PATCH 048/155] Added headlines --- .../ha_15_32/ha_analysis_batch_3.py | 129 +++++++++++++++++- 1 file changed, 126 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7da6bb3a..1c320f9c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2991,7 +2991,7 @@ def forecast_remaining_sales(loader): # ECO4 - asset list, post ciga, remaining ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - #"], - ("ECO4 post-ciga", "", "Estimated remaining total eligible - £", ""): eco4_post_ciga_remaining_results[ + ("ECO4 post-ciga", "", "Estimated remaining eligible - £", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - £"], ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""): eco4_post_ciga_remaining_results["Of which confirmed - #"], @@ -3046,6 +3046,126 @@ def forecast_remaining_sales(loader): blank_row = pd.DataFrame([{col: "" for col in results.columns}]) + # Put together a Warmfront original remaining ECO4 vs asset list remaining ECO4 and same for GBIS, as well as totals + + # ECO4 Headlines + headline_eco4_original_remaining = totals_row[("ECO4 original", "", "Remaining - #", "")] + headline_eco4_original_remaining_revenue = totals_row[("ECO4 original", "", "Remaining - £", "")] + headline_eco4_postcode_list_remaining = totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] + headline_eco4_postcode_list_remaining_revenue = totals_row[ + ("ECO4 post-ciga", "", "Estimated remaining eligible - £", "") + ] + headline_eco4_delta = 100 * ( + (headline_eco4_postcode_list_remaining - headline_eco4_original_remaining) / + headline_eco4_original_remaining + ) + headline_eco4_delta = round(headline_eco4_delta, 1) + + # GBIS Headlines + headline_gbis_original_remaining = totals_row[("GBIS original", "", "Remaining - #", "")] + headline_gbis_original_remaining_revenue = totals_row[("GBIS original", "", "Remaining - £", "")] + headline_gbis_postcode_list_remaining = totals_row[("", "Warmfront post code list", "Remaining - #", "GBIS total")] + headline_gbis_postcode_list_remaining_revenue = totals_row[ + ("", "Warmfront post code list", "Remaining - £", "GBIS total") + ] + headline_gbis_delta = 100 * ( + (headline_gbis_postcode_list_remaining - headline_gbis_original_remaining) / + headline_gbis_original_remaining + ) + headline_gbis_delta = round(headline_gbis_delta, 1) + + headline_original_total_revenue_remaining = ( + headline_eco4_original_remaining_revenue + headline_gbis_original_remaining_revenue + ) + + headline_postcode_list_total_revenue_remaining = ( + headline_eco4_postcode_list_remaining_revenue + headline_gbis_postcode_list_remaining_revenue + ) + headline_total_delta = 100 * ( + (headline_postcode_list_total_revenue_remaining - headline_original_total_revenue_remaining) / + headline_original_total_revenue_remaining + ) + headline_total_delta = round(headline_total_delta, 1) + + headlines = [ + { + ("", "", "", "HA Name"): "Headlines", + }, + { + ("", "", "", "HA Name"): "ECO4 Remaining - November - #", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + headline_eco4_original_remaining + ) + }, + { + ("", "", "", "HA Name"): "ECO4 Remaining - November - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( + headline_eco4_original_remaining_revenue + ) + }, + { + ("", "", "", "HA Name"): "ECO4 Remaining - postcode list - #", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + headline_eco4_postcode_list_remaining + ) + }, + { + ("", "", "", "HA Name"): "ECO4 Remaining - postcode list - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( + headline_eco4_postcode_list_remaining_revenue + ) + }, + { + ("", "", "", "HA Name"): "ECO4 delta %", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_eco4_delta) + "%" + }, + { + ("", "", "", "HA Name"): "GBIS Remaining - November - #", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + headline_gbis_original_remaining + ) + }, + { + ("", "", "", "HA Name"): "GBIS Remaining - November - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( + headline_gbis_original_remaining_revenue + ) + }, + { + ("", "", "", "HA Name"): "GBIS Remaining - post code list - #", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + headline_gbis_postcode_list_remaining + ) + }, + { + ("", "", "", "HA Name"): "GBIS Remaining - post code list - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( + headline_gbis_postcode_list_remaining_revenue + ) + }, + { + ("", "", "", "HA Name"): "GBIS delta %", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_gbis_delta) + "%" + }, + # Total revenue + { + ("", "", "", "HA Name"): "Total Remaining - November - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( + headline_original_total_revenue_remaining + ) + }, + { + ("", "", "", "HA Name"): "Total Remaining - post code list - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( + headline_postcode_list_total_revenue_remaining + ) + }, + { + ("", "", "", "HA Name"): "Total Remaining delta %", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_total_delta) + "%" + }, + ] + assumptions = [ { ("", "", "", "HA Name"): "Assumptions", @@ -3065,7 +3185,9 @@ def forecast_remaining_sales(loader): { ("", "", "", "HA Name"): "Maximum allowed CIGA pass rate", ("ECO4 original", "", "Total - £", ""): str(round(maximum_ciga_conversion * 100, 1)) + "%", - ("ECO4 original", "", "Remaining - £", ""): "- Maximum allowed CIGA conversion for HAs without CIGA checks" + ("ECO4 original", "", "Remaining - £", + ""): "- Maximum allowed CIGA conversion for HAs without CIGA checks We do not allow above this to be " + "conservative" }, { ("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate", @@ -3082,7 +3204,8 @@ def forecast_remaining_sales(loader): ] results = pd.concat( - [results, pd.DataFrame([totals_row]), blank_row, blank_row, pd.DataFrame(assumptions)] + [results, pd.DataFrame([headlines]), pd.DataFrame([totals_row]), blank_row, blank_row, + pd.DataFrame(assumptions)] ) From 721bfb19fcc3bd70fe02081e14e4abde22f9a13e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 21:33:45 +0000 Subject: [PATCH 049/155] Added totals percentages aggregations --- .../ha_15_32/ha_analysis_batch_3.py | 74 ++++++++++++++++--- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1c320f9c..3341e34c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2965,6 +2965,14 @@ def forecast_remaining_sales(loader): gbis_remaining = int(np.round(gbis_remaining * ha_gbis_sale_conversion)) gbis_remaining_revenue = int(gbis_remaining * gbis_rate) + # GBIS delta + if original_warmfront_gbis == 0: + gbis_delta_vs_original_estimate = 100 * gbis_total + else: + gbis_delta_vs_original_estimate = 100 * ( + gbis_total - original_warmfront_gbis + ) / original_warmfront_gbis + to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures @@ -2987,7 +2995,7 @@ def forecast_remaining_sales(loader): "ECO4 - post CIGA - #"], ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[ "ECO4 - post CIGA - £"], - ("ECO4 post-ciga", "", "Delta vs original estimate", ""): eco4_delta_vs_original_estimate, + ("ECO4 post-ciga", "", "Delta vs original estimate - %", ""): eco4_delta_vs_original_estimate, # ECO4 - asset list, post ciga, remaining ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - #"], @@ -3021,14 +3029,15 @@ def forecast_remaining_sales(loader): "Estimated CIGA failures - £" ], # GBIS postcode list - ("", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, - ("", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, - ("", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, - ("", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, + ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, + ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, + ("GBIS Postcode list", "", "Delta vs original estimate - %", ""): gbis_delta_vs_original_estimate, + ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, + ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 32: + if len(to_append) != 33: raise ValueError("Something went wrong") results.append(to_append) @@ -3039,11 +3048,31 @@ def forecast_remaining_sales(loader): for col in results.columns: if col == ('', '', '', 'HA Name'): totals_row[col] = "Total" - elif col == ("ECO4 post-ciga", "", "Delta vs original estimate", ""): - totals_row[col] = results[col].mean() + elif col in [ + ("ECO4 post-ciga", "", "Delta vs original estimate - %", ""), + ("GBIS Postcode list", "", "Delta vs original estimate - %", "") + ]: + totals_row[col] = None else: totals_row[col] = results[col].sum() + # For the delta columns, we calculate the delta on the totals + totals_row[("ECO4 post-ciga", "", "Delta vs original estimate - %", "")] = round( + 100 * ( + totals_row[("ECO4 post-ciga", "", "Estimated total eligible - #", "")] - + totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")] + ) / totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")], + 1 + ) + + totals_row[("GBIS Postcode list", "", "Delta vs original estimate - %", "")] = round( + 100 * ( + totals_row[("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total")] - + totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")] + ) / totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")], + 1 + ) + blank_row = pd.DataFrame([{col: "" for col in results.columns}]) # Put together a Warmfront original remaining ECO4 vs asset list remaining ECO4 and same for GBIS, as well as totals @@ -3204,10 +3233,35 @@ def forecast_remaining_sales(loader): ] results = pd.concat( - [results, pd.DataFrame([headlines]), pd.DataFrame([totals_row]), blank_row, blank_row, - pd.DataFrame(assumptions)] + [ + results, + pd.DataFrame([totals_row]), + pd.DataFrame(headlines), + blank_row, + blank_row, + pd.DataFrame(assumptions) + ] ) + # header_rows = [ + # [name[0] for name in results.columns.values], + # [name[1] for name in results.columns.values], + # [name[2] for name in results.columns.values], + # [name[3] for name in results.columns.values] + # ] + + # Step 2: Write the transformed header and DataFrame data to CSV. + # Open the file in write mode. + import csv + with open("HA Remaining Analysis.csv", "w", newline="") as file: + # writer = csv.writer(file) + + # Write the header rows. + # writer.writerows(header_rows) + + # Write the DataFrame data without the index (adjust if you want the index). + results.to_csv(file, header=True, index=False) + def app(): """ From f9d1a90689ef742fd32217b606c6a919b766d974 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 22:17:11 +0000 Subject: [PATCH 050/155] Fixing some formatting bugs --- .../ha_15_32/ha_analysis_batch_3.py | 86 +++++++++++-------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3341e34c..6309d2e2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2820,6 +2820,8 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): + if ha_name == "HA16": + dew # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] @@ -2991,8 +2993,9 @@ def forecast_remaining_sales(loader): ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, # ECO4 - asset list, post ciga, total - ("ECO4 post-ciga", "", "Estimated total eligible - #", ""): eco4_post_ciga_total_results[ - "ECO4 - post CIGA - #"], + ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total (post-ciga)"): + eco4_post_ciga_total_results[ + "ECO4 - post CIGA - #"], ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[ "ECO4 - post CIGA - £"], ("ECO4 post-ciga", "", "Delta vs original estimate - %", ""): eco4_delta_vs_original_estimate, @@ -3059,7 +3062,7 @@ def forecast_remaining_sales(loader): # For the delta columns, we calculate the delta on the totals totals_row[("ECO4 post-ciga", "", "Delta vs original estimate - %", "")] = round( 100 * ( - totals_row[("ECO4 post-ciga", "", "Estimated total eligible - #", "")] - + totals_row[("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total (post-ciga)")] - totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")] ) / totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")], 1 @@ -3093,9 +3096,11 @@ def forecast_remaining_sales(loader): # GBIS Headlines headline_gbis_original_remaining = totals_row[("GBIS original", "", "Remaining - #", "")] headline_gbis_original_remaining_revenue = totals_row[("GBIS original", "", "Remaining - £", "")] - headline_gbis_postcode_list_remaining = totals_row[("", "Warmfront post code list", "Remaining - #", "GBIS total")] + headline_gbis_postcode_list_remaining = totals_row[ + ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total") + ] headline_gbis_postcode_list_remaining_revenue = totals_row[ - ("", "Warmfront post code list", "Remaining - £", "GBIS total") + ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total") ] headline_gbis_delta = 100 * ( (headline_gbis_postcode_list_remaining - headline_gbis_original_remaining) / @@ -3205,29 +3210,33 @@ def forecast_remaining_sales(loader): }, { ("", "", "", "HA Name"): "GBIS rate", - ("ECO4 original", "", "Remaining - #", ""): "£" + str(gbis_rate) + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str(gbis_rate) }, { ("", "", "", "HA Name"): "Median CIGA pass rate", - ("ECO4 original", "", "Total - £", ""): str(round(median_ciga_success_rate * 100, 1)) + "%", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + round(median_ciga_success_rate * 100, 1)) + "%", }, { ("", "", "", "HA Name"): "Maximum allowed CIGA pass rate", - ("ECO4 original", "", "Total - £", ""): str(round(maximum_ciga_conversion * 100, 1)) + "%", - ("ECO4 original", "", "Remaining - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + round(maximum_ciga_conversion * 100, 1)) + "%", + ("ECO4 original", "", "Remaining - #", ""): "- Maximum allowed CIGA conversion for HAs without CIGA checks We do not allow above this to be " "conservative" }, { ("", "", "", "HA Name"): "Median ECO4 (no CIGA) sales conversion rate", - ("ECO4 original", "", "Total - £", ""): str(round(median_eco4_to_install * 100, 1)) + "%", - ("ECO4 original", "", "Remaining - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + round(median_eco4_to_install * 100, 1)) + "%", + ("ECO4 original", "", "Remaining - #", ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Job must not cancel" }, { ("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate", - ("ECO4 original", "", "Total - £", ""): str(round(median_ciga_pass_to_install * 100, 1)) + "%", - ("ECO4 original", "", "Remaining - £", + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( + round(median_ciga_pass_to_install * 100, 1)) + "%", + ("ECO4 original", "", "Remaining - #", ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Job must not cancel" } ] @@ -3236,6 +3245,7 @@ def forecast_remaining_sales(loader): [ results, pd.DataFrame([totals_row]), + blank_row, pd.DataFrame(headlines), blank_row, blank_row, @@ -3291,32 +3301,32 @@ def app(): loader.load() loader.ha_facts_and_figures() + forecast_remaining_sales(loader) + # We load in the additional data required to perform the analysis - cleaned = read_from_s3( - s3_file_name="cleaned_epc_data/cleaned.bson", - bucket_name="retrofit-data-dev" - ) - cleaned = msgpack.unpackb(cleaned, raw=False) - cleaned = patch_cleaned(cleaned) - - cleaning_data = read_dataframe_from_s3_parquet( - bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", - ) - created_at = datetime.now().isoformat() - - photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") - - outputs = get_epc_data( - loader=loader, - cleaned=cleaned, - cleaning_data=cleaning_data, - created_at=created_at, - photo_supply_lookup=photo_supply_lookup, - floor_area_decile_thresholds=floor_area_decile_thresholds, - pull_data=pull_data - ) - - analyse_ha_data(outputs, loader) + # cleaned = read_from_s3( + # s3_file_name="cleaned_epc_data/cleaned.bson", + # bucket_name="retrofit-data-dev" + # ) + # cleaned = msgpack.unpackb(cleaned, raw=False) + # cleaned = patch_cleaned(cleaned) + # + # cleaning_data = read_dataframe_from_s3_parquet( + # bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", + # ) + # created_at = datetime.now().isoformat() + # + # photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") + # + # outputs = get_epc_data( + # loader=loader, + # cleaned=cleaned, + # cleaning_data=cleaning_data, + # created_at=created_at, + # photo_supply_lookup=photo_supply_lookup, + # floor_area_decile_thresholds=floor_area_decile_thresholds, + # pull_data=pull_data + # ) # import pickle # with open("ha_analysis.pickle", "wb") as f: From 0497290b7cac36b4519b3db4c0f9d1d1be4932b5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 22:17:51 +0000 Subject: [PATCH 051/155] removed temp code --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 6309d2e2..ec9469dc 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2820,8 +2820,6 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): - if ha_name == "HA16": - dew # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] From fbd808a54d3314d9821d5fad5456e951558959c9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 22:27:50 +0000 Subject: [PATCH 052/155] re-formatting percentages --- .../ha_15_32/ha_analysis_batch_3.py | 64 ++++++++----------- 1 file changed, 27 insertions(+), 37 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ec9469dc..0daf239b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2926,9 +2926,10 @@ def forecast_remaining_sales(loader): ) # Calculate the delta compared to Warmfront's original estimate - eco4_delta_vs_original_estimate = 100 * ( - eco4_post_ciga_total_results["ECO4 - post CIGA - #"] - original_warmfront_eco4 - ) / original_warmfront_eco4 + eco4_delta_vs_original_estimate = ( + eco4_post_ciga_total_results[ + "ECO4 - post CIGA - #"] - original_warmfront_eco4 + ) / original_warmfront_eco4 eco4_post_ciga_remaining_results = calculate_eco4_post_ciga( eligiblity_counts=eligiblity_counts_remaining, @@ -2967,11 +2968,11 @@ def forecast_remaining_sales(loader): # GBIS delta if original_warmfront_gbis == 0: - gbis_delta_vs_original_estimate = 100 * gbis_total + gbis_delta_vs_original_estimate = gbis_total else: - gbis_delta_vs_original_estimate = 100 * ( - gbis_total - original_warmfront_gbis - ) / original_warmfront_gbis + gbis_delta_vs_original_estimate = ( + gbis_total - original_warmfront_gbis + ) / original_warmfront_gbis to_append = { ("", "", "", "HA Name"): ha_name, @@ -3125,27 +3126,23 @@ def forecast_remaining_sales(loader): }, { ("", "", "", "HA Name"): "ECO4 Remaining - November - #", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( - headline_eco4_original_remaining - ) + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_original_remaining + }, { ("", "", "", "HA Name"): "ECO4 Remaining - November - £", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( - headline_eco4_original_remaining_revenue - ) + ( + "", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_eco4_original_remaining_revenue }, { ("", "", "", "HA Name"): "ECO4 Remaining - postcode list - #", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( - headline_eco4_postcode_list_remaining - ) + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining }, { ("", "", "", "HA Name"): "ECO4 Remaining - postcode list - £", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( - headline_eco4_postcode_list_remaining_revenue - ) + ("", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_eco4_postcode_list_remaining_revenue }, { ("", "", "", "HA Name"): "ECO4 delta %", @@ -3153,27 +3150,22 @@ def forecast_remaining_sales(loader): }, { ("", "", "", "HA Name"): "GBIS Remaining - November - #", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( - headline_gbis_original_remaining - ) + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining }, { ("", "", "", "HA Name"): "GBIS Remaining - November - £", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( - headline_gbis_original_remaining_revenue - ) + ( + "", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_gbis_original_remaining_revenue }, { ("", "", "", "HA Name"): "GBIS Remaining - post code list - #", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( - headline_gbis_postcode_list_remaining - ) + ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining }, { ("", "", "", "HA Name"): "GBIS Remaining - post code list - £", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( - headline_gbis_postcode_list_remaining_revenue - ) + ("", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_gbis_postcode_list_remaining_revenue }, { ("", "", "", "HA Name"): "GBIS delta %", @@ -3182,15 +3174,13 @@ def forecast_remaining_sales(loader): # Total revenue { ("", "", "", "HA Name"): "Total Remaining - November - £", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( - headline_original_total_revenue_remaining - ) + ("", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_original_total_revenue_remaining }, { ("", "", "", "HA Name"): "Total Remaining - post code list - £", - ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): "£" + str( - headline_postcode_list_total_revenue_remaining - ) + ("", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_postcode_list_total_revenue_remaining }, { ("", "", "", "HA Name"): "Total Remaining delta %", From 46f5ee8ea43e719dc4f0c8c472de68b62d974270 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 22:34:02 +0000 Subject: [PATCH 053/155] formatting percentage --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 0daf239b..b5c6835b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3059,20 +3059,18 @@ def forecast_remaining_sales(loader): totals_row[col] = results[col].sum() # For the delta columns, we calculate the delta on the totals - totals_row[("ECO4 post-ciga", "", "Delta vs original estimate - %", "")] = round( - 100 * ( + totals_row[("ECO4 post-ciga", "", "Delta vs original estimate - %", "")] = ( + ( totals_row[("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total (post-ciga)")] - totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")] - ) / totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")], - 1 + ) / totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")] ) - totals_row[("GBIS Postcode list", "", "Delta vs original estimate - %", "")] = round( - 100 * ( + totals_row[("GBIS Postcode list", "", "Delta vs original estimate - %", "")] = ( + ( totals_row[("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total")] - totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")] - ) / totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")], - 1 + ) / totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")] ) blank_row = pd.DataFrame([{col: "" for col in results.columns}]) From d9e9be4389d371176a8f83ec5f83f0fcbabbeb8b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 23:48:27 +0000 Subject: [PATCH 054/155] Added HA25 --- .../ha_15_32/ha_analysis_batch_3.py | 79 ++++++++++++------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index b5c6835b..baaa4050 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -159,19 +159,18 @@ class DataLoader: } UNMATCHED_CIGA = { - # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not - # the asset list + "HA6": 117, "HA14": 3, "HA16": 7, - # There's just too many unmatched here - "HA6": 117, + "HA24": 12, "HA107": 51, } - def __init__(self, directories, december_figures_filepath, use_cache): + def __init__(self, directories, december_figures_filepath, use_cache, rebuild): self.directories = directories self.use_cache = use_cache self.december_figures_filepath = december_figures_filepath + self.rebuild = rebuild self.data = {} self.december_figures = None @@ -312,23 +311,20 @@ class DataLoader: return asset_list @staticmethod - def create_ciga_list_house_no(ha_name, ciga_list): + def create_ciga_list_house_no(ciga_list): """ This function will append the House number onto the asset list :return: """ - if ha_name in ["HA6", "HA14", "HA107", "HA16"]: - split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) - house_numbers = split_addresses[0].str.split(' ', expand=True) - # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how - # many columns there might be - house_numbers = house_numbers.iloc[:, 0:1] - house_numbers.columns = ['HouseNo'] + split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) + house_numbers = split_addresses[0].str.split(' ', expand=True) + # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how + # many columns there might be + house_numbers = house_numbers.iloc[:, 0:1] + house_numbers.columns = ['HouseNo'] - ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1) - else: - raise NotImplementedError("Implement me") + ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1) return ciga_list @@ -447,7 +443,7 @@ class DataLoader: # Remove rows with missing postcode which happens in a small number of cases ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] - ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) + ciga_list = self.create_ciga_list_house_no(ciga_list) ciga_list = self.dedupe_ciga_list(ciga_list) ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) @@ -800,6 +796,10 @@ class DataLoader: "st. leodegars close", "st leodegars close" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "montgomery crescent", "montgomery road" + ) + return survey_list @staticmethod @@ -1102,16 +1102,18 @@ class DataLoader: for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]: self.december_figures[col] = self.december_figures[col].astype("Int64") - if self.use_cache: - self.data = read_pickle_from_s3( + if self.use_cache and not self.rebuild: + data = read_pickle_from_s3( bucket_name="retrofit-datalake-dev", s3_file_name="ha-analysis/batch3-inputs.pickle", ) - return + else: + data = {} - data = {} for filepath in self.directories: ha_name = filepath.split("/")[2] + if ha_name in data: + continue # Load asset list logger.info("Loading data for {}".format(ha_name)) asset_list, survey_list, ciga_list = self.load_asset_list( @@ -2635,6 +2637,10 @@ def forecast_remaining_sales(loader): # and I don't want the numbers to change too much, depenent on the CIGA conversation rate maximum_ciga_conversion = 0.75 + # This is a hard limit to the allowed conversion rates to final sale. These are typically very + # high but there are some anomalies, amongst surveys that are early on + sales_conversion_lower_bound = 0.8 + gbis_rate = 600 eco4_rate = 1710 # old_gbis_rate = 432 @@ -2796,14 +2802,30 @@ def forecast_remaining_sales(loader): eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates) gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates) + eco4_ciga_independent_passrates["conversion"] = ( + eco4_ciga_independent_passrates["# ECO4 successfully installed"] / + eco4_ciga_independent_passrates["# ECO4 at install stage"] + ) + eco4_ciga_independent_passrates_clipped = eco4_ciga_independent_passrates[ + eco4_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound + ] + + gbis_ciga_independent_passrates["conversion"] = ( + gbis_ciga_independent_passrates["# GBIS successfully installed"] / + gbis_ciga_independent_passrates["# GBIS at install stage"] + ) + gbis_ciga_independent_passrates_clipped = gbis_ciga_independent_passrates[ + gbis_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound + ] + median_eco4_to_install = ( - eco4_ciga_independent_passrates["# ECO4 successfully installed"].sum() / - eco4_ciga_independent_passrates["# ECO4 at install stage"].sum() + eco4_ciga_independent_passrates_clipped["# ECO4 successfully installed"].sum() / + eco4_ciga_independent_passrates_clipped["# ECO4 at install stage"].sum() ) median_gbis_to_install = ( - gbis_ciga_independent_passrates["# GBIS successfully installed"].sum() / - gbis_ciga_independent_passrates["# GBIS at install stage"].sum() + gbis_ciga_independent_passrates_clipped["# GBIS successfully installed"].sum() / + gbis_ciga_independent_passrates_clipped["# GBIS at install stage"].sum() ) # Produce the final output @@ -3270,6 +3292,8 @@ def app(): use_cache = True # Determines if we want to perform the data pull pull_data = False + # Override to re-build all inputs + rebuild_inputs = False # List all of the data in the folder @@ -3278,12 +3302,11 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - # priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"] - priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"] + priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA25", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] - loader = DataLoader(directories, december_figures_filepath, use_cache) + loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs) loader.load() loader.ha_facts_and_figures() From cbd4a0052ef005e00ce143c16306b5f0b782c4ed Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 1 Mar 2024 23:52:19 +0000 Subject: [PATCH 055/155] Starting HA25 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index baaa4050..0c9f685f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -347,6 +347,8 @@ class DataLoader: return "Asset" elif "Decent Homes Stock" in workbook.sheetnames: return "Decent Homes Stock" + elif "Report" in workbook.sheetnames: + return "Report" else: return "Assets" From fc022b8a22d571651ba21fff9fd4c5901b18e20f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 2 Mar 2024 12:34:22 +0000 Subject: [PATCH 056/155] Added data load for HA25 --- .../ha_15_32/ha_analysis_batch_3.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 0c9f685f..4ae881d2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -155,6 +155,10 @@ class DataLoader: "HA24": { "address": "Address", "postcode": "Postcode" + }, + "HA25": { + "address": "T1_Address", + "postcode": "matching_postcode" } } @@ -178,7 +182,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA16", "HA24"]: + if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA25"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -374,13 +378,23 @@ class DataLoader: asset_sheetname = self.get_asset_sheetname(workbook) asset_sheet = workbook[asset_sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] + if ha_name == "HA25": + asset_sheet_colnames[11] = "matching_postcode" + + values_only = not ha_name != "HA25" rows_data = [] - for row in asset_sheet.iter_rows(min_row=2, values_only=False): - row_data = [cell.value for cell in row] # This will get you the cell values - rows_data.append(row_data) + if not values_only: + for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): + row_data = [cell.value for cell in row] # This will get you the cell values + rows_data.append(row_data) + else: + for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): # use values_only=True to get values + row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values + rows_data.append(row_data) asset_list = pd.DataFrame(rows_data, columns=asset_sheet_colnames) + asset_list = asset_list.loc[:, asset_list.columns.notnull()] # Remove entirely empty rows - consider all rows apart from row_color @@ -403,9 +417,10 @@ class DataLoader: asset_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_asset_list") asset_list = asset_list_correction_function(asset_list) - # For HA1, there is an exception in the structure of the data. We don't have any survey or ciga lists, and so + # For HA1 and HA25, there is an exception in the structure of the data. We don't have any survey or ciga + # lists, and so # we can return the asset list now - if ha_name == "HA1": + if ha_name in ["HA1", "HA25"]: return asset_list, pd.DataFrame(), pd.DataFrame() # We check if there is a survey list @@ -1149,7 +1164,8 @@ class DataLoader: "ECO4": "ECO4", "AFFORDABLE WARMTH": "ECO4", "ECO4 A/W": "ECO4", - "ECO4 GBIS (ECO+)": "GBIS" + "ECO4 GBIS (ECO+)": "GBIS", + "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS" } eco_eligibility_map = { @@ -3305,6 +3321,8 @@ def app(): december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA25", "HA39", "HA107"] + # Next HAs to do: 15, 32, 33, + # Then: 28, 41, 38, 10, 14, 20, 48 # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From 9a69f8741ece9fdb740cb1b9855f53e639637f44 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 2 Mar 2024 12:54:19 +0000 Subject: [PATCH 057/155] adding HA15 --- .../ha_15_32/ha_analysis_batch_3.py | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 4ae881d2..81ed2301 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -165,6 +165,7 @@ class DataLoader: UNMATCHED_CIGA = { "HA6": 117, "HA14": 3, + "HA15": 3, "HA16": 7, "HA24": 12, "HA107": 51, @@ -204,7 +205,15 @@ class DataLoader: asset_list["Address 4"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() - + elif ha_name == "HA15": + asset_list["matching_address"] = ( + asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -502,6 +511,15 @@ class DataLoader: return asset_list + @staticmethod + def correct_ha15_asset_list(asset_list): + asset_list["matching_postcode"] = np.where( + asset_list["Address Line 1"] == "103 Priory Crescent", + "hp19 9ny", + asset_list["matching_postcode"] + ) + return asset_list + @staticmethod def correct_ha6_survey_list(survey_list): @@ -655,6 +673,14 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha15_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Mary Mac Manus Drive, Milton Keynes", "Mary Mac Manus Drive" + ) + + return survey_list + @staticmethod def correct_ha16_survey_list(survey_list): survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace("/", ", ") @@ -3320,7 +3346,9 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA25", "HA39", "HA107"] + priority_has = [ + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA39", "HA107" + ] # Next HAs to do: 15, 32, 33, # Then: 28, 41, 38, 10, 14, 20, 48 # Filter down the directories to only the priority HAs From dad2fc74c889112cbed0a67578fb013e21b276f9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 2 Mar 2024 13:10:28 +0000 Subject: [PATCH 058/155] HA15 checked and added --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 81ed2301..1ae05d16 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1191,12 +1191,15 @@ class DataLoader: "AFFORDABLE WARMTH": "ECO4", "ECO4 A/W": "ECO4", "ECO4 GBIS (ECO+)": "GBIS", - "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS" + "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS", + "ECO4 AFFORDABLE WARMTH": "ECO4" } eco_eligibility_map = { "not eligble": "not eligible", "eco 4(subject to ciga)": "eco4 (subject to ciga)", + "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga)", + "eco4 (subject to archetype check)": "eco4" } ha_facts_and_figures = [] From 9eccfca70dda75ac1c49084bcd63ec3734e3dd23 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 2 Mar 2024 13:26:54 +0000 Subject: [PATCH 059/155] fixing merge --- .../ha_15_32/ha_analysis_batch_3.py | 67 ++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1ae05d16..1f99d23c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -214,6 +214,13 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA32": + asset_list["matching_address"] = ( + asset_list["Dwelling num"].astype(str).str.lower().str.strip() + ", " + + asset_list["Street"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -308,6 +315,8 @@ class DataLoader: if ha_name in ["HA107"]: asset_list["HouseNo"] = asset_list["House No"].copy() + elif ha_name == "HA32": + asset_list["HouseNo"] = asset_list["Dwelling num"].copy() else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) @@ -520,6 +529,16 @@ class DataLoader: ) return asset_list + @staticmethod + def correct_ha32_asset_list(asset_list): + asset_list["Postcode"] = np.where( + (asset_list["Street"] == "Norton Grove") & (asset_list["Postcode"] == "HU4 6HQ") & ( + asset_list["Dwelling num"] == "7"), + "hu4 6hg", + asset_list["Postcode"] + ) + return asset_list + @staticmethod def correct_ha6_survey_list(survey_list): @@ -845,6 +864,50 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha32_survey_list(survey_list): + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == "Coxwold", + "Coxwold Grove", + survey_list["Street / Block Name"] + ) + + # Update the Barringhton Avenue with their correct spelling: Barrington Avenue + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == "Barringhton Avenue", + "Barrington Avenue", + survey_list["Street / Block Name"] + ) + + # Update how the Rustenburn addresses are listed in the identified addresses + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == "Rustenburg", + "Rustenburg Street", + survey_list["Street / Block Name"] + ) + + # Update how the MALIN LODGE, RONALDSWAY CLOSE addresses are listed in the identified addresses + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == "MALIN LODGE, RONALDSWAY CLOSE", + "Malin Lodge", + survey_list["Street / Block Name"] + ) + + # Update how the Feroes Close are listed in the identified addresses + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == "Feroes Close", + "Faroes Close", + survey_list["Street / Block Name"] + ) + + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == 'FORESTER WAY', + 'FORESTER WAY', + survey_list["Street / Block Name"] + ) + + return survey_list + @staticmethod def correct_ha107_survey_list(survey_list): # Replace Front Street, East Stockham with Front Street, East Stockwith @@ -3350,9 +3413,9 @@ def app(): december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA39", "HA107" + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA32", "HA39", "HA107" ] - # Next HAs to do: 15, 32, 33, + # Next HAs to do: 15[DONE], 32, 33, # Then: 28, 41, 38, 10, 14, 20, 48 # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From 2828b005cbb3676216827fcb5dc70630f8ecb393 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 3 Mar 2024 15:06:31 +0000 Subject: [PATCH 060/155] fixing HA32 merge --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1f99d23c..c84a2c5c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -906,6 +906,19 @@ class DataLoader: survey_list["Street / Block Name"] ) + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == '6 Zeigfeld', + 'Ziegfeld Court', + survey_list["Street / Block Name"] + ) + + # Malin Lodge, Ronaldsway Close + survey_list["Street / Block Name"] = np.where( + survey_list["Street / Block Name"] == 'Malin Lodge, Ronaldsway Close', + 'Malin Lodge', + survey_list["Street / Block Name"] + ) + return survey_list @staticmethod From 811f141c45b1fcfa52c9f1d685690389df55f531 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 3 Mar 2024 15:35:49 +0000 Subject: [PATCH 061/155] started working on ha33 but paused --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index c84a2c5c..9bd04884 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -221,6 +221,12 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA33": + asset_list["matching_address"] = ( + asset_list["ADDRESS"].astype(str).str.lower().str.strip() + ", " + + asset_list["POST CODE"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["POST CODE"].astype(str).str.lower().str.strip() elif ha_name == "HA39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -3426,9 +3432,9 @@ def app(): december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA32", "HA39", "HA107" + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107", ] - # Next HAs to do: 15[DONE], 32, 33, + # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], # Then: 28, 41, 38, 10, 14, 20, 48 # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From cb39590f618e7c6ff382e76cc461792101a9741a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 3 Mar 2024 15:48:05 +0000 Subject: [PATCH 062/155] debugging matching for HA28 --- .../ha_15_32/ha_analysis_batch_3.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9bd04884..7481724b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -214,6 +214,13 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA28": + asset_list["matching_address"] = ( + asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + + asset_list["Street 1"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA32": asset_list["matching_address"] = ( asset_list["Dwelling num"].astype(str).str.lower().str.strip() + ", " + @@ -323,6 +330,8 @@ class DataLoader: asset_list["HouseNo"] = asset_list["House No"].copy() elif ha_name == "HA32": asset_list["HouseNo"] = asset_list["Dwelling num"].copy() + elif ha_name == "HA28": + asset_list["HouseNo"] = asset_list["House Number"].copy() else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) @@ -371,6 +380,8 @@ class DataLoader: def get_asset_sheetname(workbook): if "Asset List" in workbook.sheetnames: return "Asset List" + elif "Asset list" in workbook.sheetnames: + return "Asset list" elif "Asset" in workbook.sheetnames and "Assets" not in workbook.sheetnames: return "Asset" elif "Decent Homes Stock" in workbook.sheetnames: @@ -394,6 +405,8 @@ class DataLoader: def get_survey_sheetname(workbook): if "ECO Surveys" in workbook.sheetnames: return "ECO Surveys" + elif "ECO Survey" in workbook.sheetnames: + return "ECO Survey" else: return "ECO surveys" @@ -870,6 +883,12 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha28_survey_list(survey_list): + # Rename the "No" column to "No." to align with the other survey sheets + survey_list = survey_list.rename(columns={"NO ": "NO."}) + return survey_list + @staticmethod def correct_ha32_survey_list(survey_list): survey_list["Street / Block Name"] = np.where( @@ -1027,6 +1046,10 @@ class DataLoader: asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip()) ].copy() + if str(house_number) not in df["matching_address"].values: + if "flat" in str(house_number): + house_number = house_number.split("flat")[1].strip() + df = df[df["matching_address"].str.contains(str(house_number))] if df.empty: From 0909b811ee7aea834784f0deb947308593ce7cdd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 3 Mar 2024 15:57:49 +0000 Subject: [PATCH 063/155] fixed matching for ha28 --- .../ha_15_32/ha_analysis_batch_3.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7481724b..b954a651 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -887,6 +887,27 @@ class DataLoader: def correct_ha28_survey_list(survey_list): # Rename the "No" column to "No." to align with the other survey sheets survey_list = survey_list.rename(columns={"NO ": "NO."}) + + survey_list["Post Code"] = np.where( + survey_list["Post Code"] == "ME75HA", + "ME7 5HA", + survey_list["Post Code"] + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ANDREW MANOR/BRITTON ST", "ANDREW MANOR" + ) + + survey_list["Post Code"] = np.where( + survey_list["Post Code"] == "ME75TW", + "ME7 5TW", + survey_list["Post Code"] + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ST MARKS HOUSE/SAXON ST", "ST MARKS HOUSE" + ) + return survey_list @staticmethod @@ -1046,7 +1067,7 @@ class DataLoader: asset_list["matching_address"].str.contains(row["Street / Block Name"].lower().strip()) ].copy() - if str(house_number) not in df["matching_address"].values: + if not any(df["matching_address"].str.contains(str(house_number))): if "flat" in str(house_number): house_number = house_number.split("flat")[1].strip() From 87c77e53c03ec83286718d6ef6bb5593466a48b1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 3 Mar 2024 16:22:42 +0000 Subject: [PATCH 064/155] handing facts and figures for ha28 --- .../ha_15_32/ha_analysis_batch_3.py | 92 +++++++++++-------- 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index b954a651..3ded09ba 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -398,6 +398,8 @@ class DataLoader: return "CIGA Checks" elif "CIGA checks" in workbook.sheetnames: return "CIGA checks" + elif "CIGA check" in workbook.sheetnames: + return "CIGA check" else: return "CIGA" @@ -1318,14 +1320,16 @@ class DataLoader: "ECO4 A/W": "ECO4", "ECO4 GBIS (ECO+)": "GBIS", "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS", - "ECO4 AFFORDABLE WARMTH": "ECO4" + "ECO4 AFFORDABLE WARMTH": "ECO4", + "Affordable Warmth": "ECO4" } eco_eligibility_map = { "not eligble": "not eligible", "eco 4(subject to ciga)": "eco4 (subject to ciga)", "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga)", - "eco4 (subject to archetype check)": "eco4" + "eco4 (subject to archetype check)": "eco4", + "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga)", } ha_facts_and_figures = [] @@ -1384,46 +1388,56 @@ class DataLoader: sales_report = {} if not survey_list.empty: scheme_column = survey_list.columns[0] - # We clean up the survey list installation or cancelled - survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() - # Remove all punctuation - survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( - r'[^\w\s]', '', regex=True - ) - # Remove double spaces - survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( - r'\s+', ' ', regex=True - ) - # Remove trailing spaces - survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() - # Remap the values in the scheme column survey_list[scheme_column] = survey_list[scheme_column].replace(scheme_map) + # We clean up the survey list installation or cancelled + if "INSTALLED OR CANCELLED" in survey_list.columns: + survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() + # Remove all punctuation + survey_list["installed_or_cancelled_clean"] = survey_list[ + "installed_or_cancelled_clean"].str.replace( + r'[^\w\s]', '', regex=True + ) + # Remove double spaces + survey_list["installed_or_cancelled_clean"] = survey_list[ + "installed_or_cancelled_clean"].str.replace( + r'\s+', ' ', regex=True + ) + # Remove trailing spaces + survey_list["installed_or_cancelled_clean"] = survey_list[ + "installed_or_cancelled_clean"].str.strip() - survey_list["installation_status"] = None - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), - "installed", - survey_list["installation_status"] - ) - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), - "cancelled", - survey_list["installation_status"] - ) - # Find partial installations - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), - "partially installed", - survey_list["installation_status"] - ) - # Find partial cancellations - # TODO: We might have more indications of partial cancellations - survey_list["installation_status"] = np.where( - survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), - "partially cancelled", - survey_list["installation_status"] - ) + survey_list["installation_status"] = None + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), + "installed", + survey_list["installation_status"] + ) + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), + "cancelled", + survey_list["installation_status"] + ) + # Find partial installations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), + "partially installed", + survey_list["installation_status"] + ) + # Find partial cancellations + # TODO: We might have more indications of partial cancellations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), + "partially cancelled", + survey_list["installation_status"] + ) + else: + # We have some examples, e.g. HA28, where we do not have the installed or cancelled column + survey_list["installation_status"] = np.where( + survey_list['INSTALL/ CANCELLATION DATE'].str.lower().str.contains("cancelled"), + "cancelled", + "installed", + ) # Finally, for other cases, we set the status to "in progress" survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") From f8948ff60f9e00d9501bd2f71f4269152cf3ab51 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 3 Mar 2024 16:47:10 +0000 Subject: [PATCH 065/155] ha38 wip: --- .../ha_15_32/ha_analysis_batch_3.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3ded09ba..4af7d9b9 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -234,6 +234,13 @@ class DataLoader: asset_list["POST CODE"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["POST CODE"].astype(str).str.lower().str.strip() + elif ha_name == "HA38": + asset_list["matching_address"] = asset_list["House_Number"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address_Line_1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address_Line_2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address_Line_3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA39": # Create matching_address by concatenating add_1, add_2, add_3, add_4, add_5, post_code asset_list["matching_address"] = asset_list["add_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -332,6 +339,8 @@ class DataLoader: asset_list["HouseNo"] = asset_list["Dwelling num"].copy() elif ha_name == "HA28": asset_list["HouseNo"] = asset_list["House Number"].copy() + elif ha_name == "HA38": + asset_list["HouseNo"] = asset_list["House_Number"].copy() else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) @@ -912,6 +921,12 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha38_survey_list(survey_list): + # Rename the "No" column to "No." to align with the other survey sheets + survey_list = survey_list.rename(columns={"NO ": "NO."}) + return survey_list + @staticmethod def correct_ha32_survey_list(survey_list): survey_list["Street / Block Name"] = np.where( @@ -3490,10 +3505,11 @@ def app(): december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA38", "HA39", "HA107", ] # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], - # Then: 28, 41, 38, 10, 14, 20, 48 + # Then: 28 [DONE], + # 38, 41, 10, 14, 20, 48 # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From abe0e627dbe1c89209de2f867c2abe4eef419d2e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 10:24:42 +0000 Subject: [PATCH 066/155] Fixing bug with gbis remaining counts --- .../ha_15_32/ha_analysis_batch_3.py | 266 ++++++++++++------ 1 file changed, 184 insertions(+), 82 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 4af7d9b9..6d1a3b45 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -424,6 +424,12 @@ class DataLoader: def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) asset_sheetname = self.get_asset_sheetname(workbook) + + # TODO: TEMP + sheetnames_lower = [x.lower() for x in workbook.sheetnames] + if any("eco3" in x for x in sheetnames_lower): + raise Exception("REMOVE ME") + asset_sheet = workbook[asset_sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] if ha_name == "HA25": @@ -569,6 +575,34 @@ class DataLoader: ) return asset_list + @staticmethod + def correct_ha38_asset_list(asset_list): + # For Kingsford court, the house number is at the end of the address + def rearrange_address_if_flat(address): + if '/flat' in address.lower(): + parts = address.split('/flat', 1) + return f"FLAT{parts[1]}, {parts[0]}" + return address + + def extract_house_no_if_flat(address): + if '/flat' in address.lower(): + # Attempt to extract the house number following "/flat" + try: + house_no = address.split('/flat ')[1].split(' ')[0] + # Remove trailing comma + house_no = house_no.replace(",", "") + except IndexError: + house_no = None + return house_no + return None + + asset_list['ExtractedHouseNo'] = asset_list['matching_address'].apply(extract_house_no_if_flat) + asset_list.loc[asset_list['ExtractedHouseNo'].notnull(), 'HouseNo'] = asset_list['ExtractedHouseNo'] + asset_list['matching_address'] = asset_list['matching_address'].apply(rearrange_address_if_flat) + # We then need to + + return asset_list + @staticmethod def correct_ha6_survey_list(survey_list): @@ -925,6 +959,11 @@ class DataLoader: def correct_ha38_survey_list(survey_list): # Rename the "No" column to "No." to align with the other survey sheets survey_list = survey_list.rename(columns={"NO ": "NO."}) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + 'Kingsford Court, Coombe Valley Road', 'Kingsford Court' + ) + return survey_list @staticmethod @@ -1345,6 +1384,7 @@ class DataLoader: "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga)", "eco4 (subject to archetype check)": "eco4", "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga)", + "eco4 (subject to ciga)": "eco4 (subject to ciga)" } ha_facts_and_figures = [] @@ -2943,8 +2983,8 @@ def forecast_remaining_sales(loader): median_ciga_success_rate = ciga_passrates["# CIGA passed"].sum() / ciga_passrates["# CIGA dependent"].sum() # 3) Calculate the conversion rate of an ECO4 and a GBISjob, that doesn't need ciga, to install - eco4_ciga_independent_passrates = [] - gbis_ciga_independent_passrates = [] + eco4_ciga_independent_to_install = [] + gbis_to_install = [] for ha_name, input_data in loader.data.items(): asset_list = input_data["asset_list"].copy() survey_list = input_data["survey_list"].copy() @@ -2973,7 +3013,7 @@ def forecast_remaining_sales(loader): ) ] - eco4_ciga_independent_passrates.append( + eco4_ciga_independent_to_install.append( { "Ha Name": ha_name, "# ECO4 at install stage": typical_eco4_installed.shape[0], @@ -2993,7 +3033,7 @@ def forecast_remaining_sales(loader): ) ] - gbis_ciga_independent_passrates.append( + gbis_to_install.append( { "Ha Name": ha_name, "# GBIS at install stage": typical_gbis_installed.shape[0], @@ -3001,33 +3041,33 @@ def forecast_remaining_sales(loader): } ) - eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates) - gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates) + eco4_ciga_independent_to_install = pd.DataFrame(eco4_ciga_independent_to_install) + gbis_to_install = pd.DataFrame(gbis_to_install) - eco4_ciga_independent_passrates["conversion"] = ( - eco4_ciga_independent_passrates["# ECO4 successfully installed"] / - eco4_ciga_independent_passrates["# ECO4 at install stage"] + eco4_ciga_independent_to_install["conversion"] = ( + eco4_ciga_independent_to_install["# ECO4 successfully installed"] / + eco4_ciga_independent_to_install["# ECO4 at install stage"] ) - eco4_ciga_independent_passrates_clipped = eco4_ciga_independent_passrates[ - eco4_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound + eco4_ciga_independent_to_install_clipped = eco4_ciga_independent_to_install[ + eco4_ciga_independent_to_install["conversion"] >= sales_conversion_lower_bound ] - gbis_ciga_independent_passrates["conversion"] = ( - gbis_ciga_independent_passrates["# GBIS successfully installed"] / - gbis_ciga_independent_passrates["# GBIS at install stage"] + gbis_to_install["conversion"] = ( + gbis_to_install["# GBIS successfully installed"] / + gbis_to_install["# GBIS at install stage"] ) - gbis_ciga_independent_passrates_clipped = gbis_ciga_independent_passrates[ - gbis_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound + gbis_to_install_clipped = gbis_to_install[ + gbis_to_install["conversion"] >= sales_conversion_lower_bound ] median_eco4_to_install = ( - eco4_ciga_independent_passrates_clipped["# ECO4 successfully installed"].sum() / - eco4_ciga_independent_passrates_clipped["# ECO4 at install stage"].sum() + eco4_ciga_independent_to_install_clipped["# ECO4 successfully installed"].sum() / + eco4_ciga_independent_to_install_clipped["# ECO4 at install stage"].sum() ) median_gbis_to_install = ( - gbis_ciga_independent_passrates_clipped["# GBIS successfully installed"].sum() / - gbis_ciga_independent_passrates_clipped["# GBIS at install stage"].sum() + gbis_to_install_clipped["# GBIS successfully installed"].sum() / + gbis_to_install_clipped["# GBIS at install stage"].sum() ) # Produce the final output @@ -3044,29 +3084,26 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): - # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0] original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0] + original_warmfront_sold_eco4 = ( + original_warmfront_estimates["No. of Tech surveys complete - Eco 4"].values[0] * eco4_rate + ) - # original_warmfront_eco4_revenue = ( - # original_warmfront_remaining_eco4 * eco4_rate + - # (original_warmfront_eco4 - original_warmfront_remaining_eco4) * old_eco4_rate - # ) original_warmfront_eco4_revenue = original_warmfront_eco4 * eco4_rate original_warmfront_remaining_eco4_revenue = original_warmfront_remaining_eco4 * eco4_rate + original_warmfront_sold_gbis = ( + original_warmfront_estimates["No. of Tech surveys complete - GBIS"].values[0] * gbis_rate + ) # Original warmfront figures - GBIS original_warmfront_gbis = original_warmfront_estimates["GBIS"].values[0] original_warmfront_remaining_gbis = original_warmfront_estimates["GBIS remaining"].values[0] - # original_warmfront_gbis_revenue = ( - # original_warmfront_remaining_gbis * gbis_rate + - # (original_warmfront_gbis - original_warmfront_remaining_gbis) * old_gbis_rate - # ) original_warmfront_gbis_revenue = ( original_warmfront_gbis * gbis_rate ) @@ -3123,7 +3160,7 @@ def forecast_remaining_sales(loader): # We also need the ha ciga passed to install success rate ha_ciga_pass_to_sale = converted_ciga_jobs[converted_ciga_jobs["HA Name"] == ha_name] - if not ha_ciga_pass_to_sale.empty: + if not ha_ciga_pass_to_sale.empty and ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] != 0: ha_ciga_pass_to_sale_rate = ( ha_ciga_pass_to_sale["# Ciga dependent successfully installed"].values[0] / ha_ciga_pass_to_sale["# Ciga dependent at installation"].values[0] @@ -3131,7 +3168,9 @@ def forecast_remaining_sales(loader): else: ha_ciga_pass_to_sale_rate = median_ciga_pass_to_install - ha_eco4_to_sale = eco4_ciga_independent_passrates[eco4_ciga_independent_passrates["Ha Name"] == ha_name] + ha_eco4_to_sale = eco4_ciga_independent_to_install_clipped[ + eco4_ciga_independent_to_install_clipped["Ha Name"] == ha_name + ] if not ha_eco4_to_sale.empty: ha_eco4_to_sale_rate = ( ha_eco4_to_sale['# ECO4 successfully installed'].values[0] / @@ -3149,12 +3188,6 @@ def forecast_remaining_sales(loader): eco4_rate=eco4_rate ) - # Calculate the delta compared to Warmfront's original estimate - eco4_delta_vs_original_estimate = ( - eco4_post_ciga_total_results[ - "ECO4 - post CIGA - #"] - original_warmfront_eco4 - ) / original_warmfront_eco4 - eco4_post_ciga_remaining_results = calculate_eco4_post_ciga( eligiblity_counts=eligiblity_counts_remaining, input_data=input_data, @@ -3164,10 +3197,18 @@ def forecast_remaining_sales(loader): eco4_rate=eco4_rate ) + # Calculate the delta compared to Warmfront's original remaining + if original_warmfront_remaining_eco4 == 0: + eco4_delta_vs_original_estimate_remaining = eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] + else: + eco4_delta_vs_original_estimate_remaining = ((eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] - + original_warmfront_remaining_eco4) / + original_warmfront_remaining_eco4) + # GBIS Figures # Estimate the GBIS conversion rate - ha_gbis_sale_conversion = gbis_ciga_independent_passrates[ - gbis_ciga_independent_passrates["Ha Name"] == ha_name + ha_gbis_sale_conversion = gbis_to_install_clipped[ + gbis_to_install_clipped["Ha Name"] == ha_name ] if not ha_gbis_sale_conversion.empty: @@ -3178,6 +3219,9 @@ def forecast_remaining_sales(loader): else: ha_gbis_sale_conversion = median_gbis_to_install + asset_list["ECO Eligibility"].value_counts() + asset_list_remaining["ECO Eligibility"].value_counts() + gbis_total = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "gbis" ]["count"].sum() @@ -3185,18 +3229,59 @@ def forecast_remaining_sales(loader): gbis_total_revenue = int(gbis_total * gbis_rate) gbis_remaining = eligiblity_counts_remaining[ - eligiblity_counts["ECO Eligibility"] == "gbis" + eligiblity_counts_remaining["ECO Eligibility"] == "gbis" ]["count"].sum() gbis_remaining = int(np.round(gbis_remaining * ha_gbis_sale_conversion)) gbis_remaining_revenue = int(gbis_remaining * gbis_rate) # GBIS delta - if original_warmfront_gbis == 0: - gbis_delta_vs_original_estimate = gbis_total + if original_warmfront_remaining_gbis == 0: + gbis_delta_vs_original_estimate_remaining = gbis_remaining else: - gbis_delta_vs_original_estimate = ( - gbis_total - original_warmfront_gbis - ) / original_warmfront_gbis + gbis_delta_vs_original_estimate_remaining = ( + (gbis_remaining - original_warmfront_remaining_gbis) / original_warmfront_remaining_gbis + ) + + # Current sales figures + # For any sales surveys that are complete, that could still cancel, we apply a conversion rate + eco4_actually_sold = 0 + gbis_actually_sold = 0 + if not survey_list.empty: + surveys_with_eligibility = survey_list.merge( + asset_list[["asset_list_row_id", "ECO Eligibility"]], + how="left", on="asset_list_row_id" + ) + completed_eco4_sales = surveys_with_eligibility[ + surveys_with_eligibility["installation_status"] == "ECO4 - installed" + ] + incomplete_eco4_sales = surveys_with_eligibility[ + (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") & + (~surveys_with_eligibility["ECO Eligibility"].isin( + ["eco4 - passed ciga"]) + ) + ] + incomplete_eco4_sales_ciga = surveys_with_eligibility[ + (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") & + (surveys_with_eligibility["ECO Eligibility"].isin( + ["eco4 - passed ciga"]) + ) + ] + + eco4_actually_sold = (completed_eco4_sales.shape[0] * eco4_rate) + ( + incomplete_eco4_sales.shape[0] * ha_eco4_to_sale_rate + + incomplete_eco4_sales_ciga.shape[0] * ha_ciga_pass_to_sale_rate + ) * eco4_rate + + completed_gbis_sales = surveys_with_eligibility[ + surveys_with_eligibility["installation_status"] == "GBIS - installed" + ] + incomplete_gbis_sales = surveys_with_eligibility[ + (surveys_with_eligibility["installation_status"] == "GBIS - in progress") + ] + + gbis_actually_sold = completed_gbis_sales.shape[0] * gbis_rate + ( + incomplete_gbis_sales.shape[0] * ha_gbis_sale_conversion * gbis_rate + ) to_append = { ("", "", "", "HA Name"): ha_name, @@ -3204,29 +3289,33 @@ def forecast_remaining_sales(loader): ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, ("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4, ("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue, + ("ECO4 original", "", "Sold - £", ""): original_warmfront_sold_eco4, ("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, # GBIS - original warmfront figures ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis, ("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis, ("GBIS original", "", "Total - £", ""): original_warmfront_gbis_revenue, + ("GBIS original", "", "Sold - £", ""): original_warmfront_sold_gbis, ("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue, # ECO4 - asset list, pre-ciga ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, + ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, # ECO4 - asset list, post ciga, total - ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total (post-ciga)"): + ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"): eco4_post_ciga_total_results[ "ECO4 - post CIGA - #"], ("ECO4 post-ciga", "", "Estimated total eligible - £", ""): eco4_post_ciga_total_results[ "ECO4 - post CIGA - £"], - ("ECO4 post-ciga", "", "Delta vs original estimate - %", ""): eco4_delta_vs_original_estimate, # ECO4 - asset list, post ciga, remaining ("ECO4 post-ciga", "", "Estimated remaining eligible - #", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - #"], ("ECO4 post-ciga", "", "Estimated remaining eligible - £", ""): eco4_post_ciga_remaining_results[ "ECO4 - post CIGA - £"], + ("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", + ""): eco4_delta_vs_original_estimate_remaining, ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - #", ""): eco4_post_ciga_remaining_results["Of which confirmed - #"], ("ECO4 post-ciga", "", "Of which - confirmed (post CIGA or no CIGA required) - £", ""): @@ -3257,13 +3346,15 @@ def forecast_remaining_sales(loader): # GBIS postcode list ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, - ("GBIS Postcode list", "", "Delta vs original estimate - %", ""): gbis_delta_vs_original_estimate, + ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold, ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, + ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""): + gbis_delta_vs_original_estimate_remaining, } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 33: + if len(to_append) != 37: raise ValueError("Something went wrong") results.append(to_append) @@ -3275,26 +3366,26 @@ def forecast_remaining_sales(loader): if col == ('', '', '', 'HA Name'): totals_row[col] = "Total" elif col in [ - ("ECO4 post-ciga", "", "Delta vs original estimate - %", ""), - ("GBIS Postcode list", "", "Delta vs original estimate - %", "") + ("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", ""), + ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "") ]: totals_row[col] = None else: totals_row[col] = results[col].sum() # For the delta columns, we calculate the delta on the totals - totals_row[("ECO4 post-ciga", "", "Delta vs original estimate - %", "")] = ( + totals_row[("ECO4 post-ciga", "", "Delta vs original estimate, remaining - %", "")] = ( ( - totals_row[("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total (post-ciga)")] - - totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")] - ) / totals_row[("", "Original Warmfront estimate", "Total - #", "ECO4 - November")] + totals_row[("ECO4 post-ciga", "", "Estimated remaining eligible - #", "")] - + totals_row[("ECO4 original", "", "Remaining - #", "")] + ) / totals_row[("ECO4 original", "", "Remaining - #", "")] ) - totals_row[("GBIS Postcode list", "", "Delta vs original estimate - %", "")] = ( + totals_row[("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", "")] = ( ( - totals_row[("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total")] - - totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")] - ) / totals_row[("", "Original Warmfront estimate", "Total - #", "GBIS - November")] + totals_row[("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total")] - + totals_row[("GBIS original", "", "Remaining - #", "")] + ) / totals_row[("GBIS original", "", "Remaining - #", "")] ) blank_row = pd.DataFrame([{col: "" for col in results.columns}]) @@ -3342,6 +3433,15 @@ def forecast_remaining_sales(loader): ) headline_total_delta = round(headline_total_delta, 1) + headline_eco4_sold_since_november = ( + totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] - totals_row[('ECO4 original', '', 'Sold - £', '')] + ) + + headline_gbis_sold_since_november = ( + totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] - + totals_row[('GBIS original', '', 'Sold - £', '')] + ) + headlines = [ { ("", "", "", "HA Name"): "Headlines", @@ -3358,16 +3458,22 @@ def forecast_remaining_sales(loader): "ECO4 - November"): headline_eco4_original_remaining_revenue }, { - ("", "", "", "HA Name"): "ECO4 Remaining - postcode list - #", + ("", "", "", "HA Name"): "ECO4 Sold since November - £", + ( + "", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_eco4_sold_since_november + }, + { + ("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - #", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining }, { - ("", "", "", "HA Name"): "ECO4 Remaining - postcode list - £", + ("", "", "", "HA Name"): "ECO4 Remaining - postcode list (post CIGA) - £", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_postcode_list_remaining_revenue }, { - ("", "", "", "HA Name"): "ECO4 delta %", + ("", "", "", "HA Name"): "ECO4 £ remaining delta - %", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str(headline_eco4_delta) + "%" }, { @@ -3380,6 +3486,12 @@ def forecast_remaining_sales(loader): "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_original_remaining_revenue }, + { + ("", "", "", "HA Name"): "GBIS Sold since November - £", + ( + "", "Original Warmfront estimate", "Total - #", + "ECO4 - November"): headline_gbis_sold_since_november + }, { ("", "", "", "HA Name"): "GBIS Remaining - post code list - #", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_postcode_list_remaining @@ -3400,7 +3512,7 @@ def forecast_remaining_sales(loader): "ECO4 - November"): headline_original_total_revenue_remaining }, { - ("", "", "", "HA Name"): "Total Remaining - post code list - £", + ("", "", "", "HA Name"): "Total Remaining - post code list (post CIGA) - £", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_postcode_list_total_revenue_remaining }, @@ -3440,14 +3552,16 @@ def forecast_remaining_sales(loader): ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( round(median_eco4_to_install * 100, 1)) + "%", ("ECO4 original", "", "Remaining - #", - ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Job must not cancel" + ""): " - Sales conversion rate for a ECO4 property that didn't need a CIGA check. Surveys that resulted " + "in cancelled install are excluded." }, { ("", "", "", "HA Name"): "Median ECO4 (subect to CIGA) sales conversion rate", ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): str( round(median_ciga_pass_to_install * 100, 1)) + "%", ("ECO4 original", "", "Remaining - #", - ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Job must not cancel" + ""): " - Sales conversion rate for a ECO4 property that passed a CIGA check. Surveys that resulted in " + "cancelled installs are excluded." } ] @@ -3462,23 +3576,7 @@ def forecast_remaining_sales(loader): pd.DataFrame(assumptions) ] ) - - # header_rows = [ - # [name[0] for name in results.columns.values], - # [name[1] for name in results.columns.values], - # [name[2] for name in results.columns.values], - # [name[3] for name in results.columns.values] - # ] - - # Step 2: Write the transformed header and DataFrame data to CSV. - # Open the file in write mode. - import csv with open("HA Remaining Analysis.csv", "w", newline="") as file: - # writer = csv.writer(file) - - # Write the header rows. - # writer.writerows(header_rows) - # Write the DataFrame data without the index (adjust if you want the index). results.to_csv(file, header=True, index=False) @@ -3504,8 +3602,12 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" + # priority_has = [ + # "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA38", "HA39", "HA107", + # ] + # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA38", "HA39", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA39", "HA107", ] # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], # Then: 28 [DONE], From 5b32ac8aad59b1942f80a399d072486ab6db9ec3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 10:59:07 +0000 Subject: [PATCH 067/155] handling case where property is marked as gbis but sold for ECO4 --- .../ha_15_32/ha_analysis_batch_3.py | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 6d1a3b45..7bfbd7f5 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1509,11 +1509,12 @@ class DataLoader: } # We find some cases where properties have sold but are missing CIGA checks - survey_list_to_merge = survey_list[["asset_list_row_id"]].copy() + survey_list_to_merge = survey_list[["asset_list_row_id", "installation_status"]].copy() survey_list_to_merge["has_a_survey_record"] = True survey_list_to_merge = survey_list_to_merge[~pd.isnull(survey_list_to_merge["asset_list_row_id"])] asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id") + # Update the cases where properties have sold, but are missing a CIGA check asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "eco4 (subject to ciga)") & ( asset_list["has_a_survey_record"] == True @@ -1521,6 +1522,17 @@ class DataLoader: "eco4 - passed ciga", asset_list["ECO Eligibility"] ) + # Update the cases where a property has been marked as eligible for GBIS, but sold for ECO4 + asset_list["ECO Eligibility"] = np.where( + (asset_list["ECO Eligibility"] == "gbis") & ( + asset_list["installation_status"].isin( + ["ECO4 - installed", "ECO4 - cancelled"] + ) + ), + "eco4", + asset_list["ECO Eligibility"] + ) + asset_list = asset_list.drop(columns=["has_a_survey_record"]) # Update the survey list with installation status @@ -3199,7 +3211,7 @@ def forecast_remaining_sales(loader): # Calculate the delta compared to Warmfront's original remaining if original_warmfront_remaining_eco4 == 0: - eco4_delta_vs_original_estimate_remaining = eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] + eco4_delta_vs_original_estimate_remaining = "N/A" else: eco4_delta_vs_original_estimate_remaining = ((eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] - original_warmfront_remaining_eco4) / @@ -3219,9 +3231,6 @@ def forecast_remaining_sales(loader): else: ha_gbis_sale_conversion = median_gbis_to_install - asset_list["ECO Eligibility"].value_counts() - asset_list_remaining["ECO Eligibility"].value_counts() - gbis_total = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "gbis" ]["count"].sum() @@ -3236,7 +3245,7 @@ def forecast_remaining_sales(loader): # GBIS delta if original_warmfront_remaining_gbis == 0: - gbis_delta_vs_original_estimate_remaining = gbis_remaining + gbis_delta_vs_original_estimate_remaining = "N/A" else: gbis_delta_vs_original_estimate_remaining = ( (gbis_remaining - original_warmfront_remaining_gbis) / original_warmfront_remaining_gbis From 9d26c94ae571ce1ba5363e9c850b8017f110bc9d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 11:35:14 +0000 Subject: [PATCH 068/155] removed stray comma causing bugs --- .../ha_15_32/ha_analysis_batch_3.py | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7bfbd7f5..e58c7799 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1526,14 +1526,40 @@ class DataLoader: asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "gbis") & ( asset_list["installation_status"].isin( - ["ECO4 - installed", "ECO4 - cancelled"] + ["ECO4 - installed", "ECO4 - cancelled", "ECO4 - in progress"] ) ), "eco4", asset_list["ECO Eligibility"] ) + # Update the cases where a property was marked as eligible for ECO4, but sold for GBIS + asset_list["ECO Eligibility"] = np.where( + (asset_list["ECO Eligibility"].isin( + ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + )) & ( + asset_list["installation_status"].isin( + ["GBIS - installed", "GBIS - cancelled", "GBIS - in progress"] + ) + ), + "gbis", + asset_list["ECO Eligibility"] + ) + # Update the cases where a property is marked as not eligible, but sold for GBIS + if ((asset_list["ECO Eligibility"] == "not eligible") & ( + asset_list["installation_status"].isin( + ["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"] + ))).sum(): + bah + asset_list["ECO Eligibility"] = np.where( + (asset_list["ECO Eligibility"] == "not eligible") & ( + asset_list["installation_status"].isin( + ["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"] + )), + "gbis", + asset_list["ECO Eligibility"] + ) - asset_list = asset_list.drop(columns=["has_a_survey_record"]) + asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"]) # Update the survey list with installation status self.data[ha_name]["survey_list"] = survey_list @@ -2897,8 +2923,6 @@ def forecast_remaining_sales(loader): gbis_rate = 600 eco4_rate = 1710 - # old_gbis_rate = 432 - # old_eco4_rate = 1456 # 1) Calculate the conversion rate from passed CIGA to actual sale converted_ciga_jobs = [] From a70260f128aec2785a8000669dc981d8220505a3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 11:55:02 +0000 Subject: [PATCH 069/155] Update how we handle partially completed jobs --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index e58c7799..060539e1 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1476,7 +1476,7 @@ class DataLoader: # Find partial installations survey_list["installation_status"] = np.where( survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), - "partially installed", + "in progress", survey_list["installation_status"] ) # Find partial cancellations @@ -1550,6 +1550,7 @@ class DataLoader: ["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"] ))).sum(): bah + asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "not eligible") & ( asset_list["installation_status"].isin( @@ -1559,6 +1560,15 @@ class DataLoader: asset_list["ECO Eligibility"] ) + # Update the cases where a property is marked as not eligible, but sold for ECO4 + asset_list["ECO Eligibility"] = np.where( + (asset_list["ECO Eligibility"] == "not eligible") & ( + asset_list["installation_status"].isin( + ["ECO4 - in progress", "ECO4 - installed", "ECO4 - cancelled"] + ) + ) + ) + asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"]) # Update the survey list with installation status From 4cc467e5142c7eba903d2819d59229643cf93e03 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 11:57:03 +0000 Subject: [PATCH 070/155] fix bug in updating eligibility for initially non-eligible rows --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 060539e1..8c03b1ef 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1545,12 +1545,6 @@ class DataLoader: asset_list["ECO Eligibility"] ) # Update the cases where a property is marked as not eligible, but sold for GBIS - if ((asset_list["ECO Eligibility"] == "not eligible") & ( - asset_list["installation_status"].isin( - ["GBIS - in progress", "GBIS - installed", "GBIS - cancelled"] - ))).sum(): - bah - asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"] == "not eligible") & ( asset_list["installation_status"].isin( @@ -1566,7 +1560,9 @@ class DataLoader: asset_list["installation_status"].isin( ["ECO4 - in progress", "ECO4 - installed", "ECO4 - cancelled"] ) - ) + ), + "eco4", + asset_list["ECO Eligibility"] ) asset_list = asset_list.drop(columns=["has_a_survey_record", "installation_status"]) From 5e991547f7239cf5a84f8e5824d4d9379b825a2a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 14:08:05 +0000 Subject: [PATCH 071/155] debuging variances, fixed usage of 75% ciga pass rate --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 8c03b1ef..91c198b1 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3196,8 +3196,8 @@ def forecast_remaining_sales(loader): ) else: ha_ciga_conversion_rate = ( - median_ciga_success_rate if median_ciga_success_rate <= median_ciga_success_rate else - median_ciga_success_rate + median_ciga_success_rate if median_ciga_success_rate <= maximum_ciga_conversion else + maximum_ciga_conversion ) # We also need the ha ciga passed to install success rate From d35d8ea8457ce128ac1fe0c51abd9f83f4e3acaa Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 14:14:50 +0000 Subject: [PATCH 072/155] fixed but in eligibility counts remaining --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 91c198b1..1e2c5d92 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3177,7 +3177,7 @@ def forecast_remaining_sales(loader): ]["count"].sum() eco4_pre_ciga_remaining = eligiblity_counts_remaining[ - eligiblity_counts["ECO Eligibility"].isin( + eligiblity_counts_remaining["ECO Eligibility"].isin( ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] ) ]["count"].sum() From 680f38963a874eef548883d8f0f365f7958d42b1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 15:01:33 +0000 Subject: [PATCH 073/155] Added variance columns to output --- .../ha_15_32/ha_analysis_batch_3.py | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1e2c5d92..d4c3f74f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2859,21 +2859,30 @@ def calculate_eco4_post_ciga( eligiblity_counts["ECO Eligibility"] == "failed ciga" ]["count"].sum() + eco4_no_ciga_needed_or_ciga_passed = eco4_no_ciga_needed + eco4_ciga_passed + eco4_confirmed = (eco4_no_ciga_needed * ha_eco4_to_sale_rate) + (eco4_ciga_passed * ha_ciga_pass_to_sale_rate) eco4_confirmed = np.round(eco4_confirmed) + eco4_no_ciga_needed_cancellations = int(eco4_no_ciga_needed_or_ciga_passed - eco4_confirmed) + if remaining_needing_ciga_check > 0: # We update the eco4 post ciga with the converted remaining eco4_ciga_expected_remaining_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) + eco4_remaining_forecast = np.round( eco4_ciga_expected_remaining_to_pass * ha_ciga_pass_to_sale_rate ) + eco4_ciga_needed_cancellations = eco4_ciga_expected_remaining_to_pass - eco4_remaining_forecast eco4_estimated_ciga_failures = remaining_needing_ciga_check - eco4_ciga_expected_remaining_to_pass eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast else: eco4_remaining_forecast = 0 eco4_estimated_ciga_failures = 0 + eco4_ciga_needed_cancellations = 0 eco4_post_ciga = eco4_confirmed + + eco4_expected_cancellations = eco4_no_ciga_needed_cancellations + eco4_ciga_needed_cancellations else: eco4_no_ciga_needed = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "eco4" @@ -2881,14 +2890,18 @@ def calculate_eco4_post_ciga( eco4_confirmed_ciga_failures = 0 # Multiply by sale conversion eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate) + eco4_no_ciga_cancellations = int(eco4_no_ciga_needed - eco4_confirmed) eco4_ciga_expected_remaining_to_pass = np.round(remaining_needing_ciga_check * ha_ciga_conversion_rate) eco4_estimated_ciga_failures = remaining_needing_ciga_check - eco4_ciga_expected_remaining_to_pass eco4_remaining_forecast = np.round( eco4_ciga_expected_remaining_to_pass * ha_ciga_pass_to_sale_rate ) + eco4_ciga_cancellations = int(eco4_ciga_expected_remaining_to_pass - eco4_remaining_forecast) eco4_post_ciga = eco4_confirmed + eco4_remaining_forecast + eco4_expected_cancellations = eco4_no_ciga_cancellations + eco4_ciga_cancellations + eco4_post_ciga = int(eco4_post_ciga) eco4_remaining_forecast = int(eco4_remaining_forecast) eco4_confirmed = int(eco4_confirmed) @@ -2912,6 +2925,9 @@ def calculate_eco4_post_ciga( ), "Confirmed CIGA failures - £": int(eco4_confirmed_ciga_failures * eco4_rate), "Estimated CIGA failures - £": int(eco4_estimated_ciga_failures * eco4_rate), + # Expected cencellations + "Expected cancellations - #": eco4_expected_cancellations, + "Expected cancellations - £": eco4_expected_cancellations * eco4_rate } return results @@ -3322,6 +3338,28 @@ def forecast_remaining_sales(loader): incomplete_gbis_sales.shape[0] * ha_gbis_sale_conversion * gbis_rate ) + # Add in the variance: + # We should expect that the pre-ciga total is: + # 1) The number of post CIGA successes + + # 2) the number of CIGA failures + + # 3) The number of cancellations + variance_total = eco4_pre_ciga - ( + eco4_post_ciga_total_results["ECO4 - post CIGA - #"] + + eco4_post_ciga_total_results['Estimated total - failed CIGA'] + + eco4_post_ciga_total_results["Expected cancellations - #"] + ) + if variance_total != 0: + raise ValueError("Something went wrong in variance total") + + variance_remaining = eco4_pre_ciga_remaining - ( + eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] + + eco4_post_ciga_remaining_results['Estimated total - failed CIGA'] + + eco4_post_ciga_remaining_results["Expected cancellations - #"] + ) + + if variance_remaining != 0: + raise ValueError("Something went wrong in variance remaining") + to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures @@ -3340,6 +3378,8 @@ def forecast_remaining_sales(loader): ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, + ("ECO4 pre-ciga", "", "VARIANCE - TOTAL", ""): variance_total, + ("ECO4 pre-ciga", "", "VARIANCE - REMAINING", ""): variance_remaining, ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, # ECO4 - asset list, post ciga, total @@ -3382,6 +3422,13 @@ def forecast_remaining_sales(loader): ("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[ "Estimated CIGA failures - £" ], + # Expected ECO4 cancellations + ("ECO4 Cancellations", "", "Expected cancellations - #", ""): eco4_post_ciga_remaining_results[ + "Expected cancellations - #" + ], + ("ECO4 Cancellations", "", "Expected cancellations - £", ""): eco4_post_ciga_remaining_results[ + "Expected cancellations - £" + ], # GBIS postcode list ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, @@ -3393,7 +3440,7 @@ def forecast_remaining_sales(loader): } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 37: + if len(to_append) != 41: raise ValueError("Something went wrong") results.append(to_append) From e966dfdf6e785cbcc1e2245cce852e842d0def92 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 16:22:20 +0000 Subject: [PATCH 074/155] Adding cancellations to output --- .../ha_15_32/ha_analysis_batch_3.py | 68 +++++++++++++------ 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d4c3f74f..09b0910e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3301,6 +3301,10 @@ def forecast_remaining_sales(loader): # For any sales surveys that are complete, that could still cancel, we apply a conversion rate eco4_actually_sold = 0 gbis_actually_sold = 0 + eco4_confirmed_cancellations = 0 + eco4_expected_cancellations = 0 + gbis_confirmed_cancellations = 0 + gbis_expected_cancellations = 0 if not survey_list.empty: surveys_with_eligibility = survey_list.merge( asset_list[["asset_list_row_id", "ECO Eligibility"]], @@ -3308,34 +3312,54 @@ def forecast_remaining_sales(loader): ) completed_eco4_sales = surveys_with_eligibility[ surveys_with_eligibility["installation_status"] == "ECO4 - installed" - ] + ].shape[0] incomplete_eco4_sales = surveys_with_eligibility[ (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") & (~surveys_with_eligibility["ECO Eligibility"].isin( ["eco4 - passed ciga"]) ) - ] + ].shape[0] incomplete_eco4_sales_ciga = surveys_with_eligibility[ (surveys_with_eligibility["installation_status"] == "ECO4 - in progress") & (surveys_with_eligibility["ECO Eligibility"].isin( ["eco4 - passed ciga"]) ) - ] + ].shape[0] - eco4_actually_sold = (completed_eco4_sales.shape[0] * eco4_rate) + ( - incomplete_eco4_sales.shape[0] * ha_eco4_to_sale_rate + - incomplete_eco4_sales_ciga.shape[0] * ha_ciga_pass_to_sale_rate - ) * eco4_rate + eco4_confirmed_cancellations = surveys_with_eligibility[ + surveys_with_eligibility["installation_status"] == "ECO4 - cancelled" + ].shape[0] + + expected_eco4_sales_no_ciga = np.round(incomplete_eco4_sales * ha_eco4_to_sale_rate) + expected_eco4_sales_ciga = np.round(incomplete_eco4_sales_ciga * ha_ciga_pass_to_sale_rate) + + eco4_expected_cancellations = (incomplete_eco4_sales + incomplete_eco4_sales_ciga) - ( + expected_eco4_sales_no_ciga + expected_eco4_sales_ciga + ) + eco4_expected_cancellations = int(np.round(eco4_expected_cancellations)) + + eco4_actually_sold = eco4_rate * ( + completed_eco4_sales + expected_eco4_sales_no_ciga + expected_eco4_sales_ciga + ) completed_gbis_sales = surveys_with_eligibility[ surveys_with_eligibility["installation_status"] == "GBIS - installed" - ] + ].shape[0] incomplete_gbis_sales = surveys_with_eligibility[ (surveys_with_eligibility["installation_status"] == "GBIS - in progress") - ] + ].shape[0] - gbis_actually_sold = completed_gbis_sales.shape[0] * gbis_rate + ( - incomplete_gbis_sales.shape[0] * ha_gbis_sale_conversion * gbis_rate + # Get confirmed cancellations + gbis_confirmed_cancellations = surveys_with_eligibility[ + surveys_with_eligibility["installation_status"] == "GBIS - cancelled" + ].shape[0] + + expected_gbis_unconfirmed_sales = incomplete_gbis_sales * ha_gbis_sale_conversion + + gbis_expected_cancellations = int(incomplete_gbis_sales - expected_gbis_unconfirmed_sales) + + gbis_actually_sold = completed_gbis_sales * gbis_rate + ( + expected_gbis_unconfirmed_sales * gbis_rate ) # Add in the variance: @@ -3381,6 +3405,9 @@ def forecast_remaining_sales(loader): ("ECO4 pre-ciga", "", "VARIANCE - TOTAL", ""): variance_total, ("ECO4 pre-ciga", "", "VARIANCE - REMAINING", ""): variance_remaining, ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold, + ("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations, + # This is for jobs that are in-progress and could still cancel + ("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, # ECO4 - asset list, post ciga, total ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"): @@ -3403,6 +3430,13 @@ def forecast_remaining_sales(loader): eco4_post_ciga_remaining_results["Of which forecast - #"], ("ECO4 post-ciga", "", "Of which forecast - £", ""): eco4_post_ciga_remaining_results["Of which forecast - £"], + # Expected ECO4 cancellations + ("ECO4 Cancellations", "", "Of which expected cancellations - #", ""): eco4_post_ciga_remaining_results[ + "Expected cancellations - #" + ], + ("ECO4 Cancellations", "", "Of which expected cancellations - £", ""): eco4_post_ciga_remaining_results[ + "Expected cancellations - £" + ], # CIGA failures ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[ 'Estimated total - failed CIGA' @@ -3422,17 +3456,13 @@ def forecast_remaining_sales(loader): ("ECO4 CIGA failures", "", "Estimated failures - £", ""): eco4_post_ciga_remaining_results[ "Estimated CIGA failures - £" ], - # Expected ECO4 cancellations - ("ECO4 Cancellations", "", "Expected cancellations - #", ""): eco4_post_ciga_remaining_results[ - "Expected cancellations - #" - ], - ("ECO4 Cancellations", "", "Expected cancellations - £", ""): eco4_post_ciga_remaining_results[ - "Expected cancellations - £" - ], # GBIS postcode list ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold, + ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations, + # This is for jobs that are in-progress and could still cancel + ("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations, ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""): @@ -3440,7 +3470,7 @@ def forecast_remaining_sales(loader): } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 41: + if len(to_append) != 45: raise ValueError("Something went wrong") results.append(to_append) From e2055b3b7dde7a1b001a568c23bb3016fbfa4079 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 19:34:43 +0000 Subject: [PATCH 075/155] fixed variance for HA6 --- .../ha_15_32/ha_analysis_batch_3.py | 135 +++++++++++++++++- 1 file changed, 129 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 09b0910e..8c9f59c2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -730,6 +730,81 @@ class DataLoader: "Post Code" ] = "ST5 7BY" + # PERFORM ADDITIONAL DROPS + # Dropping rows based on multiple conditions + conditions_to_drop = [ + (survey_list['Street / Block Name'] == "Bedford Crescent") & (survey_list['Post Code'] == "ST5 3EH") & ( + survey_list['NO.'] == 23) & (survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), + (survey_list['Street / Block Name'] == "Hereford Avenue") & (survey_list['Post Code'] == "ST5 3EJ") & ( + survey_list['NO.'] == 92) & (survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), + (survey_list['Street / Block Name'] == "Seabridge Lane") & (survey_list['Post Code'] == "ST5 3EX") & ( + survey_list['NO.'].isin([16, 18, 42])) & ( + survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), + (survey_list['Street / Block Name'] == "ESKDALE PLACE") & (survey_list['Post Code'] == "ST5 3QW") & ( + survey_list['NO.'] == 5) & (survey_list['SUBMISSION DATE'].astype(str) == "2023-03-06 00:00:00"), + (survey_list['Street / Block Name'] == "Birch House road") & (survey_list['Post Code'] == "ST6 2LS") & ( + survey_list['NO.'].isin([56, 58])), + (survey_list['Street / Block Name'] == "Blackthorn Place") & (survey_list['Post Code'] == "ST6 2LS") & ( + survey_list['NO.'].isin([37, 39])), + (survey_list['Street / Block Name'] == "Whitethorn Way") & (survey_list['Post Code'] == "ST5 7BT") & ( + survey_list['NO.'].isin([17, 6])), + (survey_list['Street / Block Name'] == "Lion Grove") & (survey_list['Post Code'] == "ST5 7HQ") & ( + survey_list['NO.'].isin([10, 12])) & ( + survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), + (survey_list['Street / Block Name'] == "DENRY CRESCENT") & (survey_list['Post Code'] == "ST5 8JW") & ( + survey_list['NO.'] == 87) & (survey_list['INSTALLED OR CANCELLED'].str.contains("NO UPDATE YET")), + (survey_list['Street / Block Name'] == "HOLLINS CRESCENT") & (survey_list['Post Code'] == "ST7 1JW") & ( + survey_list['NO.'] == 19) + ] + + # Combine all conditions with an OR "|" + combined_condition = np.logical_or.reduce(conditions_to_drop) + + # Drop rows that meet the combined condition + survey_list = survey_list[~combined_condition] + + # Making replacements using np.where + survey_list['Post Code'] = np.where( + (survey_list['Street / Block Name'] == "Whitethorn Way") & (survey_list['Post Code'] == "ST5 3EH") & ( + survey_list['NO.'] == 17), + "ST5 7BT", + survey_list['Post Code'] + ) + + survey_list['Post Code'] = np.where( + (survey_list['Street / Block Name'] == "Whitethorn Way") & (survey_list['Post Code'] == "ST5 3ED") & ( + survey_list['NO.'] == 6), + "ST5 7BT", + survey_list['Post Code'] + ) + + # Maple avenue (stoke on trent, not newcastle) should be st7 1jw + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"].str.lower().str.contains("maple avenue")) & ( + survey_list["Post Code"].str.lower() == "st7 1jx" + ), + "st7 1jw", + survey_list["Post Code"] + ) + + # Hollins Crescent should be st7 1jx + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"].str.lower().str.contains("hollins crescent")) & ( + survey_list["Post Code"].str.lower() == "st7 1jw" + ), + "st7 1jx", + survey_list["Post Code"] + ) + + # Additional drops as the above misses some: + survey_list = survey_list[ + ~((survey_list["NO."].astype(str).isin(["18", "42"])) & + (survey_list["Street / Block Name"] == "Seabridge Lane") & + (survey_list["Post Code"] == "ST5 3EY") & + (survey_list["SUBMISSION DATE"].astype(str) == "24.07.2023") & + (survey_list["INSTALLED OR CANCELLED"].str.contains("NO UPDATE YET"))) + ] + return survey_list @staticmethod @@ -1176,6 +1251,11 @@ class DataLoader: if matching_lookup.shape[0] != survey_list.shape[0]: raise ValueError("Mismatch in the number of survey rows and matching lookup rows") + matching_lookup = matching_lookup[~pd.isnull(matching_lookup["asset_list_row_id"])] + + if matching_lookup["asset_list_row_id"].duplicated().sum(): + raise ValueError("Duplicated matches in survey list") + # Merge onto the survey list survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id") @@ -1483,7 +1563,7 @@ class DataLoader: # TODO: We might have more indications of partial cancellations survey_list["installation_status"] = np.where( survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), - "partially cancelled", + "cancelled", survey_list["installation_status"] ) else: @@ -3174,6 +3254,8 @@ def forecast_remaining_sales(loader): if survey_list.empty: asset_list_remaining = asset_list.copy() else: + # For HA6, there are a small number of postcodes that do not match to any item in the asset list + survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])] asset_list_remaining = asset_list.merge( survey_list[["asset_list_row_id", "installation_status"]], how="left", @@ -3183,6 +3265,47 @@ def forecast_remaining_sales(loader): asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"]) + # # TODO: TEMP + # n_pre_ciga = asset_list[ + # asset_list["ECO Eligibility"].isin( + # [ + # "eco4 - passed ciga", + # "eco4 (subject to ciga)", + # "failed ciga", + # "eco4" + # ] + # ) + # ].shape[0] + # + # n_pre_ciga_remaining = asset_list_remaining[ + # asset_list_remaining["ECO Eligibility"].isin( + # [ + # "eco4 - passed ciga", + # "eco4 (subject to ciga)", + # "failed ciga", + # "eco4" + # ] + # ) + # ].shape[0] + # + # compare_to_ids = asset_list_remaining["asset_list_row_id"].values + # assets_diff_ids = [x for x in asset_list["asset_list_row_id"].values if x not in compare_to_ids] + # diff = asset_list[asset_list["asset_list_row_id"].isin(assets_diff_ids)] + # + # n_sold = survey_list[survey_list["installation_status"].str.contains("ECO4")].shape[0] + # # cancellations = survey_list[] + # asset_list["ECO Eligibility"].value_counts() + # + # # Revenenue + # pre_ciga_revenue = n_pre_ciga * eco4_rate + # pre_ciga_remaining_revenue = n_pre_ciga_remaining * eco4_rate + # sold_revenue = n_sold * eco4_rate + # + # pre_ciga_revenue - (pre_ciga_remaining_revenue + sold_revenue) + # # MISSING 1 SALE from sold + # cancelled = survey_list[survey_list["installation_status"].str.contains("ECO4")].shape[0] + # # TODO: END TEMP + eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index() eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index() @@ -3402,13 +3525,13 @@ def forecast_remaining_sales(loader): ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, + ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, ("ECO4 pre-ciga", "", "VARIANCE - TOTAL", ""): variance_total, ("ECO4 pre-ciga", "", "VARIANCE - REMAINING", ""): variance_remaining, ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold, - ("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations, + ("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations * eco4_rate, # This is for jobs that are in-progress and could still cancel - ("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations, - ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, + ("ECO4 pre-ciga", "", "Unconfirmed cancellations - £", ""): eco4_expected_cancellations * eco4_rate, # ECO4 - asset list, post ciga, total ("ECO4 post-ciga", "", "Estimated total eligible - #", "ECO4 total"): eco4_post_ciga_total_results[ @@ -3460,9 +3583,9 @@ def forecast_remaining_sales(loader): ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold, - ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations, + ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate, # This is for jobs that are in-progress and could still cancel - ("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations, + ("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations * gbis_rate, ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""): From 21082d8d3779a75cae422becf1a6e589ebcbaba6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 19:46:28 +0000 Subject: [PATCH 076/155] fixed duplication variance for HA16 --- .../ha_15_32/ha_analysis_batch_3.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 8c9f59c2..7859d6d2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -960,6 +960,21 @@ class DataLoader: survey_list["NO."] ) + # Delete some duplicated entries + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "york road") & + (survey_list["NO."].astype(str) == "12") & + (survey_list["Post Code"] == "M44 5HU") & + (survey_list["SUBMISSION DATE"].astype(str) == "45229")) + ] + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "peatfield avenue") & + (survey_list["NO."].astype(str) == "23") & + (survey_list["Post Code"] == "M27 9XG") & + (survey_list["SUBMISSION DATE"].astype(str) == "45236")) + ] + return survey_list @staticmethod @@ -3265,7 +3280,7 @@ def forecast_remaining_sales(loader): asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"]) - # # TODO: TEMP + # TODO: TEMP # n_pre_ciga = asset_list[ # asset_list["ECO Eligibility"].isin( # [ @@ -3304,6 +3319,9 @@ def forecast_remaining_sales(loader): # pre_ciga_revenue - (pre_ciga_remaining_revenue + sold_revenue) # # MISSING 1 SALE from sold # cancelled = survey_list[survey_list["installation_status"].str.contains("ECO4")].shape[0] + # dupes = survey_list[survey_list["asset_list_row_id"].duplicated()]["asset_list_row_id"].values + # z = survey_list[survey_list["asset_list_row_id"].isin(dupes)] + # z[['NO.', 'Street / Block Name', 'Post Code', 'INSTALLED OR CANCELLED', 'SUBMISSION DATE']] # # TODO: END TEMP eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index() From af13467c2c4c9b7fc98e5be1e343399f57c062fb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 20:04:37 +0000 Subject: [PATCH 077/155] Added gbis variance checks --- .../ha_15_32/ha_analysis_batch_3.py | 83 ++++++++----------- 1 file changed, 36 insertions(+), 47 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7859d6d2..553f6271 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3237,6 +3237,7 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): + # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] @@ -3280,50 +3281,6 @@ def forecast_remaining_sales(loader): asset_list_remaining = asset_list_remaining[pd.isnull(asset_list_remaining["installation_status"])] asset_list_remaining = asset_list_remaining.drop(columns=["installation_status"]) - # TODO: TEMP - # n_pre_ciga = asset_list[ - # asset_list["ECO Eligibility"].isin( - # [ - # "eco4 - passed ciga", - # "eco4 (subject to ciga)", - # "failed ciga", - # "eco4" - # ] - # ) - # ].shape[0] - # - # n_pre_ciga_remaining = asset_list_remaining[ - # asset_list_remaining["ECO Eligibility"].isin( - # [ - # "eco4 - passed ciga", - # "eco4 (subject to ciga)", - # "failed ciga", - # "eco4" - # ] - # ) - # ].shape[0] - # - # compare_to_ids = asset_list_remaining["asset_list_row_id"].values - # assets_diff_ids = [x for x in asset_list["asset_list_row_id"].values if x not in compare_to_ids] - # diff = asset_list[asset_list["asset_list_row_id"].isin(assets_diff_ids)] - # - # n_sold = survey_list[survey_list["installation_status"].str.contains("ECO4")].shape[0] - # # cancellations = survey_list[] - # asset_list["ECO Eligibility"].value_counts() - # - # # Revenenue - # pre_ciga_revenue = n_pre_ciga * eco4_rate - # pre_ciga_remaining_revenue = n_pre_ciga_remaining * eco4_rate - # sold_revenue = n_sold * eco4_rate - # - # pre_ciga_revenue - (pre_ciga_remaining_revenue + sold_revenue) - # # MISSING 1 SALE from sold - # cancelled = survey_list[survey_list["installation_status"].str.contains("ECO4")].shape[0] - # dupes = survey_list[survey_list["asset_list_row_id"].duplicated()]["asset_list_row_id"].values - # z = survey_list[survey_list["asset_list_row_id"].isin(dupes)] - # z[['NO.', 'Street / Block Name', 'Post Code', 'INSTALLED OR CANCELLED', 'SUBMISSION DATE']] - # # TODO: END TEMP - eligiblity_counts = pd.DataFrame(asset_list["ECO Eligibility"].value_counts()).reset_index() eligiblity_counts_remaining = pd.DataFrame(asset_list_remaining["ECO Eligibility"].value_counts()).reset_index() @@ -3525,6 +3482,35 @@ def forecast_remaining_sales(loader): if variance_remaining != 0: raise ValueError("Something went wrong in variance remaining") + # We also check variances to make sure that the pre-CIGA ECO4 total equals + # 1) Pre CIGA remaining + + # 2) ECO4 sold + + # 3) ECO4 confirmed cancellations + + # 4) ECO4 unconfirmed cancellations + + pre_ciga_eco4_variance = ( + eco4_pre_ciga_revenue - + eco4_pre_ciga_remaining_revenue - + eco4_actually_sold - + eco4_confirmed_cancellations * eco4_rate - + eco4_expected_cancellations * eco4_rate + ) + + if pre_ciga_eco4_variance != 0: + raise ValueError("Something went wrong in pre_ciga_eco4_variance") + + # Check GBIS total variance + gbis_variance = ( + gbis_total_revenue - + gbis_actually_sold - + gbis_confirmed_cancellations * gbis_rate - + gbis_expected_cancellations * gbis_rate - + gbis_remaining_revenue + ) + + if gbis_variance != 0: + raise ValueError("Something went wrong in gbis_variance") + to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures @@ -3544,8 +3530,10 @@ def forecast_remaining_sales(loader): ("ECO4 pre-ciga", "", "Remaining - #", ""): eco4_pre_ciga_remaining, ("ECO4 pre-ciga", "", "Total - £", ""): eco4_pre_ciga_revenue, ("ECO4 pre-ciga", "", "Remaining - £", ""): eco4_pre_ciga_remaining_revenue, - ("ECO4 pre-ciga", "", "VARIANCE - TOTAL", ""): variance_total, - ("ECO4 pre-ciga", "", "VARIANCE - REMAINING", ""): variance_remaining, + ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL", ""): pre_ciga_eco4_variance, + ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 TOTAL VS ELIGIBLE & INELIGIBLE", ""): variance_total, + ("ECO4 pre-ciga", "", "VARIANCE - PRE-CIGA ECO4 REMAINING VS ELIGIBLE & INELIGIBLE", ""): + variance_remaining, ("ECO4 pre-ciga", "", "Sold - £", ""): eco4_actually_sold, ("ECO4 pre-ciga", "", "Confirmed cancellations - £", ""): eco4_confirmed_cancellations * eco4_rate, # This is for jobs that are in-progress and could still cancel @@ -3600,6 +3588,7 @@ def forecast_remaining_sales(loader): # GBIS postcode list ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, + ("GBIS Postcode list", "Warmfront post code list", "GBIS VARIANCE", "GBIS total"): gbis_variance, ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold, ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate, # This is for jobs that are in-progress and could still cancel @@ -3611,7 +3600,7 @@ def forecast_remaining_sales(loader): } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 45: + if len(to_append) != 47: raise ValueError("Something went wrong") results.append(to_append) From 8dcb6a9be0f903fc06e4c9dcb3218bb1d6db949e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 21:11:17 +0000 Subject: [PATCH 078/155] 11% through matching ha38 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 553f6271..6998eb4b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1054,6 +1054,17 @@ class DataLoader: 'Kingsford Court, Coombe Valley Road', 'Kingsford Court' ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + 'LESLIE TEW COURT/DERWENT ROAD', 'LESLIE TEW COURT' + ) + + # There is no 18A LESLIE TEW COURT in the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "LESLIE TEW COURT") & + (survey_list["Post Code"] == "TN10 3TX") & + (survey_list["NO."] == "18A")) + ] + return survey_list @staticmethod @@ -3848,12 +3859,10 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - # priority_has = [ - # "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA38", "HA39", "HA107", - # ] + # Add in: "HA25" # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA39", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA38", "HA39", "HA107", ] # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], # Then: 28 [DONE], From 17b5f6e140a90d261b790fee1a4a28f43d1e3a62 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 21:42:17 +0000 Subject: [PATCH 079/155] ha38 23% merged --- .../ha_15_32/ha_analysis_batch_3.py | 50 ++++++++++++++----- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 6998eb4b..ff39b190 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1065,6 +1065,24 @@ class DataLoader: (survey_list["NO."] == "18A")) ] + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + 'Brindley House, Wellbeck Road', 'Brindley House' + ) + + # Try taking just the first part of the string, splitting on a / + survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.split('/').str[0].str.strip() + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + 'HUNTSMAN WAY', 'HUNTSMANS WAY' + ) + + # Try taking just the first part of the string, splitting on a , + survey_list['Street / Block Name'] = survey_list['Street / Block Name'].str.split(',').str[0].str.strip() + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "McCLAREN COURT", "MCLAREN COURT" + ) + return survey_list @staticmethod @@ -1228,6 +1246,10 @@ class DataLoader: if "flat" in str(house_number): house_number = house_number.split("flat")[1].strip() + # We check if we had an instance of flat x, y + if "," in str(house_number): + house_number = house_number.split(",")[0].strip() + df = df[df["matching_address"].str.contains(str(house_number))] if df.empty: @@ -1251,19 +1273,23 @@ class DataLoader: df = df[df["HouseNo"].astype(str) == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] - - full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + row[ - "Town/Area"].lower().strip() + row["Post Code"].lower().strip() - # Remove any spaces from the full key - full_key = full_key.replace(" ", "") - - df = self.levenstein_match(full_key, df) - if df.shape[0] != 1: - print(row["Street / Block Name"]) - print(house_number) - print(row["Post Code"]) - raise ValueError("Investigate") + if "Town/Area" not in row.keys(): + full_key = (str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + + row["Post Code"].lower().strip()) + else: + full_key = str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + \ + row["Town/Area"].lower().strip() + row["Post Code"].lower().strip() + # Remove any spaces from the full key + full_key = full_key.replace(" ", "") + + df = self.levenstein_match(full_key, df) + + if df.shape[0] != 1: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"]) + raise ValueError("Investigate") matching_lookup.append( { From 8e258ff3ca164e2eddcd9cc74d1e7531bf655e4f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 22:29:18 +0000 Subject: [PATCH 080/155] 44% through matching --- .../ha_15_32/ha_analysis_batch_3.py | 70 ++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ff39b190..567394a4 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1083,6 +1083,70 @@ class DataLoader: "McCLAREN COURT", "MCLAREN COURT" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ST JAMES CLOISTERS", "ST. JAMES'S CLOISTERS" + ) + + survey_list["Street / Block Name"] = np.where( + ((survey_list["NO."].isin( + [ + "FLAT 1 22", + "FLAT 2 22", + "FLAT 3 22", + "FLAT 4 22", + "FLAT 5 22", + "FLAT 6 22", + ] + )) & + (survey_list["Street / Block Name"] == "MELTON ROAD")), + "22 MELTON ROAD", + survey_list["Street / Block Name"] + ) + + survey_list["Street / Block Name"] = np.where( + ((survey_list["NO."].isin( + [ + "FLAT 1 24", + "FLAT 2 24", + "FLAT 3 24", + "FLAT 4 24", + "FLAT 5 24", + "FLAT 6 24", + ] + )) & + (survey_list["Street / Block Name"] == "MELTON ROAD")), + "24 MELTON ROAD", + survey_list["Street / Block Name"] + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "TURRETT GREEN COURT SILENT STREET", "TURRET GREEN COURT" + ) + + # Turret green court flat 1 doesn't exist in the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "TURRET GREEN COURT") & + (survey_list["NO."] == 1)) + ] + # 3, 45 raywell steet doesn't exist in the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "45 RAYWELL STREET") & + (survey_list["NO."] == 3)) + ] + + # 40 Avondale drive doesn't exist in the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Avondale Drive") & + (survey_list["NO."] == 40)) + ] + # 17A beech road has the wrong postcode + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"] == "BEECH ROAD") & + (survey_list["Post Code"] == "DH6 1JD"), + "DH6 1JB", + survey_list["Post Code"] + ) + return survey_list @staticmethod @@ -1250,6 +1314,10 @@ class DataLoader: if "," in str(house_number): house_number = house_number.split(",")[0].strip() + # We may also have a space for an instance of flat x y + if " " in str(house_number): + house_number = house_number.split(" ")[0].strip() + df = df[df["matching_address"].str.contains(str(house_number))] if df.empty: @@ -1270,7 +1338,7 @@ class DataLoader: raise ValueError("Investigate") if df.shape[0] != 1: - df = df[df["HouseNo"].astype(str) == str(house_number)] + df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] if df.shape[0] != 1: From 067a66c1b172b63abc419a112525382ce7c2baa3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 6 Mar 2024 22:45:22 +0000 Subject: [PATCH 081/155] ha38 wip - leaving for now --- .../ha_15_32/ha_analysis_batch_3.py | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 567394a4..c4f6307c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -599,7 +599,52 @@ class DataLoader: asset_list['ExtractedHouseNo'] = asset_list['matching_address'].apply(extract_house_no_if_flat) asset_list.loc[asset_list['ExtractedHouseNo'].notnull(), 'HouseNo'] = asset_list['ExtractedHouseNo'] asset_list['matching_address'] = asset_list['matching_address'].apply(rearrange_address_if_flat) - # We then need to + + # We update a few specific rows + asset_list["HouseNo"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/ROOM A1", + "10 SOUTH VIEW/ROOM A2", + "10 SOUTH VIEW/ROOM A3", + ] + )), + "10A", + asset_list["HouseNo"] + ) + + asset_list["matching_address"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/ROOM A1", + ] + )), + "10a, 10 south view/room a1, spennymoor, co. durham, dl16 7df'", + asset_list["matching_address"] + ) + + asset_list["HouseNo"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/ROOM B1", + "10 SOUTH VIEW/ROOM B2", + "10 SOUTH VIEW/ROOM B3", + "10 SOUTH VIEW/ROOM B4", + ] + )), + "10B", + asset_list["HouseNo"] + ) + + asset_list["matching_address"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/ROOM B1", + ] + )), + "10b, 10 south view/room b1, spennymoor, co. durham, dl16 7df", + asset_list["matching_address"] + ) return asset_list @@ -1147,6 +1192,13 @@ class DataLoader: survey_list["Post Code"] ) + survey_list["Street / Block Name"] = np.where( + (survey_list["Street / Block Name"] == "SOUTHVIEW") & + (survey_list["Post Code"] == "DL16 7DF"), + "SOUTH VIEW", + survey_list["Street / Block Name"] + ) + return survey_list @staticmethod From 5c3f6320dd6bfc2ddaac4fefb8786646c50e7945 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 10:42:51 +0000 Subject: [PATCH 082/155] 29% through matching eco3 ha25 --- .../ha_15_32/ha_analysis_batch_3.py | 136 +++++++++++++++--- 1 file changed, 117 insertions(+), 19 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index c4f6307c..3ea9649e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -183,7 +183,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA25"]: + if ha_name in ["HA1", "HA6", "HA16", "HA24"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -214,6 +214,14 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA25": + asset_list["matching_address"] = asset_list[ + self.COLUMN_CONFIG[ha_name]["address"] + ].astype(str).str.lower().str.strip() + + asset_list["matching_postcode"] = asset_list['matching_address'].apply( + lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x + ) elif ha_name == "HA28": asset_list["matching_address"] = ( asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + @@ -352,6 +360,9 @@ class DataLoader: house_numbers = house_numbers.iloc[:, 0:1] house_numbers.columns = ['HouseNo'] + # Remove trailing punctuation such as , or ; + house_numbers["HouseNo"] = house_numbers["HouseNo"].str.rstrip(',;') + asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1) return asset_list @@ -425,27 +436,16 @@ class DataLoader: workbook = openpyxl.load_workbook(filepath) asset_sheetname = self.get_asset_sheetname(workbook) - # TODO: TEMP - sheetnames_lower = [x.lower() for x in workbook.sheetnames] - if any("eco3" in x for x in sheetnames_lower): - raise Exception("REMOVE ME") - asset_sheet = workbook[asset_sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] if ha_name == "HA25": asset_sheet_colnames[11] = "matching_postcode" - values_only = not ha_name != "HA25" - rows_data = [] - if not values_only: - for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): - row_data = [cell.value for cell in row] # This will get you the cell values - rows_data.append(row_data) - else: - for row in asset_sheet.iter_rows(min_row=2, values_only=values_only): # use values_only=True to get values - row_data = list(row) # No need for comprehension, values_only=True returns a tuple of values - rows_data.append(row_data) + + for row in asset_sheet.iter_rows(min_row=2, values_only=False): + row_data = [cell.value for cell in row] # This will get you the cell values + rows_data.append(row_data) asset_list = pd.DataFrame(rows_data, columns=asset_sheet_colnames) @@ -477,6 +477,29 @@ class DataLoader: if ha_name in ["HA1", "HA25"]: return asset_list, pd.DataFrame(), pd.DataFrame() + # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be + # suitable under ECO4, since their walls will be filled + eco3_list = pd.DataFrame() + sheetnames_lower = [x.lower() for x in workbook.sheetnames] + eco3_sheetname_index = [i for i, x in enumerate(sheetnames_lower) if "eco3" in x.replace(" ", "")] + if eco3_sheetname_index: + eco3_sheetname = workbook.sheetnames[eco3_sheetname_index[0]] + eco3_sheet = workbook[eco3_sheetname] + eco3_rows = [] + for row in eco3_sheet.iter_rows(min_row=2, values_only=False): # Assuming the first row is headers + row_data = [cell.value for cell in row] # This will get you the cell values + eco3_rows.append(row_data) + + eco3_list = pd.DataFrame(eco3_rows, columns=[cell.value for cell in eco3_sheet[1]]) + # Remove columns that are None + eco3_list = eco3_list.loc[:, eco3_list.columns.notnull()] + # Remove rows that are completely empty + eco3_list = eco3_list.loc[eco3_list.loc[:, eco3_list.columns].notnull().any(axis=1)] + eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))] + + # Perform the eco3 merge + eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name) + # We check if there is a survey list survey_sheetname = self.get_survey_sheetname(workbook) survey_sheet = workbook[survey_sheetname] @@ -518,7 +541,7 @@ class DataLoader: ciga_list = self.dedupe_ciga_list(ciga_list) ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) - return asset_list, survey_list, ciga_list + return asset_list, survey_list, ciga_list, eco3_list @staticmethod def correct_ha6_asset_list(asset_list): @@ -1433,6 +1456,79 @@ class DataLoader: return survey_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): + + # We add on a matching postcode without spaces for this + # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "") + + # May need an eco3 list correction function + + # NEADS DRIVE, postcode with bs305dt, is not found in the asset list + eco3_list = eco3_list[ + ~(eco3_list["Post Code"] == "BS305DT") + ] + # Drop rows with missings postcode + eco3_list = eco3_list[ + ~pd.isnull(eco3_list["Post Code"]) + ] + + missed_postcodes = [] + if ha_name == "HA25": + missed_postcodes = { + postcode.lower() for postcode in eco3_list["Post Code"] if + postcode.lower() not in asset_list["matching_postcode"].values + } + eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)] + + matching_lookup = [] + missed = [] + for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): + + postcode = row["Post Code"].lower().strip() + + # df will never be empty, since we've already done a check for common postcodes + df = asset_list[ + asset_list["matching_postcode"].str.contains(postcode) + ] + + house_number = row["NO "] + if isinstance(house_number, str): + house_number = house_number.lower().strip() + + if not any(df["matching_address"].str.contains(str(house_number))): + if "flat" in str(house_number): + house_number = house_number.split("flat")[1].strip() + + # We check if we had an instance of flat x, y + if "," in str(house_number): + house_number = house_number.split(",")[0].strip() + + # We may also have a space for an instance of flat x y + if " " in str(house_number): + house_number = house_number.split(" ")[0].strip() + + df = df[df["matching_address"].str.contains(str(house_number))] + + if df.empty: + missed.append(row["eco3_list_row_id"]) + continue + + if df.shape[0] != 1: + df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] + + if df.shape[0] != 1: + print(row["Street / Block Name"]) + print(house_number) + print(row["Post Code"]) + raise ValueError("Investigate") + + matching_lookup.append( + { + "eco3_list_row_id": row["eco3_list_row_id"], + "asset_list_row_id": df["asset_list_row_id"].values[0], + } + ) + @staticmethod def extract_streetname(address, house_number=None, postcode=None): """ @@ -4008,11 +4104,13 @@ def app(): # Add in: "HA25" # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA28", "HA32", "HA38", "HA39", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA20", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107", ] # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], # Then: 28 [DONE], - # 38, 41, 10, 14, 20, 48 + # 41, 10, 14 [DONE], 20, 48, 50 + # 38[problematic, but no ECO4] + # TODO - do 50 and 25 next # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From ef77db10373c653e28c82265460ce9fd3bf3f3bf Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 10:56:27 +0000 Subject: [PATCH 083/155] HA25 eco3 matching 91% complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3ea9649e..ea5b0456 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1516,6 +1516,15 @@ class DataLoader: if df.shape[0] != 1: df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] + if df.empty: + missed.append(row["eco3_list_row_id"]) + continue + + if df.shape[0] != 1: + # Perform a search on streetname + street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0] + df = df[df["matching_address"].str.contains(street_name_section1)] + if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) From 022244377d36557f83081e505b8068ab2bd98004 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 12:26:16 +0000 Subject: [PATCH 084/155] working on fixing missed matched in eco3 matching --- .../ha_15_32/ha_analysis_batch_3.py | 84 +++++++++++++++---- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ea5b0456..a5845990 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -171,6 +171,10 @@ class DataLoader: "HA107": 51, } + UNMATCHED_ECO3 = { + "HA25": 94 + } + def __init__(self, directories, december_figures_filepath, use_cache, rebuild): self.directories = directories self.use_cache = use_cache @@ -1458,9 +1462,6 @@ class DataLoader: def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): - # We add on a matching postcode without spaces for this - # asset_list["matching_postcode_no_space"] = asset_list["matching_postcode"].str.lower().str.replace(" ", "") - # May need an eco3 list correction function # NEADS DRIVE, postcode with bs305dt, is not found in the asset list @@ -1471,8 +1472,17 @@ class DataLoader: eco3_list = eco3_list[ ~pd.isnull(eco3_list["Post Code"]) ] + # We have a bunch of genuine duplicates + eco3_list = eco3_list.drop_duplicates(["NO ", "Street / Block Name", "Post Code"]) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "HALWILL MEADOOW", "HALWILL MEADOW" + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "Hall Road", "Hall Rd" + ) - missed_postcodes = [] if ha_name == "HA25": missed_postcodes = { postcode.lower() for postcode in eco3_list["Post Code"] if @@ -1480,10 +1490,18 @@ class DataLoader: } eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)] + # For the asset list, we create a matching address without any punctuation + # TODO: We should generally just remove puncutation from addresses when matching + asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '', + regex=True) + # Remove double spaces + asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace( + " ", " " + ) + matching_lookup = [] missed = [] for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): - postcode = row["Post Code"].lower().strip() # df will never be empty, since we've already done a check for common postcodes @@ -1507,24 +1525,20 @@ class DataLoader: if " " in str(house_number): house_number = house_number.split(" ")[0].strip() - df = df[df["matching_address"].str.contains(str(house_number))] + # We must do the house number filter + df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] + + # Perform a search on streetname + # We do this to prevent duplicate matches to properties with the same postcode and house number, + # but different streets + street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0] + street_name_section1 = re.sub(r'[^\w\s]', '', street_name_section1) + df = df[df["matching_address_no_punctuation"].str.contains(street_name_section1)] if df.empty: missed.append(row["eco3_list_row_id"]) continue - if df.shape[0] != 1: - df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] - - if df.empty: - missed.append(row["eco3_list_row_id"]) - continue - - if df.shape[0] != 1: - # Perform a search on streetname - street_name_section1 = row["Street / Block Name"].lower().split("/")[0].split(",")[0] - df = df[df["matching_address"].str.contains(street_name_section1)] - if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) @@ -1538,6 +1552,40 @@ class DataLoader: } ) + # We verify the missed + # -HA25 contains 88 missed entries. These are actually 8 unique postcodes, where surveys were conducted + # on properties that had house numbers outside of the asset list + if len(missed) != self.UNMATCHED_ECO3[ha_name]: + raise ValueError( + f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" + ) + + # TODO: 194 missed + + matching_lookup = pd.DataFrame(matching_lookup) + # Check dupes as this will cause problems later on + if matching_lookup["asset_list_row_id"].duplicated().any(): + raise ValueError("Duplicated asset list row ids") + + missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] + missed_df.head(3).tail(1)["eco3_list_row_id"] + + duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist() + duped_df = matching_lookup[ + matching_lookup["asset_list_row_id"].isin(duped_ids) + ] + duped_surveys = eco3_list[ + eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values) + ].copy() + + duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id") + + duped_surveys[ + ["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"] + ].sort_values("asset_list_row_id").head() + + asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values + @staticmethod def extract_streetname(address, house_number=None, postcode=None): """ From b09bd63b53c8d9b14f11c1c5b7cb38b28c63afbc Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 12:53:25 +0000 Subject: [PATCH 085/155] done with ha25 matching for now --- .../ha_15_32/ha_analysis_batch_3.py | 66 +++++++++++-------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a5845990..f0813aef 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -172,7 +172,7 @@ class DataLoader: } UNMATCHED_ECO3 = { - "HA25": 94 + "HA25": 119 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -478,7 +478,7 @@ class DataLoader: # For HA1 and HA25, there is an exception in the structure of the data. We don't have any survey or ciga # lists, and so # we can return the asset list now - if ha_name in ["HA1", "HA25"]: + if ha_name in ["HA1"]: return asset_list, pd.DataFrame(), pd.DataFrame() # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be @@ -1460,10 +1460,8 @@ class DataLoader: return survey_list - def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): - - # May need an eco3 list correction function - + @staticmethod + def correct_ha25_eco3_list(eco3_list): # NEADS DRIVE, postcode with bs305dt, is not found in the asset list eco3_list = eco3_list[ ~(eco3_list["Post Code"] == "BS305DT") @@ -1483,6 +1481,29 @@ class DataLoader: "Hall Road", "Hall Rd" ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "SPRINGFIELD WAY SAINT DAY", "SPRINGFIELD WAY ST DAY" + ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "BOND SPEAR COURT", "BOND-SPEAR COURT" + ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "ST.MARYS HILL", "ST MARYS HILL" + ) + # Correct the postcode for edmund road + eco3_list["Post Code"] = np.where( + (eco3_list["Street / Block Name"] == "EDMUND ROAD") & + (eco3_list["Post Code"] == "TR14 8QJ"), + "TR15 1BY", + eco3_list["Post Code"] + ) + return eco3_list + + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): + + eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") + eco3_list = eco3_list_correction_function(eco3_list) + if ha_name == "HA25": missed_postcodes = { postcode.lower() for postcode in eco3_list["Post Code"] if @@ -1492,8 +1513,9 @@ class DataLoader: # For the asset list, we create a matching address without any punctuation # TODO: We should generally just remove puncutation from addresses when matching - asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace(r'[^\w\s]', '', - regex=True) + asset_list['matching_address_no_punctuation'] = asset_list['matching_address'].str.replace( + r'[^\w\s]', '', regex=True + ) # Remove double spaces asset_list["matching_address_no_punctuation"] = asset_list["matching_address_no_punctuation"].str.replace( " ", " " @@ -1502,6 +1524,8 @@ class DataLoader: matching_lookup = [] missed = [] for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): + # if row["eco3_list_row_id"] == "HA25_Eco3_5422": + # raise Exception() postcode = row["Post Code"].lower().strip() # df will never be empty, since we've already done a check for common postcodes @@ -1553,38 +1577,24 @@ class DataLoader: ) # We verify the missed - # -HA25 contains 88 missed entries. These are actually 8 unique postcodes, where surveys were conducted - # on properties that had house numbers outside of the asset list + # HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2 + # where many surveys were conducted on house numbers, not in the asset list if len(missed) != self.UNMATCHED_ECO3[ha_name]: raise ValueError( f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) - # TODO: 194 missed - matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on if matching_lookup["asset_list_row_id"].duplicated().any(): raise ValueError("Duplicated asset list row ids") - missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] - missed_df.head(3).tail(1)["eco3_list_row_id"] + # Merge onto eco3 list + eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id") - duped_ids = matching_lookup[matching_lookup["asset_list_row_id"].duplicated()]["asset_list_row_id"].tolist() - duped_df = matching_lookup[ - matching_lookup["asset_list_row_id"].isin(duped_ids) - ] - duped_surveys = eco3_list[ - eco3_list["eco3_list_row_id"].isin(duped_df["eco3_list_row_id"].values) - ].copy() + asset_list = asset_list.drop(columns=["matching_address_no_punctuation"]) - duped_surveys = duped_surveys.merge(matching_lookup, how="left", on="eco3_list_row_id") - - duped_surveys[ - ["NO ", "Street / Block Name", "Post Code", "eco3_list_row_id", "asset_list_row_id"] - ].sort_values("asset_list_row_id").head() - - asset_list[asset_list["asset_list_row_id"] == "HA2515145"]["matching_address"].values + return eco3_list @staticmethod def extract_streetname(address, house_number=None, postcode=None): From 961b53d523bf7dc82d9e83459861cb3aa2865c93 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 12:58:29 +0000 Subject: [PATCH 086/155] Adding return for HA25 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index f0813aef..7ad50583 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -479,7 +479,7 @@ class DataLoader: # lists, and so # we can return the asset list now if ha_name in ["HA1"]: - return asset_list, pd.DataFrame(), pd.DataFrame() + return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame() # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be # suitable under ECO4, since their walls will be filled @@ -504,6 +504,10 @@ class DataLoader: # Perform the eco3 merge eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name) + if ha_name in ["HA25"]: + # Accomodate ha25 unique structure + return asset_list, pd.DataFrame(), pd.DataFrame(), eco3_list + # We check if there is a survey list survey_sheetname = self.get_survey_sheetname(workbook) survey_sheet = workbook[survey_sheetname] @@ -1592,7 +1596,7 @@ class DataLoader: # Merge onto eco3 list eco3_list = eco3_list.merge(matching_lookup, how="left", on="eco3_list_row_id") - asset_list = asset_list.drop(columns=["matching_address_no_punctuation"]) + asset_list.drop(columns=["matching_address_no_punctuation"], inplace=True) return eco3_list @@ -1756,7 +1760,7 @@ class DataLoader: continue # Load asset list logger.info("Loading data for {}".format(ha_name)) - asset_list, survey_list, ciga_list = self.load_asset_list( + asset_list, survey_list, ciga_list, eco3_list = self.load_asset_list( filepath=filepath, ha_name=ha_name, ) @@ -1764,7 +1768,8 @@ class DataLoader: data[ha_name] = { "asset_list": asset_list, "survey_list": survey_list, - "ciga_list": ciga_list + "ciga_list": ciga_list, + "eco3_list": eco3_list } self.data = data From 7f88f0e0f59e584d82a6799671e8f1a64a034392 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 13:59:32 +0000 Subject: [PATCH 087/155] Added in the re-labelling of assets based on eco3 merge --- .../ha_15_32/ha_analysis_batch_3.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7ad50583..21509923 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1812,6 +1812,7 @@ class DataLoader: asset_list = data_assets["asset_list"].copy() survey_list = data_assets["survey_list"].copy() ciga_list = data_assets["ciga_list"].copy() + eco3_list = data_assets.get("eco3_list", pd.DataFrame()) asset_list_starting_size = asset_list.shape[0] @@ -1859,6 +1860,25 @@ class DataLoader: if asset_list.shape[0] != asset_list_starting_size: raise ValueError("The asset list has changed in size") + # If we have eco3 surveys, we set a property to not eligible + if not eco3_list.empty: + eco3_list_to_merge = eco3_list[["asset_list_row_id"]].copy() + eco3_list_to_merge["has_eco3"] = True + asset_list = asset_list.merge( + eco3_list_to_merge, how="left", on="asset_list_row_id" + ) + + if asset_list.shape[0] != asset_list_starting_size: + raise ValueError("The asset list has changed in size, when merging on eco3") + + # Any rows that have an eco3 survey are set to not eligible + asset_list["ECO Eligibility"] = np.where( + asset_list["has_eco3"] == True, + "not eligible", + asset_list["ECO Eligibility"] + ) + asset_list = asset_list.drop(columns=["has_eco3"]) + # Report on sales sales_report = {} if not survey_list.empty: From 9a0c6c3e8fbae7a23980aa7e75912ef6202ab29d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 14:18:08 +0000 Subject: [PATCH 088/155] expanded eco3 matching --- .../ha_15_32/ha_analysis_batch_3.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 21509923..06bb0d96 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -172,7 +172,7 @@ class DataLoader: } UNMATCHED_ECO3 = { - "HA25": 119 + "HA25": 154 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -1508,12 +1508,16 @@ class DataLoader: eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") eco3_list = eco3_list_correction_function(eco3_list) + asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower() + eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "") + if ha_name == "HA25": + # 317 -> 259 missed_postcodes = { - postcode.lower() for postcode in eco3_list["Post Code"] if - postcode.lower() not in asset_list["matching_postcode"].values + postcode for postcode in eco3_list["postcode_no_space"] if + postcode not in asset_list["matching_postcode_nospace"].values } - eco3_list = eco3_list[~eco3_list["Post Code"].str.lower().isin(missed_postcodes)] + eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)] # For the asset list, we create a matching address without any punctuation # TODO: We should generally just remove puncutation from addresses when matching @@ -1530,11 +1534,11 @@ class DataLoader: for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): # if row["eco3_list_row_id"] == "HA25_Eco3_5422": # raise Exception() - postcode = row["Post Code"].lower().strip() + postcode = row["postcode_no_space"] # df will never be empty, since we've already done a check for common postcodes df = asset_list[ - asset_list["matching_postcode"].str.contains(postcode) + asset_list["matching_postcode_nospace"].str.contains(postcode) ] house_number = row["NO "] @@ -1588,6 +1592,8 @@ class DataLoader: f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) + # 154 missed, 2827 matched for HA 25 + matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on if matching_lookup["asset_list_row_id"].duplicated().any(): From 8b70fb346c0ce51acd24b245bbbecedeaa10d30c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:00:51 +0000 Subject: [PATCH 089/155] matching ha50 --- .../ha_15_32/ha_analysis_batch_3.py | 56 ++++++++++++++++--- 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 06bb0d96..4708bf35 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -172,7 +172,8 @@ class DataLoader: } UNMATCHED_ECO3 = { - "HA25": 154 + "HA25": 154, + "HA50": 5 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -262,6 +263,10 @@ class DataLoader: asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \ asset_list["post_code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["post_code"].astype(str).str.lower().str.strip() + elif ha_name == "HA50": + asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Post Code"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() elif ha_name == "HA107": # Create matching_address by concatenating House No, Street, Town, District, Postcode asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ @@ -433,6 +438,8 @@ class DataLoader: return "ECO Surveys" elif "ECO Survey" in workbook.sheetnames: return "ECO Survey" + elif "ECO 4 Surveys completed" in workbook.sheetnames: + return "ECO 4 Surveys completed" else: return "ECO surveys" @@ -1289,6 +1296,34 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha50_survey_list(survey_list): + + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"] == 'COSELEY STREET') & + (survey_list["Post Code"] == 'ST16 1LR'), + "ST6 1JU", + survey_list["Post Code"] + ) + + # Remove some of COSELEY STREET, as we have surveys done, outside of the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "COSELEY STREET") & + (survey_list["Post Code"] == "ST6 1JU") & + (survey_list["NO."].isin([96]))) + ] + + survey_list["Post Code"] = survey_list["Post Code"].str.replace("ST33JZ", "ST3 3JZ") + + # Remove some of Jesmond drive as we have surveys done outside of the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Jesmond Drive") & + (survey_list["Post Code"] == "ST3 3JZ") & + (survey_list["NO."].isin([29]))) + ] + + return survey_list + @staticmethod def correct_ha107_survey_list(survey_list): # Replace Front Street, East Stockham with Front Street, East Stockwith @@ -1503,6 +1538,10 @@ class DataLoader: ) return eco3_list + @staticmethod + def correct_ha50_eco3_list(eco3_list): + return eco3_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") @@ -1517,6 +1556,7 @@ class DataLoader: postcode for postcode in eco3_list["postcode_no_space"] if postcode not in asset_list["matching_postcode_nospace"].values } + eco3_list = eco3_list[~eco3_list["postcode_no_space"].isin(missed_postcodes)] # For the asset list, we create a matching address without any punctuation @@ -4199,16 +4239,18 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - # Add in: "HA25" + # Add in: # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA20", "HA24", "HA25", "HA28", "HA32", "HA39", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA50", "HA107", ] - # Next HAs to do: 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come back on this], + # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come + # back on this], # Then: 28 [DONE], - # 41, 10, 14 [DONE], 20, 48, 50 - # 38[problematic, but no ECO4] - # TODO - do 50 and 25 next + # 41, 48, 50 + # 38[problematic, but no ECO4], 10 problematic (no eligibility), + # 20 has barely any in + # TODO - do 50 # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From 3001a98421b377cb31e2c3b667528e8d4b80a150 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:02:23 +0000 Subject: [PATCH 090/155] ha50 30% matched --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 4708bf35..901784e1 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1322,6 +1322,10 @@ class DataLoader: (survey_list["NO."].isin([29]))) ] + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BRUNDELL OVAL", "BRUNDALL OVAL" + ) + return survey_list @staticmethod From 4afd012e51bfc3b366dc1e8d1f70281bb1097bd0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:14:53 +0000 Subject: [PATCH 091/155] ha50 51% matched --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 901784e1..bde6f647 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1326,6 +1326,13 @@ class DataLoader: "BRUNDELL OVAL", "BRUNDALL OVAL" ) + # Remove 4 Linden Place + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Linden Place") & + (survey_list["Post Code"] == "ST3 3AT") & + (survey_list["NO."].isin([4]))) + ] + return survey_list @staticmethod From 1146f34eba62ab2b00f610502b17ba6f9425cf43 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:24:20 +0000 Subject: [PATCH 092/155] matching 81% complete --- .../ha_15_32/ha_analysis_batch_3.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bde6f647..818f6e4f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1333,6 +1333,45 @@ class DataLoader: (survey_list["NO."].isin([4]))) ] + # Remove 11 Tilehurst Place + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Tilehurst Place") & + (survey_list["Post Code"] == "ST3 3AP") & + (survey_list["NO."].isin([11]))) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "deavile road", "DEAVILLE ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "WOOLISCROFT ROAD", "WOOLLISCROFT ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Leak Road", "Leek Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Springfield road", "Springfields road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "MILLWARD RD", "MILLWARD ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "REPINGTON RD", "REPINGTON ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ECCELSTONE PLACE", "ECCLESTONE PLACE" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "St. James Place", "St James Place" + ) + return survey_list @staticmethod From 5a1aa3995221ddf125b25c6d619165fdbcab37ff Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:33:26 +0000 Subject: [PATCH 093/155] ha50 93% complete --- .../ha_15_32/ha_analysis_batch_3.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 818f6e4f..3b9bd7ca 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1372,6 +1372,50 @@ class DataLoader: "St. James Place", "St James Place" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "CHELL HEATH RD", "CHELL HEATH ROAD" + ) + # Correct postcode + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"] == 'CHELL HEATH ROAD') & + (survey_list["Post Code"] == 'ST6 6HU'), + "ST6 6HJ", + survey_list["Post Code"] + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Franklin Rd", "Franklin Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Lodge Rd", "Lodge Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "St Matthews Street", "St Matthew Street" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Grove Bank Road", "Grovebank Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "OVERSLEY RD", "OVERSLEY ROAD" + ) + + # Replace all of the " RD" with " ROAD" + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + " RD", " ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "St. Georges Crescent", "St Georges Crescent" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Tewson Road", "Tewson Green" + ) + return survey_list @staticmethod From d4e378f109deb3c71b87165309a5935b3641a915 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:40:37 +0000 Subject: [PATCH 094/155] ha50 matching complete subject to checks --- .../ha_15_32/ha_analysis_batch_3.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3b9bd7ca..a5b99a72 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1416,6 +1416,35 @@ class DataLoader: "Tewson Road", "Tewson Green" ) + # Remove 55 Seabridge Lane + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Seabridge Lane") & + (survey_list["Post Code"] == "ST5 4AG") & + (survey_list["NO."].isin([55]))) + ] + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Tyne Way") & + (survey_list["Post Code"] == "ST5 4AX") & + (survey_list["NO."].isin([56]))) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "St.Bernards Place", "St Bernard Place" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Penarth Road", "Penarth Grove" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "St. Marys Road", "St Marys Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Larch Drive", "Larch Grove" + ) + return survey_list @staticmethod From 33b3f51ca4701ede548e6af82f80ae191a3c0710 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:54:40 +0000 Subject: [PATCH 095/155] handling dupes for ha50 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a5b99a72..7124919e 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1445,6 +1445,21 @@ class DataLoader: "Larch Drive", "Larch Grove" ) + # Drop 31 Lauder place north, as there is a duplicate. THis version also has a wrong postcode + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "LAUDER PLACE NORTH") & + (survey_list["Post Code"] == "ST20QS") & + (survey_list["NO."].isin([31]))) + ] + + # Handle dropping of dupes + survey_list["street_pruner"] = survey_list["Street / Block Name"].str.lower().str.replace(" ", "") + survey_list["postcode_pruner"] = survey_list["Post Code"].str.lower().str.replace(" ", "") + + # Should go to 18 + survey_list = survey_list.drop_duplicates(["NO.", "street_pruner", "postcode_pruner"]) + survey_list = survey_list.drop(columns=["street_pruner", "postcode_pruner"]) + return survey_list @staticmethod From 23eaa5600118f0df54667ea36422153158db8dd5 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 15:57:00 +0000 Subject: [PATCH 096/155] checked ha50 ciga merge --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7124919e..2feded98 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -168,6 +168,7 @@ class DataLoader: "HA15": 3, "HA16": 7, "HA24": 12, + "HA50": 4, "HA107": 51, } @@ -429,6 +430,8 @@ class DataLoader: return "CIGA checks" elif "CIGA check" in workbook.sheetnames: return "CIGA check" + elif "CIGA requested" in workbook.sheetnames: + return "CIGA requested" else: return "CIGA" From 180c0c53eaa48c185c75cf22aee448aac91bbe30 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 16:26:58 +0000 Subject: [PATCH 097/155] done with ha50 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 2feded98..0720a686 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1982,7 +1982,8 @@ class DataLoader: "ECO4 GBIS (ECO+)": "GBIS", "ECO4 GBIS (ECO+) JJC UNDER 73m²": "GBIS", "ECO4 AFFORDABLE WARMTH": "ECO4", - "Affordable Warmth": "ECO4" + "Affordable Warmth": "ECO4", + "ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS", } eco_eligibility_map = { From c43349a5777326145107a6406779eadcdc6e9dab Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 16:39:47 +0000 Subject: [PATCH 098/155] Added ha41 matching --- .../ha_15_32/ha_analysis_batch_3.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 0720a686..4cf447aa 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -174,7 +174,8 @@ class DataLoader: UNMATCHED_ECO3 = { "HA25": 154, - "HA50": 5 + "HA41": 26, + "HA50": 5, } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -264,6 +265,14 @@ class DataLoader: asset_list["add_5"].astype(str).str.lower().str.strip() + ", " + \ asset_list["post_code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["post_code"].astype(str).str.lower().str.strip() + elif ha_name == "HA41": + asset_list["matching_address"] = asset_list["AddressLine1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["AddressLine2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["AddressLine3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["AddressLine4"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["AddressLine5"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA50": asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Post Code"].astype(str).str.lower().str.strip() @@ -1683,6 +1692,10 @@ class DataLoader: def correct_ha50_eco3_list(eco3_list): return eco3_list + @staticmethod + def correct_ha41_eco3_list(eco3_list): + return eco3_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") @@ -4384,15 +4397,14 @@ def app(): # Add in: # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA50", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA50", "HA107", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], # Then: 28 [DONE], # 41, 48, 50 - # 38[problematic, but no ECO4], 10 problematic (no eligibility), - # 20 has barely any in - # TODO - do 50 + # Ignore for now: + # TODO: 38[problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From c4af2251f4fac0af95676b7158e5baf1ad9d3d3c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 16:41:58 +0000 Subject: [PATCH 099/155] data load for ha41 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 4cf447aa..c2d585a2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -452,6 +452,8 @@ class DataLoader: return "ECO Survey" elif "ECO 4 Surveys completed" in workbook.sheetnames: return "ECO 4 Surveys completed" + elif "ECO4 Surveys" in workbook.sheetnames: + return "ECO4 Surveys" else: return "ECO surveys" @@ -1533,6 +1535,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha41_survey_list(survey_list): + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() From ae714e42a62b1e6def566c6de46b34035d0ab7bb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 17:11:44 +0000 Subject: [PATCH 100/155] identified 9 additional has worth analysing --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index c2d585a2..b22ea273 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -4403,14 +4403,16 @@ def app(): # Add in: # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA50", "HA107", + "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA48", + "HA50", "HA107", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], - # Then: 28 [DONE], - # 41, 48, 50 + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], + # 48 [WIP], + # Consider for ECO4: 2, 63, 12, 13, 136, 117 + # COnsider for GBIS: 56, 35, 34 # Ignore for now: - # TODO: 38[problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in + # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] From c84be65e8defa04aa1453f80b53d073c9011a629 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 19:52:08 +0000 Subject: [PATCH 101/155] ha48 ciga unmatched count added --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index b22ea273..56867ef7 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -159,6 +159,10 @@ class DataLoader: "HA25": { "address": "T1_Address", "postcode": "matching_postcode" + }, + "HA48": { + "address": "Full Address", + "postcode": "Postcode" } } @@ -170,6 +174,7 @@ class DataLoader: "HA24": 12, "HA50": 4, "HA107": 51, + "HA48": 0 } UNMATCHED_ECO3 = { @@ -190,7 +195,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA16", "HA24"]: + if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA48"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() From c3fd2ae902bd96250bc5ca376a424ebc8cbc3335 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 20:58:47 +0000 Subject: [PATCH 102/155] Adding HA2, data load done --- .../ha_15_32/ha_analysis_batch_3.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 56867ef7..74c6d3f5 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -167,6 +167,7 @@ class DataLoader: } UNMATCHED_CIGA = { + "HA2": 0, "HA6": 117, "HA14": 3, "HA15": 3, @@ -202,6 +203,12 @@ class DataLoader: asset_list["matching_postcode"] = asset_list[ self.COLUMN_CONFIG[ha_name]["postcode"] ].astype(str).str.lower().str.strip() + elif ha_name == "HA2": + # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode + asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA7": # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \ @@ -3794,7 +3801,6 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): - # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] @@ -4074,13 +4080,13 @@ def forecast_remaining_sales(loader): ("", "Original Warmfront estimate", "Total - #", "ECO4 - November"): original_warmfront_eco4, ("ECO4 original", "", "Remaining - #", ""): original_warmfront_remaining_eco4, ("ECO4 original", "", "Total - £", ""): original_warmfront_eco4_revenue, - ("ECO4 original", "", "Sold - £", ""): original_warmfront_sold_eco4, + ("ECO4 original", "", "Sold or cancelled - £", ""): original_warmfront_sold_eco4, ("ECO4 original", "", "Remaining - £", ""): original_warmfront_remaining_eco4_revenue, # GBIS - original warmfront figures ("", "Original Warmfront estimate", "Total - #", "GBIS - November"): original_warmfront_gbis, ("GBIS original", "", "Remaining - #", ""): original_warmfront_gbis, ("GBIS original", "", "Total - £", ""): original_warmfront_gbis_revenue, - ("GBIS original", "", "Sold - £", ""): original_warmfront_sold_gbis, + ("GBIS original", "", "Sold or cancelled - £", ""): original_warmfront_sold_gbis, ("GBIS original", "", "Remaining - £", ""): original_warmfront_remaining_gbis_revenue, # ECO4 - asset list, pre-ciga ("", "Warmfront post code list", "Total #", "ECO4 total (pre-ciga)"): eco4_pre_ciga, @@ -4237,12 +4243,17 @@ def forecast_remaining_sales(loader): headline_total_delta = round(headline_total_delta, 1) headline_eco4_sold_since_november = ( - totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] - totals_row[('ECO4 original', '', 'Sold - £', '')] + totals_row[('ECO4 pre-ciga', '', 'Sold - £', '')] + + totals_row[('ECO4 pre-ciga', '', 'Confirmed cancellations - £', '')] + # confirmed canclleations + totals_row[('ECO4 pre-ciga', '', 'Unconfirmed cancellations - £', '')] - # expected cancellations + totals_row[('ECO4 original', '', 'Sold or cancelled - £', '')] ) headline_gbis_sold_since_november = ( - totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] - - totals_row[('GBIS original', '', 'Sold - £', '')] + totals_row[("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total")] + + totals_row[("GBIS Postcode list", "", "Confirmed cancellations - £", "")] + # confirmed cancellations + totals_row[("GBIS Postcode list", "", "Unconfirmed cancellations - £", "")] - # expected cancellations + totals_row[('GBIS original', '', 'Sold or cancelled - £', '')] ) headlines = [ @@ -4261,7 +4272,7 @@ def forecast_remaining_sales(loader): "ECO4 - November"): headline_eco4_original_remaining_revenue }, { - ("", "", "", "HA Name"): "ECO4 Sold since November - £", + ("", "", "", "HA Name"): "ECO4 Sold or cancelled since November - £", ( "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_eco4_sold_since_november @@ -4290,7 +4301,7 @@ def forecast_remaining_sales(loader): "ECO4 - November"): headline_gbis_original_remaining_revenue }, { - ("", "", "", "HA Name"): "GBIS Sold since November - £", + ("", "", "", "HA Name"): "GBIS Sold or cancelled since November - £", ( "", "Original Warmfront estimate", "Total - #", "ECO4 - November"): headline_gbis_sold_since_november @@ -4399,21 +4410,18 @@ def app(): rebuild_inputs = False # List all of the data in the folder - directories = [str(file) for entry in DATA_FOLDER.iterdir() if entry.is_dir() for file in entry.iterdir() if file.suffix == '.xlsx'] # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" # Add in: - # TODO: Remove ECO3 sales from HA25 priority_has = [ - "HA1", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA48", + "HA1", "HA2", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA48", "HA50", "HA107", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], 28 [DONE], 41 [DONE], 50 [DONE], - # 48 [WIP], + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], # Consider for ECO4: 2, 63, 12, 13, 136, 117 # COnsider for GBIS: 56, 35, 34 # Ignore for now: From 19850f924445035e3880eaae40f750d21fb12b80 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 21:34:46 +0000 Subject: [PATCH 103/155] fixing up ha63 eco3 list --- .../ha_15_32/ha_analysis_batch_3.py | 46 +++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 74c6d3f5..aebf0506 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -289,6 +289,10 @@ class DataLoader: asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Post Code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() + elif ha_name == "HA63": + asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["POSTCODE"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip() elif ha_name == "HA107": # Create matching_address by concatenating House No, Street, Town, District, Postcode asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ @@ -1551,6 +1555,16 @@ class DataLoader: def correct_ha41_survey_list(survey_list): return survey_list + @staticmethod + def correct_ha63_survey_list(survey_list): + # Drop some filler rows + survey_list = survey_list[ + ~survey_list[survey_list.columns[0]].isin( + ["NO JOBS SURVEYED JULY 2021 ", "NO JOBS SURVEYED SEPTEMBER 2021"] + ) + ] + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -1714,6 +1728,26 @@ class DataLoader: def correct_ha41_eco3_list(eco3_list): return eco3_list + @staticmethod + def correct_ha63_eco3_list(eco3_list): + eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])] + # Some postcode that aren't in the asset list + eco3_list = eco3_list[ + ~eco3_list["Post Code"].isin( + ["NR32 15X", "NR30 2BT"] + ) + ] + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "POUND COTTAGES - BLOOMSBERRY CLOSE", "POUND COTTAGES" + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "FREDRICK ROAD", "Frederick Road" + ) + + return eco3_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") @@ -1799,12 +1833,15 @@ class DataLoader: # We verify the missed # HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2 # where many surveys were conducted on house numbers, not in the asset list + # 154 missed, 2827 matched for HA 25 if len(missed) != self.UNMATCHED_ECO3[ha_name]: raise ValueError( f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) - # 154 missed, 2827 matched for HA 25 + # 41 + missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] + missed_df.head(1)["Street / Block Name"] matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on @@ -4418,11 +4455,12 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA48", - "HA50", "HA107", + "HA50", "HA63", "HA107", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], - # Consider for ECO4: 2, 63, 12, 13, 136, 117 + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE] + # 63 [WIP] + # Consider for ECO4: 12, 13, 136, 117 # COnsider for GBIS: 56, 35, 34 # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in From 47b97fce0a6eec4fe15a967f1721e18908bffccf Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 21:46:44 +0000 Subject: [PATCH 104/155] fixing eco3 matching for ha63 --- .../ha_15_32/ha_analysis_batch_3.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index aebf0506..bab5cdab 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -174,6 +174,7 @@ class DataLoader: "HA16": 7, "HA24": 12, "HA50": 4, + "HA63": 15, "HA107": 51, "HA48": 0 } @@ -182,6 +183,7 @@ class DataLoader: "HA25": 154, "HA41": 26, "HA50": 5, + "HA63": 0 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -1746,6 +1748,25 @@ class DataLoader: "FREDRICK ROAD", "Frederick Road" ) + # For denmark street, remove the space from the house number + eco3_list["NO "] = np.where( + eco3_list["Street / Block Name"] == "DENMARK STREET", + eco3_list["NO "].str.replace(" ", ""), + eco3_list["NO "] + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "OLD HOSPITAL MEWS HOSPITAL WALK", "Old Hospital Mews" + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "Portland House, Portland Street", "Portland House" + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "MIDDLE MARKET STREET", "Middle Market Road" + ) + return eco3_list def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): @@ -1791,7 +1812,7 @@ class DataLoader: if isinstance(house_number, str): house_number = house_number.lower().strip() - if not any(df["matching_address"].str.contains(str(house_number))): + if not any(df["HouseNo"].str.contains(str(house_number))): if "flat" in str(house_number): house_number = house_number.split("flat")[1].strip() @@ -1839,10 +1860,6 @@ class DataLoader: f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) - # 41 - missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] - missed_df.head(1)["Street / Block Name"] - matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on if matching_lookup["asset_list_row_id"].duplicated().any(): From 9cd166160bfbe9a3cc89f5d43231c3c8ed5c2ede Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 21:51:16 +0000 Subject: [PATCH 105/155] sorted ha63 facts and figures --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bab5cdab..2a1a4b16 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2077,7 +2077,8 @@ class DataLoader: "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga)", "eco4 (subject to archetype check)": "eco4", "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga)", - "eco4 (subject to ciga)": "eco4 (subject to ciga)" + "eco4 (subject to ciga)": "eco4 (subject to ciga)", + "eco4(subject to ciga)": "eco4 (subject to ciga)", } ha_facts_and_figures = [] From 76ef60d06c8d508d4c78e1bda320902880bce96c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 7 Mar 2024 22:16:05 +0000 Subject: [PATCH 106/155] done with ha12 --- .../ha_15_32/ha_analysis_batch_3.py | 58 ++++++++++++++----- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 2a1a4b16..4dbf326b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -148,6 +148,10 @@ class DataLoader: "address": "propertyaddress", "postcode": "address" # The 'address' column actually contains postcode }, + "HA12": { + "address": "Full Address", + "postcode": "Postcode" + }, "HA16": { "address": "Address", "postcode": "Postcode" @@ -169,6 +173,7 @@ class DataLoader: UNMATCHED_CIGA = { "HA2": 0, "HA6": 117, + "HA12": 6, "HA14": 3, "HA15": 3, "HA16": 7, @@ -198,7 +203,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA16", "HA24", "HA48"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA48"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -1558,13 +1563,39 @@ class DataLoader: return survey_list @staticmethod - def correct_ha63_survey_list(survey_list): - # Drop some filler rows - survey_list = survey_list[ - ~survey_list[survey_list.columns[0]].isin( - ["NO JOBS SURVEYED JULY 2021 ", "NO JOBS SURVEYED SEPTEMBER 2021"] - ) - ] + def correct_ha12_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Henstone Road", "Hanstone Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Lindern avenue", "Linden Avenue" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "priness way", "Princess Way" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Worth Crecesent", "Worth Crescent" + ) + + survey_list["Post Code"] = survey_list["Post Code"].str.replace( + "DY117HA", "DY11 7HA" + ) + + survey_list["Post Code"] = survey_list["Post Code"].str.replace( + "DY117HF", "DY11 7HF" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Adderbrook Crescent", "Addenbrooke Crescent" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Kinver Road", "Kinver Avenue" + ) + return survey_list @staticmethod @@ -2079,6 +2110,7 @@ class DataLoader: "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga)", "eco4 (subject to ciga)": "eco4 (subject to ciga)", "eco4(subject to ciga)": "eco4 (subject to ciga)", + "eco4 subject to ciga": "eco4 (subject to ciga)", } ha_facts_and_figures = [] @@ -4472,13 +4504,13 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", "HA48", - "HA50", "HA63", "HA107", + "HA1", "HA2", "HA6", "HA7", "HA12", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", + "HA48", "HA50", "HA63", "HA107", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE] - # 63 [WIP] - # Consider for ECO4: 12, 13, 136, 117 + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE] + # + # Consider for ECO4: 13, 136, 117 # COnsider for GBIS: 56, 35, 34 # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in From e3f36fc881925fd845f623d469d0faf9cd6b89c3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 8 Mar 2024 18:52:32 +0000 Subject: [PATCH 107/155] HA117 data load --- .../ha_15_32/ha_analysis_batch_3.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 4dbf326b..d4de589a 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -188,7 +188,8 @@ class DataLoader: "HA25": 154, "HA41": 26, "HA50": 5, - "HA63": 0 + "HA63": 0, + "HA117": 4 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -308,6 +309,11 @@ class DataLoader: asset_list["District"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA117": + asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["PostCode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip() else: raise NotImplementedError("implement me") @@ -1800,6 +1806,17 @@ class DataLoader: return eco3_list + @staticmethod + def correct_ha117_eco3_list(eco3_list): + # Delete rows where postcode is null - there are some placeholder rows where this happens + eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])] + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "TARRING ROAD", "155 TARRING ROAD" + ) + + return eco3_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") @@ -4505,13 +4522,13 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA12", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", - "HA48", "HA50", "HA63", "HA107", + "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE] - # - # Consider for ECO4: 13, 136, 117 - # COnsider for GBIS: 56, 35, 34 + # 117 [WIP] + # Consider for ECO4: 13 + # Consider for GBIS: 56, 35, 34 # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in # Filter down the directories to only the priority HAs From 15efd02b8b8220f1d6cc745cb1b4a571be808643 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 8 Mar 2024 19:14:35 +0000 Subject: [PATCH 108/155] done ha117, ha13 next --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d4de589a..97ac96da 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2119,15 +2119,19 @@ class DataLoader: "ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS", } + # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we + # treat these as similar to subject to CIGA, and therefore unconfirmed worked that could fail. There + # are only a small volume of properties for which we see this eco_eligibility_map = { "not eligble": "not eligible", "eco 4(subject to ciga)": "eco4 (subject to ciga)", "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga)", - "eco4 (subject to archetype check)": "eco4", + "eco4 (subject to archetype check)": "eco4 (subject to ciga)", "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga)", "eco4 (subject to ciga)": "eco4 (subject to ciga)", "eco4(subject to ciga)": "eco4 (subject to ciga)", "eco4 subject to ciga": "eco4 (subject to ciga)", + "eco4 (subject to archetype)": "eco4 (subject to ciga)", } ha_facts_and_figures = [] @@ -4525,9 +4529,9 @@ def app(): "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE] - # 117 [WIP] - # Consider for ECO4: 13 + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE] + # 13 [WIP] + # Consider for ECO4: # Consider for GBIS: 56, 35, 34 # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in From b2b8fd8f84321f369cc3d14b009515759a2eff9a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 8 Mar 2024 19:20:38 +0000 Subject: [PATCH 109/155] ha13 49% matched --- .../ha_15_32/ha_analysis_batch_3.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 97ac96da..3edc1490 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -224,6 +224,12 @@ class DataLoader: asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA13": + asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["address 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA14": # Create matching_address by concatenating Address 1, Address 2, Address 3, Address 4, Postcode asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ @@ -1604,6 +1610,19 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha13_survey_list(survey_list): + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Woodfarm Road", "WOOD FARM ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ALLANDALE ROAD", "ALLANDALE" + ) + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -4525,8 +4544,8 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA12", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", "HA41", - "HA48", "HA50", "HA63", "HA107", "HA117" + "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", + "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE] From 21117f3e585be18d5da6e49744353f7ed830a483 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 8 Mar 2024 19:32:42 +0000 Subject: [PATCH 110/155] worked through ha13 matching - need to do facts and figures --- .../ha_15_32/ha_analysis_batch_3.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 3edc1490..15a4f438 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -174,6 +174,7 @@ class DataLoader: "HA2": 0, "HA6": 117, "HA12": 6, + "HA13": 119, "HA14": 3, "HA15": 3, "HA16": 7, @@ -1621,6 +1622,30 @@ class DataLoader: "ALLANDALE ROAD", "ALLANDALE" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "NEWFIELDS LANE", "NEWFIELD LANE" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BROADFIELDS ROAD", "BROADFIELD ROAD" + ) + + survey_list["Post Code"] = survey_list["Post Code"].str.replace( + "HP2 5SF+", "HP2 5SF" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "PESCOTT HILL", "PESCOT HILL" + ) + + # This is a duplicate record + survey_list = survey_list[ + ~((survey_list["NO."] == 33) & + (survey_list["Street / Block Name"] == "Turners Hill") & + (survey_list["Post Code"] == "HP2 4LH") & + (survey_list["INSTALLED OR CANCELLED"] == "NO UPDATE - CHECKED 18.12.23")) + ] + return survey_list @staticmethod @@ -1652,6 +1677,9 @@ class DataLoader: postcode.lower() not in asset_list["matching_postcode"].values ] + if ha_name == "HA13": + missed_postcodes = ["hp17 8le"] + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): From f03485d4f49045e8f68cf7a8dcc5caf58113ede1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 14:41:38 +0000 Subject: [PATCH 111/155] updating facts and figures to treat archetype dependent properties separately --- .../ha_15_32/ha_analysis_batch_3.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 15a4f438..c0f3ab12 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2172,13 +2172,12 @@ class DataLoader: eco_eligibility_map = { "not eligble": "not eligible", "eco 4(subject to ciga)": "eco4 (subject to ciga)", - "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga)", - "eco4 (subject to archetype check)": "eco4 (subject to ciga)", - "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga)", + "eco4 (subject to ciga/archetype check": "eco4 (subject to ciga) (subject to archetype)", + "eco4 (subject to archetype check)": "eco4 (subject to archetype)", + "eco4 (subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)", "eco4 (subject to ciga)": "eco4 (subject to ciga)", "eco4(subject to ciga)": "eco4 (subject to ciga)", "eco4 subject to ciga": "eco4 (subject to ciga)", - "eco4 (subject to archetype)": "eco4 (subject to ciga)", } ha_facts_and_figures = [] @@ -2330,7 +2329,7 @@ class DataLoader: asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id") # Update the cases where properties have sold, but are missing a CIGA check asset_list["ECO Eligibility"] = np.where( - (asset_list["ECO Eligibility"] == "eco4 (subject to ciga)") & ( + (asset_list["ECO Eligibility"].str.contains("(subject to ciga)")) & ( asset_list["has_a_survey_record"] == True ), "eco4 - passed ciga", @@ -2349,7 +2348,14 @@ class DataLoader: # Update the cases where a property was marked as eligible for ECO4, but sold for GBIS asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"].isin( - ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + [ + "eco4", + "eco4 (subject to ciga)", + "eco4 - passed ciga", + "failed ciga", + "eco4 (subject to archetype)", + "eco4 (subject to ciga) (subject to archetype)" + ] )) & ( asset_list["installation_status"].isin( ["GBIS - installed", "GBIS - cancelled", "GBIS - in progress"] From c1a15052f246288c5216e2c80849ccef3b2c6be0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 14:46:26 +0000 Subject: [PATCH 112/155] Handling warning for regex searching of (subject to ciga) --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index c0f3ab12..430e5ff7 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2329,7 +2329,7 @@ class DataLoader: asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id") # Update the cases where properties have sold, but are missing a CIGA check asset_list["ECO Eligibility"] = np.where( - (asset_list["ECO Eligibility"].str.contains("(subject to ciga)")) & ( + (asset_list["ECO Eligibility"].str.contains("subject to ciga")) & ( asset_list["has_a_survey_record"] == True ), "eco4 - passed ciga", From b46da0f6c0140b28d00385f02f29cae91f412b2d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 15:48:51 +0000 Subject: [PATCH 113/155] adding in archetype check process to model --- .../ha_15_32/ha_analysis_batch_3.py | 99 +++++++++++++++---- 1 file changed, 82 insertions(+), 17 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 430e5ff7..9a959956 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3658,19 +3658,47 @@ def patch_cleaned(cleaned): def calculate_eco4_post_ciga( eligiblity_counts, input_data, ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate, - eco4_rate + eco4_rate, archetype_conversion_rate ): remaining_needing_ciga_check = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"] == "eco4 (subject to ciga)" + eligiblity_counts["ECO Eligibility"].str.contains("subject to ciga") & + ~eligiblity_counts["ECO Eligibility"].str.contains("subject to archetype") ]["count"].sum() + remaining_needing_ciga_and_archetype_check = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"].str.contains("subject to ciga") & + eligiblity_counts["ECO Eligibility"].str.contains("subject to archetype") + ]["count"].sum() + # We scale this down by the archetype_conversion_rate, and add this on to the remaining_needing_ciga_check + remaining_needing_ciga_and_archetype_check_passed = np.round( + remaining_needing_ciga_and_archetype_check * archetype_conversion_rate + ) + + remaining_needing_ciga_check += remaining_needing_ciga_and_archetype_check_passed + + eco4_no_ciga_needed = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4" + ]["count"].sum() + + eco4_no_ciga_archetype_needed = eligiblity_counts[ + eligiblity_counts["ECO Eligibility"] == "eco4 (subject to archetype)" + ]["count"].sum() + eco4_no_ciga_archetype_needed_passed = np.round( + eco4_no_ciga_archetype_needed * archetype_conversion_rate + ) + + eco4_no_ciga_needed += eco4_no_ciga_archetype_needed_passed + + failed_archetype_check = int( + remaining_needing_ciga_and_archetype_check + + eco4_no_ciga_archetype_needed - + remaining_needing_ciga_and_archetype_check_passed - + eco4_no_ciga_archetype_needed_passed + ) + has_ciga_check = not input_data["ciga_list"].empty if has_ciga_check: - eco4_no_ciga_needed = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"] == "eco4" - ]["count"].sum() - eco4_ciga_passed = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "eco4 - passed ciga" ]["count"].sum() @@ -3681,8 +3709,10 @@ def calculate_eco4_post_ciga( eco4_no_ciga_needed_or_ciga_passed = eco4_no_ciga_needed + eco4_ciga_passed - eco4_confirmed = (eco4_no_ciga_needed * ha_eco4_to_sale_rate) + (eco4_ciga_passed * ha_ciga_pass_to_sale_rate) - eco4_confirmed = np.round(eco4_confirmed) + eco4_confirmed = np.round( + (eco4_no_ciga_needed * ha_eco4_to_sale_rate) + + (eco4_ciga_passed * ha_ciga_pass_to_sale_rate) + ) eco4_no_ciga_needed_cancellations = int(eco4_no_ciga_needed_or_ciga_passed - eco4_confirmed) @@ -3704,9 +3734,7 @@ def calculate_eco4_post_ciga( eco4_expected_cancellations = eco4_no_ciga_needed_cancellations + eco4_ciga_needed_cancellations else: - eco4_no_ciga_needed = eligiblity_counts[ - eligiblity_counts["ECO Eligibility"] == "eco4" - ]["count"].sum() + eco4_confirmed_ciga_failures = 0 # Multiply by sale conversion eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate) @@ -3735,6 +3763,9 @@ def calculate_eco4_post_ciga( "ECO4 - post CIGA - £": eco4_post_ciga * eco4_rate, "Of which confirmed - £": eco4_confirmed * eco4_rate, "Of which forecast - £": eco4_remaining_forecast * eco4_rate, + # Archetype check failures + "Estimated total - failed archetype check - #": failed_archetype_check, + "Estimated total - failed archetype check - £": failed_archetype_check * eco4_rate, # Ciga failures "Estimated total - failed CIGA": int(eco4_confirmed_ciga_failures + eco4_estimated_ciga_failures), "Confirmed CIGA failures": eco4_confirmed_ciga_failures, @@ -3766,6 +3797,14 @@ def forecast_remaining_sales(loader): gbis_rate = 600 eco4_rate = 1710 + # Based on ONS https://www.ons.gov.uk/peoplepopulationandcommunity/housing/bulletins/housingenglandandwales + # /census2021 + # there are 5.7 million terraced properties in the UK, of the 19.3 million houses or bungalows. We therefore apply + # a 30% discount to homes that are dependent on an archetype check, since around 30% of them will be mid terraced + # This 30% is slightly harsh but we be conservative + # Therefore, the archetype check conversion rate is 70% + archetype_conversion_rate = 0.7 + # 1) Calculate the conversion rate from passed CIGA to actual sale converted_ciga_jobs = [] for ha_name, input_data in loader.data.items(): @@ -4010,13 +4049,27 @@ def forecast_remaining_sales(loader): eco4_pre_ciga = eligiblity_counts[ eligiblity_counts["ECO Eligibility"].isin( - ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + [ + "eco4", + "eco4 (subject to ciga)", + "eco4 - passed ciga", + "failed ciga", + "eco4 (subject to ciga) (subject to archetype)", + "eco4 (subject to archetype)" + ] ) ]["count"].sum() eco4_pre_ciga_remaining = eligiblity_counts_remaining[ eligiblity_counts_remaining["ECO Eligibility"].isin( - ["eco4", "eco4 (subject to ciga)", "eco4 - passed ciga", "failed ciga"] + [ + "eco4", + "eco4 (subject to ciga)", + "eco4 - passed ciga", + "failed ciga", + "eco4 (subject to ciga) (subject to archetype)", + "eco4 (subject to archetype)" + ] ) ]["count"].sum() @@ -4065,7 +4118,8 @@ def forecast_remaining_sales(loader): ha_ciga_conversion_rate=ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate=ha_eco4_to_sale_rate, - eco4_rate=eco4_rate + eco4_rate=eco4_rate, + archetype_conversion_rate=archetype_conversion_rate ) eco4_post_ciga_remaining_results = calculate_eco4_post_ciga( @@ -4074,7 +4128,8 @@ def forecast_remaining_sales(loader): ha_ciga_conversion_rate=ha_ciga_conversion_rate, ha_ciga_pass_to_sale_rate=ha_ciga_pass_to_sale_rate, ha_eco4_to_sale_rate=ha_eco4_to_sale_rate, - eco4_rate=eco4_rate + eco4_rate=eco4_rate, + archetype_conversion_rate=archetype_conversion_rate ) # Calculate the delta compared to Warmfront's original remaining @@ -4111,6 +4166,8 @@ def forecast_remaining_sales(loader): gbis_remaining = int(np.round(gbis_remaining * ha_gbis_sale_conversion)) gbis_remaining_revenue = int(gbis_remaining * gbis_rate) + survey_list["installation_status"].value_counts() + # GBIS delta if original_warmfront_remaining_gbis == 0: gbis_delta_vs_original_estimate_remaining = "N/A" @@ -4176,7 +4233,7 @@ def forecast_remaining_sales(loader): surveys_with_eligibility["installation_status"] == "GBIS - cancelled" ].shape[0] - expected_gbis_unconfirmed_sales = incomplete_gbis_sales * ha_gbis_sale_conversion + expected_gbis_unconfirmed_sales = np.round(incomplete_gbis_sales * ha_gbis_sale_conversion) gbis_expected_cancellations = int(incomplete_gbis_sales - expected_gbis_unconfirmed_sales) @@ -4187,10 +4244,12 @@ def forecast_remaining_sales(loader): # Add in the variance: # We should expect that the pre-ciga total is: # 1) The number of post CIGA successes + + # 2) The number of archetype failures + # 2) the number of CIGA failures + # 3) The number of cancellations variance_total = eco4_pre_ciga - ( eco4_post_ciga_total_results["ECO4 - post CIGA - #"] + + eco4_post_ciga_total_results["Estimated total - failed archetype check - #"] + eco4_post_ciga_total_results['Estimated total - failed CIGA'] + eco4_post_ciga_total_results["Expected cancellations - #"] ) @@ -4199,6 +4258,7 @@ def forecast_remaining_sales(loader): variance_remaining = eco4_pre_ciga_remaining - ( eco4_post_ciga_remaining_results["ECO4 - post CIGA - #"] + + eco4_post_ciga_remaining_results["Estimated total - failed archetype check - #"] + eco4_post_ciga_remaining_results['Estimated total - failed CIGA'] + eco4_post_ciga_remaining_results["Expected cancellations - #"] ) @@ -4290,6 +4350,11 @@ def forecast_remaining_sales(loader): ("ECO4 Cancellations", "", "Of which expected cancellations - £", ""): eco4_post_ciga_remaining_results[ "Expected cancellations - £" ], + # Archetype check failures + ("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - #", ""): + eco4_post_ciga_remaining_results['Estimated total - failed archetype check - #'], + ("ECO4 CIGA failures", "", "Estimated total - failed Archetype check - £", ""): + eco4_post_ciga_remaining_results['Estimated total - failed archetype check - £'], # CIGA failures ("ECO4 CIGA failures", "", "Estimated total - failed CIGA - #", ""): eco4_post_ciga_remaining_results[ 'Estimated total - failed CIGA' @@ -4324,7 +4389,7 @@ def forecast_remaining_sales(loader): } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 47: + if len(to_append) != 49: raise ValueError("Something went wrong") results.append(to_append) From a7e593ecd9289551d7ef47481ea3dff0c2a70592 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 16:15:16 +0000 Subject: [PATCH 114/155] Added handling of archetype checks and corrected gbis calculations --- .../ha_15_32/ha_analysis_batch_3.py | 65 ++++++++++++++----- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9a959956..aca2ce43 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -4154,19 +4154,25 @@ def forecast_remaining_sales(loader): else: ha_gbis_sale_conversion = median_gbis_to_install - gbis_total = eligiblity_counts[ + gbis_total_pre_cancellations = eligiblity_counts[ eligiblity_counts["ECO Eligibility"] == "gbis" ]["count"].sum() - gbis_total = int(np.round(gbis_total * ha_gbis_sale_conversion)) - gbis_total_revenue = int(gbis_total * gbis_rate) - gbis_remaining = eligiblity_counts_remaining[ + gbis_total_pre_cancellations_revenue = gbis_total_pre_cancellations * gbis_rate + # gbis_total = int(np.round(gbis_total_pre_cancellations * ha_gbis_sale_conversion)) + # gbis_total_revenue = int(gbis_total * gbis_rate) + + gbis_remaining_pre_cancellations = eligiblity_counts_remaining[ eligiblity_counts_remaining["ECO Eligibility"] == "gbis" ]["count"].sum() - gbis_remaining = int(np.round(gbis_remaining * ha_gbis_sale_conversion)) + gbis_remaining_pre_cancellations_revenue = ( + gbis_remaining_pre_cancellations * gbis_rate + ) + # This is the gbis jobs we expect to sell + gbis_remaining = int(np.round(gbis_remaining_pre_cancellations * ha_gbis_sale_conversion)) gbis_remaining_revenue = int(gbis_remaining * gbis_rate) - - survey_list["installation_status"].value_counts() + # This is the number we expect to cancel + gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining) * gbis_rate # GBIS delta if original_warmfront_remaining_gbis == 0: @@ -4179,9 +4185,10 @@ def forecast_remaining_sales(loader): # Current sales figures # For any sales surveys that are complete, that could still cancel, we apply a conversion rate eco4_actually_sold = 0 - gbis_actually_sold = 0 eco4_confirmed_cancellations = 0 eco4_expected_cancellations = 0 + + gbis_actually_sold = 0 gbis_confirmed_cancellations = 0 gbis_expected_cancellations = 0 if not survey_list.empty: @@ -4284,17 +4291,30 @@ def forecast_remaining_sales(loader): raise ValueError("Something went wrong in pre_ciga_eco4_variance") # Check GBIS total variance - gbis_variance = ( - gbis_total_revenue - - gbis_actually_sold - - gbis_confirmed_cancellations * gbis_rate - - gbis_expected_cancellations * gbis_rate - - gbis_remaining_revenue + # The total before cancellations should equal: + # The number of sold + + # The number of confirmed cancelled + + # The number of expected cancelled + + # The number of remaining + gbis_variance = gbis_total_pre_cancellations - ( + gbis_actually_sold / gbis_rate + + gbis_confirmed_cancellations + + gbis_expected_cancellations + + gbis_remaining_pre_cancellations ) if gbis_variance != 0: raise ValueError("Something went wrong in gbis_variance") + # We expect the remaining to equal expected sales + expected cancellations + gbis_variance_2 = gbis_remaining_pre_cancellations - ( + gbis_remaining + + gbis_remaining_expected_cancellations + ) + + if gbis_variance_2 != 0: + raise ValueError("Something went wrong in gbis_variance") + to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures @@ -4375,17 +4395,26 @@ def forecast_remaining_sales(loader): "Estimated CIGA failures - £" ], # GBIS postcode list - ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total, - ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): gbis_total_revenue, + ("GBIS Postcode list", "Warmfront post code list", "Total - #", "GBIS total"): gbis_total_pre_cancellations, + ("GBIS Postcode list", "Warmfront post code list", "Total - £", "GBIS total"): + gbis_total_pre_cancellations_revenue, ("GBIS Postcode list", "Warmfront post code list", "GBIS VARIANCE", "GBIS total"): gbis_variance, ("GBIS Postcode list", "Warmfront post code list", "Sold - £", "GBIS total"): gbis_actually_sold, ("GBIS Postcode list", "", "Confirmed cancellations - £", ""): gbis_confirmed_cancellations * gbis_rate, # This is for jobs that are in-progress and could still cancel ("GBIS Postcode list", "", "Unconfirmed cancellations - £", ""): gbis_expected_cancellations * gbis_rate, - ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): gbis_remaining, - ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): gbis_remaining_revenue, + ("GBIS Postcode list", "Warmfront post code list", "Remaining - #", "GBIS total"): + gbis_remaining_pre_cancellations, + ("GBIS Postcode list", "Warmfront post code list", "Remaining - £", "GBIS total"): + gbis_remaining_pre_cancellations_revenue, ("GBIS Postcode list", "", "Delta vs original estimate, remaining - %", ""): gbis_delta_vs_original_estimate_remaining, + # Expected cancellations + ( + "GBIS Postcode list", "Of which expected sales - £", "Remaining - £", + "GBIS total"): gbis_remaining_revenue, + ("GBIS Postcode list", "Of which expected cancellations -£", "Remaining - £", "GBIS total"): + gbis_remaining_expected_cancellations } # Make sure nothing is forgotten due to duplicate multi-index keys From f9957a55d066a294e79efdf196b72e79d82689fb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 16:19:54 +0000 Subject: [PATCH 115/155] fixed bug in gbis variance 2? --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index aca2ce43..a25f98c6 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -4172,7 +4172,8 @@ def forecast_remaining_sales(loader): gbis_remaining = int(np.round(gbis_remaining_pre_cancellations * ha_gbis_sale_conversion)) gbis_remaining_revenue = int(gbis_remaining * gbis_rate) # This is the number we expect to cancel - gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining) * gbis_rate + gbis_remaining_expected_cancellations = int(gbis_remaining_pre_cancellations - gbis_remaining) + gbis_remaining_expected_cancellations_revenue = gbis_remaining_expected_cancellations * gbis_rate # GBIS delta if original_warmfront_remaining_gbis == 0: @@ -4313,7 +4314,7 @@ def forecast_remaining_sales(loader): ) if gbis_variance_2 != 0: - raise ValueError("Something went wrong in gbis_variance") + raise ValueError("Something went wrong in gbis_variance2") to_append = { ("", "", "", "HA Name"): ha_name, @@ -4414,7 +4415,7 @@ def forecast_remaining_sales(loader): "GBIS Postcode list", "Of which expected sales - £", "Remaining - £", "GBIS total"): gbis_remaining_revenue, ("GBIS Postcode list", "Of which expected cancellations -£", "Remaining - £", "GBIS total"): - gbis_remaining_expected_cancellations + gbis_remaining_expected_cancellations_revenue } # Make sure nothing is forgotten due to duplicate multi-index keys From 1ccb2cdebdca9a2fc17f0b11ef431bac81309357 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 16:22:28 +0000 Subject: [PATCH 116/155] updated number of expected to append --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a25f98c6..7ddc9844 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -4419,7 +4419,7 @@ def forecast_remaining_sales(loader): } # Make sure nothing is forgotten due to duplicate multi-index keys - if len(to_append) != 49: + if len(to_append) != 51: raise ValueError("Something went wrong") results.append(to_append) From 768a0385e3a2cf7fc29b86b827cfb43d914e4621 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 17:02:33 +0000 Subject: [PATCH 117/155] ha35 data read --- .../ha_15_32/ha_analysis_batch_3.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7ddc9844..ea0078c2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -276,6 +276,13 @@ class DataLoader: asset_list["POST CODE"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["POST CODE"].astype(str).str.lower().str.strip() + elif ha_name == "HA35": + asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Post Code"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Address Post Code"].astype(str).str.lower().str.strip() elif ha_name == "HA38": asset_list["matching_address"] = asset_list["House_Number"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Address_Line_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -1648,6 +1655,13 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha35_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BALLADIER WLAK", "BALLADIER WALK" + ) + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -4673,14 +4687,14 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA39", - "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" + "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA35", + "HA39", "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE] - # 13 [WIP] + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE] + # 35 [WIP] # Consider for ECO4: - # Consider for GBIS: 56, 35, 34 + # Consider for GBIS: 56, 34 # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in # Filter down the directories to only the priority HAs From 29f2a2abf801e4c01ad89383b18eaac4ed97b0af Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 17:09:43 +0000 Subject: [PATCH 118/155] HA35 done --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index ea0078c2..04ee343c 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -4691,8 +4691,9 @@ def app(): "HA39", "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come - # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE] - # 35 [WIP] + # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], + # 35 [DONE] + # 34 [WIP] # Consider for ECO4: # Consider for GBIS: 56, 34 # Ignore for now: From 6e4fc23ecc2036e14148b18611cb04aafde8084b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 18:12:12 +0000 Subject: [PATCH 119/155] fixed dupes for HA34 --- .../ha_15_32/ha_analysis_batch_3.py | 104 +++++++++++++++++- 1 file changed, 98 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 04ee343c..8784481b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -276,6 +276,12 @@ class DataLoader: asset_list["POST CODE"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["POST CODE"].astype(str).str.lower().str.strip() + elif ha_name == "HA34": + asset_list["matching_address"] = ( + asset_list[" Address"].astype(str).str.lower().str.strip() + ", " + + asset_list[" Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list[" Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA35": asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \ @@ -566,7 +572,8 @@ class DataLoader: eco3_list["eco3_list_row_id"] = [ha_name + "_Eco3_" + str(i) for i in range(0, len(eco3_list))] # Perform the eco3 merge - eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name) + if not eco3_list.empty: + eco3_list = self.merge_eco3_to_assets(asset_list, eco3_list, ha_name) if ha_name in ["HA25"]: # Accomodate ha25 unique structure @@ -1657,9 +1664,94 @@ class DataLoader: @staticmethod def correct_ha35_survey_list(survey_list): - survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( - "BALLADIER WLAK", "BALLADIER WALK" + return survey_list + + @staticmethod + def correct_ha34_survey_list(survey_list): + # Note in the asset list + survey_list = survey_list[ + survey_list["Post Code"] != "L5 3SS" + ] + + survey_list["Post Code"] = survey_list["Post Code"].str.replace( + "L177DR", "L17 7DR" ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "PENVALLEY CRESENT", "Penvalley Crescent" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "PENLINKEN DRIVE", "Penlinken Drive" + ) + + # There's no 32 Penlinken Drive in the asset sheet + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Penlinken Drive") & + (survey_list["NO."] == 32)) + ] + + # There's no 30 Gwent Street in the asset sheet + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "GWENT ST") & + (survey_list["NO."] == 30)) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "POULTON RD", "Poulton Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ST PAULS RD", "St Pauls Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BROAD LANE, KIRKBY", "BROAD LANE" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BULLENS RD, KIRKBY", "Bullens Road" + ) + + # There's no 219 NORTH HILL ST in the asset sheet + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "NORTH HILL ST") & + (survey_list["NO."] == 219)) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "CROSLAND RD, KIRKBY", "CROSLAND ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "PARK BROW DRIVE, KIRKBY", "Park Brow Drive" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "CELTIC TREET", "Celtic Street" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BUCKLAND ROAD", "Buckland Street" + ) + + # duplicates + survey_list = survey_list.drop_duplicates(["Street / Block Name", "NO.", "Post Code"]) + + # This is a duplicate with wrong postcode + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "CLARIBEL STREET") & + (survey_list["NO."] == 7) & + (survey_list["Post Code"] == "L8 8AF")) + ] + + survey_list["NO."] = np.where( + ((survey_list["NO."] == "187 A") & + (survey_list["Post Code"] == "L32 6QF")), + "187A", + survey_list["NO."] + ) + return survey_list @staticmethod @@ -1685,7 +1777,7 @@ class DataLoader: survey_list = survey_list_correction_function(survey_list) missed_postcodes = [] - if ha_name == "HA6": + if ha_name in ["HA6", "HA34"]: missed_postcodes = [ postcode.lower() for postcode in survey_list["Post Code"] if postcode.lower() not in asset_list["matching_postcode"].values @@ -4687,8 +4779,8 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA35", - "HA39", "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" + "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA34", + "HA35", "HA39", "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], From 27fed2dce320a54a049df279fca5c3abd407275f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 9 Mar 2024 18:25:22 +0000 Subject: [PATCH 120/155] temp removed HA34 due to issue --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 8784481b..d1f8d546 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2270,6 +2270,11 @@ class DataLoader: "ECO4 AFFORDABLE WARMTH": "ECO4", "Affordable Warmth": "ECO4", "ECO4 GBIS (ECO+) JJC UNDER 73m² ": "GBIS", + "ECO4 PPS": "ECO4", + "AFFORDABLE WARMTH / REMEDIAL": "ECO4", + "AFF0RDALE WARMTH": "ECO4", + "ECO 4 RdSAP CL": "ECO4", + "Affordable Warmth (R) ": "ECO4" } # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we @@ -4779,15 +4784,17 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", "HA34", + "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", + # "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], # 35 [DONE] - # 34 [WIP] + # [WIP] # Consider for ECO4: - # Consider for GBIS: 56, 34 + # Consider for GBIS: 56 + # 34 [bug in the results so leaving out for the moment] # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in # Filter down the directories to only the priority HAs From 28434f43c8fd9dac176fd68a1b4e20a79a128e9d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 13:55:44 +0000 Subject: [PATCH 121/155] ha56 wip --- .../ha_15_32/ha_analysis_batch_3.py | 90 +++++++++++++++++-- 1 file changed, 83 insertions(+), 7 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d1f8d546..064ff8f5 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -317,6 +317,12 @@ class DataLoader: asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Post Code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() + elif ha_name == "HA56": + asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Post Code"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() elif ha_name == "HA63": asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["POSTCODE"].astype(str).str.lower().str.strip() @@ -639,6 +645,54 @@ class DataLoader: return asset_list + @staticmethod + def correct_ha56_asset_list(asset_list): + # CH1 4JR has already been surveyed, but it's listed in the asset list + # as a single row, when it's actually 32 units, so we just set this + # as ineligible + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "CH1 4JR", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + # Same for CW8 3EU + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "CW8 3EU", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "CW1 3HP", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "WA4 2PH", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "BD6 1QJ", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "L39 1RS", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "WA10 2DE", + "Not eligible", + asset_list["ECO Eligibility"] + ) + @staticmethod def correct_ha14_asset_list(asset_list): @@ -1970,6 +2024,24 @@ class DataLoader: return eco3_list + @staticmethod + def correct_ha56_eco3_list(eco3_list): + eco3_list = eco3_list[~pd.isnull(eco3_list["Post Code"])] + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "Mount Pleasant, Crewe", "Mount Pleasant" + ) + + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "Dutton Close", "Dutton Way" + ) + + eco3_list["Post Code"] = eco3_list["Post Code"].str.replace( + "Ls63nl", "LS6 3NL" + ) + + return eco3_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") @@ -1978,8 +2050,8 @@ class DataLoader: asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower() eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "") - if ha_name == "HA25": - # 317 -> 259 + if ha_name in ["HA25", "HA56"]: + # HA25: 317 -> 259 missed_postcodes = { postcode for postcode in eco3_list["postcode_no_space"] if postcode not in asset_list["matching_postcode_nospace"].values @@ -2060,6 +2132,7 @@ class DataLoader: raise ValueError( f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) + missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on @@ -3896,6 +3969,9 @@ def calculate_eco4_post_ciga( def forecast_remaining_sales(loader): + # TODO: Skip HA34 for the moment + loader.data = {k: v for k, v in loader.data.items() if k != "HA34"} + # Assumptions: # We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate # and I don't want the numbers to change too much, depenent on the CIGA conversation rate @@ -4523,9 +4599,9 @@ def forecast_remaining_sales(loader): gbis_delta_vs_original_estimate_remaining, # Expected cancellations ( - "GBIS Postcode list", "Of which expected sales - £", "Remaining - £", + "GBIS Postcode list", "", "Of which expected sales - £ - £", "GBIS total"): gbis_remaining_revenue, - ("GBIS Postcode list", "Of which expected cancellations -£", "Remaining - £", "GBIS total"): + ("GBIS Postcode list", "", "Of which expected cancellations -£", "GBIS total"): gbis_remaining_expected_cancellations_revenue } @@ -4786,14 +4862,14 @@ def app(): priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", # "HA34", - "HA35", "HA39", "HA41", "HA48", "HA50", "HA63", "HA107", "HA117" + "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], # 35 [DONE] - # [WIP] + # 56 [WIP] # Consider for ECO4: - # Consider for GBIS: 56 + # Consider for GBIS: # 34 [bug in the results so leaving out for the moment] # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in From db7b6de87bfb13486a179cbdc547ae375cfc0c8d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 14:13:20 +0000 Subject: [PATCH 122/155] handle HA56 dupes --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 064ff8f5..62099386 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -189,6 +189,7 @@ class DataLoader: "HA25": 154, "HA41": 26, "HA50": 5, + "HA56": 320, "HA63": 0, "HA117": 4 } @@ -693,6 +694,8 @@ class DataLoader: asset_list["ECO Eligibility"] ) + return asset_list + @staticmethod def correct_ha14_asset_list(asset_list): @@ -2040,6 +2043,14 @@ class DataLoader: "Ls63nl", "LS6 3NL" ) + # Handle a duplicate + eco3_list = eco3_list[ + ~((eco3_list["Street / Block Name"] == "Mount Pleasant") & + (eco3_list["Post Code"] == "CW1 3JF") & + (eco3_list["NO "] == 5) & + (eco3_list["INSTALL/ CANCELLATION DATE"] == "CANCELLED 20.5.2022")) + ] + return eco3_list def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): @@ -2128,15 +2139,16 @@ class DataLoader: # HA25 contains 119 missed entries. These are actually 24 unique postcodes, and the majority belong to 2 # where many surveys were conducted on house numbers, not in the asset list # 154 missed, 2827 matched for HA 25 + # For HA56, the number of missed is high at 320, however a big portion of these are due to the block being + # listed in the asset list, and individual units being in the survey list if len(missed) != self.UNMATCHED_ECO3[ha_name]: raise ValueError( f"Unmatched addresses for {ha_name} is not as expected, got {len(missed)} unmatched" ) - missed_df = eco3_list[eco3_list["eco3_list_row_id"].isin(missed)] matching_lookup = pd.DataFrame(matching_lookup) # Check dupes as this will cause problems later on - if matching_lookup["asset_list_row_id"].duplicated().any(): + if matching_lookup["asset_list_row_id"].duplicated().sum(): raise ValueError("Duplicated asset list row ids") # Merge onto eco3 list From 8b3f4d3a520f9148195c6fbd55d3b1d7354d0ee1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 14:25:47 +0000 Subject: [PATCH 123/155] ha56 survey list matching --- .../ha_15_32/ha_analysis_batch_3.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 62099386..f9bf3856 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -694,6 +694,20 @@ class DataLoader: asset_list["ECO Eligibility"] ) + # Already surveyed under ECO4 + asset_list["ECO Eligibility"] = np.where( + asset_list["Post Code"] == "SK17 6NR", + "Not eligible", + asset_list["ECO Eligibility"] + ) + + asset_list["ECO Eligibility"] = np.where( + ~((asset_list["Post Code"] == "WA5 0EN") & + (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")), + "Not eligible", + asset_list["ECO Eligibility"] + ) + return asset_list @staticmethod @@ -1811,6 +1825,29 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha56_survey_list(survey_list): + # Not in asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Samual Street") & + (survey_list["NO."].isin([22, 24])) & + (survey_list["Post Code"] == "WA5 1BB")) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "STOURTON RD", "Stourton Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "BIRKIN RD", "Birkin Road" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "PORTLAND RD", "Portland Road" + ) + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -1843,6 +1880,10 @@ class DataLoader: if ha_name == "HA13": missed_postcodes = ["hp17 8le"] + if ha_name == "HA56": + # Multiple properties are listed as blocks, which is a problem for matching + missed_postcodes = ["sk17 6nr", "wa5 0en"] + matching_lookup = [] for _, row in tqdm(survey_list.iterrows(), total=len(survey_list)): @@ -1890,6 +1931,19 @@ class DataLoader: df = df[df["HouseNo"].astype(str).str.lower() == str(house_number)] if df.shape[0] != 1: df = df[df["matching_postcode"].str.lower().str.contains(row["Post Code"].lower().strip())] + + if df.empty: + + postcode_lower = row["Post Code"].lower() + if postcode_lower in missed_postcodes: + matching_lookup.append( + { + "survey_list_row_id": row["survey_list_row_id"], + "asset_list_row_id": None, + } + ) + continue + if df.shape[0] != 1: if "Town/Area" not in row.keys(): full_key = (str(row["NO."]).lower().strip() + row["Street / Block Name"].lower().strip() + From 4a6711a1403a8661b467a0f7023151829e305822 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 14:35:08 +0000 Subject: [PATCH 124/155] handling ha56 dupes| --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index f9bf3856..0030af9d 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1846,6 +1846,13 @@ class DataLoader: "PORTLAND RD", "Portland Road" ) + # We remove a row, because two rows match to a block listing + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Tavlin Avenue") & + (survey_list["NO."] == 17) & + (survey_list["Post Code"] == "WA5 0EN")) + ] + return survey_list @staticmethod From ba65b6c8e37e5a44492c3342a05513d05d275ac4 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 14:39:15 +0000 Subject: [PATCH 125/155] fixed bug in asset list cleaning --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 0030af9d..b1eda326 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -702,8 +702,8 @@ class DataLoader: ) asset_list["ECO Eligibility"] = np.where( - ~((asset_list["Post Code"] == "WA5 0EN") & - (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")), + ((asset_list["Post Code"] == "WA5 0EN") & + (asset_list["Address 1"] == "Block 17-26 Tavlin Avenue")), "Not eligible", asset_list["ECO Eligibility"] ) From 5eb938bf54fbaaf52bb72e7c8972bad5e2d58a46 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 15:40:02 +0000 Subject: [PATCH 126/155] ha18 done --- .../ha_15_32/ha_analysis_batch_3.py | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index b1eda326..676bd613 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -249,6 +249,20 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA18": + asset_list["matching_address"] = ( + asset_list["Address"].astype(str).str.lower().str.strip() + ", " + + asset_list["Post Code"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() + elif ha_name == "HA19": + asset_list["matching_address"] = ( + asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA25": asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] @@ -495,6 +509,8 @@ class DataLoader: return "CIGA checks" elif "CIGA check" in workbook.sheetnames: return "CIGA check" + elif "CIGA Check" in workbook.sheetnames: + return "CIGA Check" elif "CIGA requested" in workbook.sheetnames: return "CIGA requested" else: @@ -1733,6 +1749,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha18_survey_list(survey_list): + return survey_list + @staticmethod def correct_ha35_survey_list(survey_list): return survey_list @@ -2435,6 +2455,7 @@ class DataLoader: "eco4 (subject to ciga)": "eco4 (subject to ciga)", "eco4(subject to ciga)": "eco4 (subject to ciga)", "eco4 subject to ciga": "eco4 (subject to ciga)", + "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)", } ha_facts_and_figures = [] @@ -4933,14 +4954,15 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA24", "HA25", "HA28", "HA32", + "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", + "HA19", "HA24", "HA25", "HA28", "HA32", # "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], - # 35 [DONE] - # 56 [WIP] + # 35 [DONE], 56 [DONE], 19 [DONE] + # # Consider for ECO4: # Consider for GBIS: # 34 [bug in the results so leaving out for the moment] From 5b39cf138df458b749d13fd100de011e6f3ac350 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 15:52:33 +0000 Subject: [PATCH 127/155] ha9 data load --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 676bd613..88ab706b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -173,6 +173,7 @@ class DataLoader: UNMATCHED_CIGA = { "HA2": 0, "HA6": 117, + "HA9": 0, "HA12": 6, "HA13": 119, "HA14": 3, @@ -226,6 +227,14 @@ class DataLoader: asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA9": + asset_list["matching_address"] = asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA13": asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["address 2"].astype(str).str.lower().str.strip() + ", " + \ @@ -430,7 +439,7 @@ class DataLoader: :return: """ - if ha_name in ["HA107"]: + if ha_name == "HA107": asset_list["HouseNo"] = asset_list["House No"].copy() elif ha_name == "HA32": asset_list["HouseNo"] = asset_list["Dwelling num"].copy() @@ -438,6 +447,8 @@ class DataLoader: asset_list["HouseNo"] = asset_list["House Number"].copy() elif ha_name == "HA38": asset_list["HouseNo"] = asset_list["House_Number"].copy() + elif ha_name == "HA9": + asset_list["HouseNo"] = asset_list["House Number"].copy() else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) @@ -4954,7 +4965,7 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", + "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", "HA28", "HA32", # "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" From efbda5cece019d8518b770c0ace444c1179a1d6a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 16:09:08 +0000 Subject: [PATCH 128/155] ha27 complete --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 88ab706b..fba30f1f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -280,6 +280,12 @@ class DataLoader: asset_list["matching_postcode"] = asset_list['matching_address'].apply( lambda x: ' '.join(x.split()[-2:]) if pd.notnull(x) else x ) + elif ha_name == "HA27": + asset_list["matching_address"] = ( + asset_list[" Address"].astype(str).str.lower().str.strip() + ", " + + asset_list[" Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list[" Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA28": asset_list["matching_address"] = ( asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + @@ -582,7 +588,7 @@ class DataLoader: # For HA1 and HA25, there is an exception in the structure of the data. We don't have any survey or ciga # lists, and so # we can return the asset list now - if ha_name in ["HA1"]: + if ha_name in ["HA1", "HA27"]: return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame() # If we have ECO3 surveys, we need to match them, because any properties treated under ECO3 won't be @@ -4966,13 +4972,13 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", - "HA19", "HA24", "HA25", "HA28", "HA32", + "HA19", "HA24", "HA25", "HA27", "HA28", "HA32", # "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], - # 35 [DONE], 56 [DONE], 19 [DONE] + # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 DONE # # Consider for ECO4: # Consider for GBIS: From 22f3aca336abafc164439f00ddbdf34649f4f28a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 16:26:42 +0000 Subject: [PATCH 129/155] ha30 32% matched --- .../ha_15_32/ha_analysis_batch_3.py | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index fba30f1f..bdb0d0c4 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -164,6 +164,10 @@ class DataLoader: "address": "T1_Address", "postcode": "matching_postcode" }, + "HA30": { + "address": "A_Address", + "postcode": "A_Postcode" + }, "HA48": { "address": "Full Address", "postcode": "Postcode" @@ -207,7 +211,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA48"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA48"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -1892,6 +1896,27 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha30_survey_list(survey_list): + + survey_list = survey_list[~pd.isnull(survey_list["Post Code"])] + + # Split on / and take the first half + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.split("/").str[0] + + # Not in the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Horsebridge Road") & + (survey_list["NO."] == 286)) + ] + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "DUTTON WAY") & + (survey_list["NO."] == 9)) + ] + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -4972,7 +4997,7 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", - "HA19", "HA24", "HA25", "HA27", "HA28", "HA32", + "HA19", "HA24", "HA25", "HA27", "HA28", "HA30", "HA32", # "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" ] From cd81c2b0b29a65b3fd3c59ec5dec7730afdd64ec Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 16:45:59 +0000 Subject: [PATCH 130/155] done ha30 matching --- .../ha_15_32/ha_analysis_batch_3.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bdb0d0c4..71062b16 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1915,6 +1915,74 @@ class DataLoader: (survey_list["NO."] == 9)) ] + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "PAYTHORNE CLOSE") & + (survey_list["NO."] == 10)) + ] + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "MARCHWOOD ROAD") & + (survey_list["NO."] == 11)) + ] + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Otterburn Close") & + (survey_list["NO."] == 4)) + ] + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Blossom Court") & + (survey_list["NO."] == 5)) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "St LUKES CLOSE , HUNTINGDON", "St. Lukes Close" + ) + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "St. Lukes Close") & + (survey_list["NO."].isin([4, 7, 8]))) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ROMAN WAY , GODMANCHESTER , HUNTINGDON", "Roman Way" + ) + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Roman Way") & + (survey_list["NO."].isin([58]))) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "HEADLANDS , FENSTANTON , HUNTINGDON", "Headlands Fenstanton" + ) + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Headlands Fenstanton") & + (survey_list["NO."].isin([126, 134]))) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "WALLACE COURT , HUNTINGDON", "Wallace Court" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "CRICKETERS WAY , CHATTERIS", "Cricketers Way" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Jubilee Gardens", "Jubilee Green" + ) + + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "Harrow Road") & + (survey_list["NO."].isin([10]))) + ] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ST LUKES CLOSE", "St. Lukes Close" + ) + return survey_list @staticmethod From 2810316e22ffe4662ae40c2c3bb9bee2f6af6f83 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 17:14:22 +0000 Subject: [PATCH 131/155] handled bug for HA30 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 71062b16..1ee40dde 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2566,6 +2566,7 @@ class DataLoader: "eco4(subject to ciga)": "eco4 (subject to ciga)", "eco4 subject to ciga": "eco4 (subject to ciga)", "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)", + "eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)", } ha_facts_and_figures = [] @@ -2716,11 +2717,13 @@ class DataLoader: asset_list = asset_list.merge(survey_list_to_merge, how='left', on="asset_list_row_id") # Update the cases where properties have sold, but are missing a CIGA check + # If we don't have a CIGA list, we set the value to ECO4 + set_to = "eco4 - passed ciga" if not ciga_list.empty else "eco4" asset_list["ECO Eligibility"] = np.where( (asset_list["ECO Eligibility"].str.contains("subject to ciga")) & ( asset_list["has_a_survey_record"] == True ), - "eco4 - passed ciga", + set_to, asset_list["ECO Eligibility"] ) # Update the cases where a property has been marked as eligible for GBIS, but sold for ECO4 @@ -4122,7 +4125,6 @@ def calculate_eco4_post_ciga( eco4_expected_cancellations = eco4_no_ciga_needed_cancellations + eco4_ciga_needed_cancellations else: - eco4_confirmed_ciga_failures = 0 # Multiply by sale conversion eco4_confirmed = np.round(eco4_no_ciga_needed * ha_eco4_to_sale_rate) From e15b977930c1b65ab39099c8c6a92d05039e96af Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 17:25:07 +0000 Subject: [PATCH 132/155] fixed ha34, completed 30 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 1ee40dde..7d35386d 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -2550,7 +2550,8 @@ class DataLoader: "AFFORDABLE WARMTH / REMEDIAL": "ECO4", "AFF0RDALE WARMTH": "ECO4", "ECO 4 RdSAP CL": "ECO4", - "Affordable Warmth (R) ": "ECO4" + "Affordable Warmth (R) ": "ECO4", + "Affordable Warmth ": "ECO4" } # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we @@ -4175,9 +4176,6 @@ def calculate_eco4_post_ciga( def forecast_remaining_sales(loader): - # TODO: Skip HA34 for the moment - loader.data = {k: v for k, v in loader.data.items() if k != "HA34"} - # Assumptions: # We cap the ciga conversion rate at 75% because I expect future HAs to have a lower CIGA conversion rate # and I don't want the numbers to change too much, depenent on the CIGA conversation rate @@ -5066,18 +5064,15 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", - "HA19", "HA24", "HA25", "HA27", "HA28", "HA30", "HA32", - # "HA34", - "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" + "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", + "HA27", "HA28", "HA30", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], - # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 DONE + # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE] # # Consider for ECO4: # Consider for GBIS: - # 34 [bug in the results so leaving out for the moment] # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in # Filter down the directories to only the priority HAs From 41c17aa1dafe9110c74d6969f2fa06e58d3f0cf8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 10 Mar 2024 18:13:45 +0000 Subject: [PATCH 133/155] HA54 done --- .../ha_15_32/ha_analysis_batch_3.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7d35386d..d556450b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -168,9 +168,17 @@ class DataLoader: "address": "A_Address", "postcode": "A_Postcode" }, + "HA31": { + "address": "A_Address", + "postcode": "matching_postcode" + }, "HA48": { "address": "Full Address", "postcode": "Postcode" + }, + "HA54": { + "address": "Postal Address", + "postcode": "matching_postcode" } } @@ -211,7 +219,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA48"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA54"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -559,6 +567,12 @@ class DataLoader: if ha_name == "HA25": asset_sheet_colnames[11] = "matching_postcode" + if ha_name == "HA31": + asset_sheet_colnames[2] = "matching_postcode" + + if ha_name == "HA54": + asset_sheet_colnames[10] = "matching_postcode" + rows_data = [] for row in asset_sheet.iter_rows(min_row=2, values_only=False): @@ -2568,6 +2582,7 @@ class DataLoader: "eco4 subject to ciga": "eco4 (subject to ciga)", "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)", "eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)", + "eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)" } ha_facts_and_figures = [] @@ -5065,11 +5080,12 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", - "HA27", "HA28", "HA30", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA56", "HA63", "HA107", "HA117" + "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA54", "HA56", "HA63", + "HA107", "HA117" ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], - # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE] + # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE] # # Consider for ECO4: # Consider for GBIS: From 6a327629bf0ab5284b1b951cc98360597f30ce1f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 12 Mar 2024 11:09:09 +0000 Subject: [PATCH 134/155] rough attempt to attribute surplus ciga dependent eco4 jobs --- .../ha_15_32/ha_analysis_batch_3.py | 144 +++++++++++++----- 1 file changed, 107 insertions(+), 37 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index d556450b..5ad1aa27 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -176,6 +176,10 @@ class DataLoader: "address": "Full Address", "postcode": "Postcode" }, + "HA49": { + "address": "Property Address Full", + "postcode": "Property Postcode" + }, "HA54": { "address": "Postal Address", "postcode": "matching_postcode" @@ -219,7 +223,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA54"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA49", "HA54"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -382,6 +386,16 @@ class DataLoader: asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \ asset_list["PostCode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip() + elif ha_name == "HAXX": + asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["PostCode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip() + elif ha_name == "HAXXX": + asset_list["matching_address"] = ( + asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() else: raise NotImplementedError("implement me") @@ -467,6 +481,8 @@ class DataLoader: asset_list["HouseNo"] = asset_list["House_Number"].copy() elif ha_name == "HA9": asset_list["HouseNo"] = asset_list["House Number"].copy() + elif ha_name == "HAXXX": + asset_list["HouseNo"] = asset_list["Door Number"].copy() else: split_addresses = asset_list['matching_address'].str.split(',', expand=True) house_numbers = split_addresses[0].str.split(' ', expand=True) @@ -1999,6 +2015,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha49_survey_list(survey_list): + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -5080,8 +5100,11 @@ def app(): # Add in: priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", - "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA54", "HA56", "HA63", - "HA107", "HA117" + "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", + "HA63", "HA107", "HA117", + + # New HAS + "HAXX", "HAXXX", ] # Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], @@ -5100,39 +5123,86 @@ def app(): forecast_remaining_sales(loader) - # We load in the additional data required to perform the analysis - # cleaned = read_from_s3( - # s3_file_name="cleaned_epc_data/cleaned.bson", - # bucket_name="retrofit-data-dev" - # ) - # cleaned = msgpack.unpackb(cleaned, raw=False) - # cleaned = patch_cleaned(cleaned) - # - # cleaning_data = read_dataframe_from_s3_parquet( - # bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", - # ) - # created_at = datetime.now().isoformat() - # - # photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev") - # - # outputs = get_epc_data( - # loader=loader, - # cleaned=cleaned, - # cleaning_data=cleaning_data, - # created_at=created_at, - # photo_supply_lookup=photo_supply_lookup, - # floor_area_decile_thresholds=floor_area_decile_thresholds, - # pull_data=pull_data - # ) + conversion_rate = 0.95 + archetype_check_conversion = 0.7 + res = [] + for k, v in loader.data.items(): + asset_list = v["asset_list"].copy() + agg = asset_list["ECO Eligibility"].value_counts() + # We find a case where there are properties that have passed CIGA + if not any("passed" in x for x in agg.index): + continue - # import pickle - # with open("ha_analysis.pickle", "wb") as f: - # pickle.dump({"outputs": outputs, "loader": loader}, f) + agg = pd.DataFrame(agg).reset_index() - # To read: - # import pickle - # with open("ha_analysis.pickle", "rb") as f: - # outputs = pickle.load(f)["outputs"] - # - # with open("loader.pickle", "rb") as f: - # loader = pickle.load(f) + passed_ciga = agg[agg["ECO Eligibility"] == "eco4 - passed ciga"] + passed_ciga = passed_ciga["count"].values[0] if not passed_ciga.empty else 0 + + failed_ciga = agg[agg["ECO Eligibility"] == "failed ciga"] + failed_ciga = failed_ciga["count"].values[0] if not failed_ciga.empty else 0 + + ciga_pass_rate = passed_ciga / (passed_ciga + failed_ciga) if (passed_ciga + failed_ciga) > 0 else 1 + + dormant_ciga = agg[ + agg["ECO Eligibility"].str.contains("subject to ciga") & + ~agg["ECO Eligibility"].str.contains("subject to archetype") + ] + + dormant_ciga = dormant_ciga['count'].values[0] if not dormant_ciga.empty else 0 + + dormant_ciga_archetype = agg[ + agg["ECO Eligibility"].str.contains("subject to ciga") & + agg["ECO Eligibility"].str.contains("subject to archetype") + ] + + dormant_ciga_archetype = dormant_ciga_archetype['count'].values[0] if not dormant_ciga_archetype.empty else 0 + + needing_check = dormant_ciga + dormant_ciga_archetype * archetype_check_conversion + needing_check = np.round(needing_check) + + additional_jobs = (dormant_ciga * ciga_pass_rate * conversion_rate) + ( + dormant_ciga_archetype * archetype_check_conversion * ciga_pass_rate * conversion_rate + ) + additional_jobs = np.round(additional_jobs) + + # We attempt to estimate the uplift and how much of that is attributed to surplus subject to ciga jobs + original_estimate = loader.december_figures[ + loader.december_figures["HA Name"] == k + ] + + original_estimate = original_estimate["ECO4"].values[0] if not original_estimate.empty else 0 + base_eco_figures = agg[ + agg["ECO Eligibility"].isin(["eco4", "eco4 - passed ciga"]) + ]["count"].sum() + eco4_from_ciga = original_estimate - base_eco_figures + eco4_from_ciga = eco4_from_ciga if eco4_from_ciga > 0 else 0 + surplus_from_dormant = additional_jobs - eco4_from_ciga + surplus_from_dormant = 0 if surplus_from_dormant < 0 else surplus_from_dormant + + res.append( + { + "ha_name": k, + "additional_eco4": additional_jobs, + "needing_check": needing_check, + "surplus_from_dormant": surplus_from_dormant + } + ) + + res = pd.DataFrame(res) + # Drop the HAs that are not in that pervious draft + # In the v2 draft, there are 12 HAs + + v5_surplus = res[ + ~res["ha_name"].isin(["HA9"]) + ]["additional_eco4"].sum() + # 7212 properties + # This is not a perfect difference though, because of the variations in how the numbers are recorded in the November + # all HAs sheet. E.g for HA 107, there were 1239 properties identified. In the postcode list, there are 1255, + # however 531 are still needing a CIGA check. Therefore their original figures, in this case, included properties + # pre-CIGA + + v5_surplus_from_dormant = res[ + ~res["ha_name"].isin(["HA9"]) + ]["surplus_from_dormant"].sum() + # 5539.0 + # 9471690 From ddb5de50e550190c74cd5a2be767f2960352143a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 14 Mar 2024 13:58:29 +0000 Subject: [PATCH 135/155] testing with another stupid effing method --- .idea/.gitignore | 2 + .../ha_15_32/ha_analysis_batch_3.py | 230 +++++++++++++++++- .../epc_attributes/RoofAttributes.py | 17 +- 3 files changed, 241 insertions(+), 8 deletions(-) diff --git a/.idea/.gitignore b/.idea/.gitignore index 26d33521..8f00030d 100644 --- a/.idea/.gitignore +++ b/.idea/.gitignore @@ -1,3 +1,5 @@ # Default ignored files /shelf/ /workspace.xml +# GitHub Copilot persisted chat sessions +/copilot/chatSessions diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 5ad1aa27..767e13c8 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -7,7 +7,9 @@ import msgpack from datetime import datetime import pandas as pd import numpy as np -from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3 +from utils.s3 import ( + read_from_s3, read_dataframe_from_s3_parquet, save_pickle_to_s3, read_pickle_from_s3, save_dataframe_to_s3_parquet +) from utils.logger import setup_logger from dotenv import load_dotenv from tqdm import tqdm @@ -2860,8 +2862,8 @@ def get_property_type_and_built_form(property_meta, ha_name): property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]] built_form = property_meta["built_form"] elif ha_name == "HA7": - property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Archetype"]] - built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"][property_meta["Property Type"]] + property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["Archetype"]) + built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"]) elif ha_name == "HA14": if property_meta["Asset Type Description"] == "Block - Repair": # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address @@ -4429,6 +4431,12 @@ def forecast_remaining_sales(loader): for ha_name, input_data in loader.data.items(): # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] + if original_warmfront_estimates.empty: + # Append an empty row + original_warmfront_estimates = december_figures.head(1).copy() + for k in original_warmfront_estimates.columns: + original_warmfront_estimates[k] = 0 + original_warmfront_estimates["HA Name"] = ha_name original_warmfront_eco4 = original_warmfront_estimates["ECO4"].values[0] original_warmfront_remaining_eco4 = original_warmfront_estimates["ECO4 remaining"].values[0] @@ -4742,6 +4750,12 @@ def forecast_remaining_sales(loader): if gbis_variance_2 != 0: raise ValueError("Something went wrong in gbis_variance2") + # Update the GBIS sold, since Warmfront often sold more GBIS that expected + original_warmfront_gbis_revenue = original_warmfront_sold_gbis + original_warmfront_remaining_gbis_revenue + original_warmfront_gbis = ( + original_warmfront_sold_gbis / gbis_rate + original_warmfront_remaining_gbis_revenue / gbis_rate + ) + to_append = { ("", "", "", "HA Name"): ha_name, # ECO4 - original warmfront figures @@ -5077,6 +5091,216 @@ def forecast_remaining_sales(loader): results.to_csv(file, header=True, index=False) +def fml_data_pull(loader): + has_bruh = ["HA7"] + from backend.SearchEpc import SearchEpc + epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=" + + for ha in has_bruh: + asset_list = loader.data[ha]["asset_list"].copy() + # properties found as eligibile + fml = asset_list[asset_list["ECO Eligibility"] != "not eligible"] + + # For each property, search for the latest EPC + epc_data = [] + for _, row in tqdm(fml.iterrows(), total=fml.shape[0]): + property_type, built_form = get_property_type_and_built_form(property_meta=row, ha_name=ha) + searcher = SearchEpc( + address1=row["HouseNo"], + postcode=row["matching_postcode"], + auth_token=epc_api_key, + os_api_key="", + property_type=property_type, + full_address=row["matching_address"], + ) + searcher.ordnance_survey_client.property_type = property_type + searcher.ordnance_survey_client.built_form = built_form + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + epc = { + "asset_list_row_id": row["asset_list_row_id"], + **searcher.newest_epc.copy() + } + + epc_data.append(epc) + + # Remove None entries + epc_data = [x for x in epc_data if x is not None] + # Save the data in S3 as a parquet + epc_data_df = pd.DataFrame(epc_data) + save_pickle_to_s3( + data=epc_data_df, + bucket_name="retrofit-datalake-dev", + s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle" + ) + + +def extract_lower_bound(age_band): + if pd.isna(age_band): + return 1930 + try: + return int(age_band.split(':')[1].split('-')[0].strip()) + except (ValueError, IndexError): + return 1930 + + +def fml_analysis(loader): + from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + from etl.epc.DataProcessor import EPCDataProcessor + assumed_ciga_pass_rate = 0.731 + has_bruh = ["HA7"] + + results = [] + for ha_name in has_bruh: + + original_figures = loader.december_figures[ + loader.december_figures["HA Name"] == ha_name + ].copy() + original_remaining = original_figures["ECO4 remaining"].values[0] + + # Read in the epc data + asset_list = loader.data[ha_name]["asset_list"].copy() + # properties found as eligibile + fml = asset_list[asset_list["ECO Eligibility"] != "not eligible"] + epc_data = read_pickle_from_s3( + bucket_name="retrofit-datalake-dev", + s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle" + ) + + fuck_this = fml.merge( + epc_data, how="left", on="asset_list_row_id" + ) + if fuck_this.shape[0] != fml.shape[0]: + raise Exception("What the fuck bruv") + + # Take just remaining + if not loader.data[ha_name]["survey_list"].empty: + raise NotImplementedError("TAKE JUST REMAINING IDIOT") + + insulation_thicknesses = [] + for _, x in fuck_this.iterrows(): + if pd.isnull(x["roof-description"]): + continue + thickness = RoofAttributes(x["roof-description"]).process()["insulation_thickness"] + # If there is a + in the thickness, strip it out + thickness = str(thickness).replace("+", "") + insulation_thicknesses.append( + {'uprn': x["uprn"], "roof_insulation_thickness": thickness} + ) + insulation_thicknesses = pd.DataFrame(insulation_thicknesses) + + fuck_this = fuck_this.merge(insulation_thicknesses, how="left", on="uprn") + # clean roof insulation + fuck_this["roof_insulation_thickness"] = fuck_this["roof_insulation_thickness"].fillna("0") + fuck_this["roof_insulation_thickness"] = fuck_this[ + "roof_insulation_thickness" + ].str.replace("below average", "50") + fuck_this["roof_insulation_thickness"] = fuck_this[ + "roof_insulation_thickness" + ].str.replace("None", "0") + fuck_this["roof_insulation_thickness"] = fuck_this[ + "roof_insulation_thickness" + ].str.replace("none", "0") + fuck_this["roof_insulation_thickness"] = fuck_this[ + "roof_insulation_thickness" + ].str.replace("average", "150") + + fuck_this["construction-age-band"] = fuck_this["construction-age-band"].apply( + lambda x: EPCDataProcessor.clean_construction_age_band(x) + ) + + fuck_this['age_lower_bound'] = fuck_this['construction-age-band'].apply(extract_lower_bound) + + had_survey = fuck_this[pd.isnull(fuck_this["estimated"])] + + # proportion with a survey: + proportion_with_survey = 100 * had_survey.shape[0] / fuck_this.shape[0] + + # Let's look just at the ECO4 business + # For things that had a survey, take the properties that didn't need a CIGA check + no_ciga_check_needed = had_survey[ + had_survey["ECO Eligibility"] == "eco4" + ] + + no_ciga_check_needed_with_archetype = no_ciga_check_needed[ + (no_ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & + (no_ciga_check_needed["roof-description"].str.lower().str.contains("pitched") == True) & + (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) + ] + if not no_ciga_check_needed_with_archetype.empty: + raise Exception("SORT ME OUT") + + # Characterise no CIGA check needed + + # TODO: WHAT ABOUT PASSED CIGA - don't need to apply the further deduction + + ciga_check_needed = had_survey[ + had_survey["ECO Eligibility"].str.contains("subject to ciga") + ] + + # We take just the cavity walls + # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/ + # This paper is based on London properties + # The proportion of EPCs with building characteristics errors are shown to + # differ between variables; floor and wall type errors occur in ~10-15% of EPCs, + # compared with ~5% for wall insulation and glazing performance + + ciga_check_needed_with_archetype = ciga_check_needed[ + (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & + (ciga_check_needed["roof-description"].str.lower().str.contains("pitched") == True) & + (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) + ] + + # We take properties that could feasibly be within install regions + ciga_check_needed_plausible = ciga_check_needed_with_archetype[ + ciga_check_needed_with_archetype["roof_insulation_thickness"].astype(float) < 270 + ] + + if not loader.data[ha_name]["ciga_list"].empty: + raise NotImplementedError("SORT OUT THE CIGA BRUV") + else: + ha_ciga_pass_rate = assumed_ciga_pass_rate + + ciga_check_expectation = np.round(ciga_check_needed_plausible.shape[0] * ha_ciga_pass_rate) + without_ciga_expectation = no_ciga_check_needed_with_archetype.shape[0] + + # Need to add on the non-ciga + total_expectation = ciga_check_expectation + without_ciga_expectation + + if proportion_with_survey < 100: + # We estimate the rest + without_survey_needing_ciga = fuck_this[ + (pd.isnull(fuck_this["estimated"]) == False) & + (fuck_this["ECO Eligibility"].str.contains("subject to ciga") == True) + ] + + # We apply the same conversion rate as the properties with a survey + without_survey_without_ciga_expected = np.round( + without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) + ) + + total_expectation += without_survey_without_ciga_expected + + without_survey_without_ciga = fuck_this[ + (pd.isnull(fuck_this["estimated"]) == False) & (fuck_this["ECO Eligibility"].isin(["eco4"])) + ] + + if not without_survey_without_ciga.empty: + raise Exception("Estimate the rest!!") + + results.append( + { + "HA Name": ha_name, + "Original ECO4 Estimate - Remaining": original_remaining, + "Proportion with a survey": proportion_with_survey, + "total_expectation": total_expectation + } + ) + + def app(): """ This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107. diff --git a/etl/epc_clean/epc_attributes/RoofAttributes.py b/etl/epc_clean/epc_attributes/RoofAttributes.py index 9d3b46b4..76f99f09 100644 --- a/etl/epc_clean/epc_attributes/RoofAttributes.py +++ b/etl/epc_clean/epc_attributes/RoofAttributes.py @@ -122,6 +122,13 @@ class RoofAttributes(Definitions): result["is_valid"] = "invalid" not in description description = description.replace("invalid", "") + # We handle an edge case where the description is "pitched, 150 loft insulation" and is missing the mm + if result["is_pitched"] or result["is_loft"]: + # Search for a regular expression that matches 150 insulation + match = re.search(r"(\d+\+?)\s*insulation", description) + if match: + result['insulation_thickness'] = match.group(1) + # insulation thickness thickness_map = { "ceiling insulated": "average", @@ -137,11 +144,11 @@ class RoofAttributes(Definitions): # Remove the match from the description # description = description.replace(key, "") break - else: - # Extract insulation thickness in mm, if present - match = re.search(r'(\d+\+?)\s*mm', description) - if match: - result['insulation_thickness'] = match.group(1) + + # Extract insulation thickness in mm, if present + match = re.search(r'(\d+\+?)\s*mm', description) + if match: + result['insulation_thickness'] = match.group(1) if "insulation_thickness" not in result: result['insulation_thickness'] = None From bee07a253b8285a67c4cb78b9051e2b000de30c0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 14 Mar 2024 16:10:55 +0000 Subject: [PATCH 136/155] new method wip --- .../ha_15_32/ha_analysis_batch_3.py | 125 +++++++++++++++--- 1 file changed, 105 insertions(+), 20 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 767e13c8..9cadaf9f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -92,6 +92,27 @@ PROPERTY_TYPE_LOOKUP = { 'Flat Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"}, 'Mid Terraced Town House': {"property-type": "House", "built-form": "Mid-Terrace"}, }, + "HA25": { + 'Flat': 'Flat', + 'Mid Terrace House': 'House', + 'Semi Detached House': 'House', + 'End Terrace House': 'House', + 'House': 'House', + 'Semi Detached Bung': 'Bungalow', + 'Bungalow': 'Bungalow', + 'End Terrace Bungalow': 'Bungalow', + 'Maisonnette': 'Maisonette', + 'Mid Terrace Bungalow': 'Bungalow', + 'Bedspace': None, + 'Detached House': 'House', + 'Bedsit': 'Flat', + 'Coach House': 'House', + 'Detached Bungalow': 'Bungalow', + 'Office Buildings': None, + 'Guest Room': None, + 'Mid Terrace Housekeeping ': 'House', + 'End Terrace Housex': 'House' + }, "HA39": { "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, "1st floor flat": {"property_type": "Flat", "built_form": None}, @@ -2877,6 +2898,9 @@ def get_property_type_and_built_form(property_meta, ha_name): property_meta["Asset Type Description"] ] + built_form = None + elif ha_name == "HA25": + property_type = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["T1_AssetType"]] built_form = None elif ha_name == "HA16": config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]] @@ -5092,7 +5116,8 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): - has_bruh = ["HA7"] + has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16"] + # DO from backend.SearchEpc import SearchEpc epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=" @@ -5104,7 +5129,7 @@ def fml_data_pull(loader): # For each property, search for the latest EPC epc_data = [] for _, row in tqdm(fml.iterrows(), total=fml.shape[0]): - property_type, built_form = get_property_type_and_built_form(property_meta=row, ha_name=ha) + property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha) searcher = SearchEpc( address1=row["HouseNo"], postcode=row["matching_postcode"], @@ -5113,8 +5138,9 @@ def fml_data_pull(loader): property_type=property_type, full_address=row["matching_address"], ) - searcher.ordnance_survey_client.property_type = property_type - searcher.ordnance_survey_client.built_form = built_form + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) if searcher.newest_epc is None: @@ -5147,11 +5173,32 @@ def extract_lower_bound(age_band): return 1930 +def classify_loft(x): + # high confidence + if float(x["roof_insulation_thickness"]) <= 100: + return "high" + + if float(x["roof_insulation_thickness"]) <= 200: + return "medium" + + if float(x["roof_insulation_thickness"]) <= 270 and x["epc_age"] >= 5 * 365: + return "medium" + + return "unlikely" + + def fml_analysis(loader): from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes from etl.epc.DataProcessor import EPCDataProcessor + from datetime import datetime assumed_ciga_pass_rate = 0.731 - has_bruh = ["HA7"] + has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16"] + + no_ciga_cavity_descriptions = [ + "Cavity wall, as built, insulated (assumed)", + "Cavity wall, as built, no insulation (assumed)", + "Cavity wall, as built, partial insulation (assumed)" + ] results = [] for ha_name in has_bruh: @@ -5170,6 +5217,11 @@ def fml_analysis(loader): s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle" ) + # time from the inspection to now + epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days + if "estimated" not in epc_data.columns: + epc_data["estimated"] = None + fuck_this = fml.merge( epc_data, how="left", on="asset_list_row_id" ) @@ -5178,12 +5230,27 @@ def fml_analysis(loader): # Take just remaining if not loader.data[ha_name]["survey_list"].empty: - raise NotImplementedError("TAKE JUST REMAINING IDIOT") + survey_list = ( + loader.data[ha_name]["survey_list"][ + ~pd.isnull(loader.data[ha_name]["survey_list"]["asset_list_row_id"]) + ] + ) + fuck_this = fuck_this.merge( + survey_list[["asset_list_row_id", "installation_status"]], + how="left", + on="asset_list_row_id" + ) + # Anything that has an installation has gone to installation, and therefore is not remaining + fuck_this = fuck_this[pd.isnull(fuck_this["installation_status"])] + fuck_this = fuck_this.drop(columns=["installation_status"]) insulation_thicknesses = [] for _, x in fuck_this.iterrows(): if pd.isnull(x["roof-description"]): continue + if x["roof-description"] == "SAP05:Roof": + continue + thickness = RoofAttributes(x["roof-description"]).process()["insulation_thickness"] # If there is a + in the thickness, strip it out thickness = str(thickness).replace("+", "") @@ -5208,11 +5275,13 @@ def fml_analysis(loader): "roof_insulation_thickness" ].str.replace("average", "150") - fuck_this["construction-age-band"] = fuck_this["construction-age-band"].apply( - lambda x: EPCDataProcessor.clean_construction_age_band(x) - ) + fuck_this["roof_classiciation"] = fuck_this.apply(lambda x: classify_loft(x), axis=1) - fuck_this['age_lower_bound'] = fuck_this['construction-age-band'].apply(extract_lower_bound) + # fuck_this["construction-age-band"] = fuck_this["construction-age-band"].apply( + # lambda x: EPCDataProcessor.clean_construction_age_band(x) + # ) + # + # fuck_this['age_lower_bound'] = fuck_this['construction-age-band'].apply(extract_lower_bound) had_survey = fuck_this[pd.isnull(fuck_this["estimated"])] @@ -5225,9 +5294,23 @@ def fml_analysis(loader): had_survey["ECO Eligibility"] == "eco4" ] + # Walls: + # Cavity wall, as built, insulated (assumed) + # Cavity wall, as built, no insulation (assumed) + # Cavity wall, as built, partial insulation (assumed) + + # Roof: + # Less than 100mm = high confidence + # Less than 270mm & EPC at least 5 years old = medium confidence + # Otherwise, low confidence + + # SAP criteria is EPC C or below + + # Pre is 54 or below + no_ciga_check_needed_with_archetype = no_ciga_check_needed[ - (no_ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & - (no_ciga_check_needed["roof-description"].str.lower().str.contains("pitched") == True) & + (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) & + (no_ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] if not no_ciga_check_needed_with_archetype.empty: @@ -5239,7 +5322,14 @@ def fml_analysis(loader): ciga_check_needed = had_survey[ had_survey["ECO Eligibility"].str.contains("subject to ciga") - ] + ].copy() + + ciga_check_passed = had_survey[ + had_survey["ECO Eligibility"] == "eco4 - passed ciga" + ] + + if not ciga_check_passed.empty: + raise Exception("SORT ME BRUV") # We take just the cavity walls # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/ @@ -5248,17 +5338,12 @@ def fml_analysis(loader): # differ between variables; floor and wall type errors occur in ~10-15% of EPCs, # compared with ~5% for wall insulation and glazing performance - ciga_check_needed_with_archetype = ciga_check_needed[ + ciga_check_needed_plausible = ciga_check_needed[ (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & - (ciga_check_needed["roof-description"].str.lower().str.contains("pitched") == True) & + (ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] - # We take properties that could feasibly be within install regions - ciga_check_needed_plausible = ciga_check_needed_with_archetype[ - ciga_check_needed_with_archetype["roof_insulation_thickness"].astype(float) < 270 - ] - if not loader.data[ha_name]["ciga_list"].empty: raise NotImplementedError("SORT OUT THE CIGA BRUV") else: From 9b255029b3f58d9f8653aaf1bbbd0cc43b024803 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 14 Mar 2024 17:36:09 +0000 Subject: [PATCH 137/155] fml fml --- .../ha_15_32/ha_analysis_batch_3.py | 141 ++++++++++++------ 1 file changed, 96 insertions(+), 45 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9cadaf9f..e1d7db4d 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -20,6 +20,9 @@ from backend.ml_models.api import ModelApi from etl.solar.SolarPhotoSupply import SolarPhotoSupply from recommendations.recommendation_utils import calculate_cavity_age from etl.epc.Record import EPCRecord +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes +from etl.epc.DataProcessor import EPCDataProcessor +from datetime import datetime EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" @@ -5188,9 +5191,6 @@ def classify_loft(x): def fml_analysis(loader): - from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes - from etl.epc.DataProcessor import EPCDataProcessor - from datetime import datetime assumed_ciga_pass_rate = 0.731 has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16"] @@ -5216,15 +5216,20 @@ def fml_analysis(loader): bucket_name="retrofit-datalake-dev", s3_file_name=f"ha-analysis/revised/{ha_name}/epc_data.pickle" ) + # We make sure we don't have duplicated. We do a super basic drop duplicates because it shouldn't be a huge + # issue at this point + epc_data = epc_data.drop_duplicates("uprn") # time from the inspection to now epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days if "estimated" not in epc_data.columns: - epc_data["estimated"] = None + # For all after HA7, we don't use estimated surveys + epc_data["estimated"] = False fuck_this = fml.merge( epc_data, how="left", on="asset_list_row_id" ) + fuck_this["estimated"] = fuck_this["estimated"].fillna(True) if fuck_this.shape[0] != fml.shape[0]: raise Exception("What the fuck bruv") @@ -5259,7 +5264,15 @@ def fml_analysis(loader): ) insulation_thicknesses = pd.DataFrame(insulation_thicknesses) + before_merge_shape = fuck_this.shape[0] fuck_this = fuck_this.merge(insulation_thicknesses, how="left", on="uprn") + + if fuck_this.shape[0] != before_merge_shape: + raise Exception("SOMETHING WENT WRONG") + + if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")): + blah + # clean roof insulation fuck_this["roof_insulation_thickness"] = fuck_this["roof_insulation_thickness"].fillna("0") fuck_this["roof_insulation_thickness"] = fuck_this[ @@ -5283,7 +5296,7 @@ def fml_analysis(loader): # # fuck_this['age_lower_bound'] = fuck_this['construction-age-band'].apply(extract_lower_bound) - had_survey = fuck_this[pd.isnull(fuck_this["estimated"])] + had_survey = fuck_this[fuck_this["estimated"] == False] # proportion with a survey: proportion_with_survey = 100 * had_survey.shape[0] / fuck_this.shape[0] @@ -5294,27 +5307,11 @@ def fml_analysis(loader): had_survey["ECO Eligibility"] == "eco4" ] - # Walls: - # Cavity wall, as built, insulated (assumed) - # Cavity wall, as built, no insulation (assumed) - # Cavity wall, as built, partial insulation (assumed) - - # Roof: - # Less than 100mm = high confidence - # Less than 270mm & EPC at least 5 years old = medium confidence - # Otherwise, low confidence - - # SAP criteria is EPC C or below - - # Pre is 54 or below - - no_ciga_check_needed_with_archetype = no_ciga_check_needed[ + no_ciga_check_needed_eligible = no_ciga_check_needed[ (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) & (no_ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] - if not no_ciga_check_needed_with_archetype.empty: - raise Exception("SORT ME OUT") # Characterise no CIGA check needed @@ -5327,9 +5324,20 @@ def fml_analysis(loader): ciga_check_passed = had_survey[ had_survey["ECO Eligibility"] == "eco4 - passed ciga" ] + # These should be treated the same as one that have passed their ciga checks, from a detection perspective + ciga_check_passed_eligible = ciga_check_passed[ + (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) & + (ciga_check_passed["roof_classiciation"].isin(["high", "medium"])) & + (ciga_check_passed["current-energy-efficiency"].astype(float) <= 80) + ] - if not ciga_check_passed.empty: - raise Exception("SORT ME BRUV") + if not loader.data[ha_name]["ciga_list"].empty: + + proportions = loader.data[ha_name]["ciga_list"]["Guarantee"].value_counts(normalize=True) + ha_ciga_pass_rate = proportions[proportions.index == "No"].values[0] + + else: + ha_ciga_pass_rate = assumed_ciga_pass_rate # We take just the cavity walls # UCL paper: https://discovery.ucl.ac.uk/id/eprint/10110371/ @@ -5338,53 +5346,96 @@ def fml_analysis(loader): # differ between variables; floor and wall type errors occur in ~10-15% of EPCs, # compared with ~5% for wall insulation and glazing performance - ciga_check_needed_plausible = ciga_check_needed[ + ciga_check_needed_eligible = ciga_check_needed[ (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & (ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] - if not loader.data[ha_name]["ciga_list"].empty: - raise NotImplementedError("SORT OUT THE CIGA BRUV") - else: - ha_ciga_pass_rate = assumed_ciga_pass_rate - - ciga_check_expectation = np.round(ciga_check_needed_plausible.shape[0] * ha_ciga_pass_rate) - without_ciga_expectation = no_ciga_check_needed_with_archetype.shape[0] + ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate) + without_ciga_expectation = no_ciga_check_needed_eligible.shape[0] + passed_ciga_expectation = ciga_check_passed_eligible.shape[0] # Need to add on the non-ciga - total_expectation = ciga_check_expectation + without_ciga_expectation + total_expectation = ciga_check_expectation + without_ciga_expectation + passed_ciga_expectation if proportion_with_survey < 100: # We estimate the rest without_survey_needing_ciga = fuck_this[ - (pd.isnull(fuck_this["estimated"]) == False) & + (fuck_this["estimated"] == True) & (fuck_this["ECO Eligibility"].str.contains("subject to ciga") == True) ] - # We apply the same conversion rate as the properties with a survey - without_survey_without_ciga_expected = np.round( - without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) - ) + if without_survey_needing_ciga.empty: + without_survey_without_ciga_expected = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_without_ciga_expected = np.round( + without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) + ) - total_expectation += without_survey_without_ciga_expected - - without_survey_without_ciga = fuck_this[ - (pd.isnull(fuck_this["estimated"]) == False) & (fuck_this["ECO Eligibility"].isin(["eco4"])) + without_survey_passed_ciga = fuck_this[ + (fuck_this["estimated"] == True) & + (fuck_this["ECO Eligibility"] == "eco4 - passed ciga") ] - if not without_survey_without_ciga.empty: - raise Exception("Estimate the rest!!") + if without_survey_passed_ciga.empty: + without_survey_passed_ciga_expected = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_passed_ciga_expected = np.round( + without_survey_passed_ciga.shape[0] * (passed_ciga_expectation / ciga_check_passed.shape[0]) + ) + + # Finally, no ciga needed + without_survey_eco4 = fuck_this[ + (fuck_this["estimated"] == True) & + (fuck_this["ECO Eligibility"] == "eco4") + ] + + if without_survey_eco4.empty: + without_survey_eco4_expected = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_eco4_expected = np.round( + without_survey_eco4.shape[0] * (without_ciga_expectation / no_ciga_check_needed.shape[0]) + ) + + total_expectation = ( + total_expectation + + without_survey_without_ciga_expected + + without_survey_passed_ciga_expected + + without_survey_eco4_expected + ) + + surveys = loader.data[ha_name]["survey_list"] + sold_now = 0 + if not surveys.empty: + sold_now = surveys[ + surveys["installation_status"].str.lower().str.contains("eco4") + ].shape[0] + + sales_since_nov = sold_now - original_figures["No. of Tech surveys complete - Eco 4"].values[0] results.append( { "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, + "Of which sold": sales_since_nov, + "Of which ECO4 Eligible - Remaining": int(total_expectation), "Proportion with a survey": proportion_with_survey, - "total_expectation": total_expectation } ) + results_df = pd.DataFrame(results) + + results_df["Delta vs November"] = 100 * ( + results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] + ) / results_df["Original ECO4 Estimate - Remaining"] + + # TODO: Split into high and low confidence? + # + def app(): """ From 3b65a71793721d65fd8356c215813a13d384bc4d Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 14 Mar 2024 18:25:50 +0000 Subject: [PATCH 138/155] added in extra shit to output --- .../ha_15_32/ha_analysis_batch_3.py | 47 ++++++++++++++++--- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index e1d7db4d..53ce69e2 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -5200,6 +5200,22 @@ def fml_analysis(loader): "Cavity wall, as built, partial insulation (assumed)" ] + codes = [ + "HA39", "HA14", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA7", + "HA16", "HA107", "HA25", "HA50", "HA41", "HA48", "HA2", "HA63", "HA12", + "HA117", "HA13", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", + "HA30", "HA31", "HA54", "HAXX", "HA49", "HAXXX" + ] + + values = [ + 706, 2161, 1053, 793, 0, 656, 1200, 1647, 4248, 2703, 1087, 1876, 2135, + 1078, 775, 538, 518, 401, 466, 2627, 98, 1050, 524, 191, 538, 384, 204, + 281, 422, 74, 313, 71, 6 + ] + + # Create a dictionary mapping + remaining_eligible_mapping = dict(zip(codes, values)) + results = [] for ha_name in has_bruh: @@ -5207,6 +5223,7 @@ def fml_analysis(loader): loader.december_figures["HA Name"] == ha_name ].copy() original_remaining = original_figures["ECO4 remaining"].values[0] + postcode_list_remaining = remaining_eligible_mapping[ha_name] # Read in the epc data asset_list = loader.data[ha_name]["asset_list"].copy() @@ -5271,7 +5288,7 @@ def fml_analysis(loader): raise Exception("SOMETHING WENT WRONG") if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")): - blah + raise Exception("DO THE DAMN ARCHETYPE CHECK BRO") # clean roof insulation fuck_this["roof_insulation_thickness"] = fuck_this["roof_insulation_thickness"].fillna("0") @@ -5313,6 +5330,13 @@ def fml_analysis(loader): (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] + # For anything not needing a CIGA check, some of it will be GBIS + no_ciga_check_needed_eligible_gbis = no_ciga_check_needed[ + (no_ciga_check_needed["walls-description"].isin(no_ciga_cavity_descriptions)) & + (no_ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) & + (~no_ciga_check_needed["asset_list_row_id"].isin(no_ciga_check_needed_eligible["asset_list_row_id"].values)) + ] + # Characterise no CIGA check needed # TODO: WHAT ABOUT PASSED CIGA - don't need to apply the further deduction @@ -5359,6 +5383,8 @@ def fml_analysis(loader): # Need to add on the non-ciga total_expectation = ciga_check_expectation + without_ciga_expectation + passed_ciga_expectation + total_gbis_expectation = no_ciga_check_needed_eligible_gbis.shape[0] + if proportion_with_survey < 100: # We estimate the rest without_survey_needing_ciga = fuck_this[ @@ -5395,12 +5421,17 @@ def fml_analysis(loader): if without_survey_eco4.empty: without_survey_eco4_expected = 0 + without_survey_gbis_expected = 0 else: # We apply the same conversion rate as the properties with a survey without_survey_eco4_expected = np.round( without_survey_eco4.shape[0] * (without_ciga_expectation / no_ciga_check_needed.shape[0]) ) + without_survey_gbis_expected = np.round( + without_survey_eco4.shape[0] * (total_gbis_expectation / no_ciga_check_needed.shape[0]) + ) + total_expectation = ( total_expectation + without_survey_without_ciga_expected + @@ -5408,6 +5439,8 @@ def fml_analysis(loader): without_survey_eco4_expected ) + total_gbis_expectation = total_gbis_expectation + without_survey_gbis_expected + surveys = loader.data[ha_name]["survey_list"] sold_now = 0 if not surveys.empty: @@ -5421,20 +5454,22 @@ def fml_analysis(loader): { "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, + "Postcode List - Remaining": postcode_list_remaining, "Of which sold": sales_since_nov, "Of which ECO4 Eligible - Remaining": int(total_expectation), + "Of which GBIS Eligibile - Remaining": int(total_gbis_expectation), "Proportion with a survey": proportion_with_survey, } ) results_df = pd.DataFrame(results) - results_df["Delta vs November"] = 100 * ( - results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] - ) / results_df["Original ECO4 Estimate - Remaining"] + # results_df["Delta vs November"] = 100 * ( + # results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] + # ) / results_df["Original ECO4 Estimate - Remaining"] - # TODO: Split into high and low confidence? - # + # TODO: Add in estimated GBIS (for eco jobs, of which look like gbis) + # TODO: Change the left hand side number for our post CIGA estimates def app(): From 479a2b08c33e2911a5ae98c3d315903af04e4980 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 14 Mar 2024 19:02:33 +0000 Subject: [PATCH 139/155] ffs --- .../ha_15_32/ha_analysis_batch_3.py | 22 +++++++++++++++++-- etl/epc_clean/app.py | 3 +++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 53ce69e2..9462642f 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -5119,7 +5119,9 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): - has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16"] + has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16", + # Do these + "HA1", "HA13", "HA50", "HA24"] # DO from backend.SearchEpc import SearchEpc epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=" @@ -5197,9 +5199,19 @@ def fml_analysis(loader): no_ciga_cavity_descriptions = [ "Cavity wall, as built, insulated (assumed)", "Cavity wall, as built, no insulation (assumed)", - "Cavity wall, as built, partial insulation (assumed)" + "Cavity wall, as built, partial insulation (assumed)", + "Cavity wall, no insulation (assumed)", + "Cavity wall, partial insulation (assumed)", + "Cavity wall,", + "Cavity wall, insulated (assumed)", + "Cavity wall, no insulation (assumed)", + "Cavity wall, as built, insulated (assumed)", + "Cavity wall, partial insulation (assumed)", ] + # TODO: There will be some properties that are subject to CIGA that do not look like they ned a CIGA check! pass + # them! + codes = [ "HA39", "HA14", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA7", "HA16", "HA107", "HA25", "HA50", "HA41", "HA48", "HA2", "HA63", "HA12", @@ -5217,6 +5229,7 @@ def fml_analysis(loader): remaining_eligible_mapping = dict(zip(codes, values)) results = [] + wall_descriptions = [] for ha_name in has_bruh: original_figures = loader.december_figures[ @@ -5236,6 +5249,7 @@ def fml_analysis(loader): # We make sure we don't have duplicated. We do a super basic drop duplicates because it shouldn't be a huge # issue at this point epc_data = epc_data.drop_duplicates("uprn") + wall_descriptions.extend(epc_data["walls-description"].unique().tolist()) # time from the inspection to now epc_data["epc_age"] = (datetime.now() - pd.to_datetime(epc_data["inspection-date"])).dt.days @@ -5464,6 +5478,10 @@ def fml_analysis(loader): results_df = pd.DataFrame(results) + wall_descriptions = list(set(wall_descriptions)) + from pprint import pprint + pprint(wall_descriptions) + # results_df["Delta vs November"] = 100 * ( # results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] # ) / results_df["Original ECO4 Estimate - Remaining"] diff --git a/etl/epc_clean/app.py b/etl/epc_clean/app.py index 53c1a329..3f1a1a80 100644 --- a/etl/epc_clean/app.py +++ b/etl/epc_clean/app.py @@ -36,8 +36,11 @@ def app(): cleaned_data = {} epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] + WALLS = [] for directory in tqdm(epc_directories): data = pd.read_csv(directory / "certificates.csv", low_memory=False) + z = data["WALLS_DESCRIPTION"].unique().tolist() + WALLS.extend(z) # Rename the columns to the same format as the api returns data.columns = [c.replace("_", "-").lower() for c in data.columns] # Take just date before the date threshold From cc319ab91149f77dd04e691e6bc6b99bb9d39702 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Mar 2024 10:09:26 +0000 Subject: [PATCH 140/155] new ha analysis wip --- .../ha_15_32/ha_analysis_batch_3.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 9462642f..a0b7e0bb 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -5210,7 +5210,7 @@ def fml_analysis(loader): ] # TODO: There will be some properties that are subject to CIGA that do not look like they ned a CIGA check! pass - # them! + # them! Non-invasices will have checked the wall though codes = [ "HA39", "HA14", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA7", @@ -5352,16 +5352,11 @@ def fml_analysis(loader): ] # Characterise no CIGA check needed - - # TODO: WHAT ABOUT PASSED CIGA - don't need to apply the further deduction - ciga_check_needed = had_survey[ had_survey["ECO Eligibility"].str.contains("subject to ciga") ].copy() - ciga_check_passed = had_survey[ - had_survey["ECO Eligibility"] == "eco4 - passed ciga" - ] + ciga_check_passed = had_survey[had_survey["ECO Eligibility"] == "eco4 - passed ciga"] # These should be treated the same as one that have passed their ciga checks, from a detection perspective ciga_check_passed_eligible = ciga_check_passed[ (ciga_check_passed["walls-description"].str.lower().str.contains("cavity") == True) & @@ -5469,18 +5464,15 @@ def fml_analysis(loader): "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, "Postcode List - Remaining": postcode_list_remaining, - "Of which sold": sales_since_nov, + # "Of which sold": sales_since_nov, "Of which ECO4 Eligible - Remaining": int(total_expectation), "Of which GBIS Eligibile - Remaining": int(total_gbis_expectation), - "Proportion with a survey": proportion_with_survey, + # "Proportion with a survey": proportion_with_survey, } ) results_df = pd.DataFrame(results) - - wall_descriptions = list(set(wall_descriptions)) - from pprint import pprint - pprint(wall_descriptions) + results_df.to_csv("analysis - revised.csv") # results_df["Delta vs November"] = 100 * ( # results_df["Of which ECO4 Eligible - Remaining"] - results_df["Original ECO4 Estimate - Remaining"] From 12f780a08989e896235adf96e175d39240c3adbb Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Mar 2024 16:54:48 +0000 Subject: [PATCH 141/155] setting up complete data pull --- .../ha_15_32/ha_analysis_batch_3.py | 380 +++++++++++++++++- 1 file changed, 369 insertions(+), 11 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index a0b7e0bb..902d48fd 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -42,6 +42,15 @@ PROPERTY_TYPE_LOOKUP = { 'Detached Local Connect': 'Detached', } }, + "HA2": { + 'HOUSE': 'House', + 'FLAT': 'Flat', + 'SHELTERED': None, + 'BUNGALOW': 'Bungalow', + 'BED-SIT': None, + 'MAISONETTE': "Maisonette", + 'HOSTEL': None + }, "HA6": { "property_type": { 'HOUSE': "House", @@ -69,6 +78,23 @@ PROPERTY_TYPE_LOOKUP = { "End Terraced": "End-Terrace", } }, + "HA12": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + "Bedsit": None, + }, + "HA13": { + 'House': "House", + 'Flat': "Flat", + 'House MT': "House", + 'House SD': "House", + 'House ET': "House", + 'Bungalow MT': "Bungalow", + 'Bungalow ET': "Bungalow", + 'ii': None, + }, "HA14": { "property_type": { "House": "House", @@ -77,6 +103,13 @@ PROPERTY_TYPE_LOOKUP = { "Maisonette": "Maisonette", } }, + "HA15": { + 'House': 'House', + 'Flat': 'Flat', + 'Bungalow': 'Bungalow', + 'Maisonette': 'Maisonette', + 'Flat over garage': 'Flat', + }, "HA16": { 'Semi Detached Bungalow': {"property-type": "Bungalow", "built-form": "Semi-Detached"}, 'Mid Terraced House': {"property-type": "House", "built-form": "Mid-Terrace"}, @@ -95,6 +128,30 @@ PROPERTY_TYPE_LOOKUP = { 'Flat Over Shop': {"property-type": "Flat", "built-form": "Mid-Terrace"}, 'Mid Terraced Town House': {"property-type": "House", "built-form": "Mid-Terrace"}, }, + "HA18": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + "Bedsit": None, + "Shop": None, + "Hostel": None, + "Block": None, + }, + "HA24": { + '01 HOUSE': 'House', + '02 FLAT': 'Flat', + '03 BUNGALOW': 'Bungalow', + '10 PBUNGALOW': 'Bungalow', + '01 HOUSE MID': 'House', + '13 SBUNGALOW': 'Bungalow', + '12 SBEDSIT': None, # BEDSIT does not match the specified property types + '14 SFLAT': 'Flat', + '05 BEDSIT': None, + '04 MAISONETTE': 'Maisonette', + '11 PFLAT': 'Flat', + '09 PBEDSIT': None + }, "HA25": { 'Flat': 'Flat', 'Mid Terrace House': 'House', @@ -116,6 +173,77 @@ PROPERTY_TYPE_LOOKUP = { 'Mid Terrace Housekeeping ': 'House', 'End Terrace Housex': 'House' }, + "HA28": { + 'Flat': 'Flat', + 'Semi detached house': 'House', + 'Terraced house': 'House', + 'Maisonette flat': 'Maisonette', + 'Sheltered bedsit': None, + 'APD flat': 'Flat', + 'Bungalow terraced': 'Bungalow', + 'Flat with partition': 'Flat', + 'Bungalow semi detached': 'Bungalow', + 'APD Bungalow': 'Bungalow', + 'Sheltered flat': 'Flat', + 'Bedsit Flat': 'Flat', + 'Bedsit bungalow semi detached': 'Bungalow', + 'Sheltered bungalow terraced': 'Bungalow', + 'Sheltered bedsit disabled': None, + 'Bedsit bungalow terraced': 'Bungalow', + 'Sheltered bungalow semi detached': 'Bungalow', + 'Sheltered warden flat': 'Flat', + 'Bungalow detached': 'Bungalow', + 'Block': None, # Does not match the specified property types + 'End Terraced House': 'House', + 'Mid Terraced House': 'House', + '#N/A': None, # Assuming this is an invalid or missing entry + 0: None # Assuming 0 is also an invalid or missing entry + }, + "HA30": { + 'House': 'House', + 'Flat': 'Flat', + 'Bungalow': 'Bungalow', + 'House with Attached Garage': 'House', + 'Bed Space': None, # Assuming this does not fit the specified property types + 'House with Garage': 'House', + 'Bungalow with Wheelchair Access': 'Bungalow', + 'Maisonette': 'Maisonette', + 'Flat with Wheelchair Access': 'Flat', + 'Bedsit': None, # Assuming this does not fit the specified property types + 'Flat w Wheelchair Access & Car Park': 'Flat', + 'House with Wheelchair Access': 'House', + 'Bungalow w Wheelchair Access & Car ': 'Bungalow' + }, + "HA32": { + 'Bungalow': 'Bungalow', + 'Flat': 'Flat', + 'Bungalow Disabled': 'Bungalow', # "Disabled" properties categorized with their base type + 'House': 'House', + 'Dormer Bungalow': 'Bungalow', + 'Pop-In': None, # Does not fit the specified property types + 'Flat Disabled': 'Flat', + 'Laundry': None, # Does not fit the specified property types + 'Bedsit': None, # Excluded from the given categories + 'Shed': None, # Does not fit the specified property types + 'Store Room': None # Does not fit the specified property types + }, + "HA34": { + 'Flat': 'Flat', + 'House': 'House', + 'Bungalow': 'Bungalow', + 'Maisonette': 'Maisonette', + 'ND': None, + }, + "HA35": { + "Flat": "Flat", + "Maisonette": "Maisonette", + "House": "House", + "Bedsit": None, + "2 Bedroom Unknown": None, + "1 Bedroom Unknown": None, + "3 Bedroom Unknown": None, + "4 Bedroom Unknown": None, + }, "HA39": { "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, "1st floor flat": {"property_type": "Flat", "built_form": None}, @@ -140,6 +268,105 @@ PROPERTY_TYPE_LOOKUP = { "1st floor flat with study room": {"property_type": "Flat", "built_form": None}, "2nd floor flat with study": {"property_type": "Flat", "built_form": None}, }, + "HA41": { + 'Garage': None, + 'House 1919-1945': 'House', + 'House 1946-1964': 'House', + 'Flats & Maisonettes post 1974': 'Flat', + 'Non traditional houses': 'House', + 'Sheltered': None, + 'Flats & Maisonettes 1965-1974': 'Flat', + 'House post 1974': 'House', + 'Block': None, + 'Flats & Maisonettes 1946-1964': 'Flat', + 'House 1965-1974': 'House', + 'Non traditional flats': 'Flat', + 'Bungalow 1965-1974': 'Bungalow', + 'PIMSS EMPTY': None, + 'Bungalow post 1974': 'Bungalow', + 'Bungalow 1946-1964': 'Bungalow', + 'Flats & Maisonettes 1919-1945': 'Flat', + 'House pre 1919': 'House', + 'Flats & Maisonettes pre 1919': 'Flat', + 'Bungalow 1919-1945': 'Bungalow', + 'Office': None + }, + "HA48": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + "Unit": None + }, + "HA50": { + 'House': 'House', + 'Bungalow': 'Bungalow', + 'Flat': 'Flat', + 'House SD': 'House', + 'House MT': 'House', + 'House ET': 'House', + 'Bungalow ET': 'Bungalow', + 'House SD ': 'House', + 'House. SD': 'House', + 'Bungalow SD': 'Bungalow', + 'Bungalow MT': 'Bungalow', + 'Bungalow D': 'Bungalow', + 'House D': 'House', + 'House. MT': 'House', + 'House ': 'House', + 'House ET ': 'House', + ' ': None, + 'Flat?': 'Flat', + 'Bungalow ': 'Bungalow' + }, + "HA56": { + 'House Non Specific': 'House', + 'HOUSE TERRACED': 'House', + 'HOUSE - SEMI DETACHD': 'House', + 'Bungalow': 'Bungalow', + 'House - End Terraced': 'House', + 'Block': None, + 'Block with Communal': None, + 'Bungalow - Terraced': 'Bungalow', + 'Bungalow - Semi Dtch': 'Bungalow', + 'Block House with rooms': None, + 'Bungalow - End Terr': 'Bungalow', + 'House - Mid Terraced': 'House', + 'Bungalow - Detached': 'Bungalow', + 'House - Detached': 'House', + 'HOUSE THREE STOREY': 'House', + 'Maisonette': 'Maisonette', + 'Communal Block': None, + 'Scheme': None + }, + "HA63": { + 'Flat': 'Flat', + 'House - Semi detached': 'House', + 'House - Detached': 'House', + 'House - End Terrace': 'House', + 'House - Mid Terrace': 'House', + 'Bungalow - Semi detached': 'Bungalow', + 'Bungalow': 'Bungalow', + 'Bedsit': None, # Considering as a non-specific residential category here + 'Maisonette': 'Maisonette', + 'Bungalow - End Terrace': 'Bungalow', + 'Bungalow - Detached': 'Bungalow', + 'Maisonette - Mid Terrace': 'Maisonette', + 'Maisonette - End Terrace': 'Maisonette', + 'Studio Flat': 'Flat', + 'Maisonette - Detached': 'Maisonette', + 'Bungalow - Mid Terrace': 'Bungalow', + 'Bedsit - Mid Terrace': None, + 'Bedsit - End Terrace': None, + 'Amenity Block - Semi detached': None, # Assuming non-residential + 'Maisonette - Semi Detached': 'Maisonette', + 'Amenity Block - Detached': None, # Assuming non-residential + 'Hostel': None, # Typically not considered a standard residential property for this context + 'Bungalow - Attached': 'Bungalow', + 'Unknown': None, # Not enough information to categorize + 'Studio Flat - Mid Terrace': 'Flat', + 'Chalet - Wheelchair': None # Specialized type, not categorized here + }, "HA107": { "property_type": { "HOUSE": "House", @@ -160,6 +387,27 @@ PROPERTY_TYPE_LOOKUP = { "Detached": "Detached", "Detatched": "Detached", } + }, + "HA117": { + "Flat": "Flat", + "House": "House", + "Bungalow": "Bungalow", + "Flat over garage/underpass": "Flat", + }, + "HAXXX": { + 'mid terraced house': 'House', + 'semi detached house': 'House', + '1st fl 4 in a block': 'Flat', + 'G/F 4 in a block': 'Flat', + 'end terraced house': 'House', + '1st floor flat': 'Flat', + 'G/F floor flat': 'Flat', + 'semi detached bungalow': 'Bungalow', + '2nd floor flat': 'Flat', + 'mid terrace bungalow': 'Bungalow', + 'detached bungalow': 'Bungalow', + 'end terrace bungalow': 'Bungalow', + 'Staff accommodation': None # Marked as None due to its special nature } } @@ -2882,12 +3130,36 @@ def get_property_type_and_built_form(property_meta, ha_name): property_type = "Flat" built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"], None) + elif ha_name == "HA2": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Dwelling Type"].strip()) + built_form = None elif ha_name == "HA6": property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]] built_form = property_meta["built_form"] elif ha_name == "HA7": property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["Archetype"]) built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"]) + elif ha_name == "HA9": + property_description = property_meta["Asset Type"].strip().lower() + if "house" in property_description: + return "House", None + + if "flat" in property_description: + return "Flat", None + + if "bungalow" in property_description: + return "Bungalow", None + + if "maisonette" in property_description: + return "Maisonette", None + + return None, None + elif ha_name == "HA12": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset_Type1"].strip()) + built_form = None + elif ha_name == "HA13": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Type Cd"].strip()) + built_form = None elif ha_name == "HA14": if property_meta["Asset Type Description"] == "Block - Repair": # We try and deduce if it's a flat or house, depending on if it has "room" or "flats" in the address @@ -2902,15 +3174,60 @@ def get_property_type_and_built_form(property_meta, ha_name): ] built_form = None - elif ha_name == "HA25": - property_type = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["T1_AssetType"]] + elif ha_name == "HA15": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) built_form = None elif ha_name == "HA16": config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]] property_type = config.get("property-type") built_form = config.get("built-form") - elif ha_name == "HA39": + elif ha_name == "HA18": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset Type"].strip()) + built_form = None + elif ha_name == "HA19": + property_type = property_meta["Dwelling Type"] + built_form = None + elif ha_name == "HA24": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None + elif ha_name == "HA25": + property_type = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["T1_AssetType"]] + built_form = None + elif ha_name == "HA27": + property_type = property_meta["Property Type"] + built_form = None + elif ha_name == "HA28": + property_type = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Property Type - Academy"]] + built_form = None + elif ha_name == "HA30": + property_type = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["A_AssetType"]] + built_form = None + elif ha_name == "HA31": + property_description = property_meta["A_AssetType"].strip().lower() + if "house" in property_description: + return "House", None + if "flat" in property_description: + return "Flat", None + + if "bungalow" in property_description: + return "Bungalow", None + + if "maisonette" in property_description: + return "Maisonette", None + + return None, None + + elif ha_name == "HA32": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Dwelling type"].strip()) + built_form = None + elif ha_name == "HA34": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None + elif ha_name == "HA35": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type Grouping"].strip()) + built_form = None + elif ha_name == "HA39": property_type_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {}) property_type = property_type_config.get("property_type", None) built_form = property_type_config.get("built_form", None) @@ -2921,11 +3238,35 @@ def get_property_type_and_built_form(property_meta, ha_name): property_type = "Flat" else: property_type = "House" + elif ha_name == "HA41": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Archetype"].strip()) + built_form = None + elif ha_name == "HA48": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None + elif ha_name == "HA49": + property_type = property_meta["Property Class"].strip() + built_form = None + elif ha_name == "HA54": + property_type = property_meta["Property Type"] + built_form = None + elif ha_name == "HA56": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Dwelling Type Description"].strip()) + built_form = None + elif ha_name == "HA63": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["PropertyType"].strip()) + built_form = None elif ha_name == "HA107": - property_type = property_meta.get("property_type", None) built_form = property_meta.get("built_form", None) - + elif ha_name == "HA117": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None + elif ha_name == "HAXX": + return property_meta["Property Type"].split(":")[0].strip(), None + elif ha_name == "HAXXX": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Unit Description"].strip()) + built_form = None else: raise NotImplementedError("Implement me") @@ -5119,9 +5460,16 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): - has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16", - # Do these - "HA1", "HA13", "HA50", "HA24"] + has_bruh = [ + # "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", + # Updated get_property_type_and_built_form, still needs running + "HA13", "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", + "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", + # todo + ] + + # Can't pull from EPC database because it's based in Scotland + # "HAXXX", "HAXX" # DO from backend.SearchEpc import SearchEpc epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=" @@ -5134,14 +5482,24 @@ def fml_data_pull(loader): # For each property, search for the latest EPC epc_data = [] for _, row in tqdm(fml.iterrows(), total=fml.shape[0]): + property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha) + + if ha == "HAXXX": + to_join = [str(x) for x in + [row["Door Number"], row["Address Line 1"], row["Address Line 2"], row["Address Line 3"], + row["Postcode"]] if x is not None] + full_address = ", ".join(to_join) + else: + full_address = row["matching_address"] + searcher = SearchEpc( - address1=row["HouseNo"], + address1=str(row["HouseNo"]), postcode=row["matching_postcode"], auth_token=epc_api_key, os_api_key="", property_type=property_type, - full_address=row["matching_address"], + full_address=full_address, ) # Force the skipping of estimating the EPC searcher.ordnance_survey_client.property_type = None @@ -5194,7 +5552,7 @@ def classify_loft(x): def fml_analysis(loader): assumed_ciga_pass_rate = 0.731 - has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16"] + has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16", "HA1"] no_ciga_cavity_descriptions = [ "Cavity wall, as built, insulated (assumed)", From 6423ab2fac732a905645260263ebc72149424712 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 15 Mar 2024 17:53:18 +0000 Subject: [PATCH 142/155] data pull pipeline ready --- backend/SearchEpc.py | 11 +- .../ha_15_32/ha_analysis_batch_3.py | 100 ++++++++++-------- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 3d2df9fb..cc2ee4a9 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -147,6 +147,7 @@ class SearchEpc: uprn: [int, None] = None, size=None, property_type=None, + fast=False ): """ Address lines 1 and postcode are mandatory fields. The other address lines are optional @@ -187,6 +188,7 @@ class SearchEpc: self.size = size if size is not None else 25 self.property_type = property_type + self.fast = fast @classmethod def get_house_number(cls, address: str) -> str | None: @@ -365,9 +367,6 @@ class SearchEpc: # Finally, we identify the newest epc and the rest, and then return newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows) - # Retrieve postcode and address - address_epc, postcode_epc = self.format_address(newest_epc=newest_epc) - # Ge the uprn from the newest record for this home uprns = {r["uprn"] for r in rows if r["uprn"]} # We can sometimes have no uprn for a property @@ -384,6 +383,12 @@ class SearchEpc: uprn = uprns.pop() if uprns else None + if self.fast: + return newest_epc, [], {}, "", "", None + + # Retrieve postcode and address + address_epc, postcode_epc = self.format_address(newest_epc=newest_epc) + return newest_epc, older_epcs, full_sap_epc, address_epc, postcode_epc, uprn @staticmethod diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 902d48fd..7db97733 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -5461,9 +5461,9 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): has_bruh = [ - # "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", + # "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", # Updated get_property_type_and_built_form, still needs running - "HA13", "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", + "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", # todo ] @@ -5474,57 +5474,63 @@ def fml_data_pull(loader): from backend.SearchEpc import SearchEpc epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=" + failed_has = [] for ha in has_bruh: - asset_list = loader.data[ha]["asset_list"].copy() - # properties found as eligibile - fml = asset_list[asset_list["ECO Eligibility"] != "not eligible"] + print(f"Pulling data for {ha}") + try: + asset_list = loader.data[ha]["asset_list"].copy() + # properties found as eligibile + fml = asset_list[asset_list["ECO Eligibility"] != "not eligible"] - # For each property, search for the latest EPC - epc_data = [] - for _, row in tqdm(fml.iterrows(), total=fml.shape[0]): + # For each property, search for the latest EPC + epc_data = [] + for _, row in tqdm(fml.iterrows(), total=fml.shape[0]): - property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha) + property_type, _ = get_property_type_and_built_form(property_meta=row, ha_name=ha) - if ha == "HAXXX": - to_join = [str(x) for x in - [row["Door Number"], row["Address Line 1"], row["Address Line 2"], row["Address Line 3"], - row["Postcode"]] if x is not None] - full_address = ", ".join(to_join) - else: - full_address = row["matching_address"] + if ha == "HAXXX": + to_join = [str(x) for x in + [row["Door Number"], row["Address Line 1"], row["Address Line 2"], row["Address Line 3"], + row["Postcode"]] if x is not None] + full_address = ", ".join(to_join) + else: + full_address = row["matching_address"] - searcher = SearchEpc( - address1=str(row["HouseNo"]), - postcode=row["matching_postcode"], - auth_token=epc_api_key, - os_api_key="", - property_type=property_type, - full_address=full_address, + searcher = SearchEpc( + address1=str(row["HouseNo"]), + postcode=row["matching_postcode"], + auth_token=epc_api_key, + os_api_key="", + property_type=property_type, + full_address=full_address, + fast=True + ) + # Force the skipping of estimating the EPC + searcher.ordnance_survey_client.property_type = None + searcher.ordnance_survey_client.built_form = None + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + continue + + epc = { + "asset_list_row_id": row["asset_list_row_id"], + **searcher.newest_epc.copy() + } + + epc_data.append(epc) + + # Remove None entries + epc_data = [x for x in epc_data if x is not None] + # Save the data in S3 as a parquet + epc_data_df = pd.DataFrame(epc_data) + save_pickle_to_s3( + data=epc_data_df, + bucket_name="retrofit-datalake-dev", + s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle" ) - # Force the skipping of estimating the EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - if searcher.newest_epc is None: - continue - - epc = { - "asset_list_row_id": row["asset_list_row_id"], - **searcher.newest_epc.copy() - } - - epc_data.append(epc) - - # Remove None entries - epc_data = [x for x in epc_data if x is not None] - # Save the data in S3 as a parquet - epc_data_df = pd.DataFrame(epc_data) - save_pickle_to_s3( - data=epc_data_df, - bucket_name="retrofit-datalake-dev", - s3_file_name=f"ha-analysis/revised/{ha}/epc_data.pickle" - ) + except Exception as e: + failed_has.append(ha) def extract_lower_bound(age_band): From 4e077053cd73b4e6cd27392440e4e179846f6f9a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sat, 16 Mar 2024 14:51:39 +0000 Subject: [PATCH 143/155] Adding gbis to output --- .../ha_15_32/ha_analysis_batch_3.py | 92 +++++++++++++++---- 1 file changed, 74 insertions(+), 18 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 7db97733..0ca28927 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3247,6 +3247,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA49": property_type = property_meta["Property Class"].strip() built_form = None + elif ha_name == "HA50": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None elif ha_name == "HA54": property_type = property_meta["Property Type"] built_form = None @@ -5685,12 +5688,6 @@ def fml_analysis(loader): fuck_this["roof_classiciation"] = fuck_this.apply(lambda x: classify_loft(x), axis=1) - # fuck_this["construction-age-band"] = fuck_this["construction-age-band"].apply( - # lambda x: EPCDataProcessor.clean_construction_age_band(x) - # ) - # - # fuck_this['age_lower_bound'] = fuck_this['construction-age-band'].apply(extract_lower_bound) - had_survey = fuck_this[fuck_this["estimated"] == False] # proportion with a survey: @@ -5716,10 +5713,6 @@ def fml_analysis(loader): ] # Characterise no CIGA check needed - ciga_check_needed = had_survey[ - had_survey["ECO Eligibility"].str.contains("subject to ciga") - ].copy() - ciga_check_passed = had_survey[had_survey["ECO Eligibility"] == "eco4 - passed ciga"] # These should be treated the same as one that have passed their ciga checks, from a detection perspective ciga_check_passed_eligible = ciga_check_passed[ @@ -5743,20 +5736,60 @@ def fml_analysis(loader): # differ between variables; floor and wall type errors occur in ~10-15% of EPCs, # compared with ~5% for wall insulation and glazing performance + ciga_check_needed = had_survey[ + had_survey["ECO Eligibility"].str.contains("subject to ciga") + ].copy() + ciga_check_needed_eligible = ciga_check_needed[ (ciga_check_needed["walls-description"].str.lower().str.contains("cavity") == True) & (ciga_check_needed["roof_classiciation"].isin(["high", "medium"])) & (ciga_check_needed["current-energy-efficiency"].astype(float) <= 80) ] + # Finally, characterise gbis properties. Some of the business might look like ECO4 work, whereas we then + # qualify what actually looks like gbis + gbis_identified = had_survey[ + had_survey["ECO Eligibility"] == "gbis" + ].copy() + + gbis_looks_like_eco4 = gbis_identified[ + (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) & + (gbis_identified["roof_classiciation"].isin(["high", "medium"])) & + (gbis_identified["current-energy-efficiency"].astype(float) <= 80) & + ( + ( + (gbis_identified["property-type"] == "House") & + (gbis_identified["built-form"] != "Mid-Terrace") + ) | ( + (gbis_identified["property-type"] == "Bungalow") & + (gbis_identified["built-form"].isin(["Detached"])) + ) + ) + ] + + gbis_qualified = gbis_identified[ + (gbis_identified["walls-description"].isin(no_ciga_cavity_descriptions)) & + (gbis_identified["current-energy-efficiency"].astype(float) <= 80) & + (~gbis_identified["asset_list_row_id"].isin(gbis_looks_like_eco4["asset_list_row_id"].values)) + ] + ciga_check_expectation = np.round(ciga_check_needed_eligible.shape[0] * ha_ciga_pass_rate) without_ciga_expectation = no_ciga_check_needed_eligible.shape[0] passed_ciga_expectation = ciga_check_passed_eligible.shape[0] + identified_as_gbis_looks_like_eco4 = gbis_looks_like_eco4.shape[0] # Need to add on the non-ciga - total_expectation = ciga_check_expectation + without_ciga_expectation + passed_ciga_expectation + total_eco4_expectation = ( + ciga_check_expectation + + without_ciga_expectation + + passed_ciga_expectation + + identified_as_gbis_looks_like_eco4 + ) - total_gbis_expectation = no_ciga_check_needed_eligible_gbis.shape[0] + no_ciga_check_needed_actually_gbis = no_ciga_check_needed_eligible_gbis.shape[0] + gbis_qualified = gbis_qualified.shape[0] + + total_gbis_expectation = no_ciga_check_needed_actually_gbis + gbis_qualified if proportion_with_survey < 100: # We estimate the rest @@ -5805,14 +5838,38 @@ def fml_analysis(loader): without_survey_eco4.shape[0] * (total_gbis_expectation / no_ciga_check_needed.shape[0]) ) - total_expectation = ( - total_expectation + + # And gbis + without_survey_gbis = fuck_this[ + (fuck_this["estimated"] == True) & + (fuck_this["ECO Eligibility"] == "gbis") + ] + + if without_survey_gbis.empty: + without_survey_identified_as_gbis_qualified = 0 + without_survey_identified_as_gbis_eco4 = 0 + else: + # We apply the same conversion rate as the properties with a survey + without_survey_identified_as_gbis_qualified = np.round( + without_survey_gbis.shape[0] * (gbis_qualified / gbis_identified.shape[0]) + ) + + without_survey_identified_as_gbis_eco4 = np.round( + without_survey_eco4.shape[0] * (identified_as_gbis_looks_like_eco4 / gbis_identified.shape[0]) + ) + + total_eco4_expectation = ( + total_eco4_expectation + without_survey_without_ciga_expected + without_survey_passed_ciga_expected + - without_survey_eco4_expected + without_survey_eco4_expected + + without_survey_identified_as_gbis_eco4 ) - total_gbis_expectation = total_gbis_expectation + without_survey_gbis_expected + total_gbis_expectation = ( + total_gbis_expectation + + without_survey_gbis_expected + + without_survey_identified_as_gbis_qualified + ) surveys = loader.data[ha_name]["survey_list"] sold_now = 0 @@ -5829,9 +5886,8 @@ def fml_analysis(loader): "Original ECO4 Estimate - Remaining": original_remaining, "Postcode List - Remaining": postcode_list_remaining, # "Of which sold": sales_since_nov, - "Of which ECO4 Eligible - Remaining": int(total_expectation), + "Of which ECO4 Eligible - Remaining": int(total_eco4_expectation), "Of which GBIS Eligibile - Remaining": int(total_gbis_expectation), - # "Proportion with a survey": proportion_with_survey, } ) From a7ed3b84e560ea3e92517f8568bc7918e352d0e7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Mar 2024 14:12:49 +0000 Subject: [PATCH 144/155] Added HA8 --- .../ha_15_32/ha_analysis_batch_3.py | 98 ++++++++++++++++++- 1 file changed, 93 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 0ca28927..67139e40 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -517,6 +517,11 @@ class DataLoader: asset_list["Address3"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA8": + asset_list["matching_address"] = asset_list["AddressLine1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["AddressLine2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA9": asset_list["matching_address"] = asset_list["House Number"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ @@ -2293,6 +2298,30 @@ class DataLoader: def correct_ha49_survey_list(survey_list): return survey_list + @staticmethod + def correct_ha8_survey_list(survey_list): + # Split on / and take the first half + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.split("/").str[0] + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "WESTONIA COURT HOUSE", "Westonia Court" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Hillesdon Avenue", "Hillesden Avenue" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Weston Street", "Western Street" + ) + + # Remove placeholder rows where postcode is missing + survey_list = survey_list[ + ~pd.isnull(survey_list["Post Code"]) + ] + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -5464,7 +5493,7 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): has_bruh = [ - # "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", + "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", # Updated get_property_type_and_built_form, still needs running "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", @@ -5561,7 +5590,13 @@ def classify_loft(x): def fml_analysis(loader): assumed_ciga_pass_rate = 0.731 - has_bruh = ["HA7", "HA14", "HA25", "HA39", "HA16", "HA1"] + has_bruh = [ + "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", + # Updated get_property_type_and_built_form, still needs running + "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", + "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", + # todo + ] no_ciga_cavity_descriptions = [ "Cavity wall, as built, insulated (assumed)", @@ -5597,12 +5632,13 @@ def fml_analysis(loader): results = [] wall_descriptions = [] - for ha_name in has_bruh: + for ha_name in tqdm(has_bruh): original_figures = loader.december_figures[ loader.december_figures["HA Name"] == ha_name ].copy() original_remaining = original_figures["ECO4 remaining"].values[0] + original_gbis_remaining = original_figures["GBIS remaining"].values[0] postcode_list_remaining = remaining_eligible_mapping[ha_name] # Read in the epc data @@ -5669,7 +5705,54 @@ def fml_analysis(loader): raise Exception("SOMETHING WENT WRONG") if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")): - raise Exception("DO THE DAMN ARCHETYPE CHECK BRO") + # We perform the archetype test. If the property is a house, we it needs to be detached, semi-detached + # or end terrace. If it's a bungalow, it must be attached + fuck_this["passes_archetype"] = None + fuck_this["passes_archetype"] = np.where( + (fuck_this["property-type"] == "House") & + (fuck_this["built-form"].isin(["Semi-Detached", "End-Terrace", "Detached"])), + True, + fuck_this["passes_archetype"] + ) + + fuck_this["passes_archetype"] = np.where( + (fuck_this["property-type"] == "Bungalow") & + (fuck_this["built-form"].isin(["Detached"])), + True, + fuck_this["passes_archetype"] + ) + + fuck_this["ECO Eligibility"] = np.where( + (fuck_this["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)") & + (fuck_this["passes_archetype"] == True), + "eco4 (subject to ciga)", + fuck_this["ECO Eligibility"] + ) + + # If failed the archetype check and needs a CIGA, it's not eligibile + fuck_this["ECO Eligibility"] = np.where( + (fuck_this["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)") & + (fuck_this["passes_archetype"] != True), + "not eligible", + fuck_this["ECO Eligibility"] + ) + + fuck_this["ECO Eligibility"] = np.where( + (fuck_this["ECO Eligibility"] == "eco4 (subject to archetype)") & + (fuck_this["passes_archetype"] == True), + "eco4", + fuck_this["ECO Eligibility"] + ) + + fuck_this["ECO Eligibility"] = np.where( + (fuck_this["ECO Eligibility"] == "eco4 (subject to archetype)") & + (fuck_this["passes_archetype"] != True), + "gbis", + fuck_this["ECO Eligibility"] + ) + + if any(fuck_this["ECO Eligibility"].str.contains("subject to archetype")): + raise Exception("DO THE DAMN ARCHETYPE CHECK BRO") # clean roof insulation fuck_this["roof_insulation_thickness"] = fuck_this["roof_insulation_thickness"].fillna("0") @@ -5685,6 +5768,9 @@ def fml_analysis(loader): fuck_this["roof_insulation_thickness"] = fuck_this[ "roof_insulation_thickness" ].str.replace("average", "150") + fuck_this["roof_insulation_thickness"] = fuck_this[ + "roof_insulation_thickness" + ].str.replace("above 150", "150") fuck_this["roof_classiciation"] = fuck_this.apply(lambda x: classify_loft(x), axis=1) @@ -5884,6 +5970,7 @@ def fml_analysis(loader): { "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, + "Original GGBIS Estimate - Remaining": original_gbis_remaining, "Postcode List - Remaining": postcode_list_remaining, # "Of which sold": sales_since_nov, "Of which ECO4 Eligible - Remaining": int(total_eco4_expectation), @@ -5927,7 +6014,8 @@ def app(): "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", "HA63", "HA107", "HA117", - + # Added as of March 17th + "HA8", # New HAS "HAXX", "HAXXX", ] From 94ad06726320972b02db779b8f2e9440a0ea9c0e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Mar 2024 14:25:49 +0000 Subject: [PATCH 145/155] done ha11 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 67139e40..920ec1b6 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -530,6 +530,12 @@ class DataLoader: asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA11": + asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Post Code"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() elif ha_name == "HA13": asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["address 2"].astype(str).str.lower().str.strip() + ", " + \ @@ -2322,6 +2328,15 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha11_survey_list(survey_list): + # Remove 39 HOLLYWOOD WAY as it's not in the asset list + survey_list = survey_list[ + ~((survey_list["Street / Block Name"] == "HOLLYWOOD WAY") & + (survey_list["NO."] == 39)) + ] + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -6015,7 +6030,7 @@ def app(): "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", "HA63", "HA107", "HA117", # Added as of March 17th - "HA8", + "HA8", "HA11", # New HAS "HAXX", "HAXXX", ] From 9bbcbc881f3f1c50ab8ec422c5b38f04e864e676 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Sun, 17 Mar 2024 14:42:24 +0000 Subject: [PATCH 146/155] Added ha21 --- etl/eligibility/ha_15_32/ha_analysis_batch_3.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 920ec1b6..e9de4695 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -573,6 +573,12 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA21": + asset_list["matching_address"] = ( + asset_list["Address"].astype(str).str.lower().str.strip() + ", " + + asset_list["PostCode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip() elif ha_name == "HA25": asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] @@ -6030,7 +6036,7 @@ def app(): "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", "HA63", "HA107", "HA117", # Added as of March 17th - "HA8", "HA11", + "HA8", "HA11", "HA21", # New HAS "HAXX", "HAXXX", ] @@ -6038,7 +6044,7 @@ def app(): # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE] # - # Consider for ECO4: + # Consider for ECO4: HA 70 - have to merge ECO3 list though, HA17 has LOTs of assets, but the asset list is a mess # Consider for GBIS: # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in From 897d58eec2ecc1e51d4a46878918f6c019a2705c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 10:40:12 +0000 Subject: [PATCH 147/155] Added ha44 --- .../ha_15_32/ha_analysis_batch_3.py | 189 +++++++++++++++++- 1 file changed, 178 insertions(+), 11 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index e9de4695..dc96d403 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -78,6 +78,29 @@ PROPERTY_TYPE_LOOKUP = { "End Terraced": "End-Terrace", } }, + "HA8": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + "Bedsit": None, + "Room": None, + "Other": None, + "Commerical": None + }, + "HA11": { + "Flat": "Flat", + "House": "House", + "Semi-Det House": "House", + "Bedsit": None, + "End-Terr House": "House", + "Mid-Terr House": "House", + "Bungalow": "Bungalow", + "Maisonette": "Maisonette", + "End Terr Flat": "Flat", + "Mid Terr Flat": "Flat", + "Detached Flat": "Flat", + }, "HA12": { "House": "House", "Flat": "Flat", @@ -244,6 +267,13 @@ PROPERTY_TYPE_LOOKUP = { "3 Bedroom Unknown": None, "4 Bedroom Unknown": None, }, + "HA37": { + "FLT": "Flat", + "HSE": "House", + "BNW": "Bungalow", + "MAS": "Maisonette", + "HSL": None + }, "HA39": { "Semi house": {"property_type": "House", "built_form": "Semi-Detached"}, "1st floor flat": {"property_type": "Flat", "built_form": None}, @@ -291,6 +321,21 @@ PROPERTY_TYPE_LOOKUP = { 'Bungalow 1919-1945': 'Bungalow', 'Office': None }, + "HA42": { + 'Flat': 'Flat', + 'House': 'House', + 'Flat Basement': 'Flat', + 'Room': None, + 'Bedsit Flat': 'Flat', + 'Maisonette': 'Maisonette', + 'Scheme Office': None, + 'Scheme Lounge': None, + 'Bungalow': 'Bungalow', + 'Garage': None, + 'Scheme Sleep Room': None, + 'Cluster': None, + 'Scheme Room': None + }, "HA48": { "House": "House", "Flat": "Flat", @@ -626,6 +671,12 @@ class DataLoader: asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Address Post Code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Address Post Code"].astype(str).str.lower().str.strip() + elif ha_name == "HA37": + asset_list["matching_address"] = asset_list["ADDRESS LINE 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["ADDRESS LINE 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["ADDRESS LINE 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["POSTCODE"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip() elif ha_name == "HA38": asset_list["matching_address"] = asset_list["House_Number"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Address_Line_1"].astype(str).str.lower().str.strip() + ", " + \ @@ -650,6 +701,18 @@ class DataLoader: asset_list["AddressLine5"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Postcode"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA42": + asset_list["matching_address"] = asset_list["Dwelling Number"].astype(str).str.lower().str.strip() + " " + \ + asset_list["Street"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Locality"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Town"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA44": + asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postal Code"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["Postal Code"].astype(str).str.lower().str.strip() elif ha_name == "HA50": asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Post Code"].astype(str).str.lower().str.strip() @@ -1177,6 +1240,66 @@ class DataLoader: asset_list["matching_address"] ) + asset_list["HouseNo"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/FLAT C", + ] + )), + "10C", + asset_list["HouseNo"] + ) + + asset_list["matching_address"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/FLAT C", + ] + )), + "FLAT c, spennymoor, co. durham, dl16 7df, 10c, 10 south view", + asset_list["matching_address"] + ) + + asset_list["HouseNo"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/FLAT D", + ] + )), + "10D", + asset_list["HouseNo"] + ) + + asset_list["matching_address"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/FLAT D", + ] + )), + "FLAT d, spennymoor, co. durham, dl16 7df, 10d, 10 south view", + asset_list["matching_address"] + ) + + asset_list["HouseNo"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/FLAT E", + ] + )), + "10E", + asset_list["HouseNo"] + ) + + asset_list["matching_address"] = np.where( + (asset_list["Address_Line_1"].isin( + [ + "10 SOUTH VIEW/FLAT E", + ] + )), + 'FLAT e, spennymoor, co. durham, dl16 7df, 10e, 10 south view', + asset_list["matching_address"] + ) + return asset_list @staticmethod @@ -1730,6 +1853,13 @@ class DataLoader: survey_list["Street / Block Name"] ) + survey_list["Post Code"] = np.where( + (survey_list["Street / Block Name"] == "BEECH ROAD") & + (survey_list["Post Code"] == "DH6 1JD"), + "DH6 1JB", + survey_list["Post Code"] + ) + return survey_list @staticmethod @@ -2343,6 +2473,18 @@ class DataLoader: ] return survey_list + @staticmethod + def correct_ha42_survey_list(survey_list): + # original asset list has nothing in the street + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Turnstone Terrace", "" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Pegasus place", "" + ) + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -2926,7 +3068,7 @@ class DataLoader: "eco4 subject to ciga": "eco4 (subject to ciga)", "eco4 (subject to archetype/ciga)": "eco4 (subject to ciga) (subject to archetype)", "eco4( subject to ciga/archetype)": "eco4 (subject to ciga) (subject to archetype)", - "eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)" + "eco4 (subject to ciga/ archetype)": "eco4 (subject to ciga) (subject to archetype)", } ha_facts_and_figures = [] @@ -3189,6 +3331,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA7": property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"].get(property_meta["Archetype"]) built_form = PROPERTY_TYPE_LOOKUP[ha_name]["built_form"].get(property_meta["Property Type"]) + elif ha_name == "HA8": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None elif ha_name == "HA9": property_description = property_meta["Asset Type"].strip().lower() if "house" in property_description: @@ -3204,6 +3349,9 @@ def get_property_type_and_built_form(property_meta, ha_name): return "Maisonette", None return None, None + elif ha_name == "HA11": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None elif ha_name == "HA12": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset_Type1"].strip()) built_form = None @@ -3237,6 +3385,21 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA19": property_type = property_meta["Dwelling Type"] built_form = None + elif ha_name == "HA21": + property_description = property_meta["Property Type"].strip().lower() + if "house" in property_description: + return "House", None + + if "flat" in property_description: + return "Flat", None + + if "bungalow" in property_description: + return "Bungalow", None + + if "maisonette" in property_description: + return "Maisonette", None + + return None, None elif ha_name == "HA24": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) built_form = None @@ -3277,6 +3440,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA35": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type Grouping"].strip()) built_form = None + elif ha_name == "HA37": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["PROPERTY TYPE"].strip()) + built_form = None elif ha_name == "HA39": property_type_config = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["ConstructionStyle"], {}) property_type = property_type_config.get("property_type", None) @@ -3291,6 +3457,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA41": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Archetype"].strip()) built_form = None + elif ha_name == "HA42": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Dwelling use/type"].strip()) + built_form = None elif ha_name == "HA48": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) built_form = None @@ -5515,10 +5684,9 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): has_bruh = [ "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", - # Updated get_property_type_and_built_form, still needs running "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", - # todo + 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', ] # Can't pull from EPC database because it's based in Scotland @@ -5613,10 +5781,9 @@ def fml_analysis(loader): assumed_ciga_pass_rate = 0.731 has_bruh = [ "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", - # Updated get_property_type_and_built_form, still needs running "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", - # todo + 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', ] no_ciga_cavity_descriptions = [ @@ -5639,7 +5806,7 @@ def fml_analysis(loader): "HA39", "HA14", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA7", "HA16", "HA107", "HA25", "HA50", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA13", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", - "HA30", "HA31", "HA54", "HAXX", "HA49", "HAXXX" + "HA30", "HA31", "HA54", "HAXX", "HA49", "HAXXX", ] values = [ @@ -5660,7 +5827,6 @@ def fml_analysis(loader): ].copy() original_remaining = original_figures["ECO4 remaining"].values[0] original_gbis_remaining = original_figures["GBIS remaining"].values[0] - postcode_list_remaining = remaining_eligible_mapping[ha_name] # Read in the epc data asset_list = loader.data[ha_name]["asset_list"].copy() @@ -5992,10 +6158,10 @@ def fml_analysis(loader): "HA Name": ha_name, "Original ECO4 Estimate - Remaining": original_remaining, "Original GGBIS Estimate - Remaining": original_gbis_remaining, - "Postcode List - Remaining": postcode_list_remaining, + # "Postcode List - Remaining": postcode_list_remaining, # "Of which sold": sales_since_nov, - "Of which ECO4 Eligible - Remaining": int(total_eco4_expectation), - "Of which GBIS Eligibile - Remaining": int(total_gbis_expectation), + "EPC verified ECO4 Eligible - Remaining": int(total_eco4_expectation), + "EPC verified GBIS Eligibile - Remaining": int(total_gbis_expectation), } ) @@ -6036,7 +6202,8 @@ def app(): "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", "HA63", "HA107", "HA117", # Added as of March 17th - "HA8", "HA11", "HA21", + "HA8", "HA11", "HA21", "HA37", "HA42", + "HA44", # New HAS "HAXX", "HAXXX", ] From c58acadb730b6e6ab1ebb700b4669ab3cf171f5b Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 12:19:15 +0000 Subject: [PATCH 148/155] HA51 eco3 matching --- .../ha_15_32/ha_analysis_batch_3.py | 80 ++++++++++++++++--- 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index dc96d403..af9af514 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -491,6 +491,10 @@ class DataLoader: "address": "A_Address", "postcode": "matching_postcode" }, + "HA45": { + "address": "Full postal address", + "postcode": "Postcode" + }, "HA48": { "address": "Full Address", "postcode": "Postcode" @@ -518,7 +522,8 @@ class DataLoader: "HA50": 4, "HA63": 15, "HA107": 51, - "HA48": 0 + "HA48": 0, + "HA45": 0 } UNMATCHED_ECO3 = { @@ -527,7 +532,8 @@ class DataLoader: "HA50": 5, "HA56": 320, "HA63": 0, - "HA117": 4 + "HA117": 4, + "HA51": 24 } def __init__(self, directories, december_figures_filepath, use_cache, rebuild): @@ -542,7 +548,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA49", "HA54"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA54"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -717,6 +723,18 @@ class DataLoader: asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Post Code"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["Post Code"].astype(str).str.lower().str.strip() + elif ha_name == "HA51": + asset_list["matching_address"] = asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["Postcode"].astype(str).str.lower().str.strip() + asset_list["matching_address"] = np.where( + asset_list["Block"].str.strip().str.len() > 0, + asset_list["Block"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["matching_address"], + asset_list["matching_address"] + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA56": asset_list["matching_address"] = asset_list["Address 1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["Address 2"].astype(str).str.lower().str.strip() + ", " + \ @@ -2485,6 +2503,13 @@ class DataLoader: ) return survey_list + @staticmethod + def correct_ha45_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Norwich Road", "Norwich Avenue" + ) + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -2744,6 +2769,38 @@ class DataLoader: return eco3_list + @staticmethod + def correct_ha51_eco3_list(eco3_list): + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "HASELEMERE AVENUE", "HASLEMERE AVENUE" + ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "THORVILLE GROVE", "THORNVILLE GROVE" + ) + eco3_list["Street / Block Name"] = eco3_list["Street / Block Name"].str.replace( + "MONTBRETA CLOSE", "MONTBRETIA CLOSE" + ) + eco3_list["Post Code"] = np.where( + (eco3_list["Street / Block Name"] == "SYDENHAM ROAD") & + (eco3_list["Post Code"] == "CR0 2DW"), + "CR0 2ED", + eco3_list["Post Code"] + ) + # Not in asset list + eco3_list = eco3_list[ + ~((eco3_list["Street / Block Name"] == "WOODLEY LANE") & + (eco3_list["Post Code"] == "SM5 2RJ") & + (eco3_list["NO "] == "FLAT 3, 11")) + ] + + eco3_list["NO "] = np.where( + (eco3_list["NO "] == "47 B"), + "47B", + eco3_list["NO "] + ) + + return eco3_list + def merge_eco3_to_assets(self, asset_list, eco3_list, ha_name): eco3_list_correction_function = getattr(self, f"correct_{ha_name.lower()}_eco3_list") @@ -2752,7 +2809,7 @@ class DataLoader: asset_list["matching_postcode_nospace"] = asset_list["matching_postcode"].str.replace(" ", "").str.lower() eco3_list["postcode_no_space"] = eco3_list["Post Code"].str.lower().str.replace(" ", "") - if ha_name in ["HA25", "HA56"]: + if ha_name in ["HA25", "HA56", "HA51"]: # HA25: 317 -> 259 missed_postcodes = { postcode for postcode in eco3_list["postcode_no_space"] if @@ -2774,7 +2831,7 @@ class DataLoader: matching_lookup = [] missed = [] for _, row in tqdm(eco3_list.iterrows(), total=len(eco3_list)): - # if row["eco3_list_row_id"] == "HA25_Eco3_5422": + # if row["eco3_list_row_id"] == "HA51_Eco3_22": # raise Exception() postcode = row["postcode_no_space"] @@ -2813,6 +2870,12 @@ class DataLoader: missed.append(row["eco3_list_row_id"]) continue + if df.shape[0] > 1: + if "flat" in str(row["NO "]).lower(): + df = df[df["matching_address"].str.contains("flat")] + else: + df = df[~df["matching_address"].str.contains("flat")] + if df.shape[0] != 1: print(row["Street / Block Name"]) print(house_number) @@ -6200,10 +6263,9 @@ def app(): priority_has = [ "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", - "HA63", "HA107", "HA117", - # Added as of March 17th - "HA8", "HA11", "HA21", "HA37", "HA42", - "HA44", + "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42", + # Added as of March 18th + "HA44", "HA45", "HA51", # New HAS "HAXX", "HAXXX", ] From e7cd80eba0ef8f11c62506509b5a7d60c7a37ce7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 12:34:28 +0000 Subject: [PATCH 149/155] Added HA52 --- .../ha_15_32/ha_analysis_batch_3.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index af9af514..056a4190 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -946,6 +946,17 @@ class DataLoader: else: return "ECO surveys" + @staticmethod + def correct_ha51_asset_list(asset_list): + # Correct this + asset_list["HouseNo"] = np.where( + asset_list["matching_address"].str.contains("61 wandle bank"), + asset_list["Block"].str.lower(), + asset_list["HouseNo"] + ) + + return asset_list + def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) asset_sheetname = self.get_asset_sheetname(workbook) @@ -2510,6 +2521,16 @@ class DataLoader: ) return survey_list + @staticmethod + def correct_ha51_survey_list(survey_list): + survey_list = survey_list.rename(columns={"NO ": "NO."}) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Autum Close", "Autumn Close" + ) + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() From e6c9dd7074dfba12668b31651ec1a5d9eab6a27c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 12:55:37 +0000 Subject: [PATCH 150/155] Done HA52 --- .../ha_15_32/ha_analysis_batch_3.py | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 056a4190..bdf15917 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -503,6 +503,10 @@ class DataLoader: "address": "Property Address Full", "postcode": "Property Postcode" }, + "HA52": { + "address": "Postal Address", + "postcode": "POSTCODE" + }, "HA54": { "address": "Postal Address", "postcode": "matching_postcode" @@ -523,7 +527,8 @@ class DataLoader: "HA63": 15, "HA107": 51, "HA48": 0, - "HA45": 0 + "HA45": 0, + "HA52": 5 } UNMATCHED_ECO3 = { @@ -548,7 +553,7 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA54"]: + if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA52", "HA54"]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -2531,6 +2536,25 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha52_survey_list(survey_list): + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Mardalle Avenue", "Mardale Avenue" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Ollerton Close, Grappenhall", "Ollerton Close" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Bradshaw Road, Grappenhall", "Bradshaw Lane" + ) + + # Drop a bunch of dupes + survey_list = survey_list.drop_duplicates(["NO.", "Street / Block Name", "Post Code"]) + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -3165,7 +3189,12 @@ class DataLoader: asset_list_starting_size = asset_list.shape[0] # Change the column name if it's ECO eligibility - asset_list = asset_list.rename(columns={"ECO eligibility": "ECO Eligibility"}) + asset_list = asset_list.rename( + columns={ + "ECO eligibility": "ECO Eligibility", + "ECO Eligibilty": "ECO Eligibility", + }, + ) # Remove surplus whitespace from the ECO Eligibility column asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip() # Push to lower case @@ -6286,7 +6315,7 @@ def app(): "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42", # Added as of March 18th - "HA44", "HA45", "HA51", + "HA44", "HA45", "HA51", "HA52", # New HAS "HAXX", "HAXXX", ] From 92193d773dbd72aca67da82870d3f7da5a4acfe7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 13:21:57 +0000 Subject: [PATCH 151/155] fix facts and figures bug for ha51 --- .../ha_15_32/ha_analysis_batch_3.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index bdf15917..e40bb98b 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -3305,11 +3305,18 @@ class DataLoader: ) else: # We have some examples, e.g. HA28, where we do not have the installed or cancelled column - survey_list["installation_status"] = np.where( - survey_list['INSTALL/ CANCELLATION DATE'].str.lower().str.contains("cancelled"), - "cancelled", - "installed", - ) + if 'INSTALL/ CANCELLATION DATE' in survey_list.columns: + survey_list["installation_status"] = np.where( + survey_list['INSTALL/ CANCELLATION DATE'].str.lower().str.contains("cancelled"), + "cancelled", + "installed", + ) + else: + survey_list["installation_status"] = np.where( + survey_list['INSTALL / CANCELLATION DATE'].str.lower().str.contains("cancelled"), + "cancelled", + "installed", + ) # Finally, for other cases, we set the status to "in progress" survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") @@ -5800,6 +5807,8 @@ def fml_data_pull(loader): "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', + # NEW - add property type + 'HA44', 'HA45', 'HA51', 'HA52' ] # Can't pull from EPC database because it's based in Scotland From 443aa585d0c3c35ae34718f0e8338ec48ba7ad3c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 15:40:52 +0000 Subject: [PATCH 152/155] Adding ha5 --- .../ha_15_32/ha_analysis_batch_3.py | 181 +++++++++++++++++- 1 file changed, 171 insertions(+), 10 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index e40bb98b..009064c6 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -336,6 +336,16 @@ PROPERTY_TYPE_LOOKUP = { 'Cluster': None, 'Scheme Room': None }, + "HA45": { + 'Large block of flats': 'Flat', + 'Small block of flats/dwelling converted in to flats': 'Flat', + 'Semi-detached house': 'House', + 'Mid-terraced house': 'House', + 'End-terraced house': 'House', + 'Block of flats': 'Flat', + 'Detached house': 'House', + 'Flat in mixed use building': 'Flat', + }, "HA48": { "House": "House", "Flat": "Flat", @@ -364,6 +374,30 @@ PROPERTY_TYPE_LOOKUP = { 'Flat?': 'Flat', 'Bungalow ': 'Bungalow' }, + "HA51": { + 'FLAT': 'Flat', + 'HOUSE': 'House', + 'MAISONETTE': 'Maisonette', + 'BEDSIT': None, # Considering as a non-specific residential category here + 'BUNGALOW': 'Bungalow', + }, + "HA52": { + 'House - Mid Terrace': 'House', + 'Flat - First Floor': 'Flat', + 'Flat - Ground Floor': 'Flat', + 'House - Semi-Detached': 'House', + 'House - End Terrace': 'House', + 'Flat - Second Floor': 'Flat', + 'Bedsit': None, # Considering as a non-specific residential category here + 'Bungalow - Semi-Detached': 'Bungalow', + 'Bungalow - Mid Terrace': 'Bungalow', + 'Bungalow - End Terrace': 'Bungalow', + 'House - Detached': 'House', + 'Flat - Third Floor': 'Flat', + 'House attached to flats': 'House', + 'Flat - Fourth Floor': 'Flat', + 'Bungalow - Detached': 'Bungalow' + }, "HA56": { 'House Non Specific': 'House', 'HOUSE TERRACED': 'House', @@ -463,6 +497,10 @@ class DataLoader: "address": "Address", "postcode": "Address - Postcode" }, + "HA5": { + "address": "Address", + "postcode": "matching_postcode" + }, "HA6": { "address": "propertyaddress", "postcode": "address" # The 'address' column actually contains postcode @@ -553,7 +591,9 @@ class DataLoader: def create_asset_list_matching_address(self, ha_name, asset_list): - if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA52", "HA54"]: + if ha_name in [ + "HA1", "HA5", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA45", "HA48", "HA49", "HA52", "HA54" + ]: asset_list["matching_address"] = asset_list[ self.COLUMN_CONFIG[ha_name]["address"] ].astype(str).str.lower().str.strip() @@ -750,6 +790,10 @@ class DataLoader: asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \ asset_list["POSTCODE"].astype(str).str.lower().str.strip() asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip() + elif ha_name == "HA70": + asset_list["matching_address"] = asset_list["Address1"].astype(str).str.lower().str.strip() + ", " + \ + asset_list["POSTCODE"].astype(str).str.lower().str.strip() + asset_list["matching_postcode"] = asset_list["POSTCODE"].astype(str).str.lower().str.strip() elif ha_name == "HA107": # Create matching_address by concatenating House No, Street, Town, District, Postcode asset_list["matching_address"] = asset_list["House No"].astype(str).str.lower().str.strip() + ", " + \ @@ -962,9 +1006,100 @@ class DataLoader: return asset_list + def prepare_ha17(self, workbook): + blocks_sheet = workbook["Blocks List - Cavity Wall only"] + blocks_data = [] + blocks_colnames = [cell.value for cell in blocks_sheet[2]] + for row in blocks_sheet.iter_rows(min_row=4, values_only=False): + row_data = [cell.value for cell in row] # This will get you the cell values + blocks_data.append(row_data) + + blocks_df = pd.DataFrame(blocks_data, columns=blocks_colnames) + + blocks_df["matching_address"] = ( + blocks_df["Block Name\n[as per Naming Convention procedure]"].astype(str).str.lower().str.strip() + ", " + + blocks_df["Block Street Name"].astype(str).str.lower().str.strip() + ", " + + blocks_df["Postcode"].astype(str).str.lower().str.strip() + ) + blocks_df["matching_postcode"] = blocks_df["Postcode"].astype(str).str.lower().str.strip() + blocks_df["property_type"] = "Flat" + + street_properties_sheet = workbook["Street Properties - Cavity Wall"] + street_properties_data = [] + street_properties_colnames = [cell.value for cell in street_properties_sheet[2]] + for row in street_properties_sheet.iter_rows(min_row=3, values_only=False): + row_data = [cell.value for cell in row] # This will get you the cell values + street_properties_data.append(row_data) + + street_properties_df = pd.DataFrame(street_properties_data, columns=street_properties_colnames) + + street_properties_df["matching_address"] = ( + street_properties_df["Block Name\n[as per Naming Convention procedure]"].astype( + str).str.lower().str.strip() + ", " + + street_properties_df["Postcode"].astype(str).str.lower().str.strip() + ) + street_properties_df["matching_postcode"] = street_properties_df["Postcode"].astype(str).str.lower().str.strip() + street_properties_df["property_type"] = street_properties_df[ + "Block typology based on dwelling type\n[defined list]" + ] + + asset_list_compressed = pd.concat( + [ + blocks_df[["matching_address", "matching_postcode", "property_type", "ECO Eligibility"]], + street_properties_df[["matching_address", "matching_postcode", "property_type", "ECO Eligibility"]] + ], + axis=0 + ) + # We expand + range_pattern = r"(\d+)\s+to\s+(\d+)\s+(.*)" + asset_list = [] + for _, row in tqdm(asset_list_compressed.iterrows(), total=len(asset_list_compressed)): + if row["ECO Eligibility"] == "Not Eligible": + asset_list.append(row.to_dict()) + continue + + # Detect a house number range + match = re.search(range_pattern, row["matching_address"]) + + if not match: + asset_list.append(row.to_dict()) + continue + + # Extracting the start and end of the range + start_number = int(match.group(1)) + end_number = int(match.group(2)) + rest_of_address = match.group(3) + + # Generating the list of house numbers + house_numbers = list(range(start_number, end_number + 1)) + data_to_extend = [] + for house_number in house_numbers: + new_adress = f"{house_number} {rest_of_address}" + + entry = row.to_dict().copy() + entry.update({"matching_address": new_adress}) + + data_to_extend.append(entry) + + asset_list.extend(data_to_extend) + + asset_list = pd.DataFrame(asset_list) + + # Add in asset_list_row_id + asset_list["asset_list_row_id"] = ["HA17" + str(i) for i in range(0, len(asset_list))] + + # Add on house number + asset_list = self.create_asset_list_house_no(ha_name="HA17", asset_list=asset_list) + + return asset_list + def load_asset_list(self, filepath, ha_name): workbook = openpyxl.load_workbook(filepath) - asset_sheetname = self.get_asset_sheetname(workbook) + if ha_name == "HA17": + asset_list = self.prepare_ha17(workbook) + return asset_list, pd.DataFrame(), pd.DataFrame(), pd.DataFrame() + else: + asset_sheetname = self.get_asset_sheetname(workbook) asset_sheet = workbook[asset_sheetname] asset_sheet_colnames = [cell.value for cell in asset_sheet[1]] @@ -977,6 +1112,9 @@ class DataLoader: if ha_name == "HA54": asset_sheet_colnames[10] = "matching_postcode" + if ha_name == "HA5": + asset_sheet_colnames[2] = "matching_postcode" + rows_data = [] for row in asset_sheet.iter_rows(min_row=2, values_only=False): @@ -2555,6 +2693,10 @@ class DataLoader: return survey_list + @staticmethod + def correct_ha5_survey_list(survey_list): + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -3431,6 +3573,9 @@ class DataLoader: def get_property_type_and_built_form(property_meta, ha_name): + if ha_name in ["HA44"]: + return None, None + if ha_name == "HA1": property_type = property_meta["Asset Type"] # We correct a small error @@ -3499,6 +3644,8 @@ def get_property_type_and_built_form(property_meta, ha_name): config = PROPERTY_TYPE_LOOKUP[ha_name][property_meta["Type"]] property_type = config.get("property-type") built_form = config.get("built-form") + elif ha_name == "HA17": + return property_meta["property_type"], None elif ha_name == "HA18": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset Type"].strip()) built_form = None @@ -3580,6 +3727,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA42": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Dwelling use/type"].strip()) built_form = None + elif ha_name == "HA45": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property type"].strip()) + built_form = None elif ha_name == "HA48": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) built_form = None @@ -3589,6 +3739,14 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA50": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) built_form = None + elif ha_name == "HA51": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset Type"].strip()) + built_form = None + elif ha_name == "HA52": + if property_meta["Property Type"] is None: + return None, None + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Property Type"].strip()) + built_form = None elif ha_name == "HA54": property_type = property_meta["Property Type"] built_form = None @@ -5806,9 +5964,9 @@ def fml_data_pull(loader): "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", - 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', + 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', # NEW - add property type - 'HA44', 'HA45', 'HA51', 'HA52' + "HA17" ] # Can't pull from EPC database because it's based in Scotland @@ -5905,7 +6063,7 @@ def fml_analysis(loader): "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", - 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', + 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52' ] no_ciga_cavity_descriptions = [ @@ -6320,11 +6478,11 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", - "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", - "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42", + "HA1", "HA2", "HA5", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", + "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", + "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42", # Added as of March 18th - "HA44", "HA45", "HA51", "HA52", + "HA44", "HA45", "HA51", "HA52", "HA17", # New HAS "HAXX", "HAXXX", ] @@ -6332,7 +6490,10 @@ def app(): # back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE], # 35 [DONE], 56 [DONE], 19 [DONE], 18 [DONE], 9 [DONE], 27 [DONE], 34 [DONE], 30 [DONE], 31 [DONE], 54 [DONE] # - # Consider for ECO4: HA 70 - have to merge ECO3 list though, HA17 has LOTs of assets, but the asset list is a mess + # Consider for ECO4: + # HA 70 - have to merge ECO3 list though, + # HA17 has LOTs of assets, but the asset list is a mess + # HA53 but has EPCs done # Consider for GBIS: # Ignore for now: # 38 [problematic, but no ECO4], 10 problematic (no eligibility), 20 has barely any in From 6ccfff0411ee2af58d6f7dc47b98f2deb70eac5c Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 16:14:11 +0000 Subject: [PATCH 153/155] Added ha20 --- .../ha_15_32/ha_analysis_batch_3.py | 50 +++++++++++++++++-- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 009064c6..627fcede 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -566,7 +566,8 @@ class DataLoader: "HA107": 51, "HA48": 0, "HA45": 0, - "HA52": 5 + "HA52": 5, + "HA20": 6 } UNMATCHED_ECO3 = { @@ -669,6 +670,17 @@ class DataLoader: asset_list["Postcode"].astype(str).str.lower().str.strip() ) asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() + elif ha_name == "HA20": + asset_list["matching_address"] = ( + asset_list["House Name"].astype(str).str.lower().str.strip() + ", " + + asset_list["Block"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 1"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 2"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 3"].astype(str).str.lower().str.strip() + ", " + + asset_list["Address Line 4"].astype(str).str.lower().str.strip() + ", " + + asset_list["Postcode"].astype(str).str.lower().str.strip() + ) + asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip() elif ha_name == "HA21": asset_list["matching_address"] = ( asset_list["Address"].astype(str).str.lower().str.strip() + ", " + @@ -2697,6 +2709,35 @@ class DataLoader: def correct_ha5_survey_list(survey_list): return survey_list + @staticmethod + def correct_ha20_survey_list(survey_list): + # Not in the asset list + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Abbot Close", "ABBOTS CLOSE" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Downbarns Road", "DOWN BARNS ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "Austin Lane", "AUSTINS LANE" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "South Park Way", "SOUTHPARK WAY" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "OAKLAND ROAD", "OAKWOOD ROAD" + ) + + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "ACRE WAY/NORTHWOOD", "ACRE WAY" + ) + + return survey_list + @staticmethod def levenstein_match(matching_string, df): match_to = df["matching_address"].tolist() @@ -3301,7 +3342,8 @@ class DataLoader: "AFF0RDALE WARMTH": "ECO4", "ECO 4 RdSAP CL": "ECO4", "Affordable Warmth (R) ": "ECO4", - "Affordable Warmth ": "ECO4" + "Affordable Warmth ": "ECO4", + "ECO 4 AFFORDABLE WARMTH": "ECO4", } # Since it seems like "subject to archetype check" has some failure conditions, for simplicity, we @@ -6478,11 +6520,11 @@ def app(): # Add in: priority_has = [ - "HA1", "HA2", "HA5", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", + "HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25", "HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56", "HA63", "HA107", "HA117", "HA8", "HA11", "HA21", "HA37", "HA42", # Added as of March 18th - "HA44", "HA45", "HA51", "HA52", "HA17", + "HA44", "HA45", "HA51", "HA52", "HA17", "HA5", "HA20", # New HAS "HAXX", "HAXXX", ] From 3dd30445f92635df45b5da2a756650ca116f3855 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 18 Mar 2024 19:37:11 +0000 Subject: [PATCH 154/155] HA Analysis finalised --- .../ha_15_32/ha_analysis_batch_3.py | 257 +++++++++++++++--- 1 file changed, 225 insertions(+), 32 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 627fcede..2f17ed73 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -51,6 +51,12 @@ PROPERTY_TYPE_LOOKUP = { 'MAISONETTE': "Maisonette", 'HOSTEL': None }, + "HA5": { + "House": "House", + "Flat": "Flat", + "Bungalow": "Bungalow", + "Bedsit": None + }, "HA6": { "property_type": { 'HOUSE': "House", @@ -161,6 +167,21 @@ PROPERTY_TYPE_LOOKUP = { "Hostel": None, "Block": None, }, + "HA20": { + "House": "House", + "Flat": "Flat", + 'Sheltered Flat': "Flat", + 'Maisonette': 'Maisonette', + 'Bungalow': 'Bungalow', + 'House. SD': 'House', + 'House. MT': 'House', + 'House. ET': 'House', + 'Sheltered Bungalow': 'Bungalow', + 'Guest Accomodation': None, + 'Sheltered House': 'House', + 'House. MT ': 'House', + 'House. D': 'House' + }, "HA24": { '01 HOUSE': 'House', '02 FLAT': 'Flat', @@ -3632,6 +3653,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA2": property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Dwelling Type"].strip()) built_form = None + elif ha_name == "HA5": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset Type"].strip()) + built_form = None elif ha_name == "HA6": property_type = PROPERTY_TYPE_LOOKUP[ha_name]["property_type"][property_meta["Dwelling type"]] built_form = property_meta["built_form"] @@ -3694,6 +3718,9 @@ def get_property_type_and_built_form(property_meta, ha_name): elif ha_name == "HA19": property_type = property_meta["Dwelling Type"] built_form = None + elif ha_name == "HA20": + property_type = PROPERTY_TYPE_LOOKUP[ha_name].get(property_meta["Asset Type"].strip()) + built_form = None elif ha_name == "HA21": property_description = property_meta["Property Type"].strip().lower() if "house" in property_description: @@ -5775,6 +5802,7 @@ def forecast_remaining_sales(loader): results.append(to_append) results = pd.DataFrame(results) + results.to_csv("pipeline_remaining_raw.csv") totals_row = {} for col in results.columns: @@ -6006,9 +6034,7 @@ def fml_data_pull(loader): "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", - 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', - # NEW - add property type - "HA17" + 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20", ] # Can't pull from EPC database because it's based in Scotland @@ -6105,7 +6131,7 @@ def fml_analysis(loader): "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", - 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52' + 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20", ] no_ciga_cavity_descriptions = [ @@ -6124,22 +6150,6 @@ def fml_analysis(loader): # TODO: There will be some properties that are subject to CIGA that do not look like they ned a CIGA check! pass # them! Non-invasices will have checked the wall though - codes = [ - "HA39", "HA14", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA7", - "HA16", "HA107", "HA25", "HA50", "HA41", "HA48", "HA2", "HA63", "HA12", - "HA117", "HA13", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", - "HA30", "HA31", "HA54", "HAXX", "HA49", "HAXXX", - ] - - values = [ - 706, 2161, 1053, 793, 0, 656, 1200, 1647, 4248, 2703, 1087, 1876, 2135, - 1078, 775, 538, 518, 401, 466, 2627, 98, 1050, 524, 191, 538, 384, 204, - 281, 422, 74, 313, 71, 6 - ] - - # Create a dictionary mapping - remaining_eligible_mapping = dict(zip(codes, values)) - results = [] wall_descriptions = [] for ha_name in tqdm(has_bruh): @@ -6397,9 +6407,13 @@ def fml_analysis(loader): without_survey_without_ciga_expected = 0 else: # We apply the same conversion rate as the properties with a survey - without_survey_without_ciga_expected = np.round( - without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) - ) + + if ciga_check_needed.shape[0] == 0 and ciga_check_expectation == 0: + without_survey_without_ciga_expected = without_survey_needing_ciga.shape[0] + else: + without_survey_without_ciga_expected = np.round( + without_survey_needing_ciga.shape[0] * (ciga_check_expectation / ciga_check_needed.shape[0]) + ) without_survey_passed_ciga = fuck_this[ (fuck_this["estimated"] == True) & @@ -6466,15 +6480,6 @@ def fml_analysis(loader): without_survey_identified_as_gbis_qualified ) - surveys = loader.data[ha_name]["survey_list"] - sold_now = 0 - if not surveys.empty: - sold_now = surveys[ - surveys["installation_status"].str.lower().str.contains("eco4") - ].shape[0] - - sales_since_nov = sold_now - original_figures["No. of Tech surveys complete - Eco 4"].values[0] - results.append( { "HA Name": ha_name, @@ -6498,6 +6503,194 @@ def fml_analysis(loader): # TODO: Change the left hand side number for our post CIGA estimates +def create_final_report(): + """ + This function will produce the final output for the HA analysis + :return: + """ + epc_validated_results = pd.read_csv("analysis - revised.csv") + pipeline_results = pd.read_csv("pipeline_remaining_raw.csv") + + #################################### + # Original Warmfront estimates + #################################### + # Create the volumes result + all_ha_summary_remaining = pipeline_results[ + [ + "('', '', '', 'HA Name')", + "('ECO4 original', '', 'Remaining - #', '')", + "('GBIS original', '', 'Remaining - #', '')", + ] + ].copy().rename( + columns={ + "('', '', '', 'HA Name')": "HA Name", + "('ECO4 original', '', 'Remaining - #', '')": "# ECO4 remaining - All HA Summary", + "('GBIS original', '', 'Remaining - #', '')": "# GBIS remaining - All HA Summary", + } + ) + all_ha_summary_remaining["# Total remaining - All HA Summary"] = ( + all_ha_summary_remaining["# ECO4 remaining - All HA Summary"] + + all_ha_summary_remaining["# GBIS remaining - All HA Summary"] + ) + all_ha_summary_remaining = all_ha_summary_remaining.sort_values("HA Name") + + #################################### + # Postcode list - pre-CIGA + #################################### + postcode_list_pre_ciga_remaining = pipeline_results[ + [ + "('', '', '', 'HA Name')", + "('ECO4 pre-ciga', '', 'Remaining - #', '')", + "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')", + ] + ].copy().rename( + columns={ + "('', '', '', 'HA Name')": "HA Name", + "('ECO4 pre-ciga', '', 'Remaining - #', '')": "# ECO4 remaining - Postcode list (pre CIGA)", + "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')": ( + "# GBIS remaining - Postcode list (pre CIGA)" + ), + } + ) + + postcode_list_pre_ciga_remaining["# Total remaining - Postcode list (pre CIGA)"] = ( + postcode_list_pre_ciga_remaining["# ECO4 remaining - Postcode list (pre CIGA)"] + + postcode_list_pre_ciga_remaining["# GBIS remaining - Postcode list (pre CIGA)"] + ) + postcode_list_pre_ciga_remaining = postcode_list_pre_ciga_remaining.sort_values("HA Name") + + #################################### + # Postcode list - post-CIGA + #################################### + postcode_list_post_ciga_remaining = pipeline_results[ + [ + "('', '', '', 'HA Name')", + "('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')", + "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')", + ] + ].copy().rename( + columns={ + "('', '', '', 'HA Name')": "HA Name", + "('ECO4 post-ciga', '', 'Estimated remaining eligible - #', '')": + "# ECO4 remaining - Postcode list (post CIGA)", + "('GBIS Postcode list', 'Warmfront post code list', 'Remaining - #', 'GBIS total')": ( + "# GBIS remaining - Postcode list (post CIGA)" + ), + } + ) + + postcode_list_post_ciga_remaining["# Total remaining - Postcode list (post CIGA)"] = ( + postcode_list_post_ciga_remaining["# ECO4 remaining - Postcode list (post CIGA)"] + + postcode_list_post_ciga_remaining["# GBIS remaining - Postcode list (post CIGA)"] + ) + postcode_list_post_ciga_remaining = postcode_list_post_ciga_remaining.sort_values("HA Name") + + #################################### + # From EPC Database + #################################### + from_epc_database = epc_validated_results[ + [ + "HA Name", + "EPC verified ECO4 Eligible - Remaining", + "EPC verified GBIS Eligibile - Remaining" + ] + ].copy().rename( + columns={ + "EPC verified ECO4 Eligible - Remaining": "# ECO4 remaining - From EPC Database (post CIGA)", + "EPC verified GBIS Eligibile - Remaining": "# GBIS remaining - From EPC Database (post CIGA)", + } + ) + + from_epc_database["# Total remaining - From EPC Database (post CIGA)"] = ( + from_epc_database["# ECO4 remaining - From EPC Database (post CIGA)"] + + from_epc_database["# GBIS remaining - From EPC Database (post CIGA)"] + ) + from_epc_database = from_epc_database.sort_values("HA Name") + + # Combine the datasets + volumes = all_ha_summary_remaining.merge( + postcode_list_pre_ciga_remaining, how="left", on="HA Name" + ).merge( + postcode_list_post_ciga_remaining, how="left", on="HA Name" + ).merge( + from_epc_database, how="inner", on="HA Name" + ) + + revenue = volumes.copy() + # Convert the ECO4 volumes to revenue + for col in [ + '# ECO4 remaining - All HA Summary', + '# ECO4 remaining - Postcode list (pre CIGA)', + '# ECO4 remaining - Postcode list (post CIGA)', + '# ECO4 remaining - From EPC Database (post CIGA)' + ]: + revenue[col] = revenue[col] * 1710 + + # Convert the GBIS volumes to revenue + for col in [ + '# GBIS remaining - All HA Summary', + '# GBIS remaining - Postcode list (pre CIGA)', + '# GBIS remaining - Postcode list (post CIGA)', + '# GBIS remaining - From EPC Database (post CIGA)' + ]: + revenue[col] = revenue[col] * 600 + + # Re-calculate the totals + revenue['# Total remaining - All HA Summary'] = ( + revenue['# ECO4 remaining - All HA Summary'] + revenue['# GBIS remaining - All HA Summary'] + ) + + revenue['# Total remaining - Postcode list (pre CIGA)'] = ( + revenue['# ECO4 remaining - Postcode list (pre CIGA)'] + revenue['# GBIS remaining - Postcode list (pre CIGA)'] + ) + + revenue['# Total remaining - Postcode list (post CIGA)'] = ( + revenue['# ECO4 remaining - Postcode list (post CIGA)'] + revenue[ + '# GBIS remaining - Postcode list (post CIGA)'] + ) + + revenue['# Total remaining - From EPC Database (post CIGA)'] = ( + revenue['# ECO4 remaining - From EPC Database (post CIGA)'] + + revenue['# GBIS remaining - From EPC Database (post CIGA)'] + ) + + # Replace the # with £ in the columns + revnue_colnames = [col.replace("#", "£") for col in revenue.columns] + revenue.columns = revnue_colnames + + # We check that each column gets smaller + decreasing_check1 = all( + volumes["# ECO4 remaining - Postcode list (pre CIGA)"] >= volumes[ + '# ECO4 remaining - Postcode list (post CIGA)'] + ) + if not decreasing_check1: + raise ValueError("decreasing_check1 failed") + + # Just HA32 and HA17 should fail this, and it's due to GBIS jobs looking like ECO4 + decreasing_check2 = volumes[volumes["# ECO4 remaining - From EPC Database (post CIGA)"] > volumes[ + "# ECO4 remaining - Postcode list (post CIGA)"]] + + if set(decreasing_check2["HA Name"].tolist()) != {"HA17", "HA32"}: + raise ValueError("decreasing_check2 failed") + + # Check for GBIS + decreasing_check3 = all( + volumes["# GBIS remaining - Postcode list (pre CIGA)"] >= volumes[ + '# GBIS remaining - Postcode list (post CIGA)'] + ) + + if not decreasing_check3: + raise ValueError("decreasing_check3 failed") + + # Don't perform this - this happens for multiple + # decreasing_check4 = volumes[volumes["# GBIS remaining - From EPC Database (post CIGA)"] > volumes[ + # "# GBIS remaining - Postcode list (post CIGA)"]] + + # Store final outputs + volumes.to_csv("HA Analysis Final - volumes.csv") + revenue.to_csv("HA Analysis Final - revenue.csv") + + def app(): """ This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107. From 724379a86d1bd9b79159f2f8f9e5d8abe9496f5f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 26 Mar 2024 18:05:08 +0000 Subject: [PATCH 155/155] wrapping up ha analysis --- .../ha_15_32/ha_analysis_batch_3.py | 170 ++++++++++-------- 1 file changed, 94 insertions(+), 76 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index 2f17ed73..e414cd00 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -5366,6 +5366,7 @@ def forecast_remaining_sales(loader): results = [] for ha_name, input_data in loader.data.items(): + # Original warmfront figures - ECO4 original_warmfront_estimates = december_figures[december_figures["HA Name"] == ha_name] if original_warmfront_estimates.empty: @@ -6032,7 +6033,7 @@ def forecast_remaining_sales(loader): def fml_data_pull(loader): has_bruh = [ "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", - "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", + "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20", ] @@ -6129,7 +6130,7 @@ def fml_analysis(loader): assumed_ciga_pass_rate = 0.731 has_bruh = [ "HA7", "HA14", "HA25", "HA39", "HA16", "HA28", "HA13", - "HA50", "HA24", "HA15", "HA32", "HA28", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", + "HA50", "HA24", "HA15", "HA32", "HA6", "HA1", "HA107", "HA41", "HA48", "HA2", "HA63", "HA12", "HA117", "HA35", "HA34", "HA56", "HA19", "HA18", "HA9", "HA27", "HA30", "HA31", "HA54", "HA49", 'HA8', 'HA11', 'HA21', 'HA37', 'HA42', 'HA44', 'HA45', 'HA51', 'HA52', "HA17", "HA5", "HA20", ] @@ -6738,89 +6739,106 @@ def app(): loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs) loader.load() loader.ha_facts_and_figures() - forecast_remaining_sales(loader) - conversion_rate = 0.95 - archetype_check_conversion = 0.7 - res = [] - for k, v in loader.data.items(): - asset_list = v["asset_list"].copy() - agg = asset_list["ECO Eligibility"].value_counts() - # We find a case where there are properties that have passed CIGA - if not any("passed" in x for x in agg.index): + # gbis rate + # breakdowns = [] + # for ha, data_assets in loader.data.items(): + # asset_list = data_assets["asset_list"].copy() + # breakdown = asset_list["ECO Eligibility"].value_counts().to_dict() + # breakdowns.append(breakdown) + # breakdowns = pd.DataFrame(breakdowns) + # + # installer = [] + # for ha, data_assets in loader.data.items(): + # survey_list = data_assets["survey_list"] + # if survey_list.empty: + # continue + # if "INSTALLER" not in survey_list.columns: + # continue + # + # installers = survey_list["INSTALLER"].value_counts().to_dict() + # installers["ha_name"] = ha + # installer.append(installers) + # installer = pd.DataFrame(installer) + # installer.drop(columns=["ha_name"]).sum().sum() + + # Adhoc - for HA16, get the properties that still need a CIGA check + asset_list_ha16 = loader.data["HA16"]["asset_list"].copy() + ha_16_need_ciga = asset_list_ha16[ + asset_list_ha16["ECO Eligibility"].str.contains("subject to ciga") + ] + completed_cigas = loader.data["HA16"]["ciga_list"].copy() + # Store the results + ha_16_need_ciga.to_csv("ha16_need_ciga.csv") + completed_cigas.to_csv("ha16_completed_cigas.csv") + + # Adhoc - look at the current pipeline and identify how many dormant, CIGA dependent properties there are for + # live projects + + # Read excel + orderbook_filepath = "local_data/ha_data/Warmfront HA client order book overview_20240129.xlsx" + orderbook_workbook = openpyxl.load_workbook(orderbook_filepath) + orderbook_sheet = orderbook_workbook["Contractual Info"] + orderbook_colnames = [cell.value for cell in orderbook_sheet[1]] + + rows = [] + for row in orderbook_sheet.iter_rows(min_row=2, values_only=False): + row_data = [cell.value for cell in row] # This will get you the cell values + rows.append(row_data) + + orderbook = pd.DataFrame(rows, columns=orderbook_colnames) + live_orderbook = orderbook[orderbook["Live, New, or Historic?"] == "LIVE"].copy() + live_orderbook['Redacted HA'] = live_orderbook['Redacted HA'].str.replace(" ", "") + + dormant_properties = [] + missed_has = [] + for _, customer in live_orderbook.iterrows(): + if customer['Redacted HA'] not in loader.data.keys(): + missed_has.append(customer['Redacted HA']) continue + asset_list = loader.data[customer['Redacted HA']]["asset_list"].copy() + survey_list = loader.data[customer['Redacted HA']]["survey_list"].copy() + # Remove sold + if not survey_list.empty: + survey_list = survey_list[~pd.isnull(survey_list["asset_list_row_id"])] + asset_list = asset_list.merge( + survey_list[["asset_list_row_id", "installation_status"]], + how="left", + on="asset_list_row_id" + ) + # Anything that has an installation has gone to installation, and therefore is not remaining + asset_list = asset_list[pd.isnull(asset_list["installation_status"])] + asset_list = asset_list.drop(columns=["installation_status"]) - agg = pd.DataFrame(agg).reset_index() - - passed_ciga = agg[agg["ECO Eligibility"] == "eco4 - passed ciga"] - passed_ciga = passed_ciga["count"].values[0] if not passed_ciga.empty else 0 - - failed_ciga = agg[agg["ECO Eligibility"] == "failed ciga"] - failed_ciga = failed_ciga["count"].values[0] if not failed_ciga.empty else 0 - - ciga_pass_rate = passed_ciga / (passed_ciga + failed_ciga) if (passed_ciga + failed_ciga) > 0 else 1 - - dormant_ciga = agg[ - agg["ECO Eligibility"].str.contains("subject to ciga") & - ~agg["ECO Eligibility"].str.contains("subject to archetype") + # We pull out the properties that need a CIGA check + need_ciga = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to ciga)"] + need_archetype = asset_list[asset_list["ECO Eligibility"] == "eco4 (subject to archetype)"] + need_ciga_and_archetype = asset_list[ + asset_list["ECO Eligibility"] == "eco4 (subject to ciga) (subject to archetype)" ] - dormant_ciga = dormant_ciga['count'].values[0] if not dormant_ciga.empty else 0 - - dormant_ciga_archetype = agg[ - agg["ECO Eligibility"].str.contains("subject to ciga") & - agg["ECO Eligibility"].str.contains("subject to archetype") - ] - - dormant_ciga_archetype = dormant_ciga_archetype['count'].values[0] if not dormant_ciga_archetype.empty else 0 - - needing_check = dormant_ciga + dormant_ciga_archetype * archetype_check_conversion - needing_check = np.round(needing_check) - - additional_jobs = (dormant_ciga * ciga_pass_rate * conversion_rate) + ( - dormant_ciga_archetype * archetype_check_conversion * ciga_pass_rate * conversion_rate - ) - additional_jobs = np.round(additional_jobs) - - # We attempt to estimate the uplift and how much of that is attributed to surplus subject to ciga jobs - original_estimate = loader.december_figures[ - loader.december_figures["HA Name"] == k - ] - - original_estimate = original_estimate["ECO4"].values[0] if not original_estimate.empty else 0 - base_eco_figures = agg[ - agg["ECO Eligibility"].isin(["eco4", "eco4 - passed ciga"]) - ]["count"].sum() - eco4_from_ciga = original_estimate - base_eco_figures - eco4_from_ciga = eco4_from_ciga if eco4_from_ciga > 0 else 0 - surplus_from_dormant = additional_jobs - eco4_from_ciga - surplus_from_dormant = 0 if surplus_from_dormant < 0 else surplus_from_dormant - - res.append( + dormant_properties.append( { - "ha_name": k, - "additional_eco4": additional_jobs, - "needing_check": needing_check, - "surplus_from_dormant": surplus_from_dormant + "HA Name": customer['Redacted HA'], + "Need CIGA": need_ciga.shape[0], + "Need Archetype": need_archetype.shape[0], + "Need CIGA and Archetype": need_ciga_and_archetype.shape[0] } ) - res = pd.DataFrame(res) - # Drop the HAs that are not in that pervious draft - # In the v2 draft, there are 12 HAs + dormant_properties = pd.DataFrame(dormant_properties) + totals = dormant_properties.sum() + totals["HA Name"] = "Total" - v5_surplus = res[ - ~res["ha_name"].isin(["HA9"]) - ]["additional_eco4"].sum() - # 7212 properties - # This is not a perfect difference though, because of the variations in how the numbers are recorded in the November - # all HAs sheet. E.g for HA 107, there were 1239 properties identified. In the postcode list, there are 1255, - # however 531 are still needing a CIGA check. Therefore their original figures, in this case, included properties - # pre-CIGA + dormant_properties = pd.concat([dormant_properties, totals.to_frame().T]) + dormant_properties.to_csv("dormant_properties.csv") - v5_surplus_from_dormant = res[ - ~res["ha_name"].isin(["HA9"]) - ]["surplus_from_dormant"].sum() - # 5539.0 - # 9471690 + loader.december_figures["ECO4 remaining"].sum() + december_figures = loader.december_figures.copy() + december_figures["ECO4 remaining"] = np.where( + december_figures["ECO4 remaining"] < 0, + 0, + december_figures["ECO4 remaining"] + ) + december_figures["ECO4 remaining"].sum()