def _worksheet_to_dataframe(worksheet, header_row=1):
    """Load an openpyxl worksheet into a DataFrame, taking column names from ``header_row``."""
    column_names = [cell.value for cell in worksheet[header_row]]
    rows = [
        list(values)
        for values in worksheet.iter_rows(min_row=header_row + 1, values_only=True)
    ]
    return pd.DataFrame(rows, columns=column_names)


def _normalise_postcode(postcode):
    """Lower-case a postcode and remove all spaces so differently formatted postcodes compare equal."""
    return postcode.lower().strip().replace(" ", "")


def _match_by_postcode_and_house_number(properties, normalised_postcodes, postcode, house_number):
    """Return the rows of ``properties`` at ``postcode`` whose ``HouseNo`` equals ``house_number``.

    ``normalised_postcodes`` must be a Series aligned with ``properties`` holding
    postcodes already lower-cased and stripped of spaces.
    """
    # regex=False: a postcode is a literal, not a pattern. na=False: rows with
    # no postcode can never match and must not poison the boolean mask.
    at_postcode = properties[normalised_postcodes.str.contains(postcode, regex=False, na=False)]
    return at_postcode[at_postcode["HouseNo"] == str(house_number)]


def unitas_data_prep(loader):
    """
    Ad hoc preparation of the Unitas (HA50) phase 2 "properties to survey" sheet.

    Steps, in order:

    1. Remove from the HA50 asset list every property already linked to a row of
       the loaded survey list.
    2. Read the "ECO 4 - PHASE 1" and "ECO4 - PHASE 2" sheets of the master rolling
       workbook, keep the phase 1 rows that cannot be found in the loaded survey
       list, append them to the phase 2 rows, match the result back to the asset
       list on postcode + house number and remove those properties as well.
    3. Read the second round of CIGA checks and keep the "subject to ciga"
       properties whose check came back with ``Guarantee == "No"``.
    4. Build the list of properties to survey ("eco4", "eco4 - passed ciga" and
       the CIGA round 2 matches) and attach the newest EPC found by ``SearchEpc``.
    5. Read the survey outcomes workbook and match each outcome to a property to
       survey on postcode + house number.
    6. Write the list of properties to survey to
       "Unitas - phase 2 properties to Survey.xlsx" in the working directory.

    :param loader: object exposing ``loader.data["HA50"]`` with an "asset_list" and
        a "survey_list" DataFrame.
    :return: None. The result is the Excel file written in step 6.
    :raises LookupError: if a completed survey has no postcode or does not match
        exactly one row of the asset list.
    :raises ValueError: if a survey outcome matches more than one property.
    """
    # Local import so the block does not depend on the module's import section.
    import os

    unitas_data = loader.data["HA50"].copy()
    unitas_asset_list = unitas_data["asset_list"].copy()
    unitas_survey_sheet = unitas_data["survey_list"].copy()

    # ------------------------------------------------------------------
    # 1. Remove the already surveyed properties from the asset sheet
    # ------------------------------------------------------------------
    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
    unitas_asset_list = unitas_asset_list.merge(
        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
        how="left",
        on="asset_list_row_id"
    )
    # After the left merge only rows without a survey have a null installation_status
    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])

    # ------------------------------------------------------------------
    # 2. Further completed surveys (phase 1 leftovers + phase 2)
    # ------------------------------------------------------------------
    unitas_phase_1_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
    )
    phase_1_surveys = _worksheet_to_dataframe(unitas_phase_1_workbook["ECO 4 - PHASE 1"])
    phase_2_surveys = _worksheet_to_dataframe(unitas_phase_1_workbook["ECO4 - PHASE 2"])

    # Correct phase 1 surveys in the same fashion as the previously loaded survey list
    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())

    # Collect the phase 1 surveys that are not contained in the data we had before.
    # A survey counts as "already known" when exactly one old row shares its house
    # number and either its postcode or its street / block name.
    additional = []
    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
        same_number = unitas_survey_sheet["NO."] == row["NO."]

        same_postcode = unitas_survey_sheet["Post Code"] == row["Post Code"]
        if unitas_survey_sheet[same_postcode & same_number].shape[0] == 1:
            continue

        same_street = unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]
        if unitas_survey_sheet[same_street & same_number].shape[0] == 1:
            continue

        additional.append(row.to_dict())
    additional = pd.DataFrame(additional)

    # Drop all of the occurrences of "OFFICE USE ONLY" columns. Empty header cells
    # come through as None, hence the isinstance guard.
    phase_2_surveys = phase_2_surveys.drop(
        columns=[c for c in phase_2_surveys.columns if isinstance(c, str) and "OFFICE USE ONLY" in c]
    )
    # De-duplicate while keeping the sheet's column order (a set would shuffle it)
    common_columns = list(dict.fromkeys(c for c in phase_2_surveys.columns if c in additional.columns))
    additional_filtered = additional[common_columns]

    further_unitas_completed_surveys = pd.concat(
        [phase_2_surveys, additional_filtered],
        axis=0,
        ignore_index=True
    )

    # Add a phase 2 key. NOTE: the key is the prefix immediately followed by the row
    # index (no separator); the hard-coded ids below rely on this exact format.
    further_unitas_completed_surveys["survey_list_row_id"] = [
        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
    ]

    # Surveys / postcodes that are skipped when matching back to the asset list
    not_in_asset_list = [
        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
    ]
    additional_postcodes = ["st28bg"]

    full_asset_list = unitas_data["asset_list"].copy()
    full_asset_list["matching_postcode"] = full_asset_list["matching_postcode"].str.lower().str.replace(" ", "")
    # Known typo in the survey sheet
    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
        "ST 5DT", "ST3 5DT"
    )

    # We match these back to the asset list
    matching_lookup = []
    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):
        survey_id = row["survey_list_row_id"]
        if survey_id in not_in_asset_list:
            continue

        if not isinstance(row["Post Code"], str):
            raise LookupError(f"Survey {survey_id} has no postcode and cannot be matched to the asset list")

        postcode_lower = _normalise_postcode(row["Post Code"])
        if postcode_lower in additional_postcodes:
            continue

        df = _match_by_postcode_and_house_number(
            full_asset_list, full_asset_list["matching_postcode"], postcode_lower, row["NO."]
        )
        if df.shape[0] != 1:
            raise LookupError(
                f"Survey {survey_id} ({row['NO.']}, {row['Post Code']}) matched {df.shape[0]} "
                "asset list rows, expected exactly 1"
            )

        matching_lookup.append(
            {
                "survey_list_row_id": survey_id,
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    matching_lookup["phase_2_surveyed"] = True

    # We merge this onto the asset list and drop rows where phase_2_surveyed is populated
    unitas_asset_list = unitas_asset_list.merge(
        matching_lookup, how="left", on="asset_list_row_id"
    )
    unitas_asset_list = unitas_asset_list[
        pd.isnull(unitas_asset_list["phase_2_surveyed"])
    ]

    # ------------------------------------------------------------------
    # 3. New CIGA submissions
    # ------------------------------------------------------------------
    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
    ciga_round_2 = _worksheet_to_dataframe(unitas_round_2_ciga_workbook["Worksheet"])

    ciga_dependent_asset_list = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga", regex=False, na=False)
    ].copy()

    # We merge the ciga sheet to the asset list
    ciga_round_2_matched = ciga_dependent_asset_list.merge(
        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
    )
    # Filter on just the properties that had no guarantee
    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]

    # ------------------------------------------------------------------
    # 4. Properties to (re-)survey, enriched with the latest EPC
    # ------------------------------------------------------------------
    # ECO Eligibility breakdown recorded by the original author:
    #   not eligible            9227
    #   failed ciga             2711
    #   eco4 (subject to ciga)  2238
    #   eco4 - passed ciga       901
    #   gbis                     114
    #   eco4                      91
    unitas_properties_to_survey = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].isin(
            [
                "eco4 - passed ciga",
                "eco4"
            ]
        )
    ].copy()

    unitas_properties_to_survey = pd.concat(
        [
            unitas_properties_to_survey,
            ciga_round_2_matched[unitas_properties_to_survey.columns]
        ]
    )

    # SECURITY(review): this credential is committed to source control. It should be
    # rotated and supplied through configuration only; the literal is kept purely as
    # a backward-compatible fallback for when EPC_API_KEY is not set.
    epc_api_key = os.environ.get(
        "EPC_API_KEY",
        "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA=",
    )

    # We now retrieve the latest EPC data
    epc_data = []
    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
        property_type, _built_form = get_property_type_and_built_form(
            property_meta=unitas_property, ha_name="HA50"
        )

        searcher = SearchEpc(
            address1=str(unitas_property["HouseNo"]),
            postcode=unitas_property["matching_postcode"],
            auth_token=epc_api_key,
            os_api_key="",
            property_type=property_type,
            full_address=unitas_property["matching_address"],
            fast=True
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None

        searcher.find_property(skip_os=True)
        if searcher.newest_epc is None:
            continue

        epc_data.append(
            {
                "asset_list_row_id": unitas_property["asset_list_row_id"],
                **searcher.newest_epc.copy()
            }
        )

    epc_df = pd.DataFrame(epc_data)
    # Pull out just the columns we need
    epc_df = epc_df[
        [
            "asset_list_row_id",
            "address1", "postcode",
            "current-energy-efficiency",
            "current-energy-rating",
            "inspection-date",
            "transaction-type",
            "built-form"
        ]
    ]

    # The rating is the numeric efficiency score immediately followed by the band letter
    epc_df["EPC Rating"] = (
        epc_df["current-energy-efficiency"].astype(str) +
        epc_df["current-energy-rating"].astype(str)
    )

    epc_export_columns = ["EPC Rating", "inspection-date", "transaction-type", "built-form"]

    # Merge onto the Unitas data:
    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
        epc_df[["asset_list_row_id"] + epc_export_columns],
        how="left",
        on="asset_list_row_id"
    )

    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
    )

    # Properties for which no EPC was retrieved get an explicit marker in every EPC column
    for col in epc_export_columns:
        unitas_properties_to_survey_full[col] = (
            unitas_properties_to_survey_full[col].fillna("No EPC found").astype(str)
        )

    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
        columns={
            "inspection-date": "Last EPC Inspection Date",
            "transaction-type": "Last EPC Reason",
            "built-form": "Last EPC Built Form",
        }
    )

    # ------------------------------------------------------------------
    # 5. Match to the survey outcomes
    # ------------------------------------------------------------------
    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
    )
    # The header of the OUTCOMES sheet sits on row 2; data starts on row 3
    unitas_outcomes = _worksheet_to_dataframe(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
    unitas_outcomes = unitas_outcomes.rename(
        columns={
            "Notes (If 'no answer' under outcomes, have you checked around the property for access "
            "issues where possible?)": "Notes"
        }
    )

    # Normalised postcodes of the properties to survey. Kept as a stand-alone Series
    # (aligned on the frame's index) so no helper column leaks into the export.
    survey_postcodes = (
        unitas_properties_to_survey_full["matching_postcode"].str.lower().str.replace(" ", "")
    )

    # NOTE(review): the matched outcomes are collected but not yet merged onto the
    # exported sheet; the loop currently only validates that no outcome is ambiguous.
    outcome_matching = []
    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
        if not isinstance(outcome["Postcode"], str):
            # Nothing to match on; treated like any other outcome with no match
            continue

        df = _match_by_postcode_and_house_number(
            unitas_properties_to_survey_full,
            survey_postcodes,
            _normalise_postcode(outcome["Postcode"]),
            outcome["No."],
        )
        if df.empty:
            # Outcome belongs to a property that is not in the list to survey
            continue

        if df.shape[0] > 1:
            raise ValueError(
                f"Survey outcome for {outcome['No.']}, {outcome['Postcode']} matched "
                f"{df.shape[0]} properties to survey, expected at most 1"
            )

        outcome_matching.append(
            {
                "asset_list_row_id": df["asset_list_row_id"].values[0],
                **outcome.to_dict()
            }
        )

    # ------------------------------------------------------------------
    # 6. Store as an excel
    # ------------------------------------------------------------------
    unitas_properties_to_survey_full.to_excel("Unitas - phase 2 properties to Survey.xlsx")
@@ -6907,81 +7240,3 @@ def app(): december_figures["ECO4 remaining"] ) december_figures["ECO4 remaining"].sum() - - # Adhoc - for UNITAS, stripping out additional surveys that have been completed - unitas_data = loader.data["HA50"].copy() - unitas_asset_list = unitas_data["asset_list"].copy() - unitas_survey_sheet = unitas_data["survey_list"].copy() - # We remove the surveyed properties from the asset sheet - unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])] - unitas_asset_list = unitas_asset_list.merge( - unitas_survey_sheet[["asset_list_row_id", "installation_status"]], - how="left", - on="asset_list_row_id" - ) - unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])] - unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"]) - - # We read in the data for the further completed surveys - unitas_phase_1_workbook = openpyxl.load_workbook( - "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx" - ) - phase_1_worksheet = unitas_phase_1_workbook["ECO 4 - PHASE 1"] - phase_2_worksheet = unitas_phase_1_workbook["ECO4 - PHASE 2"] - phase1_colnames = [cell.value for cell in phase_1_worksheet[1]] - phase_1_rows_data = [] - for row in phase_1_worksheet.iter_rows(min_row=2, values_only=False): - row_data = [cell.value for cell in row] # This will get you the cell values - phase_1_rows_data.append(row_data) - - phase_1_surveys = pd.DataFrame(phase_1_rows_data, columns=phase1_colnames) - - # Correct phase 1 surveys in the same fashion as the previous approach - phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy()) - - # We check all phase 1 surveys are contained in the data we had before - additional = [] - for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)): - # We look for the entry in the old survey sheet: - # matched_uprn = unitas_survey_sheet[unitas_survey_sheet["EPR UPRN NUMBER"] == 
row["UPRN"]] - # if matched_uprn.shape[0] == 1: - # continue - - matched_1 = unitas_survey_sheet[ - (unitas_survey_sheet["Post Code"] == row["Post Code"]) & - (unitas_survey_sheet["NO."] == row["NO."]) - ] - - if matched_1.shape[0] == 1: - continue - - matched_2 = unitas_survey_sheet[ - (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) & - (unitas_survey_sheet["NO."] == row["NO."]) - ] - - if matched_2.shape[0] == 1: - continue - - additional.append(row.to_dict()) - additional = pd.DataFrame(additional) - - phase_2_rows_data = [] - for row in phase_2_worksheet.iter_rows(min_row=2, values_only=False): - row_data = [cell.value for cell in row] # This will get you the cell values - phase_2_rows_data.append(row_data) - - phase2_colnames = [cell.value for cell in phase_2_worksheet[1]] - phase_2_surveys = pd.DataFrame(phase_2_rows_data, columns=phase2_colnames) - # Drop all of the occurances of "OFFICE USE ONLY" columns - phase_2_surveys = phase_2_surveys.drop(columns=[c for c in phase_2_surveys.columns if "OFFICE USE ONLY" in c]) - common_columns = list({c for c in phase_2_surveys.columns if c in additional.columns}) - additional_filtered = additional[common_columns] - - further_unitas_completed_surveys = pd.concat( - [phase_2_surveys, additional_filtered], - axis=0, - ignore_index=True - ) - - # We match these back to the asset list