merging EPC data and survey outcomes to asset list

2026-06-08 11:17:27 +00:00 · 2024-04-09 14:57:55 +01:00 · 2024-04-09 14:57:55 +01:00 · dc80313eca
commit dc80313eca
parent 0142e6fe5f
1 changed files with 334 additions and 79 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -3459,7 +3459,7 @@ class DataLoader:
                    "not eligible",
                    asset_list["ECO Eligibility"]
                )
-                asset_list = asset_list.drop(columns=["has_eco3"])
+                # asset_list = asset_list.drop(columns=["has_eco3"])

            # Report on sales
            sales_report = {}
@ -6778,6 +6778,339 @@ def identify_eco_works(loader):
    breakdowns = breakdowns.fillna(0)


+def _worksheet_to_dataframe(worksheet, header_row=1):
+    """
+    Load an openpyxl worksheet into a DataFrame.
+
+    :param worksheet: openpyxl worksheet to read
+    :param header_row: 1-based row holding the column names; every row below it is data
+    :return: DataFrame with one row per worksheet row beneath the header
+    """
+    colnames = [cell.value for cell in worksheet[header_row]]
+    rows = [list(values) for values in worksheet.iter_rows(min_row=header_row + 1, values_only=True)]
+    return pd.DataFrame(rows, columns=colnames)
+
+
+def unitas_data_prep(loader):
+    """
+    Adhoc - for UNITAS (HA50): build the list of properties still to be surveyed in phase 2.
+
+    Steps, in order:
+      1. Remove assets that already have a survey, either in the loaded survey list or in the
+         further completed surveys read from the UNITAS master rolling sheet.
+      2. Add back "subject to ciga" assets that the second round of CIGA checks found to have
+         no guarantee.
+      3. Attach the newest EPC found for each remaining property.
+      4. Match the recorded survey outcomes to those properties by postcode and house number.
+
+    Writes "Unitas - phase 2 properties to Survey.xlsx" to the working directory; returns None.
+
+    :param loader: object whose `data["HA50"]` holds the "asset_list" and "survey_list" DataFrames
+    :raises Exception: if a further completed survey does not match exactly one asset, or a survey
+        outcome matches more than one property
+    """
+    unitas_data = loader.data["HA50"].copy()
+    unitas_asset_list = unitas_data["asset_list"].copy()
+    unitas_survey_sheet = unitas_data["survey_list"].copy()
+
+    # We remove the surveyed properties from the asset sheet
+    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
+    unitas_asset_list = unitas_asset_list.merge(
+        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
+        how="left",
+        on="asset_list_row_id"
+    )
+    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
+    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])
+
+    # We read in the data for the further completed surveys
+    unitas_phase_1_workbook = openpyxl.load_workbook(
+        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
+    )
+    phase_1_surveys = _worksheet_to_dataframe(unitas_phase_1_workbook["ECO 4 - PHASE 1"])
+    phase_2_surveys = _worksheet_to_dataframe(unitas_phase_1_workbook["ECO4 - PHASE 2"])
+
+    # Correct phase 1 surveys in the same fashion as the previous approach
+    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())
+
+    # We check all phase 1 surveys are contained in the data we had before; a row counts as
+    # already known when exactly one old survey row shares its house number and either its
+    # postcode or its street / block name
+    additional = []
+    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
+        same_number = unitas_survey_sheet["NO."] == row["NO."]
+        matched_1 = unitas_survey_sheet[
+            (unitas_survey_sheet["Post Code"] == row["Post Code"]) & same_number
+        ]
+        if matched_1.shape[0] == 1:
+            continue
+
+        matched_2 = unitas_survey_sheet[
+            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) & same_number
+        ]
+        if matched_2.shape[0] == 1:
+            continue
+
+        additional.append(row.to_dict())
+    additional = pd.DataFrame(additional)
+
+    # Drop all of the occurrences of "OFFICE USE ONLY" columns; blank header cells come through as
+    # None, so only string headers are tested
+    phase_2_surveys = phase_2_surveys.drop(
+        columns=[c for c in phase_2_surveys.columns if isinstance(c, str) and "OFFICE USE ONLY" in c]
+    )
+    common_columns = list(dict.fromkeys(c for c in phase_2_surveys.columns if c in additional.columns))
+    additional_filtered = additional[common_columns]
+
+    further_unitas_completed_surveys = pd.concat(
+        [phase_2_surveys, additional_filtered],
+        axis=0,
+        ignore_index=True
+    )
+
+    # Add a phase 2 key
+    further_unitas_completed_surveys["survey_list_row_id"] = [
+        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
+    ]
+
+    # Surveys confirmed not to be in the asset list: skipped by row id or by (normalised) postcode.
+    # The ids are "unitas_phase_2" + row index, so they depend on the row order of the workbook.
+    not_in_asset_list = [
+        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
+    ]
+
+    additional_postcodes = ["st28bg"]
+
+    full_asset_list = unitas_data["asset_list"].copy()
+    full_asset_list["matching_postcode"] = full_asset_list["matching_postcode"].str.lower().str.replace(" ", "")
+    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
+        "ST 5DT", "ST3 5DT"
+    )
+
+    # We match these back to the asset list
+    matching_lookup = []
+    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):
+
+        if row["survey_list_row_id"] in not_in_asset_list:
+            continue
+
+        postcode_lower = row["Post Code"].lower().strip().replace(" ", "")
+        if postcode_lower in additional_postcodes:
+            continue
+
+        # Filter asset list on postcode
+        df = full_asset_list[
+            full_asset_list["matching_postcode"].str.contains(postcode_lower)
+        ]
+
+        df = df[df["HouseNo"] == str(row["NO."])]
+
+        if df.shape[0] != 1:
+            raise Exception("NOT FOUND")
+
+        matching_lookup.append(
+            {
+                "survey_list_row_id": row["survey_list_row_id"],
+                "asset_list_row_id": df["asset_list_row_id"].values[0],
+            }
+        )
+
+    matching_lookup = pd.DataFrame(matching_lookup)
+    matching_lookup["phase_2_surveyed"] = True
+
+    # We merge this onto the asset list and remove the rows
+    unitas_asset_list = unitas_asset_list.merge(
+        matching_lookup, how="left", on="asset_list_row_id"
+    )
+    # Drop rows where phase_2_surveyed is populated
+    unitas_asset_list = unitas_asset_list[
+        pd.isnull(unitas_asset_list["phase_2_surveyed"])
+    ]
+
+    # We add in the new CIGA submissions
+    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
+    ciga_round_2 = _worksheet_to_dataframe(unitas_round_2_ciga_workbook["Worksheet"])
+
+    # We pick out the asset list rows whose "ECO Eligibility" contains "subject to ciga"
+    ciga_dependent_asset_list = unitas_asset_list[
+        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga")
+    ].copy()
+
+    # We merge the ciga sheet to the asset list
+    ciga_round_2_matched = ciga_dependent_asset_list.merge(
+        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
+    )
+    # Filter on just the properties that had no guarantee
+    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]
+
+    # ECO Eligibility
+    # not eligible              9227
+    # failed ciga               2711
+    # eco4 (subject to ciga)    2238
+    # eco4 - passed ciga         901
+    # gbis                       114
+    # eco4                        91
+
+    # We filter on the properties we're looking to re-survey
+    unitas_properties_to_survey = unitas_asset_list[
+        unitas_asset_list["ECO Eligibility"].isin(
+            [
+                "eco4 - passed ciga",
+                "eco4"
+            ]
+        )
+    ].copy()
+
+    unitas_properties_to_survey = pd.concat(
+        [
+            unitas_properties_to_survey,
+            ciga_round_2_matched[unitas_properties_to_survey.columns]
+        ]
+    )
+
+    # NOTE(review): this credential is committed to source control; it should be rotated and read
+    # from configuration or the environment instead
+    epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA="
+
+    # We now retrieve the latest EPC data
+    epc_data = []
+    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
+        property_type, _ = get_property_type_and_built_form(property_meta=unitas_property, ha_name="HA50")
+
+        full_address = unitas_property["matching_address"]
+
+        searcher = SearchEpc(
+            address1=str(unitas_property["HouseNo"]),
+            postcode=unitas_property["matching_postcode"],
+            auth_token=epc_api_key,
+            os_api_key="",
+            property_type=property_type,
+            full_address=full_address,
+            fast=True
+        )
+        # Force the skipping of estimating the EPC
+        searcher.ordnance_survey_client.property_type = None
+        searcher.ordnance_survey_client.built_form = None
+
+        searcher.find_property(skip_os=True)
+        if searcher.newest_epc is None:
+            continue
+
+        epc = {
+            "asset_list_row_id": unitas_property["asset_list_row_id"],
+            **searcher.newest_epc.copy()
+        }
+
+        epc_data.append(epc)
+
+    epc_df = pd.DataFrame(epc_data)
+    # Pull out just the columns we need
+    epc_df = epc_df[
+        [
+            "asset_list_row_id",
+            "address1", "postcode",
+            "current-energy-efficiency",
+            "current-energy-rating",
+            "inspection-date",
+            "transaction-type",
+            "built-form"
+        ]
+    ]
+
+    epc_df["EPC Rating"] = (
+        epc_df["current-energy-efficiency"].astype(str) +
+        epc_df["current-energy-rating"].astype(str)
+    )
+
+    # Merge onto the Unitas data:
+    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
+        epc_df[
+            [
+                "asset_list_row_id",
+                "EPC Rating",
+                "inspection-date",
+                "transaction-type",
+                "built-form"
+            ]
+        ],
+        how="left",
+        on="asset_list_row_id"
+    )
+
+    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
+        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
+    )
+
+    # Properties with no EPC are labelled explicitly rather than left blank
+    for col in ["EPC Rating", "inspection-date", "transaction-type", "built-form"]:
+        unitas_properties_to_survey_full[col] = np.where(
+            pd.isnull(unitas_properties_to_survey_full[col]),
+            "No EPC found",
+            unitas_properties_to_survey_full[col]
+        )
+        unitas_properties_to_survey_full[col] = unitas_properties_to_survey_full[col].astype(str)
+
+    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
+        columns={
+            "inspection-date": "Last EPC Inspection Date",
+            "transaction-type": "Last EPC Reason",
+            "built-form": "Last EPC Built Form",
+        }
+    )
+
+    # We now match to the survey outcomes; the column names sit on row 2 of the OUTCOMES sheet
+    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
+        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
+    )
+    unitas_outcomes = _worksheet_to_dataframe(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
+    unitas_outcomes = unitas_outcomes.rename(
+        columns={
+            "Notes                 (If 'no answer' under outcomes, have you checked around the property for access "
+            "issues where possible?)": "Notes"
+        }
+    )
+
+    # Merge outcomes onto properties to survey. Will probably have to do algorithmically.
+    # The normalised postcodes are kept in a standalone Series (sharing the frame's index) so that
+    # no helper column ends up in the exported workbook
+    survey_postcodes_nospace = (
+        unitas_properties_to_survey_full["matching_postcode"].str.lower().str.replace(" ", "")
+    )
+    outcome_matching = []
+    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
+        # Rows without a postcode (e.g. blank spreadsheet rows) cannot be matched
+        if not isinstance(outcome["Postcode"], str):
+            continue
+
+        # We search for the corresponding entry in the properties to survey
+        postcode_lower = outcome["Postcode"].lower().strip().replace(" ", "")
+
+        # Filter the properties to survey on postcode
+        df = unitas_properties_to_survey_full[
+            survey_postcodes_nospace.str.contains(postcode_lower, regex=False, na=False)
+        ]
+
+        df = df[df["HouseNo"] == str(outcome["No."])]
+        if df.empty:
+            continue
+
+        if df.shape[0] == 1:
+            outcome_matching.append(
+                {
+                    "asset_list_row_id": df["asset_list_row_id"].values[0],
+                    **outcome.to_dict()
+                }
+            )
+            continue
+
+        raise Exception("something went wrong")
+
+    # NOTE(review): outcome_matching is built but not yet merged into the exported sheet
+
+    # Store as an excel
+    unitas_properties_to_survey_full.to_excel("Unitas - phase 2 properties to Survey.xlsx")
+
+
 def app():
    """
    This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107.
@ -6907,81 +7240,3 @@ def app():
        december_figures["ECO4 remaining"]
    )
    december_figures["ECO4 remaining"].sum()
-
-    # Adhoc - for UNITAS, stripping out additional surveys that have been completed
-    unitas_data = loader.data["HA50"].copy()
-    unitas_asset_list = unitas_data["asset_list"].copy()
-    unitas_survey_sheet = unitas_data["survey_list"].copy()
-    # We remove the surveyed properties from the asset sheet
-    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
-    unitas_asset_list = unitas_asset_list.merge(
-        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
-        how="left",
-        on="asset_list_row_id"
-    )
-    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
-    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])
-
-    # We read in the data for the further completed surveys
-    unitas_phase_1_workbook = openpyxl.load_workbook(
-        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
-    )
-    phase_1_worksheet = unitas_phase_1_workbook["ECO 4 - PHASE 1"]
-    phase_2_worksheet = unitas_phase_1_workbook["ECO4 - PHASE 2"]
-    phase1_colnames = [cell.value for cell in phase_1_worksheet[1]]
-    phase_1_rows_data = []
-    for row in phase_1_worksheet.iter_rows(min_row=2, values_only=False):
-        row_data = [cell.value for cell in row]  # This will get you the cell values
-        phase_1_rows_data.append(row_data)
-
-    phase_1_surveys = pd.DataFrame(phase_1_rows_data, columns=phase1_colnames)
-
-    # Correct phase 1 surveys in the same fashion as the previous approach
-    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())
-
-    # We check all phase 1 surveys are contained in the data we had before
-    additional = []
-    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
-        # We look for the entry in the old survey sheet:
-        # matched_uprn = unitas_survey_sheet[unitas_survey_sheet["EPR UPRN NUMBER"] == row["UPRN"]]
-        # if matched_uprn.shape[0] == 1:
-        #     continue
-
-        matched_1 = unitas_survey_sheet[
-            (unitas_survey_sheet["Post Code"] == row["Post Code"]) &
-            (unitas_survey_sheet["NO."] == row["NO."])
-            ]
-
-        if matched_1.shape[0] == 1:
-            continue
-
-        matched_2 = unitas_survey_sheet[
-            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
-            (unitas_survey_sheet["NO."] == row["NO."])
-            ]
-
-        if matched_2.shape[0] == 1:
-            continue
-
-        additional.append(row.to_dict())
-    additional = pd.DataFrame(additional)
-
-    phase_2_rows_data = []
-    for row in phase_2_worksheet.iter_rows(min_row=2, values_only=False):
-        row_data = [cell.value for cell in row]  # This will get you the cell values
-        phase_2_rows_data.append(row_data)
-
-    phase2_colnames = [cell.value for cell in phase_2_worksheet[1]]
-    phase_2_surveys = pd.DataFrame(phase_2_rows_data, columns=phase2_colnames)
-    # Drop all of the occurances of "OFFICE USE ONLY" columns
-    phase_2_surveys = phase_2_surveys.drop(columns=[c for c in phase_2_surveys.columns if "OFFICE USE ONLY" in c])
-    common_columns = list({c for c in phase_2_surveys.columns if c in additional.columns})
-    additional_filtered = additional[common_columns]
-
-    further_unitas_completed_surveys = pd.concat(
-        [phase_2_surveys, additional_filtered],
-        axis=0,
-        ignore_index=True
-    )
-
-    # We match these back to the asset list