Added HA25

2026-07-27 23:35:01 +00:00 · 2024-03-01 23:48:27 +00:00 · 2024-03-01 23:48:27 +00:00 · d9e9be4389
commit d9e9be4389
parent 46f5ee8ea4
1 changed files with 51 additions and 28 deletions
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -159,19 +159,18 @@ class DataLoader:
    }

    UNMATCHED_CIGA = {
-        # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not
-        # the asset list
+        "HA6": 117,
        "HA14": 3,
        "HA16": 7,
-        # There's just too many unmatched here
-        "HA6": 117,
+        "HA24": 12,
        "HA107": 51,
    }

-    def __init__(self, directories, december_figures_filepath, use_cache):
+    def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
        self.directories = directories
        self.use_cache = use_cache
        self.december_figures_filepath = december_figures_filepath
+        self.rebuild = rebuild

        self.data = {}
        self.december_figures = None
@ -312,23 +311,20 @@ class DataLoader:
        return asset_list

    @staticmethod
-    def create_ciga_list_house_no(ha_name, ciga_list):
+    def create_ciga_list_house_no(ciga_list):
        """
        This function will append the House number onto the asset list
        :return:
        """

-        if ha_name in ["HA6", "HA14", "HA107", "HA16"]:
-            split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
-            house_numbers = split_addresses[0].str.split(' ', expand=True)
-            # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how
-            # many columns there might be
-            house_numbers = house_numbers.iloc[:, 0:1]
-            house_numbers.columns = ['HouseNo']
+        split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
+        house_numbers = split_addresses[0].str.split(' ', expand=True)
+        # The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
+        # many columns there might be
+        house_numbers = house_numbers.iloc[:, 0:1]
+        house_numbers.columns = ['HouseNo']

-            ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1)
-        else:
-            raise NotImplementedError("Implement me")
+        ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1)

        return ciga_list

@ -447,7 +443,7 @@ class DataLoader:
            # Remove rows with missing postcode which happens in a small number of cases
            ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
            ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
-            ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
+            ciga_list = self.create_ciga_list_house_no(ciga_list)
            ciga_list = self.dedupe_ciga_list(ciga_list)
            ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)

@ -800,6 +796,10 @@ class DataLoader:
            "st. leodegars close", "st leodegars close"
        )

+        survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
+            "montgomery crescent", "montgomery road"
+        )
+
        return survey_list

    @staticmethod
@ -1102,16 +1102,18 @@ class DataLoader:
        for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]:
            self.december_figures[col] = self.december_figures[col].astype("Int64")

-        if self.use_cache:
-            self.data = read_pickle_from_s3(
+        if self.use_cache and not self.rebuild:
+            data = read_pickle_from_s3(
                bucket_name="retrofit-datalake-dev",
                s3_file_name="ha-analysis/batch3-inputs.pickle",
            )
-            return
+        else:
+            data = {}

-        data = {}
        for filepath in self.directories:
            ha_name = filepath.split("/")[2]
+            if ha_name in data:
+                continue
            # Load asset list
            logger.info("Loading data for {}".format(ha_name))
            asset_list, survey_list, ciga_list = self.load_asset_list(
@ -2635,6 +2637,10 @@ def forecast_remaining_sales(loader):
    # and I don't want the numbers to change too much, depenent on the CIGA conversation rate
    maximum_ciga_conversion = 0.75

+    # This is a hard lower bound on the conversion rate to final sale: rows converting below it are
+    # excluded. Rates are typically very high, but there are some anomalies among early-stage surveys
+    sales_conversion_lower_bound = 0.8
+
    gbis_rate = 600
    eco4_rate = 1710
    # old_gbis_rate = 432
@ -2796,14 +2802,30 @@ def forecast_remaining_sales(loader):
    eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates)
    gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates)

+    eco4_ciga_independent_passrates["conversion"] = (
+        eco4_ciga_independent_passrates["# ECO4 successfully installed"] /
+        eco4_ciga_independent_passrates["# ECO4 at install stage"]
+    )
+    eco4_ciga_independent_passrates_clipped = eco4_ciga_independent_passrates[
+        eco4_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound
+        ]
+
+    gbis_ciga_independent_passrates["conversion"] = (
+        gbis_ciga_independent_passrates["# GBIS successfully installed"] /
+        gbis_ciga_independent_passrates["# GBIS at install stage"]
+    )
+    gbis_ciga_independent_passrates_clipped = gbis_ciga_independent_passrates[
+        gbis_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound
+        ]
+
    median_eco4_to_install = (
-        eco4_ciga_independent_passrates["# ECO4 successfully installed"].sum() /
-        eco4_ciga_independent_passrates["# ECO4 at install stage"].sum()
+        eco4_ciga_independent_passrates_clipped["# ECO4 successfully installed"].sum() /
+        eco4_ciga_independent_passrates_clipped["# ECO4 at install stage"].sum()
    )

    median_gbis_to_install = (
-        gbis_ciga_independent_passrates["# GBIS successfully installed"].sum() /
-        gbis_ciga_independent_passrates["# GBIS at install stage"].sum()
+        gbis_ciga_independent_passrates_clipped["# GBIS successfully installed"].sum() /
+        gbis_ciga_independent_passrates_clipped["# GBIS at install stage"].sum()
    )

    # Produce the final output
@ -3270,6 +3292,8 @@ def app():
    use_cache = True
    # Determines if we want to perform the data pull
    pull_data = False
+    # Override to re-build all inputs, bypassing the cache even when use_cache is True
+    rebuild_inputs = False

    # List all of the data in the folder

@ -3278,12 +3302,11 @@ def app():
    # Grab the December HA figures filepath
    december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"

-    # priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"]
-    priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"]
+    priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA25", "HA39", "HA107"]
    # Filter down the directories to only the priority HAs
    directories = [d for d in directories if d.split("/")[2] in priority_has]

-    loader = DataLoader(directories, december_figures_filepath, use_cache)
+    loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs)
    loader.load()
    loader.ha_facts_and_figures()