diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index b5c6835b..baaa4050 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -159,19 +159,18 @@ class DataLoader: } UNMATCHED_CIGA = { - # We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not - # the asset list + "HA6": 117, "HA14": 3, "HA16": 7, - # There's just too many unmatched here - "HA6": 117, + "HA24": 12, "HA107": 51, } - def __init__(self, directories, december_figures_filepath, use_cache): + def __init__(self, directories, december_figures_filepath, use_cache, rebuild): self.directories = directories self.use_cache = use_cache self.december_figures_filepath = december_figures_filepath + self.rebuild = rebuild self.data = {} self.december_figures = None @@ -312,23 +311,20 @@ class DataLoader: return asset_list @staticmethod - def create_ciga_list_house_no(ha_name, ciga_list): + def create_ciga_list_house_no(ciga_list): """ This function will append the House number onto the asset list :return: """ - if ha_name in ["HA6", "HA14", "HA107", "HA16"]: - split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) - house_numbers = split_addresses[0].str.split(' ', expand=True) - # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how - # many columns there might be - house_numbers = house_numbers.iloc[:, 0:1] - house_numbers.columns = ['HouseNo'] + split_addresses = ciga_list['Matched Address'].str.split(',', expand=True) + house_numbers = split_addresses[0].str.split(' ', expand=True) + # THe first column should be HouseNo - we aren't interested in the other columns, but we don't know how + # many columns there might be + house_numbers = house_numbers.iloc[:, 0:1] + house_numbers.columns = ['HouseNo'] - ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1) - else: - raise NotImplementedError("Implement me") + ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1) return ciga_list @@ -447,7 +443,7 @@ class DataLoader: # Remove rows with missing postcode which happens in a small number of cases ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])] ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))] - ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list) + ciga_list = self.create_ciga_list_house_no(ciga_list) ciga_list = self.dedupe_ciga_list(ciga_list) ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name) @@ -800,6 +796,10 @@ class DataLoader: "st. leodegars close", "st leodegars close" ) + survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace( + "montgomery crescent", "montgomery road" + ) + return survey_list @staticmethod @@ -1102,16 +1102,18 @@ class DataLoader: for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]: self.december_figures[col] = self.december_figures[col].astype("Int64") - if self.use_cache: - self.data = read_pickle_from_s3( + if self.use_cache and not self.rebuild: + data = read_pickle_from_s3( bucket_name="retrofit-datalake-dev", s3_file_name="ha-analysis/batch3-inputs.pickle", ) - return + else: + data = {} - data = {} for filepath in self.directories: ha_name = filepath.split("/")[2] + if ha_name in data: + continue # Load asset list logger.info("Loading data for {}".format(ha_name)) asset_list, survey_list, ciga_list = self.load_asset_list( @@ -2635,6 +2637,10 @@ def forecast_remaining_sales(loader): # and I don't want the numbers to change too much, depenent on the CIGA conversation rate maximum_ciga_conversion = 0.75 + # This is a hard limit to the allowed conversion rates to final sale. These are typically very + # high but there are some anomalies, amongst surveys that are early on + sales_conversion_lower_bound = 0.8 + gbis_rate = 600 eco4_rate = 1710 # old_gbis_rate = 432 @@ -2796,14 +2802,30 @@ def forecast_remaining_sales(loader): eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates) gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates) + eco4_ciga_independent_passrates["conversion"] = ( + eco4_ciga_independent_passrates["# ECO4 successfully installed"] / + eco4_ciga_independent_passrates["# ECO4 at install stage"] + ) + eco4_ciga_independent_passrates_clipped = eco4_ciga_independent_passrates[ + eco4_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound + ] + + gbis_ciga_independent_passrates["conversion"] = ( + gbis_ciga_independent_passrates["# GBIS successfully installed"] / + gbis_ciga_independent_passrates["# GBIS at install stage"] + ) + gbis_ciga_independent_passrates_clipped = gbis_ciga_independent_passrates[ + gbis_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound + ] + median_eco4_to_install = ( - eco4_ciga_independent_passrates["# ECO4 successfully installed"].sum() / - eco4_ciga_independent_passrates["# ECO4 at install stage"].sum() + eco4_ciga_independent_passrates_clipped["# ECO4 successfully installed"].sum() / + eco4_ciga_independent_passrates_clipped["# ECO4 at install stage"].sum() ) median_gbis_to_install = ( - gbis_ciga_independent_passrates["# GBIS successfully installed"].sum() / - gbis_ciga_independent_passrates["# GBIS at install stage"].sum() + gbis_ciga_independent_passrates_clipped["# GBIS successfully installed"].sum() / + gbis_ciga_independent_passrates_clipped["# GBIS at install stage"].sum() ) # Produce the final output @@ -3270,6 +3292,8 @@ def app(): use_cache = True # Determines if we want to perform the data pull pull_data = False + # Override to re-build all inputs + rebuild_inputs = False # List all of the data in the folder @@ -3278,12 +3302,11 @@ def app(): # Grab the December HA figures filepath december_figures_filepath = "local_data/ha_data/HA_December_figures.csv" - # priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"] - priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"] + priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA25", "HA39", "HA107"] # Filter down the directories to only the priority HAs directories = [d for d in directories if d.split("/")[2] in priority_has] - loader = DataLoader(directories, december_figures_filepath, use_cache) + loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs) loader.load() loader.ha_facts_and_figures()