From 8b48dbac9e5e9f25e3c738c1322b1f3a9fbb11db Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 26 Feb 2024 13:37:50 +0000 Subject: [PATCH] working on eco eligibility code --- .../ha_15_32/ha_analysis_batch_3.py | 153 ++++++++++++++---- 1 file changed, 122 insertions(+), 31 deletions(-) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index dbe12e92..fdc00876 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -725,6 +725,13 @@ class DataLoader: def load(self): + # Get the december figures, which is just a csv + self.december_figures = pd.read_csv(self.december_figures_filepath) + # Remove the spaces in HA Name + self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "") + self.december_figures["ECO4"] = self.december_figures["ECO4"].astype("Int64") + self.december_figures["GBIS"] = self.december_figures["GBIS"].astype("Int64") + if self.use_cache: self.data = read_pickle_from_s3( bucket_name="retrofit-datalake-dev", @@ -732,11 +739,6 @@ class DataLoader: ) return - # Get the december figures, which is just a csv - self.december_figures = pd.read_csv(self.december_figures_filepath) - # Remove the spaces in HA Name - self.december_figures["HA Name"] = december_figures["HA Name"].str.replace(" ", "") - data = {} for filepath in self.directories: ha_name = filepath.split("/")[2] @@ -768,46 +770,135 @@ class DataLoader: This function will return a dictionary of facts and figures for each HA :return: """ + + scheme_map = { + "ECO4": "ECO4", + "AFFORDABLE WARMTH": "ECO4", + } + + eco_eligibility_map = { + "not eligble": "not eligible" + } + ha_facts_and_figures = [] for ha_name, data_assets in self.data.items(): asset_list = data_assets["asset_list"].copy() survey_list = data_assets["survey_list"].copy() ciga_list = data_assets["ciga_list"].copy() - asset_list["ECO Eligibility"].value_counts() + # Change the column name if it's ECO eligibility + asset_list = asset_list.rename(columns={"ECO eligibility": "ECO Eligibility"}) + # Remove surplus whitespace from the ECO Eligibility column + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip() + # Push to lower case + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower() + # Remap + asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].map(eco_eligibility_map) - # We merge on ciga and update the status to reflect if it has failed ciga or not - # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA - # check - asset_list = asset_list.merge( - ciga_list[["asset_list_row_id", "Guarantee"]], - how='left', - on="asset_list_row_id" - ) + if not ciga_list.empty: + # We merge on ciga and update the status to reflect if it has failed ciga or not + # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA + # check + asset_list = asset_list.merge( + ciga_list[["asset_list_row_id", "Guarantee"]], + how='left', + on="asset_list_row_id" + ) - asset_list["ECO Eligibility"].value_counts() + asset_list["ECO Eligibility"].value_counts() - asset_list["ECO Eligibility"] = np.where( - ( - asset_list["ECO Eligibility"].str.contains("(Subject to CIGA)", regex=False) & - (asset_list["Guarantee"] == "Yes") - ), - "Failed CIGA", - asset_list["ECO Eligibility"] - ) + asset_list["ECO Eligibility"] = np.where( + ( + asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) & + (asset_list["Guarantee"] == "Yes") + ), + "failed ciga", + asset_list["ECO Eligibility"] + ) - # We replace any remaining "Subject to CIGA" with pass Ciga - asset_list["ECO Eligibility"] = np.where( - asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False), - "Pass CIGA", - asset_list["ECO Eligibility"] - ) + # We replace any remaining "Subject to CIGA" with pass Ciga + asset_list["ECO Eligibility"] = np.where( + asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False), + "eco4 - passed ciga", + asset_list["ECO Eligibility"] + ) - asset_list = asset_list.drop(columns=["Guarantee"]) + asset_list = asset_list.drop(columns=["Guarantee"]) - # Update the asset list with the categorisations + # Update the asset list with the categorisations and rename changes self.data[ha_name]["asset_list"] = asset_list + # Report on sales + sales_report = {} + if not survey_list.empty: + scheme_column = survey_list.columns[0] + # We clean up the survey list installation or cancelled + survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower() + # Remove all punctuation + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( + r'[^\w\s]', '', regex=True + ) + # Remove double spaces + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace( + r'\s+', ' ', regex=True + ) + # Remove trailing spaces + survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip() + + # Remap the values in the scheme column + survey_list[scheme_column] = survey_list[scheme_column].map(scheme_map) + + survey_list["installation_status"] = None + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]), + "installed", + survey_list["installation_status"] + ) + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["cancelled"]), + "cancelled", + survey_list["installation_status"] + ) + # Find partial installations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"), + "partially installed", + survey_list["installation_status"] + ) + # Find partial cancellations + # TODO: We might have more indications of partial cancellations + survey_list["installation_status"] = np.where( + survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]), + "partially cancelled", + survey_list["installation_status"] + ) + + # Finally, for other cases, we set the status to "in progress" + survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress") + + # We concatenate the scheme name with the installation status + survey_list["installation_status"] = ( + survey_list[scheme_column] + " - " + survey_list["installation_status"] + ) + + # We get the sales + sales_report = survey_list["installation_status"].value_counts().to_dict() + + ha_facts_and_figures.append( + { + "HA Name": ha_name, + **asset_list["ECO Eligibility"].value_counts().to_dict(), + **sales_report + } + ) + + ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures) + ha_facts_and_figures = ha_facts_and_figures.drop( + columns=["not eligible"] + ) + + ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name") + return ha_facts_and_figures