From 55e28942e48bb8cf55e7c95875533710d7e21ea1 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 29 Jan 2024 12:13:22 +0000 Subject: [PATCH] Added automated creation of excel and added missing files to git --- etl/eligibility/Eligibility.py | 28 +- .../ha_15_32/WFT Sales data analysis.py | 665 ++++++++++++++++++ etl/eligibility/ha_15_32/cancellation.py | 113 +++ .../ha_15_32/ha_analysis_batch_3.py | 100 ++- 4 files changed, 876 insertions(+), 30 deletions(-) create mode 100644 etl/eligibility/ha_15_32/WFT Sales data analysis.py create mode 100644 etl/eligibility/ha_15_32/cancellation.py diff --git a/etl/eligibility/Eligibility.py b/etl/eligibility/Eligibility.py index 1d868338..906ff594 100644 --- a/etl/eligibility/Eligibility.py +++ b/etl/eligibility/Eligibility.py @@ -128,7 +128,7 @@ class Eligibility: if insulation_thickness <= 100: thickness_classification = "0-100mm" - elif insulation_thickness <= 270: + elif insulation_thickness <= high_loft_thickness_threshold: thickness_classification = "100-270mm" else: thickness_classification = "270mm+" @@ -146,24 +146,14 @@ class Eligibility: "thickness_classification": thickness_classification } - if insulation_thickness <= high_loft_thickness_threshold: - self.loft = { - "suitability": True, - "thickness": insulation_thickness, - "reason": "high loft thickness but below regulation", - "thickness_classification": thickness_classification - } - return - - if insulation_thickness > high_loft_thickness_threshold: - # Insulation is already thick enough - self.loft = { - "suitability": False, - "thickness": insulation_thickness, - "reason": "existing insulation", - "thickness_classification": thickness_classification - } - return + # Insulation is already thick enough + self.loft = { + "suitability": False, + "thickness": insulation_thickness, + "reason": "existing insulation", + "thickness_classification": thickness_classification + } + return def cavity_insulation(self): diff --git 
a/etl/eligibility/ha_15_32/WFT Sales data analysis.py b/etl/eligibility/ha_15_32/WFT Sales data analysis.py new file mode 100644 index 00000000..a088fe43 --- /dev/null +++ b/etl/eligibility/ha_15_32/WFT Sales data analysis.py @@ -0,0 +1,665 @@ +import numpy as np +import pandas as pd + +ECO4_NEW_RATES = 1710 +GBIS_NEW_RATES = 600 + + +def app(): + # Load in the excel + nov_ha_data = pd.read_excel( + 'etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx', + ) + # Drop rows where HA name is null + nov_ha_data = nov_ha_data.dropna(subset=["HA Name"]) + nov_ha_data["ha_number"] = nov_ha_data["HA Name"].str.extract(r"(\d+)").astype(int) + nov_ha_data = nov_ha_data.sort_values("ha_number", ascending=True) + + variance_explanations = pd.read_excel( + 'etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx', + sheet_name="Variance explanations" + ) + + september_figures = pd.read_excel( + "etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS SEP 23 UPDATE (2).xlsx", + sheet_name="HA Stats" + ) + + historical_invoices = pd.read_excel( + "etl/eligibility/ha_15_32/ALL HA FIGURES AND ASSIGNED INSTALLERS 21.11.2023 with sales data.xlsx", + sheet_name="Jul 22 to Oct 23" + ) + # Drop rows where installer rates is null + historical_invoices = historical_invoices[~pd.isnull(historical_invoices["INSTALLER RATES"])] + historical_invoices = historical_invoices[historical_invoices["INSTALLER RATES"] != "NA "] + # By Scheme, take a weighted mean of the INSTALLER RATES, weighted on the number of rows + n_invoices = historical_invoices.groupby(["Scheme", "INSTALLER RATES"])["Invoice number"].count().reset_index() + n_invoices = n_invoices[n_invoices["Scheme"].isin(["Eco 4", "GBIS"])] + historical_scheme_rates = n_invoices.groupby("Scheme").apply( + lambda x: np.average(x["INSTALLER RATES"], weights=x["Invoice number"]) + ).reset_index().rename(columns={0: "Historical rates"}) + + # we take just 
entries sales data that have sales > 0 + sales_data = nov_ha_data[nov_ha_data["Sales"] > 0] + + # We now need to adjust sales data depending on the variance explanations + sales_data = sales_data.merge( + variance_explanations[["HA", 'Which figure is correct']], + how="left", + left_on="ha_number", + right_on="HA" + ) + + def adjust_sales(row): + if pd.isnull(row["Which figure is correct"]): + return row["Sales"] + + if row["Which figure is correct"] == "HA facts & figures": + return row['No. of Tech surveys complete'] + + if row["Which figure is correct"] == "Billed amount": + return row["Sales"] + + if row["Which figure is correct"] in ["Both correct, HA facts and figures includes November", "Both correct"]: + return row["Sales"] + + raise ValueError(f"Unknown value for 'Which figure is correct': {row['Which figure is correct']}") + + # We now need to adjust sales data depending on the variance explanations + sales_data["adjusted_sales"] = sales_data.apply(lambda row: adjust_sales(row), axis=1) + + # We therefore adjust GBIS and ECO4 sales data based on adjusted sales + sales_data["adjusted_eco4_sales"] = sales_data["No. of Tech surveys complete - Eco 4"] / sales_data["Sales"] * \ + sales_data["adjusted_sales"] + + sales_data["adjusted_gbis_sales"] = sales_data["No. of Tech surveys complete - GBIS"] / sales_data["Sales"] * \ + sales_data["adjusted_sales"] + + sales_data["cancellation_rate"] = (sales_data["Sales"] - sales_data["adjusted_sales"]) / sales_data["Sales"] + + # The difference between the adjusted sales and the actual sales is the cancellation + cancellations = (sales_data["adjusted_sales"].sum() - sales_data["Sales"].sum()) / sales_data["Sales"].sum() + + # Given the cancellations, we can now adjust the expected remaining surveys + sales_data["No. of Tech surveys remaining"] = sales_data["No. 
of Tech surveys remaining"] * ( + 1 - sales_data["cancellation_rate"] + ) + + # We now merge on the expected values for September + sales_data = sales_data.merge( + september_figures[["Redacted HA", "ECO4", "GBIS"]].rename( + columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"} + ), + how="left", + on="HA Name", + ) + + sales_data["Sept Expected ECO4"] = sales_data["Sept Expected ECO4"].fillna(0) + sales_data["Sept Expected GBIS"] = sales_data["Sept Expected GBIS"].fillna(0) + + # We calculate the ECO4 and GBIS conversion rates with the adjusted numbers + sales_data["ECO4 Conversion"] = sales_data["adjusted_eco4_sales"] / sales_data["adjusted_sales"] + sales_data["GBIS Conversion"] = sales_data["adjusted_gbis_sales"] / sales_data["adjusted_sales"] + + # We now calculate the expected remaining ECO4 and GBIS sales + # We take the number of remaining surveys and multiply by the conversion rate for each scheme, which tells us + # how many more we should expect to see + sales_data["Expected Remaining ECO4"] = sales_data["No. of Tech surveys remaining"] * sales_data["ECO4 Conversion"] + sales_data["Expected Remaining GBIS"] = sales_data["No. 
of Tech surveys remaining"] * sales_data["GBIS Conversion"] + + # We now produce a forecasted ECO4 and GBIS sales figure + sales_data["Forecasted ECO4 Sales"] = sales_data["adjusted_eco4_sales"] + sales_data["Expected Remaining ECO4"] + sales_data["Forecasted GBIS Sales"] = sales_data["adjusted_gbis_sales"] + sales_data["Expected Remaining GBIS"] + + # Take the columns we're interested in + # HA # Properties Sept ECO4 Figures Sept GBIS Figures Nov Total Sales Nov ECO4 Sales Nov GBIS Sales + # Remaining Surveys ECO4 conversion GBIS conversion Forecasted ECO4 Sales Forecasted GBIS sales ECO4 Change + # GBIS Change + sales_data_formatted = sales_data[[ + "HA Name", + "ASSET LIST no.", + "Sept Expected ECO4", + "Sept Expected GBIS", + "adjusted_sales", + "adjusted_eco4_sales", + "adjusted_gbis_sales", + "No. of Tech surveys remaining", + "ECO4 Conversion", + "GBIS Conversion", + "Forecasted ECO4 Sales", + "Forecasted GBIS Sales" + ]].rename( + columns={ + "adjusted_sales": "Oct Total Sales (adjusted for variance)", + "adjusted_eco4_sales": "Oct ECO4 Sales (adjusted for variance)", + "adjusted_gbis_sales": "Oct GBIS Sales (adjusted for variance)", + "No. of Tech surveys remaining": "Remaining Surveys", + } + ) + + # Convert columns which should be integers to integers + for col in ["ASSET LIST no.", "Remaining Surveys", "Sept Expected ECO4", "Sept Expected GBIS", + "Oct Total Sales (adjusted for variance)", "Oct ECO4 Sales (adjusted for variance)", + "Oct GBIS Sales (adjusted for variance)", "Forecasted ECO4 Sales", "Forecasted GBIS Sales"]: + sales_data_formatted[col] = sales_data_formatted[col].fillna(0) + sales_data_formatted[col] = sales_data_formatted[col].astype(int) + + # Remove HA 17 because this was EPCs only.
We also remove HA33 because they do not have access to the full portfolio + sales_data_formatted = sales_data_formatted[ + ~sales_data_formatted["HA Name"].isin(["HA 17", "HA 33"]) + ] + + # September expected ECO4 and GBIS + sept_expected_eco4 = sales_data_formatted["Sept Expected ECO4"].sum() + sept_expected_gbis = sales_data_formatted["Sept Expected GBIS"].sum() + + # Completed so far + oct_eco4_sales = sales_data_formatted["Oct ECO4 Sales (adjusted for variance)"].sum() + oct_gbis_sales = sales_data_formatted["Oct GBIS Sales (adjusted for variance)"].sum() + + # Forecasted figures + forecasted_eco4_sales = sales_data_formatted["Forecasted ECO4 Sales"].sum() + forecasted_gbis_sales = sales_data_formatted["Forecasted GBIS Sales"].sum() + + # Expected remaining sales + expected_remaining_eco4_sales = forecasted_eco4_sales - oct_eco4_sales + expected_remaining_gbis_sales = forecasted_gbis_sales - oct_gbis_sales + + # Forecast change vs September + forecasted_eco4_change = 100 * (forecasted_eco4_sales - sept_expected_eco4) / sept_expected_eco4 + forecasted_gbis_change = 100 * (forecasted_gbis_sales - sept_expected_gbis) / sept_expected_gbis + + aggregates = pd.DataFrame( + columns=["Scheme", "Sept Expected", "Oct Completed", "Forecasted Remaining Sales", "Forecasted Total Sales", + "Forecasted Change vs Sept"], + data=[ + ["ECO4", sept_expected_eco4, oct_eco4_sales, expected_remaining_eco4_sales, forecasted_eco4_sales, + forecasted_eco4_change], + ["GBIS", sept_expected_gbis, oct_gbis_sales, expected_remaining_gbis_sales, forecasted_gbis_sales, + forecasted_gbis_change], + ] + ) + + # Multiply by historical rates to get revenue + # For ECO4, this is ~£1456 and for GBIS it's ~£432 + historical_gbis_price = historical_scheme_rates[ + historical_scheme_rates["Scheme"] == "GBIS" + ]["Historical rates"].iloc[0] + + historical_eco4_price = historical_scheme_rates[ + historical_scheme_rates["Scheme"] == "Eco 4" + ]["Historical rates"].iloc[0] + + aggregates["Sept Expected
Revenue"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Sept Expected"] * historical_eco4_price, + aggregates["Sept Expected"] * historical_gbis_price + ) + + aggregates["Completed Revenue"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Oct Completed"] * historical_eco4_price, + aggregates["Oct Completed"] * historical_gbis_price + ) + + # We use the new rates for the forecasted revenue + aggregates["Forecasted Remaining Revenue"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Forecasted Remaining Sales"] * ECO4_NEW_RATES, + aggregates["Forecasted Remaining Sales"] * GBIS_NEW_RATES + ) + + # We also calculate the forecasted remaining revenue at the original price + aggregates["Forecasted Remaining Revenue (original price)"] = np.where( + aggregates["Scheme"] == "ECO4", + aggregates["Forecasted Remaining Sales"] * historical_eco4_price, + aggregates["Forecasted Remaining Sales"] * historical_gbis_price + ) + + aggregates["Forecasted Revenue"] = aggregates["Completed Revenue"] + aggregates["Forecasted Remaining Revenue"] + + # Forecasted revenue with original price + aggregates["Forecasted Revenue (original price)"] = ( + aggregates["Completed Revenue"] + aggregates["Forecasted Remaining Revenue (original price)"] + ) + + # Create a totals row which sums up the two rows + + forecasted_change_vs_sept = 100 * ( + aggregates["Forecasted Total Sales"].sum() - aggregates["Sept Expected"].sum() + ) / aggregates["Sept Expected"].sum() + + aggregates = pd.concat( + [ + aggregates, + pd.DataFrame( + [ + ["Total", aggregates["Sept Expected"].sum(), aggregates["Oct Completed"].sum(), + aggregates["Forecasted Remaining Sales"].sum(), aggregates["Forecasted Total Sales"].sum(), + forecasted_change_vs_sept, + aggregates["Sept Expected Revenue"].sum(), aggregates["Completed Revenue"].sum(), + aggregates["Forecasted Remaining Revenue"].sum(), + aggregates["Forecasted Remaining Revenue (original price)"].sum(), + aggregates["Forecasted 
Revenue"].sum(), + aggregates["Forecasted Revenue (original price)"].sum(), + ] + ], + columns=aggregates.columns + ) + ] + ) + + # For each property in the asset list, we now calculate an average conversion rate to ECO4 and GBIS + # We do this by taking the forecasted sales values for each scheme and dividing by the number of properties + + number_properties = sales_data_formatted["ASSET LIST no."].sum() + eco4_conversion_rate = forecasted_eco4_sales / number_properties + gbis_conversion_rate = forecasted_gbis_sales / number_properties + + # We also attribute a future value per property + future_eco4_value = ECO4_NEW_RATES * eco4_conversion_rate + future_gbis_value = GBIS_NEW_RATES * gbis_conversion_rate + + # We also calculate a revenue figure for the old rates + historical_eco4_value = historical_eco4_price * eco4_conversion_rate + historical_gbis_value = historical_gbis_price * gbis_conversion_rate + + # For the HAs that have not begun selling, we estimate the value of the projects + # We start with some problem HAs + + # HA 7, HA 24, HA 25 + # These HAs have no sales data, so we use the expected figures + + problem_has_data = nov_ha_data[ + (nov_ha_data["HA Name"].isin(["HA 7", "HA 24", "HA 25"])) + ].copy() + # Merge on the september expected figures + problem_has_data = problem_has_data.merge( + september_figures[["Redacted HA", "ECO4", "GBIS"]].rename( + columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"} + ), + how="left", + on="HA Name", + ) + # Fill NAs + problem_has_data["Sept Expected ECO4"] = problem_has_data["Sept Expected ECO4"].fillna(0) + problem_has_data["Sept Expected GBIS"] = problem_has_data["Sept Expected GBIS"].fillna(0) + + # We now calculate the expected ECO4 and GBIS sales based on the average conversion rates + problem_has_data["Expected ECO4 Sales"] = problem_has_data["ASSET LIST no."] * eco4_conversion_rate + problem_has_data["Expected GBIS Sales"] = problem_has_data["ASSET LIST no."] *
gbis_conversion_rate + + # Filter just on columns we're interested in + problem_has_data = problem_has_data[[ + "HA Name", + "ASSET LIST no.", + "Sept Expected ECO4", + "Sept Expected GBIS", + "ECO4", + "GBIS", + "Expected ECO4 Sales", + "Expected GBIS Sales" + ]].rename( + columns={ + "ECO4": "Nov Expected ECO4", + "GBIS": "Nov Expected GBIS", + } + ) + + # Fill NAs + problem_has_data["Nov Expected ECO4"] = problem_has_data["Nov Expected ECO4"].fillna(0) + problem_has_data["Nov Expected GBIS"] = problem_has_data["Nov Expected GBIS"].fillna(0) + + # We calculate HA level Sept, Nov expected revenue, based on historical rates and then forecasted revenue + problem_has_data["Sept Expected ECO4 Value"] = problem_has_data["Sept Expected ECO4"] * historical_eco4_price + problem_has_data["Sept Expected GBIS Value"] = problem_has_data["Sept Expected GBIS"] * historical_gbis_price + + problem_has_data["Nov Expected ECO4 Value"] = problem_has_data["Nov Expected ECO4"] * historical_eco4_price + problem_has_data["Nov Expected GBIS Value"] = problem_has_data["Nov Expected GBIS"] * historical_gbis_price + + problem_has_data["Forecasted ECO4 Revenue"] = problem_has_data["ASSET LIST no."] * future_eco4_value + problem_has_data["Forecasted GBIS Revenue"] = problem_has_data["ASSET LIST no."] * future_gbis_value + + # Totals + problem_has_data["Sept Expected Total Value"] = problem_has_data["Sept Expected ECO4 Value"] + \ + problem_has_data["Sept Expected GBIS Value"] + problem_has_data["Nov Expected Total Value"] = problem_has_data["Nov Expected ECO4 Value"] + \ + problem_has_data["Nov Expected GBIS Value"] + problem_has_data["Forecasted Total Revenue"] = problem_has_data["Forecasted ECO4 Revenue"] + \ + problem_has_data["Forecasted GBIS Revenue"] + + # We calculate a total expected value for September, November and then forecasted + problem_has_expected_eco4_value = problem_has_data["Sept Expected ECO4"].sum() * historical_eco4_price + problem_has_expected_gbis_value = 
problem_has_data["Sept Expected GBIS"].sum() * historical_gbis_price + problem_has_expected_total_value = problem_has_expected_eco4_value + problem_has_expected_gbis_value + + problem_has_nov_eco4_value = problem_has_data["Nov Expected ECO4"].sum() * historical_eco4_price + problem_has_nov_gbis_value = problem_has_data["Nov Expected GBIS"].sum() * historical_gbis_price + problem_has_nov_total_value = problem_has_nov_eco4_value + problem_has_nov_gbis_value + + forecasted_eco4_value = problem_has_data["ASSET LIST no."].sum() * future_eco4_value + forecasted_gbis_value = problem_has_data["ASSET LIST no."].sum() * future_gbis_value + problem_has_forecasted_total_value = forecasted_eco4_value + forecasted_gbis_value + + problem_has_summary = pd.DataFrame( + columns=["Scheme", "Sept Expected", "Nov Expected", "Forecasted"], + data=[ + ["ECO4", problem_has_expected_eco4_value, problem_has_nov_eco4_value, forecasted_eco4_value], + ["GBIS", problem_has_expected_gbis_value, problem_has_nov_gbis_value, forecasted_gbis_value], + ["Total", problem_has_expected_total_value, problem_has_nov_total_value, problem_has_forecasted_total_value] + ] + ) + + # We now also estimate the value of the remaining HAs based on historical sales performance and new rates + # We take the has that are not in the sales data + remaining_has = nov_ha_data[ + ~nov_ha_data["HA Name"].isin(sales_data_formatted["HA Name"]) + ].copy() + + # Merge on the september expected figures + remaining_has = remaining_has.merge( + september_figures[["Redacted HA", "ECO4", "GBIS"]].rename( + columns={"Redacted HA": "HA Name", "ECO4": "Sept Expected ECO4", "GBIS": "Sept Expected GBIS"} + ), + how="left", + on="HA Name", + ) + + # We update the asset list size for HA 33, because they do not have access to the full portfolio + remaining_has.loc[remaining_has["HA Name"] == "HA 33", "ASSET LIST no."] = 20699 + # We also remove HA 17 + remaining_has = remaining_has[~remaining_has["HA Name"].isin(["HA 17"])] + + # We now 
calculate the expected ECO4 and GBIS sales based on the average conversion rates + remaining_has["Expected ECO4 Sales"] = remaining_has["ASSET LIST no."] * eco4_conversion_rate + remaining_has["Expected GBIS Sales"] = remaining_has["ASSET LIST no."] * gbis_conversion_rate + + # Filter just on columns we're interested in + remaining_has = remaining_has[[ + "HA Name", + "ASSET LIST no.", + "Sept Expected ECO4", + "Sept Expected GBIS", + "ECO4", + "GBIS", + ]].rename( + columns={ + "ECO4": "Nov Expected ECO4", + "GBIS": "Nov Expected GBIS", + } + ) + + remaining_has = remaining_has.fillna(0) + + # We take just HAs that had an initial september expectation for ECO4 or GBIS, or that now have a Nov expectation + remaining_has = remaining_has[ + (remaining_has["Sept Expected ECO4"] > 0) | (remaining_has["Sept Expected GBIS"] > 0) | + (remaining_has["Nov Expected ECO4"] > 0) | (remaining_has["Nov Expected GBIS"] > 0) + ] + + # Expected sales based on asset list size and conversion rate + remaining_has["Forecasted Sales ECO4"] = remaining_has["ASSET LIST no."] * eco4_conversion_rate + remaining_has["Forecasted Sales GBIS"] = remaining_has["ASSET LIST no."] * gbis_conversion_rate + + # Calculate the total expected value for September and November + remaining_has["Sept Expected ECO4 Value"] = remaining_has["Sept Expected ECO4"] * historical_eco4_price + remaining_has["Sept Expected GBIS Value"] = remaining_has["Sept Expected GBIS"] * historical_gbis_price + + remaining_has["Nov Expected ECO4 Value"] = remaining_has["Nov Expected ECO4"] * historical_eco4_price + remaining_has["Nov Expected GBIS Value"] = remaining_has["Nov Expected GBIS"] * historical_gbis_price + + # Calculate forecasted revenue + remaining_has["Forecasted ECO4 Revenue"] = remaining_has["ASSET LIST no."] * future_eco4_value + remaining_has["Forecasted GBIS Revenue"] = remaining_has["ASSET LIST no."] * future_gbis_value + + # We also calculate forecasted revenue with the original price +
remaining_has["Forecasted ECO4 Revenue (original price)"] = remaining_has["ASSET LIST no."] * historical_eco4_value + remaining_has["Forecasted GBIS Revenue (original price)"] = remaining_has["ASSET LIST no."] * historical_gbis_value + + # Calculate totals for each scheme + remaining_has_september_eco4_sales = remaining_has["Sept Expected ECO4"].sum() + remaining_has_september_gbis_sales = remaining_has["Sept Expected GBIS"].sum() + + remaining_has_november_eco4_sales = remaining_has["Nov Expected ECO4"].sum() + remaining_has_november_gbis_sales = remaining_has["Nov Expected GBIS"].sum() + + remaining_has_forecasted_eco4_sales = remaining_has["Forecasted Sales ECO4"].sum() + remaining_has_forecasted_gbis_sales = remaining_has["Forecasted Sales GBIS"].sum() + + remaining_has_september_eco4_value = remaining_has["Sept Expected ECO4 Value"].sum() + remaining_has_september_gbis_value = remaining_has["Sept Expected GBIS Value"].sum() + + remaining_has_november_eco4_value = remaining_has["Nov Expected ECO4 Value"].sum() + remaining_has_november_gbis_value = remaining_has["Nov Expected GBIS Value"].sum() + + remaining_has_forecasted_eco4_value = remaining_has["Forecasted ECO4 Revenue"].sum() + remaining_has_forecasted_gbis_value = remaining_has["Forecasted GBIS Revenue"].sum() + + remaining_has_forecasted_eco4_value_original_price = remaining_has["Forecasted ECO4 Revenue (original price)"].sum() + remaining_has_forecasted_gbis_value_original_price = remaining_has["Forecasted GBIS Revenue (original price)"].sum() + + # Calculate the change in forecasted sales against the September expected sales + remaining_has_foecast_change_eco4 = 100 * ( + remaining_has["Forecasted Sales ECO4"].sum() - remaining_has["Sept Expected ECO4"].sum() + ) / remaining_has["Sept Expected ECO4"].sum() + + remaining_has_foecast_change_gbis = 100 * ( + remaining_has["Forecasted Sales GBIS"].sum() - remaining_has["Sept Expected GBIS"].sum() + ) / remaining_has["Sept Expected GBIS"].sum() + + # Total 
change + remaining_has_foecast_change_total = 100 * ( + remaining_has["Forecasted Sales ECO4"].sum() + remaining_has["Forecasted Sales GBIS"].sum() - + remaining_has["Sept Expected ECO4"].sum() - remaining_has["Sept Expected GBIS"].sum() + ) / (remaining_has["Sept Expected ECO4"].sum() + remaining_has["Sept Expected GBIS"].sum()) + + asset_list_size = remaining_has["ASSET LIST no."].sum() + + # Create a summary table of the rest with the totals for ECO4, GBIS and then a total row + remaining_has_aggregate = pd.DataFrame( + columns=["Scheme", "Asset List Size", "Sept Expected Sales", "Nov Expected Sales", "Forecasted Sales", + "Forecasted Change vs Sept", + "Sept Expected Value", "Nov Expected Value", "Forecasted Value", "Forecasted Value (original price)"], + data=[ + [ + "ECO4", asset_list_size, remaining_has_september_eco4_sales, remaining_has_november_eco4_sales, + remaining_has_forecasted_eco4_sales, remaining_has_foecast_change_eco4, + remaining_has_september_eco4_value, + remaining_has_november_eco4_value, remaining_has_forecasted_eco4_value, + remaining_has_forecasted_eco4_value_original_price + ], + [ + "GBIS", asset_list_size, remaining_has_september_gbis_sales, remaining_has_november_gbis_sales, + remaining_has_forecasted_gbis_sales, remaining_has_foecast_change_gbis, + remaining_has_september_gbis_value, + remaining_has_november_gbis_value, remaining_has_forecasted_gbis_value, + remaining_has_forecasted_gbis_value_original_price + ], + [ + "Total", asset_list_size, remaining_has_september_eco4_sales + remaining_has_september_gbis_sales, + remaining_has_november_eco4_sales + remaining_has_november_gbis_sales, + remaining_has_forecasted_eco4_sales + remaining_has_forecasted_gbis_sales, + remaining_has_foecast_change_total, + remaining_has_september_eco4_value + remaining_has_september_gbis_value, + remaining_has_november_eco4_value + remaining_has_november_gbis_value, + remaining_has_forecasted_eco4_value + remaining_has_forecasted_gbis_value, + 
remaining_has_forecasted_eco4_value_original_price + + remaining_has_forecasted_gbis_value_original_price + ] + ] + ) + + # Calculate pipeline value + pipeline_value = aggregates[["Scheme", "Completed Revenue", "Forecasted Remaining Revenue"]].merge( + remaining_has_aggregate[["Scheme", "Forecasted Value"]].rename( + columns={"Forecasted Value": "Forecasted Revenue, Unconfirmed HAs"} + ), how="inner", on="Scheme" + ) + + # Calculate the total + pipeline_value["Total Value"] = ( + pipeline_value["Completed Revenue"] + pipeline_value["Forecasted Remaining Revenue"] + pipeline_value[ + "Forecasted Revenue, Unconfirmed HAs"] + ) + + # TODO: Insert model figures + model_results = pd.DataFrame( + [ + { + # This one, we don't have sales data + "HA Name": "HA 15", + "Model Expected Additional ECO4 (unit level)": None, + "Model Expected Total ECO4 (unit level)": 296, + "Model Expected Additional GBIS (unit level)": None, + "Model Expected Total GBIS (unit level)": 209, + }, + { + "HA Name": "HA 16", + # Old before re-run + # "Model Expected Additional ECO4 (unit level)": 418, + # "Model Expected Total ECO4 (unit level)": 1820, + # "Model Expected Additional GBIS (unit level)": 576, + # "Model Expected Total GBIS (unit level)": 612, + + # IN the partial sales data, WFT have completed 1407 ECO4, 36 GBIS + "Model Expected Additional ECO4 (unit level)": 411 + 342 + 235, + "Model Expected Total ECO4 (unit level)": 1407 + 411 + 342 + 235, + "Model Expected Additional GBIS (unit level)": 223, + "Model Expected Total GBIS (unit level)": 36 + 223, + }, + { + "HA Name": "HA 24", + "Model Expected Additional ECO4 (unit level)": 224, + "Model Expected Total ECO4 (unit level)": 848, + "Model Expected Additional GBIS (unit level)": 552, + "Model Expected Total GBIS (unit level)": 552, + }, + { + "HA Name": "HA 25", + "Model Expected Additional ECO4 (unit level)": None, + "Model Expected Total ECO4 (unit level)": 1709 + 59, + "Model Expected Additional GBIS (unit level)": None, + "Model 
Expected Total GBIS (unit level)": 2004 + 107, + } + ] + ) + + sales_data_formatted["Remaining ECO4 Sales"] = ( + sales_data_formatted["Forecasted ECO4 Sales"] - sales_data_formatted["Oct ECO4 Sales (adjusted for variance)"] + ) + + sales_data_formatted["Remaining GBIS Sales"] = ( + sales_data_formatted["Forecasted GBIS Sales"] - sales_data_formatted["Oct GBIS Sales (adjusted for variance)"] + ) + + sales_data_formatted["Completed ECO4 Revenue"] = (sales_data_formatted[ + "Oct ECO4 Sales (adjusted for variance)"] * + historical_eco4_price) + sales_data_formatted["Completed GBIS Revenue"] = (sales_data_formatted[ + "Oct GBIS Sales (adjusted for variance)"] * + historical_gbis_price) + + ha_subset_with_sales = ["HA 15", "HA 16", "HA 24"] + + has_subset_with_sales_value = sales_data_formatted[ + sales_data_formatted["HA Name"].isin(ha_subset_with_sales) + ].copy()[ + [ + "HA Name", + "Oct ECO4 Sales (adjusted for variance)", + "Oct GBIS Sales (adjusted for variance)", + "Remaining ECO4 Sales", + "Remaining GBIS Sales", + "Forecasted ECO4 Sales", + "Forecasted GBIS Sales", + "Completed ECO4 Revenue", + "Completed GBIS Revenue" + ] + ] + + has_subset_with_sales_value["Remaining ECO4 Revenue"] = has_subset_with_sales_value[ + "Remaining ECO4 Sales"] * ECO4_NEW_RATES + has_subset_with_sales_value["Remaining GBIS Revenue"] = has_subset_with_sales_value[ + "Remaining GBIS Sales"] * GBIS_NEW_RATES + + has_subset_with_sales_value["Remaining Total Revenue"] = ( + has_subset_with_sales_value["Remaining ECO4 Revenue"] + has_subset_with_sales_value["Remaining GBIS Revenue"] + ) + + model_results["Model Expected Additional ECO4 Revenue"] = ( + model_results["Model Expected Additional ECO4 (unit level)"] * ECO4_NEW_RATES + ) + + model_results["Model Expected Additional GBIS revenue"] = ( + model_results["Model Expected Additional GBIS (unit level)"] * GBIS_NEW_RATES + ) + + model_results["Model Expected Additional Total Revenue"] = ( + model_results["Model Expected Additional ECO4 
Revenue"] + model_results[ + "Model Expected Additional GBIS revenue"] + ) + + # Show more columns with pandas + pd.set_option('display.max_rows', 500) + pd.set_option('display.max_columns', 500) + pd.set_option('display.width', 1000) + + # Look at HA 16 + ha16_model = model_results[model_results["HA Name"] == "HA 16"] + has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 16"] + + # WFT: For HA 16: 4,598,190 ECO4, 57,000 GBIS + # Model: + + # Look at HA 24 + ha24_model = model_results[model_results["HA Name"] == "HA 24"] + has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 24"] + + # Look at HA 15 + ha15_data = has_subset_with_sales_value[has_subset_with_sales_value["HA Name"] == "HA 15"] + ha15_portfolio_value = ha15_data["Completed ECO4 Revenue"] + ha15_data[ + "Completed GBIS Revenue"] + ha15_data["Remaining Total Revenue"] + # # This doesn't have sales data so in the model analysis, we just value the ha as a whole + ha15_model = model_results[model_results["HA Name"] == "HA 15"] + ha15_value = ha15_model["Model Expected Total ECO4 (unit level)"].iloc[0] * ECO4_NEW_RATES + \ + ha15_model["Model Expected Total GBIS (unit level)"].iloc[0] * GBIS_NEW_RATES + + model_results["Expected ECO4 Revenue"] = model_results["Model Expected Total ECO4 (unit level)"] * ECO4_NEW_RATES + model_results["Expected GBIS Revenue"] = model_results["Model Expected Total GBIS (unit level)"] * GBIS_NEW_RATES + model_results["Expected Total Revenue"] = model_results["Expected ECO4 Revenue"] + model_results[ + "Expected GBIS Revenue"] + model_results[model_results["HA Name"].isin(["HA 15"])] + + # We now create a final excel with all of the data + # We want: + # 1) aggregates + # 2) sales_data_formatted + # 3) remaining_has_aggregate + # 4) remaining_has + # 5) problem_has_summary + + # Function to get the maximum column width + def get_col_widths(dataframe): + # First we find the maximum length of the index column + idx_max = max([len(str(s)) 
for s in dataframe.index.values] + [len(str(dataframe.index.name))])
+        # Then, we concatenate this to the max of the lengths of column name and its max value for each column, row-wise
+        return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(col)) for col in dataframe.columns]
+
+    # Create a Pandas Excel writer using XlsxWriter as the engine
+    with pd.ExcelWriter('HA Pipeline Analysis.xlsx', engine='xlsxwriter') as writer:
+        # Write each dataframe to a different worksheet without the index
+        for df, sheet in [(aggregates, 'Forecasted Sales'),
+                          (sales_data_formatted, 'Sales Data'),
+                          (remaining_has_aggregate, 'Remaining HAs Value'),
+                          (remaining_has, 'Remaining HAs data'),
+                          (pipeline_value, 'Pipeline Value'),
+                          (problem_has_summary, 'Problem HAs Analysis'),
+                          (problem_has_data, 'Problem HAs Data')
+
+                          ]:
+
+            df.to_excel(writer, sheet_name=sheet, index=False)
+
+            # Auto-adjust columns' width (drop the leading index width: the index is not written to the sheet)
+            for i, width in enumerate(get_col_widths(df)[1:]):
+                writer.sheets[sheet].set_column(i, i, width)
diff --git a/etl/eligibility/ha_15_32/cancellation.py b/etl/eligibility/ha_15_32/cancellation.py
new file mode 100644
index 00000000..849add45
--- /dev/null
+++ b/etl/eligibility/ha_15_32/cancellation.py
@@ -0,0 +1,113 @@
+import openpyxl
+import pandas as pd
+import numpy as np
+
+
+def get_excel_survey_list(workbook_path, worksheet_name=None):
+    survey_workbook = openpyxl.load_workbook(workbook_path)
+    if worksheet_name is not None:
+        survey_sheet = survey_workbook[worksheet_name]
+    else:
+        survey_sheet = survey_workbook.active
+
+    survey_rows = []
+    survey_colors = []
+
+    for row in survey_sheet.iter_rows(min_row=2, values_only=False):  # Assuming the first row is headers
+        row_data = [cell.value for cell in row]  # This will get you the cell values
+        row_color = row[0].fill.start_color.index if row[0].fill.start_color.index != '00000000' else None
+        # row_color = COLOR_INDEX[row_color]
+        survey_rows.append(row_data)
+        survey_colors.append(row_color)
+
+    survey_list = pd.DataFrame(survey_rows, columns=[cell.value for cell in survey_sheet[1]])
+    survey_list["row_colour"] = survey_colors
+
+    return survey_list
+
+
+def load_data():
+    # Load for HA 16 - ECO 4
+    ha16_survey_list = get_excel_survey_list('etl/eligibility/ha_15_32/HESTIA- HA 16 ECO4 SURVEY LIST.xlsx')
+
+    # Load for HA 24 - ECO 4
+    ha24_survey_list = get_excel_survey_list('etl/eligibility/ha_15_32/HESTIA - HA 24 ECO4 SURVEY LIST.xlsx')
+
+    # Load for HA 25 - ECO 3
+    ha25_survey_list = get_excel_survey_list(
+        'etl/eligibility/ha_15_32/HESTIA - HA 25 ECO3 SURVEY LIST.xlsx', worksheet_name="CAVITY"
+    )
+
+    # Drop columns that hold no values at all (dropna with how='all' removes entirely-empty columns)
+    ha25_survey_list = ha25_survey_list.dropna(axis=1, how='all')
+
+    # Standardise the installation status columns
+    ha16_survey_list["survey_status"] = ha16_survey_list["INSTALLED OR CANCELLED"].copy()
+    ha16_survey_list["survey_status"] = ha16_survey_list["survey_status"].replace(
+        {
+            "NO UPDATE - CHECKED 2.10.23": "no update",
+            "NO UPDATE - CHECKED 18.12.23": "no update",
+            "INSTALLED": "installed",
+            "CANCELLED": "cancelled",
+            "LOFT STILL TO BE INSTALLED": "loft remaining",
+        }
+    )
+
+    ha24_survey_list["survey_status"] = ha24_survey_list["INSTALLED OR CANCELLED"].copy()
+    ha24_survey_list["survey_status"] = ha24_survey_list["survey_status"].replace(
+        {
+            "NO UPDATE - CHECKED 21.11.23": "no update",
+            "NO UPDATE - CHECKED 18.12.23": "no update",
+            "INSTALLED": "installed",
+            "CANCELLED": "cancelled",
+            "LOFT STILL TO BE INSTALLED": "loft remaining",
+            "SEE NOTES >>": "see notes",
+        }
+    )
+
+    # HA25 has no status column: the status is read from the fill colour of each row's first cell
+    ha25_colour_status = {
+        "FF7030A0": "installed",
+        "FF92D050": "installed",
+        "FF38FD23": "installed",
+        "FFFF0000": "cancelled",
+        "FFFFFF00": "filler row - drop",
+    }
+    ha25_survey_list["survey_status"] = (
+        ha25_survey_list["row_colour"].map(ha25_colour_status).fillna("unknown")
+    )
+    ha25_survey_list = ha25_survey_list[ha25_survey_list["survey_status"] != "filler row - drop"].copy()
+
+    # We standardise the cancellation reasons - just create a new column
+    ha16_survey_list["cancellation_reason"] = ha16_survey_list["INSTALLERS NOTES ; REASONS FOR CANCELLATIONS"].copy()
+    ha24_survey_list["cancellation_reason"] = ha24_survey_list["INSTALLERS NOTES ; REASONS FOR CANCELLATIONS"].copy()
+    # There's no cancellation reason for HA25
+    ha25_survey_list["cancellation_reason"] = "No reason provided"
+
+    # Combine the dataframes
+    ha16_survey_list["HA"] = "HA 16"
+    ha24_survey_list["HA"] = "HA 24"
+    ha25_survey_list["HA"] = "HA 25"
+
+    cancellation_data = pd.concat(
+        [
+            ha16_survey_list[["HA", "survey_status", "cancellation_reason"]],
+            ha24_survey_list[["HA", "survey_status", "cancellation_reason"]],
+            ha25_survey_list[["HA", "survey_status", "cancellation_reason"]]
+        ]
+    )
+
+    # Take just rows that we have a confirmed status for
+    cancellation_data = cancellation_data[~cancellation_data["survey_status"].isin(["no update", "loft remaining"])]
+
+    return cancellation_data
+
+
+def app():
+    """
+    This application is used to analyse the cancellation data provided by warmfront
+    :return:
+    """
+
+    # This is cancellations of jobs that completed invasive surveys and the installer could not conclude the work
+    sales_cancellation_data = load_data()
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 1ed95a30..e94babcd 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -882,6 +882,13 @@ def get_epc_data(
     return outputs
 
 
+def get_col_widths(dataframe):
+    # First we find the maximum length of the index column
+    idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
+    # Then, we concatenate this to the max of the lengths of column name and its max value for each column, row-wise
+    return [idx_max] + [max(dataframe[col].astype(str).map(len).max(), len(str(col))) for col in dataframe.columns]
+
+
 def analyse_ha_data(outputs, loader):
     """
     The approach we take within this function is the following:
@@ -901,7 +908,11 @@ def analyse_ha_data(outputs, loader):
     :return:
     """
 
+    eco4_rate = 1710
+    gbis_rate = 600
+
     ha_analysis_results = []
+    ha_revenue_results = []
 
     for ha_name, datasets in outputs.items():
         inputs = [x for k, x in loader.data.items() if k == ha_name][0]
@@ -1034,7 +1045,8 @@ def analyse_ha_data(outputs, loader):
             (
                 (remaining_eco4_df["eco4_message"] == "sap too high") &
                 remaining_eco4_df["eligibility_cavity_type"].isin(["partial", "empty"]) &
-                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
+                remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"]) &
+                pd.isnull(remaining_eco4_df["prospect_type"])
             ),
             "ECO4 if SAP downgrade",
             remaining_eco4_df["prospect_type"]
@@ -1048,7 +1060,7 @@ def analyse_ha_data(outputs, loader):
                 remaining_eco4_df["eligibility_cavity_type"].isin(["full"]) &
                 remaining_eco4_df["eligibility_loft_type"].isin(["0-100mm"])
             ),
-            "Filled cavity - subject to CIGA check",
+            "ECO4 - Filled cavity - subject to CIGA check",
             remaining_eco4_df["prospect_type"]
         )
 
@@ -1064,7 +1076,7 @@ def analyse_ha_data(outputs, loader):
 
         # 5) Looks like GBIS instead
         remaining_eco4_df["prospect_type"] = np.where(
-            (remaining_eco4_df["gbis_eligible"] == True),
+            (remaining_eco4_df["gbis_eligible"] == True) & pd.isnull(remaining_eco4_df["prospect_type"]),
             "Looks like GBIS",
             remaining_eco4_df["prospect_type"]
         )
@@ -1094,16 +1106,17 @@ def analyse_ha_data(outputs, loader):
         # 2) GBIS candidates that look like strict ECO4 candidates
         remaining_gbis["prospect_type"] = np.where(
             (remaining_gbis["eco4_eligible"] == True),
-            "Upgradable to ECO4",
+            "GBIS - Upgradable to ECO4",
             remaining_gbis["prospect_type"]
         )
 
         # 3) Subject to CIGA check - Filled cavity
         remaining_gbis["prospect_type"] = np.where(
             (
-                remaining_gbis["eligibility_cavity_type"].isin(["full"])
+                remaining_gbis["eligibility_cavity_type"].isin(["full"]) &
+                pd.isnull(remaining_gbis["prospect_type"])
             ),
-            "Filled cavity - subject to CIGA check",
+            "GBIS - Filled cavity - subject to CIGA check",
             remaining_gbis["prospect_type"]
         )
 
@@ -1141,30 +1154,95 @@ def analyse_ha_data(outputs, loader):
             )
         ].copy()
 
-        ha_analysis_results.append({
+        # Perform some checks to make sure we have all of the values
+        remaining_eco4_dict = remaining_eco4_df["prospect_type"].value_counts().to_dict()
+        if n_remaining_properties_eco4 != sum(remaining_eco4_dict.values()):
+            raise ValueError(
+                "Number of remaining properties does not match the number of properties in remaining ECO4 dict"
+            )
+
+        remaining_gbis_dict = remaining_gbis["prospect_type"].value_counts().to_dict()
+        if n_remaining_properties_gbis != sum(remaining_gbis_dict.values()):
+            raise ValueError(
+                "Number of remaining properties does not match the number of properties in remaining GBIS dict"
+            )
+
+        to_append = {
+            "ha_name": ha_name,
             "n_properties_in_asset_list": n_properties_in_asset_list,
             ############
             # ECO4
             ############
             "properties_sold_eco4": properties_sold_eco4,
             "n_remaining_properties_eco4": n_remaining_properties_eco4,
-            **remaining_eco4_df["prospect_type"].value_counts().to_dict(),
+            **remaining_eco4_dict,
             ############
             # GBIS
             ############
             "properties_sold_gbis": properties_sold_gbis,
             "n_remaining_properties_gbis": n_remaining_properties_gbis,
-            **remaining_gbis["prospect_type"].value_counts().to_dict(),
+            **remaining_gbis_dict,
             ############
             # GBIS
             ############
             "n_eco4_surplus": eco4_surplus.shape[0],
             "n_gbis_surplus": gbis_surplus.shape[0],
-        })
+        }
+
+        ha_analysis_results.append(to_append)
+
+        revenue_to_append = {
+            "ha_name": ha_name,
+            "£ Remaining from asset list": (
+                n_remaining_properties_eco4 * eco4_rate + n_remaining_properties_gbis * gbis_rate
+            ),
+            "Of which: Strict": (
+                to_append.get('strict ECO4', 0) * eco4_rate + to_append.get('strict GBIS', 0) * gbis_rate +
+                to_append.get('GBIS - Upgradable to ECO4', 0) * gbis_rate
+            ),
+            "Of which: Subject to CIGA": (
+                to_append.get("ECO4 - Filled cavity - subject to CIGA check", 0) * eco4_rate +
+                to_append.get("GBIS - Filled cavity - subject to CIGA check", 0) * gbis_rate
+            ),
+            "Of which: Prospect, not perfect strict prospect": (
+                to_append.get("ECO4 prospect - empty cavity, loft insulation below regulation", 0) * eco4_rate +
+                to_append.get("ECO4 if SAP downgrade", 0) * eco4_rate
+            ),
+            "Of which: Potential downgrade to GBIS": to_append.get("Looks like GBIS", 0) * eco4_rate,
+            "Of which: Does not look like prospect": (
+                to_append.get("Does not look like ECO4 candidate", 0) * eco4_rate +
+                to_append.get("Does not look like GBIS candidate", 0) * gbis_rate
+            ),
+            "Surplus: Unidentified properties": eco4_surplus.shape[0] * eco4_rate + gbis_surplus.shape[0] * gbis_rate,
+            "Surplus: GBIS Updates to ECO4": to_append.get("GBIS - Upgradable to ECO4", 0) * (eco4_rate - gbis_rate)
+        }
+
+        # Perform a quick check: the breakdown must reconcile with the top level figure in either direction
+        if abs(revenue_to_append["£ Remaining from asset list"] - (
+                revenue_to_append["Of which: Strict"] + revenue_to_append["Of which: Subject to CIGA"] +
+                revenue_to_append["Of which: Prospect, not perfect strict prospect"] +
+                revenue_to_append["Of which: Potential downgrade to GBIS"] +
+                revenue_to_append["Of which: Does not look like prospect"]
+        )) > 1:
+            raise ValueError("Error between top level revenue figures and breakdown - investigate me")
+
+        ha_revenue_results.append(revenue_to_append)
 
     ha_analysis_results = pd.DataFrame(ha_analysis_results)
+    ha_revenue_results = pd.DataFrame(ha_revenue_results)
 
-    # Todo: create revenue figures and automate creation of excel
+    # Automate creation of the excel
+    # Create a Pandas Excel writer using XlsxWriter as the engine
+    with pd.ExcelWriter('HA Analysis - batch3.xlsx', engine='xlsxwriter') as writer:
+        # Write each dataframe to a different worksheet without the index
+        for df, sheet in [(ha_revenue_results, 'Total Revenue'),
+                          (ha_analysis_results, 'By ECO4 and GBIS')]:
+
+            df.to_excel(writer, sheet_name=sheet, index=False)
+
+            # Auto-adjust columns' width (drop the leading index width: the index is not written to the sheet)
+            for i, width in enumerate(get_col_widths(df)[1:]):
+                writer.sheets[sheet].set_column(i, i, width)
 
 
 def app():