Added HA25

This commit is contained in:
Khalim Conn-Kowlessar 2024-03-01 23:48:27 +00:00
parent 46f5ee8ea4
commit d9e9be4389

View file

@ -159,19 +159,18 @@ class DataLoader:
}
UNMATCHED_CIGA = {
# We expect 4 unmatched addresses, which have been validated manually as being in the ciga file but not
# the asset list
"HA6": 117,
"HA14": 3,
"HA16": 7,
# There's just too many unmatched here
"HA6": 117,
"HA24": 12,
"HA107": 51,
}
def __init__(self, directories, december_figures_filepath, use_cache):
def __init__(self, directories, december_figures_filepath, use_cache, rebuild):
self.directories = directories
self.use_cache = use_cache
self.december_figures_filepath = december_figures_filepath
self.rebuild = rebuild
self.data = {}
self.december_figures = None
@ -312,23 +311,20 @@ class DataLoader:
return asset_list
@staticmethod
def create_ciga_list_house_no(ha_name, ciga_list):
def create_ciga_list_house_no(ciga_list):
"""
This function will append the house number (HouseNo) onto the CIGA list
:return:
"""
if ha_name in ["HA6", "HA14", "HA107", "HA16"]:
split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
# many columns there might be
house_numbers = house_numbers.iloc[:, 0:1]
house_numbers.columns = ['HouseNo']
split_addresses = ciga_list['Matched Address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
# The first column should be HouseNo - we aren't interested in the other columns, but we don't know how
# many columns there might be
house_numbers = house_numbers.iloc[:, 0:1]
house_numbers.columns = ['HouseNo']
ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1)
else:
raise NotImplementedError("Implement me")
ciga_list = pd.concat([ciga_list, house_numbers[["HouseNo"]]], axis=1)
return ciga_list
@ -447,7 +443,7 @@ class DataLoader:
# Remove rows with missing postcode which happens in a small number of cases
ciga_list = ciga_list[~pd.isnull(ciga_list["Matched Postcode"])]
ciga_list["ciga_list_row_id"] = [ha_name + "_ciga_" + str(i) for i in range(0, len(ciga_list))]
ciga_list = self.create_ciga_list_house_no(ha_name, ciga_list)
ciga_list = self.create_ciga_list_house_no(ciga_list)
ciga_list = self.dedupe_ciga_list(ciga_list)
ciga_list = self.merge_ciga_to_assets(asset_list, ciga_list, ha_name)
@ -800,6 +796,10 @@ class DataLoader:
"st. leodegars close", "st leodegars close"
)
survey_list["Street / Block Name"] = survey_list["Street / Block Name"].str.replace(
"montgomery crescent", "montgomery road"
)
return survey_list
@staticmethod
@ -1102,16 +1102,18 @@ class DataLoader:
for col in ["ECO4", "GBIS", "ECO4 remaining", "GBIS remaining"]:
self.december_figures[col] = self.december_figures[col].astype("Int64")
if self.use_cache:
self.data = read_pickle_from_s3(
if self.use_cache and not self.rebuild:
data = read_pickle_from_s3(
bucket_name="retrofit-datalake-dev",
s3_file_name="ha-analysis/batch3-inputs.pickle",
)
return
else:
data = {}
data = {}
for filepath in self.directories:
ha_name = filepath.split("/")[2]
if ha_name in data:
continue
# Load asset list
logger.info("Loading data for {}".format(ha_name))
asset_list, survey_list, ciga_list = self.load_asset_list(
@ -2635,6 +2637,10 @@ def forecast_remaining_sales(loader):
# and I don't want the numbers to change too much, dependent on the CIGA conversion rate
maximum_ciga_conversion = 0.75
# This is a hard lower bound on the allowed conversion rates to final sale. These are typically very
# high, but there are some anomalies among surveys that are early on
sales_conversion_lower_bound = 0.8
gbis_rate = 600
eco4_rate = 1710
# old_gbis_rate = 432
@ -2796,14 +2802,30 @@ def forecast_remaining_sales(loader):
eco4_ciga_independent_passrates = pd.DataFrame(eco4_ciga_independent_passrates)
gbis_ciga_independent_passrates = pd.DataFrame(gbis_ciga_independent_passrates)
eco4_ciga_independent_passrates["conversion"] = (
eco4_ciga_independent_passrates["# ECO4 successfully installed"] /
eco4_ciga_independent_passrates["# ECO4 at install stage"]
)
eco4_ciga_independent_passrates_clipped = eco4_ciga_independent_passrates[
eco4_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound
]
gbis_ciga_independent_passrates["conversion"] = (
gbis_ciga_independent_passrates["# GBIS successfully installed"] /
gbis_ciga_independent_passrates["# GBIS at install stage"]
)
gbis_ciga_independent_passrates_clipped = gbis_ciga_independent_passrates[
gbis_ciga_independent_passrates["conversion"] >= sales_conversion_lower_bound
]
median_eco4_to_install = (
eco4_ciga_independent_passrates["# ECO4 successfully installed"].sum() /
eco4_ciga_independent_passrates["# ECO4 at install stage"].sum()
eco4_ciga_independent_passrates_clipped["# ECO4 successfully installed"].sum() /
eco4_ciga_independent_passrates_clipped["# ECO4 at install stage"].sum()
)
median_gbis_to_install = (
gbis_ciga_independent_passrates["# GBIS successfully installed"].sum() /
gbis_ciga_independent_passrates["# GBIS at install stage"].sum()
gbis_ciga_independent_passrates_clipped["# GBIS successfully installed"].sum() /
gbis_ciga_independent_passrates_clipped["# GBIS at install stage"].sum()
)
# Produce the final output
@ -3270,6 +3292,8 @@ def app():
use_cache = True
# Determines if we want to perform the data pull
pull_data = False
# Override to re-build all inputs
rebuild_inputs = False
# List all of the data in the folder
@ -3278,12 +3302,11 @@ def app():
# Grab the December HA figures filepath
december_figures_filepath = "local_data/ha_data/HA_December_figures.csv"
# priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA39", "HA107"]
priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA39", "HA107"]
priority_has = ["HA1", "HA6", "HA7", "HA14", "HA16", "HA24", "HA25", "HA39", "HA107"]
# Filter down the directories to only the priority HAs
directories = [d for d in directories if d.split("/")[2] in priority_has]
loader = DataLoader(directories, december_figures_filepath, use_cache)
loader = DataLoader(directories, december_figures_filepath, use_cache, rebuild_inputs)
loader.load()
loader.ha_facts_and_figures()