rough attempt to attribute surplus ciga dependent eco4 jobs

This commit is contained in:
Khalim Conn-Kowlessar 2024-03-12 11:09:09 +00:00
parent 41c17aa1da
commit 6a327629bf

View file

@ -176,6 +176,10 @@ class DataLoader:
"address": "Full Address",
"postcode": "Postcode"
},
"HA49": {
"address": "Property Address Full",
"postcode": "Property Postcode"
},
"HA54": {
"address": "Postal Address",
"postcode": "matching_postcode"
@ -219,7 +223,7 @@ class DataLoader:
def create_asset_list_matching_address(self, ha_name, asset_list):
if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA54"]:
if ha_name in ["HA1", "HA6", "HA12", "HA16", "HA24", "HA30", "HA31", "HA48", "HA49", "HA54"]:
asset_list["matching_address"] = asset_list[
self.COLUMN_CONFIG[ha_name]["address"]
].astype(str).str.lower().str.strip()
@ -382,6 +386,16 @@ class DataLoader:
asset_list["Address2"].astype(str).str.lower().str.strip() + ", " + \
asset_list["PostCode"].astype(str).str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
elif ha_name == "HAXX":
asset_list["matching_address"] = asset_list["Address"].astype(str).str.lower().str.strip() + ", " + \
asset_list["PostCode"].astype(str).str.lower().str.strip()
asset_list["matching_postcode"] = asset_list["PostCode"].astype(str).str.lower().str.strip()
elif ha_name == "HAXXX":
asset_list["matching_address"] = (
asset_list["Combined Address"].astype(str).str.lower().str.strip() + ", " +
asset_list["Postcode"].astype(str).str.lower().str.strip()
)
asset_list["matching_postcode"] = asset_list["Postcode"].astype(str).str.lower().str.strip()
else:
raise NotImplementedError("implement me")
@ -467,6 +481,8 @@ class DataLoader:
asset_list["HouseNo"] = asset_list["House_Number"].copy()
elif ha_name == "HA9":
asset_list["HouseNo"] = asset_list["House Number"].copy()
elif ha_name == "HAXXX":
asset_list["HouseNo"] = asset_list["Door Number"].copy()
else:
split_addresses = asset_list['matching_address'].str.split(',', expand=True)
house_numbers = split_addresses[0].str.split(' ', expand=True)
@ -1999,6 +2015,10 @@ class DataLoader:
return survey_list
@staticmethod
def correct_ha49_survey_list(survey_list):
# HA49 currently applies no corrections: the survey list is handed back unchanged.
return survey_list
@staticmethod
def levenstein_match(matching_string, df):
match_to = df["matching_address"].tolist()
@ -5080,8 +5100,11 @@ def app():
# Add in:
priority_has = [
"HA1", "HA2", "HA6", "HA7", "HA9", "HA12", "HA13", "HA14", "HA15", "HA16", "HA18", "HA19", "HA24", "HA25",
"HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA50", "HA54", "HA56", "HA63",
"HA107", "HA117"
"HA27", "HA28", "HA30", "HA31", "HA32", "HA34", "HA35", "HA39", "HA41", "HA48", "HA49", "HA50", "HA54", "HA56",
"HA63", "HA107", "HA117",
# New HAs
"HAXX", "HAXXX",
]
# Next HAs to do: 14 [DONE], 15[DONE], 32 [DONE], 33 [Input format is 4 parts and no eco4 jobs identified - come
# back on this], 28 [DONE], 41 [DONE], 50 [DONE], 48 [DONE], 2 [DONE], 63 [DONE], 12 [DONE], 117 [DONE], 13 [DONE],
@ -5100,39 +5123,86 @@ def app():
forecast_remaining_sales(loader)
# We load in the additional data required to perform the analysis
# cleaned = read_from_s3(
# s3_file_name="cleaned_epc_data/cleaned.bson",
# bucket_name="retrofit-data-dev"
# )
# cleaned = msgpack.unpackb(cleaned, raw=False)
# cleaned = patch_cleaned(cleaned)
#
# cleaning_data = read_dataframe_from_s3_parquet(
# bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
# )
# created_at = datetime.now().isoformat()
#
# photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
#
# outputs = get_epc_data(
# loader=loader,
# cleaned=cleaned,
# cleaning_data=cleaning_data,
# created_at=created_at,
# photo_supply_lookup=photo_supply_lookup,
# floor_area_decile_thresholds=floor_area_decile_thresholds,
# pull_data=pull_data
# )
conversion_rate = 0.95
archetype_check_conversion = 0.7
res = []
for k, v in loader.data.items():
asset_list = v["asset_list"].copy()
agg = asset_list["ECO Eligibility"].value_counts()
# Skip any HA whose eligibility categories contain no "passed" entry (i.e. no properties have passed CIGA)
if not any("passed" in x for x in agg.index):
continue
# import pickle
# with open("ha_analysis.pickle", "wb") as f:
# pickle.dump({"outputs": outputs, "loader": loader}, f)
agg = pd.DataFrame(agg).reset_index()
# To read:
# import pickle
# with open("ha_analysis.pickle", "rb") as f:
# outputs = pickle.load(f)["outputs"]
#
# with open("loader.pickle", "rb") as f:
# loader = pickle.load(f)
passed_ciga = agg[agg["ECO Eligibility"] == "eco4 - passed ciga"]
passed_ciga = passed_ciga["count"].values[0] if not passed_ciga.empty else 0
failed_ciga = agg[agg["ECO Eligibility"] == "failed ciga"]
failed_ciga = failed_ciga["count"].values[0] if not failed_ciga.empty else 0
ciga_pass_rate = passed_ciga / (passed_ciga + failed_ciga) if (passed_ciga + failed_ciga) > 0 else 1
dormant_ciga = agg[
agg["ECO Eligibility"].str.contains("subject to ciga") &
~agg["ECO Eligibility"].str.contains("subject to archetype")
]
dormant_ciga = dormant_ciga['count'].values[0] if not dormant_ciga.empty else 0
dormant_ciga_archetype = agg[
agg["ECO Eligibility"].str.contains("subject to ciga") &
agg["ECO Eligibility"].str.contains("subject to archetype")
]
dormant_ciga_archetype = dormant_ciga_archetype['count'].values[0] if not dormant_ciga_archetype.empty else 0
needing_check = dormant_ciga + dormant_ciga_archetype * archetype_check_conversion
needing_check = np.round(needing_check)
additional_jobs = (dormant_ciga * ciga_pass_rate * conversion_rate) + (
dormant_ciga_archetype * archetype_check_conversion * ciga_pass_rate * conversion_rate
)
additional_jobs = np.round(additional_jobs)
# We attempt to estimate the uplift and how much of that is attributed to surplus subject to ciga jobs
original_estimate = loader.december_figures[
loader.december_figures["HA Name"] == k
]
original_estimate = original_estimate["ECO4"].values[0] if not original_estimate.empty else 0
base_eco_figures = agg[
agg["ECO Eligibility"].isin(["eco4", "eco4 - passed ciga"])
]["count"].sum()
eco4_from_ciga = original_estimate - base_eco_figures
eco4_from_ciga = eco4_from_ciga if eco4_from_ciga > 0 else 0
surplus_from_dormant = additional_jobs - eco4_from_ciga
surplus_from_dormant = 0 if surplus_from_dormant < 0 else surplus_from_dormant
res.append(
{
"ha_name": k,
"additional_eco4": additional_jobs,
"needing_check": needing_check,
"surplus_from_dormant": surplus_from_dormant
}
)
res = pd.DataFrame(res)
# Drop the HAs that are not in the previous draft
# In the v2 draft, there are 12 HAs
v5_surplus = res[
~res["ha_name"].isin(["HA9"])
]["additional_eco4"].sum()
# 7212 properties
# This is not a perfect difference though, because of variations in how the numbers are recorded in the November
# all-HAs sheet. E.g. for HA107, there were 1239 properties identified. In the postcode list there are 1255,
# however 531 still need a CIGA check. Therefore their original figures, in this case, included properties
# pre-CIGA.
v5_surplus_from_dormant = res[
~res["ha_name"].isin(["HA9"])
]["surplus_from_dormant"].sum()
# 5539.0
# 9471690