merging EPC data and survey outcomes to asset list

This commit is contained in:
Khalim Conn-Kowlessar 2024-04-09 14:57:55 +01:00
parent 0142e6fe5f
commit dc80313eca

View file

@ -3459,7 +3459,7 @@ class DataLoader:
"not eligible",
asset_list["ECO Eligibility"]
)
asset_list = asset_list.drop(columns=["has_eco3"])
# asset_list = asset_list.drop(columns=["has_eco3"])
# Report on sales
sales_report = {}
@ -6778,6 +6778,339 @@ def identify_eco_works(loader):
breakdowns = breakdowns.fillna(0)
def _worksheet_to_dataframe(worksheet, header_row=1):
    """Load an openpyxl worksheet into a DataFrame.

    :param worksheet: openpyxl worksheet to read.
    :param header_row: 1-indexed row holding the column names; every row after it is
        treated as data.
    :return: DataFrame with one row per worksheet row below the header.
    """
    colnames = [cell.value for cell in worksheet[header_row]]
    # values_only=True yields the cell values directly, one tuple per row
    rows_data = list(worksheet.iter_rows(min_row=header_row + 1, values_only=True))
    return pd.DataFrame(rows_data, columns=colnames)


def unitas_data_prep(loader):
    """
    Adhoc - for UNITAS (HA50), builds the list of phase 2 properties to survey.

    Steps:
      1. Take the HA50 asset list and remove every property that already appears in the
         survey list.
      2. Read the phase 1 / phase 2 sheets of the master rolling workbook, work out which
         completed surveys are new, match them to the asset list on postcode + house number
         and remove those properties too.
      3. Add the properties from the second round of CIGA checks that have no guarantee.
      4. Look up the newest EPC for each remaining property and merge it on.
      5. Read the survey outcomes workbook and check each outcome matches at most one
         property.
      6. Write the result to "Unitas - phase 2 properties to Survey.xlsx".

    :param loader: object whose `data["HA50"]` entry holds the "asset_list" and
        "survey_list" DataFrames.
    :return: None - the result is written to an Excel file in the working directory.
    :raises Exception: if a completed survey does not match exactly one asset list row, or
        a survey outcome matches more than one property to survey.
    """
    #####
    # Adhoc - for UNITAS, stripping out additional surveys that have been completed
    unitas_data = loader.data["HA50"].copy()
    unitas_asset_list = unitas_data["asset_list"].copy()
    unitas_survey_sheet = unitas_data["survey_list"].copy()

    # We remove the surveyed properties from the asset sheet
    unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
    unitas_asset_list = unitas_asset_list.merge(
        unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
        how="left",
        on="asset_list_row_id"
    )
    unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
    unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])

    # We read in the data for the further completed surveys
    unitas_phase_1_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
    )
    phase_1_surveys = _worksheet_to_dataframe(unitas_phase_1_workbook["ECO 4 - PHASE 1"])
    phase_2_surveys = _worksheet_to_dataframe(unitas_phase_1_workbook["ECO4 - PHASE 2"])

    # Correct phase 1 surveys in the same fashion as the previous approach
    phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())

    # We check all phase 1 surveys are contained in the data we had before. Anything we
    # cannot match uniquely (on postcode + number, then street + number) is treated as new.
    additional = []
    for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
        matched_1 = unitas_survey_sheet[
            (unitas_survey_sheet["Post Code"] == row["Post Code"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_1.shape[0] == 1:
            continue

        matched_2 = unitas_survey_sheet[
            (unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
            (unitas_survey_sheet["NO."] == row["NO."])
        ]
        if matched_2.shape[0] == 1:
            continue

        additional.append(row.to_dict())
    additional = pd.DataFrame(additional)

    # Drop all of the occurrences of "OFFICE USE ONLY" columns
    phase_2_surveys = phase_2_surveys.drop(columns=[c for c in phase_2_surveys.columns if "OFFICE USE ONLY" in c])
    # De-duplicated, but in a deterministic (phase 2 sheet) order
    common_columns = list(dict.fromkeys(c for c in phase_2_surveys.columns if c in additional.columns))
    additional_filtered = additional[common_columns]

    further_unitas_completed_surveys = pd.concat(
        [phase_2_surveys, additional_filtered],
        axis=0,
        ignore_index=True
    )
    # Add a phase 2 key. The id is the prefix followed directly by the row index, so
    # "unitas_phase_20" is row 0 and "unitas_phase_234" is row 34.
    further_unitas_completed_surveys["survey_list_row_id"] = [
        "unitas_phase_2" + str(i) for i in further_unitas_completed_surveys.index
    ]

    # Surveys / postcodes that are skipped when matching back to the asset list
    not_in_asset_list = {
        "unitas_phase_20", "unitas_phase_234", "unitas_phase_2163", "unitas_phase_2173", "unitas_phase_2374"
    }
    additional_postcodes = {"st28bg"}

    full_asset_list = unitas_data["asset_list"].copy()
    full_asset_list["matching_postcode"] = full_asset_list["matching_postcode"].str.lower().str.replace(" ", "")

    further_unitas_completed_surveys["Post Code"] = further_unitas_completed_surveys["Post Code"].str.replace(
        "ST 5DT", "ST3 5DT"
    )

    # We match these back to the asset list
    matching_lookup = []
    for _, row in tqdm(further_unitas_completed_surveys.iterrows(), total=len(further_unitas_completed_surveys)):
        if row["survey_list_row_id"] in not_in_asset_list:
            continue

        postcode_lower = row["Post Code"].lower().strip().replace(" ", "")
        if postcode_lower in additional_postcodes:
            # Confirmed not in asset list
            continue

        # Filter asset list on postcode, then on house number
        df = full_asset_list[
            full_asset_list["matching_postcode"].str.contains(postcode_lower)
        ]
        df = df[df["HouseNo"] == str(row["NO."])]
        if df.shape[0] != 1:
            raise Exception("NOT FOUND")

        matching_lookup.append(
            {
                "survey_list_row_id": row["survey_list_row_id"],
                "asset_list_row_id": df["asset_list_row_id"].values[0],
            }
        )

    # Name the columns explicitly so the merge below still works when nothing matched
    matching_lookup = pd.DataFrame(matching_lookup, columns=["survey_list_row_id", "asset_list_row_id"])
    matching_lookup["phase_2_surveyed"] = True

    # We merge this onto the asset list and remove the rows
    unitas_asset_list = unitas_asset_list.merge(
        matching_lookup, how="left", on="asset_list_row_id"
    )
    # Drop rows where phase_2_surveyed is populated
    unitas_asset_list = unitas_asset_list[
        pd.isnull(unitas_asset_list["phase_2_surveyed"])
    ]

    # We add in the new CIGA submissions
    unitas_round_2_ciga_workbook = openpyxl.load_workbook("local_data/ha_data/Unitas second round CIGA checks.xlsx")
    ciga_round_2 = _worksheet_to_dataframe(unitas_round_2_ciga_workbook["Worksheet"])

    # We merge the ciga sheet to the asset list
    ciga_dependent_asset_list = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].str.contains("subject to ciga")
    ].copy()
    ciga_round_2_matched = ciga_dependent_asset_list.merge(
        ciga_round_2, how="inner", on=["Address Line 1", "Post Code"]
    )
    # Filter on just the properties that had no guarantee
    ciga_round_2_matched = ciga_round_2_matched[ciga_round_2_matched["Guarantee"] == "No"]

    # ECO Eligibility
    # not eligible              9227
    # failed ciga               2711
    # eco4 (subject to ciga)    2238
    # eco4 - passed ciga         901
    # gbis                       114
    # eco4                        91

    # We filter on the properties we're looking to re-survey
    unitas_properties_to_survey = unitas_asset_list[
        unitas_asset_list["ECO Eligibility"].isin(
            [
                "eco4 - passed ciga",
                "eco4"
            ]
        )
    ].copy()
    unitas_properties_to_survey = pd.concat(
        [
            unitas_properties_to_survey,
            ciga_round_2_matched[unitas_properties_to_survey.columns]
        ]
    )

    # NOTE(review): this credential is committed to source control - it should be rotated
    # and loaded from configuration instead of being hard-coded here.
    epc_api_key = "a2Nvbm5rb3dsZXNzYXJAZ21haWwuY29tOjY5MGJiMWM0NmIyOGI5ZDUxYzAxMzQzYzNiZGNlZGJjZDNmODQwMzA="

    # We now retrieve the latest EPC data
    epc_data = []
    for _, unitas_property in tqdm(unitas_properties_to_survey.iterrows(), total=len(unitas_properties_to_survey)):
        property_type, _ = get_property_type_and_built_form(property_meta=unitas_property, ha_name="HA50")
        full_address = unitas_property["matching_address"]

        searcher = SearchEpc(
            address1=str(unitas_property["HouseNo"]),
            postcode=unitas_property["matching_postcode"],
            auth_token=epc_api_key,
            os_api_key="",
            property_type=property_type,
            full_address=full_address,
            fast=True
        )
        # Force the skipping of estimating the EPC
        searcher.ordnance_survey_client.property_type = None
        searcher.ordnance_survey_client.built_form = None
        searcher.find_property(skip_os=True)

        if searcher.newest_epc is None:
            continue

        epc_data.append(
            {
                "asset_list_row_id": unitas_property["asset_list_row_id"],
                **searcher.newest_epc
            }
        )

    # Pull out just the columns we need. Passing the columns to the constructor keeps the
    # frame well-formed even when no EPC was found for any property.
    epc_df = pd.DataFrame(
        epc_data,
        columns=[
            "asset_list_row_id",
            "address1", "postcode",
            "current-energy-efficiency",
            "current-energy-rating",
            "inspection-date",
            "transaction-type",
            "built-form"
        ]
    )
    epc_df["EPC Rating"] = (
        epc_df["current-energy-efficiency"].astype(str) +
        epc_df["current-energy-rating"].astype(str)
    )

    # Merge onto the Unitas data:
    epc_output_columns = ["EPC Rating", "inspection-date", "transaction-type", "built-form"]
    unitas_properties_to_survey_full = unitas_properties_to_survey.merge(
        epc_df[["asset_list_row_id"] + epc_output_columns],
        how="left",
        on="asset_list_row_id"
    )

    unitas_properties_to_survey_full["ECO Eligibility"] = unitas_properties_to_survey_full["ECO Eligibility"].replace(
        "eco4 (subject to ciga)", "eco4 - passed ciga, phase 2 check"
    )

    # Properties without an EPC get an explicit marker rather than a blank cell
    for col in epc_output_columns:
        unitas_properties_to_survey_full[col] = (
            unitas_properties_to_survey_full[col].fillna("No EPC found").astype(str)
        )

    unitas_properties_to_survey_full = unitas_properties_to_survey_full.rename(
        columns={
            "inspection-date": "Last EPC Inspection Date",
            "transaction-type": "Last EPC Reason",
            "built-form": "Last EPC Built Form",
        }
    )

    # We now match to the survey outcomes
    unitas_survey_outcomes_workbook = openpyxl.load_workbook(
        "local_data/ha_data/UNITAS - survey outcomes 26.03.2024.xlsx"
    )
    # The column names of the OUTCOMES sheet are on row 2, the data starts on row 3
    unitas_outcomes = _worksheet_to_dataframe(unitas_survey_outcomes_workbook["OUTCOMES"], header_row=2)
    unitas_outcomes = unitas_outcomes.rename(
        columns={
            "Notes (If 'no answer' under outcomes, have you checked around the property for access "
            "issues where possible?)": "Notes"
        }
    )

    # Merge outcomes onto properties to survey. Will probably have to do algorithmically.
    # The normalised postcode is kept in a standalone Series, aligned to the frame being
    # filtered, so the frame itself (and therefore the exported sheet) gains no helper column.
    survey_postcodes_nospace = (
        unitas_properties_to_survey_full["matching_postcode"].str.lower().str.replace(" ", "")
    )
    # NOTE(review): outcome_matching is collected but not yet merged into the output -
    # presumably the next step of this work. For now the loop only validates that no
    # outcome matches more than one property.
    outcome_matching = []
    for _, outcome in tqdm(unitas_outcomes.iterrows(), total=len(unitas_outcomes)):
        # Blank spreadsheet rows carry no postcode and cannot be matched
        if not isinstance(outcome["Postcode"], str):
            continue

        # We search for the corresponding entry in the properties to survey
        postcode_lower = outcome["Postcode"].lower().strip().replace(" ", "")
        df = unitas_properties_to_survey_full[
            survey_postcodes_nospace.str.contains(postcode_lower, regex=False, na=False)
        ]
        df = df[df["HouseNo"] == str(outcome["No."])]
        if df.empty:
            continue

        if df.shape[0] == 1:
            outcome_matching.append(
                {
                    "asset_list_row_id": df["asset_list_row_id"].values[0],
                    **outcome.to_dict()
                }
            )
            continue

        raise Exception("something went wrong")

    # Store as an excel
    unitas_properties_to_survey_full.to_excel("Unitas - phase 2 properties to Survey.xlsx")
def app():
"""
This app contains the housin association analysis for HAs 1, 6, 14, 39 and 107.
@ -6907,81 +7240,3 @@ def app():
december_figures["ECO4 remaining"]
)
december_figures["ECO4 remaining"].sum()
# Adhoc - for UNITAS, stripping out additional surveys that have been completed
unitas_data = loader.data["HA50"].copy()
unitas_asset_list = unitas_data["asset_list"].copy()
unitas_survey_sheet = unitas_data["survey_list"].copy()
# We remove the surveyed properties from the asset sheet
unitas_survey_sheet = unitas_survey_sheet[~pd.isnull(unitas_survey_sheet["asset_list_row_id"])]
unitas_asset_list = unitas_asset_list.merge(
unitas_survey_sheet[["asset_list_row_id", "installation_status"]],
how="left",
on="asset_list_row_id"
)
unitas_asset_list = unitas_asset_list[pd.isnull(unitas_asset_list["installation_status"])]
unitas_asset_list = unitas_asset_list.drop(columns=["installation_status"])
# We read in the data for the further completed surveys
unitas_phase_1_workbook = openpyxl.load_workbook(
"local_data/ha_data/UNITAS ( STOKE) MASTER ROLLING SHEET UPDATED 8.4.24 K - no password.xlsx"
)
phase_1_worksheet = unitas_phase_1_workbook["ECO 4 - PHASE 1"]
phase_2_worksheet = unitas_phase_1_workbook["ECO4 - PHASE 2"]
phase1_colnames = [cell.value for cell in phase_1_worksheet[1]]
phase_1_rows_data = []
for row in phase_1_worksheet.iter_rows(min_row=2, values_only=False):
row_data = [cell.value for cell in row] # This will get you the cell values
phase_1_rows_data.append(row_data)
phase_1_surveys = pd.DataFrame(phase_1_rows_data, columns=phase1_colnames)
# Correct phase 1 surveys in the same fashion as the previous approach
phase_1_surveys = DataLoader.correct_ha50_survey_list(phase_1_surveys.copy())
# We check all phase 1 surveys are contained in the data we had before
additional = []
for _, row in tqdm(phase_1_surveys.iterrows(), total=len(phase_1_surveys)):
# We look for the entry in the old survey sheet:
# matched_uprn = unitas_survey_sheet[unitas_survey_sheet["EPR UPRN NUMBER"] == row["UPRN"]]
# if matched_uprn.shape[0] == 1:
# continue
matched_1 = unitas_survey_sheet[
(unitas_survey_sheet["Post Code"] == row["Post Code"]) &
(unitas_survey_sheet["NO."] == row["NO."])
]
if matched_1.shape[0] == 1:
continue
matched_2 = unitas_survey_sheet[
(unitas_survey_sheet["Street / Block Name"] == row["Street / Block Name"]) &
(unitas_survey_sheet["NO."] == row["NO."])
]
if matched_2.shape[0] == 1:
continue
additional.append(row.to_dict())
additional = pd.DataFrame(additional)
phase_2_rows_data = []
for row in phase_2_worksheet.iter_rows(min_row=2, values_only=False):
row_data = [cell.value for cell in row] # This will get you the cell values
phase_2_rows_data.append(row_data)
phase2_colnames = [cell.value for cell in phase_2_worksheet[1]]
phase_2_surveys = pd.DataFrame(phase_2_rows_data, columns=phase2_colnames)
# Drop all of the occurances of "OFFICE USE ONLY" columns
phase_2_surveys = phase_2_surveys.drop(columns=[c for c in phase_2_surveys.columns if "OFFICE USE ONLY" in c])
common_columns = list({c for c in phase_2_surveys.columns if c in additional.columns})
additional_filtered = additional[common_columns]
further_unitas_completed_surveys = pd.concat(
[phase_2_surveys, additional_filtered],
axis=0,
ignore_index=True
)
# We match these back to the asset list