Model/etl/customers/Community Housing/reconciliation.py
2025-05-12 15:58:41 +01:00

708 lines
25 KiB
Python

"""
This script is used to reconcile the data from the Community Housing project, to understand the differences in
the various asset lists, and the work that has been conducted
"""
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from asset_list.AssetList import AssetList
from backend.SearchEpc import SearchEpc
# Data preparation
# Both outcome sheets live in the same workbook, so it is parsed once: passing a list of sheet names makes
# read_excel return a {sheet name: DataFrame} mapping instead of re-opening the file for each sheet.
_outcomes_dir = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme Reconciliation"
_outcome_sheets = pd.read_excel(
    os.path.join(_outcomes_dir, "Outcomes Community Housing.xlsx"),
    sheet_name=["Sheet1", "ECO4 + PV"],
)
outcomes_1 = _outcome_sheets["Sheet1"]
outcomes_2 = _outcome_sheets["ECO4 + PV"]
# Every row from the "ECO4 + PV" sheet is given a fixed funding type before the two sheets are stacked
outcomes_2["Type of Funding"] = "ECO4 Solar"
combined_outcomes = pd.concat([outcomes_1, outcomes_2], ignore_index=True)
# Columns are renamed by position, so the combined frame must have exactly these 11 columns
# (pandas raises a length-mismatch ValueError otherwise)
combined_outcomes.columns = [
    'Surveyor', 'Housing Association', 'No.', 'Address', 'Postcode', 'Outcome', 'Type of Funding', "Notes",
    'Previous letter sent Date:', 'Date Letter sent', 'Installer'
]
# Store the combined sheet next to the source workbook; it is read back later via outcomes_filename
combined_outcomes.to_excel(
    os.path.join(_outcomes_dir, "combined_outcomes.xlsx"),
)
################################################################################################
# Config for asset list standardisation
################################################################################################
# Where the landlord asset list workbook lives and which sheet to read
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme Reconciliation"
data_filename = "Community Housing - Original Asset List Copy for Reconciliation.xlsx"
sheet_name = "Assets"

# Address columns and the address handling options handed to AssetList
fulladdress_column = "Full Address"
postcode_column = 'Postcode'
address1_column = None
address_cols_to_concat = []
address1_method = "house_number_extraction"
missing_postcodes_method = None

# Landlord attribute columns handed to AssetList; None where no column is configured
landlord_property_id = "Asset_Reference"
landlord_os_uprn = None
landlord_year_built = "Build_Date"
landlord_property_type = "Asset_Type1"
landlord_built_form = "Asset_Classification"
landlord_wall_construction = None
landlord_roof_construction = None
landlord_heating_system = "Heat Source Static"
landlord_existing_pv = None
landlord_sap = None

# Outcomes workbook description for flag_outcomes: parallel lists, one entry per outcomes file
outcomes_filename = [os.path.join(data_folder, "combined_outcomes.xlsx")]
outcomes_sheetname = ["Sheet1"]
outcomes_address = ["Address"]
outcomes_postcode = ["Postcode"]
outcomes_houseno = ["No."]
outcomes_id = [None]

# Submission master file(s) for flag_survey_master
master_filepaths = [os.path.join(data_folder, "Submissions - for analysis.csv")]
master_to_asset_list_filepath = None

phase = False
# Pipe-separated landlord names passed to flag_ecosurv
ecosurv_landlords = "community community|community housing|mr community|david lindwood"
manual_uprn_map = {}
# Gather the constructor arguments in one mapping so the column configuration can be read at a glance,
# then build the asset list and run its two standardisation steps in order.
_asset_list_kwargs = {
    "local_filepath": os.path.join(data_folder, data_filename),
    "header": 0,
    "sheet_name": sheet_name,
    "address1_colname": address1_column,
    "postcode_colname": postcode_column,
    "landlord_property_id": landlord_property_id,
    "full_address_colname": fulladdress_column,
    "full_address_cols_to_concat": address_cols_to_concat,
    "missing_postcodes_method": missing_postcodes_method,
    "address1_extraction_method": address1_method,
    "landlord_year_built": landlord_year_built,
    "landlord_uprn": landlord_os_uprn,
    "landlord_property_type": landlord_property_type,
    "landlord_built_form": landlord_built_form,
    "landlord_wall_construction": landlord_wall_construction,
    "landlord_roof_construction": landlord_roof_construction,
    "landlord_heating_system": landlord_heating_system,
    "landlord_existing_pv": landlord_existing_pv,
    "landlord_sap": landlord_sap,
    "phase": phase,
}
asset_list = AssetList(**_asset_list_kwargs)
asset_list.init_standardise()
# NOTE: the method name is spelled this way on AssetList as called here
asset_list.apply_standardiation()
# We now flag properties that have been treated under existing programmes
asset_list.flag_outcomes(
    outcomes_filepaths=outcomes_filename,
    outcomes_sheetname=outcomes_sheetname,
    outcomes_address=outcomes_address,
    outcomes_postcode=outcomes_postcode,
    outcomes_houseno=outcomes_houseno,
    outcomes_id=outcomes_id
)
# Exactly one outcomes row is expected to be left without a domna_property_id, and it is fixed manually.
# Any other count means the hard-coded id below no longer applies, so we stop and report the count found.
_unmatched_outcomes = int(pd.isnull(asset_list.outcomes["domna_property_id"]).sum())
if _unmatched_outcomes != 1:
    raise ValueError(
        f"Expected exactly 1 outcome without a domna_property_id to fix manually, found {_unmatched_outcomes}"
    )
asset_list.outcomes["domna_property_id"] = asset_list.outcomes["domna_property_id"].fillna(
    "29walternashroadeastbirchencoppicekidderminsterdy117ea-caa3a8d92ea9"
)
asset_list.flag_survey_master(
    master_filepaths=master_filepaths,
    master_to_asset_list_filepath=master_to_asset_list_filepath
)
master_surveyed = asset_list.master_surveyed
# Raw funding scheme label -> reporting label. Labels missing from this mapping become NaN when mapped.
scheme_map = {
    "ECO4 A/W": "ECO4",
    'ECO4 GBIS': "GBIS",
    'ECO4 - REMEDIAL CWI ONLY': "ECO4 Remedial",
    'ECO4 - Remedial CWI Only': "ECO4 Remedial",
    "ECO4 GBIS REMEDIAL": "GBIS Remedial",
    'ECO4 GBIS Remedial': "GBIS Remedial",
}
# These two columns are written in place, i.e. on the frame held by asset_list.master_surveyed
master_surveyed["funding_scheme"] = master_surveyed["funding_scheme"].map(scheme_map)
_scheme_prefix = master_surveyed["funding_scheme"] + ": "
master_surveyed["survey_reference"] = _scheme_prefix + master_surveyed["measure_mix"]
# Bring the domna_property_id across from the standardised asset list via the landlord's own id
_property_id_lookup = asset_list.standardised_asset_list[["domna_property_id", "landlord_property_id"]]
master_surveyed = pd.merge(
    master_surveyed,
    _property_id_lookup,
    how="left",
    on="landlord_property_id",
)
# Every surveyed property must resolve to a domna_property_id
if master_surveyed["domna_property_id"].isna().any():
    raise ValueError("Some of the master surveyed properties do not have a domna_property_id")
# Flag anything in outcomes that has been listed as surveyed, that is NOT in the master_surveyed sheet
# NOTE(review): only the exact value "surveyed" is matched. If differently-cased or differently-spelled
# variants exist in the Outcome column they are not picked up - confirm against the workbook.
surveyed_outcomes = asset_list.outcomes[
    asset_list.outcomes["Outcome"].isin(["surveyed"])
]
# .copy() makes this an independent frame, so the column assignment below does not write to a filtered
# slice of asset_list.outcomes (which triggers pandas' SettingWithCopyWarning)
outcomes_not_in_master = surveyed_outcomes[
    ~surveyed_outcomes["domna_property_id"].isin(master_surveyed["domna_property_id"])
].copy()
outcomes_not_in_master["Type of Funding"] = outcomes_not_in_master["Type of Funding"].fillna(
    "Work Type Not Filled In"
)
# Landlord names handed to flag_ecosurv as ones to ignore
_landlords_to_ignore = [
    "Watford Community housing",
    "Eastlight Community housing",
    "Mr Tower Hamlets Community Housing",
]
asset_list.flag_ecosurv(ecosurv_landlords=ecosurv_landlords, landlords_to_ignore=_landlords_to_ignore)
# These are properties NOT on the Community Housing asset list that were sold under the wrong HA
# asset_list.ecosurv_no_match.to_csv(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Community Housing/Programme "
# "Reconciliation/Ecosurv - properties sold to Community Housing, not belonging to them.csv",
# index=False
# )
# We read in the works, split by sold to SGEC and on-hold
_deck_of_works_dir = os.path.join(data_folder, "Community Housing Deck of works")
billed_to_installer = pd.read_csv(os.path.join(_deck_of_works_dir, "SGEC BILLED -Table 1.csv"))
billed_to_installer["billed"] = True
not_billed_to_installer = pd.read_csv(os.path.join(_deck_of_works_dir, "ON HOLD -Table 1.csv"))
not_billed_to_installer["billed"] = False
# Stack both tables and give each row a stable id that the lookup built later can be merged back on
sgec_billings = pd.concat(
    [billed_to_installer, not_billed_to_installer],
)
sgec_billings = sgec_billings.reset_index(drop=True)
sgec_billings["row_id"] = sgec_billings.index
# We match these two lists back to the domna_property_id. They SHOULD match to submissions
# The deck headers are detected from the billed table; each falls back to the alternative spelling
scheme_col = (
    "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if
    "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in billed_to_installer.columns else "AFFORDABLE WARMTH"
)
postcode_col = "POSTCODE" if "POSTCODE" in billed_to_installer.columns else "Post Code"
house_no_col = 'NO.' if 'NO.' in billed_to_installer.columns else "NO"
# Both branches of the previous conditional produced this same header, so it is a plain constant
property_type_col = "PROPERTY TYPE As per table emailed"
measure_mix_col = "MEASURE COMBO"
# Manual corrections for deck rows, keyed by "<house number>+<postcode>" and giving the landlord property id
manual_corrections = {
    "30+DY12 1EB": "73440300",
    "32+DY12 1EB": "73440320",
    "1+DY11 7ES": "20150010",
    "12+DY11 7EP": "9460120",
    "72+DY11 7PA": "88520720",
    "39+DY13 0DR": "44250390",
    "43+DY11 7EF": "2460430",
    "45+DY11 7EG": "2460450",
    "47+DY11 7EG": "2460470",
    "49+DY11 7EG": "2460490",
    "11+DY13 0HB": "87320110",
    "4+DY130HA": "87320040"
}
# Resolve every deck-of-works row to a domna_property_id, trying in order:
#   1) the manual corrections, 2) house number + postcode in the master surveyed list,
#   3) normalised postcode + extracted house number in the standardised asset list.
# A row that cannot be resolved to exactly one property stops the script.
billed_lookup = []
for _, row in tqdm(sgec_billings.iterrows(), total=len(sgec_billings)):
    # Read the row through the detected header names so all three lookups use the same columns
    postcode = row[postcode_col]
    houseno = row[house_no_col]
    # str() keeps the key buildable when the house number was parsed as a number rather than text
    lookup_key = "+".join([str(houseno), str(postcode)])
    # We need to correct some records
    landlord_pid = manual_corrections.get(lookup_key)
    if landlord_pid:
        df = asset_list.standardised_asset_list[
            (asset_list.standardised_asset_list["landlord_property_id"] == landlord_pid)
        ]
        if df.shape[0] != 1:
            raise ValueError(
                f"Expected exactly one match for {landlord_pid} in the standardised asset list, "
                f"found {df.shape[0]}"
            )
        billed_lookup.append(
            {
                "domna_property_id": df["domna_property_id"].values[0],
                "row_id": row["row_id"],
            }
        )
        continue
    df = master_surveyed[
        (master_surveyed["original_house_no"] == houseno) &
        (master_surveyed["original_postcode"] == postcode)
    ]
    if df.shape[0] != 1:
        # Try a search on the asset list
        postcode_no_space = postcode.strip().replace(" ", "").lower()
        df = asset_list.standardised_asset_list[
            (
                asset_list.standardised_asset_list[asset_list.STANDARD_POSTCODE]
                .str.strip().str.lower().str.replace(" ", "") == postcode_no_space
            )
        ].copy()
        # A float house number (e.g. 12.0) is reduced to its integer text before comparing
        if isinstance(houseno, float):
            house_no = str(int(houseno)).lower()
        else:
            house_no = str(houseno).lower()
        # Only extract house numbers when the postcode produced candidates; a row-wise apply on an
        # empty frame does not yield a usable column
        if not df.empty:
            df["house_no"] = df.apply(
                lambda x: SearchEpc.get_house_number(
                    str(x[asset_list.STANDARD_ADDRESS_1]), str(x[asset_list.STANDARD_POSTCODE])
                ),
                axis=1
            )
            df = df[df["house_no"].str.lower() == house_no].copy()
        if df.shape[0] != 1:
            raise ValueError(
                f"Expected exactly one match for {lookup_key} in the master surveyed list or the asset list, "
                f"found {df.shape[0]}"
            )
    billed_lookup.append(
        {
            "domna_property_id": df["domna_property_id"].values[0],
            "row_id": row["row_id"],
        }
    )
billed_lookup = pd.DataFrame(billed_lookup)
# Attach the resolved ids to the deck rows
sgec_billings = sgec_billings.merge(
    billed_lookup,
    how="left",
    on="row_id"
)
# We get the asset list that Community Housing thinks they sent Warmfront
_warmfront_path = os.path.join(data_folder, "Warmfront.xlsx")
master_data_sheet = pd.read_excel(_warmfront_path, sheet_name="Asset Stock List (3)")
master_data_sheet["Asset_Reference"] = master_data_sheet["Asset_Reference"].astype(str)
# 1) We check that all of the properties in the asset list we have on file are in the asset list that Community Housing
# believe they sent Warmfront
_warmfront_references = master_data_sheet["Asset_Reference"].astype(str).values
_on_file_only = asset_list.standardised_asset_list[
    ~asset_list.standardised_asset_list["landlord_property_id"].isin(_warmfront_references)
]
if not _on_file_only.empty:
    raise ValueError("Some of the properties in the asset list are not in the Warmfront asset list")
# This column documents whether or not the property is in the asset list that the WFT were sent
# There are 189 properties that were never sent to WFT, but all properties are accounted for in the asset list
_on_file_references = asset_list.standardised_asset_list["landlord_property_id"].astype(str).values
master_data_sheet["Is Property in WFT Asset List"] = (
    master_data_sheet["Asset_Reference"].astype(str).isin(_on_file_references)
)
# We now merge on the Warmfront findings; rows with no counterpart on file get an explicit label
_eligibility_col = "non-intrusives: ECO Eligibility"
master_data_sheet = pd.merge(
    master_data_sheet,
    asset_list.standardised_asset_list[["landlord_property_id", _eligibility_col]],
    how="left",
    left_on="Asset_Reference",
    right_on="landlord_property_id"
)
master_data_sheet[_eligibility_col] = master_data_sheet[_eligibility_col].fillna("Not in original asset list")
# SGEC did a number of CIGA checks. We match these onto the master data sheet
# TODO: Need to split the programme into historical 2023 and 2024 (there was a cutoff date in late 2024 which seemed
# to be the start of the new programme)
# Seems like there were 2 main checks - it also seems like this was a 2 phase programme, where these CIGA checks
# correspond to phase 2


def _load_ciga_checks(filename, request_label):
    """Read one CIGA check workbook, keep the rows that have a Postcode and tag them with their request batch.

    :param filename: workbook name inside the "CIGA Checks" folder under data_folder
    :param request_label: text stored in the "request" column for every kept row
    :return: DataFrame of the kept rows with the added "request" column
    """
    checks = pd.read_excel(
        os.path.join(data_folder, "CIGA Checks", filename),
        sheet_name="Worksheet"
    )
    # .copy() so the label is written to an independent frame rather than to a filtered slice
    checks = checks[~pd.isnull(checks["Postcode"])].copy()
    checks["request"] = request_label
    return checks


ciga_checks_1 = _load_ciga_checks("2 CIGA Check WFT 14102024 x1073.xlsx", "1073 properties")
ciga_checks_2 = _load_ciga_checks("2 CIGA Check 01112024 x125.xlsx", "125 flats")
cigas = pd.concat([ciga_checks_1, ciga_checks_2], ignore_index=True)
# Stable row id so the lookup built later can be merged back onto these rows
cigas["row_id"] = cigas.index
# We add some temp columns to allow for easier matching
asset_list.standardised_asset_list["house_no"] = asset_list.standardised_asset_list.apply(
    lambda x: SearchEpc.get_house_number(
        str(x["domna_full_address"]), str(x["domna_postcode"])
    ),
    axis=1
)
# Manual matches for CIGA rows, keyed by the row's "Matched Address" and giving the landlord property id
manual_fixes = {
    "2 Austcliffe Road Cookley, Kidderminster": "2250020",
    '5 Brett Young Close, Kidderminster': "9800050"
}
# CIGA rows to skip entirely; compared against "Matched Address" and against "<Address1> <Address2>"
incorrect_ciga_return = [
    "19 Wood Street, Kidderminster",
    "nan Charles Street",
    "53 Harold Evers Way, Kidderminster",
    '63 Harold Evers way'
]
# Lower-cased, comma-free full addresses. Built once because the asset list is not modified inside the loop.
_normalised_full_address = (
    asset_list.standardised_asset_list["domna_full_address"].str.lower().str.replace(",", "")
)
ciga_lookup = []
for _, row in tqdm(cigas.iterrows(), total=len(cigas)):
    if manual_fixes.get(row["Matched Address"]):
        ll_pid = manual_fixes[row["Matched Address"]]
        df = asset_list.standardised_asset_list[
            (asset_list.standardised_asset_list["landlord_property_id"] == ll_pid)
        ]
        if df.empty:
            raise Exception(f"Manual fix {ll_pid} for {row['Matched Address']} was not found in the asset list")
        ciga_lookup.append(
            {
                "domna_property_id": df["domna_property_id"].values[0],
                "row_id": row["row_id"],
            }
        )
        continue
    # str() on both parts so a missing Address1/Address2 cannot break the join
    if (row["Matched Address"] in incorrect_ciga_return) or (
        " ".join([str(row["Address1"]), str(row["Address2"])]) in incorrect_ciga_return
    ):
        continue
    house_number = str(row["Address1"])
    # First attempt: the row's own postcode plus house number
    df = asset_list.standardised_asset_list[
        (asset_list.standardised_asset_list["domna_postcode"] == row["Postcode"])
    ]
    df = df[(df["house_no"].astype(str) == house_number)]
    if df.empty:
        # Second attempt: the postcode CIGA matched to, plus house number
        df = asset_list.standardised_asset_list[
            (asset_list.standardised_asset_list["domna_postcode"] == row["Matched Postcode"])
        ]
        df = df[(df["house_no"].astype(str) == house_number)]
    if df.shape[0] > 1:
        # Several properties share postcode and house number: narrow by the matched address text.
        # regex=False because the address is literal text, not a pattern.
        df = asset_list.standardised_asset_list[
            _normalised_full_address.str.contains(
                row["Matched Address"].lower().replace(",", ""), na=False, regex=False
            )
        ]
    # NOTE(review): this fallback runs whenever no candidate remains at this point, including when neither
    # postcode produced a match - confirm that is the intended scope.
    if df.empty:
        df = asset_list.standardised_asset_list[
            _normalised_full_address.str.contains(
                row["Address2"].lower().replace(",", ""), na=False, regex=False
            )
        ]
        df = df[(df["house_no"].astype(str) == house_number)]
    if df.shape[0] != 1:
        raise Exception(
            f"Expected exactly one match for {row['Address1']} in the asset list, found {df.shape[0]}"
        )
    ciga_lookup.append(
        {
            "domna_property_id": df["domna_property_id"].values[0],
            "row_id": row["row_id"],
        }
    )
ciga_lookup = pd.DataFrame(ciga_lookup)
cigas = cigas.merge(
    ciga_lookup,
    how="left",
    on="row_id"
)
# Skipped rows have no id; drop them, then pick up the landlord's own id for the remaining rows
cigas = cigas[~pd.isnull(cigas["domna_property_id"])]
cigas = cigas.merge(
    asset_list.standardised_asset_list[["domna_property_id", "landlord_property_id"]],
    how="left",
    on="domna_property_id"
)
# Note 4 entries in the CIGA checks did NOT match to the asset list (were for properties not owned by Community Housing)
master_data_sheet = master_data_sheet.merge(
    cigas[["landlord_property_id", "Guarantee", "request"]].rename(
        columns={"request": "CIGA request batch"}
    ),
    how="left",
    on="landlord_property_id"
)
# Fill missing survey_reference with funding_scheme
master_surveyed["survey_reference"] = master_surveyed["survey_reference"].fillna(
    master_surveyed["funding_scheme"]
)
# Surveyed properties that do not appear on the SGEC deck of works.
# .copy() so the status column is written to an independent frame rather than to a filtered slice.
master_surveyed_to_merge = master_surveyed[
    ~master_surveyed["domna_property_id"].isin(sgec_billings["domna_property_id"].values)
].copy()
master_surveyed_to_merge["Survey Status"] = "Surveyed, Submitted, not on SGEC Deck of Works"
# We now merge on what we've surveyed and submitted
master_data_sheet = master_data_sheet.merge(
    master_surveyed_to_merge[
        ["landlord_property_id", "survey_reference", "submission_date", "cancelled", "Survey Status"]
    ].rename(
        columns={
            "survey_reference": "Survey Type", "submission_date": "Survey Date",
            "cancelled": "Was the Install Cancelled?"
        }
    ),
    how="left",
    on="landlord_property_id"
)
# We now deduce the status of the work based on sgec_billings
sgec_billings = pd.merge(
    sgec_billings,
    asset_list.standardised_asset_list[["landlord_property_id", "domna_property_id"]],
    how="left",
    on="domna_property_id"
)
dupe_ids = sgec_billings.loc[sgec_billings["domna_property_id"].duplicated(), "domna_property_id"]
# One row per property: sorting puts billed=True ahead of billed=False, then the first row is kept
sgec_billings = sgec_billings.sort_values(
    by=["domna_property_id", "billed"], ascending=[True, False]
).drop_duplicates(subset=["domna_property_id"], keep="first")
# "<scheme>: <measure combo>", falling back to the scheme alone when the combined text is missing
_billing_scheme = sgec_billings["SUBMISSION TYPE - ECO4,GBIS,SHDF,EPC or OTHER"].map(scheme_map)
sgec_billings["Survey Type"] = (_billing_scheme + ": " + sgec_billings["MEASURE COMBO"]).fillna(_billing_scheme)
sgec_billings["Survey Date"] = sgec_billings['SUBMISSION DATE']
_installed_text = sgec_billings["INSTALLED"].astype(str).str.lower()
sgec_billings["Was the Install Cancelled?"] = _installed_text.str.contains("cancel")
sgec_billings['Survey Status'] = np.where(
    sgec_billings["billed"].eq(True),
    "Surveyed, Submitted, on SGEC Deck of Works",
    "Surveyed, not submitted to SGEC, on SGEC Deck of Works"
)
_survey_columns = ["Survey Type", "Survey Date", "Was the Install Cancelled?", "Survey Status"]
master_data_sheet = master_data_sheet.merge(
    sgec_billings[["landlord_property_id"] + _survey_columns],
    how="left",
    on="landlord_property_id",
    suffixes=("", "_y")
)
# Existing values win; the deck value is only used where the sheet has nothing. The temporary
# "_y" column is removed straight away so later merges can reuse the same suffix.
for col in _survey_columns:
    deck_col = col + "_y"
    use_deck_value = master_data_sheet[col].isna() & master_data_sheet[deck_col].notna()
    master_data_sheet[col] = np.where(use_deck_value, master_data_sheet[deck_col], master_data_sheet[col])
    master_data_sheet = master_data_sheet.drop(columns=[deck_col])
# Pick up the landlord's own id for the surveyed outcomes that are missing from the master list
outcomes_not_in_master = pd.merge(
    outcomes_not_in_master,
    asset_list.standardised_asset_list[["landlord_property_id", "domna_property_id"]],
    how="left",
    on="domna_property_id"
)
# We also filter out any that were in the SGEC billings
_already_on_deck = outcomes_not_in_master["domna_property_id"].isin(sgec_billings["domna_property_id"].values)
outcomes_not_in_master = outcomes_not_in_master[~_already_on_deck]
# We now merge on outcomes. There are a small number of surveyed outcomes that were not submitted
master_data_sheet = master_data_sheet.merge(
    outcomes_not_in_master[["landlord_property_id", 'Type of Funding', "Date Letter sent"]],
    how="left",
    on="landlord_property_id",
)
# Rows with no survey type so far but with an outcomes funding type take their status and type from outcomes.
# The mask is taken before "Survey Type" is filled so both columns use the same rows.
_outcome_only = master_data_sheet["Survey Type"].isna() & master_data_sheet["Type of Funding"].notna()
master_data_sheet["Survey Status"] = np.where(
    _outcome_only, "Surveyed, On Outcomes, not submitted", master_data_sheet["Survey Status"]
)
master_data_sheet["Survey Type"] = np.where(
    _outcome_only, master_data_sheet["Type of Funding"], master_data_sheet["Survey Type"]
)
# The letter date stands in for the survey date wherever no survey date is known
_letter_date_only = master_data_sheet["Survey Date"].isna() & master_data_sheet["Date Letter sent"].notna()
master_data_sheet["Survey Date"] = np.where(
    _letter_date_only, master_data_sheet["Date Letter sent"], master_data_sheet["Survey Date"]
)
master_data_sheet = master_data_sheet.drop(columns=["Type of Funding", "Date Letter sent"])
# We now need to compare the submissions that SGEC have sent us, because the deck of works is likely incorrect
# given the number of properties that have been received by SGEC
# The workbooks are named for the following dates:
# - 18/11/2024
# - 10/03/2025
# - 25/11/2024 (a sheet that claims this date but has 18/11/2024 as the submission date)
# - 16/12/2024
# - 02/12/2024
# - 10/02/2025
_received_filenames = [
    "4x108 18.11.24 - RT MASTERS SGEC INVOICE.xlsx",
    "4x144 COMMUNITY HOUSING TOTAL PROJECT INV 10032025.xlsx",
    "4x19 25.11.2024 - RT Master SGEC.xlsx",
    "4x37 16.12.2024 - SGEC INVOICED.xlsx",
    "4x60 02.12.2024 - RT SGEC INV.xlsx",
    "4x78 10.02.2025 MASTERS - SGEC INVOICED-CORRECT.xlsx"
]
# Each workbook is read and tagged with its source filename, then all are stacked under a fresh 0..n-1 index
_received_frames = [
    pd.read_excel(os.path.join(data_folder, "SGEC Received Submissions", received_name)).assign(
        filename=received_name
    )
    for received_name in _received_filenames
]
sgec_received_submissions = pd.concat(_received_frames, ignore_index=True)
sgec_received_submissions["row_id"] = sgec_received_submissions.index
# Manual matches for received submissions, keyed by "<house number>+<postcode>" -> landlord property id
manual_fix = {
    "5a+DY10 3JR": "6856005A",
    '12+DY10 3JR': "78900120",
    "9+DY10 3JR": "86280090",
    '10+DY10 3JL': "86280100",
    "66+DY10 3JS": "68560660",
    "70+DY10 3JS": "68560700",
    "72+DY10 3JS": "68560720",
    "12+DY10 3JP": "86280120",
    "2A+DY11 5TZ": "6872002A",
    "3A+DY11 5TZ": "6872003A",
    "4A+DY11 5TZ": "6872004A"
}


def _resolve_received_submission(row):
    """Return the landlord property id for one received-submission row.

    Tries the manual fixes first, then the SGEC billings, then the master surveyed list.
    Raises if either list yields more than one match, or if nothing matches at all.
    """
    house_no = str(row["NO."])
    key = "+".join([house_no, str(row["Post Code"])])
    manual_pid = manual_fix.get(key)
    if manual_pid is not None:
        return manual_pid
    on_deck = sgec_billings[
        (sgec_billings['NO.'].astype(str) == house_no) &
        (sgec_billings['Post Code'] == row['Post Code'])
    ]
    deck_hits = on_deck.shape[0]
    if deck_hits > 1:
        raise Exception(f"something went wrong {key} {row['Street / Block Name']}")
    if deck_hits == 1:
        return on_deck["landlord_property_id"].values[0]
    surveyed = master_surveyed[
        (master_surveyed['original_house_no'].astype(str) == house_no) &
        (master_surveyed['original_postcode'] == row['Post Code'])
    ]
    surveyed_hits = surveyed.shape[0]
    if surveyed_hits > 1:
        raise Exception(f"something went wrong 2 {key} {row['Street / Block Name']}")
    if surveyed_hits == 0:
        raise Exception(f"No match {key} {row['Street / Block Name']}")
    return surveyed["landlord_property_id"].values[0]


sgec_received_submissions_lookup = []
for _, received_row in tqdm(sgec_received_submissions.iterrows(), total=len(sgec_received_submissions)):
    sgec_received_submissions_lookup.append(
        {
            "row_id": received_row["row_id"],
            "landlord_property_id": _resolve_received_submission(received_row),
        }
    )
sgec_received_submissions_lookup = pd.DataFrame(sgec_received_submissions_lookup)
# Attach the resolved landlord ids to the received submissions
sgec_received_submissions = pd.merge(
    sgec_received_submissions,
    sgec_received_submissions_lookup[["row_id", "landlord_property_id"]],
    how="left",
    on="row_id"
)
# "<scheme>: <measure combo>", falling back to the scheme alone when the combined text is missing
_received_scheme = sgec_received_submissions["SUBMISSION TYPE - ECO4,GBIS,SHDF,EPC or OTHER"].map(scheme_map)
sgec_received_submissions["Survey Type"] = (
    _received_scheme + ": " + sgec_received_submissions["MEASURE COMBO"]
).fillna(_received_scheme)
sgec_received_submissions["Survey Date"] = sgec_received_submissions['SUBMISSION DATE']
_received_installed = sgec_received_submissions["INSTALLED"].astype(str).str.lower()
sgec_received_submissions["Was the Install Cancelled?"] = _received_installed.str.contains("cancel")
sgec_received_submissions['Survey Status'] = "Submission sent to SGEC, Confirmed by SGEC"
sgec_received_submissions["Survey Received by SGEC"] = True
# We now merge on the submissions that SGEC have sent us
_gap_columns = ["Survey Type", "Survey Date", "Was the Install Cancelled?", "Survey Status"]
master_data_sheet = master_data_sheet.merge(
    sgec_received_submissions[["landlord_property_id"] + _gap_columns + ["Survey Received by SGEC"]],
    how="left",
    on="landlord_property_id",
    suffixes=("", "_y")
)
# Fill in the gaps: values already on the sheet are kept, the received value is used only where the
# sheet is empty, and each temporary "_y" column is dropped once consumed
for col in _gap_columns:
    received_col = col + "_y"
    use_received = master_data_sheet[col].isna() & master_data_sheet[received_col].notna()
    master_data_sheet[col] = np.where(use_received, master_data_sheet[received_col], master_data_sheet[col])
    master_data_sheet = master_data_sheet.drop(columns=[received_col])
# The merges above must not have multiplied any asset row
if master_data_sheet["Asset_Reference"].duplicated().any():
    raise ValueError("There are duplicates in the asset reference column")
# Drop this at the end
master_data_sheet = master_data_sheet.drop(columns=["landlord_property_id"])
master_data_sheet.to_excel(os.path.join(data_folder, "Draft Results.xlsx"))