Model/etl/customers/mhs/prepare_data.py
2025-05-12 15:58:41 +01:00

60 lines
2 KiB
Python

"""
The data held on file for MHS is fairly incomplete, where not every single property has an observation
"""
from tqdm import tqdm
import pandas as pd
from docutils.utils.math.tex2mathml_extern import blahtexml
asset_list = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS HOMES (Full Asset List) - for analysis.xlsx",
sheet_name="Data"
)
# When this list was checked, an observation was made per postcode, and so we need to extrapolate those findings
inspections_observatons = asset_list[["UPRN", "Postcode", "ManagementGroup", "WFT Findings"]].copy()
populated = []
for _, group in tqdm(inspections_observatons.groupby("Postcode"),
total=len(inspections_observatons.groupby("Postcode"))):
if all(pd.isnull(group["WFT Findings"])):
group["WFT Findings"] = "Property not inspected"
populated.append(group)
continue
fill_observation = group["WFT Findings"].values[0]
if pd.isnull(fill_observation):
group["WFT Findings"] = group["WFT Findings"].fillna("Property not inspected")
populated.append(group)
continue
group = group.reset_index(drop=True)
group_filled = []
for idx, x in group.iterrows():
if idx == 0:
group_filled.append(x)
continue
new_value = x["WFT Findings"]
if not pd.isnull(new_value):
fill_observation = new_value
x["WFT Findings"] = fill_observation
group_filled.append(x)
group_filled = pd.DataFrame(group_filled)
populated.append(group_filled)
populated = pd.concat(populated)
missed = populated[~populated["UPRN"].isin(asset_list["UPRN"].values)]
asset_list = asset_list.drop(columns=["WFT Findings"]).merge(
populated.drop(columns=["Postcode", "ManagementGroup"]), how="left", on="UPRN"
)
# Store the data
asset_list.to_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/MHS/MHS HOMES (Full Asset List) - for programme build.xlsx"
)