# Model/etl/customers/bromford/data_cleanup.py

"""
12th April 2025
This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a
standardised asset list
"""

import pandas as pd

# Step 1
# The inspections data is spread across three different files. We attempt to produce one finalised asset list, with
# comprehensive inspections

# Primary asset list, taken from the "Asset List" sheet of the Bromford asset workbook
asset_list_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford Asset "
    "List.xlsx"
)
asset_list = pd.read_excel(asset_list_path, sheet_name="Asset List")

# First inspections file: the "Data list" sheet of the BROMFORD MDS workbook
inspections_1 = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
    "MDS.xlsx",
    sheet_name="Data list"
)
# Fold the heating fuel into the heating type description. Adding string columns with `+` yields NaN whenever either
# side is missing, which would throw away a known heating type just because the fuel cell is blank. Missing parts are
# therefore treated as empty text (the strip removes the dangling separator), and the result is only left missing
# when both parts are missing.
heating_type_missing = inspections_1["Heating Type"].isna()
heating_fuel_missing = inspections_1["Heating fuel"].isna()
heating_description = (
    inspections_1["Heating Type"].fillna("") + " " + inspections_1["Heating fuel"].fillna("")
).str.strip()
inspections_1["Heating Type"] = heating_description.mask(heating_type_missing & heating_fuel_missing)

# Second inspections file: the "Report" sheet of the BROMFORD MERLIN LANE workbook
merlin_lane_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
    "MERLIN LANE.xlsx"
)
inspections_2 = pd.read_excel(merlin_lane_path, sheet_name="Report")
# PropertyType is split on single spaces: the last word becomes AssetTypeDesc and the words before it, re-joined with
# spaces, become PropTypeDesc
property_type_words = inspections_2["PropertyType"].str.split(" ")
inspections_2["AssetTypeDesc"] = property_type_words.str[-1]
inspections_2["PropTypeDesc"] = property_type_words.str[:-1].str.join(" ")

# Third inspections file: the "Asset report" sheet of the BROMFORD SEVERN VALE - KLARKE workbook
severn_vale_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
    "SEVERN VALE - KLARKE.xlsx"
)
inspections_3 = pd.read_excel(severn_vale_path, sheet_name="Asset report")

# Build a single address column from the two address parts, matching the FullAddress column used for the other files.
# NOTE(review): string concatenation propagates missing values, so a row with either address part missing ends up
# with a missing FullAddress - confirm this is acceptable.
address_line_1 = inspections_3["T1_Address1"]
address_line_2 = inspections_3["T1_Address2"]
inspections_3["FullAddress"] = address_line_1 + ", " + address_line_2

# On inspections 3, we have multiple sheets which describe the heating: one sheet per heating system. The UPRN column
# of each sheet is tagged with the sheet name as its heating type.
heating_sheet_names = [
    "Storage Heaters", "No Heating", "Underfloor Heating", "Rointe Electric Heating", "Air Source Heating",
    "Gas Central Heating", "Electric Boiler", "Oil Fired Central Heating",
    "Communal Boilers", "Panel Heaters"
]
# Passing the list of sheet names parses the workbook once and returns a dict of DataFrames keyed by sheet name,
# instead of re-opening and re-parsing the file for every sheet
heating_sheets = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme "
    "Rebuild/Inspections/BROMFORD "
    "SEVERN VALE - KLARKE.xlsx",
    sheet_name=heating_sheet_names
)
# Stack in the order of heating_sheet_names; that order decides which heating type wins when a UPRN is on several
# sheets (see the de-duplication below)
heating_systems = pd.concat(
    [
        heating_sheets[sheet_name][["UPRN"]].assign(**{"Heating Type": sheet_name})
        for sheet_name in heating_sheet_names
    ]
)
# Rows without a UPRN cannot be matched to an asset and would make the integer cast below raise
heating_systems = heating_systems.dropna(subset=["UPRN"])
# We have no clue which one is correct, we have some dupes - the first sheet listing a UPRN is kept
heating_systems = heating_systems.drop_duplicates("UPRN")
heating_systems = heating_systems.rename(columns={"UPRN": "Asset"})
heating_systems["Asset"] = heating_systems["Asset"].astype(int)

# Left merge so every inspections_3 row is kept, with Heating Type missing where no sheet lists the asset
inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset")

# Create a consolidated inspections sheet by stacking the findings columns selected from each of the three files.
# A column that is not selected from a file (e.g. 'Heating Type' for inspections_2) is left missing for its rows.
findings_1 = inspections_1[
    ["Asset", "Construction type", "Heating Type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]
]
findings_2 = inspections_2[["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]]
findings_3 = inspections_3[["Asset", "Heating Type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]]
inspections = pd.concat([findings_1, findings_2, findings_3])

# Address and property attributes from the same three files, with the column names of files 2 and 3 renamed to the
# names used by file 1 before stacking
address_data_1 = inspections_1[
    ["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", "ManAreaDesc"]
]
address_data_2 = inspections_2[
    ["Asset", "FullAddress", "AccomType", "AssetTypeDesc", "PropTypeDesc", "ConYear", "Postcode"]
].rename(columns={"Postcode": "PostCode"})
address_data_3 = inspections_3[
    ["Asset", "FullAddress", "T1_Postcode", "T1_Build Year", "T1_AssetType"]
].rename(
    columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"}
)
inspections_address_data = pd.concat([address_data_1, address_data_2, address_data_3])

# Remove some error values: rows whose Asset cell holds one of these free-text strings
error_asset_values = [
    "They're all green partial fill they're all green this",
    "South Staffordshire District Council",
    'Blk Milton Crt F9-10, Perton, Wolverhampton'
]
is_error_row = inspections["Asset"].isin(error_asset_values)
inspections = inspections[~is_error_row]

# Asset is compared and merged as text further down, so cast it to str in every frame
inspections["Asset"] = inspections["Asset"].astype(str)
asset_list["Asset"] = asset_list["Asset"].astype(str)
inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str)
# Empty or whitespace-only findings are treated as missing
inspections['WFT Findings'] = inspections['WFT Findings'].replace(r'^\s*$', pd.NA, regex=True)

# We have some cases where the inspections data has dupes on Asset (the ID column). We take the instance that is
# populated: sorting puts missing findings last, so keeping the first row per Asset prefers a populated one
inspections = (
    inspections
    .sort_values(by='WFT Findings', na_position='last')
    .drop_duplicates(subset='Asset', keep='first')
)

# We have dupes in the asset list
asset_list = asset_list.drop_duplicates("Asset")

# Find the inspected assets whose ID does not appear in the asset list
known_asset_ids = asset_list["Asset"].values
not_in_asset_list = ~inspections["Asset"].isin(known_asset_ids)
missed_asset_ids = inspections.loc[not_in_asset_list, "Asset"].values

# Pull the address details for those assets, one row per asset
is_missed_asset = inspections_address_data["Asset"].isin(missed_asset_ids)
missed_assets = inspections_address_data[is_missed_asset].drop_duplicates("Asset")

# We produce a larger asset list
asset_list = pd.concat([asset_list, missed_assets])

# Attach the inspection findings; assets without an inspection row keep a placeholder note
asset_list = asset_list.merge(inspections, how="left", on="Asset")
asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note")

# Store
# asset_list.to_excel(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
#     "data/asset_list.xlsx"
# )

# We now prepare outcomes into a single file. Each source is tagged with a measure_type before the two are stacked.
pv_outcomes = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford PV "
    "Outcomes.csv",
    encoding='cp1252'
)
pv_outcomes["measure_type"] = "solar"

# header=1: the column names are read from the second row of the "ECO4 & GBIS" sheet
other_outcomes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/(Bromford) "
    "15.04.2024.xlsx",
    sheet_name="ECO4 & GBIS",
    header=1
)
other_outcomes["measure_type"] = "cwi"

# The upper-case column names of other_outcomes are renamed to match pv_outcomes. measure_type is carried through
# the selection as well: without it the tags set above are dropped and the "solar" and "cwi" rows can no longer be
# told apart once stacked.
combined_outcomes = pd.concat(
    [
        other_outcomes[
            ["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES", "measure_type"]
        ].rename(
            columns={
                "NO": "No", "ADDRESS": "Address", "POSTCODE": "Postcode", "WEEK COMMENCING": "Week Commencing",
                "OUTCOMES": "Outcome", "NOTES": "Notes"
            }
        ),
        pv_outcomes[['No', 'Address', 'Postcode', "Week Commencing", "Outcome", "Notes", "measure_type"]]
    ]
)

# Store
# combined_outcomes.to_excel(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
#     "data/outcomes.xlsx"
# )

# Submissions sheet - ECO 3 export, read as cp1252
eco3_raw_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/"
    "ECO 3 Submissions.csv"
)
eco3_submissions = pd.read_csv(eco3_raw_path, encoding='cp1252')

# Get rid of the unnamed columns, i.e. every column whose name contains "Unnamed: "
eco3_column_names = eco3_submissions.columns
unnamed_columns = eco3_column_names[eco3_column_names.str.contains("Unnamed: ", regex=False)].tolist()
eco3_submissions = eco3_submissions.drop(columns=unnamed_columns)

# Store
# NOTE(review): this path differs from the input path above only in the case of "submissions". On a case-insensitive
# filesystem both names resolve to the same file, so this write would replace the raw export; it is also written
# with to_csv's default encoding while the read above uses cp1252 - confirm this is intended.
eco3_clean_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/"
    "ECO 3 submissions.csv"
)
eco3_submissions.to_csv(eco3_clean_path, index=False)

eco4_path = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/"
    "ECO 4 submissions.csv"
)
eco4_submissions = pd.read_csv(eco4_path)

# Columns of the ECO 4 file that are also present in the cleaned ECO 3 file, kept in ECO 4 column order
eco4_column_names = eco4_submissions.columns
same_cols = eco4_column_names[eco4_column_names.isin(eco3_submissions.columns)].tolist()