Model/etl/customers/bromford/data_cleanup.py
2025-04-13 21:39:35 +01:00

192 lines
7 KiB
Python

"""
12th April 2025
This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a
standardised asset list
"""
import pandas as pd
# Step 1
# The inspections data is spread across three different files. We attempt to produce one finalised asset list, with
# comprehensive inspections.

# Primary asset list
asset_list = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford Asset "
    "List.xlsx",
    sheet_name="Asset List",
)

# Inspections file 1 (MDS workbook)
inspections_1 = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
    "MDS.xlsx",
    sheet_name="Data list",
)
# Fold the fuel into the heating type ("<type> <fuel>"). String concatenation propagates NaN, so a blank fuel would
# otherwise erase a heating type that we do know; where the fuel is missing we keep the heating type on its own.
inspections_1["Heating Type"] = (
    inspections_1["Heating Type"]
    .where(
        inspections_1["Heating fuel"].isna(),
        inspections_1["Heating Type"] + " " + inspections_1["Heating fuel"],
    )
    .str.strip()
)

# Inspections file 2 (Merlin Lane workbook)
inspections_2 = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
    "MERLIN LANE.xlsx",
    sheet_name="Report",
)
# "PropertyType" is split on spaces: the last word becomes the asset type, everything before it the property type
property_type_words = inspections_2["PropertyType"].str.split(" ")
inspections_2["AssetTypeDesc"] = property_type_words.str[-1]
inspections_2["PropTypeDesc"] = property_type_words.str[:-1].str.join(" ")

# Inspections file 3 (Severn Vale workbook)
inspections_3 = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
    "SEVERN VALE - KLARKE.xlsx",
    sheet_name="Asset report",
)
# Build "<address 1>, <address 2>". As above, a missing second address line must not blank the whole address, so we
# fall back to the first line alone in that case.
inspections_3["FullAddress"] = inspections_3["T1_Address1"].where(
    inspections_3["T1_Address2"].isna(),
    inspections_3["T1_Address1"] + ", " + inspections_3["T1_Address2"],
)
# On inspections 3, we have multiple sheets which describe the heating. Each sheet is named after a heating system
# and has a "UPRN" column; we tag every UPRN with the name of the sheet it was found on.
heating_sheet_names = [
    "Storage Heaters", "No Heating", "Underfloor Heating", "Rointe Electric Heating", "Air Source Heating",
    "Gas Central Heating", "Electric Boiler", "Oil Fired Central Heating",
    "Communal Boilers", "Panel Heaters",
]
# Passing a list of sheet names opens the workbook once and returns a {sheet name: DataFrame} mapping, instead of
# re-opening and re-parsing the same file for every sheet.
heating_sheets = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme "
    "Rebuild/Inspections/BROMFORD "
    "SEVERN VALE - KLARKE.xlsx",
    sheet_name=heating_sheet_names,
)
heating_systems = []
for sheet_name in heating_sheet_names:
    # Explicit copy so that adding a column does not write to a slice of the sheet (SettingWithCopyWarning)
    df = heating_sheets[sheet_name][["UPRN"]].copy()
    df["Heating Type"] = sheet_name
    heating_systems.append(df)
heating_systems = pd.concat(heating_systems)

# We have no clue which one is correct, we have some dupes. drop_duplicates keeps the first occurrence, so the
# sheet listed earliest in heating_sheet_names wins.
heating_systems = (
    heating_systems
    .drop_duplicates("UPRN")
    .rename(columns={"UPRN": "Asset"})
    .astype({"Asset": int})
)
# Left merge: every inspections_3 row is kept, with "Heating Type" left empty where no sheet listed the asset
inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset")
# Create a consolidated inspections sheet by stacking the findings columns of the three inspections frames.
# A column that a given frame does not supply is left empty for that frame's rows.
inspections = pd.concat(
    [
        inspections_1.loc[
            :, ["Asset", "Construction type", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]
        ],
        inspections_2.loc[:, ["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
        inspections_3.loc[:, ["Asset", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
    ]
)

# Address / property attributes for the same assets, with each file's column names mapped onto the names used by
# inspections_1 before stacking.
address_part_1 = inspections_1.loc[
    :, ["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", 'ManAreaDesc']
]
address_part_2 = inspections_2.loc[
    :, ['Asset', 'FullAddress', 'AccomType', "AssetTypeDesc", "PropTypeDesc", 'ConYear', 'Postcode']
].rename(columns={"Postcode": "PostCode"})
address_part_3 = inspections_3.loc[
    :, ['Asset', "FullAddress", 'T1_Postcode', 'T1_Build Year', 'T1_AssetType']
].rename(
    columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"}
)
inspections_address_data = pd.concat([address_part_1, address_part_2, address_part_3])
# Remove some error values: rows whose "Asset" cell holds one of these free-text strings instead of an ID
error_asset_values = [
    "They're all green partial fill they're all green this",
    "South Staffordshire District Council",
    'Blk Milton Crt F9-10, Perton, Wolverhampton'
]
is_error_row = inspections["Asset"].isin(error_asset_values)
inspections = inspections[~is_error_row]

# Compare IDs as strings in every frame so that they match regardless of how each file typed the column
inspections["Asset"] = inspections["Asset"].astype(str)
asset_list["Asset"] = asset_list["Asset"].astype(str)
inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str)

# Treat blank / whitespace-only findings as missing
inspections['WFT Findings'] = inspections['WFT Findings'].replace(to_replace=r'^\s*$', value=pd.NA, regex=True)

# We have some cases where the inspections data has dupes on Asset (the ID column). We take the instance that is
# populated: missing findings are sorted to the end, then the first row per Asset is kept.
inspections = (
    inspections
    .sort_values(by='WFT Findings', na_position='last')
    .drop_duplicates(subset='Asset', keep='first')
)

# We have dupes in the asset list
asset_list = asset_list.drop_duplicates(subset="Asset")
# Merge on
# First find the assets that were inspected but do not appear in the primary asset list
in_asset_list = inspections["Asset"].isin(asset_list["Asset"].values)
missed_asset_ids = inspections.loc[~in_asset_list, "Asset"].values

# Pull the address details for those assets from the inspections files, one row per asset
is_missed = inspections_address_data["Asset"].isin(missed_asset_ids)
missed_assets = inspections_address_data[is_missed].drop_duplicates("Asset")

# We produce a larger asset list, then attach the inspection findings to every asset
asset_list = pd.concat([asset_list, missed_assets]).merge(
    inspections, how="left", on="Asset"
)
# Assets without any findings get an explicit placeholder rather than an empty cell
asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note")
# Store
# asset_list.to_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
# "data/asset_list.xlsx"
# )
# We now prepare outcomes into a single file
pv_outcomes = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford PV "
    "Outcomes.csv",
    encoding='cp1252',
)
pv_outcomes["measure_type"] = "solar"

# header=1: the column names are taken from the second row of the sheet
other_outcomes = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/(Bromford) "
    "15.04.2024.xlsx",
    sheet_name="ECO4 & GBIS",
    header=1,
)
other_outcomes["measure_type"] = "cwi"

# Stack both sources under the PV file's column names. "measure_type" is carried through the selection: it was
# previously assigned above but then dropped here, which left no way to tell solar rows from cwi rows in the
# combined frame.
combined_outcomes = pd.concat(
    [
        other_outcomes[
            ["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES", "measure_type"]
        ].rename(
            columns={
                "NO": "No", "ADDRESS": "Address", "POSTCODE": "Postcode", "WEEK COMMENCING": "Week Commencing",
                "OUTCOMES": "Outcome", "NOTES": "Notes"
            }
        ),
        pv_outcomes[['No', 'Address', 'Postcode', "Week Commencing", "Outcome", "Notes", "measure_type"]],
    ]
)
# Store
# combined_outcomes.to_excel(
# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
# "data/outcomes.xlsx"
# )
# Submissions sheet -
# Load the ECO 3 submissions export (cp1252-encoded)
eco3_submissions = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 Submissions.csv",
    encoding='cp1252',
)

# Get rid of the unnamed columns (the ones whose header contains "Unnamed: ")
unnamed_columns = [column for column in eco3_submissions.columns if "Unnamed: " in column]
eco3_submissions = eco3_submissions.drop(unnamed_columns, axis="columns")

# Store
# NOTE(review): this output path differs from the input path above only in the case of "submissions"; on a
# case-insensitive filesystem that would overwrite the source file - confirm this is intended.
eco3_submissions.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 submissions.csv",
    index=False,
)

eco4_submissions = pd.read_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 4 submissions.csv",
)
# Columns present in both submissions files, in ECO 4 column order
same_cols = eco4_submissions.columns[eco4_submissions.columns.isin(eco3_submissions.columns)].tolist()