"""
12th April 2025
This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a
standardised asset list
"""
import pandas as pd

# Every input file lives under the April 2025 programme rebuild folder
PROGRAMME_DIR = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
    "Apr 2025 Programme Rebuild"
)
INSPECTIONS_DIR = f"{PROGRAMME_DIR}/Inspections"
SEVERN_VALE_WORKBOOK = f"{INSPECTIONS_DIR}/BROMFORD SEVERN VALE - KLARKE.xlsx"

# Sheets of the Severn Vale workbook that each list the UPRNs for one heating type. The order matters: when a UPRN
# appears on several sheets, the first sheet in this list wins.
HEATING_SHEETS = [
    "Storage Heaters",
    "No Heating",
    "Underfloor Heating",
    "Rointe Electric Heating",
    "Air Source Heating",
    "Gas Central Heating",
    "Electric Boiler",
    "Oil Fired Central Heating",
    "Communal Boilers",
    "Panel Heaters",
]

# Values found in the "Asset" column of the inspections data that are not asset IDs
ERROR_ASSET_VALUES = [
    "They're all green partial fill they're all green this",
    "South Staffordshire District Council",
    'Blk Milton Crt F9-10, Perton, Wolverhampton',
]


def _join_text(left: pd.Series, right: pd.Series, sep: str) -> pd.Series:
    """
    Join two text columns row by row with `sep`.

    A plain `left + sep + right` yields NaN as soon as either side is missing, which throws away the side that is
    populated. Here a row with only one side populated keeps that side on its own; a row with neither stays missing.
    """
    return (left + sep + right).fillna(left).fillna(right)


# Step 1
# The inspections data is spread across three different files. We attempt to produce one finalised asset list, with
# comprehensive inspections

# Primary asset list
asset_list = pd.read_excel(f"{PROGRAMME_DIR}/Bromford Asset List.xlsx", sheet_name="Asset List")

# First inspections file: fold the heating fuel into the heating type
inspections_1 = pd.read_excel(f"{INSPECTIONS_DIR}/BROMFORD MDS.xlsx", sheet_name="Data list")
inspections_1["Heating Type"] = _join_text(
    inspections_1["Heating Type"], inspections_1["Heating fuel"], " "
).str.strip()

# Second inspections file: the last word of PropertyType becomes the asset type, the words before it the property type
inspections_2 = pd.read_excel(f"{INSPECTIONS_DIR}/BROMFORD MERLIN LANE.xlsx", sheet_name="Report")
property_type_words = inspections_2["PropertyType"].str.split(" ")
inspections_2["AssetTypeDesc"] = property_type_words.str[-1]
inspections_2["PropTypeDesc"] = property_type_words.str[:-1].str.join(" ")

# Third inspections file: the address is split over two columns
inspections_3 = pd.read_excel(SEVERN_VALE_WORKBOOK, sheet_name="Asset report")
inspections_3["FullAddress"] = _join_text(inspections_3["T1_Address1"], inspections_3["T1_Address2"], ", ")

# On inspections 3, we have multiple sheets which describe the heating. Passing the list of sheet names opens the
# workbook once and returns a dict of frames keyed by sheet name.
heating_sheets = pd.read_excel(SEVERN_VALE_WORKBOOK, sheet_name=HEATING_SHEETS)
heating_systems = pd.concat(
    [
        # assign() returns a new frame, so we never write into a slice of the sheet
        heating_sheets[sheet_name][["UPRN"]].assign(**{"Heating Type": sheet_name})
        for sheet_name in HEATING_SHEETS
    ]
)

# Rows without a UPRN cannot be matched to an asset and would make the integer cast below fail
heating_systems = heating_systems.dropna(subset=["UPRN"])
# We have no clue which one is correct, we have some dupes
heating_systems = heating_systems.drop_duplicates("UPRN")
heating_systems = heating_systems.rename(columns={"UPRN": "Asset"})
heating_systems["Asset"] = heating_systems["Asset"].astype(int)

inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset")

# Create a consolidated inspections sheet
inspections = pd.concat(
    [
        inspections_1[["Asset", "Construction type", "Heating Type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
        inspections_2[["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
        inspections_3[["Asset", "Heating Type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
    ]
)

inspections_address_data = pd.concat(
    [
        inspections_1[
            ["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", "ManAreaDesc"]
        ],
        inspections_2[
            ["Asset", "FullAddress", "AccomType", "AssetTypeDesc", "PropTypeDesc", "ConYear", "Postcode"]
        ].rename(columns={"Postcode": "PostCode"}),
        inspections_3[
            ["Asset", "FullAddress", "T1_Postcode", "T1_Build Year", "T1_AssetType"]
        ].rename(
            columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"}
        ),
    ]
)

# Remove some error values. The copy makes the filtered frame independent so the column assignments below do not
# write into a slice.
inspections = inspections[~inspections["Asset"].isin(ERROR_ASSET_VALUES)].copy()

# Compare asset IDs as strings everywhere
inspections["Asset"] = inspections["Asset"].astype(str)
asset_list["Asset"] = asset_list["Asset"].astype(str)
inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str)

# Treat empty / whitespace-only findings as missing
inspections["WFT Findings"] = inspections["WFT Findings"].replace(r'^\s*$', pd.NA, regex=True)

# We have some cases where the inspections data has dupes on Asset (the ID column). We take the instance that is
# populated. The stable sort keeps rows with equal findings in their source order, so the choice between them is
# not arbitrary.
inspections = inspections.sort_values(by="WFT Findings", na_position="last", kind="mergesort")
inspections = inspections.drop_duplicates(subset="Asset", keep="first")

# We have dupes in the asset list
asset_list = asset_list.drop_duplicates("Asset")

# Find the inspected assets that are missing from the asset list, and pull their address details
missed_asset_ids = inspections.loc[~inspections["Asset"].isin(asset_list["Asset"].values), "Asset"].values
missed_assets = inspections_address_data[inspections_address_data["Asset"].isin(missed_asset_ids)]
missed_assets = missed_assets.drop_duplicates("Asset")

# We produce a larger asset list
asset_list = pd.concat([asset_list, missed_assets])

asset_list = asset_list.merge(inspections, how="left", on="Asset")
asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note")

# Store
# asset_list.to_excel(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
#     "data/asset_list.xlsx"
# )

# We now prepare outcomes into a single file
pv_outcomes = pd.read_csv(f"{PROGRAMME_DIR}/Bromford PV Outcomes.csv", encoding='cp1252')
pv_outcomes["measure_type"] = "solar"

other_outcomes = pd.read_excel(
    f"{PROGRAMME_DIR}/(Bromford) 15.04.2024.xlsx",
    sheet_name="ECO4 & GBIS",
    header=1,
)
other_outcomes["measure_type"] = "cwi"

# measure_type is carried through so each row of the combined file still says which programme it came from
combined_outcomes = pd.concat(
    [
        other_outcomes[
            ["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES", "measure_type"]
        ].rename(
            columns={
                "NO": "No",
                "ADDRESS": "Address",
                "POSTCODE": "Postcode",
                "WEEK COMMENCING": "Week Commencing",
                "OUTCOMES": "Outcome",
                "NOTES": "Notes",
            }
        ),
        pv_outcomes[["No", "Address", "Postcode", "Week Commencing", "Outcome", "Notes", "measure_type"]],
    ]
)

# Store
# combined_outcomes.to_excel(
#     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
#     "data/outcomes.xlsx"
# )

# Submissions sheets
# Folder holding the ECO 3 / ECO 4 submissions files
SUBMISSIONS_DIR = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/"
    "Apr 2025 Programme Rebuild"
)


def _read_csv_utf8_or_cp1252(path: str) -> pd.DataFrame:
    """
    Read a CSV as UTF-8, falling back to cp1252 when the bytes are not valid UTF-8.

    The cleaned ECO 3 file is written below with pandas' default encoding (UTF-8) to a name that differs from the
    input only by letter case. On a case-insensitive filesystem that is the same file, so a second run would find
    UTF-8 where cp1252 used to be; always decoding as cp1252 would then garble any non-ASCII characters.
    """
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        return pd.read_csv(path, encoding="cp1252")


eco3_submissions = _read_csv_utf8_or_cp1252(f"{SUBMISSIONS_DIR}/ECO 3 Submissions.csv")

# Get rid of the unnamed columns
unnamed_columns = [c for c in eco3_submissions.columns if "Unnamed: " in c]
eco3_submissions = eco3_submissions.drop(columns=unnamed_columns)

# Store
# NOTE(review): the output name differs from the input only in the case of "submissions" - confirm whether
# overwriting the raw export is intended
eco3_submissions.to_csv(f"{SUBMISSIONS_DIR}/ECO 3 submissions.csv", index=False)

eco4_submissions = pd.read_csv(f"{SUBMISSIONS_DIR}/ECO 4 submissions.csv")

# Columns present in both the ECO 4 and the ECO 3 submissions, in ECO 4 order
same_cols = [c for c in eco4_submissions.columns if c in eco3_submissions.columns]