"""Count off-gas, EPC D-G homes per postal region from bulk EPC certificate files.

For every sub-directory of ``EPC_DIRECTORY`` the script reads
``certificates.csv``, keeps rows whose post town contains "London", keeps the
newest certificate per UPRN, and narrows the result to properties that:

* have a current energy rating of D, E, F or G,
* do not list "mains gas" as their main fuel,
* are not flagged as conservation-area, listed or heritage buildings by
  ``OpenUprnClient.get_spatial_data``, and
* are privately rented or owner-occupied.

The per-directory counts are summed per postal region (the part of the
postcode before the first space) and written to an Excel workbook.
"""
import inspect
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from etl.epc.settings import EARLIEST_EPC_DATE
from etl.spatial.OpenUprnClient import OpenUprnClient

src_file_path = inspect.getfile(lambda: None)

EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates")
OUTPUT_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions - without conservation "
    "area.xlsx"
)

# Ratings that count as "EPC D and below".
LOW_RATINGS = ["D", "E", "F", "G"]
# Tenure spellings accepted as privately rented or owner-occupied.
TARGET_TENURES = ["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"]
# Flags returned by the spatial lookup that mark a property as restricted.
RESTRICTION_COLUMNS = ["conservation_status", "is_listed_building", "is_heritage_building"]

epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

aggregation = []
for directory in tqdm(epc_directories):
    data = pd.read_csv(directory / "certificates.csv", low_memory=False)

    # Rename the columns to the same format as the api returns
    data.columns = [c.replace("_", "-").lower() for c in data.columns]

    data = data[data["posttown"].str.contains("London", case=False, na=False)]
    if data.empty:
        continue

    # Keep only certificates lodged on or after the earliest allowed date
    data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]

    # Rows without a UPRN cannot be de-duplicated or joined to spatial data.
    # Copy so the column assignment below does not write into a slice of the
    # previous frame (avoids SettingWithCopyWarning).
    data = data[data["uprn"].notna()].copy()
    data["uprn"] = data["uprn"].astype(int)

    # Take just the newest EPC per uprn, based on lodgement-date
    data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")

    # Take EPC D and below
    data = data[data["current-energy-rating"].isin(LOW_RATINGS)]
    data = data.assign(postal_region=data["postcode"].str.split(" ").str[0])

    # Take homes that don't have a gas boiler
    off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)]
    if off_gas.empty:
        continue

    # Remove properties with conservation area issues
    uprns = off_gas["uprn"].unique()

    # Get data
    ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev")
    off_gas = off_gas.merge(
        ca_data[["UPRN", *RESTRICTION_COLUMNS]].rename(columns={"UPRN": "uprn"}),
        how="left",
        on="uprn",
    )

    # Remove any restricted units. The comparison is deliberately `!= True`
    # rather than `== False`: UPRNs with no spatial match come back from the
    # left merge as missing values, and `!= True` keeps those rows.
    off_gas = off_gas[
        (off_gas["conservation_status"] != True)  # noqa: E712
        & (off_gas["is_listed_building"] != True)  # noqa: E712
        & (off_gas["is_heritage_building"] != True)  # noqa: E712
    ]

    off_gas = off_gas[off_gas["tenure"].isin(TARGET_TENURES)]

    region_summary = off_gas.groupby("postal_region").size().reset_index(name="count")
    aggregation.append(region_summary)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list, so fail with an explicit message when nothing qualified.
if not aggregation:
    raise SystemExit(f"No qualifying properties found under {EPC_DIRECTORY}; nothing to write.")

postal_region_aggregation = pd.concat(aggregation)

# Re-aggregate: the same postal region can appear in more than one directory
postal_region_aggregation = postal_region_aggregation.groupby("postal_region")["count"].sum().reset_index()
postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False)
postal_region_aggregation = postal_region_aggregation.rename(
    columns={"postal_region": "Postcode Region", "count": "Number of Homes"}
)

postal_region_aggregation.to_excel(OUTPUT_PATH, index=False)