import inspect import pandas as pd from tqdm import tqdm from pathlib import Path src_file_path = inspect.getfile(lambda: None) EPC_DIRECTORY = Path(src_file_path).parent / "local_data" / "all-domestic-certificates" def app(): # For EPCs lodged from 2020 onwards, this collects data on the energy efficiency categories for wall insulation # so that when we simulate, we know what the resulting energy efficiency category will be epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] date_cutoff = "2020-01-01" walls_data = [] ashp_data = [] for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)): data = pd.read_csv(directory / "certificates.csv", low_memory=False) # Rename the columns to the same format as the api returns data.columns = [c.replace("_", "-").lower() for c in data.columns] insulated_walls = data[ data["walls-description"].isin( [ "Cavity wall, filled cavity", "Solid brick, with internal insulation", "Solid brick, with external insulation", ] ) ] insulated_walls = insulated_walls[~pd.isnull(insulated_walls["uprn"])] insulated_walls = insulated_walls[ pd.to_datetime(insulated_walls["lodgement-date"]) >= date_cutoff ] ashp = data[ data["mainheat-description"] == "Air source heat pump, radiators, electric" ] ashp = ashp[~pd.isnull(ashp["uprn"])] ashp = ashp[ pd.to_datetime(ashp["lodgement-date"]) >= date_cutoff ] walls_data.append(insulated_walls) ashp_data.append(ashp) walls_df = pd.concat(walls_data) ashp_df = pd.concat(ashp_data) ashp_agg = ( ashp_df. groupby( ["construction-age-band", "mainheat-description", "mainheatcont-description", "mainheat-energy-eff", "mainheatc-energy-eff"] ) .size() .reset_index() ) ashp_agg = ashp_agg[ ashp_agg["mainheatcont-description"].isin( ["Programmer, TRVs and bypass", "Time and temperature zone control"] ) ] aggregations = {} for description in [ "Cavity wall, filled cavity", "Solid brick, with internal insulation", "Solid brick, with external insulation" ]: aggregation = walls_df[ walls_df["walls-description"] == description ].groupby( ["construction-age-band", "walls-energy-eff"] ).size().reset_index().rename(columns={0: "count"}) # For each grouping of age band, we use the most populus energy efficiency category aggregation_deduped = aggregation.sort_values("count", ascending=False).drop_duplicates("construction-age-band") aggregations[description] = aggregation_deduped # Since these tables are small, we just convert them to python dictionaries # This data is just held in the wall_energy_efficiency_values script, rather than s3 df1 = aggregations["Cavity wall, filled cavity"] df2 = aggregations["Solid brick, with internal insulation"] df3 = aggregations["Solid brick, with external insulation"] df1.to_dict("records") df2.to_dict("records") df3.to_dict("records")