import numpy as np
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import Definitions
from pathlib import Path
from model_data.simulation_system.Settings import (
    MANDATORY_FIXED_FEATURES,
    AVERAGE_FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
    RDSAP_RESPONSE,
    HEAT_DEMAND_RESPONSE,
    COLUMNS_TO_MERGE_ON,
    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP
)
from DataProcessor import DataProcessor

DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

# Columns whose missing values are back-filled from the cleaning averages.
_DIMENSION_COLUMNS = ['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']

# Relative difference between the first two recorded values of an averaged
# feature above which only the most recent value is kept.
_MAX_RELATIVE_DIFFERENCE = 0.1


def _fill_missing_dimensions(property_data, cleaning_averages, averages_cache):
    """Fill missing floor area / floor height values from group averages.

    Args:
        property_data: Rows for a single UPRN.
        cleaning_averages: Frame returned by
            ``DataProcessor.make_cleaning_averages()``; it is grouped by the
            usable merge columns and averaged over ``_DIMENSION_COLUMNS``.
        averages_cache: Dict shared across the properties of one directory,
            mapping a tuple of merge columns to the grouped means, so the
            group-by is computed once per column combination rather than once
            per property.

    Returns:
        A frame with ``_DIMENSION_COLUMNS`` filled where an average was
        available. When no merge column is usable, ``property_data`` is
        returned unchanged.
    """
    # Only merge on columns that are not entirely missing for this property.
    all_missing = property_data[COLUMNS_TO_MERGE_ON].isna().all()
    merge_columns = all_missing.index[~all_missing].to_list()
    if not merge_columns:
        # Nothing to look the averages up by; grouping on an empty key list
        # raises in pandas, so pass the rows through unfilled instead.
        return property_data

    cache_key = tuple(merge_columns)
    averages = averages_cache.get(cache_key)
    if averages is None:
        averages = cleaning_averages.groupby(merge_columns)[_DIMENSION_COLUMNS].mean()
        averages_cache[cache_key] = averages

    # NOTE(review): this is pandas' default inner merge, so rows whose key
    # combination has no entry in the averages are dropped - confirm that
    # this is intended rather than a left merge.
    merged = pd.merge(
        property_data,
        averages,
        on=merge_columns,
        suffixes=['', '_AVERAGE'],
    )
    for column in _DIMENSION_COLUMNS:
        merged[column] = merged[column].fillna(merged[f'{column}_AVERAGE'])
    return merged.drop(columns=[f'{column}_AVERAGE' for column in _DIMENSION_COLUMNS])


def _resolve_average_features(property_data):
    """Collapse each field in AVERAGE_FIXED_FEATURES to one value per property.

    The distinct non-null values of a field are averaged, unless the first
    two differ by more than ``_MAX_RELATIVE_DIFFERENCE`` relative to the
    first, in which case only the last value is used.

    Returns:
        Dict mapping field name to the resolved value; ``np.nan`` when the
        property has no non-null value for the field.
    """
    resolved = {}
    for field in AVERAGE_FIXED_FEATURES:
        vals = list(property_data[field].dropna().unique())
        if not vals:
            # Previously this branch referenced an undefined name and crashed
            # the whole run with a NameError. Record the value as missing so
            # downstream code can impute or drop it.
            resolved[field] = np.nan
            continue
        # TODO: we could have multiple values here, why only use the first two?
        if len(vals) > 1 and abs(vals[0] - vals[1]) / vals[0] > _MAX_RELATIVE_DIFFERENCE:
            # Take the more recent value since it's likely to be more accurate
            # (presumably rows are in lodgement order - verify against
            # DataProcessor.pre_process).
            vals = [vals[-1]]
        resolved[field] = np.mean(vals)
    return resolved


def _build_transitions(uprn, property_data, fixed_data):
    """Build one model row per pair of adjacent records of a property.

    Each row holds the change in the RdSAP and heat demand responses between
    a record and the next one, the property's fixed features, and the
    component features (plus lodgement date) of both records, suffixed with
    ``_STARTING`` and ``_ENDING``.

    Note: only subsequent records are compared; other permutations (e.g.
    first vs third) are not considered.
    """
    component_columns = COMPONENT_FEATURES + ["LODGEMENT_DATE"]
    # We include the lodgement date here as we probably need to factor time
    # into the model, since EPC standards and rigour have changed over time.
    variable_data = property_data[
        COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
    ]

    transitions = []
    for idx in range(variable_data.shape[0] - 1):
        starting_record = variable_data.iloc[idx]
        ending_record = variable_data.iloc[idx + 1]

        rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
        heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]

        # TODO: We need to pre-process the data. For instance, rather than
        # using static for roofs, walls and floors, we may want to use the
        # U-value. We may also want to handle the (assumed) tags within
        # descriptions.
        features = pd.concat([
            starting_record[component_columns].add_suffix("_STARTING"),
            ending_record[component_columns].add_suffix("_ENDING"),
        ])

        transitions.append(
            {
                "UPRN": uprn,
                "RDSAP_CHANGE": rdsap_change,
                "HEAT_DEMAND_CHANGE": heat_demand_change,
                **fixed_data,
                **features.to_dict(),
            }
        )
    return transitions


def app():
    """Build the EPC change dataset and write it to ``./dataset.parquet``.

    For every sub-directory of ``DATA_DIRECTORY`` the ``certificates.csv``
    file is pre-processed with ``DataProcessor``, grouped by UPRN, and each
    property with consistent mandatory features contributes one row per pair
    of adjacent certificates.

    Data glossary:
    https://epc.opendatacommunities.org/docs/guidance#glossary
    """
    # List all subdirectories
    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    dataset = []
    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"

        data_processor = DataProcessor(filepath=filepath)
        df = data_processor.pre_process()
        cleaning_averages = data_processor.make_cleaning_averages()

        # Grouped averages depend only on this directory's cleaning averages
        # and the merge columns, so share them across its properties.
        averages_cache = {}

        for uprn, property_data in df.groupby("UPRN", observed=True):
            # If a property has changed building type, we can ignore the EPC
            # rating, i.e. each mandatory fixed feature should have exactly
            # one unique value.
            if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
                continue

            # Take the latest row for both the LATEST_FIELD and
            # MANDATORY_FIXED_FEATURES columns.
            latest_field_data = property_data[LATEST_FIELD].iloc[-1].to_dict()
            mandatory_field_data = property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()

            modified_property_data = _fill_missing_dimensions(
                property_data, cleaning_averages, averages_cache
            )

            # Fixed features - these are property attributes that shouldn't
            # change over time. Combine all fields together.
            fixed_data = _resolve_average_features(modified_property_data)
            fixed_data.update(mandatory_field_data)
            fixed_data.update(latest_field_data)

            dataset.extend(_build_transitions(uprn, modified_property_data, fixed_data))

    output = pd.DataFrame(dataset)
    output.to_parquet('./dataset.parquet')


if __name__ == "__main__":
    app()