import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from model_data.BaseUtility import BaseUtility


def list_subdirectories(directory_path):
    """Return the names of the immediate subdirectories of ``directory_path``.

    Only entries for which ``os.path.isdir`` is true are returned; the names
    are relative to ``directory_path`` (not full paths).
    """
    return [
        d for d in os.listdir(directory_path)
        if os.path.isdir(os.path.join(directory_path, d))
    ]


# Root folder holding one subdirectory per dataset, each expected to contain
# a "certificates.csv" file (see app()). Resolved against the current working
# directory at import time.
DATA_DIRECTORY = os.getcwd() + '/model_data/simulation_system/data/all-domestic-certificates'

# Property attributes that should not change between certificates of the
# same property.
FIXED_FEATURES = [
    'PROPERTY_TYPE',
    'BUILT_FORM',
    'CONSTRUCTION_AGE_BAND',
    'NUMBER_HABITABLE_ROOMS',
    'CONSTITUENCY',
    'NUMBER_HEATED_ROOMS',
    'FIXED_LIGHTING_OUTLETS_COUNT',
    'GLAZED_AREA',
    'FLOOR_HEIGHT',
    'FLOOR_LEVEL',
    'TOTAL_FLOOR_AREA',
]

# Attributes compared between consecutive certificates of a property.
# NOTE(review): 'LIGHTING_DESCRIPTION' appears twice in this list. The list is
# left untouched in case other modules import it; app() de-duplicates it
# before selecting columns.
COMPONENT_FEATURES = [
    'TRANSACTION_TYPE',
    'WALLS_DESCRIPTION',
    'FLOOR_DESCRIPTION',
    'LIGHTING_DESCRIPTION',
    'ROOF_DESCRIPTION',
    'MAINHEAT_DESCRIPTION',
    'HOTWATER_DESCRIPTION',
    'MAIN_FUEL',
    'MECHANICAL_VENTILATION',
    'SECONDHEAT_DESCRIPTION',
    'ENERGY_TARIFF',  # Not sure if this is relevant
    'SOLAR_WATER_HEATING_FLAG',
    'PHOTO_SUPPLY',
    'WINDOWS_DESCRIPTION',
    'GLAZED_TYPE',
    'MULTI_GLAZE_PROPORTION',
    'LIGHTING_DESCRIPTION',
    'LOW_ENERGY_LIGHTING',
    'NUMBER_OPEN_FIREPLACES',
    'MAINHEATCONT_DESCRIPTION',
    'EXTENSION_COUNT'
]

# Fixed features whose differing values are averaged (when close enough)
# instead of being rejected.
AVERAGE_FIXED_FEATURES = [
    "TOTAL_FLOOR_AREA"
]


def _resolve_fixed_feature(field, vals):
    """Collapse the distinct values recorded for one fixed feature into one.

    Args:
        field: Name of the fixed feature (used for the averaging rule and in
            error messages).
        vals: Distinct, already-cleaned values for the feature, in record
            order.

    Returns:
        ``None`` when ``vals`` is empty, the single value when there is
        exactly one, or ``np.mean(vals)`` for a field in
        ``AVERAGE_FIXED_FEATURES`` whose values are all within 10% of the
        first one.

    Raises:
        ValueError: If a field outside ``AVERAGE_FIXED_FEATURES`` has more
            than one value, or an averaged field deviates by more than 10%
            from its first value.
    """
    if not vals:
        return None
    if len(vals) == 1:
        return vals[0]

    if field not in AVERAGE_FIXED_FEATURES:
        raise ValueError("Fixed feature {} has more than one value - fix me".format(field))

    # Check the values are not too far apart. Comparing against the scaled
    # baseline (rather than dividing by it) avoids a division by zero.
    baseline = vals[0]
    if any(abs(baseline - other) > 0.1 * abs(baseline) for other in vals[1:]):
        raise ValueError("Large deviation in fixed feature {} - fix me".format(field))
    return np.mean(vals)


def app():
    """Walk every certificates.csv under DATA_DIRECTORY and pair up EPC records.

    For each subdirectory of ``DATA_DIRECTORY`` the "certificates.csv" file is
    loaded, rows without a UPRN are dropped, and only UPRNs with more than one
    row are kept. Rows are then ordered by UPRN and LODGEMENT_DATE and, per
    UPRN, the fixed features are reduced to a single value each and the
    consecutive pairs of component-feature records are iterated.

    Raises:
        ValueError: Propagated from the fixed-feature resolution when a
            property has inconsistent fixed features.
    """
    # Get all the files in the directory
    directories = list_subdirectories(DATA_DIRECTORY)

    # Selecting a label twice would yield a duplicated column, so drop the
    # repeated entry while keeping the original order.
    component_columns = list(dict.fromkeys(COMPONENT_FEATURES))

    for directory in tqdm(directories):
        filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")

        df = pd.read_csv(filepath, low_memory=False)

        # Explicit copy so the column assignment below acts on an independent
        # frame rather than on a slice of the one just read.
        df = df[~pd.isnull(df["UPRN"])].copy()
        df["UPRN"] = df["UPRN"].astype(int).astype(str)

        counts = df.groupby("UPRN").size().reset_index()
        counts.columns = ["UPRN", "count"]
        counts = counts.sort_values("count", ascending=False)

        # take UPRNS with multiple EPCs
        counts = counts[counts["count"] > 1]

        df = df[df["UPRN"].isin(counts["UPRN"])]
        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

        for uprn, property_data in df.groupby("UPRN"):

            # Fixed features - these are property attributes that shouldn't
            # change over time
            fixed_data = {}
            for field in FIXED_FEATURES:
                vals = property_data[field].dropna().unique()
                # Remove invalid values
                vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
                fixed_data[field] = _resolve_fixed_feature(field, vals)

            variable_data = property_data[component_columns]

            # Visit each consecutive (earlier, later) pair of records.
            for idx in range(property_data.shape[0] - 1):
                starting_record = variable_data.iloc[idx]
                ending_record = variable_data.iloc[idx + 1]