"""Build the RdSAP modelling dataset from EPC certificate CSVs and fit a predictor.

``create_raw_data`` reads every ``certificates.csv`` found one level below
``DATA_DIRECTORY``, cleans it and writes two parquet files.  ``main`` trains an
AutoGluon regressor on a 1% sample of a pre-split training set and reports its
performance on the matching test set.
"""

from pathlib import Path

import numpy as np
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from tqdm import tqdm

from core.Settings import (
    RDSAP_RESPONSE,
    FLOOR_LEVEL_MAP,
    BUILT_FORM_REMAP,
    EARLIEST_EPC_DATE,
    FULLY_GLAZED_DESCRIPTIONS,
    FIXED_FEATURES,
    LATEST_FIELD,
    COMPONENT_FEATURES,
)
from model_data.BaseUtility import Definitions

RANDOM_SEED = 0

DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

# Columns that are downcast to the smallest float dtype to reduce memory usage.
FLOAT_COLUMNS = [
    'NUMBER_OPEN_FIREPLACES',
    'EXTENSION_COUNT',
    'TOTAL_FLOOR_AREA',
    'PHOTO_SUPPLY',
    'FIXED_LIGHTING_OUTLETS_COUNT',
    'FLOOR_HEIGHT',
    'NUMBER_HABITABLE_ROOMS',
    'LOW_ENERGY_LIGHTING',
    'MULTI_GLAZE_PROPORTION',
    'NUMBER_HEATED_ROOMS',
]


def _prepare_certificates(df, data_anomaly_map, modelling_columns):
    """Clean one raw certificates frame and reduce it to the modelling columns.

    Args:
        df: Frame as read from a ``certificates.csv`` file.
        data_anomaly_map: Mapping of anomaly values to ``None``, applied with
            ``DataFrame.replace``.
        modelling_columns: Ordered column names to keep (response first).

    Returns:
        A new frame containing only ``modelling_columns``, with the response
        and ``FLOAT_COLUMNS`` downcast to compact numeric dtypes.
    """
    # Remove any bad UPRNs and ignore old/bad data.
    # NOTE(review): rows whose FLOOR_LEVEL is "top floor"/"mid floor" are
    # dropped as well; SOURCE does not say why - confirm this is intended.
    keep = (
        df["UPRN"].notna()
        & (df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE)
        & (df["TRANSACTION_TYPE"] != "new dwelling")
        & ~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])
    )
    # Explicit copy: the assignments below must not write into a view of the
    # frame that was read from disk.
    df = df[keep].copy()

    # A missing multi-glaze proportion is set to 100 when the windows
    # description is one of the fully glazed descriptions.
    missing_but_fully_glazed = (
        df["MULTI_GLAZE_PROPORTION"].isna()
        & df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
    )
    df.loc[missing_but_fully_glazed, 'MULTI_GLAZE_PROPORTION'] = 100

    # Recast
    df["UPRN"] = df["UPRN"].astype(int).astype(str)
    df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float)

    # Sort data
    df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

    # Use replace to map data (if it exists as a key) to its corresponding
    # value - i.e. remove invalid values.
    df = df.replace(data_anomaly_map)
    # np.nan is the canonical lower-case spelling of the NaN constant.
    df = df.replace(np.nan, None)

    # Remap certain columns.
    df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
    # Fixed: the remapped values used to be written to a misspelt
    # 'BUILT_FROM' column, leaving 'BUILT_FORM' itself untouched.
    df['BUILT_FORM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)

    # Keep only possible modelling columns (copy so the downcasts below
    # operate on an independent frame).
    df = df[modelling_columns].copy()

    # Reduce memory usage.
    df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
    df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')
    return df


def create_raw_data():
    """Extract all information needed to do a simple predictor for RDSAP.

    Reads ``certificates.csv`` from every sub-directory of ``DATA_DIRECTORY``,
    cleans each file, concatenates the results and writes:

    * ``./energy_predictor_data.parquet`` - all cleaned rows;
    * ``./energy_predictor_cleaned_data.parquet`` - only rows with no missing
      values.

    Raises:
        ValueError: If ``DATA_DIRECTORY`` contains no sub-directories.
    """
    # Sorted so that the row order of the output is reproducible between runs.
    directories = sorted(entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir())

    # Loop-invariant values, built once rather than once per directory.
    # Map all anomaly values to None.
    data_anomaly_map = dict.fromkeys(Definitions.DATA_ANOMALY_MATCHES)
    # Ordered de-duplication keeps the column order deterministic and avoids
    # selecting a column twice.
    modelling_columns = list(dict.fromkeys(
        [RDSAP_RESPONSE, *FIXED_FEATURES, *LATEST_FIELD, *COMPONENT_FEATURES]
    ))

    frames = []
    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"
        raw = pd.read_csv(filepath, low_memory=False)
        frames.append(_prepare_certificates(raw, data_anomaly_map, modelling_columns))

    if not frames:
        raise ValueError(f"No certificate directories found under {DATA_DIRECTORY}")

    data = pd.concat(frames)
    data.to_parquet('./energy_predictor_data.parquet')

    cleaned_data = data.dropna()  # Gives you primarily flats
    cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')


def main():
    """Fit an AutoGluon regressor for the RDSAP response and report on test data.

    Trains on a 1% random sample (seeded with ``RANDOM_SEED``) of the
    train/validation parquet file, then prints the evaluation metrics, a
    summary of the predictions and the feature importance for the test set.
    """
    data = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet')
    subsample_size = round(len(data) / 100)
    data = data.sample(subsample_size, random_state=RANDOM_SEED)

    predictor = TabularPredictor(
        label=RDSAP_RESPONSE,
        path="agModels-predictENERGY",
        problem_type="regression",
        eval_metric='mean_absolute_error',
    ).fit(
        data,
        time_limit=800,
        presets='high_quality',
        excluded_model_types=['KNN', 'CAT'],
    )

    test_data = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')
    performance = predictor.evaluate(test_data)
    predictions = predictor.predict(test_data)
    feature_importance = predictor.feature_importance(test_data)

    # Previously these results were computed and then discarded.
    print(performance)
    print(predictions.describe())
    print(feature_importance)


if __name__ == "__main__":
    main()