added latest files

This commit is contained in:
Michael Duong 2023-08-14 17:22:21 +00:00
parent 54964bcf17
commit 31832c7c26
5 changed files with 154 additions and 16 deletions

View file

@ -4,26 +4,46 @@ from Settings import (
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS
FULLY_GLAZED_DESCRIPTIONS,
FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES
)
from model_data.BaseUtility import BaseUtility
from tqdm import tqdm
import pandas as pd
import numpy as np
from autogluon.tabular import TabularDataset, TabularPredictor
# Seed passed as random_state when subsampling, so runs are repeatable
RANDOM_SEED = 0
# Folder next to this module whose sub-directories are iterated for certificates.csv files
DATA_DIRECTORY = Path(__file__).parent.joinpath('data', 'all-domestic-certificates')
def main():
# Columns that create_raw_data converts with pd.to_numeric(downcast='float')
FLOAT_COLUMNS = [
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "PHOTO_SUPPLY",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "FLOOR_HEIGHT",
    "NUMBER_HABITABLE_ROOMS",
    "LOW_ENERGY_LIGHTING",
    "MULTI_GLAZE_PROPORTION",
    "NUMBER_HEATED_ROOMS",
]
def create_raw_data():
"""
Extract all information to do a simple predictor for RDSAP
"""
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
directories = directories[0:10]
# directories = directories[0:10]
dfs = []
for directory in tqdm(directories):
filepath = directory / "certificates.csv"
df = pd.read_csv(filepath)
df = pd.read_csv(filepath, low_memory=False)
# Remove any bad uprns and ignore old/bad data
df = df[~pd.isnull(df["UPRN"])]
@ -53,11 +73,45 @@ def main():
df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
# Keep only possible modelling columns
df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))]
# Reduce memory usage
# df.memory_usage()
# df.dtypes
df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')
dfs.append(df)
data = pd.concat(dfs)
data.to_parquet('./energy_predictor_data.parquet')
cleaned_data = data.dropna()
# Gives you primarily flats
cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')
def main():
    """
    Fit an AutoGluon regressor for RDSAP_RESPONSE on a 1% subsample of the
    train/validation data, then evaluate it against the held-out test data
    """
    # Load the training set and keep a reproducible 1/100th of its rows
    train_validation = TabularDataset(
        data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet'
    )
    sample_rows = round(len(train_validation)/100)
    train_validation = train_validation.sample(sample_rows, random_state=RANDOM_SEED)

    # Configure the predictor first, then fit; fit() hands back the predictor
    unfitted_predictor = TabularPredictor(
        label=RDSAP_RESPONSE,
        path="agModels-predictENERGY",
        problem_type="regression",
        eval_metric='mean_absolute_error'
    )
    predictor_RDSAP = unfitted_predictor.fit(
        train_validation,
        time_limit=800,
        presets='high_quality',
        excluded_model_types=['KNN', 'CAT']
    )

    # Score on the held-out data; the results are not returned or saved
    holdout = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')
    performance = predictor_RDSAP.evaluate(holdout)
    predictions = predictor_RDSAP.predict(holdout)
    predictor_RDSAP.feature_importance(holdout)
# Only run the pipeline when this file is executed as a script
if __name__ == "__main__":
    main()

View file

@ -28,7 +28,7 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
"""
logger.info('---Loading Data---')
data = pd.read_parquet(filepath)
data = pd.read_parquet(filepath).reset_index(drop=True)
if percentage and volume is None:
test_amount = round(len(data)*percentage)
@ -48,7 +48,7 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index
train_validation_data = data.drop(sample_index)
test_data = data.loc[sample_index]
test_data = data.iloc[sample_index]
elif sampling =='stratified':
# Not yet implemented

View file

@ -1,9 +1,20 @@
import os
import pandas as pd
import argparse
from typing import List
from Logger import logger
from autogluon.tabular import TabularDataset, TabularPredictor
# Columns FeatureProcessor.process removes from every dataset
DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"]
# Columns to keep for modelling; None keeps every remaining column
FEATURE_COLUMNS = None
# Seed passed as random_state when subsampling the training data
RANDOM_SEED = 0

# FOR TESTING
train_filepath = "./model_build_data/train_validation_data.parquet"
test_filepath = "./model_build_data/test_data.parquet"
def ingest_arguments() -> argparse.Namespace:
"""
Helper function to take in arguments from script start
@ -11,32 +22,105 @@ def ingest_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Inputs for training script')
parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load')
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
args = parser.parse_args()
return args
def training(filepath: str) -> None:
class DataLoader():
    """
    Load tabular datasets from disk, choosing the reader from the file extension
    """

    @staticmethod
    def load(filepath: str) -> pd.DataFrame:
        """
        Load different datasets

        :param filepath: Location of a '.parquet' or '.csv' file
        :return: Contents of the file as a DataFrame
        :raises SystemExit: If the file extension is not supported
        """
        if filepath.endswith('.parquet'):
            return pd.read_parquet(filepath)
        # The suffix used to be '.csv.' (trailing dot), so CSV files were
        # never recognised and always fell through to the error branch
        if filepath.endswith('.csv'):
            return pd.read_csv(filepath)
        logger.error('Not implemented!')
        # Same exception exit(1) raised, without relying on the site-provided helper
        raise SystemExit(1)
class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_columns(df: pd.DataFrame, drop_columns: List[str] = None) -> pd.DataFrame:
        """
        Remove the given columns from the data

        :param df: Data to remove columns from
        :param drop_columns: A column name or list of column names; when omitted,
            the module-level DROP_COLUMNS is used
        :return: A new DataFrame without the listed columns
        """
        if drop_columns is None:
            # Resolved at call time so the shared module-level list is never a default argument
            drop_columns = DROP_COLUMNS
        elif isinstance(drop_columns, str):
            # Keep supporting a single column name, as the old `str` annotation implied
            drop_columns = [drop_columns]
        # Pass the names directly; wrapping the list in another list made pandas
        # look for one column whose label was the whole list
        return df.drop(columns=list(drop_columns))

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Determine which columns to keep for modelling

        :param df: Data to select columns from
        :param features: Columns to keep; when None every column is kept
        :return: A DataFrame restricted to the requested columns
        :raises SystemExit: If any requested feature is missing from the data
        """
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            raise SystemExit(1)
        return df[features]

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop DROP_COLUMNS and then keep only FEATURE_COLUMNS

        :param df: Raw data loaded for training or testing
        :return: Data ready to be passed to the model
        """
        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
        df = self.retain_features(df, features=FEATURE_COLUMNS)
        return df
def training(train_filepath: str, test_filepath: str) -> None:
    """
    Pipeline to run training on the dataset

    :param train_filepath: Location of the dataset used to fit the model
    :param test_filepath: Location of the dataset used to evaluate the fitted model
    """
    logger.info('Loading data')
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)

    logger.info('Feature selection')

    logger.info('Feature processing')
    feature_processor = FeatureProcessor()
    train_df = feature_processor.process(train_df)
    test_df = feature_processor.process(test_df)

    logger.info('Build Model')
    data = TabularDataset(data=train_df)
    # Fit on a reproducible quarter of the training rows
    subsample_size = round(len(data)/4)
    data = data.sample(subsample_size, random_state=RANDOM_SEED)

    target_column = 'RDSAP_CHANGE'
    predictor_RDSAP = TabularPredictor(
        label=target_column,
        path="agModels-predictRDSAP",
        problem_type="regression",
        eval_metric='mean_absolute_error'
    ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])

    logger.info('Evaluate matrics')
    # Evaluate on the dataset supplied by the caller, after the same feature
    # processing as the training data, instead of re-reading a hard-coded path
    test_data = TabularDataset(data=test_df)
    performance = predictor_RDSAP.evaluate(test_data)
    logger.info(f'Test performance: {performance}')

    # Per-row absolute error; kept on the local frame only (not returned or saved)
    predictions = predictor_RDSAP.predict(test_data)
    test_data['predictions'] = predictions
    test_data['diff'] = abs(test_data[target_column] - test_data['predictions'])
if __name__ == "__main__":
logger.info('---Begin Pipeline---')
@ -44,4 +128,4 @@ if __name__ == "__main__":
logger.info('---Ingest Arguments---')
args = ingest_arguments()
training(filepath=args.filepath)
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)