mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
added latest files
This commit is contained in:
parent
54964bcf17
commit
31832c7c26
5 changed files with 154 additions and 16 deletions
|
|
@ -4,26 +4,46 @@ from Settings import (
|
|||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES
|
||||
)
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
|
||||
RANDOM_SEED = 0
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
|
||||
|
||||
def main():
|
||||
FLOAT_COLUMNS = [
|
||||
'NUMBER_OPEN_FIREPLACES',
|
||||
'EXTENSION_COUNT',
|
||||
'TOTAL_FLOOR_AREA',
|
||||
'PHOTO_SUPPLY',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'FLOOR_HEIGHT',
|
||||
'NUMBER_HABITABLE_ROOMS',
|
||||
'LOW_ENERGY_LIGHTING',
|
||||
'MULTI_GLAZE_PROPORTION',
|
||||
'NUMBER_HEATED_ROOMS'
|
||||
]
|
||||
|
||||
def create_raw_data():
|
||||
"""
|
||||
Extract all information to do a simple predictor for RDSAP
|
||||
"""
|
||||
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
directories = directories[0:10]
|
||||
# directories = directories[0:10]
|
||||
dfs = []
|
||||
for directory in tqdm(directories):
|
||||
filepath = directory / "certificates.csv"
|
||||
df = pd.read_csv(filepath)
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
|
||||
# Remove any bad uprns and ignore old/bad data
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
|
|
@ -53,11 +73,45 @@ def main():
|
|||
df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
|
||||
df['BUILT_FROM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
|
||||
|
||||
# Keep only possible modelling columns
|
||||
df = df[[RDSAP_RESPONSE] + list(set(FIXED_FEATURES + LATEST_FIELD + COMPONENT_FEATURES))]
|
||||
|
||||
# Reduce memory usage
|
||||
|
||||
# df.memory_usage()
|
||||
# df.dtypes
|
||||
df[RDSAP_RESPONSE] = pd.to_numeric(df[RDSAP_RESPONSE], downcast='unsigned')
|
||||
df[FLOAT_COLUMNS] = df[FLOAT_COLUMNS].apply(pd.to_numeric, downcast='float')
|
||||
|
||||
|
||||
dfs.append(df)
|
||||
|
||||
data = pd.concat(dfs)
|
||||
data.to_parquet('./energy_predictor_data.parquet')
|
||||
|
||||
cleaned_data = data.dropna()
|
||||
# Gives you primarily flats
|
||||
cleaned_data.to_parquet('./energy_predictor_cleaned_data.parquet')
|
||||
|
||||
|
||||
def main():
    """
    Fit an AutoGluon regressor for RDSAP_RESPONSE on a subsample of the training data
    and run the fitted predictor against the test data
    """
    full_training_set = TabularDataset(data='./model_build_data/energy_data/cleaned_data/train_validation_data.parquet')

    # Randomly keep roughly 1% of the rows; seeded so the sample is repeatable
    rows_to_keep = round(len(full_training_set)/100)
    training_sample = full_training_set.sample(rows_to_keep, random_state=RANDOM_SEED)

    predictor_settings = {
        'label': RDSAP_RESPONSE,
        'path': "agModels-predictENERGY",
        'problem_type': "regression",
        'eval_metric': 'mean_absolute_error',
    }
    fit_settings = {
        'time_limit': 800,
        'presets': 'high_quality',
        'excluded_model_types': ['KNN', 'CAT'],
    }
    predictor_RDSAP = TabularPredictor(**predictor_settings).fit(training_sample, **fit_settings)

    holdout = TabularDataset('./model_build_data/energy_data/cleaned_data/test_data.parquet')

    # The results of these calls are not stored or returned
    predictor_RDSAP.evaluate(holdout)
    predictor_RDSAP.predict(holdout)
    predictor_RDSAP.feature_importance(holdout)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Binary file not shown.
Binary file not shown.
|
|
@ -28,7 +28,7 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
|
|||
"""
|
||||
|
||||
logger.info('---Loading Data---')
|
||||
data = pd.read_parquet(filepath)
|
||||
data = pd.read_parquet(filepath).reset_index(drop=True)
|
||||
|
||||
if percentage and volume is None:
|
||||
test_amount = round(len(data)*percentage)
|
||||
|
|
@ -48,7 +48,7 @@ def main(filepath: str, output_folder: str, percentage: float, volume: int, samp
|
|||
sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index
|
||||
|
||||
train_validation_data = data.drop(sample_index)
|
||||
test_data = data.loc[sample_index]
|
||||
test_data = data.iloc[sample_index]
|
||||
|
||||
elif sampling =='stratified':
|
||||
# Not yet implemented
|
||||
|
|
|
|||
|
|
@ -1,9 +1,20 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
import argparse
|
||||
from typing import List
|
||||
from Logger import logger
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
|
||||
|
||||
DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
|
||||
FEATURE_COLUMNS = None
|
||||
RANDOM_SEED = 0
|
||||
|
||||
# FOR TESTING
|
||||
train_filepath = "./model_build_data/train_validation_data.parquet"
|
||||
test_filepath = "./model_build_data/test_data.parquet"
|
||||
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
|
||||
"""
|
||||
Helper function to take in arguments from script start
|
||||
|
|
@ -11,32 +22,105 @@ def ingest_arguments() -> argparse.Namespace:
|
|||
|
||||
parser = argparse.ArgumentParser(description='Inputs for training script')
|
||||
|
||||
parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load')
|
||||
parser.add_argument('--train-filepath', type=str, help='Location of Parquet dataset to load for training')
|
||||
parser.add_argument('--test-filepath', type=str, help='Location of Parquet dataset to load for testing')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
def training(filepath: str) -> None:
|
||||
|
||||
class DataLoader():
    """
    Read modelling datasets from disk into DataFrames
    """

    @staticmethod
    def load(filepath: str) -> pd.DataFrame:
        """
        Load a dataset, choosing the reader from the file extension

        Args:
            filepath: Location of the dataset; '.parquet' and '.csv' files are supported

        Returns:
            The file contents as a DataFrame

        Raises:
            SystemExit: with exit code 1, after logging an error, for any other extension
        """
        # Normalise so path-like objects and upper-case extensions are matched too
        normalised_path = str(filepath).lower()

        if normalised_path.endswith('.parquet'):
            return pd.read_parquet(filepath)

        # Match on '.csv'; the previous '.csv.' (trailing dot) could never match a real CSV file
        if normalised_path.endswith('.csv'):
            return pd.read_csv(filepath)

        logger.error('Not implemented!')
        raise SystemExit(1)
|
||||
|
||||
class FeatureProcessor:
    """
    Handle all feature manipulation before modelling
    """

    @staticmethod
    def drop_columns(df: pd.DataFrame, drop_columns: List[str] = None) -> pd.DataFrame:
        """
        Remove columns that should not be passed on for modelling

        Args:
            df: Data to remove the columns from
            drop_columns: Column name, or list of column names, to remove.
                When omitted, the module-level DROP_COLUMNS is used

        Returns:
            The DataFrame returned by DataFrame.drop for the named columns
        """
        # Resolve the default at call time so the module list is never bound as a default argument
        if drop_columns is None:
            drop_columns = DROP_COLUMNS

        # Pass the labels through unchanged: wrapping an existing list in another
        # list does not give pandas a valid set of column labels
        return df.drop(columns=drop_columns)

    @staticmethod
    def retain_features(df: pd.DataFrame, features: List[str] = None) -> pd.DataFrame:
        """
        Determine which columns to keep for modelling

        Args:
            df: Data to select the columns from
            features: Column names to keep; when None every column is kept

        Returns:
            A DataFrame restricted to the requested columns, in the order requested

        Raises:
            SystemExit: with exit code 1, after logging an error, when a requested
                feature is not a column of df
        """
        if features is None:
            features = df.columns
        elif not set(features).issubset(df.columns):
            logger.error('Features defined is not contained in data')
            raise SystemExit(1)

        # list() so that any iterable of names is treated as a set of columns, not a single key
        return df[list(features)]

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop DROP_COLUMNS from df, then restrict it to FEATURE_COLUMNS
        """
        df = self.drop_columns(df, drop_columns=DROP_COLUMNS)
        df = self.retain_features(df, features=FEATURE_COLUMNS)
        return df
|
||||
|
||||
|
||||
|
||||
def training(train_filepath: str, test_filepath: str) -> None:
    """
    Pipeline to run training on the dataset

    Args:
        train_filepath: Location of the dataset used to fit the model
        test_filepath: Location of the dataset used to evaluate the fitted model
    """

    logger.info('Loading data')
    dataloader = DataLoader()
    train_df = dataloader.load(filepath=train_filepath)
    test_df = dataloader.load(filepath=test_filepath)

    logger.info('Feature processing')
    feature_processor = FeatureProcessor()
    train_df = feature_processor.process(train_df)
    test_df = feature_processor.process(test_df)

    logger.info('Build Model')

    data = TabularDataset(data=train_df)
    # Fit on a seeded random sample of a quarter of the training rows
    subsample_size = round(len(data)/4)
    data = data.sample(subsample_size, random_state=RANDOM_SEED)

    target_column = 'RDSAP_CHANGE'
    predictor_RDSAP = TabularPredictor(
        label=target_column,
        path="agModels-predictRDSAP",
        problem_type="regression",
        eval_metric='mean_absolute_error'
    ).fit(data, time_limit=8000, presets='high_quality', excluded_model_types=['KNN'])

    logger.info('Evaluate matrics')

    # Evaluate on the processed data loaded from test_filepath rather than a
    # hard-coded file, so the test set goes through the same feature processing
    test_data = TabularDataset(test_df)
    performance = predictor_RDSAP.evaluate(test_data)
    predictions = predictor_RDSAP.predict(test_data)

    # Absolute error per row between the actual target and the prediction
    test_data['predictions'] = predictions
    test_data['diff'] = abs(test_data[target_column] - test_data['predictions'])
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
logger.info('---Begin Pipeline---')
|
||||
|
|
@ -44,4 +128,4 @@ if __name__ == "__main__":
|
|||
logger.info('---Ingest Arguments---')
|
||||
args = ingest_arguments()
|
||||
|
||||
training(filepath=args.filepath)
|
||||
training(train_filepath=args.train_filepath, test_filepath=args.test_filepath)
|
||||
Loading…
Add table
Reference in a new issue