mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
add a test data generation script
This commit is contained in:
parent
e3be4ba344
commit
db67e0e23f
9 changed files with 135 additions and 9 deletions
|
|
@ -1,6 +1,6 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from settings import (
|
||||
from simulation_system.Settings import (
|
||||
DATA_PROCESSOR_SETTINGS,
|
||||
EARLIEST_EPC_DATE,
|
||||
FULLY_GLAZED_DESCRIPTIONS,
|
||||
|
|
|
|||
22
model_data/simulation_system/Logger.py
Normal file
22
model_data/simulation_system/Logger.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import logging
|
||||
|
||||
def setup_logger():
    """
    Configure the root logger and return it.

    The root logger is set to INFO and given one stream handler that formats
    records as ``<time> - <level> - <message>``. The function is idempotent:
    the handler is tagged with a name and is only attached when no handler
    with that name is already present, so calling this more than once does
    not make every record print multiple times.

    Returns:
        logging.Logger: the configured root logger.
    """
    handler_name = 'simulation_system_stream'

    # Create a logger (no name is given, so this is the root logger)
    logger = logging.getLogger()

    # Set the log level
    logger.setLevel(logging.INFO)

    # Only attach the handler if a previous call has not already done so
    already_attached = any(handler.name == handler_name for handler in logger.handlers)
    if not already_attached:
        # Create a formatter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

        # Create a stream handler; with no stream argument it writes to sys.stderr
        stream_handler = logging.StreamHandler()
        stream_handler.set_name(handler_name)
        stream_handler.setFormatter(formatter)

        # Add the stream handler to the logger
        logger.addHandler(stream_handler)

    return logger
|
||||
|
||||
# Module-level logger, configured once on import; scripts use it via `from Logger import logger`
logger = setup_logger()
|
||||
|
|
@ -3,7 +3,7 @@ import pandas as pd
|
|||
from tqdm import tqdm
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
from pathlib import Path
|
||||
from settings import (
|
||||
from simulation_system.Settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
|
|
@ -29,7 +29,7 @@ def app():
|
|||
dataset = []
|
||||
|
||||
|
||||
for directory in tqdm(directories):
|
||||
for directory in tqdm(directories):
|
||||
|
||||
filepath = directory / "certificates.csv"
|
||||
|
||||
|
|
@ -75,7 +75,7 @@ def app():
|
|||
# If there are any NA values, back fill first (i.e. most recent), then forward fill if needed
|
||||
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
|
||||
|
||||
# Extract the columns that are non all None
|
||||
# Extract the columns that are not all None
|
||||
na_columns = modified_property_data[columns_to_merge_on].isna().all()
|
||||
columns_to_merge_on = na_columns.index[~na_columns].to_list()
|
||||
|
||||
|
|
|
|||
BIN
model_data/simulation_system/model_build_data/test_data.parquet
Normal file
BIN
model_data/simulation_system/model_build_data/test_data.parquet
Normal file
Binary file not shown.
Binary file not shown.
BIN
model_data/simulation_system/preprocessed_data/dataset.parquet
Normal file
BIN
model_data/simulation_system/preprocessed_data/dataset.parquet
Normal file
Binary file not shown.
77
model_data/simulation_system/test_data_generation.py
Normal file
77
model_data/simulation_system/test_data_generation.py
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
from Logger import logger
|
||||
import argparse
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
|
||||
# Seed passed as `random_state` to DataFrame.sample in main(), so the same rows are drawn on every run
RANDOM_SEED = 0
|
||||
|
||||
def ingest_arguments() -> argparse.Namespace:
    """
    Helper function to take in arguments from script start

    Returns:
        argparse.Namespace with the attributes ``filepath``, ``output_folder``,
        ``percentage``, ``volume`` and ``sampling``. ``percentage`` and
        ``volume`` are None when not supplied; ``sampling`` defaults to 'random'.
    """

    parser = argparse.ArgumentParser(description='Inputs for test data generation script')

    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)
    parser.add_argument('--output-folder', type=str, help='Location of Parquet dataset to save', required=True)
    # The value is multiplied by the row count downstream, so it must be a fraction rather than 0-100
    parser.add_argument('--percentage', type=float, help='Fraction of data (between 0 and 1, e.g. 0.2) to use as test data', default=None)
    parser.add_argument('--volume', type=int, help='Volume of data to use as test data', default=None)
    parser.add_argument('--sampling', type=str, help='Type of sampling to do for test data', choices=['random', 'stratified'], default='random')

    return parser.parse_args()
|
||||
|
||||
def main(filepath: str, output_folder: str, percentage: float, volume: int, sampling: str):
    """
    Load a dataset in and split out the training+validation data and the test data.

    Args:
        filepath: Location of the Parquet dataset to load.
        output_folder: Folder that receives 'train_validation_data.parquet' and
            'test_data.parquet'. It is created if it does not exist.
        percentage: Fraction of the rows to use as test data, or None.
        volume: Number of rows to use as test data, or None.
        sampling: Sampling method; only 'random' is implemented.

    Raises:
        NotImplementedError: if sampling is 'stratified'.
        ValueError: if sampling is any other unknown value.
        SystemExit: with code 1 if neither percentage nor volume is given.
    """

    # Validate the cheap inputs before the (potentially slow) load, so a bad
    # invocation fails immediately instead of with a NameError at save time.
    if sampling == 'stratified':
        raise NotImplementedError('Stratified sampling is not yet implemented - please use random sampling')
    if sampling != 'random':
        raise ValueError(f'Unknown sampling method: {sampling!r}')

    if percentage is None and volume is None:
        logger.error('No amount specified - please specify either a percentage or volume')
        # SystemExit is raised directly; the `exit` builtin is only present when `site` is loaded
        raise SystemExit(1)

    logger.info('---Loading Data---')
    data = pd.read_parquet(filepath)

    # Compare against None explicitly so that an explicit 0 / 0.0 is treated as
    # "given" rather than falling through to max(..., None).
    if volume is None:
        test_amount = round(len(data) * percentage)
    elif percentage is None:
        test_amount = volume
    else:
        logger.info('Both percentage and volume specified - taking largest of the two')
        test_amount = max(round(len(data) * percentage), volume)

    logger.info(f'---Extracting {test_amount} from dataset to be test data')

    logger.info('--- Using random sample method ---')
    sample_index = data.sample(n=test_amount, random_state=RANDOM_SEED).index

    # NOTE(review): drop/loc select by index label, so this assumes the dataset
    # index is unique - confirm against the upstream preprocessing.
    train_validation_data = data.drop(sample_index)
    test_data = data.loc[sample_index]

    logger.info('--- Saving data ---')

    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    train_validation_data.to_parquet(output_dir / 'train_validation_data.parquet')
    test_data.to_parquet(output_dir / 'test_data.parquet')

    logger.info(' ---Pipeline complete---')
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: announce the run, read the CLI options, then hand
    # each parsed option to main() under its matching parameter name.
    logger.info('--- Generate test data pipeline ---')

    cli_args = ingest_arguments()
    source_path = cli_args.filepath
    destination = cli_args.output_folder

    main(
        filepath=source_path,
        output_folder=destination,
        percentage=cli_args.percentage,
        volume=cli_args.volume,
        sampling=cli_args.sampling,
    )
|
||||
|
||||
|
|
@ -1,20 +1,47 @@
|
|||
import os
|
||||
from logging import Logger
|
||||
import pandas as pd
|
||||
import argparse
|
||||
from Logger import logger
|
||||
from autogluon.tabular import TabularDataset, TabularPredictor
|
||||
|
||||
logger = Logger(__name__)
|
||||
def ingest_arguments() -> argparse.Namespace:
    """
    Helper function to take in arguments from script start

    Returns:
        argparse.Namespace with a single ``filepath`` attribute.
    """

    parser = argparse.ArgumentParser(description='Inputs for training script')

    # Required, so a missing path is reported by argparse instead of reaching
    # pd.read_parquet as None.
    parser.add_argument('--filepath', type=str, help='Location of Parquet dataset to load', required=True)

    return parser.parse_args()
|
||||
|
||||
def training(filepath: str) -> None:
    """
    Pipeline to run training on the dataset

    Currently only the load and feature-selection steps do any work; the
    remaining stages log their name and are otherwise placeholders.

    Args:
        filepath: Location of the Parquet dataset to load.
    """

    logger.info('Loading data')

    data = pd.read_parquet(filepath)

    logger.info('Feature selection')
    # Placeholder selection: every column is kept for now
    feature_columns = data.columns
    data = data[feature_columns]

    # The stages below are not yet implemented - they only log their name
    logger.info('Split data into train and validation')

    logger.info('Build Model')

    logger.info('Evaluate metrics')
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the CLI options and run the training pipeline
    # on the dataset they point at. training() needs the filepath, so it is
    # only called once the arguments have been read.
    logger.info('---Begin Pipeline---')

    logger.info('---Ingest Arguments---')
    args = ingest_arguments()

    training(filepath=args.filepath)
|
||||
Loading…
Add table
Reference in a new issue