mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Merged in changes; changed imports to stay within core for a smaller dependency footprint in the Dockerfile
This commit is contained in:
commit
0f2cd9b09d
10 changed files with 53 additions and 43 deletions
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -127,6 +127,7 @@ venv/
|
||||||
ENV/
|
ENV/
|
||||||
env.bak/
|
env.bak/
|
||||||
venv.bak/
|
venv.bak/
|
||||||
|
.training_env/
|
||||||
|
|
||||||
# Spyder project settings
|
# Spyder project settings
|
||||||
.spyderproject
|
.spyderproject
|
||||||
|
|
@ -252,6 +253,7 @@ backend/.idea
|
||||||
open_uprn/.idea/
|
open_uprn/.idea/
|
||||||
conservation_areas/.idea/
|
conservation_areas/.idea/
|
||||||
model_data/.idea/
|
model_data/.idea/
|
||||||
|
model_data/simulation_system/.idea/
|
||||||
|
|
||||||
model_data/simulation_system/data*
|
model_data/simulation_system/data*
|
||||||
|
model_data/simulation_system/model_directory/
|
||||||
|
|
|
||||||
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
||||||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||||
</content>
|
</content>
|
||||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.10 (simulation_system_prediction)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
</module>
|
</module>
|
||||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<project version="4">
|
<project version="4">
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (simulation_system_prediction)" project-jdk-type="Python SDK" />
|
||||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||||
<option name="version" value="3" />
|
<option name="version" value="3" />
|
||||||
</component>
|
</component>
|
||||||
|
|
|
||||||
0
__init__.py
Normal file
0
__init__.py
Normal file
|
|
@ -129,17 +129,15 @@ class AutogluonModel:
|
||||||
|
|
||||||
return metrics_df
|
return metrics_df
|
||||||
|
|
||||||
def optimise_model_for_deployment(self, deployment_path: Path = None) -> None:
|
def optimise_model_for_deployment(self, deployment_path: Path = None) -> str:
|
||||||
"""
|
"""
|
||||||
We can optimise the deployment for a autogluon model
|
We can optimise the deployment for a autogluon model
|
||||||
"""
|
"""
|
||||||
if self.model is None:
|
if self.model is None:
|
||||||
logger.error("No model to optimise for deployment")
|
raise ValueError("No model to optimise for deployment")
|
||||||
exit(1)
|
|
||||||
|
|
||||||
if deployment_path is None:
|
if deployment_path is None:
|
||||||
logger.error("Deployment path required")
|
raise ValueError("Deployment path required")
|
||||||
exit(1)
|
|
||||||
|
|
||||||
# This will return a string path of the location
|
# This will return a string path of the location
|
||||||
return self.model.clone_for_deployment(deployment_path)
|
return self.model.clone_for_deployment(deployment_path)
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,18 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from core.Logger import logger
|
import os
|
||||||
|
|
||||||
class DataLoader():
|
|
||||||
|
class DataLoader:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load(filepath: str, index_col: str = None) -> pd.DataFrame:
|
def load(filepath: str, index_col: str = None) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Load different datasets
|
Load different datasets
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not os.path.exists(filepath):
|
||||||
|
raise FileNotFoundError(f"File not found: {filepath}")
|
||||||
|
|
||||||
if filepath.endswith('.parquet'):
|
if filepath.endswith('.parquet'):
|
||||||
df = pd.read_parquet(filepath)
|
df = pd.read_parquet(filepath)
|
||||||
if index_col is not None:
|
if index_col is not None:
|
||||||
|
|
@ -15,7 +20,6 @@ class DataLoader():
|
||||||
elif filepath.endswith('.csv'):
|
elif filepath.endswith('.csv'):
|
||||||
df = pd.read_csv(filepath, index_col=index_col)
|
df = pd.read_csv(filepath, index_col=index_col)
|
||||||
else:
|
else:
|
||||||
logger.error('Not implemented!')
|
raise ValueError(f"File format not supported for file: {filepath}")
|
||||||
exit(1)
|
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ class DataProcessor:
|
||||||
|
|
||||||
def __init__(self, filepath: Path) -> None:
|
def __init__(self, filepath: Path) -> None:
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
|
self.data = None
|
||||||
|
|
||||||
def load_data(self, low_memory=False) -> None:
|
def load_data(self, low_memory=False) -> None:
|
||||||
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
|
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
|
||||||
|
|
|
||||||
|
|
@ -6,18 +6,21 @@ import pandas as pd
|
||||||
from typing import List
|
from typing import List
|
||||||
from core.Logger import logger
|
from core.Logger import logger
|
||||||
|
|
||||||
RDSAP_CHANGE_DROP_COLUMNS = ['UPRN', 'HEAT_DEMAND_CHANGE']
|
RDSAP_CHANGE_DROP_COLUMNS = ["UPRN", "HEAT_DEMAND_CHANGE"]
|
||||||
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ['UPRN', 'RDSAP_CHANGE']
|
HEAT_DEMAND_CHANGE_DROP_COLUMNS = ["UPRN", "RDSAP_CHANGE"]
|
||||||
|
|
||||||
|
RANDOM_SEED = 0
|
||||||
|
|
||||||
|
|
||||||
RANDOM_SEED = 0
|
|
||||||
|
|
||||||
class FeatureProcessor:
|
class FeatureProcessor:
|
||||||
"""
|
"""
|
||||||
Handle all feature manipulation before modelling
|
Handle all feature manipulation before modelling
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def drop_unused_columns(df: pd.DataFrame, target_column: str = "RDSAP_CHANGE") -> pd.DataFrame:
|
def drop_unused_columns(
|
||||||
|
df: pd.DataFrame, target_column: str = "RDSAP_CHANGE"
|
||||||
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Remove the unused columns for RDS
|
Remove the unused columns for RDS
|
||||||
"""
|
"""
|
||||||
|
|
@ -36,13 +39,13 @@ class FeatureProcessor:
|
||||||
features = df.columns
|
features = df.columns
|
||||||
else:
|
else:
|
||||||
if not set(features).issubset(df.columns):
|
if not set(features).issubset(df.columns):
|
||||||
logger.error('Features defined is not contained in data')
|
logger.error("Features defined is not contained in data")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
df = df[features]
|
df = df[features]
|
||||||
|
|
||||||
return df
|
return df
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
|
def subsample_data(df: pd.DataFrame, subsample_amount: int = None) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
|
|
@ -53,14 +56,13 @@ class FeatureProcessor:
|
||||||
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
|
df = df.sample(subsample_amount, random_state=RANDOM_SEED)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
def process(
|
def process(
|
||||||
self,
|
self,
|
||||||
df: pd.DataFrame,
|
df: pd.DataFrame,
|
||||||
target_column: str = "RDSAP_CHANGE",
|
target_column: str = "RDSAP_CHANGE",
|
||||||
features: List[str] = None,
|
features: List[str] = None,
|
||||||
subsample_amount: int = None
|
subsample_amount: int = None,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
"""
|
"""
|
||||||
Pipeline to get data ready for building a model
|
Pipeline to get data ready for building a model
|
||||||
"""
|
"""
|
||||||
|
|
|
||||||
|
|
@ -4,13 +4,12 @@ Script to load MLModel class and generate predictions
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import argparse
|
import argparse
|
||||||
from MLModel.Models import AutogluonModel
|
|
||||||
from core.Logger import logger
|
|
||||||
from core.DataLoader import DataLoader
|
|
||||||
from pathlib import Path
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from MLModel.Models import AutogluonModel
|
||||||
|
from core.Logger import logger
|
||||||
|
from core.DataLoader import DataLoader
|
||||||
from core.Settings import (
|
from core.Settings import (
|
||||||
BASE_REGISTRY_PATH,
|
BASE_REGISTRY_PATH,
|
||||||
REGISTRY_FILE,
|
REGISTRY_FILE,
|
||||||
|
|
@ -23,7 +22,8 @@ from core.Settings import (
|
||||||
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
|
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
|
||||||
|
|
||||||
# FOR TESTING
|
# FOR TESTING
|
||||||
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to DataFrame)
|
# For now just loading data first and then passing into function (i.e. as if we receive json data and convert to
|
||||||
|
# DataFrame)
|
||||||
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
|
# TEST_DATA = DataLoader.load(filepath="../simulation_system/model_build_data/change_data/rdsap_full/test_data.parquet")
|
||||||
# DATA = TEST_DATA.sample(1)
|
# DATA = TEST_DATA.sample(1)
|
||||||
|
|
||||||
|
|
@ -110,6 +110,7 @@ def prediction(
|
||||||
|
|
||||||
logger.info("--- Loading Model ---")
|
logger.info("--- Loading Model ---")
|
||||||
model = AutogluonModel()
|
model = AutogluonModel()
|
||||||
|
|
||||||
model.load_model(filepath=model_location)
|
model.load_model(filepath=model_location)
|
||||||
|
|
||||||
logger.info("--- Generating Predictions ---")
|
logger.info("--- Generating Predictions ---")
|
||||||
|
|
@ -143,7 +144,6 @@ def prediction(
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
args = ingest_arguments()
|
args = ingest_arguments()
|
||||||
|
|
||||||
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
# Data can be passed in as JSON string: python3 predictions.py --data '{"TOTAL_FLOOR_AREA": 1}'
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,13 @@ import argparse
|
||||||
# import boto3
|
# import boto3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import pandas as pd
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from MLModel.Models import AutogluonModel
|
||||||
from core.Logger import logger
|
from core.Logger import logger
|
||||||
from core.DataLoader import DataLoader
|
from core.DataLoader import DataLoader
|
||||||
from core.FeatureProcessor import FeatureProcessor
|
from core.FeatureProcessor import FeatureProcessor
|
||||||
from MLModel.Models import AutogluonModel
|
|
||||||
import pandas as pd
|
|
||||||
from core.Settings import (
|
from core.Settings import (
|
||||||
MODEL_DIRECTORY,
|
MODEL_DIRECTORY,
|
||||||
BASE_REGISTRY_PATH,
|
BASE_REGISTRY_PATH,
|
||||||
|
|
@ -30,8 +32,6 @@ from core.Settings import (
|
||||||
SEABORN_RESIDUAL_LINE_COLOUR,
|
SEABORN_RESIDUAL_LINE_COLOUR,
|
||||||
SEABORN_RESIDUAL_LINE_WIDTH,
|
SEABORN_RESIDUAL_LINE_WIDTH,
|
||||||
)
|
)
|
||||||
import seaborn as sns
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
|
|
||||||
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
|
TIMESTAMP = datetime.now().strftime(TIMESTAMP_FORMAT)
|
||||||
|
|
||||||
|
|
@ -137,8 +137,7 @@ def training(
|
||||||
|
|
||||||
model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER)
|
model = AutogluonModel(output_filepath=output_base / MODEL_FOLDER)
|
||||||
else:
|
else:
|
||||||
logger.error("No alternative model implemented yet")
|
raise ValueError("No alternative model implemented yet")
|
||||||
exit(1)
|
|
||||||
|
|
||||||
model.train_model(
|
model.train_model(
|
||||||
data=train_df, target_column=target_column, hyperparameters=hyperparameters
|
data=train_df, target_column=target_column, hyperparameters=hyperparameters
|
||||||
|
|
@ -207,7 +206,6 @@ def training(
|
||||||
# TODO: Need a model registry - for now have this as a CSV
|
# TODO: Need a model registry - for now have this as a CSV
|
||||||
# Save this in the model directory
|
# Save this in the model directory
|
||||||
logger.info("--- Append registry with new model ---")
|
logger.info("--- Append registry with new model ---")
|
||||||
|
|
||||||
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
|
registry_path = BASE_REGISTRY_PATH / target_column / REGISTRY_FILE
|
||||||
|
|
||||||
if registry_path.exists():
|
if registry_path.exists():
|
||||||
|
|
@ -244,7 +242,8 @@ def training(
|
||||||
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
|
registry_row = pd.concat([model_details_df, metrics_df], axis=1)
|
||||||
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
|
registry_df = pd.concat([registry_df, registry_row], axis=0).reset_index(drop=True)
|
||||||
|
|
||||||
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and regenerate new metrics
|
# TODO: will need a rebuild script metric script -i.e. if we add new metrics, we will want to load models and
|
||||||
|
# regenerate new metrics
|
||||||
# TODO: decide metric to optimise to
|
# TODO: decide metric to optimise to
|
||||||
registry_df = registry_df.sort_values(
|
registry_df = registry_df.sort_values(
|
||||||
"mean_absolute_error", ascending=False
|
"mean_absolute_error", ascending=False
|
||||||
|
|
@ -253,6 +252,8 @@ def training(
|
||||||
registry_df.loc[0, "best_model"] = True
|
registry_df.loc[0, "best_model"] = True
|
||||||
|
|
||||||
logger.info("--- Saving new model to registry ---")
|
logger.info("--- Saving new model to registry ---")
|
||||||
|
# Ensure the directory exists
|
||||||
|
registry_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
registry_df.to_csv(registry_path, index=False)
|
registry_df.to_csv(registry_path, index=False)
|
||||||
|
|
||||||
logger.info("--- Training Pipeline Complete --- ")
|
logger.info("--- Training Pipeline Complete --- ")
|
||||||
|
|
@ -265,7 +266,9 @@ if __name__ == "__main__":
|
||||||
logger.info("---Ingest Arguments---")
|
logger.info("---Ingest Arguments---")
|
||||||
args = ingest_arguments()
|
args = ingest_arguments()
|
||||||
|
|
||||||
# To run script: python3 training.py --train-filepath ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath ./model_build_data/change_data/rdsap_full/test_data.parquet
|
# To run script: python3 training.py --train-filepath
|
||||||
|
# ./model_build_data/change_data/rdsap_full/train_validation_data.parquet --test-filepath
|
||||||
|
# ./model_build_data/change_data/rdsap_full/test_data.parquet
|
||||||
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
|
# TODO: Ingest hyper parameters from somewhere - currently change at the top of script
|
||||||
training(
|
training(
|
||||||
train_filepath=args.train_filepath,
|
train_filepath=args.train_filepath,
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue