Clean up prediction analysis code and add clipping to model

This commit is contained in:
Michael Duong 2023-09-28 18:09:48 +00:00
parent 56cf9c33d4
commit 84d3dee7d7
13 changed files with 230 additions and 102 deletions

View file

@ -15,6 +15,7 @@ from core.interface.InterfaceDataClient import DataClient
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.MLMetrics import metrics_factory
from configs.post_prediction_logic import post_prediction_logic
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -74,7 +75,9 @@ def build_model(
prediction_data = train_data.drop(columns=target)
fit_predictions = model.predict(data=prediction_data)
fit_predictions = model.predict(
data=prediction_data, post_prediction_logic=post_prediction_logic
)
logger.info("------------------------------")
logger.info("--- Generating fit metrics ---")

View file

@ -11,6 +11,6 @@ AutogluonAutoML:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 60
time_limit: 600
presets: medium_quality
excluded_model_types: ['KNN']

View file

@ -3,7 +3,7 @@ feature_processor_config:
subsample_amount: null
subsample_seed: 0
target: SAP_ENDING
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE"]
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "RDSAP_CHANGE", "HEAT_DEMAND_ENDING", "CARBON_ENDING"]
# retain_features: ["TOTAL_FLOOR_AREA_STARTING", "SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS", "FIXED_LIGHTING_OUTLETS_COUNT", "PHOTO_SUPPLY_STARTING", "MULTI_GLAZE_PROPORTION_STARTING", "LOW_ENERGY_LIGHTING_STARTING", "NUMBER_OPEN_FIREPLACES_STARTING", "EXTENSION_COUNT_STARTING", "FLOOR_HEIGHT_STARTING", "PHOTO_SUPPLY_ENDING", "MULTI_GLAZE_PROPORTION_ENDING", "LOW_ENERGY_LIGHTING_ENDING", "NUMBER_OPEN_FIREPLACES_ENDING", "EXTENSION_COUNT_ENDING", "TOTAL_FLOOR_AREA_ENDING", "FLOOR_HEIGHT_ENDING", "DAYS_TO_STARTING", "DAYS_TO_ENDING"]
# retain_features: null
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
@ -32,29 +32,30 @@ feature_processor_config:
# 'DAYS_TO_STARTING',
# 'WALLS_DESCRIPTION_STARTING',
# 'FLOOR_DESCRIPTION_STARTING']
retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
'NUMBER_HEATED_ROOMS',
'FIXED_LIGHTING_OUTLETS_COUNT',
'CONSTRUCTION_AGE_BAND',
'TRANSACTION_TYPE_ENDING',
'LIGHTING_DESCRIPTION_ENDING',
'MAINHEAT_DESCRIPTION_ENDING',
'HOTWATER_DESCRIPTION_ENDING',
'MAIN_FUEL_ENDING',
'MECHANICAL_VENTILATION_ENDING',
'SECONDHEAT_DESCRIPTION_ENDING',
'ENERGY_TARIFF_ENDING',
'SOLAR_WATER_HEATING_FLAG_ENDING',
'PHOTO_SUPPLY_ENDING',
'WINDOWS_DESCRIPTION_ENDING',
'GLAZED_TYPE_ENDING',
'MULTI_GLAZE_PROPORTION_ENDING',
'LOW_ENERGY_LIGHTING_ENDING',
'NUMBER_OPEN_FIREPLACES_ENDING',
'MAINHEATCONT_DESCRIPTION_ENDING',
'EXTENSION_COUNT_ENDING',
'TOTAL_FLOOR_AREA_ENDING',
'FLOOR_HEIGHT_ENDING',
'DAYS_TO_ENDING',
'WALLS_DESCRIPTION_ENDING',
'FLOOR_DESCRIPTION_ENDING']
# retain_features: ["SAP_STARTING", 'PROPERTY_TYPE', 'BUILT_FORM', 'CONSTITUENCY', 'NUMBER_HABITABLE_ROOMS',
# 'NUMBER_HEATED_ROOMS',
# 'FIXED_LIGHTING_OUTLETS_COUNT',
# 'CONSTRUCTION_AGE_BAND',
# 'TRANSACTION_TYPE_ENDING',
# 'LIGHTING_DESCRIPTION_ENDING',
# 'MAINHEAT_DESCRIPTION_ENDING',
# 'HOTWATER_DESCRIPTION_ENDING',
# 'MAIN_FUEL_ENDING',
# 'MECHANICAL_VENTILATION_ENDING',
# 'SECONDHEAT_DESCRIPTION_ENDING',
# 'ENERGY_TARIFF_ENDING',
# 'SOLAR_WATER_HEATING_FLAG_ENDING',
# 'PHOTO_SUPPLY_ENDING',
# 'WINDOWS_DESCRIPTION_ENDING',
# 'GLAZED_TYPE_ENDING',
# 'MULTI_GLAZE_PROPORTION_ENDING',
# 'LOW_ENERGY_LIGHTING_ENDING',
# 'NUMBER_OPEN_FIREPLACES_ENDING',
# 'MAINHEATCONT_DESCRIPTION_ENDING',
# 'EXTENSION_COUNT_ENDING',
# 'TOTAL_FLOOR_AREA_ENDING',
# 'FLOOR_HEIGHT_ENDING',
# 'DAYS_TO_ENDING',
# 'WALLS_DESCRIPTION_ENDING',
# 'FLOOR_DESCRIPTION_ENDING']
retain_features: null

View file

@ -5,16 +5,40 @@ During the feature processor step, we can apply additional business logic and fe
"""
Business Logic dict + functions
"""
business_logic = {}
def remove_starting_columns(df):
    """
    Drop every column whose name ends with "_STARTING", except "SAP_STARTING".

    Parameters
    ----------
    df : DataFrame whose column labels are strings and which contains a
        "SAP_STARTING" column.

    Returns
    -------
    A new DataFrame holding the non-"_STARTING" columns in their original
    order, followed by "SAP_STARTING" as the last column. The input frame is
    not modified.

    Raises
    ------
    KeyError
        If "SAP_STARTING" is not a column of ``df``.
    """
    keep_columns = [
        col_name for col_name in df.columns if not col_name.endswith("_STARTING")
    ]
    # SAP_STARTING matches the "_STARTING" suffix and was filtered out above;
    # it is added back explicitly, which is why it ends up last.
    keep_columns.append("SAP_STARTING")
    return df[keep_columns]
# def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list()
# keep_columns.append("SAP_STARTING")
# print(keep_columns)
# df = df[keep_columns]
# return df
# Registry of business-logic steps, keyed by step name. Each value is a
# callable that takes a DataFrame and returns a DataFrame.
# NOTE(review): presumably applied by the feature processor step - confirm
# against the caller.
business_logic = {
"remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns
}
"""
New features dict + function
"""
def SAP_ENDING(df):
    """
    Build the SAP_ENDING feature as SAP_STARTING plus RDSAP_CHANGE.

    Returns the element-wise sum of the two columns of ``df``.
    """
    starting_score = df["SAP_STARTING"]
    score_change = df["RDSAP_CHANGE"]
    return starting_score + score_change
# def SAP_ENDING(df):
# return df["SAP_STARTING"] + df["RDSAP_CHANGE"]
new_feature_funcs = {"SAP_ENDING": SAP_ENDING}
# new_feature_funcs = {}
# new_feature_funcs = {"SAP_ENDING": SAP_ENDING}
new_feature_funcs = {}

View file

@ -0,0 +1,32 @@
"""
After predictions, we may want to apply some post processing to the predictions
"""
import pandas as pd
def clip_predictions_to_minimum_value(
    data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 1
) -> pd.Series:
    """
    Lift predictions that fall below the starting SAP score.

    Wherever ``data["SAP_STARTING"]`` is strictly greater than the prediction,
    the prediction is replaced by ``SAP_STARTING + minimum_value``. Predictions
    equal to or above the starting score are returned unchanged.

    Parameters
    ----------
    data : feature frame containing a "SAP_STARTING" column.
    predictions : predictions to post-process; rows are matched to ``data``
        by index label.
    minimum_value : amount added to SAP_STARTING for replaced predictions.

    Returns
    -------
    A new Series carrying the same name as ``predictions``. Neither ``data``
    nor ``predictions`` is modified.
    """
    # rename() returns a new Series, so the caller's object keeps its name
    # (assigning to predictions.name would leak the temporary name).
    # Only SAP_STARTING is needed, which also avoids a clash if `data` already
    # has a column called "predictions".
    predictions_df = pd.concat(
        [data["SAP_STARTING"], predictions.rename("predictions")], axis=1
    )

    replace_index = predictions_df["SAP_STARTING"] > predictions_df["predictions"]
    predictions_df.loc[replace_index, "predictions"] = (
        predictions_df.loc[replace_index, "SAP_STARTING"] + minimum_value
    )

    return predictions_df["predictions"].rename(predictions.name)
# def round_predictions(data: pd.DataFrame, predictions: pd.Series) -> pd.Series:
# return predictions.round()
# Registry of post-prediction steps, keyed by step name. Each value is a
# callable taking (data, predictions) and returning the adjusted predictions.
# NOTE(review): presumably applied in dict order by the model's predict() -
# confirm against the caller.
post_prediction_logic = {
"clip_predictions_to_minimum_value": clip_predictions_to_minimum_value,
# "round_predictions": round_predictions
}

View file

@ -1 +1,4 @@
dataclient_type: local
nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
row_index: [0, 10, 20] # row indices of the example datapoints to explain

View file

@ -1,6 +1,7 @@
input_dataclient_type: aws-s3
output_dataclient_type: local
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet

View file

@ -109,7 +109,9 @@ class DataFrameFeatureProcessor:
# TODO: to test
for key, value in new_feature_funcs.items():
df[key] = value(df)
key_column = value(df)
key_column.name = key
df = pd.concat([df, key_column], axis=1)
return df

View file

@ -75,7 +75,9 @@ class SKLearnLinearRegression:
y_train = data[target]
self.model.fit(x_train, y_train)
def predict(self, data: pd.DataFrame) -> pd.Series:
def predict(
self, data: pd.DataFrame, post_prediction_logic: dict | None = None
) -> pd.Series:
"""
Method to predict
"""
@ -128,7 +130,9 @@ class SKLearnSVMRegression:
y_train = data[target]
self.model.fit(x_train, y_train)
def predict(self, data: pd.DataFrame) -> pd.Series:
def predict(
self, data: pd.DataFrame, post_prediction_logic: dict | None = None
) -> pd.Series:
"""
Method to predict
"""
@ -197,15 +201,39 @@ class AutogluonAutoML:
excluded_model_types=model_hyperparameters["excluded_model_types"],
)
def predict(self, data: pd.DataFrame) -> pd.Series:
def predict(
self, data: pd.DataFrame, post_prediction_logic: dict | None = None
) -> pd.Series:
"""
Method to predict
"""
if post_prediction_logic is None:
post_prediction_logic = {}
if self.model is None:
print("No model loaded/ trained")
exit(1)
predictions = pd.Series(self.model.predict(data))
if len(post_prediction_logic) != 0:
predictions = self._apply_post_prediction_logic(
data=data,
predictions=predictions,
post_prediction_logic=post_prediction_logic,
)
return predictions
def _apply_post_prediction_logic(
    self, data: pd.DataFrame, predictions: pd.Series, post_prediction_logic: dict
) -> pd.Series:
    """
    Run every post-processing step over the predictions, in mapping order.

    Each value of ``post_prediction_logic`` is called as
    ``step(data, predictions)`` and its return value is fed to the next step.
    The keys are labels only and are not used here.

    Returns the predictions produced by the last step, or the input
    predictions unchanged when the mapping is empty.
    """
    for step in post_prediction_logic.values():
        predictions = step(data, predictions)
    return predictions

View file

@ -32,7 +32,9 @@ class MLModel(Protocol):
"""
...
def predict(self, data: pd.DataFrame) -> pd.Series:
def predict(
self, data: pd.DataFrame, post_prediction_logic: dict | None
) -> pd.Series:
"""
Method to predict
"""

View file

@ -15,20 +15,20 @@ stages:
outs:
- path: data/prepared_data/
hash: md5
md5: c183712d22ab739e0be016724f44ee1c.dir
size: 12203729
md5: 2f00c92bf2fff7ed8006f4036f8f7d06.dir
size: 21102167
nfiles: 2
build_model:
cmd: python build_model.py
deps:
- path: build_model.py
hash: md5
md5: f9fa2a66d908b42ae196ce6f0f782258
size: 5134
md5: 84b86e829cb164fb2a202033f39e66e8
size: 5243
- path: data/prepared_data
hash: md5
md5: c183712d22ab739e0be016724f44ee1c.dir
size: 12203729
md5: 2f00c92bf2fff7ed8006f4036f8f7d06.dir
size: 21102167
nfiles: 2
params:
configs/build_model.yaml:
@ -36,7 +36,7 @@ stages:
output_filepath: ./data/model/autogluonmodel/
problem_type: regression
eval_metric: mean_absolute_error
time_limit: 60
time_limit: 600
presets: medium_quality
excluded_model_types:
- KNN
@ -49,30 +49,30 @@ stages:
outs:
- path: data/model/
hash: md5
md5: cb03448b572cb167bf281ee8d43dccd9.dir
size: 99423757
nfiles: 14
md5: d9b051bb9cc626b4fc4b77873838f029.dir
size: 242877007
nfiles: 18
- path: metrics/fit_metrics.json
hash: md5
md5: 48d9cc86c22c1ac0da8903a32a7d10c3
size: 183
md5: bbf8a1bb90cd8d9fea447ca97fe8eea3
size: 180
generate_predictions:
cmd: python generate_predictions.py
deps:
- path: data/model
hash: md5
md5: cb03448b572cb167bf281ee8d43dccd9.dir
size: 99423757
nfiles: 14
md5: d9b051bb9cc626b4fc4b77873838f029.dir
size: 242877007
nfiles: 18
- path: data/prepared_data
hash: md5
md5: c183712d22ab739e0be016724f44ee1c.dir
size: 12203729
md5: 2f00c92bf2fff7ed8006f4036f8f7d06.dir
size: 21102167
nfiles: 2
- path: generate_predictions.py
hash: md5
md5: a25c4611ff467cdc1c921918112a30fe
size: 4311
md5: 20c4657f5872cb8b60b69344600251b8
size: 4420
params:
configs/generate_predictions.yaml:
input_dataclient_type: local
@ -83,21 +83,21 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 3d5002f0eecd2374a0ef2fd6f711503e.dir
size: 383878
md5: 81f707df70bc0d9f7b305427e0034ed1.dir
size: 383598
nfiles: 1
generate_metrics:
cmd: python generate_metrics.py
deps:
- path: data/predictions
hash: md5
md5: 3d5002f0eecd2374a0ef2fd6f711503e.dir
size: 383878
md5: 81f707df70bc0d9f7b305427e0034ed1.dir
size: 383598
nfiles: 1
- path: data/prepared_data
hash: md5
md5: c183712d22ab739e0be016724f44ee1c.dir
size: 12203729
md5: 2f00c92bf2fff7ed8006f4036f8f7d06.dir
size: 21102167
nfiles: 2
- path: generate_metrics.py
hash: md5
@ -111,8 +111,8 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 08a81d2e5cecf360043498526bc98314
size: 183
md5: 75baa77d94386c9a567afdac48384435
size: 185
startup_cleanup:
cmd: python startup_cleanup.py
deps:

View file

@ -12,6 +12,7 @@ from core.interface.InterfaceDataClient import DataClient
from core.DataClient import dataclient_factory
from core.MLModels import model_factory
from core.Logger import logger
from configs.post_prediction_logic import post_prediction_logic
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
@ -70,7 +71,9 @@ def generate_predictions(
test_data.drop(columns=target) if target in test_data.columns else test_data
)
predictions = model.predict(data=prediction_data)
predictions = model.predict(
data=prediction_data, post_prediction_logic=post_prediction_logic
)
logger.info("--------------------------")
logger.info("--- Saving predictions ---")

View file

@ -14,6 +14,7 @@ shap.initjs()
import yaml
from typing import List
from pathlib import Path
from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceDataClient import DataClient
@ -36,6 +37,11 @@ feature_process_params = yaml.safe_load(open(feature_process_path))
build_model_path = Path(__file__).parent / "configs" / "build_model.yaml"
build_model_params = yaml.safe_load(open(build_model_path))
generate_predictions_path = (
Path(__file__).parent / "configs" / "generate_predictions.yaml"
)
generate_predictions_params = yaml.safe_load(open(generate_predictions_path))
prediction_analysis_path = (
Path(__file__).parent / "configs" / "prediction_analysis.yaml"
)
@ -50,29 +56,40 @@ dataclient = dataclient_factory(
dataclient_config=client_params[dataclient_type],
)
target = feature_process_params["feature_processor_config"]["target"]
predictions_column_name = generate_predictions_params["predictions_column_name"]
output_test_filepath = prepare_data_params["output_test_filepath"]
predictions_output_filepath = generate_predictions_params["predictions_output_filepath"]
nshap_samples = prediction_analysis_params["nshap_samples"]
row_index = prediction_analysis_params["row_index"]
def prediction_analysis(
model: MLModel, dataclient: DataClient, output_test_filepath: str
model: MLModel,
dataclient: DataClient,
target: str,
predictions_column_name: str,
output_test_filepath: str,
predictions_output_filepath: str,
nshap_samples: int,
row_index: List[int],
):
test_df = dataclient.load_data(output_test_filepath)
predictions = dataclient.load_data("./data/predictions/predictions.parquet")
predictions = dataclient.load_data(predictions_output_filepath)
mix_df = test_df.copy()
mix_df["predictions"] = predictions
mix_df["residual"] = abs(mix_df["predictions"] - mix_df["SAP_ENDING"])
mix_df = pd.concat([test_df.copy(), predictions], axis=1)
mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
target = "SAP_ENDING"
test_df_without_target = test_df.drop(columns=[target])
# test_df_summary = shap.kmeans(test_df, 10)
# print("Baseline feature-values: \n", test_df_summary)
class AutogluonWrapper:
def __init__(self, predictor, feature_names):
self.ag_model = predictor
class ModelWrapper:
def __init__(self, model, feature_names):
self.model = model
self.feature_names = feature_names
def predict(self, X):
@ -80,33 +97,39 @@ def prediction_analysis(
X = X.values.reshape(1, -1)
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X, columns=self.feature_names)
return self.ag_model.predict(X)
return self.model.predict(X)
model_wrapper = ModelWrapper(model, feature_names=test_df_without_target.columns)
ag_wrapper = AutogluonWrapper(
model.model, feature_names=test_df_without_target.columns
)
explainer = shap.KernelExplainer(
ag_wrapper.predict, test_df_without_target.head(100)
model_wrapper.predict, test_df_without_target.head(100)
)
NSHAP_SAMPLES = 100 # how many samples to use to approximate each Shapely value, larger values will be slower
N_VAL = 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
shap_predictions_df = pd.DataFrame(index=test_df_without_target.columns)
for index in row_index:
single_datapoint = test_df_without_target.iloc[[index]]
# single_prediction = model_wrapper.predict(single_datapoint)
shap_values_single = explainer.shap_values(
single_datapoint, nsamples=nshap_samples
)
shap.force_plot(
explainer.expected_value,
shap_values_single,
test_df_without_target.iloc[index, :],
)
shap_single_prediction_df = pd.DataFrame(
shap_values_single, columns=test_df_without_target.columns
).T
shap_single_prediction_df.columns = [index]
shap_single_prediction_df = shap_single_prediction_df.sort_values(index)
shap_predictions_df = pd.merge(
left=shap_predictions_df,
right=shap_single_prediction_df,
left_index=True,
right_index=True,
)
ROW_INDEX = 8541 # 23690 #21059 # index of an example datapoint
single_datapoint = test_df_without_target.iloc[[ROW_INDEX]]
single_prediction = ag_wrapper.predict(single_datapoint)
shap_values_single = explainer.shap_values(single_datapoint, nsamples=NSHAP_SAMPLES)
shap.force_plot(
explainer.expected_value,
shap_values_single,
test_df_without_target.iloc[ROW_INDEX, :],
)
shap_single_prediciton_df = pd.DataFrame(
shap_values_single, columns=test_df_without_target.columns
).T
shap_single_prediciton_df.columns = ["contribution"]
shap_single_prediciton_df = shap_single_prediciton_df.sort_values("contribution")
return shap_predictions_df
if __name__ == "__main__":
@ -116,7 +139,13 @@ if __name__ == "__main__":
logger.info("----------------------------")
prediction_analysis(
model=model, dataclient=dataclient, output_test_filepath=output_test_filepath
model=model,
dataclient=dataclient,
target=target,
predictions_column_name=predictions_column_name,
output_test_filepath=output_test_filepath,
nshap_samples=nshap_samples,
row_index=row_index,
)
logger.info("-------------------------------")