Compare commits

..

No commits in common. "heat@v0.6.0" and "master" have entirely different histories.

7 changed files with 59 additions and 76 deletions

View file

@ -4,7 +4,9 @@ After the model is built, we can evaluate its performance
""" """
import os import os
import yaml
import pandas as pd import pandas as pd
from pathlib import Path
from core.interface.InterfaceModels import MLModel from core.interface.InterfaceModels import MLModel
from core.interface.InterfaceMetrics import MLMetrics from core.interface.InterfaceMetrics import MLMetrics
from core.interface.InterfaceDataClient import DataClient from core.interface.InterfaceDataClient import DataClient

View file

@ -18,39 +18,30 @@ def remove_starting_columns(df):
return df return df
def keep_negative_heat_change(df): def remove_floor_height_ending(df):
df = df[df["heat_demand_change"] < 0] # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df return df
def keep_negative_carbon_change(df): def remove_minimum_habitable_room_size(df):
df = df[df["carbon_change"] < 0] # Need a minimum of 6.5m2 of floor area per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df return df
# TODO: Move to ETL pipeline def keep_flats(df):
def remove_unreasonable_habitable_rooms(df): df = df[df["property_type"] == "Flat"]
"""
Assumption is that the floor area per habitable room should be at least 6.5m2
"""
minimum_room_size_index = (
df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5
)
df = df[minimum_room_size_index]
return df return df
def remove_top_1_percent_heat_demand(df): def keep_non_zero_rdsap(df):
# threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%'] df = df[df["rdsap_change"] != 0]
threshold_value = 860
df = df[df["heat_demand_starting"] < threshold_value]
return df
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
return df return df
@ -63,11 +54,10 @@ def remove_top_1_percent_carbon(df):
# return df # return df
business_logic = { business_logic = {
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms, # "keep_non_zero_rdsap": keep_non_zero_rdsap,
"keep_negative_heat_change": keep_negative_heat_change, # "keep_flats": keep_flats,
"keep_negative_carbon_change": keep_negative_carbon_change, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, # "remove_floor_height_ending": remove_floor_height_ending
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -1,7 +1,6 @@
""" """
After predictions, we may want to apply some post processing to the predictions After predictions, we may want to apply some post processing to the predictions
""" """
import pandas as pd import pandas as pd
@ -14,11 +13,10 @@ def clip_predictions_to_minimum_value(
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least a one-point improvement # We expect all predictions to be at least a one-point improvement
replace_index = ( replace_index = (
predictions_df["predictions"] predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
> predictions_df["heat_demand_starting"] - minimum_value
) )
predictions_df.loc[replace_index, "predictions"] = ( predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "heat_demand_starting"] - minimum_value predictions_df.loc[replace_index, "sap_starting"] + minimum_value
) )
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]

View file

@ -8,6 +8,6 @@ default:
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -31,13 +31,13 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: heat_demand_ending target: sap_ending
identifier_columns: ["uprn"] identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [ drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "sap_ending", "carbon_ending", "days_to_starting", "days_to_ending", "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms'] 'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',

View file

@ -24,7 +24,7 @@ stages:
- heat_demand_change - heat_demand_change
- carbon_change - carbon_change
- rdsap_change - rdsap_change
- sap_ending - heat_demand_ending
- carbon_ending - carbon_ending
- days_to_starting - days_to_starting
- days_to_ending - days_to_ending
@ -37,7 +37,7 @@ stages:
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: heat_demand_ending default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
@ -49,8 +49,8 @@ stages:
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: 13cd955d579de20efe743f82bc434c7e.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 37294025 size: 45056059
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
@ -61,8 +61,8 @@ stages:
size: 4820 size: 4820
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 13cd955d579de20efe743f82bc434c7e.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 37294025 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -94,18 +94,18 @@ stages:
outs: outs:
- path: data/fit_predictions/ - path: data/fit_predictions/
hash: md5 hash: md5
md5: b9c9ca64ea6973c409c3a7b8f8ed0c3e.dir md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 2902493 size: 3349989
nfiles: 1 nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: a9215bba342ed7ec3f97815dfef94e48.dir md5: 13c3100e1486c27a83a8a47491077842.dir
size: 727501601 size: 773523079
nfiles: 36 nfiles: 36
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: 548a431d58cd4f5a3118235dec734372 md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 219 size: 224
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
@ -115,13 +115,13 @@ stages:
size: 2464 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: a9215bba342ed7ec3f97815dfef94e48.dir md5: 13c3100e1486c27a83a8a47491077842.dir
size: 727501601 size: 773523079
nfiles: 36 nfiles: 36
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 13cd955d579de20efe743f82bc434c7e.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 37294025 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -133,25 +133,25 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 484781d6b359e458a25e9ab728d6514d.dir md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 380517 size: 463197
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
deps: deps:
- path: 4_generate_metrics.py - path: 4_generate_metrics.py
hash: md5 hash: md5
md5: d61bb524f706917f6a3eb72b1ab8bc61 md5: 4fedb86d89d528f0a6597934ba3890a0
size: 3447 size: 3484
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 484781d6b359e458a25e9ab728d6514d.dir md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 380517 size: 463197
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: 13cd955d579de20efe743f82bc434c7e.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 37294025 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -161,8 +161,8 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 4d246765aff7c45079d02b4d8f7527f7 md5: 3e08df02fd5c5d094bcf936e1338d596
size: 220 size: 223
generate_scenerio_metrics: generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py cmd: python 5_generate_scenarios.py
deps: deps:
@ -176,14 +176,15 @@ stages:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
scenario_data_filepaths: scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md
outs: outs:
- path: metrics/scenario_metrics.md - path: metrics/scenario_metrics.md
hash: md5 hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 0 size: 363
- path: metrics/scenario_table.md - path: metrics/scenario_table.md
hash: md5 hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e md5: d6baf100a1623cc2467c2f8221d314c9
size: 0 size: 2133

View file

@ -1,7 +1,6 @@
""" """
Doing some eda on dataset Doing some eda on dataset
""" """
# Look at response variable # Look at response variable
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
@ -39,6 +38,7 @@ train_df[[target, "SAP_STARTING"]].plot(y=target, x="SAP_STARTING", style="o")
train_df[[target, "HEAT_DEMAND_STARTING"]].plot( train_df[[target, "HEAT_DEMAND_STARTING"]].plot(
x=target, y="HEAT_DEMAND_STARTING", style="o" x=target, y="HEAT_DEMAND_STARTING", style="o"
) )
# Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict # Both make sense: i.e. the higher the sap, the lower we predict and the higher the heat demand, the higher we predict
# Load the autogluon model and check feature importance # Load the autogluon model and check feature importance
@ -176,8 +176,6 @@ plot_permutation_importance(exp, fig_kw={"figwidth": 7, "figheight": 6})
# #
# #
from core.MLMetrics import metrics_factory
from core.MLModels import model_factory from core.MLModels import model_factory
from core.DataClient import dataclient_factory from core.DataClient import dataclient_factory
import pandas as pd import pandas as pd
@ -218,12 +216,6 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False) mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])] cosine_similarity_df = mix_df[mix_df.columns.difference(["predictions", "residual"])]
metrics = metrics_factory("Regression")
metrics.generate_metrics(mix_df["predictions"], mix_df["HEAT_DEMAND_ENDING"])
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import cosine_similarity
row_index = 0 row_index = 0