Compare commits

..

No commits in common. "carbon@v0.6.0" and "master" have entirely different histories.

8 changed files with 62 additions and 80 deletions

View file

@ -18,7 +18,7 @@
"heat": { "heat": {
"version": "v0.5.0", "version": "v0.5.0",
"stage": { "stage": {
"dev": "v0.11.0" "dev": "v0.5.0"
}, },
"registered": true, "registered": true,
"active": true "active": true

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline! # The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output, that gets hashed via DVC Pipeline required to build a model to produce an output, that gets hashed via DVC

View file

@ -1,4 +1,3 @@
# Ignore dynaconf secret files # Ignore dynaconf secret files
.secrets.* .secrets.*
example.py

View file

@ -18,44 +18,30 @@ def remove_starting_columns(df):
return df return df
def keep_negative_heat_change(df): def remove_floor_height_ending(df):
df = df[df["heat_demand_change"] < 0] # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df return df
def keep_non_negative_carbon_ending(df): def remove_minimum_habitable_room_size(df):
df = df[df["carbon_ending"] > 0] # Need minimum of 6.5m2 per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df return df
def keep_negative_carbon_change(df): def keep_flats(df):
df = df[df["carbon_change"] < 0] df = df[df["property_type"] == "Flat"]
return df return df
# TODO: Move to ETL pipeline def keep_non_zero_rdsap(df):
def remove_unreasonable_habitable_rooms(df): df = df[df["rdsap_change"] != 0]
"""
Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2
"""
minimum_room_size_index = (
df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5
)
df = df[minimum_room_size_index]
return df
def remove_top_1_percent_heat_demand(df):
# threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%']
threshold_value = 860
df = df[df["heat_demand_starting"] < threshold_value]
return df
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
return df return df
@ -68,12 +54,10 @@ def remove_top_1_percent_carbon(df):
# return df # return df
business_logic = { business_logic = {
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms, # "keep_non_zero_rdsap": keep_non_zero_rdsap,
"keep_negative_heat_change": keep_negative_heat_change, # "keep_flats": keep_flats,
"keep_negative_carbon_change": keep_negative_carbon_change, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, # "remove_floor_height_ending": remove_floor_height_ending
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
"keep_non_negative_carbon_ending": keep_non_negative_carbon_ending,
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -1,24 +1,23 @@
""" """
After predictions, we may want to apply some post processing to the predictions After predictions, we may want to apply some post processing to the predictions
""" """
import pandas as pd import pandas as pd
def clip_predictions_to_minimum_value( def clip_predictions_to_minimum_value(
data: pd.DataFrame, data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
predictions: pd.Series,
) -> pd.Series: ) -> pd.Series:
series_name = predictions.name series_name = predictions.name
predictions.name = "predictions" predictions.name = "predictions"
predictions = predictions.astype(data["carbon_starting"].dtype)
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least one point improvement # We expect all predictions to be at least one point improvement
replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"] replace_index = (
predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[ predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
replace_index, "carbon_starting" )
] predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
)
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]
predictions_new.name = series_name predictions_new.name = series_name

View file

@ -8,6 +8,6 @@ default:
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -31,14 +31,13 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: carbon_ending target: sap_ending
identifier_columns: ["uprn"] identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"] # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [ drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending", "days_to_starting", "days_to_ending", "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms'] 'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',

View file

@ -25,7 +25,7 @@ stages:
- carbon_change - carbon_change
- rdsap_change - rdsap_change
- heat_demand_ending - heat_demand_ending
- sap_ending - carbon_ending
- days_to_starting - days_to_starting
- days_to_ending - days_to_ending
- number_habitable_rooms_starting - number_habitable_rooms_starting
@ -37,7 +37,7 @@ stages:
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: carbon_ending default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
@ -49,8 +49,8 @@ stages:
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: e2efac20634b919381adfb962a42d40a.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 36961727 size: 45056059
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
@ -61,8 +61,8 @@ stages:
size: 4820 size: 4820
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: e2efac20634b919381adfb962a42d40a.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 36961727 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -94,18 +94,18 @@ stages:
outs: outs:
- path: data/fit_predictions/ - path: data/fit_predictions/
hash: md5 hash: md5
md5: d2568a3244df4d3444b6190599f74b96.dir md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3661106 size: 3349989
nfiles: 1 nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: 756100e033e0bd4445a437e43f4c53af.dir md5: 13c3100e1486c27a83a8a47491077842.dir
size: 730442848 size: 773523079
nfiles: 36 nfiles: 36
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: 3bcb3b9728521cd341eb71af109ca778 md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 227 size: 224
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
@ -115,13 +115,13 @@ stages:
size: 2464 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: 756100e033e0bd4445a437e43f4c53af.dir md5: 13c3100e1486c27a83a8a47491077842.dir
size: 730442848 size: 773523079
nfiles: 36 nfiles: 36
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: e2efac20634b919381adfb962a42d40a.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 36961727 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -133,8 +133,8 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 09f3584d6fbd447dd2714eb2774139d5.dir md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 499683 size: 463197
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
@ -145,13 +145,13 @@ stages:
size: 3484 size: 3484
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 09f3584d6fbd447dd2714eb2774139d5.dir md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 499683 size: 463197
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: e2efac20634b919381adfb962a42d40a.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 36961727 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -161,8 +161,8 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: abf8720d06f073f47501aa1172527e9e md5: 3e08df02fd5c5d094bcf936e1338d596
size: 225 size: 223
generate_scenerio_metrics: generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py cmd: python 5_generate_scenarios.py
deps: deps:
@ -176,14 +176,15 @@ stages:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
scenario_data_filepaths: scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md
outs: outs:
- path: metrics/scenario_metrics.md - path: metrics/scenario_metrics.md
hash: md5 hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 0 size: 363
- path: metrics/scenario_table.md - path: metrics/scenario_table.md
hash: md5 hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e md5: d6baf100a1623cc2467c2f8221d314c9
size: 0 size: 2133