Compare commits

..

No commits in common. "carbon@v0.7.0" and "master" have entirely different histories.

10 changed files with 72 additions and 101 deletions

View file

@ -24,9 +24,9 @@
"active": true "active": true
}, },
"carbon": { "carbon": {
"version": "v0.6.0", "version": "v0.5.0",
"stage": { "stage": {
"dev": "v0.6.0" "dev": "v0.5.0"
}, },
"registered": true, "registered": true,
"active": true "active": true

View file

@ -13,11 +13,7 @@ RUN yum install -y gcc python3-devel gcc-c++
# Install python packages # Install python packages
COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt COPY modules/ml-pipeline/src/pipeline/requirements/predictions/requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r ./requirements.txt
RUN pip install uv
RUN uv pip install -r requirements.txt --system
# RUN pip install --no-cache-dir -r ./requirements.txt
# Copy the project code # Copy the project code
COPY modules/ml-pipeline/src/pipeline ./pipeline COPY modules/ml-pipeline/src/pipeline ./pipeline

View file

@ -5,11 +5,8 @@ RUN apt-get update && apt-get install -y libgomp1 gcc python3-dev
COPY pipeline/requirements/predictions/requirements.txt requirements.txt COPY pipeline/requirements/predictions/requirements.txt requirements.txt
RUN pip install uv RUN pip install --upgrade pip
RUN pip install -r requirements.txt
RUN uv pip install -r requirements.txt --system
# RUN pip install -r requirements.txt
# Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script # Assuming in the CI/CD step, there will be a dvc pull step to get data and model, so will just need to run a single script
COPY pipeline/ /home/pipeline/ COPY pipeline/ /home/pipeline/

View file

@ -1,3 +1,3 @@
# The generic reproducible ML-pipeline! # The generic reproducible ML-pipeline
Pipeline required to build a model to produce an output, that gets hashed via DVC Pipeline required to build a model to produce an output, that gets hashed via DVC

View file

@ -1,4 +1,3 @@
# Ignore dynaconf secret files # Ignore dynaconf secret files
.secrets.* .secrets.*
example.py

View file

@ -18,44 +18,30 @@ def remove_starting_columns(df):
return df return df
def keep_negative_heat_change(df): def remove_floor_height_ending(df):
df = df[df["heat_demand_change"] < 0] # df.describe(percentiles=[0.005,0.99])['FLOOR_HEIGHT_ENDING']
# shows bottom 0.5 percentile is 1.665
# So keep anything above this
df = df[df["floor_height_ending"] > 1.665].reset_index(drop=True)
print("we in here")
return df return df
def keep_non_negative_carbon_ending(df): def remove_minimum_habitable_room_size(df):
df = df[df["carbon_ending"] > 0] # Need minimum of 6.5m per habitable room
df = df[
df["total_floor_area_ending"] / df["number_habitable_rooms"] > 6.5
].reset_index(drop=True)
return df return df
def keep_negative_carbon_change(df): def keep_flats(df):
df = df[df["carbon_change"] < 0] df = df[df["property_type"] == "Flat"]
return df return df
# TODO: Move to ETL pipeline def keep_non_zero_rdsap(df):
def remove_unreasonable_habitable_rooms(df): df = df[df["rdsap_change"] != 0]
"""
Assumption is that proportion of floor area to habitable rooms should be at least 6.5m2
"""
minimum_room_size_index = (
df["total_floor_area_ending"] / df["number_habitable_rooms"] >= 6.5
)
df = df[minimum_room_size_index]
return df
def remove_top_1_percent_heat_demand(df):
# threshold_value = df.describe(percentiles=[0.99])['HEAT_DEMAND_STARTING']['99%']
threshold_value = 860
df = df[df["heat_demand_starting"] < threshold_value]
return df
def remove_top_1_percent_carbon(df):
# threshold_value = df.describe(percentiles=[0.99])['CARBON_STARTING']['99%']
threshold_value = 18
df = df[df["carbon_starting"] < threshold_value]
return df return df
@ -68,12 +54,10 @@ def remove_top_1_percent_carbon(df):
# return df # return df
business_logic = { business_logic = {
"remove_unreasonable_habitable_rooms": remove_unreasonable_habitable_rooms, # "keep_non_zero_rdsap": keep_non_zero_rdsap,
"keep_negative_heat_change": keep_negative_heat_change, # "keep_flats": keep_flats,
"keep_negative_carbon_change": keep_negative_carbon_change, # "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
"remove_top_1_percent_heat_demand": remove_top_1_percent_heat_demand, # "remove_floor_height_ending": remove_floor_height_ending
"remove_top_1_percent_carbon": remove_top_1_percent_carbon,
"keep_non_negative_carbon_ending": keep_non_negative_carbon_ending,
# "remove_starting_columns": remove_starting_columns # "remove_starting_columns": remove_starting_columns
# "keep_ENDING_COLUMNS": keep_ending_columns # "keep_ENDING_COLUMNS": keep_ending_columns
} }

View file

@ -1,24 +1,23 @@
""" """
After predictions, we may want to apply some post processing to the predictions After predictions, we may want to apply some post processing to the predictions
""" """
import pandas as pd import pandas as pd
def clip_predictions_to_minimum_value( def clip_predictions_to_minimum_value(
data: pd.DataFrame, data: pd.DataFrame, predictions: pd.Series, minimum_value: int = 0
predictions: pd.Series,
) -> pd.Series: ) -> pd.Series:
series_name = predictions.name series_name = predictions.name
predictions.name = "predictions" predictions.name = "predictions"
predictions = predictions.astype(data["carbon_starting"].dtype)
predictions_df = pd.concat([data, predictions], axis=1) predictions_df = pd.concat([data, predictions], axis=1)
# We expect all predictions to be at least a one-point improvement # We expect all predictions to be at least a one-point improvement
replace_index = predictions_df["predictions"] > predictions_df["carbon_starting"] replace_index = (
predictions_df.loc[replace_index, "predictions"] = predictions_df.loc[ predictions_df["sap_starting"] + minimum_value > predictions_df["predictions"]
replace_index, "carbon_starting" )
] predictions_df.loc[replace_index, "predictions"] = (
predictions_df.loc[replace_index, "sap_starting"] + minimum_value
)
predictions_new = predictions_df["predictions"] predictions_new = predictions_df["predictions"]
predictions_new.name = series_name predictions_new.name = series_name

View file

@ -8,6 +8,6 @@ default:
# - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/27-03-2024-11-38-15/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-08-47-45/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet # - s3://retrofit-data-dev/scenario_data/26-05-2024-10-44-53/recommendations_scoring_data.parquet
# - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet - s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md

View file

@ -18,8 +18,10 @@ default:
prepare_data: prepare_data:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-03-22-18-56-53/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-10-03-22-57-23/dataset_rooms.parquet # data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-25-08-36-36/dataset_rooms.parquet
# data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-26-10-31-39/dataset_rooms.parquet
data_filepath: s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
train_proportion: 0.9 train_proportion: 0.9
output_train_filepath: ./data/prepared_data/train.parquet output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet output_test_filepath: ./data/prepared_data/test.parquet
@ -29,14 +31,13 @@ default:
feature_processor_config: feature_processor_config:
subsample_amount: null subsample_amount: null
subsample_seed: 0 subsample_seed: 0
target: carbon_ending target: sap_ending
identifier_columns: ["uprn"] identifier_columns: ["uprn"]
# drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending"] # drop_columns: ["heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending"]
drop_columns: [ drop_columns: [
"heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "sap_ending", "days_to_starting", "days_to_ending", "heat_demand_change", "carbon_change", "rdsap_change", "heat_demand_ending", "carbon_ending", "days_to_starting", "days_to_ending",
'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending', 'number_habitable_rooms_starting', 'number_habitable_rooms_ending', 'number_heated_rooms_starting', 'number_heated_rooms_ending',
'number_habitable_rooms', 'number_heated_rooms', 'lighting_cost_starting', 'lighting_cost_ending', 'heating_cost_starting', 'heating_cost_ending', 'hot_water_cost_starting', 'hot_water_cost_ending',] 'number_habitable_rooms', 'number_heated_rooms']
# retain_features: ["SAP_STARTING", "TOTAL_FLOOR_AREA_DIFF"]
retain_features: null retain_features: null
# retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending', # retain_features: ['uprn', 'sap_starting', 'hot_water_energy_eff_ending',
# 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending', # 'mainheat_energy_eff_ending', 'constituency', 'roof_energy_eff_ending',

View file

@ -25,7 +25,7 @@ stages:
- carbon_change - carbon_change
- rdsap_change - rdsap_change
- heat_demand_ending - heat_demand_ending
- sap_ending - carbon_ending
- days_to_starting - days_to_starting
- days_to_ending - days_to_ending
- number_habitable_rooms_starting - number_habitable_rooms_starting
@ -34,19 +34,13 @@ stages:
- number_heated_rooms_ending - number_heated_rooms_ending
- number_habitable_rooms - number_habitable_rooms
- number_heated_rooms - number_heated_rooms
- lighting_cost_starting
- lighting_cost_ending
- heating_cost_starting
- heating_cost_ending
- hot_water_cost_starting
- hot_water_cost_ending
default.feature_processor.feature_processor_config.retain_features: default.feature_processor.feature_processor_config.retain_features:
default.feature_processor.feature_processor_config.subsample_amount: default.feature_processor.feature_processor_config.subsample_amount:
default.feature_processor.feature_processor_config.subsample_seed: 0 default.feature_processor.feature_processor_config.subsample_seed: 0
default.feature_processor.feature_processor_config.target: carbon_ending default.feature_processor.feature_processor_config.target: sap_ending
default.feature_processor.feature_processor_type: dataframe default.feature_processor.feature_processor_type: dataframe
default.prepare_data.data_filepath: default.prepare_data.data_filepath:
s3://retrofit-data-dev/sap_change_model/2024-10-03-22-57-23/dataset_rooms.parquet s3://retrofit-data-dev/sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet
default.prepare_data.input_dataclient_type: aws-s3 default.prepare_data.input_dataclient_type: aws-s3
default.prepare_data.output_dataclient_type: local default.prepare_data.output_dataclient_type: local
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
@ -55,8 +49,8 @@ stages:
outs: outs:
- path: data/prepared_data/ - path: data/prepared_data/
hash: md5 hash: md5
md5: f96aaa1181655a1bef313542f037b346.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 40772097 size: 45056059
nfiles: 2 nfiles: 2
build_model: build_model:
cmd: python 2_build_model.py cmd: python 2_build_model.py
@ -67,8 +61,8 @@ stages:
size: 4820 size: 4820
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: f96aaa1181655a1bef313542f037b346.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 40772097 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/build_model.yaml: configs/build_model.yaml:
@ -100,18 +94,18 @@ stages:
outs: outs:
- path: data/fit_predictions/ - path: data/fit_predictions/
hash: md5 hash: md5
md5: 821aace9a1dfb8b2adb507f4d7e6b36b.dir md5: d9c9afc05e8780db47c0548b19bf7d19.dir
size: 3995384 size: 3349989
nfiles: 1 nfiles: 1
- path: data/model/ - path: data/model/
hash: md5 hash: md5
md5: fde129c8b8610bdaecc3d28f4cfc6608.dir md5: 13c3100e1486c27a83a8a47491077842.dir
size: 751284807 size: 773523079
nfiles: 36 nfiles: 36
- path: metrics/fit_metrics.json - path: metrics/fit_metrics.json
hash: md5 hash: md5
md5: 471606cbb7d4f3e62fb94b493d3ec858 md5: 2ff70a2a45813e1bcdf2ea3aa8e07d4a
size: 227 size: 224
generate_predictions: generate_predictions:
cmd: python 3_generate_predictions.py cmd: python 3_generate_predictions.py
deps: deps:
@ -121,13 +115,13 @@ stages:
size: 2464 size: 2464
- path: data/model - path: data/model
hash: md5 hash: md5
md5: fde129c8b8610bdaecc3d28f4cfc6608.dir md5: 13c3100e1486c27a83a8a47491077842.dir
size: 751284807 size: 773523079
nfiles: 36 nfiles: 36
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: f96aaa1181655a1bef313542f037b346.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 40772097 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -139,8 +133,8 @@ stages:
outs: outs:
- path: data/predictions/ - path: data/predictions/
hash: md5 hash: md5
md5: 985d380681ab1f7645015a67b695b633.dir md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 557231 size: 463197
nfiles: 1 nfiles: 1
generate_metrics: generate_metrics:
cmd: python 4_generate_metrics.py cmd: python 4_generate_metrics.py
@ -151,13 +145,13 @@ stages:
size: 3484 size: 3484
- path: data/predictions - path: data/predictions
hash: md5 hash: md5
md5: 985d380681ab1f7645015a67b695b633.dir md5: 5d07bcebf3160a72bb18dfd79106e85c.dir
size: 557231 size: 463197
nfiles: 1 nfiles: 1
- path: data/prepared_data - path: data/prepared_data
hash: md5 hash: md5
md5: f96aaa1181655a1bef313542f037b346.dir md5: 80c9e138146a1d96b9d16091c207e2e8.dir
size: 40772097 size: 45056059
nfiles: 2 nfiles: 2
params: params:
configs/settings.yaml: configs/settings.yaml:
@ -167,8 +161,8 @@ stages:
outs: outs:
- path: metrics/metrics.json - path: metrics/metrics.json
hash: md5 hash: md5
md5: 9cc5f3a42681b321c26c414589ba561e md5: 3e08df02fd5c5d094bcf936e1338d596
size: 226 size: 223
generate_scenerio_metrics: generate_scenerio_metrics:
cmd: python 5_generate_scenarios.py cmd: python 5_generate_scenarios.py
deps: deps:
@ -182,14 +176,15 @@ stages:
input_dataclient_type: aws-s3 input_dataclient_type: aws-s3
output_dataclient_type: local output_dataclient_type: local
scenario_data_filepaths: scenario_data_filepaths:
- s3://retrofit-data-dev/scenario_data/28-05-2024-19-22-41/recommendations_scoring_data.parquet
comparison_output_filepath: ./metrics/scenario_table.md comparison_output_filepath: ./metrics/scenario_table.md
metrics_output_filepath: ./metrics/scenario_metrics.md metrics_output_filepath: ./metrics/scenario_metrics.md
outs: outs:
- path: metrics/scenario_metrics.md - path: metrics/scenario_metrics.md
hash: md5 hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e md5: fa4d6d7bbd7818613800da5f8f37ea96
size: 0 size: 363
- path: metrics/scenario_table.md - path: metrics/scenario_table.md
hash: md5 hash: md5
md5: d41d8cd98f00b204e9800998ecf8427e md5: d6baf100a1623cc2467c2f8221d314c9
size: 0 size: 2133