Fixed bug and switched to sklearn for simplicity

This commit is contained in:
Michael Duong 2023-09-15 00:12:08 +01:00
parent ddcce2b56d
commit 88319c1480
9 changed files with 82 additions and 57 deletions

View file

@ -1,5 +1,5 @@
model_type: AutogluonAutoML
model_save_filepath: ./data/model/autogluonmodel/
model_type: SKLearnLinearRegression
model_save_filepath: ./data/model/model.joblib
SKLearnLinearRegression: null

View file

@ -0,0 +1,10 @@
aws-s3:
AWS_ACCESS_KEY_ID: null
AWS_SECRET_ACCESS_KEY: null
ENDPOINT_URL: null
aws-s3-mock:
AWS_ACCESS_KEY_ID: minio
AWS_SECRET_ACCESS_KEY: minio123
ENDPOINT_URL: http://localhost:9000
local:
null

View file

@ -4,5 +4,5 @@ feature_processor_config:
subsample_seed: 0
target: RDSAP_CHANGE
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"]
# retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
retain_features: null
retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
# retain_features: null

View file

@ -1,14 +1,8 @@
input_dataclient_type: aws-s3
input_dataclient:
AWS_ACCESS_KEY_ID: null
AWS_SECRET_ACCESS_KEY: null
ENDPOINT_URL: null
output_dataclient_type: local
output_dataclient:
null
datahandler_type: parquet
data_filepath: s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet
train_proportion: 0.8
train_proportion: 0.1
output_train_filepath: ./data/prepared_data/train.parquet
output_test_filepath: ./data/prepared_data/test.parquet

View file

@ -5,18 +5,18 @@ stages:
deps:
- path: prepare_data.py
hash: md5
md5: 87a83e62512bff93c89f3e93c1ed248d
size: 5593
md5: 9c31bfb1b75ea3c9685ec459cbb50e62
size: 5921
params:
configs/prepare_data.yaml:
output_test_filepath: ./data/prepared_data/test.parquet
output_train_filepath: ./data/prepared_data/train.parquet
train_proportion: 0.8
train_proportion: 0.1
outs:
- path: data/prepared_data/
hash: md5
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
size: 4428909
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
size: 4396934
nfiles: 2
build_model:
cmd: python build_model.py
@ -27,8 +27,8 @@ stages:
size: 3948
- path: data/prepared_data
hash: md5
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
size: 4428909
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
size: 4396934
nfiles: 2
params:
configs/build_model.yaml:
@ -42,31 +42,31 @@ stages:
SKLearnLinearRegression:
SKLearnSVMRegression:
kernel: linear
model_save_filepath: ./data/model/autogluonmodel/
model_type: AutogluonAutoML
model_save_filepath: ./data/model/model.joblib
model_type: SKLearnLinearRegression
outs:
- path: data/model/
hash: md5
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
size: 1264795580
nfiles: 28
md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir
size: 904
nfiles: 1
generate_predictions:
cmd: python generate_predictions.py
deps:
- path: data/model
hash: md5
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
size: 1264795580
nfiles: 28
md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir
size: 904
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
size: 4428909
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
size: 4396934
nfiles: 2
- path: generate_predictions.py
hash: md5
md5: 76c45e7575ec979e6c4c8e2cf754a720
size: 4225
md5: 32c0ecd082e1f8fc4426338d6629979c
size: 4686
params:
configs/generate_predictions.yaml:
input_dataclient_type: local
@ -77,26 +77,26 @@ stages:
outs:
- path: data/predictions/
hash: md5
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
size: 672577
md5: ea0431b600f0ef357de3a543482cefe7.dir
size: 4085105
nfiles: 1
generate_metrics:
cmd: python generate_metrics.py
deps:
- path: data/predictions
hash: md5
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
size: 672577
md5: ea0431b600f0ef357de3a543482cefe7.dir
size: 4085105
nfiles: 1
- path: data/prepared_data
hash: md5
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
size: 4428909
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
size: 4396934
nfiles: 2
- path: generate_metrics.py
hash: md5
md5: cc368845f62523575a9ed5c791e27815
size: 4329
md5: 4709c42d93f8e717a3d9e4958e46cd76
size: 4587
params:
configs/generate_metrics.yaml:
dataclient_type: local
@ -107,15 +107,15 @@ stages:
outs:
- path: metrics/metrics.json
hash: md5
md5: 3f03e50a419af6730351a5016e2ae98a
size: 182
md5: ae53c4781cb8a754d24e29ba7ddb16ea
size: 183
startup_cleanup:
cmd: python startup_cleanup.py
deps:
- path: startup_cleanup.py
hash: md5
md5: f7fe2ca33004b34530da0a3ab48c1790
size: 1458
md5: 2e51fbcac960d0f960bf32a8ec7486a0
size: 1748
params:
configs/startup_cleanup.yaml:
artefacts: ./data

View file

@ -7,6 +7,7 @@ stages:
- configs/startup_cleanup.yaml:
- artefacts
- metrics
always_changed: true
prepare_data:
cmd: python prepare_data.py
deps:

View file

@ -21,6 +21,9 @@ from core.Logger import logger
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
@ -96,7 +99,12 @@ if __name__ == "__main__":
logger.info("----------------------------")
model = model_factory(build_model_params["model_type"])
dataclient = dataclient_factory(generate_metrics_params["dataclient_type"])
dataclient_type = generate_metrics_params["dataclient_type"]
dataclient = dataclient_factory(dataclient_type)
dataclient.ingest_configurations(client_params[dataclient_type])
dataclient.establish_client()
input_datahandler = datahandler_factory(
generate_metrics_params["input_datahandler_type"]
)

View file

@ -19,6 +19,9 @@ from core.Logger import logger
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
@ -97,12 +100,19 @@ if __name__ == "__main__":
# Data may be loaded from and saved to different locations, so we use the client types specified in generate_predictions.yaml.
# E.g. for metric runs, this will be a local data client;
# for predictions, we will want a cloud data client.
input_dataclient = dataclient_factory(
generate_predictions_params["input_dataclient_type"]
)
output_dataclient = dataclient_factory(
generate_predictions_params["output_dataclient_type"]
input_dataclient_type = generate_predictions_params["input_dataclient_type"]
input_dataclient = dataclient_factory(input_dataclient_type)
input_dataclient.ingest_configurations(config=client_params[input_dataclient_type])
input_dataclient.establish_client()
output_dataclient_type = generate_predictions_params["output_dataclient_type"]
output_dataclient = dataclient_factory(output_dataclient_type)
output_dataclient.ingest_configurations(
config=client_params[output_dataclient_type]
)
output_dataclient.establish_client()
datahandler = datahandler_factory(prepare_data_params["datahandler_type"])
generate_predictions(

View file

@ -21,6 +21,9 @@ from core.FeatureProcessor import feature_processor_factory
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
client_path = Path(__file__).parent / "configs" / "client.yaml"
client_params = yaml.safe_load(open(client_path))
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
prepare_data_params = yaml.safe_load(open(prepare_data_path))
@ -94,7 +97,7 @@ def prepare_data(
dataclient=output_dataclient, obj=train, location=output_train_filepath
)
if test:
if test is not None:
datahandler.save_data(
dataclient=output_dataclient, obj=test, location=output_test_filepath
)
@ -112,18 +115,17 @@ if __name__ == "__main__":
logger.info(f"--- Initiate DataClient ---")
logger.info("----------------------------")
input_dataclient = dataclient_factory(prepare_data_params["input_dataclient_type"])
output_dataclient = dataclient_factory(
prepare_data_params["output_dataclient_type"]
)
input_dataclient_type = prepare_data_params["input_dataclient_type"]
output_dataclient_type = prepare_data_params["output_dataclient_type"]
input_dataclient.ingest_configurations(
config=prepare_data_params["input_dataclient"]
)
input_dataclient = dataclient_factory(input_dataclient_type)
output_dataclient = dataclient_factory(output_dataclient_type)
input_dataclient.ingest_configurations(config=client_params[input_dataclient_type])
input_dataclient.establish_client()
output_dataclient.ingest_configurations(
config=prepare_data_params["output_dataclient"]
config=client_params[output_dataclient_type]
)
output_dataclient.establish_client()