mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-08 11:17:25 +00:00
fixed bug and using sklearn for simplicity
This commit is contained in:
parent
ddcce2b56d
commit
88319c1480
9 changed files with 82 additions and 57 deletions
|
|
@ -1,5 +1,5 @@
|
|||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_type: SKLearnLinearRegression
|
||||
model_save_filepath: ./data/model/model.joblib
|
||||
|
||||
SKLearnLinearRegression: null
|
||||
|
||||
|
|
|
|||
10
modules/ml-pipeline/src/pipeline/src/configs/client.yaml
Normal file
10
modules/ml-pipeline/src/pipeline/src/configs/client.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
aws-s3:
|
||||
AWS_ACCESS_KEY_ID: null
|
||||
AWS_SECRET_ACCESS_KEY: null
|
||||
ENDPOINT_URL: null
|
||||
aws-s3-mock:
|
||||
AWS_ACCESS_KEY_ID: minio
|
||||
AWS_SECRET_ACCESS_KEY: minio123
|
||||
ENDPOINT_URL: http://localhost:9000
|
||||
local:
|
||||
null
|
||||
|
|
@ -4,5 +4,5 @@ feature_processor_config:
|
|||
subsample_seed: 0
|
||||
target: RDSAP_CHANGE
|
||||
drop_columns: ["UPRN", "HEAT_DEMAND_CHANGE"]
|
||||
# retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
|
||||
retain_features: null
|
||||
retain_features: ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
|
||||
# retain_features: null
|
||||
|
|
|
|||
|
|
@ -1,14 +1,8 @@
|
|||
input_dataclient_type: aws-s3
|
||||
input_dataclient:
|
||||
AWS_ACCESS_KEY_ID: null
|
||||
AWS_SECRET_ACCESS_KEY: null
|
||||
ENDPOINT_URL: null
|
||||
output_dataclient_type: local
|
||||
output_dataclient:
|
||||
null
|
||||
datahandler_type: parquet
|
||||
data_filepath: s3://retrofit-data-dev/model_build_data/change_data/rdsap_full/train_validation_data.parquet
|
||||
train_proportion: 0.8
|
||||
train_proportion: 0.1
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
|
||||
|
|
|
|||
|
|
@ -5,18 +5,18 @@ stages:
|
|||
deps:
|
||||
- path: prepare_data.py
|
||||
hash: md5
|
||||
md5: 87a83e62512bff93c89f3e93c1ed248d
|
||||
size: 5593
|
||||
md5: 9c31bfb1b75ea3c9685ec459cbb50e62
|
||||
size: 5921
|
||||
params:
|
||||
configs/prepare_data.yaml:
|
||||
output_test_filepath: ./data/prepared_data/test.parquet
|
||||
output_train_filepath: ./data/prepared_data/train.parquet
|
||||
train_proportion: 0.8
|
||||
train_proportion: 0.1
|
||||
outs:
|
||||
- path: data/prepared_data/
|
||||
hash: md5
|
||||
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
|
||||
size: 4428909
|
||||
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
|
||||
size: 4396934
|
||||
nfiles: 2
|
||||
build_model:
|
||||
cmd: python build_model.py
|
||||
|
|
@ -27,8 +27,8 @@ stages:
|
|||
size: 3948
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
|
||||
size: 4428909
|
||||
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
|
||||
size: 4396934
|
||||
nfiles: 2
|
||||
params:
|
||||
configs/build_model.yaml:
|
||||
|
|
@ -42,31 +42,31 @@ stages:
|
|||
SKLearnLinearRegression:
|
||||
SKLearnSVMRegression:
|
||||
kernel: linear
|
||||
model_save_filepath: ./data/model/autogluonmodel/
|
||||
model_type: AutogluonAutoML
|
||||
model_save_filepath: ./data/model/model.joblib
|
||||
model_type: SKLearnLinearRegression
|
||||
outs:
|
||||
- path: data/model/
|
||||
hash: md5
|
||||
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
|
||||
size: 1264795580
|
||||
nfiles: 28
|
||||
md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir
|
||||
size: 904
|
||||
nfiles: 1
|
||||
generate_predictions:
|
||||
cmd: python generate_predictions.py
|
||||
deps:
|
||||
- path: data/model
|
||||
hash: md5
|
||||
md5: 04a1e3bc625e7934c9f57a3fa2f1ea5c.dir
|
||||
size: 1264795580
|
||||
nfiles: 28
|
||||
md5: 1d4bc40f23a6866c8daa9f2f5b639d67.dir
|
||||
size: 904
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
|
||||
size: 4428909
|
||||
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
|
||||
size: 4396934
|
||||
nfiles: 2
|
||||
- path: generate_predictions.py
|
||||
hash: md5
|
||||
md5: 76c45e7575ec979e6c4c8e2cf754a720
|
||||
size: 4225
|
||||
md5: 32c0ecd082e1f8fc4426338d6629979c
|
||||
size: 4686
|
||||
params:
|
||||
configs/generate_predictions.yaml:
|
||||
input_dataclient_type: local
|
||||
|
|
@ -77,26 +77,26 @@ stages:
|
|||
outs:
|
||||
- path: data/predictions/
|
||||
hash: md5
|
||||
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
|
||||
size: 672577
|
||||
md5: ea0431b600f0ef357de3a543482cefe7.dir
|
||||
size: 4085105
|
||||
nfiles: 1
|
||||
generate_metrics:
|
||||
cmd: python generate_metrics.py
|
||||
deps:
|
||||
- path: data/predictions
|
||||
hash: md5
|
||||
md5: 44c298a28a0bb1367bb82d5da1a5dbd0.dir
|
||||
size: 672577
|
||||
md5: ea0431b600f0ef357de3a543482cefe7.dir
|
||||
size: 4085105
|
||||
nfiles: 1
|
||||
- path: data/prepared_data
|
||||
hash: md5
|
||||
md5: 01a8f8f0b264ac4d61307a67bfa910b4.dir
|
||||
size: 4428909
|
||||
md5: f7e45d3997cf165904174b2bc2d2eba5.dir
|
||||
size: 4396934
|
||||
nfiles: 2
|
||||
- path: generate_metrics.py
|
||||
hash: md5
|
||||
md5: cc368845f62523575a9ed5c791e27815
|
||||
size: 4329
|
||||
md5: 4709c42d93f8e717a3d9e4958e46cd76
|
||||
size: 4587
|
||||
params:
|
||||
configs/generate_metrics.yaml:
|
||||
dataclient_type: local
|
||||
|
|
@ -107,15 +107,15 @@ stages:
|
|||
outs:
|
||||
- path: metrics/metrics.json
|
||||
hash: md5
|
||||
md5: 3f03e50a419af6730351a5016e2ae98a
|
||||
size: 182
|
||||
md5: ae53c4781cb8a754d24e29ba7ddb16ea
|
||||
size: 183
|
||||
startup_cleanup:
|
||||
cmd: python startup_cleanup.py
|
||||
deps:
|
||||
- path: startup_cleanup.py
|
||||
hash: md5
|
||||
md5: f7fe2ca33004b34530da0a3ab48c1790
|
||||
size: 1458
|
||||
md5: 2e51fbcac960d0f960bf32a8ec7486a0
|
||||
size: 1748
|
||||
params:
|
||||
configs/startup_cleanup.yaml:
|
||||
artefacts: ./data
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ stages:
|
|||
- configs/startup_cleanup.yaml:
|
||||
- artefacts
|
||||
- metrics
|
||||
always_changed: true
|
||||
prepare_data:
|
||||
cmd: python prepare_data.py
|
||||
deps:
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@ from core.Logger import logger
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
|
|
@ -96,7 +99,12 @@ if __name__ == "__main__":
|
|||
logger.info("----------------------------")
|
||||
|
||||
model = model_factory(build_model_params["model_type"])
|
||||
dataclient = dataclient_factory(generate_metrics_params["dataclient_type"])
|
||||
|
||||
dataclient_type = generate_metrics_params["dataclient_type"]
|
||||
dataclient = dataclient_factory(dataclient_type)
|
||||
dataclient.ingest_configurations(client_params[dataclient_type])
|
||||
dataclient.establish_client()
|
||||
|
||||
input_datahandler = datahandler_factory(
|
||||
generate_metrics_params["input_datahandler_type"]
|
||||
)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ from core.Logger import logger
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
|
|
@ -97,12 +100,19 @@ if __name__ == "__main__":
|
|||
# We may have different locations of loading hence why we use one specified in generate_predictions.yaml
|
||||
# I.e. for metric runs, this will be a local data client
|
||||
# For predictions, we will want a cloud data client
|
||||
input_dataclient = dataclient_factory(
|
||||
generate_predictions_params["input_dataclient_type"]
|
||||
)
|
||||
output_dataclient = dataclient_factory(
|
||||
generate_predictions_params["output_dataclient_type"]
|
||||
|
||||
input_dataclient_type = generate_predictions_params["input_dataclient_type"]
|
||||
input_dataclient = dataclient_factory(input_dataclient_type)
|
||||
input_dataclient.ingest_configurations(config=client_params[input_dataclient_type])
|
||||
input_dataclient.establish_client()
|
||||
|
||||
output_dataclient_type = generate_predictions_params["output_dataclient_type"]
|
||||
output_dataclient = dataclient_factory(output_dataclient_type)
|
||||
output_dataclient.ingest_configurations(
|
||||
config=client_params[output_dataclient_type]
|
||||
)
|
||||
output_dataclient.establish_client()
|
||||
|
||||
datahandler = datahandler_factory(prepare_data_params["datahandler_type"])
|
||||
|
||||
generate_predictions(
|
||||
|
|
|
|||
|
|
@ -21,6 +21,9 @@ from core.FeatureProcessor import feature_processor_factory
|
|||
|
||||
RUNTIME_ENVIRONMENT = os.environ.get("RUNTIME_ENVIRONMENT", "local")
|
||||
|
||||
client_path = Path(__file__).parent / "configs" / "client.yaml"
|
||||
client_params = yaml.safe_load(open(client_path))
|
||||
|
||||
prepare_data_path = Path(__file__).parent / "configs" / "prepare_data.yaml"
|
||||
prepare_data_params = yaml.safe_load(open(prepare_data_path))
|
||||
|
||||
|
|
@ -94,7 +97,7 @@ def prepare_data(
|
|||
dataclient=output_dataclient, obj=train, location=output_train_filepath
|
||||
)
|
||||
|
||||
if test:
|
||||
if test is not None:
|
||||
datahandler.save_data(
|
||||
dataclient=output_dataclient, obj=test, location=output_test_filepath
|
||||
)
|
||||
|
|
@ -112,18 +115,17 @@ if __name__ == "__main__":
|
|||
logger.info(f"--- Initiate DataClient ---")
|
||||
logger.info("----------------------------")
|
||||
|
||||
input_dataclient = dataclient_factory(prepare_data_params["input_dataclient_type"])
|
||||
output_dataclient = dataclient_factory(
|
||||
prepare_data_params["output_dataclient_type"]
|
||||
)
|
||||
input_dataclient_type = prepare_data_params["input_dataclient_type"]
|
||||
output_dataclient_type = prepare_data_params["output_dataclient_type"]
|
||||
|
||||
input_dataclient.ingest_configurations(
|
||||
config=prepare_data_params["input_dataclient"]
|
||||
)
|
||||
input_dataclient = dataclient_factory(input_dataclient_type)
|
||||
output_dataclient = dataclient_factory(output_dataclient_type)
|
||||
|
||||
input_dataclient.ingest_configurations(config=client_params[input_dataclient_type])
|
||||
input_dataclient.establish_client()
|
||||
|
||||
output_dataclient.ingest_configurations(
|
||||
config=prepare_data_params["output_dataclient"]
|
||||
config=client_params[output_dataclient_type]
|
||||
)
|
||||
output_dataclient.establish_client()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue