mirror of
https://github.com/Hestia-Homes/ML.git
synced 2026-06-30 13:10:43 +00:00
commit
febdbdbd80
6 changed files with 61 additions and 36 deletions
|
|
@ -9,16 +9,16 @@ init: dev-conda
|
||||||
.PHONY: dev-conda
|
.PHONY: dev-conda
|
||||||
dev-conda:
|
dev-conda:
|
||||||
# conda deactivate || echo "Not in conda environment"
|
# conda deactivate || echo "Not in conda environment"
|
||||||
# conda remove --name $CONDA_ENV --all -y || echo "No environment created previously"
|
# conda remove --name ${CONDA_ENV} --all -y || echo "No environment created previously"
|
||||||
conda create --name $CONDA_ENV python=$(PYTHON_VERSION) -y
|
conda create --name ${CONDA_ENV} python=$(PYTHON_VERSION) -y
|
||||||
conda init bash
|
conda init bash
|
||||||
conda run -vvvv -n $CONDA_ENV pip install --upgrade pip
|
conda run -v -n ${CONDA_ENV} pip install --upgrade pip
|
||||||
conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/training/requirements-dev.txt
|
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/training/requirements-dev.txt
|
||||||
conda run -vvvv -n $CONDA_ENV pip install -r src/pipeline/requirements/version_control/requirements.txt
|
conda run -v -n ${CONDA_ENV} pip install -r src/pipeline/requirements/version_control/requirements.txt
|
||||||
conda run -vvvv -n $CONDA_ENV pre-commit install
|
conda run -v -n ${CONDA_ENV} pre-commit install
|
||||||
conda run -vvvv -n $CONDA_ENV pip install ipykernel
|
conda run -v -n ${CONDA_ENV} pip install ipykernel
|
||||||
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
|
echo "TO ACTIVATE ENVIRONMENT, USE THE FOLLOWING COMMAND"
|
||||||
echo "conda activate $CONDA_ENV"
|
echo "conda activate ${CONDA_ENV}"
|
||||||
|
|
||||||
|
|
||||||
.PHONY: dev-pyenv
|
.PHONY: dev-pyenv
|
||||||
|
|
|
||||||
|
|
@ -13,4 +13,4 @@ default:
|
||||||
dataclient_type: local
|
dataclient_type: local
|
||||||
nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
|
nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
|
||||||
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
|
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
|
||||||
row_index: [0, 10, 20] # index of an example datapoint
|
row_index: [20695, 50243, 7653] # index of an example datapoint
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,28 @@ def remove_starting_columns(df):
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def remove_floor_height_ending(df, min_floor_height=1.665):
    """Drop rows whose ``FLOOR_HEIGHT_ENDING`` is not above a cutoff.

    The default cutoff was read off
    ``df.describe(percentiles=[0.005, 0.99])["FLOOR_HEIGHT_ENDING"]``, which
    showed the bottom 0.5th percentile at 1.665; only rows strictly above
    that value are kept.

    Args:
        df: DataFrame containing a ``FLOOR_HEIGHT_ENDING`` column.
        min_floor_height: Exclusive lower bound for ``FLOOR_HEIGHT_ENDING``.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    # Rows with a missing height compare as False and are dropped as well.
    tall_enough = df["FLOOR_HEIGHT_ENDING"] > min_floor_height
    return df[tall_enough].reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def remove_minimum_habitable_room_size(df, min_area_per_room=6.5):
    """Keep rows whose floor area per habitable room exceeds a minimum.

    The per-room figure is ``TOTAL_FLOOR_AREA_ENDING`` divided by
    ``NUMBER_HABITABLE_ROOMS``; rows at or below ``min_area_per_room`` are
    removed.

    Args:
        df: DataFrame with ``TOTAL_FLOOR_AREA_ENDING`` and
            ``NUMBER_HABITABLE_ROOMS`` columns.
        min_area_per_room: Exclusive lower bound on floor area per habitable
            room, in the units of ``TOTAL_FLOOR_AREA_ENDING`` (presumably
            square metres -- TODO confirm).

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index.
    """
    area_per_room = df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"]
    # NOTE(review): a room count of 0 with a positive area divides to inf,
    # which passes this filter -- confirm whether such rows should be kept.
    return df[area_per_room > min_area_per_room].reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
|
def keep_flats(df):
    """Keep only the rows whose ``PROPERTY_TYPE`` equals ``"Flat"``.

    Args:
        df: DataFrame containing a ``PROPERTY_TYPE`` column.

    Returns:
        A new DataFrame with the matching rows and a fresh 0..n-1 index.
    """
    is_flat = df["PROPERTY_TYPE"] == "Flat"
    # Reset the index like the other row filters in this module, so the
    # result is a standalone frame rather than a gapped slice of the input.
    return df[is_flat].reset_index(drop=True)
|
||||||
|
|
||||||
|
|
||||||
# def keep_ending_columns(df):
|
# def keep_ending_columns(df):
|
||||||
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
|
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
|
||||||
# keep_columns = df.columns[ending_column_index].to_list()
|
# keep_columns = df.columns[ending_column_index].to_list()
|
||||||
|
|
@ -27,6 +49,9 @@ def remove_starting_columns(df):
|
||||||
# return df
|
# return df
|
||||||
|
|
||||||
business_logic = {
|
business_logic = {
|
||||||
|
# "keep_flats": keep_flats,
|
||||||
|
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
|
||||||
|
# "remove_floor_height_ending": remove_floor_height_ending
|
||||||
# "remove_starting_columns": remove_starting_columns
|
# "remove_starting_columns": remove_starting_columns
|
||||||
# "keep_ENDING_COLUMNS": keep_ending_columns
|
# "keep_ENDING_COLUMNS": keep_ending_columns
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@ default:
|
||||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
|
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_with_differencing.parquet
|
||||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
|
# data_filepath: s3://retrofit-data-dev/sap_change_model/floor_area_clean_test.parquet
|
||||||
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
# data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_without_differencing.parquet
|
||||||
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
|
||||||
train_proportion: 0.9
|
train_proportion: 0.9
|
||||||
output_train_filepath: ./data/prepared_data/train.parquet
|
output_train_filepath: ./data/prepared_data/train.parquet
|
||||||
output_test_filepath: ./data/prepared_data/test.parquet
|
output_test_filepath: ./data/prepared_data/test.parquet
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ stages:
|
||||||
default.feature_processor.feature_processor_config.subsample_seed: 0
|
default.feature_processor.feature_processor_config.subsample_seed: 0
|
||||||
default.feature_processor.feature_processor_config.target: SAP_ENDING
|
default.feature_processor.feature_processor_config.target: SAP_ENDING
|
||||||
default.feature_processor.feature_processor_type: dataframe
|
default.feature_processor.feature_processor_type: dataframe
|
||||||
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset.parquet
|
default.prepare_data.data_filepath: s3://retrofit-data-dev/sap_change_model/dataset_test.parquet
|
||||||
default.prepare_data.input_dataclient_type: aws-s3
|
default.prepare_data.input_dataclient_type: aws-s3
|
||||||
default.prepare_data.output_dataclient_type: local
|
default.prepare_data.output_dataclient_type: local
|
||||||
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
|
default.prepare_data.output_test_filepath: ./data/prepared_data/test.parquet
|
||||||
|
|
@ -29,8 +29,8 @@ stages:
|
||||||
outs:
|
outs:
|
||||||
- path: data/prepared_data/
|
- path: data/prepared_data/
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
md5: cd75be9fecff0c647792dd2db648085c.dir
|
||||||
size: 33881619
|
size: 37056053
|
||||||
nfiles: 2
|
nfiles: 2
|
||||||
build_model:
|
build_model:
|
||||||
cmd: python 2_build_model.py
|
cmd: python 2_build_model.py
|
||||||
|
|
@ -41,8 +41,8 @@ stages:
|
||||||
size: 5359
|
size: 5359
|
||||||
- path: data/prepared_data
|
- path: data/prepared_data
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
md5: cd75be9fecff0c647792dd2db648085c.dir
|
||||||
size: 33881619
|
size: 37056053
|
||||||
nfiles: 2
|
nfiles: 2
|
||||||
params:
|
params:
|
||||||
configs/build_model.yaml:
|
configs/build_model.yaml:
|
||||||
|
|
@ -66,13 +66,13 @@ stages:
|
||||||
outs:
|
outs:
|
||||||
- path: data/model/
|
- path: data/model/
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 7bb5156243b4db39349e80a01ffecde4.dir
|
md5: 7a5527f779efcb1a7db068148b6bcc45.dir
|
||||||
size: 473398662
|
size: 422448184
|
||||||
nfiles: 27
|
nfiles: 27
|
||||||
- path: metrics/fit_metrics.json
|
- path: metrics/fit_metrics.json
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 2bb16ac67de8778fbc08171d562b34d5
|
md5: 77790bb9485c04c77125e361921c3774
|
||||||
size: 184
|
size: 225
|
||||||
generate_predictions:
|
generate_predictions:
|
||||||
cmd: python 3_generate_predictions.py
|
cmd: python 3_generate_predictions.py
|
||||||
deps:
|
deps:
|
||||||
|
|
@ -82,13 +82,13 @@ stages:
|
||||||
size: 3028
|
size: 3028
|
||||||
- path: data/model
|
- path: data/model
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 7bb5156243b4db39349e80a01ffecde4.dir
|
md5: 7a5527f779efcb1a7db068148b6bcc45.dir
|
||||||
size: 473398662
|
size: 422448184
|
||||||
nfiles: 27
|
nfiles: 27
|
||||||
- path: data/prepared_data
|
- path: data/prepared_data
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
md5: cd75be9fecff0c647792dd2db648085c.dir
|
||||||
size: 33881619
|
size: 37056053
|
||||||
nfiles: 2
|
nfiles: 2
|
||||||
params:
|
params:
|
||||||
configs/settings.yaml:
|
configs/settings.yaml:
|
||||||
|
|
@ -100,8 +100,8 @@ stages:
|
||||||
outs:
|
outs:
|
||||||
- path: data/predictions/
|
- path: data/predictions/
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
|
md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir
|
||||||
size: 374532
|
size: 346687
|
||||||
nfiles: 1
|
nfiles: 1
|
||||||
generate_metrics:
|
generate_metrics:
|
||||||
cmd: python 4_generate_metrics.py
|
cmd: python 4_generate_metrics.py
|
||||||
|
|
@ -112,13 +112,13 @@ stages:
|
||||||
size: 4487
|
size: 4487
|
||||||
- path: data/predictions
|
- path: data/predictions
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 0bb3cf991906953def81c8204cdcfaf0.dir
|
md5: 28d2876e6c6d5cc64844ecc1d6ac40b2.dir
|
||||||
size: 374532
|
size: 346687
|
||||||
nfiles: 1
|
nfiles: 1
|
||||||
- path: data/prepared_data
|
- path: data/prepared_data
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 9ce5c45722da7fc40491b5a4d00daf9e.dir
|
md5: cd75be9fecff0c647792dd2db648085c.dir
|
||||||
size: 33881619
|
size: 37056053
|
||||||
nfiles: 2
|
nfiles: 2
|
||||||
params:
|
params:
|
||||||
configs/settings.yaml:
|
configs/settings.yaml:
|
||||||
|
|
@ -128,8 +128,8 @@ stages:
|
||||||
outs:
|
outs:
|
||||||
- path: metrics/metrics.json
|
- path: metrics/metrics.json
|
||||||
hash: md5
|
hash: md5
|
||||||
md5: 2e13ae67759a64261d03224f1c0d4bf4
|
md5: 7afd04d656dc83ad6aa942d9c63f5b4e
|
||||||
size: 185
|
size: 224
|
||||||
startup_cleanup:
|
startup_cleanup:
|
||||||
cmd: python 0_startup_cleanup.py
|
cmd: python 0_startup_cleanup.py
|
||||||
deps:
|
deps:
|
||||||
|
|
|
||||||
|
|
@ -207,11 +207,11 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
|
||||||
mix_df = mix_df.sort_values("residual", ascending=False)
|
mix_df = mix_df.sort_values("residual", ascending=False)
|
||||||
|
|
||||||
cosine_similarity_df = mix_df[
|
cosine_similarity_df = mix_df[
|
||||||
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
|
mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"])
|
||||||
]
|
]
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
row_index = 58199
|
row_index = 20695
|
||||||
|
|
||||||
from sklearn.preprocessing import LabelEncoder
|
from sklearn.preprocessing import LabelEncoder
|
||||||
|
|
||||||
|
|
@ -224,8 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
|
||||||
feature_vector = cosine_similarity_df.loc[[row_index]]
|
feature_vector = cosine_similarity_df.loc[[row_index]]
|
||||||
|
|
||||||
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
|
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
|
||||||
similar_index = (
|
|
||||||
cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
|
similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5)
|
||||||
)
|
similar_index = similar_df.index
|
||||||
|
|
||||||
check_df = mix_df.loc[similar_index]
|
check_df = mix_df.loc[similar_index]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue