add some processing code

This commit is contained in:
Michael Duong 2023-10-09 15:44:37 +00:00
parent ba4d1bcc8b
commit f9b0b6112c
4 changed files with 32 additions and 7 deletions

View file

@ -13,4 +13,4 @@ default:
dataclient_type: local
nshap_samples: 100 # how many samples to use to approximate each Shapley value, larger values will be slower
n_val: 30 # how many datapoints from validation data should we interpret predictions for, larger values will be slower
row_index: [0, 10, 20] # index of an example datapoint
row_index: [20695, 50243, 7653] # index of an example datapoint

View file

@ -13,6 +13,6 @@ default:
output_filepath: ./data/model/allmodels/
problem_type: regression
eval_metric: mean_squared_error #mean_absolute_error
time_limit: 4000
time_limit: 180
presets: medium_quality
excluded_model_types: ['KNN', 'RF']

View file

@ -18,6 +18,28 @@ def remove_starting_columns(df):
return df
def remove_floor_height_ending(df, min_floor_height=1.665):
    """Drop rows whose ``FLOOR_HEIGHT_ENDING`` is not above a minimum height.

    The default cut-off of 1.665 comes from
    ``df.describe(percentiles=[0.005, 0.99])["FLOOR_HEIGHT_ENDING"]``,
    which showed the bottom 0.5 percentile at 1.665; anything at or below
    that value is discarded.

    Args:
        df: DataFrame containing a ``FLOOR_HEIGHT_ENDING`` column.
        min_floor_height: Exclusive lower bound for ``FLOOR_HEIGHT_ENDING``.

    Returns:
        A new DataFrame with only the rows strictly above the bound and a
        fresh 0..n-1 index. Rows with a missing height are dropped too,
        because a NaN comparison evaluates to False.
    """
    tall_enough = df["FLOOR_HEIGHT_ENDING"] > min_floor_height
    return df[tall_enough].reset_index(drop=True)
def remove_minimum_habitable_room_size(df, min_area_per_room=6.5):
    """Drop rows whose floor area per habitable room is not above a minimum.

    The ratio is ``TOTAL_FLOOR_AREA_ENDING / NUMBER_HABITABLE_ROOMS``; a row
    is kept only when that ratio is strictly greater than
    ``min_area_per_room``.

    Args:
        df: DataFrame containing ``TOTAL_FLOOR_AREA_ENDING`` and
            ``NUMBER_HABITABLE_ROOMS`` columns.
        min_area_per_room: Exclusive lower bound for the area-per-room
            ratio. Defaults to 6.5 (presumably square metres - confirm the
            unit of ``TOTAL_FLOOR_AREA_ENDING``).

    Returns:
        A new DataFrame with only the qualifying rows and a fresh 0..n-1
        index.
    """
    # NOTE: a room count of 0 with a positive area divides to +inf and is
    # therefore kept; rows where the ratio is NaN are dropped.
    area_per_room = df["TOTAL_FLOOR_AREA_ENDING"] / df["NUMBER_HABITABLE_ROOMS"]
    return df[area_per_room > min_area_per_room].reset_index(drop=True)
def keep_flats(df):
    """Return only the rows whose ``PROPERTY_TYPE`` is exactly ``"Flat"``.

    The surviving rows keep their original index labels; the index is not
    reset.
    """
    is_flat = df["PROPERTY_TYPE"].eq("Flat")
    return df.loc[is_flat]
# def keep_ending_columns(df):
# ending_column_index = [ col_name.endswith("_ENDING") for col_name in list(df.columns)]
# keep_columns = df.columns[ending_column_index].to_list()
@ -27,6 +49,9 @@ def remove_starting_columns(df):
# return df
# Mapping of step name -> DataFrame filter function defined in this module.
# Every entry is currently commented out, so the mapping is empty.
# NOTE(review): how this mapping is consumed is not visible here - confirm
# with the caller before enabling entries.
business_logic = {
# "keep_flats": keep_flats,
# "remove_minimum_habitable_room_size": remove_minimum_habitable_room_size,
# "remove_floor_height_ending": remove_floor_height_ending,
# "remove_starting_columns": remove_starting_columns,
# "keep_ENDING_COLUMNS": keep_ending_columns,  # keep_ending_columns is itself commented out above
}

View file

@ -207,11 +207,11 @@ mix_df["residual"] = abs(mix_df[predictions_column_name] - mix_df[target])
mix_df = mix_df.sort_values("residual", ascending=False)
cosine_similarity_df = mix_df[
mix_df.columns.difference(["predictions", "residual", "SAP_ENDING"])
mix_df.columns.difference(["UPRN", "predictions", "residual", "SAP_ENDING"])
]
from sklearn.metrics.pairwise import cosine_similarity
row_index = 58199
row_index = 20695
from sklearn.preprocessing import LabelEncoder
@ -224,8 +224,8 @@ cosine_similarity_df[object_columns.columns] = cosine_similarity_df[
feature_vector = cosine_similarity_df.loc[[row_index]]
cosine_similarity_df["cosine"] = cosine_similarity(cosine_similarity_df, feature_vector)
similar_index = (
cosine_similarity_df.sort_values("cosine", ascending=False).head(5).index
)
similar_df = cosine_similarity_df.sort_values("cosine", ascending=False).head(5)
similar_index = similar_df.index
check_df = mix_df.loc[similar_index]