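Fixed a silly bug in the cleaning code: the empty-string replacement for every numeric column was reading its values from "photo-supply" instead of from the column being cleaned.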

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-04 16:35:08 +01:00
parent c59aff412c
commit 68e903e492

View file

@ -9,7 +9,6 @@ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, e
median_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import xgboost as xgb
with open("all_data.pkl", "rb") as f:
@ -151,7 +150,7 @@ class SapModel:
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
model_data[col] = np.where(
model_data[col] == "", "0", model_data["photo-supply"]
model_data[col] == "", "0", model_data[col]
).astype(float)
return model_data
@ -343,13 +342,13 @@ class SapModel:
).sort_values("actual", ascending=True).merge(self.model_data[["idx", "property-type"]], on="idx")
# temp hardcoded values
best_fit = {'MAPE': 0.042768242654695386, 'Mean Squared Error': 21.606875710236896,
'Mean Absolute Error': 3.293776606279645, 'R2 Score': 0.7930242722318233,
'Explained Variance Score': 0.7930242722318233, 'Median Absolute Error': 2.47686604239054}
best_fit = {'MAPE': 0.042824355225087686, 'Mean Squared Error': 21.49263731368226,
'Mean Absolute Error': 3.298755911054327, 'R2 Score': 0.794118580154128,
'Explained Variance Score': 0.794118580154128, 'Median Absolute Error': 2.426789554039914}
best_predict = {'MAPE': 0.04397538047202114, 'Mean Squared Error': 22.582856696398935,
'Mean Absolute Error': 3.384549163877968, 'R2 Score': 0.7515887251149801,
'Explained Variance Score': 0.7516508219403573, 'Median Absolute Error': 2.4624472128668344}
best_predict = {'MAPE': 0.04413439429441669, 'Mean Squared Error': 22.700373062051142,
'Mean Absolute Error': 3.3961241443022008, 'R2 Score': 0.750296045867001,
'Explained Variance Score': 0.7503518147827141, 'Median Absolute Error': 2.4442017110145855}
def check_successes(experiment_error, best_error):
@ -641,3 +640,25 @@ self = SapModel(
data=all_data["data"],
cleaner=all_data["cleaner"]
)
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on every row of self.df to obtain a per-feature
# importance score.
rf = RandomForestRegressor(random_state=self.random_state)
X = self.df.drop(columns=self.RESPONSE)

# Replace empty strings with "0" in these columns before casting to float,
# so that the cast does not fail on "".
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]:
    X[col] = np.where(X[col] == "", "0", X[col]).astype(float)

Y = self.df[self.RESPONSE]
rf.fit(X, Y)

# Collect the name and importance of each feature. The names must come from
# X (the frame the forest was fitted on): feature_importances_ is ordered by
# the columns seen during fit, so zipping against any other frame's columns
# would mislabel the scores.
importance_df = [
    {
        "Feature": feature,
        "rf_importance": importance,
    }
    for feature, importance in zip(X.columns, rf.feature_importances_)
]