From d586441769c9926f5f620cdfbda617cb794a2015 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Mon, 3 Jul 2023 18:46:55 +0100 Subject: [PATCH] prepared sap model dataset --- model_data/analysis/SapModel.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index 426a56c5..d18429dc 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -72,6 +72,7 @@ class SalModel: self.df = pd.DataFrame(data) self.cleaner = cleaner + self.model_data = None self.train_x = None self.train_y = None self.results = None @@ -128,6 +129,15 @@ class SalModel: how="left", left_on="roof-description", right_on="original_description" + ).drop( + columns=["original_description"] + ).merge( + lighting_proportions, + how="left", + left_on="lighting-description", + right_on="original_description" + ).drop( + columns=["original_description"] ) return model_data @@ -141,13 +151,11 @@ class SalModel: @staticmethod def _clean_numericals(model_data): - for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting"]: + for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting", "number-open-fireplaces"]: model_data[col] = np.where( model_data[col] == "", "0", model_data["photo-supply"] ).astype(float) - # We need to clean lighting - return model_data def create_dataset(self): @@ -176,9 +184,9 @@ class SalModel: exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"] features = [ - x for x in self.BASE_FEATURES + - self.COMPONENT_FEATURES + - ["walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE] if x not in exclude_features + x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [ + "walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE + ] if x not in exclude_features ] model_data = model_data[features] @@ -186,6 +194,11 @@ class SalModel: for col in 
self.CATEGORICAL_COLS: model_data[col] = model_data[col].astype('category') + # Convert response on the local frame (self.model_data is still None until assigned below) + model_data[self.RESPONSE] = model_data[self.RESPONSE].astype(float) + + self.model_data = model_data + def make_training_test(self): # Split into training and test # Dummy data