prepared sap model dataset

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-03 18:46:55 +01:00
parent 8e65ac05cf
commit d586441769

View file

@ -72,6 +72,7 @@ class SalModel:
self.df = pd.DataFrame(data)
self.cleaner = cleaner
self.model_data = None
self.train_x = None
self.train_y = None
self.results = None
@ -128,6 +129,15 @@ class SalModel:
how="left",
left_on="roof-description",
right_on="original_description"
).drop(
columns=["original_description"]
).merge(
lighting_proportions,
how="left",
left_on="lighting-description",
right_on="original_description"
).drop(
columns=["original_description"]
)
return model_data
@ -141,13 +151,11 @@ class SalModel:
@staticmethod
def _clean_numericals(model_data):
    """Convert the numeric feature columns from strings to floats.

    Empty strings in each listed column are replaced with "0" before the
    column is cast to float. The frame is modified in place and also
    returned.

    :param model_data: DataFrame containing the columns listed below.
    :return: the same DataFrame with those columns cast to float.
    """
    numeric_cols = [
        "photo-supply",
        "multi-glaze-proportion",
        "low-energy-lighting",
        "number-open-fireplaces",
    ]
    for col in numeric_cols:
        # Fall back to the column's OWN values; the previous code always
        # read from "photo-supply", overwriting every other column with
        # photo-supply data.
        model_data[col] = np.where(
            model_data[col] == "", "0", model_data[col]
        ).astype(float)
    # We need to clean lighting
    return model_data
def create_dataset(self):
@ -176,9 +184,9 @@ class SalModel:
exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"]
features = [
x for x in self.BASE_FEATURES +
self.COMPONENT_FEATURES +
["walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE] if x not in exclude_features
x for x in self.BASE_FEATURES + self.COMPONENT_FEATURES + [
"walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE
] if x not in exclude_features
]
model_data = model_data[features]
@ -186,6 +194,11 @@ class SalModel:
for col in self.CATEGORICAL_COLS:
model_data[col] = model_data[col].astype('category')
# Convert response
self.model_data[self.RESPONSE] = self.model_data[self.RESPONSE].astype(float)
self.model_data = model_data
def make_training_test(self):
# Split into training and test
# Dummy data