From 7790822e76cd968e0af10b76ff451e73d2362b56 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 2 Jul 2024 16:06:01 +0100 Subject: [PATCH] making the data objects dictionaries for different targets --- etl/bill_savings/EnergyConsumptionModel.py | 109 +++++++++++++++------ etl/bill_savings/data_collection.py | 6 +- 2 files changed, 81 insertions(+), 34 deletions(-) diff --git a/etl/bill_savings/EnergyConsumptionModel.py b/etl/bill_savings/EnergyConsumptionModel.py index 2ca88da5..d2c77e48 100644 --- a/etl/bill_savings/EnergyConsumptionModel.py +++ b/etl/bill_savings/EnergyConsumptionModel.py @@ -3,51 +3,87 @@ from datetime import datetime from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression from sklearn.metrics import mean_squared_error, r2_score -from utils.s3 import save_pickle_to_s3, read_pickle_from_s3 +from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet class EnergyConsumptionModel: - FEATURES = ['feature_1', 'feature_2'] + FEATURES = { + "heating_kwh": [ + "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", + "heating-cost-current", + ], + "hot_water_kwh": [ + "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current", + "hot-water-cost-current" + ] + } TARGETS = ['heating_kwh', 'hot_water_kwh'] + CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"] + NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current", + "hot-water-cost-current"] def __init__(self, model_paths=None): self.models = {} self.model_paths = model_paths or {} self.data = None + self.dummy_columns = None - self.X_train = None - self.X_test = None - self.y_train = None - self.y_test = None + self.x_train = {} + self.x_test = {} + self.y_train = {} + self.y_test = {} if model_paths: for target, path in model_paths.items(): self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path) def read_dataset(self, file_path): - self.data = pd.read_csv(file_path) + self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path) def feature_engineering(self): - # Example feature engineering steps - self.data['feature_1'] = self.data['original_feature_1'] ** 2 - self.data['feature_2'] = self.data['original_feature_2'] ** 0.5 - # Add more feature engineering steps as required + # Extract date features + self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"]) + self.data["lodgement-year"] = self.data["lodgement-date"].dt.year + self.data["lodgement-month"] = self.data["lodgement-date"].dt.month + + # Convert data types + self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric) + self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str) + + # Convert categorical columns to dummies + self.data = pd.get_dummies(self.data, columns=self.CATEGORICAL_COLUMNS, drop_first=True) + + # Store the dummy columns + self.dummy_columns = {} + for target in self.TARGETS: + target_features = self.FEATURES[target] + dummy_feature_columns = [] + for feature in target_features: + if feature in self.CATEGORICAL_COLUMNS: + dummy_feature_columns.extend([col for col in self.data.columns if col.startswith(feature + '_')]) + else: + dummy_feature_columns.append(feature) + self.dummy_columns[target] = dummy_feature_columns def split_dataset(self, target, test_size=0.2, random_state=42): - X = self.data[self.FEATURES] + + if target not in self.TARGETS: + raise ValueError(f"Target {target} not in {self.TARGETS}") + + x = self.data[self.dummy_columns[target]] y = self.data[target] - self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( - X, y, test_size=test_size, random_state=random_state + self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split( + x, y, test_size=test_size, random_state=random_state ) def fit_model(self, target): self.models[target] = LinearRegression() - self.models[target].fit(self.X_train, self.y_train) + self.models[target].fit(self.x_train[target], self.y_train[target]) def evaluate_model(self, target): - y_pred = self.models[target].predict(self.X_test) - mse = mean_squared_error(self.y_test, y_pred) - r2 = r2_score(self.y_test, y_pred) + y_pred = self.models[target].predict(self.x_test[target]) + mse = mean_squared_error(self.y_test[target], y_pred) + r2 = r2_score(self.y_test[target], y_pred) return {'MSE': mse, 'R2': r2} def save_model(self, target): @@ -67,23 +103,32 @@ class EnergyConsumptionModel: def transform_new_data(self, new_data): # Apply the same transformations as in feature_engineering - new_data['feature_1'] = new_data['original_feature_1'] ** 2 - new_data['feature_2'] = new_data['original_feature_2'] ** 0.5 - return new_data[self.FEATURES] + new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"]) + new_data["lodgement-year"] = new_data["lodgement-date"].dt.year + new_data["lodgement-month"] = new_data["lodgement-date"].dt.month + + # Convert categorical columns to dummies + new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True) + + # Align new data with the dummy columns from training data + new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0) + + return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns]) + # Example usage: -# model = EnergyConsumptionModel() -# model.read_dataset('/mnt/data/energy_consumption_dataset.csv') -# model.feature_engineering() +model = EnergyConsumptionModel() +model.read_dataset('energy_consumption/2024-07-02/energy_consumption_dataset.parquet') +model.feature_engineering() # For heating_kwh -# model.split_dataset(target='heating_kwh') -# model.fit_model(target='heating_kwh') -# print(model.evaluate_model(target='heating_kwh')) -# model.save_model(target='heating_kwh') +model.split_dataset(target='heating_kwh') +model.fit_model(target='heating_kwh') +print(model.evaluate_model(target='heating_kwh')) +model.save_model(target='heating_kwh') # For hot_water_kwh -# model.split_dataset(target='hot_water_kwh') -# model.fit_model(target='hot_water_kwh') -# print(model.evaluate_model(target='hot_water_kwh')) -# model.save_model(target='hot_water_kwh') +model.split_dataset(target='hot_water_kwh') +model.fit_model(target='hot_water_kwh') +print(model.evaluate_model(target='hot_water_kwh')) +model.save_model(target='hot_water_kwh') diff --git a/etl/bill_savings/data_collection.py b/etl/bill_savings/data_collection.py index 3b503122..79afa936 100644 --- a/etl/bill_savings/data_collection.py +++ b/etl/bill_savings/data_collection.py @@ -132,6 +132,9 @@ def app(): energy_consumption_data = [] for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)): + # Skip the first 50 + if i < 50: + continue data = pd.read_csv(directory / "certificates.csv", low_memory=False) # Rename the columns to the same format as the api returns @@ -148,8 +151,7 @@ def app(): collected_data = [] for _, property_data in data.iterrows(): - # Sleep for a random time between 0.1 and 1.4 seconds - time.sleep(np.random.uniform(0.1, 1.4)) + time.sleep(np.random.uniform(0.3, 2)) uprn = int(property_data["uprn"]) address = property_data["address1"]