Make the data objects (feature lists and train/test splits) dictionaries keyed by target

2026-07-27 23:35:01 +00:00 · 2024-07-02 16:06:01 +01:00 · 2024-07-02 16:06:01 +01:00 · 7790822e76
commit 7790822e76
parent dd0deab0ee
2 changed files with 81 additions and 34 deletions
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@ -3,51 +3,87 @@ from datetime import datetime
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_squared_error, r2_score
-from utils.s3 import save_pickle_to_s3, read_pickle_from_s3
+from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet


 class EnergyConsumptionModel:
-    FEATURES = ['feature_1', 'feature_2']
+    FEATURES = {
+        "heating_kwh": [
+            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
+            "heating-cost-current",
+        ],
+        "hot_water_kwh": [
+            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
+            "hot-water-cost-current"
+        ]
+    }
    TARGETS = ['heating_kwh', 'hot_water_kwh']
+    CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
+    NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
+                         "hot-water-cost-current"]

    def __init__(self, model_paths=None):
        self.models = {}
        self.model_paths = model_paths or {}
        self.data = None
+        self.dummy_columns = None

-        self.X_train = None
-        self.X_test = None
-        self.y_train = None
-        self.y_test = None
+        self.x_train = {}
+        self.x_test = {}
+        self.y_train = {}
+        self.y_test = {}

        if model_paths:
            for target, path in model_paths.items():
                self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path)

    def read_dataset(self, file_path):
-        self.data = pd.read_csv(file_path)
+        self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)

    def feature_engineering(self):
-        # Example feature engineering steps
-        self.data['feature_1'] = self.data['original_feature_1'] ** 2
-        self.data['feature_2'] = self.data['original_feature_2'] ** 0.5
-        # Add more feature engineering steps as required
+        # Extract date features
+        self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
+        self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
+        self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
+
+        # Convert data types
+        self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
+        self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)
+
+        # Convert categorical columns to dummies
+        self.data = pd.get_dummies(self.data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
+
+        # Store the dummy columns
+        self.dummy_columns = {}
+        for target in self.TARGETS:
+            target_features = self.FEATURES[target]
+            dummy_feature_columns = []
+            for feature in target_features:
+                if feature in self.CATEGORICAL_COLUMNS:
+                    dummy_feature_columns.extend([col for col in self.data.columns if col.startswith(feature + '_')])
+                else:
+                    dummy_feature_columns.append(feature)
+            self.dummy_columns[target] = dummy_feature_columns

    def split_dataset(self, target, test_size=0.2, random_state=42):
-        X = self.data[self.FEATURES]
+
+        if target not in self.TARGETS:
+            raise ValueError(f"Target {target} not in {self.TARGETS}")
+
+        x = self.data[self.dummy_columns[target]]
        y = self.data[target]
-        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
-            X, y, test_size=test_size, random_state=random_state
+        self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
+            x, y, test_size=test_size, random_state=random_state
        )

    def fit_model(self, target):
        self.models[target] = LinearRegression()
-        self.models[target].fit(self.X_train, self.y_train)
+        self.models[target].fit(self.x_train[target], self.y_train[target])

    def evaluate_model(self, target):
-        y_pred = self.models[target].predict(self.X_test)
-        mse = mean_squared_error(self.y_test, y_pred)
-        r2 = r2_score(self.y_test, y_pred)
+        y_pred = self.models[target].predict(self.x_test[target])
+        mse = mean_squared_error(self.y_test[target], y_pred)
+        r2 = r2_score(self.y_test[target], y_pred)
        return {'MSE': mse, 'R2': r2}

    def save_model(self, target):
@ -67,23 +103,32 @@ class EnergyConsumptionModel:

    def transform_new_data(self, new_data):
        # Apply the same transformations as in feature_engineering
-        new_data['feature_1'] = new_data['original_feature_1'] ** 2
-        new_data['feature_2'] = new_data['original_feature_2'] ** 0.5
-        return new_data[self.FEATURES]
+        new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
+        new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
+        new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
+
+        # Convert categorical columns to dummies
+        new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
+
+        # Align new data with the dummy columns from training data
+        new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0)
+
+        return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns])
+

 # Example usage:
-# model = EnergyConsumptionModel()
-# model.read_dataset('/mnt/data/energy_consumption_dataset.csv')
-# model.feature_engineering()
+model = EnergyConsumptionModel()
+model.read_dataset('energy_consumption/2024-07-02/energy_consumption_dataset.parquet')
+model.feature_engineering()

 # For heating_kwh
-# model.split_dataset(target='heating_kwh')
-# model.fit_model(target='heating_kwh')
-# print(model.evaluate_model(target='heating_kwh'))
-# model.save_model(target='heating_kwh')
+model.split_dataset(target='heating_kwh')
+model.fit_model(target='heating_kwh')
+print(model.evaluate_model(target='heating_kwh'))
+model.save_model(target='heating_kwh')

 # For hot_water_kwh
-# model.split_dataset(target='hot_water_kwh')
-# model.fit_model(target='hot_water_kwh')
-# print(model.evaluate_model(target='hot_water_kwh'))
-# model.save_model(target='hot_water_kwh')
+model.split_dataset(target='hot_water_kwh')
+model.fit_model(target='hot_water_kwh')
+print(model.evaluate_model(target='hot_water_kwh'))
+model.save_model(target='hot_water_kwh')
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@ -132,6 +132,9 @@ def app():

    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
+        # Skip the first 50
+        if i < 50:
+            continue

        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
        # Rename the columns to the same format as the api returns
@ -148,8 +151,7 @@ def app():

        collected_data = []
        for _, property_data in data.iterrows():
-            # Sleep for a random time between 0.1 and 1.4 seconds
-            time.sleep(np.random.uniform(0.1, 1.4))
+            time.sleep(np.random.uniform(0.3, 2))

            uprn = int(property_data["uprn"])
            address = property_data["address1"]