making the data objects dictionaries for different targets

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-02 16:06:01 +01:00
parent dd0deab0ee
commit 7790822e76
2 changed files with 81 additions and 34 deletions

View file

@ -3,51 +3,87 @@ from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
class EnergyConsumptionModel:
FEATURES = ['feature_1', 'feature_2']
FEATURES = {
"heating_kwh": [
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
"heating-cost-current",
],
"hot_water_kwh": [
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
"hot-water-cost-current"
]
}
TARGETS = ['heating_kwh', 'hot_water_kwh']
CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
"hot-water-cost-current"]
def __init__(self, model_paths=None):
self.models = {}
self.model_paths = model_paths or {}
self.data = None
self.dummy_columns = None
self.X_train = None
self.X_test = None
self.y_train = None
self.y_test = None
self.x_train = {}
self.x_test = {}
self.y_train = {}
self.y_test = {}
if model_paths:
for target, path in model_paths.items():
self.models[target] = read_pickle_from_s3(bucket_name="retrofit-model-directory-dev", s3_file_name=path)
def read_dataset(self, file_path):
self.data = pd.read_csv(file_path)
self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)
def feature_engineering(self):
# Example feature engineering steps
self.data['feature_1'] = self.data['original_feature_1'] ** 2
self.data['feature_2'] = self.data['original_feature_2'] ** 0.5
# Add more feature engineering steps as required
# Extract date features
self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
self.data["lodgement-month"] = self.data["lodgement-date"].dt.month
# Convert data types
self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)
# Convert categorical columns to dummies
self.data = pd.get_dummies(self.data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
# Store the dummy columns
self.dummy_columns = {}
for target in self.TARGETS:
target_features = self.FEATURES[target]
dummy_feature_columns = []
for feature in target_features:
if feature in self.CATEGORICAL_COLUMNS:
dummy_feature_columns.extend([col for col in self.data.columns if col.startswith(feature + '_')])
else:
dummy_feature_columns.append(feature)
self.dummy_columns[target] = dummy_feature_columns
def split_dataset(self, target, test_size=0.2, random_state=42):
X = self.data[self.FEATURES]
if target not in self.TARGETS:
raise ValueError(f"Target {target} not in {self.TARGETS}")
x = self.data[self.dummy_columns[target]]
y = self.data[target]
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
X, y, test_size=test_size, random_state=random_state
self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
x, y, test_size=test_size, random_state=random_state
)
def fit_model(self, target):
self.models[target] = LinearRegression()
self.models[target].fit(self.X_train, self.y_train)
self.models[target].fit(self.x_train[target], self.y_train[target])
def evaluate_model(self, target):
y_pred = self.models[target].predict(self.X_test)
mse = mean_squared_error(self.y_test, y_pred)
r2 = r2_score(self.y_test, y_pred)
y_pred = self.models[target].predict(self.x_test[target])
mse = mean_squared_error(self.y_test[target], y_pred)
r2 = r2_score(self.y_test[target], y_pred)
return {'MSE': mse, 'R2': r2}
def save_model(self, target):
@ -67,23 +103,32 @@ class EnergyConsumptionModel:
def transform_new_data(self, new_data):
# Apply the same transformations as in feature_engineering
new_data['feature_1'] = new_data['original_feature_1'] ** 2
new_data['feature_2'] = new_data['original_feature_2'] ** 0.5
return new_data[self.FEATURES]
new_data["lodgement-date"] = pd.to_datetime(new_data["lodgement-date"])
new_data["lodgement-year"] = new_data["lodgement-date"].dt.year
new_data["lodgement-month"] = new_data["lodgement-date"].dt.month
# Convert categorical columns to dummies
new_data = pd.get_dummies(new_data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)
# Align new data with the dummy columns from training data
new_data = new_data.reindex(columns=self.dummy_columns, fill_value=0)
return new_data.drop(columns=[target for target in self.TARGETS if target in new_data.columns])
# Example usage:
# model = EnergyConsumptionModel()
# model.read_dataset('/mnt/data/energy_consumption_dataset.csv')
# model.feature_engineering()
model = EnergyConsumptionModel()
model.read_dataset('energy_consumption/2024-07-02/energy_consumption_dataset.parquet')
model.feature_engineering()
# For heating_kwh
# model.split_dataset(target='heating_kwh')
# model.fit_model(target='heating_kwh')
# print(model.evaluate_model(target='heating_kwh'))
# model.save_model(target='heating_kwh')
model.split_dataset(target='heating_kwh')
model.fit_model(target='heating_kwh')
print(model.evaluate_model(target='heating_kwh'))
model.save_model(target='heating_kwh')
# For hot_water_kwh
# model.split_dataset(target='hot_water_kwh')
# model.fit_model(target='hot_water_kwh')
# print(model.evaluate_model(target='hot_water_kwh'))
# model.save_model(target='hot_water_kwh')
model.split_dataset(target='hot_water_kwh')
model.fit_model(target='hot_water_kwh')
print(model.evaluate_model(target='hot_water_kwh'))
model.save_model(target='hot_water_kwh')

View file

@ -132,6 +132,9 @@ def app():
energy_consumption_data = []
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
# Skip the first 50
if i < 50:
continue
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
# Rename the columns to the same format as the api returns
@ -148,8 +151,7 @@ def app():
collected_data = []
for _, property_data in data.iterrows():
# Sleep for a random time between 0.1 and 1.4 seconds
time.sleep(np.random.uniform(0.1, 1.4))
time.sleep(np.random.uniform(0.3, 2))
uprn = int(property_data["uprn"])
address = property_data["address1"]