mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
making the data objects dictionaries for different targets
This commit is contained in:
parent
dd0deab0ee
commit
7790822e76
2 changed files with 81 additions and 34 deletions
|
|
@ -3,51 +3,87 @@ from datetime import datetime
|
|||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.metrics import mean_squared_error, r2_score
|
||||
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3
|
||||
from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
|
||||
|
||||
|
||||
class EnergyConsumptionModel:
|
||||
FEATURES = ['feature_1', 'feature_2']
|
||||
FEATURES = {
|
||||
"heating_kwh": [
|
||||
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
|
||||
"heating-cost-current",
|
||||
],
|
||||
"hot_water_kwh": [
|
||||
"lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
|
||||
"hot-water-cost-current"
|
||||
]
|
||||
}
|
||||
TARGETS = ['heating_kwh', 'hot_water_kwh']
|
||||
CATEGORICAL_COLUMNS = ["lodgement-year", "lodgement-month"]
|
||||
NUMERICAL_COLUMNS = ["current-energy-efficiency", "energy-consumption-current", "heating-cost-current",
|
||||
"hot-water-cost-current"]
|
||||
|
||||
def __init__(self, model_paths=None):
    """Create an empty model container, optionally loading fitted models.

    Args:
        model_paths: Optional mapping of target name to the S3 key of a
            pickled model. Every entry is loaded from the
            "retrofit-model-directory-dev" bucket into ``self.models``.
    """
    self.models = {}
    self.model_paths = model_paths or {}
    self.data = None
    # Set by feature_engineering(): target name -> list of model input columns.
    self.dummy_columns = None

    # Train/test partitions keyed by target name; filled by split_dataset().
    self.x_train = {}
    self.x_test = {}
    self.y_train = {}
    self.y_test = {}

    # NOTE(review): unpickling executes arbitrary code; the bucket must stay trusted.
    for target, path in self.model_paths.items():
        self.models[target] = read_pickle_from_s3(
            bucket_name="retrofit-model-directory-dev", s3_file_name=path
        )
|
||||
|
||||
def read_dataset(self, file_path):
    """Load the training dataset from S3 into ``self.data``.

    Args:
        file_path: Object key of a parquet file inside the
            "retrofit-data-dev" bucket.
    """
    # Only the parquet loader is used; the key is an S3 object key, not a
    # local CSV path, so it must not be passed to pd.read_csv.
    self.data = read_dataframe_from_s3_parquet(bucket_name="retrofit-data-dev", file_key=file_path)
|
||||
|
||||
def feature_engineering(self):
    """Derive model features in ``self.data`` and record per-target columns.

    Steps:
      1. Parse "lodgement-date" and derive "lodgement-year" / "lodgement-month".
      2. Coerce NUMERICAL_COLUMNS to numeric and CATEGORICAL_COLUMNS to str.
      3. One-hot encode the categorical columns (first level dropped).
      4. Store, for every target, the list of encoded columns that back its
         FEATURES entry in ``self.dummy_columns[target]``.
    """
    # Extract date features
    self.data["lodgement-date"] = pd.to_datetime(self.data["lodgement-date"])
    self.data["lodgement-year"] = self.data["lodgement-date"].dt.year
    self.data["lodgement-month"] = self.data["lodgement-date"].dt.month

    # Convert data types
    self.data[self.NUMERICAL_COLUMNS] = self.data[self.NUMERICAL_COLUMNS].apply(pd.to_numeric)
    self.data[self.CATEGORICAL_COLUMNS] = self.data[self.CATEGORICAL_COLUMNS].astype(str)

    # Convert categorical columns to dummies
    self.data = pd.get_dummies(self.data, columns=self.CATEGORICAL_COLUMNS, drop_first=True)

    # Map every target to the concrete columns its model consumes: a
    # categorical feature expands to its "<feature>_<level>" dummy columns,
    # any other feature is used as-is.
    encoded_columns = list(self.data.columns)
    self.dummy_columns = {}
    for target in self.TARGETS:
        columns = []
        for feature in self.FEATURES[target]:
            if feature in self.CATEGORICAL_COLUMNS:
                columns.extend(col for col in encoded_columns if col.startswith(feature + '_'))
            else:
                columns.append(feature)
        self.dummy_columns[target] = columns
|
||||
|
||||
def split_dataset(self, target, test_size=0.2, random_state=42):
    """Split the engineered data for one target into train and test parts.

    The partitions are stored in ``self.x_train``, ``self.x_test``,
    ``self.y_train`` and ``self.y_test`` under the key ``target``.

    Args:
        target: Name of the target column; must be one of TARGETS.
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.

    Raises:
        ValueError: If ``target`` is unknown, or feature_engineering() has
            not populated ``self.dummy_columns`` yet.
    """
    if target not in self.TARGETS:
        raise ValueError(f"Target {target} not in {self.TARGETS}")
    if self.dummy_columns is None:
        # Without this guard the lookup below fails with an opaque TypeError.
        raise ValueError("feature_engineering() must be called before split_dataset()")

    x = self.data[self.dummy_columns[target]]
    y = self.data[target]
    self.x_train[target], self.x_test[target], self.y_train[target], self.y_test[target] = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
|
||||
|
||||
def fit_model(self, target):
    """Fit a linear regression for ``target`` on its training partition.

    Args:
        target: Key previously passed to split_dataset().

    The fitted estimator is stored in ``self.models[target]``.
    """
    estimator = LinearRegression()
    estimator.fit(self.x_train[target], self.y_train[target])
    # Register only after a successful fit so a failure never leaves an
    # unfitted estimator in self.models.
    self.models[target] = estimator
|
||||
|
||||
def evaluate_model(self, target):
    """Score the model for ``target`` on its held-out test partition.

    Args:
        target: Key of a model in ``self.models`` with a matching entry in
            ``self.x_test`` / ``self.y_test``.

    Returns:
        dict with keys 'MSE' (mean squared error) and 'R2' (r2 score).
    """
    y_true = self.y_test[target]
    y_pred = self.models[target].predict(self.x_test[target])
    return {
        'MSE': mean_squared_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }
|
||||
|
||||
def save_model(self, target):
|
||||
|
|
@ -67,23 +103,32 @@ class EnergyConsumptionModel:
|
|||
|
||||
def transform_new_data(self, new_data, target=None):
    """Apply the training-time feature pipeline to unseen data.

    Args:
        new_data: DataFrame with a "lodgement-date" column plus the raw
            numeric feature columns. It is not modified.
        target: Optional target name. When given, the result has exactly
            the columns recorded for that target in ``self.dummy_columns``.
            When omitted, the result has the ordered union of the columns
            of all targets.

    Returns:
        DataFrame aligned to the training columns. Columns seen in training
        but absent from ``new_data`` are filled with 0.

    Raises:
        ValueError: If feature_engineering() has not been run on this
            instance, or ``target`` is not one of TARGETS.
    """
    if self.dummy_columns is None:
        raise ValueError("feature_engineering() must be called before transform_new_data()")

    if target is None:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        feature_columns = list(dict.fromkeys(
            column for columns in self.dummy_columns.values() for column in columns
        ))
    else:
        if target not in self.TARGETS:
            raise ValueError(f"Target {target} not in {self.TARGETS}")
        feature_columns = list(self.dummy_columns[target])

    # Work on a copy so the caller's frame keeps its original columns.
    frame = new_data.copy()
    frame["lodgement-date"] = pd.to_datetime(frame["lodgement-date"])
    frame["lodgement-year"] = frame["lodgement-date"].dt.year
    frame["lodgement-month"] = frame["lodgement-date"].dt.month

    # Same dtype handling as feature_engineering().
    numeric_present = [col for col in self.NUMERICAL_COLUMNS if col in frame.columns]
    if numeric_present:
        frame[numeric_present] = frame[numeric_present].apply(pd.to_numeric)
    frame[self.CATEGORICAL_COLUMNS] = frame[self.CATEGORICAL_COLUMNS].astype(str)

    # Keep every level here: which level is "first" depends on the rows
    # present, so dropping it at inference could erase a real category.
    # The reindex below removes the training baseline level instead.
    frame = pd.get_dummies(frame, columns=self.CATEGORICAL_COLUMNS, drop_first=False)

    # Align new data with the dummy columns from training data
    frame = frame.reindex(columns=feature_columns, fill_value=0)

    return frame.drop(columns=[name for name in self.TARGETS if name in frame.columns])
|
||||
|
||||
|
||||
# Example usage: train, evaluate and persist one model per target.
# NOTE(review): these statements run at import time (S3 read, training,
# S3 write); consider moving them behind an `if __name__ == "__main__":` guard.
model = EnergyConsumptionModel()
model.read_dataset('energy_consumption/2024-07-02/energy_consumption_dataset.parquet')
model.feature_engineering()

for target in ('heating_kwh', 'hot_water_kwh'):
    model.split_dataset(target=target)
    model.fit_model(target=target)
    print(model.evaluate_model(target=target))
    model.save_model(target=target)
|
||||
|
|
|
|||
|
|
@ -132,6 +132,9 @@ def app():
|
|||
|
||||
energy_consumption_data = []
|
||||
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
|
||||
# Skip the first 50
|
||||
if i < 50:
|
||||
continue
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
# Rename the columns to the same format as the api returns
|
||||
|
|
@ -148,8 +151,7 @@ def app():
|
|||
|
||||
collected_data = []
|
||||
for _, property_data in data.iterrows():
|
||||
# Sleep for a random time between 0.3 and 2 seconds
|
||||
time.sleep(np.random.uniform(0.1, 1.4))
|
||||
time.sleep(np.random.uniform(0.3, 2))
|
||||
|
||||
uprn = int(property_data["uprn"])
|
||||
address = property_data["address1"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue