Implemented XGBoost, which performs really well

2026-07-27 23:35:01 +00:00 · 2024-07-02 17:48:06 +01:00 · 2024-07-02 17:48:06 +01:00 · 0a1f728f37
commit 0a1f728f37
parent 39a4c2e975
2 changed files with 3 additions and 4 deletions
--- a/etl/bill_savings/EnergyConsumptionModel.py
+++ b/etl/bill_savings/EnergyConsumptionModel.py
@@ -2,7 +2,6 @@ import pandas as pd
 from xgboost import XGBRegressor
 from datetime import datetime
 from sklearn.model_selection import train_test_split
-from sklearn.linear_model import LinearRegression
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
 from sklearn.feature_selection import RFECV
 from utils.s3 import save_pickle_to_s3, read_pickle_from_s3, read_dataframe_from_s3_parquet
@@ -16,8 +15,8 @@ class EnergyConsumptionModel:
    FEATURES = {
        "heating_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
-            "heating-cost-current", "main-fuel", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
-            "mainheat-energy-eff"
+            "heating-cost-current", "total-floor-area", "number-heated-rooms", "number-habitable-rooms",
+            # "mainheat-energy-eff", "mainheat-description", "main-fuel",
        ],
        "hot_water_kwh": [
            "lodgement-year", "lodgement-month", "current-energy-efficiency", "energy-consumption-current",
--- a/etl/bill_savings/data_collection.py
+++ b/etl/bill_savings/data_collection.py
@@ -133,7 +133,7 @@ def app():
    energy_consumption_data = []
    for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
        # Skip the first 50
-        if i < 50:
+        if i < 90:
            continue

        data = pd.read_csv(directory / "certificates.csv", low_memory=False)