working on sap model but need to clean lighting

This commit is contained in:
Khalim Conn-Kowlessar 2023-07-03 14:39:46 +01:00
parent f941a3c512
commit ba201c8b6a
2 changed files with 127 additions and 51 deletions

View file

@ -6,7 +6,7 @@ import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
median_absolute_error, mean_absolute_percentage_error
with open("all_data.pkl", "wb") as f:
with open("all_data.pkl", "rb") as f:
all_data = pickle.load(f)
@ -18,25 +18,53 @@ class SalModel:
BASE_FEATURES = [
"property-type",
"built-form",
# "construction-age-band",
"construction-age-band",
"number-habitable-rooms",
"constituency",
"number-heated-rooms",
"transaction-type"
]
COMPONENT_FEATURES = [
"walls-description",
"floor-description",
"lighting-description",
"windows-description",
"roof-description",
"mainheat-description",
"main-fuel"
"hotwater-description",
"main-fuel",
"mechanical-ventilation",
"secondheat-description",
"energy-tariff",
"solar-water-heating-flag",
"photo-supply",
"windows-description",
"glazed-type",
"glazed-area",
"multi-glaze-proportion",
# "lighting-description" # Might not need to use this
"low-energy-lighting",
"number-open-fireplaces",
]
CATEGORICAL_COLS = [
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
"property-type",
"built-form",
"number-habitable-rooms",
"constituency",
"number-heated-rooms",
"lighting-description",
"mainheat-description",
"hotwater-description",
"main-fuel",
"mechanical-ventilation",
"secondheat-description",
"energy-tariff",
"solar-water-heating-flag",
"windows-description",
"glazed-type",
"glazed-area",
"mainheat-description",
]
@ -51,41 +79,103 @@ class SalModel:
self.fit_error = None
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
def _append_extracted_u_values(self, model_data):
"""
We need to estimate the u-value impact for:
1) Walls
2) Roof
3) Floors
"""
wall_u_values = pd.DataFrame(self.cleaner.cleaned["walls-description"])[
["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "walls_u_value"}
)
floor_u_values = pd.DataFrame(self.cleaner.cleaned["floor-description"])[
["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "floor_u_value"}
)
roof_u_values = pd.DataFrame(self.cleaner.cleaned["roof-description"])[
["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "roof_u_value", }
)
model_data = model_data.merge(
wall_u_values,
how="left",
left_on="walls-description",
right_on="original_description"
).drop(
columns=["original_description"]
).merge(
floor_u_values,
how="left",
left_on="floor-description",
right_on="original_description"
).drop(
columns=["original_description"]
).merge(
roof_u_values,
how="left",
left_on="roof-description",
right_on="original_description"
)
return model_data
@staticmethod
def _convert_transaction_type(model_data):
model_data["is_rdsap"] = model_data["transaction-type"] != "new dwelling"
model_data = model_data.drop(columns=["transaction-type"])
return model_data
@staticmethod
def _clean_numericals(model_data):
for col in ["photo-supply", "multi-glaze-proportion", "low-energy-lighting"]:
model_data[col] = np.where(
model_data[col] == "", "0", model_data["photo-supply"]
).astype(float)
# We need to clean lighting
return model_data
def create_dataset(self):
model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
model_data = model_data.reset_index(drop=True)
model_data["idx"] = model_data.index.copy()
# Append on u-value estimates
model_data = model_data.merge(
pd.DataFrame(self.cleaner.cleaned["walls-description"])[
["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "walls_u_value", }
),
how="left",
left_on="walls-description",
right_on="original_description"
) \
.drop(columns=["original_description"]) \
.merge(
pd.DataFrame(self.cleaner.cleaned["floor-description"])[
["original_description", "thermal_transmittance"]].rename(
columns={"thermal_transmittance": "floor_u_value", }
),
how="left",
left_on="floor-description",
right_on="original_description"
)
# Append on u-values
model_data = self._append_extracted_u_values(model_data)
# Convert transaction_type
model_data = self._convert_transaction_type(model_data)
# Clean numerical columns
model_data = self._clean_numericals(model_data)
# Take just entries with U-values
# TODO: Rather than doing this, do we want to include the estimated u-values?
# Since this ends up with just 2k entries
model_data = model_data[
~pd.isnull(model_data["walls_u_value"]) &
~pd.isnull(model_data["floor_u_value"])
]
model_data = model_data[
self.BASE_FEATURES + [c for c in self.COMPONENT_FEATURES if c not in [
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", self.RESPONSE]
~pd.isnull(model_data["floor_u_value"]) &
~pd.isnull(model_data["roof_u_value"])
]
exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"]
features = [
x for x in self.BASE_FEATURES +
self.COMPONENT_FEATURES +
["walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE] if x not in exclude_features
]
model_data = model_data[features]
for col in self.CATEGORICAL_COLS:
model_data[col] = model_data[col].astype('category')
@ -168,3 +258,9 @@ class SalModel:
worst_errors = errors.nlargest(n, 'Absolute Residual')
return metrics, worst_errors
# Build a module-level SalModel from the "data" and "cleaner" entries of the
# unpickled `all_data` payload.
# NOTE(review): binding the instance to the name `self` looks like a convenience
# for running method bodies line-by-line in an interactive session - confirm
# whether this scaffolding should stay in the module.
self = SalModel(
data=all_data["data"],
cleaner=all_data["cleaner"]
)

View file

@ -265,26 +265,6 @@ def handler():
import numpy as np
# Notes
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
# so that we have more data.
# TODO: Add in the u-values for roofs rather than the description
# TODO: Add in the actual property features for walls, floors, roof, not just the u-value
# TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
# TODO: Remove cases where descriptions have no data or are error cases
#
# property type looks okay - we're definitely low on the number of bungalows
# number-habitable-rooms & number-heated-rooms is unpopulated so pretty useless atm
# **** constituency should be looked at - potentially modelled individually as some constituencies
# perform much worse than others despite enough data.
# **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type
# and a few of the categories have barely any data and poor scores
# **** windows-description: again, most of the properties are of the same type; need more samples
# for the smaller groups
# **** Turn roof into U-value
# **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
# MAPE though.
grouped_error = []
groupby = ["mainheat-description"]
for group, data in model_data.groupby(groupby, observed=True):