mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
working on sap model but need to clean lighting
This commit is contained in:
parent
f941a3c512
commit
ba201c8b6a
2 changed files with 127 additions and 51 deletions
|
|
@ -6,7 +6,7 @@ import pickle
|
||||||
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \
|
||||||
median_absolute_error, mean_absolute_percentage_error
|
median_absolute_error, mean_absolute_percentage_error
|
||||||
|
|
||||||
with open("all_data.pkl", "wb") as f:
|
with open("all_data.pkl", "rb") as f:
|
||||||
all_data = pickle.load(f)
|
all_data = pickle.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -18,25 +18,53 @@ class SalModel:
|
||||||
BASE_FEATURES = [
|
BASE_FEATURES = [
|
||||||
"property-type",
|
"property-type",
|
||||||
"built-form",
|
"built-form",
|
||||||
# "construction-age-band",
|
"construction-age-band",
|
||||||
"number-habitable-rooms",
|
"number-habitable-rooms",
|
||||||
"constituency",
|
"constituency",
|
||||||
"number-heated-rooms",
|
"number-heated-rooms",
|
||||||
|
"transaction-type"
|
||||||
]
|
]
|
||||||
|
|
||||||
COMPONENT_FEATURES = [
|
COMPONENT_FEATURES = [
|
||||||
"walls-description",
|
"walls-description",
|
||||||
"floor-description",
|
"floor-description",
|
||||||
"lighting-description",
|
"lighting-description",
|
||||||
"windows-description",
|
|
||||||
"roof-description",
|
"roof-description",
|
||||||
"mainheat-description",
|
"mainheat-description",
|
||||||
"main-fuel"
|
"hotwater-description",
|
||||||
|
"main-fuel",
|
||||||
|
"mechanical-ventilation",
|
||||||
|
"secondheat-description",
|
||||||
|
"energy-tariff",
|
||||||
|
"solar-water-heating-flag",
|
||||||
|
"photo-supply",
|
||||||
|
"windows-description",
|
||||||
|
"glazed-type",
|
||||||
|
"glazed-area",
|
||||||
|
"multi-glaze-proportion",
|
||||||
|
# "lighting-description" # Might not need to use this
|
||||||
|
"low-energy-lighting",
|
||||||
|
"number-open-fireplaces",
|
||||||
]
|
]
|
||||||
|
|
||||||
CATEGORICAL_COLS = [
|
CATEGORICAL_COLS = [
|
||||||
"property-type", "built-form", "number-habitable-rooms", "constituency", "number-heated-rooms",
|
"property-type",
|
||||||
"lighting-description", "windows-description", "roof-description", "mainheat-description", "main-fuel",
|
"built-form",
|
||||||
|
"number-habitable-rooms",
|
||||||
|
"constituency",
|
||||||
|
"number-heated-rooms",
|
||||||
|
"lighting-description",
|
||||||
|
"mainheat-description",
|
||||||
|
"hotwater-description",
|
||||||
|
"main-fuel",
|
||||||
|
"mechanical-ventilation",
|
||||||
|
"secondheat-description",
|
||||||
|
"energy-tariff",
|
||||||
|
"solar-water-heating-flag",
|
||||||
|
"windows-description",
|
||||||
|
"glazed-type",
|
||||||
|
"glazed-area",
|
||||||
|
"mainheat-description",
|
||||||
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
@ -51,41 +79,103 @@ class SalModel:
|
||||||
self.fit_error = None
|
self.fit_error = None
|
||||||
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
|
self.worst = {"errors": pd.DataFrame(), "x": pd.DataFrame()}
|
||||||
|
|
||||||
|
def _append_extracted_u_values(self, model_data):
    """
    Append the extracted u-values for walls, floors and roofs to model_data.

    For each component, the records held in
    self.cleaner.cleaned["<component>-description"] are reduced to the
    "original_description" and "thermal_transmittance" fields, and
    left-merged onto the matching "<component>-description" column of
    model_data. The thermal transmittance is exposed as:

    1) walls_u_value
    2) floor_u_value
    3) roof_u_value

    Rows whose description has no record in the cleaner output keep their
    place and get a null u-value.

    :param model_data: DataFrame holding the "walls-description",
        "floor-description" and "roof-description" columns
    :return: a new DataFrame with the three u-value columns appended; the
        "original_description" merge key is removed after every merge
    """

    # Description column in model_data -> name of the u-value column to add
    u_value_columns = {
        "walls-description": "walls_u_value",
        "floor-description": "floor_u_value",
        "roof-description": "roof_u_value",
    }

    for description_col, u_value_col in u_value_columns.items():
        lookup = pd.DataFrame(self.cleaner.cleaned[description_col])[
            ["original_description", "thermal_transmittance"]
        ].rename(columns={"thermal_transmittance": u_value_col})

        # NOTE(review): assumes each original_description appears once per
        # component in the cleaner output - repeated keys would multiply
        # rows in a left merge. Confirm against the cleaner.
        model_data = model_data.merge(
            lookup,
            how="left",
            left_on=description_col,
            right_on="original_description",
        ).drop(
            # The merge key is only a helper: drop it every time (including
            # after the roof merge) so it never leaks into the result
            columns=["original_description"]
        )

    return model_data
|
||||||
|
|
||||||
|
@staticmethod
def _convert_transaction_type(model_data):
    """
    Replace the "transaction-type" column with a boolean "is_rdsap" flag.

    "is_rdsap" is True for every row whose transaction type is anything
    other than "new dwelling". The input frame is not modified; a new
    frame is returned.

    :param model_data: DataFrame holding a "transaction-type" column
    :return: new DataFrame with "is_rdsap" appended and "transaction-type"
        removed
    """
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # (the previous in-place column write mutated the argument).
    is_rdsap = model_data["transaction-type"] != "new dwelling"
    return model_data.assign(is_rdsap=is_rdsap).drop(columns=["transaction-type"])
|
||||||
|
|
||||||
|
@staticmethod
def _clean_numericals(model_data):
    """
    Convert the numerical feature columns to float.

    For "photo-supply", "multi-glaze-proportion" and "low-energy-lighting",
    empty strings are replaced with "0" and the column is cast to float.
    The input frame is not modified; a new frame is returned.

    :param model_data: DataFrame holding the three columns listed above
    :return: new DataFrame with those columns converted to float
    """
    numerical_cols = ["photo-supply", "multi-glaze-proportion", "low-energy-lighting"]

    # Each column is cleaned from its OWN values. The previous version always
    # read from "photo-supply", overwriting the other two columns.
    cleaned = {
        col: np.where(model_data[col] == "", "0", model_data[col]).astype(float)
        for col in numerical_cols
    }

    # TODO: We need to clean lighting

    return model_data.assign(**cleaned)
|
||||||
|
|
||||||
def create_dataset(self):
|
def create_dataset(self):
|
||||||
model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
|
model_data = self.df[[self.RESPONSE] + self.COMPONENT_FEATURES + self.BASE_FEATURES]
|
||||||
model_data = model_data.reset_index(drop=True)
|
model_data = model_data.reset_index(drop=True)
|
||||||
model_data["idx"] = model_data.index.copy()
|
model_data["idx"] = model_data.index.copy()
|
||||||
|
|
||||||
# Append on u-value estimates
|
# Append on u-values
|
||||||
model_data = model_data.merge(
|
model_data = self._append_extracted_u_values(model_data)
|
||||||
pd.DataFrame(self.cleaner.cleaned["walls-description"])[
|
|
||||||
["original_description", "thermal_transmittance"]].rename(
|
# Convert transaction_type
|
||||||
columns={"thermal_transmittance": "walls_u_value", }
|
model_data = self._convert_transaction_type(model_data)
|
||||||
),
|
|
||||||
how="left",
|
# Clean numerical columns
|
||||||
left_on="walls-description",
|
model_data = self._clean_numericals(model_data)
|
||||||
right_on="original_description"
|
|
||||||
) \
|
|
||||||
.drop(columns=["original_description"]) \
|
|
||||||
.merge(
|
|
||||||
pd.DataFrame(self.cleaner.cleaned["floor-description"])[
|
|
||||||
["original_description", "thermal_transmittance"]].rename(
|
|
||||||
columns={"thermal_transmittance": "floor_u_value", }
|
|
||||||
),
|
|
||||||
how="left",
|
|
||||||
left_on="floor-description",
|
|
||||||
right_on="original_description"
|
|
||||||
)
|
|
||||||
# Take just entries with U-values
|
# Take just entries with U-values
|
||||||
|
# TODO: Rather than doing this, do we want to include the estimated u-values?
|
||||||
|
# Since this ends up with just 2k entries
|
||||||
model_data = model_data[
|
model_data = model_data[
|
||||||
~pd.isnull(model_data["walls_u_value"]) &
|
~pd.isnull(model_data["walls_u_value"]) &
|
||||||
~pd.isnull(model_data["floor_u_value"])
|
~pd.isnull(model_data["floor_u_value"]) &
|
||||||
]
|
~pd.isnull(model_data["roof_u_value"])
|
||||||
model_data = model_data[
|
|
||||||
self.BASE_FEATURES + [c for c in self.COMPONENT_FEATURES if c not in [
|
|
||||||
"walls-description", "floor-description"]] + ["walls_u_value", "floor_u_value", self.RESPONSE]
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
exclude_features = ["walls-description", "floor-description", "roof-description", "transaction-type"]
|
||||||
|
|
||||||
|
features = [
|
||||||
|
x for x in self.BASE_FEATURES +
|
||||||
|
self.COMPONENT_FEATURES +
|
||||||
|
["walls_u_value", "floor_u_value", "roof_u_value", self.RESPONSE] if x not in exclude_features
|
||||||
|
]
|
||||||
|
|
||||||
|
model_data = model_data[features]
|
||||||
|
|
||||||
for col in self.CATEGORICAL_COLS:
|
for col in self.CATEGORICAL_COLS:
|
||||||
model_data[col] = model_data[col].astype('category')
|
model_data[col] = model_data[col].astype('category')
|
||||||
|
|
||||||
|
|
@ -168,3 +258,9 @@ class SalModel:
|
||||||
worst_errors = errors.nlargest(n, 'Absolute Residual')
|
worst_errors = errors.nlargest(n, 'Absolute Residual')
|
||||||
|
|
||||||
return metrics, worst_errors
|
return metrics, worst_errors
|
||||||
|
|
||||||
|
|
||||||
|
self = SalModel(
|
||||||
|
data=all_data["data"],
|
||||||
|
cleaner=all_data["cleaner"]
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -265,26 +265,6 @@ def handler():
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
# Notes
|
|
||||||
# TODO: We might want to look at adding in the u-value estimates for the properties that do not have them
|
|
||||||
# so that we have more data.
|
|
||||||
# TODO: Add in the u-values for roofs rather than the description
|
|
||||||
# TODO: Add in the actual property features for walls, floors, roof, not just the u-value
|
|
||||||
# TODO: Think about how we use sap vs rdsap - should we add a feature in the model for transaction-type?
|
|
||||||
# TODO: Remove cases where descriptions have no data or are error cases
|
|
||||||
#
|
|
||||||
# property type looks okay - we're definitely low on the number of bungalows
|
|
||||||
# number-habitable-rooms & number-heated-rooms is unpopulated so pretty useless atm
|
|
||||||
# **** constituency should be looked at - potentially modelled individually as some constituencies
|
|
||||||
# perform much worse than others despite enough data.
|
|
||||||
# **** Lighting is a bit of a mess - needs to be looked at. Most properties are of the same type
|
|
||||||
# and a few of the categories just have barely any data and poor scores
|
|
||||||
# **** windows-description again most of the properties are of the same type, need more samples
|
|
||||||
# for the smaller groups
|
|
||||||
# **** Turn roof into U-value
|
|
||||||
# **** mainheat is a bad one - community scheme seems to actually be quite a lot of properties, it's ok for
|
|
||||||
# MAPE though.
|
|
||||||
|
|
||||||
grouped_error = []
|
grouped_error = []
|
||||||
groupby = ["mainheat-description"]
|
groupby = ["mainheat-description"]
|
||||||
for group, data in model_data.groupby(groupby, observed=True):
|
for group, data in model_data.groupby(groupby, observed=True):
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue