mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on data differencing code
This commit is contained in:
parent
7382b17232
commit
00328d461b
4 changed files with 109 additions and 4 deletions
|
|
@ -258,6 +258,10 @@ class DataProcessor:
|
|||
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
||||
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
|
||||
|
||||
convert_to_lower = ["TRANSACTION_TYPE"]
|
||||
for col in convert_to_lower:
|
||||
data[col] = data[col].str.lower()
|
||||
|
||||
self.data = data
|
||||
|
||||
def make_cleaning_averages(self) -> pd.DataFrame:
|
||||
|
|
|
|||
|
|
@ -83,8 +83,6 @@ FIXED_FEATURES = [
|
|||
"CONSTITUENCY",
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
"FLOOR_HEIGHT"
|
||||
"TOTAL_FLOOR_AREA",
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = [
|
||||
|
|
@ -213,3 +211,11 @@ fill_na_map = {
|
|||
"EXTENSION_COUNT": 0,
|
||||
"NUMBER_OPEN_FIREPLACES": 0
|
||||
}
|
||||
|
||||
# After the property descriptions have been re-mapped, we expect these features to remain fixed
|
||||
FIXED_DESCRIPTON_MAPPED_FEATURES = [
|
||||
'another_property_below', 'is_roof_room', 'is_granite_or_whinstone', 'is_flat', 'is_suspended',
|
||||
'has_dwelling_above', 'is_as_built', 'is_to_external_air', 'is_cob', 'is_pitched', 'is_solid', 'is_at_rafters',
|
||||
'is_solid_brick', 'is_loft', 'is_system_built', 'is_timber_frame', 'is_sandstone_or_limestone', 'is_filled_cavity',
|
||||
'is_cavity_wall', 'is_thatched', 'is_to_unheated_space'
|
||||
]
|
||||
|
|
|
|||
|
|
@ -509,6 +509,8 @@ def app():
|
|||
pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE)
|
||||
).dt.days
|
||||
|
||||
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||
|
||||
# We look for key building fabric features that have changed from one EPC to the next.
|
||||
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
|
||||
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
|
||||
|
|
@ -537,6 +539,86 @@ def app():
|
|||
if pd.isnull(data_by_urpn_df).sum().sum():
|
||||
raise ValueError("Null values found in dataset after process_and_prune_desriptions")
|
||||
|
||||
# TODO: Move to DataProcessor
|
||||
def difference_data(df):
    """Collapse paired starting/ending columns of ``df`` into ``*_DIFF`` columns.

    Columns listed in ``FIXED_FEATURES`` or ``FIXED_DESCRIPTON_MAPPED_FEATURES``,
    plus the target/identifier columns named below, are passed through untouched.
    Every other column is expected to come as a pair: ``<name>_STARTING`` (or a
    bare ``<name>``) together with ``<name>_ENDING``.

    * Numeric pairs become ``<name>_DIFF = ending - starting``.
    * Non-numeric pairs are one-hot encoded with ``pd.get_dummies`` and each
      level becomes ``<name>_<level>_DIFF``: the ending indicator minus the
      starting indicator, i.e. -1, 0 or 1.

    The paired source columns (and their one-hot indicator columns) are dropped
    from the result.

    Args:
        df: DataFrame holding the starting/ending column pairs.

    Returns:
        A new DataFrame with the pass-through columns followed by the ``*_DIFF``
        columns. Earlier revisions computed the differences but discarded them
        and returned ``None``.

    Raises:
        Exception: if a non-excluded column has no ``_ENDING`` counterpart.
    """
    from model_data.simulation_system.core.Settings import FIXED_FEATURES, FIXED_DESCRIPTON_MAPPED_FEATURES

    # Build the exclusion set once so the membership test below is O(1).
    excluded = set(FIXED_FEATURES) | set(FIXED_DESCRIPTON_MAPPED_FEATURES) | {
        "RDSAP_CHANGE", "HEAT_DEMAND_CHANGE", "CARBON_CHANGE", "SAP_STARTING", "HEAT_DEMAND_STARTING",
        "CARBON_STARTING", "UPRN", "CONSTITUENCY",
    }
    columns = {x for x in df.columns if x not in excluded}

    non_numerical_columns = [
        col for col in df.select_dtypes(exclude=['number']).columns.tolist() if col in columns
    ]
    non_numerical_lookup = set(non_numerical_columns)

    # Record the levels before encoding. Nulls are dropped because get_dummies
    # does not emit an indicator column for them by default.
    levels = {col: df[col].dropna().unique().tolist() for col in non_numerical_columns}

    df = pd.get_dummies(df, columns=non_numerical_columns)

    # We make sure there is a starting and ending version of the column.
    # Sorting keeps the output column order stable between runs (set iteration
    # order of strings depends on the hash seed).
    diff_columns = []
    no_diff_columns = []  # Store for debugging
    for col in sorted(columns):
        if "_ENDING" in col:
            # Don't keep the endings
            continue
        # We have a starting column so check if we have an ending
        if col.replace("_STARTING", "") + "_ENDING" in columns:
            diff_columns.append(col)
        else:
            no_diff_columns.append(col)

    if any(c not in FIXED_DESCRIPTON_MAPPED_FEATURES for c in no_diff_columns):
        raise Exception("Something went wrong, potentially missed a differencing colunn")

    def _indicator(frame, name):
        """Return the one-hot column ``name`` as ints, or zeros if that level never occurred."""
        if name in frame.columns:
            # Cast first: bool columns cannot be subtracted, and uint8 dummies
            # (older pandas) would wrap around to 255 instead of giving -1.
            return frame[name].astype(int)
        return pd.Series(0, index=frame.index)

    # Do the differencing
    cols_to_append = {}
    cols_to_drop = []
    for starting_col in diff_columns:
        base_col = starting_col.replace("_STARTING", "")
        # Same derivation as the existence check above, so the two cannot disagree.
        ending_col = base_col + "_ENDING"

        if starting_col not in non_numerical_lookup:
            cols_to_append[f"{base_col}_DIFF"] = df[ending_col] - df[starting_col]
            cols_to_drop.extend([starting_col, ending_col])
            continue

        # NOTE(review): assumes the ending column is non-numeric whenever the
        # starting column is; otherwise levels[ending_col] raises KeyError.
        # Union of the levels seen on either side, in first-seen order.
        level_values = list(dict.fromkeys(levels[starting_col] + levels[ending_col]))

        for level in level_values:
            starting_level_col = "_".join([starting_col, str(level)])
            ending_level_col = "_".join([ending_col, str(level)])

            # A level seen on only one side has no indicator column on the
            # other side; _indicator treats that side as all zeros.
            cols_to_append[f"{base_col}_{level}_DIFF"] = (
                _indicator(df, ending_level_col) - _indicator(df, starting_level_col)
            )

            cols_to_drop.extend(c for c in (starting_level_col, ending_level_col) if c in df.columns)

    # Drop the source columns once, then attach every difference column in a
    # single concat instead of growing the frame column by column.
    df = df.drop(columns=cols_to_drop)
    if cols_to_append:
        df = pd.concat([df, pd.DataFrame(cols_to_append, index=df.index)], axis=1)

    return df
|
||||
|
||||
dataset.append(data_by_urpn_df)
|
||||
|
||||
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
import math
|
||||
from copy import deepcopy
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from backend.Property import Property
|
||||
from statistics import mean
|
||||
from recommendations.rdsap_tables import (
|
||||
|
|
@ -211,6 +214,11 @@ def get_wall_u_value(clean_description, age_band, is_granite_or_whinstone, is_sa
|
|||
mapped_description = epc_wall_description_map[clean_description]
|
||||
|
||||
mapped_value = wall_uvalues_df[wall_uvalues_df["Wall_type"] == mapped_description][age_band].values[0]
|
||||
|
||||
if pd.isnull(mapped_value) and "Park home" in mapped_description:
|
||||
# We don't know enough in this case so we default to 0
|
||||
return 0
|
||||
|
||||
if mapped_value == "a":
|
||||
# The rdSap documentation indicates we should use a formula to calculate the u-value
|
||||
return float(
|
||||
|
|
@ -231,7 +239,8 @@ def get_wall_u_value(clean_description, age_band, is_granite_or_whinstone, is_sa
|
|||
return min(potential_uvalue, formula_uvalue)
|
||||
|
||||
if mapped_value == "s1.1.2":
|
||||
return None
|
||||
# We don't know enough in this case so we default to 0
|
||||
return 0
|
||||
|
||||
return float(mapped_value)
|
||||
|
||||
|
|
@ -410,7 +419,11 @@ def get_floor_u_value(floor_type, area, perimeter, age_band, wall_type, insulati
|
|||
Rse = 0.04 # in m²K/W
|
||||
lambda_ins = 0.035 # thermal conductivity of floor insulation in W/m·K
|
||||
|
||||
wall_thickness = [x[age_band] for x in default_wall_thickness if x["type"] == wall_type][0] / 1000
|
||||
wall_thickness = [x[age_band] for x in default_wall_thickness if x["type"] == wall_type][0]
|
||||
if wall_thickness is None and wall_type == "park home":
|
||||
# We don't know enough and likely won't make recommendations
|
||||
return 0
|
||||
wall_thickness = wall_thickness / 1000
|
||||
|
||||
if insulation_thickness is None:
|
||||
insulation_lookup = s11[s11["Age_band"].str.contains(age_band) & s11["Floor_construction"] == floor_type]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue