mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on the data prep code
This commit is contained in:
parent
44099f8d83
commit
7d312d5c65
1 changed files with 85 additions and 19 deletions
|
|
@ -19,7 +19,6 @@ FIXED_FEATURES = [
|
|||
'CONSTITUENCY',
|
||||
'NUMBER_HEATED_ROOMS',
|
||||
'FIXED_LIGHTING_OUTLETS_COUNT',
|
||||
'GLAZED_AREA',
|
||||
'FLOOR_HEIGHT',
|
||||
'FLOOR_LEVEL',
|
||||
'TOTAL_FLOOR_AREA',
|
||||
|
|
@ -46,7 +45,8 @@ COMPONENT_FEATURES = [
|
|||
'LOW_ENERGY_LIGHTING',
|
||||
'NUMBER_OPEN_FIREPLACES',
|
||||
'MAINHEATCONT_DESCRIPTION',
|
||||
'EXTENSION_COUNT'
|
||||
'EXTENSION_COUNT',
|
||||
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
|
||||
]
|
||||
|
||||
# For these fields, we take an average if we have multiple values
|
||||
|
|
@ -62,12 +62,15 @@ LATEST_FIELD = [
|
|||
"NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
"CONSTRUCTION_AGE_BAND"
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"FLOOR_LEVEL",
|
||||
"CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for
|
||||
]
|
||||
|
||||
# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
|
||||
MANDATORY_FIXED_FEATURES = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
]
|
||||
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
|
|
@ -80,11 +83,60 @@ HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
|||
|
||||
|
||||
def make_cleaning_averages(df):
    """Build the per-group median tables used when cleaning fixed features.

    Two tables of medians of ``AVERAGE_FIXED_FEATURES`` are produced:

    * ``cleaning_averages`` - grouped by property type, built form, construction
      age band and the habitable / heated room counts.
    * ``general_averages`` - grouped by property type and built form only, as a
      coarser fallback.

    Missing values are ignored feature by feature. A group with no observation
    at all for a feature gets NaN for that feature.

    :param df: EPC records containing the grouping columns and every column
        named in ``AVERAGE_FIXED_FEATURES``.
    :return: tuple ``(cleaning_averages, general_averages)``; in both frames the
        grouping keys are ordinary columns.
    """
    detailed_keys = [
        "PROPERTY_TYPE",
        "BUILT_FORM",
        "CONSTRUCTION_AGE_BAND",
        "NUMBER_HABITABLE_ROOMS",
        "NUMBER_HEATED_ROOMS",
    ]
    general_keys = ["PROPERTY_TYPE", "BUILT_FORM"]

    # The groupby median already skips NaN column by column. Calling dropna()
    # on the whole feature frame first would discard every row with a gap in
    # *any* feature, throwing away valid readings for the other features.
    cleaning_averages = (
        df.groupby(detailed_keys, observed=True)[AVERAGE_FIXED_FEATURES]
        .median()
        .reset_index()
    )
    general_averages = (
        df.groupby(general_keys, observed=True)[AVERAGE_FIXED_FEATURES]
        .median()
        .reset_index()
    )

    return cleaning_averages, general_averages
|
||||
|
||||
|
||||
def iterative_filtering(cleaning_averages, property_data, columns_to_filter=None):
    """Narrow ``cleaning_averages`` to the rows that best match a property.

    The filters are applied one column at a time, in order, each comparing the
    column against the value in the first row of ``property_data``. A filter
    that would leave no rows is skipped and the remaining columns are still
    tried, so the result is never emptier than the input. A NaN property value
    never equals anything, so that column is skipped too.

    :param cleaning_averages: DataFrame of group averages containing every
        column named in ``columns_to_filter``.
    :param property_data: DataFrame of records for one property; only the first
        row is consulted.
    :param columns_to_filter: optional ordered list of columns to match on.
        Defaults to property type, built form, construction age band and the
        habitable / heated room counts.
    :return: a new DataFrame holding the matching rows of
        ``cleaning_averages``, or a copy of the whole table when no filter
        matched.
    :raises KeyError: if a filter column is missing from either frame.
    :raises IndexError: if ``property_data`` has no rows.
    """
    if columns_to_filter is None:
        columns_to_filter = [
            "PROPERTY_TYPE",
            "BUILT_FORM",
            "CONSTRUCTION_AGE_BAND",
            "NUMBER_HABITABLE_ROOMS",
            "NUMBER_HEATED_ROOMS",
        ]

    filtered_data = cleaning_averages

    for column in columns_to_filter:
        target = property_data[column].iloc[0]
        candidate = filtered_data[filtered_data[column] == target]

        # Keep the narrower selection only when something survives; otherwise
        # ignore this column and carry on with the next one.
        if not candidate.empty:
            filtered_data = candidate

    # Boolean indexing always yields a new frame, so a copy is only needed when
    # no filter matched and we still hold the caller's own object. This avoids
    # copying the whole table on every call.
    if filtered_data is cleaning_averages:
        filtered_data = cleaning_averages.copy()

    return filtered_data
|
||||
|
||||
|
||||
def clean_multi_glaze_proportion(df):
    """Fill missing ``MULTI_GLAZE_PROPORTION`` values for fully glazed windows.

    Where the proportion is null and ``WINDOWS_DESCRIPTION`` is one of the
    descriptions listed below, the proportion is set to 100. All other rows
    keep their existing value.

    :param df: DataFrame with ``MULTI_GLAZE_PROPORTION`` and
        ``WINDOWS_DESCRIPTION`` columns; the proportion column is reassigned
        on this frame.
    :return: the same ``df`` object.
    """
    column = "MULTI_GLAZE_PROPORTION"

    # Descriptions taken to mean that every window is multi-glazed.
    fully_glazed_descriptions = [
        "Fully double glazed",
        "High performance glazing",
        "Fully triple glazed",
        "Full secondary glazing",
        "Multiple glazing throughout",
    ]

    proportion_missing = pd.isnull(df[column])
    described_as_fully_glazed = df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)
    needs_fill = proportion_missing & described_as_fully_glazed

    df[column] = np.where(needs_fill, 100, df[column])

    return df
|
||||
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
|
|
@ -97,6 +149,15 @@ FLOOR_LEVEL_MAP = {
|
|||
"1st": 1,
|
||||
"2nd": 2,
|
||||
"3rd": 3,
|
||||
"ground floor": 0,
|
||||
# Put in estimates for these
|
||||
"mid floor": 2,
|
||||
"top floor": 4
|
||||
}
|
||||
|
||||
BUILT_FORM_REMAP = {
|
||||
"Enclosed End-Terrace": "End-Terrace",
|
||||
"Enclosed Mid-Terrace": "Mid-Terrace",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -105,6 +166,7 @@ def app():
|
|||
|
||||
directories = list_subdirectories(DATA_DIRECTORY)
|
||||
|
||||
dataset = []
|
||||
for directory in tqdm(directories):
|
||||
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
|
|
@ -112,12 +174,14 @@ def app():
|
|||
|
||||
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
cleaning_averages = make_cleaning_averages(df)
|
||||
cleaning_averages, general_averages = make_cleaning_averages(df)
|
||||
|
||||
# We remove EPCS that were conducted for a new build, since these are performed with
|
||||
# full SAP, which produces different results to the RdSAP methodology
|
||||
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
|
||||
|
||||
df = clean_multi_glaze_proportion(df)
|
||||
|
||||
df["UPRN"] = df["UPRN"].astype(int).astype(str)
|
||||
counts = df.groupby("UPRN").size().reset_index()
|
||||
counts.columns = ["UPRN", "count"]
|
||||
|
|
@ -128,7 +192,6 @@ def app():
|
|||
df = df[df["UPRN"].isin(counts["UPRN"])]
|
||||
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
|
||||
results = []
|
||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||
|
||||
# Fixed features - these are property attributes that shouldn't change over time
|
||||
|
|
@ -143,6 +206,9 @@ def app():
|
|||
if field == "FLOOR_LEVEL":
|
||||
vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
|
||||
|
||||
if field == "BUILT_FORM":
|
||||
vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals})
|
||||
|
||||
if field in AVERAGE_FIXED_FEATURES:
|
||||
|
||||
if len(vals) > 1:
|
||||
|
|
@ -155,18 +221,17 @@ def app():
|
|||
field_value = np.mean(vals)
|
||||
else:
|
||||
# Clean using averages
|
||||
avgs = cleaning_averages[
|
||||
(cleaning_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
|
||||
(cleaning_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) &
|
||||
(cleaning_averages["CONSTRUCTION_AGE_BAND"] == property_data["CONSTRUCTION_AGE_BAND"].iloc[
|
||||
0]) &
|
||||
(cleaning_averages["NUMBER_HABITABLE_ROOMS"] ==
|
||||
property_data["NUMBER_HABITABLE_ROOMS"].iloc[0]) &
|
||||
(cleaning_averages["NUMBER_HEATED_ROOMS"] == property_data["NUMBER_HEATED_ROOMS"].iloc[0])
|
||||
]
|
||||
|
||||
avgs = iterative_filtering(cleaning_averages, property_data)
|
||||
field_value = avgs[field].iloc[0]
|
||||
|
||||
if pd.isnull(field_value):
|
||||
# Just use the general averages
|
||||
field_value = general_averages[
|
||||
(general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
|
||||
(general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0])
|
||||
][field].iloc[0]
|
||||
|
||||
elif field in LATEST_FIELD:
|
||||
field_value = vals[-1] if vals else None
|
||||
else:
|
||||
|
|
@ -220,8 +285,9 @@ def app():
|
|||
"UPRN": uprn,
|
||||
"RDSAP_CHANGE": rdsap_change,
|
||||
"HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
**fixed_data,
|
||||
**features.to_dict()
|
||||
}
|
||||
)
|
||||
|
||||
results.extend(property_model_data)
|
||||
dataset.extend(property_model_data)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue