mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
108 lines
3.3 KiB
Python
108 lines
3.3 KiB
Python
import numpy as np
|
|
import os
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
from model_data.BaseUtility import BaseUtility
|
|
|
|
|
|
def list_subdirectories(directory_path):
    """Return the names of the immediate sub-directories of ``directory_path``.

    Only entry names are returned (not full paths), in the order reported by
    ``os.listdir``. Regular files are skipped.
    """
    subdirectory_names = []
    for entry_name in os.listdir(directory_path):
        candidate = os.path.join(directory_path, entry_name)
        if os.path.isdir(candidate):
            subdirectory_names.append(entry_name)
    return subdirectory_names
|
|
|
|
|
|
# Folder that is scanned for per-area sub-directories of certificate data.
# It is resolved against the current working directory at import time, so the
# script has to be launched from the directory that contains ``model_data``.
DATA_DIRECTORY = (
    os.getcwd()
    + '/model_data/simulation_system/data/all-domestic-certificates'
)
|
|
|
|
# Certificate columns treated as fixed attributes of a property, i.e. values
# that are not expected to change between successive certificates lodged for
# the same UPRN.
FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "GLAZED_AREA",
    "FLOOR_HEIGHT",
    "FLOOR_LEVEL",
    "TOTAL_FLOOR_AREA",
]
|
|
|
|
# Certificate columns selected as the per-certificate (variable) data of a
# property. Each column is listed exactly once: a repeated name would produce
# duplicate column labels when the list is used to select from a DataFrame.
COMPONENT_FEATURES = [
    'TRANSACTION_TYPE',
    'WALLS_DESCRIPTION',
    'FLOOR_DESCRIPTION',
    'LIGHTING_DESCRIPTION',
    'ROOF_DESCRIPTION',
    'MAINHEAT_DESCRIPTION',
    'HOTWATER_DESCRIPTION',
    'MAIN_FUEL',
    'MECHANICAL_VENTILATION',
    'SECONDHEAT_DESCRIPTION',
    'ENERGY_TARIFF',  # Not sure if this is relevant
    'SOLAR_WATER_HEATING_FLAG',
    'PHOTO_SUPPLY',
    'WINDOWS_DESCRIPTION',
    'GLAZED_TYPE',
    'MULTI_GLAZE_PROPORTION',
    'LOW_ENERGY_LIGHTING',
    'NUMBER_OPEN_FIREPLACES',
    'MAINHEATCONT_DESCRIPTION',
    'EXTENSION_COUNT',
]
|
|
|
|
# Fixed features whose differing observations are averaged (after a deviation
# check) instead of being required to match exactly.
AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA"]
|
|
|
|
|
|
def _resolve_fixed_feature(field, values, tolerance=0.1):
    """Collapse the distinct observed values of one fixed feature into one value.

    Args:
        field: Column name of the feature being resolved.
        values: List of the distinct, non-null values observed for the
            property after anomaly values have been removed.
        tolerance: Largest accepted spread (max - min) of an averaged feature,
            expressed as a fraction of its first observed value.

    Returns:
        ``None`` when ``values`` is empty; the mean of ``values`` for fields
        listed in ``AVERAGE_FIXED_FEATURES``; otherwise the single value.

    Raises:
        ValueError: If a non-averaged field has more than one distinct value,
            or an averaged field varies by more than ``tolerance``.
    """
    if not values:
        return None

    if field in AVERAGE_FIXED_FEATURES:
        # Averaged features are allowed to differ slightly between
        # certificates, so the "more than one value" rule must not be applied
        # to them before the deviation check.
        # NOTE(review): assumes these columns are numeric - confirm.
        if len(values) > 1:
            baseline = values[0]
            spread = max(values) - min(values)
            if baseline == 0 or abs(spread / baseline) > tolerance:
                raise ValueError("Large deviation in fixed feature {} - fix me".format(field))
        return np.mean(values)

    if len(values) > 1:
        raise ValueError("Fixed feature {} has more than one value - fix me".format(field))

    return values[0]


def app():
    """Walk the certificate data and pair up successive certificates per property.

    For every sub-directory of ``DATA_DIRECTORY`` the ``certificates.csv`` file
    is loaded, rows without a UPRN are dropped and only properties with more
    than one certificate are kept. For each such property the fixed features
    are resolved to a single value and consecutive certificates (ordered by
    ``LODGEMENT_DATE``) are paired as a starting and an ending record.

    Returns:
        None. The resolved data is not yet stored or returned.

    Raises:
        ValueError: If a fixed feature cannot be resolved to a single value
            (see ``_resolve_fixed_feature``).
    """
    # Each sub-directory of the data folder is expected to hold one CSV file.
    directories = list_subdirectories(DATA_DIRECTORY)

    # Drop repeated names so the selection below has unique column labels.
    component_columns = list(dict.fromkeys(COMPONENT_FEATURES))

    for directory in tqdm(directories):
        filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
        df = pd.read_csv(filepath, low_memory=False)

        # Copy the filtered frame so the UPRN assignment below writes to an
        # independent object rather than to a slice of the loaded frame.
        df = df[df["UPRN"].notna()].copy()
        df["UPRN"] = df["UPRN"].astype(int).astype(str)

        # Keep only UPRNs that have more than one certificate.
        df = df[df["UPRN"].duplicated(keep=False)]
        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

        for uprn, property_data in df.groupby("UPRN"):

            # Fixed features - these are property attributes that shouldn't
            # change over time.
            fixed_data = {}
            for field in FIXED_FEATURES:
                vals = property_data[field].dropna().unique()
                # Remove invalid values
                vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
                fixed_data[field] = _resolve_fixed_feature(field, vals)

            variable_data = property_data[component_columns]

            # Pair every certificate with the one lodged directly after it.
            # TODO: the paired records are not consumed yet.
            for idx in range(len(variable_data) - 1):
                starting_record = variable_data.iloc[idx]
                ending_record = variable_data.iloc[idx + 1]
|