Model/model_data/simulation_system/app.py
2023-07-31 11:02:22 +01:00

108 lines
3.3 KiB
Python

import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
def list_subdirectories(directory_path):
    """Return the names of the directories located directly inside a folder.

    Args:
        directory_path: Path of the folder to inspect.

    Returns:
        A list of entry names (not full paths) for which ``os.path.isdir``
        is true, in the order ``os.listdir`` reports them. Plain files are
        left out.
    """
    subdirectory_names = []
    for entry_name in os.listdir(directory_path):
        entry_path = os.path.join(directory_path, entry_name)
        if os.path.isdir(entry_path):
            subdirectory_names.append(entry_name)
    return subdirectory_names
# Folder that app() scans: each of its sub-directories is expected to hold a
# "certificates.csv". The path is anchored to the working directory at import
# time, so the script has to be launched from the project root.
DATA_DIRECTORY = ''.join((
    os.getcwd(),
    '/model_data/simulation_system/data/all-domestic-certificates',
))
# Certificate columns that app() treats as fixed attributes of a property:
# for every UPRN it gathers the distinct valid values of each column listed
# here and reduces them to a single entry in its `fixed_data` dict.
FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "GLAZED_AREA",
    "FLOOR_HEIGHT",
    "FLOOR_LEVEL",
    "TOTAL_FLOOR_AREA",
]
# Certificate columns that app() slices out of each property's records as its
# `variable_data` (the per-certificate values compared between consecutive
# EPCs). Every column must appear exactly once: selecting a DataFrame with a
# repeated label yields duplicate columns, and a record lookup on that label
# then returns a Series instead of a single value.
COMPONENT_FEATURES = [
    'TRANSACTION_TYPE',
    'WALLS_DESCRIPTION',
    'FLOOR_DESCRIPTION',
    'LIGHTING_DESCRIPTION',
    'ROOF_DESCRIPTION',
    'MAINHEAT_DESCRIPTION',
    'HOTWATER_DESCRIPTION',
    'MAIN_FUEL',
    'MECHANICAL_VENTILATION',
    'SECONDHEAT_DESCRIPTION',
    'ENERGY_TARIFF',  # Not sure if this is relevant
    'SOLAR_WATER_HEATING_FLAG',
    'PHOTO_SUPPLY',
    'WINDOWS_DESCRIPTION',
    'GLAZED_TYPE',
    'MULTI_GLAZE_PROPORTION',
    'LOW_ENERGY_LIGHTING',
    'NUMBER_OPEN_FIREPLACES',
    'MAINHEATCONT_DESCRIPTION',
    'EXTENSION_COUNT',
]
# Subset of the fixed features that app() singles out for averaging: for a
# field listed here it checks the relative deviation between recorded values
# against a 10% threshold and uses np.mean of the values.
AVERAGE_FIXED_FEATURES = ['TOTAL_FLOOR_AREA']
def _resolve_fixed_feature(field, values):
    """Collapse the valid values seen for one fixed feature into a single value.

    Args:
        field: Column name being resolved (an entry of ``FIXED_FEATURES``).
        values: List of the distinct, non-null, non-anomalous values recorded
            for ``field`` across one property's certificates.

    Returns:
        ``None`` when ``values`` is empty; the sole value when there is
        exactly one; otherwise, for fields in ``AVERAGE_FIXED_FEATURES``,
        the mean of the values.

    Raises:
        ValueError: If a field that is not averaged has more than one value,
            or if an averaged field has a value that differs from the first
            recorded value by more than 10%.
    """
    if not values:
        return None
    if len(values) == 1:
        return values[0]
    if field not in AVERAGE_FIXED_FEATURES:
        raise ValueError("Fixed feature {} has more than one value - fix me".format(field))

    # Check the values are not too far apart. Every value is compared with
    # the first one, and the test is written as a product rather than a
    # division so that a zero baseline cannot cause a division by zero.
    baseline = values[0]
    if any(abs(value - baseline) > 0.1 * abs(baseline) for value in values[1:]):
        raise ValueError("Large deviation in fixed feature {} - fix me".format(field))
    return np.mean(values)


def app():
    """Walk the certificate files and pair up successive EPCs per property.

    For every sub-directory of ``DATA_DIRECTORY`` the function reads its
    ``certificates.csv``, drops rows without a UPRN, keeps only UPRNs that
    have more than one certificate, and orders the rows by UPRN and
    ``LODGEMENT_DATE``. For each property it then resolves the fixed
    features to one value each and steps through consecutive certificate
    pairs of the component features.

    The function returns nothing, and the resolved fixed data and record
    pairs are not yet consumed by any later step.

    Raises:
        ValueError: When a property's certificates disagree on a fixed
            feature (see ``_resolve_fixed_feature``).
    """
    # Select each component column once, in first-seen order, even if
    # COMPONENT_FEATURES lists a column more than once.
    component_columns = list(dict.fromkeys(COMPONENT_FEATURES))

    # Get all the files in the directory
    for directory in tqdm(list_subdirectories(DATA_DIRECTORY)):
        filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
        df = pd.read_csv(filepath, low_memory=False)

        # Rows without a UPRN cannot be assigned to a property. The copy
        # makes the column assignment below operate on an independent frame
        # instead of on a filtered slice of the frame that was read.
        df = df[df["UPRN"].notna()].copy()
        df["UPRN"] = df["UPRN"].astype(int).astype(str)

        # Take UPRNs with multiple EPCs: keep=False flags every row whose
        # UPRN occurs more than once, not just the repeats.
        df = df[df.duplicated("UPRN", keep=False)]
        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

        for uprn, property_data in df.groupby("UPRN"):
            # Fixed features - these are property attributes that shouldn't
            # change over time
            fixed_data = {}
            for field in FIXED_FEATURES:
                observed = property_data[field].dropna().unique()
                # Remove invalid values
                valid = [v for v in observed if v not in BaseUtility.DATA_ANOMALY_MATCHES]
                fixed_data[field] = _resolve_fixed_feature(field, valid)

            variable_data = property_data[component_columns]
            # Rows are already in lodgement order, so neighbouring rows form
            # (earlier, later) certificate pairs.
            for idx in range(property_data.shape[0] - 1):
                starting_record = variable_data.iloc[idx]
                ending_record = variable_data.iloc[idx + 1]
                # NOTE(review): fixed_data, starting_record and ending_record
                # are built but not used yet - the comparison step is missing.