Model/model_data/simulation_system/energy_predictor.py
2023-08-13 11:43:45 +00:00

63 lines
No EOL
2.1 KiB
Python

from pathlib import Path
from Settings import (
RDSAP_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS
)
from model_data.BaseUtility import BaseUtility
from tqdm import tqdm
import pandas as pd
import numpy as np
# Folder next to this module that main() scans for sub-directories,
# reading a "certificates.csv" from each one.
DATA_DIRECTORY = Path(__file__).parent.joinpath('data', 'all-domestic-certificates')
def _clean_certificates(df, anomaly_map):
    """
    Filter, recast and remap the rows of a single certificates.csv frame.

    :param df: DataFrame as loaded from one "certificates.csv" file.
    :param anomaly_map: Dict mapping every anomaly value to None; passed
        straight to ``DataFrame.replace``.
    :return: The cleaned DataFrame, sorted by UPRN then LODGEMENT_DATE.
    """
    # Remove any bad uprns and ignore old/bad data
    df = df[~pd.isnull(df["UPRN"])]
    df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
    df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
    # Explicit copy so the in-place assignments below operate on an
    # independent frame (avoids pandas' SettingWithCopyWarning).
    df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])].copy()

    # Where the glazing proportion is missing but the windows description
    # is one of the fully glazed descriptions, set the proportion to 100.
    no_multi_glaze_proportion_index = (
        pd.isnull(df["MULTI_GLAZE_PROPORTION"])
        & df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
    )
    df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100

    # Recast
    df["UPRN"] = df["UPRN"].astype(int).astype(str)
    df['MAIN_HEATING_CONTROLS'] = df['MAIN_HEATING_CONTROLS'].astype(float)

    # Sort Data
    df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

    # Replace every anomaly value with None - i.e. remove invalid values
    df = df.replace(anomaly_map)
    # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0
    df = df.replace(np.nan, None)

    # Remap certain columns
    df['FLOOR_LEVEL'] = df['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
    # The result was previously written to a misspelled 'BUILT_FROM' column,
    # which left 'BUILT_FORM' itself un-remapped.
    df['BUILT_FORM'] = df['BUILT_FORM'].replace(BUILT_FORM_REMAP)
    return df


def main(max_directories=10):
    """
    Extract all information to do a simple predictor for RDSAP

    Reads "certificates.csv" from sub-directories of DATA_DIRECTORY, cleans
    each file, concatenates the results and writes them to
    './energy_predictor_data.parquet' (relative to the working directory).

    :param max_directories: Maximum number of sub-directories to process
        (default 10). Pass None to process every sub-directory.
    :raises ValueError: If there are no sub-directories to process.
    """
    # iterdir() yields entries in arbitrary order; sorting makes the
    # selected subset reproducible between runs and machines.
    directories = sorted(entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir())
    directories = directories[:max_directories]
    if not directories:
        raise ValueError(f"No certificate directories found under {DATA_DIRECTORY}")

    # Map all anomaly values to None. Built once: it does not depend on the file.
    data_anomaly_map = dict.fromkeys(BaseUtility.DATA_ANOMALY_MATCHES)

    dfs = []
    for directory in tqdm(directories):
        filepath = directory / "certificates.csv"
        dfs.append(_clean_certificates(pd.read_csv(filepath), data_anomaly_map))

    data = pd.concat(dfs)
    data.to_parquet('./energy_predictor_data.parquet')
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()