""" This is a simple application which estimates some of the basic dimensions of a property based on EPC data which we can use as a proxy value if we don't have this information on the EPC """ import os from pathlib import Path import pandas as pd from tqdm import tqdm from etl.epc.settings import EARLIEST_EPC_DATE from etl.epc.DataProcessor import EPCDataProcessor from BaseUtility import Definitions from utils.s3 import save_dataframe_to_s3_parquet DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates" GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"] BUCKET = os.environ.get("BUCKET", "retrofit-data-dev") def app(): directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()] sample = [] for directory in tqdm(directories): data = pd.read_csv(directory / "certificates.csv", low_memory=False) data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] data = data[~pd.isnull(data["UPRN"])] data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float) data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply( lambda x: EPCDataProcessor.clean_construction_age_band(x) ) data = data[~pd.isnull(data["CONSTRUCTION_AGE_BAND"])] data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)] data = data[~pd.isnull(data["TOTAL_FLOOR_AREA"])] data = data[~pd.isnull(data["NUMBER_HABITABLE_ROOMS"])] data = data[~pd.isnull(data["FLOOR_HEIGHT"])] data = data[~pd.isnull(data["NUMBER_HEATED_ROOMS"])] df = ( data.groupby(GROUPBY) .agg( {"NUMBER_HEATED_ROOMS": "median", "NUMBER_HABITABLE_ROOMS": "median", "TOTAL_FLOOR_AREA": "mean", "FLOOR_HEIGHT": "mean"} ) .reset_index() ) local_authority = data["LOCAL_AUTHORITY"].unique() if len(local_authority) > 1: raise Exception("More than one la in data") local_authority = local_authority[0] save_dataframe_to_s3_parquet( df=df, bucket_name=BUCKET, file_key=f"property_dimensions/{local_authority}.parquet", )