Model/etl/property_dimensions/app.py
2024-02-19 18:37:50 +00:00

61 lines
2.2 KiB
Python

"""
A simple application that estimates basic property dimensions (room counts, total floor area and
floor height) from EPC data, for use as proxy values when a property's own EPC does not record them.
"""
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from etl.epc.settings import EARLIEST_EPC_DATE
from etl.epc.DataProcessor import EPCDataProcessor
from BaseUtility import Definitions
from utils.s3 import save_dataframe_to_s3_parquet
# Root folder of the locally downloaded certificates, resolved relative to this file.
DATA_DIRECTORY = Path(__file__).parent.joinpath("local_data", "all-domestic-certificates")

# Columns used as grouping keys when aggregating the certificates.
GROUPBY = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
    "CONSTRUCTION_AGE_BAND",
]

# Destination bucket name; taken from the BUCKET environment variable when it is set.
BUCKET = os.getenv("BUCKET", "retrofit-data-dev")
def app():
    """
    Aggregate EPC certificates into property dimension estimates and upload them to S3.

    Every sub-directory of DATA_DIRECTORY is read via its ``certificates.csv`` file. The certificates
    are filtered (lodged on/after EARLIEST_EPC_DATE, UPRN present, usable construction age band, all
    aggregated columns present), grouped by GROUPBY and aggregated: median for the two room counts,
    mean for total floor area and floor height. The result is written to
    ``property_dimensions/{local_authority}.parquet`` in BUCKET.

    Directories that have no rows left after filtering are skipped, since there is neither anything
    to aggregate nor a local authority to name the output file after.

    :raises ValueError: if one certificates file contains more than one LOCAL_AUTHORITY value
    """
    # Columns that must be populated for a certificate to contribute to the estimates
    required_columns = [
        "CONSTRUCTION_AGE_BAND",
        "TOTAL_FLOOR_AREA",
        "NUMBER_HABITABLE_ROOMS",
        "FLOOR_HEIGHT",
        "NUMBER_HEATED_ROOMS",
    ]
    aggregations = {
        "NUMBER_HEATED_ROOMS": "median",
        "NUMBER_HABITABLE_ROOMS": "median",
        "TOTAL_FLOOR_AREA": "mean",
        "FLOOR_HEIGHT": "mean",
    }

    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    for directory in tqdm(directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)

        data = data[data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
        # Explicit copy so the column assignments below act on an independent frame rather than on
        # a filtered slice of the CSV data (avoids pandas' SettingWithCopyWarning)
        data = data.dropna(subset=["UPRN"]).copy()

        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
            EPCDataProcessor.clean_construction_age_band
        )

        data = data.dropna(subset=required_columns)
        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]

        if data.empty:
            # tqdm.write keeps the message from corrupting the progress bar
            tqdm.write(f"Skipping {directory.name}: no usable certificates after filtering")
            continue

        # The output file is keyed by local authority, so each input file must cover exactly one
        local_authorities = data["LOCAL_AUTHORITY"].unique()
        if len(local_authorities) > 1:
            raise ValueError("More than one la in data")
        local_authority = local_authorities[0]

        df = data.groupby(GROUPBY).agg(aggregations).reset_index()

        save_dataframe_to_s3_parquet(
            df=df,
            bucket_name=BUCKET,
            file_key=f"property_dimensions/{local_authority}.parquet",
        )