# Mirror of https://github.com/Hestia-Homes/Model.git
# (synced 2026-06-08 11:17:27 +00:00; 61 lines, 2.2 KiB, Python)
"""
|
|
This is a simple application which estimates some of the basic dimensions of a property based on EPC
|
|
data which we can use as a proxy value if we don't have this information on the EPC
|
|
"""
|
|
import logging
import os
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from etl.epc.settings import EARLIEST_EPC_DATE
from etl.epc.DataProcessor import EPCDataProcessor
from BaseUtility import Definitions
from utils.s3 import save_dataframe_to_s3_parquet

# Root folder of the EPC certificate data. app() walks its sub-directories and
# reads a ``certificates.csv`` file from each of them.
DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"

# Columns that define the groups over which the dimensions are aggregated.
# Kept as a list: pandas treats a list as several grouping keys.
GROUPBY = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY", "CONSTRUCTION_AGE_BAND"]

# Destination S3 bucket; can be overridden with the BUCKET environment variable.
BUCKET = os.environ.get("BUCKET", "retrofit-data-dev")


def app():
    """Aggregate EPC certificates per local authority and upload the results to S3.

    For every sub-directory of ``DATA_DIRECTORY`` the ``certificates.csv`` file
    is loaded, filtered down to rows with all the required fields, grouped by
    ``GROUPBY`` and aggregated (median room counts, mean floor area and mean
    floor height). The aggregated frame is saved to
    ``property_dimensions/<LOCAL_AUTHORITY>.parquet`` in ``BUCKET``.

    Directories whose certificates are all filtered out are skipped with a
    warning, because there is then no local authority to name the output after.

    Raises:
        ValueError: If a single certificates file contains more than one
            distinct ``LOCAL_AUTHORITY`` value.
    """
    logger = logging.getLogger(__name__)

    # Rows missing any of these values cannot contribute to the estimates.
    required_columns = [
        "CONSTRUCTION_AGE_BAND",
        "TOTAL_FLOOR_AREA",
        "NUMBER_HABITABLE_ROOMS",
        "FLOOR_HEIGHT",
        "NUMBER_HEATED_ROOMS",
    ]
    aggregations = {
        "NUMBER_HEATED_ROOMS": "median",
        "NUMBER_HABITABLE_ROOMS": "median",
        "TOTAL_FLOOR_AREA": "mean",
        "FLOOR_HEIGHT": "mean",
    }

    directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]

    for directory in tqdm(directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)

        # Keep certificates lodged on or after EARLIEST_EPC_DATE that carry a
        # UPRN. Copy so the column assignments below act on an independent
        # frame instead of a slice of the raw data (no SettingWithCopyWarning).
        keep = (data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE) & data["UPRN"].notna()
        data = data[keep].copy()

        data["TOTAL_FLOOR_AREA"] = data["TOTAL_FLOOR_AREA"].astype(float)
        data["CONSTRUCTION_AGE_BAND"] = data["CONSTRUCTION_AGE_BAND"].apply(
            EPCDataProcessor.clean_construction_age_band
        )

        data = data.dropna(subset=required_columns)
        data = data[~data["CONSTRUCTION_AGE_BAND"].isin(Definitions.DATA_ANOMALY_MATCHES)]

        if data.empty:
            # Nothing left to aggregate, and no local authority to build the
            # output key from.
            logger.warning("No usable certificates in %s; skipping", directory)
            continue

        local_authorities = data["LOCAL_AUTHORITY"].unique()
        if len(local_authorities) > 1:
            raise ValueError("More than one la in data")
        local_authority = local_authorities[0]

        df = data.groupby(GROUPBY).agg(aggregations).reset_index()

        save_dataframe_to_s3_parquet(
            df=df,
            bucket_name=BUCKET,
            file_key=f"property_dimensions/{local_authority}.parquet",
        )