diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index c11e7ae7..ab687d74 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -71,6 +71,7 @@ LATEST_FIELD = [
 MANDATORY_FIXED_FEATURES = [
     "PROPERTY_TYPE",
     "BUILT_FORM",
+    "CONSTITUENCY"
 ]
 
 # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
@@ -139,20 +140,25 @@ def clean_multi_glaze_proportion(df):
     return df
 
 
+def ordinal(n):
+    if 10 <= n % 100 <= 20:
+        suffix = 'th'
+    else:
+        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
+
+    return str(n) + suffix
+
+
 FLOOR_LEVEL_MAP = {
-    "00": 0,
-    "01": 1,
-    "02": 2,
-    "03": 3,
     "Basement": -1,
     "Ground": 0,
-    "1st": 1,
-    "2nd": 2,
-    "3rd": 3,
     "ground floor": 0,
-    # Put in estimates for these
-    "mid floor": 2,
-    "top floor": 4
+    "20+": 20,
+    "21st or above": 21,
+    **{str(i).zfill(2): i for i in range(0, 21)},
+    **{ordinal(i): i for i in range(-1, 21)},
+    **{str(i): i for i in range(-1, 21)},
+    **{i: i for i in range(-1, 21)},
 }
 
 BUILT_FORM_REMAP = {
@@ -170,8 +176,10 @@ def app():
     for directory in tqdm(directories):
         filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
         df = pd.read_csv(filepath, low_memory=False)
+        # UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
         df = df[~pd.isnull(df["UPRN"])]
-
+        # Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
+        # before the introduction of SAP09
         df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
 
         cleaning_averages, general_averages = make_cleaning_averages(df)
@@ -182,6 +190,9 @@ def app():
 
         df = clean_multi_glaze_proportion(df)
 
+        # We remove floor level in top floor or mid floor since this is ambiguous
+        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+
         df["UPRN"] = df["UPRN"].astype(int).astype(str)
         counts = df.groupby("UPRN").size().reset_index()
         counts.columns = ["UPRN", "count"]