updated version of data prep code

This commit is contained in:
Khalim Conn-Kowlessar 2023-08-08 15:15:27 +01:00
parent 7d312d5c65
commit 57394feda9

View file

@ -71,6 +71,7 @@ LATEST_FIELD = [
# Column names that make up the "mandatory fixed" feature set.
# NOTE(review): the code consuming this list is outside this view — presumably
# records must have these columns populated; confirm against the caller.
MANDATORY_FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
]
# For particularly old EPC data, we have inconsistent records so we'll only include EPCs that were
@ -139,20 +140,25 @@ def clean_multi_glaze_proportion(df):
return df
def ordinal(n: int) -> str:
    """Return *n* rendered with its English ordinal suffix.

    Examples: ``1 -> "1st"``, ``2 -> "2nd"``, ``3 -> "3rd"``, ``4 -> "4th"``,
    ``11 -> "11th"``, ``21 -> "21st"``, ``-1 -> "-1st"``.

    Args:
        n: The integer to format. Negative values keep their sign and take the
            suffix of their absolute value.

    Returns:
        The decimal representation of ``n`` followed by "st", "nd", "rd" or "th".
    """
    # Pick the suffix from the magnitude: Python's % yields a non-negative
    # result for negative operands (-1 % 10 == 9), which would otherwise give
    # every negative number a "th" suffix.
    magnitude = abs(n)
    if 11 <= magnitude % 100 <= 13:
        # 11, 12 and 13 are irregular: "11th", not "11st".
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(magnitude % 10, "th")
    return str(n) + suffix
# Maps the raw floor-level values found in the data to an integer floor number.
# Later entries win when a key is repeated; the explicit "00"-"03" and
# "1st"-"3rd" entries below are repeated by the generated entries further down
# with the same values, so the duplication is harmless.
FLOOR_LEVEL_MAP = {
    "00": 0,
    "01": 1,
    "02": 2,
    "03": 3,
    "Basement": -1,
    "Ground": 0,
    "1st": 1,
    "2nd": 2,
    "3rd": 3,
    "ground floor": 0,
    # Put in estimates for these
    "mid floor": 2,
    "top floor": 4,
    # Open-ended labels are mapped to the number they mention.
    "20+": 20,
    "21st or above": 21,
    # Zero-padded strings: "00" ... "20".
    **{str(i).zfill(2): i for i in range(0, 21)},
    # Ordinal strings as produced by ordinal(), for -1 ... 20.
    **{ordinal(i): i for i in range(-1, 21)},
    # Plain numeric strings: "-1", "0" ... "20".
    **{str(i): i for i in range(-1, 21)},
    # Values that are already integers map to themselves.
    **{i: i for i in range(-1, 21)},
}
BUILT_FORM_REMAP = {
@ -170,8 +176,10 @@ def app():
for directory in tqdm(directories):
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
df = pd.read_csv(filepath, low_memory=False)
# UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
df = df[~pd.isnull(df["UPRN"])]
# Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
# before the introduction of SAP09
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
cleaning_averages, general_averages = make_cleaning_averages(df)
@ -182,6 +190,9 @@ def app():
df = clean_multi_glaze_proportion(df)
# We drop rows whose floor level is "top floor" or "mid floor", since those values are ambiguous
df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
df["UPRN"] = df["UPRN"].astype(int).astype(str)
counts = df.groupby("UPRN").size().reset_index()
counts.columns = ["UPRN", "count"]