mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
updated version of data prep code
This commit is contained in:
parent
7d312d5c65
commit
57394feda9
1 changed files with 22 additions and 11 deletions
|
|
@ -71,6 +71,7 @@ LATEST_FIELD = [
|
|||
MANDATORY_FIXED_FEATURES = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTITUENCY"
|
||||
]
|
||||
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCs that were
|
||||
|
|
@ -139,20 +140,25 @@ def clean_multi_glaze_proportion(df):
|
|||
return df
|
||||
|
||||
|
||||
def ordinal(n):
    """Return the integer ``n`` followed by its English ordinal suffix.

    Examples: ``1 -> "1st"``, ``2 -> "2nd"``, ``11 -> "11th"``, ``21 -> "21st"``.

    The suffix is derived from the magnitude of ``n``. Python's ``%`` yields a
    non-negative result for a negative left operand (``-1 % 10 == 9``), so
    using ``n`` directly would label ``-1`` as ``"-1th"`` instead of ``"-1st"``.

    Args:
        n: The integer to format. May be negative.

    Returns:
        ``str(n)`` with ``"st"``, ``"nd"``, ``"rd"`` or ``"th"`` appended.
    """
    magnitude = abs(n)

    # Everything ending in 10-20 takes "th"; this covers the irregular
    # 11th, 12th and 13th, which would otherwise become 11st, 12nd, 13rd.
    if 10 <= magnitude % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(magnitude % 10, 'th')

    return str(n) + suffix
|
||||
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
"00": 0,
|
||||
"01": 1,
|
||||
"02": 2,
|
||||
"03": 3,
|
||||
"Basement": -1,
|
||||
"Ground": 0,
|
||||
"1st": 1,
|
||||
"2nd": 2,
|
||||
"3rd": 3,
|
||||
"ground floor": 0,
|
||||
# Put in estimates for these
|
||||
"mid floor": 2,
|
||||
"top floor": 4
|
||||
"20+": 20,
|
||||
"21st or above": 21,
|
||||
**{str(i).zfill(2): i for i in range(0, 21)},
|
||||
**{ordinal(i): i for i in range(-1, 21)},
|
||||
**{str(i): i for i in range(-1, 21)},
|
||||
**{i: i for i in range(-1, 21)},
|
||||
}
|
||||
|
||||
BUILT_FORM_REMAP = {
|
||||
|
|
@ -170,8 +176,10 @@ def app():
|
|||
for directory in tqdm(directories):
|
||||
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
# UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
|
||||
# Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
|
||||
# before the introduction of SAP09
|
||||
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
cleaning_averages, general_averages = make_cleaning_averages(df)
|
||||
|
|
@ -182,6 +190,9 @@ def app():
|
|||
|
||||
df = clean_multi_glaze_proportion(df)
|
||||
|
||||
# We remove rows whose floor level is "top floor" or "mid floor", since these labels are ambiguous
|
||||
df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
||||
|
||||
df["UPRN"] = df["UPRN"].astype(int).astype(str)
|
||||
counts = df.groupby("UPRN").size().reset_index()
|
||||
counts.columns = ["UPRN", "count"]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue