mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Refactored Path, organised preprocessing steps for df
This commit is contained in:
parent
07af58eb85
commit
bac9c2e6ae
1 changed files with 81 additions and 38 deletions
|
|
@ -3,13 +3,21 @@ import os
|
|||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from model_data.BaseUtility import BaseUtility
|
||||
|
||||
# from BaseUtility import BaseUtility  # alternative import, needed when running this file from a different folder
|
||||
from pathlib import Path
|
||||
|
||||
def list_subdirectories(directory_path):
    """
    Return the immediate subdirectories of ``directory_path`` as ``Path`` objects.

    ``directory_path`` may be a ``str`` or a ``Path``. Every returned entry already
    includes the parent directory, so it can be joined directly with a file name
    (e.g. ``entry / "certificates.csv"``). Plain files are skipped and the entries
    come back in whatever order the file system yields them.
    """
    # Wrapping in Path() keeps string callers working while guaranteeing Path results.
    return [entry for entry in Path(directory_path).iterdir() if entry.is_dir()]
|
||||
|
||||
# Folder that holds the certificate data. It is anchored on this file's location
# rather than on the current working directory, so it resolves to the same place
# wherever the script is launched from, and it stays a Path so it can be joined
# with the "/" operator.
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

# WINDOWS_DESCRIPTION values that are treated as "fully glazed" when cleaning
# MULTI_GLAZE_PROPORTION.
FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]
|
||||
|
||||
FIXED_FEATURES = [
|
||||
'PROPERTY_TYPE',
|
||||
|
|
@ -122,20 +130,13 @@ def iterative_filtering(cleaning_averages, property_data):
|
|||
return filtered_data
|
||||
|
||||
|
||||
def clean_multi_glaze_proportion(df: pd.DataFrame, *, fully_glazed_descriptions=None) -> pd.DataFrame:
    """
    If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100

    df must contain the MULTI_GLAZE_PROPORTION and WINDOWS_DESCRIPTION columns.
    fully_glazed_descriptions optionally overrides the WINDOWS_DESCRIPTION values
    that count as fully glazed; when omitted, FULLY_GLAZED_DESCRIPTIONS is used.

    The column is updated in place and the same dataframe is returned. Rows that
    already have a proportion, or whose description is not in the list, are left
    untouched.
    """
    if fully_glazed_descriptions is None:
        fully_glazed_descriptions = FULLY_GLAZED_DESCRIPTIONS

    missing_but_fully_glazed = (
        df["MULTI_GLAZE_PROPORTION"].isnull()
        & df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)
    )

    # Deliberately a plain statement: writing `df = df.loc[...] = 100` would
    # rebind df to the integer 100 before the .loc assignment is attempted.
    df.loc[missing_but_fully_glazed, "MULTI_GLAZE_PROPORTION"] = 100

    return df
|
||||
|
||||
|
|
@ -167,6 +168,59 @@ BUILT_FORM_REMAP = {
|
|||
}
|
||||
|
||||
|
||||
def confine_data(df: pd.DataFrame, *, earliest_epc_date=None) -> pd.DataFrame:
    """
    Include all steps that reduce the data down based on our assumptions

    df must contain the UPRN, LODGEMENT_DATE, TRANSACTION_TYPE and FLOOR_LEVEL
    columns. earliest_epc_date optionally overrides the lodgement cut-off; when
    omitted, EARLIEST_EPC_DATE is used.

    Returns the rows that pass every filter, keeping their original index labels.
    All masks are built from the same frame and combined before a single
    selection, so no mask is ever applied to a frame with a different index.
    """
    if earliest_epc_date is None:
        earliest_epc_date = EARLIEST_EPC_DATE

    # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
    has_uprn = df["UPRN"].notnull()

    # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
    # before the introduction of SAP09
    lodged_since_cutoff = df["LODGEMENT_DATE"] >= earliest_epc_date

    # Filter 3: We remove EPCs that were conducted for a new build, since these are performed with
    # full SAP, which produces different results to the RdSAP methodology
    is_existing_dwelling = df["TRANSACTION_TYPE"] != "new dwelling"

    # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
    has_unambiguous_floor = ~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])

    return df[has_uprn & lodged_since_cutoff & is_existing_dwelling & has_unambiguous_floor]
|
||||
|
||||
def retain_multiple_epc_properties(df: pd.DataFrame, minimum_count: int = 1) -> pd.DataFrame:
    """
    Reduce the data further by keeping only properties with multiple EPCs

    A row is kept when its UPRN occurs strictly more than minimum_count times in
    df. The number of rows per UPRN is attached to every retained row as a
    "count" column.
    """
    epcs_per_uprn = df.groupby("UPRN").size()

    # take UPRNS with multiple EPCs
    repeated = epcs_per_uprn[epcs_per_uprn > minimum_count].reset_index()
    repeated.columns = ["UPRN", "count"]

    return pd.merge(df, repeated, on="UPRN")
|
||||
|
||||
def recast_df_columns(df: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
    """
    Recast columns from the dataframe to ensure the behaviour we want

    column_mappings maps a column name to a sequence of dtypes that are applied
    one after the other, e.g. {'UPRN': [int, str]} casts to int first and then
    to str.

    The columns are replaced on df itself and the same dataframe is returned.
    Raises SystemExit (non-zero status) naming the offending columns if any key
    of column_mappings is not a column of df.
    """
    # Check every key first so a bad mapping is reported before any column is touched.
    missing = [column for column in column_mappings if column not in df.columns]
    if missing:
        # Raised directly instead of calling exit(): that helper is injected by the
        # site module for interactive sessions and is not guaranteed to exist.
        raise SystemExit(f"Column mapping incorrectly specified: {missing} not found in dataframe")

    for column, dtypes in column_mappings.items():
        for dtype in dtypes:
            df[column] = df[column].astype(dtype)

    return df
|
||||
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
|
|
@ -177,35 +231,20 @@ def app():
|
|||
|
||||
dataset = []
|
||||
for directory in tqdm(directories):
|
||||
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
|
||||
|
||||
filepath = directory / "certificates.csv"
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
# UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
# Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
|
||||
# before the introduction of SAP09
|
||||
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
cleaning_averages, general_averages = make_cleaning_averages(df)
|
||||
|
||||
# We remove EPCS that were conducted for a new build, since these are performed with
|
||||
# full SAP, which produces different results to the RdSAP methodology
|
||||
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
|
||||
df = confine_data(df)
|
||||
df = recast_df_columns(df, {'UPRN': [int, str]})
|
||||
|
||||
df = clean_multi_glaze_proportion(df)
|
||||
df = retain_multiple_epc_properties(df, minimum_count=1)
|
||||
|
||||
# We remove floor level in top floor or mid floor since this is ambiguous
|
||||
df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
||||
|
||||
df["UPRN"] = df["UPRN"].astype(int).astype(str)
|
||||
counts = df.groupby("UPRN").size().reset_index()
|
||||
counts.columns = ["UPRN", "count"]
|
||||
counts = counts.sort_values("count", ascending=False)
|
||||
|
||||
# take UPRNS with multiple EPCs
|
||||
counts = counts[counts["count"] > 1]
|
||||
df = df[df["UPRN"].isin(counts["UPRN"])]
|
||||
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
|
||||
cleaning_averages, general_averages = make_cleaning_averages(df)
|
||||
|
||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||
|
||||
# Fixed features - these are property attributes that shouldn't change over time
|
||||
|
|
@ -305,3 +344,7 @@ def app():
|
|||
)
|
||||
|
||||
dataset.extend(property_model_data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
Loading…
Add table
Reference in a new issue