mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
push code to generate recommendations pickle file
This commit is contained in:
parent
a66b378295
commit
fae578ba3a
5 changed files with 476 additions and 427 deletions
|
|
@ -56,8 +56,11 @@ construction_age_remap = {
|
|||
|
||||
expanded_map = {
|
||||
i: [
|
||||
label for label, bounds in construction_age_bounds_map.items() if (i <= bounds["u"]) and (i >= bounds['l'])
|
||||
][0] for i in range(0, 3001)
|
||||
label
|
||||
for label, bounds in construction_age_bounds_map.items()
|
||||
if (i <= bounds["u"]) and (i >= bounds["l"])
|
||||
][0]
|
||||
for i in range(0, 3001)
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -74,8 +77,13 @@ class EPCDataProcessor:
|
|||
Handle data loading and data preprocessing
|
||||
"""
|
||||
|
||||
def __init__(self, data: pd.DataFrame | None = None, cleaning_averages: pd.DataFrame | None = None,
|
||||
run_mode: str = "training", violation_mode: bool = False) -> None:
|
||||
def __init__(
|
||||
self,
|
||||
data: pd.DataFrame | None = None,
|
||||
cleaning_averages: pd.DataFrame | None = None,
|
||||
run_mode: str = "training",
|
||||
violation_mode: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
:param filepath: If specified, is the physical location of the data
|
||||
:param is_newdata: Indicates if we are processing new, testing data.
|
||||
|
|
@ -86,7 +94,9 @@ class EPCDataProcessor:
|
|||
self.data: pd.DataFrame = data if is_data_a_dataframe else pd.DataFrame()
|
||||
|
||||
is_cleaning_averages_a_dataframe = isinstance(cleaning_averages, pd.DataFrame)
|
||||
self.cleaning_averages: pd.DataFrame = cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
|
||||
self.cleaning_averages: pd.DataFrame = (
|
||||
cleaning_averages if is_cleaning_averages_a_dataframe else pd.DataFrame()
|
||||
)
|
||||
|
||||
# FOR NOW IF VIOLATION MODE IS ON, WE USE RUN MODE AS NEWDATA
|
||||
self.violation_mode = violation_mode
|
||||
|
|
@ -103,7 +113,9 @@ class EPCDataProcessor:
|
|||
ignore_step = True if self.run_mode == "newdata" else False
|
||||
|
||||
if filepath is not None:
|
||||
self.load_data(filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
||||
self.load_data(
|
||||
filepath=filepath, low_memory=DATA_PROCESSOR_SETTINGS["low_memory"]
|
||||
)
|
||||
|
||||
if len(self.data) == 0:
|
||||
raise Exception("No data to process - check filepath/ data being passed in")
|
||||
|
|
@ -121,7 +133,8 @@ class EPCDataProcessor:
|
|||
self.clean_multi_glaze_proportion(ignore_step=ignore_step)
|
||||
self.clean_photo_supply()
|
||||
self.retain_multiple_epc_properties(
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"], ignore_step=ignore_step
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"],
|
||||
ignore_step=ignore_step,
|
||||
)
|
||||
|
||||
self.fill_na_fields()
|
||||
|
|
@ -188,7 +201,9 @@ class EPCDataProcessor:
|
|||
if ignore_step:
|
||||
return
|
||||
|
||||
self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
|
||||
self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[
|
||||
0
|
||||
]
|
||||
|
||||
def fill_invalid_constituency_fields(self, ignore_step: bool = False):
|
||||
"""
|
||||
|
|
@ -201,7 +216,9 @@ class EPCDataProcessor:
|
|||
if ignore_step:
|
||||
return
|
||||
|
||||
self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]})
|
||||
self.data = self.data.fillna(
|
||||
{"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}
|
||||
)
|
||||
|
||||
def sort_data_by_uprn_lodgement_date(self, ignore_step: bool = False):
|
||||
"""
|
||||
|
|
@ -301,7 +318,7 @@ class EPCDataProcessor:
|
|||
"""
|
||||
|
||||
if self.violation_mode:
|
||||
# TODO: to fill in
|
||||
# TODO: to fill in
|
||||
return
|
||||
|
||||
if ignore_step:
|
||||
|
|
@ -311,9 +328,7 @@ class EPCDataProcessor:
|
|||
lambda x: self.clean_construction_age_band(x)
|
||||
)
|
||||
|
||||
self.data = self.data[
|
||||
~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])
|
||||
]
|
||||
self.data = self.data[~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])]
|
||||
|
||||
def clean_missing_rooms(self, ignore_step: bool = False):
|
||||
"""
|
||||
|
|
@ -331,31 +346,45 @@ class EPCDataProcessor:
|
|||
return
|
||||
|
||||
# TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning)
|
||||
self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0])
|
||||
self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(
|
||||
lambda x: x.split(" ")[0]
|
||||
)
|
||||
|
||||
def apply_clean(data, matching_columns):
|
||||
|
||||
cleaning_data = data[~pd.isnull(data[col])].groupby(
|
||||
matching_columns
|
||||
)[col].median().reset_index()
|
||||
|
||||
data = data.merge(
|
||||
cleaning_data, how="left", on=matching_columns, suffixes=("", "_CLEANING")
|
||||
cleaning_data = (
|
||||
data[~pd.isnull(data[col])]
|
||||
.groupby(matching_columns)[col]
|
||||
.median()
|
||||
.reset_index()
|
||||
)
|
||||
|
||||
data[col] = np.where(pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col])
|
||||
data = data.merge(
|
||||
cleaning_data,
|
||||
how="left",
|
||||
on=matching_columns,
|
||||
suffixes=("", "_CLEANING"),
|
||||
)
|
||||
|
||||
data[col] = np.where(
|
||||
pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col]
|
||||
)
|
||||
data = data.drop(columns=f"{col}_CLEANING")
|
||||
return data
|
||||
|
||||
for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
|
||||
|
||||
to_index = 3
|
||||
matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "POSTAL_AREA"]
|
||||
matching_columns = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
"CONSTRUCTION_AGE_BAND",
|
||||
"POSTAL_AREA",
|
||||
]
|
||||
has_missings = pd.isnull(self.data[col]).sum()
|
||||
while has_missings:
|
||||
self.data = apply_clean(
|
||||
data=self.data,
|
||||
matching_columns=matching_columns[0:to_index + 1]
|
||||
data=self.data, matching_columns=matching_columns[0 : to_index + 1]
|
||||
)
|
||||
has_missings = pd.isnull(self.data[col]).sum()
|
||||
|
||||
|
|
@ -363,7 +392,10 @@ class EPCDataProcessor:
|
|||
# Check if we've gotten to index 0 and still have missings - something has gone wrong or
|
||||
# we have a very unique property type
|
||||
if has_missings:
|
||||
raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
|
||||
raise NotImplementedError(
|
||||
"Handle this edge case, we still have missings for column %s"
|
||||
% col
|
||||
)
|
||||
|
||||
break
|
||||
to_index -= 1
|
||||
|
|
@ -410,7 +442,7 @@ class EPCDataProcessor:
|
|||
# coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else
|
||||
# COLUMNTYPES
|
||||
# for k, v in coltypes.items():
|
||||
# self.data[k] = self.data[k].astype(v)
|
||||
# self.data[k] = self.data[k].astype(v)
|
||||
# self.data = self.data.astype(coltypes)
|
||||
|
||||
# self.na_remapping()
|
||||
|
|
@ -437,9 +469,11 @@ class EPCDataProcessor:
|
|||
|
||||
def na_remapping(self, auto_subset_columns: bool = False):
|
||||
|
||||
fill_na_map_apply = {
|
||||
k: v for k, v in fill_na_map.items() if k in self.data.columns
|
||||
} if auto_subset_columns else fill_na_map
|
||||
fill_na_map_apply = (
|
||||
{k: v for k, v in fill_na_map.items() if k in self.data.columns}
|
||||
if auto_subset_columns
|
||||
else fill_na_map
|
||||
)
|
||||
|
||||
for column, fill_value in fill_na_map_apply.items():
|
||||
self.data[column] = self.data[column].fillna(fill_value)
|
||||
|
|
@ -535,28 +569,34 @@ class EPCDataProcessor:
|
|||
|
||||
for variable in AVERAGE_FIXED_FEATURES:
|
||||
# Replace any missing NAN values with averages for the same Property type and built form
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
|
||||
cleaning_averages_filled[f"{variable}_AVERAGE"]
|
||||
)
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
||||
variable
|
||||
].fillna(cleaning_averages_filled[f"{variable}_AVERAGE"])
|
||||
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_AVERAGE")
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(
|
||||
columns=f"{variable}_AVERAGE"
|
||||
)
|
||||
|
||||
# If there are still NA values, i.e. the averages do not have values for a specific group of property type
|
||||
# and built form
|
||||
# We can use just the property type average and replace
|
||||
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
|
||||
cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"]
|
||||
)
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
||||
variable
|
||||
].fillna(cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"])
|
||||
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_PROPERTY_AVERAGE")
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(
|
||||
columns=f"{variable}_PROPERTY_AVERAGE"
|
||||
)
|
||||
|
||||
# If there are still NA values, use BUILT FORM averages
|
||||
cleaning_averages_filled["variable"] = cleaning_averages_filled[variable].fillna(
|
||||
cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"]
|
||||
)
|
||||
cleaning_averages_filled["variable"] = cleaning_averages_filled[
|
||||
variable
|
||||
].fillna(cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"])
|
||||
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
|
||||
cleaning_averages_filled = cleaning_averages_filled.drop(
|
||||
columns=f"{variable}_BUILT_FORM_AVERAGE"
|
||||
)
|
||||
|
||||
# If there are still NA values, use the average across all EPCs in the constituency
|
||||
cleaning_averages_filled[variable] = cleaning_averages_filled[
|
||||
|
|
@ -573,7 +613,9 @@ class EPCDataProcessor:
|
|||
|
||||
self.cleaning_averages = cleaning_averages_filled
|
||||
|
||||
def retain_multiple_epc_properties(self, epc_minimum_count: int = 1, ignore_step: bool = False) -> None:
|
||||
def retain_multiple_epc_properties(
|
||||
self, epc_minimum_count: int = 1, ignore_step: bool = False
|
||||
) -> None:
|
||||
"""
|
||||
Reduce the data further by keeping only datasets with multiple epcs
|
||||
"""
|
||||
|
|
@ -592,12 +634,16 @@ class EPCDataProcessor:
|
|||
counts = counts[counts["count"] > epc_minimum_count]
|
||||
self.data = pd.merge(self.data, counts, on="UPRN")
|
||||
|
||||
def recast_df_columns(self, column_mappings: dict, auto_subset_columns: bool = False) -> None:
|
||||
def recast_df_columns(
|
||||
self, column_mappings: dict, auto_subset_columns: bool = False
|
||||
) -> None:
|
||||
"""
|
||||
Recast columns from the dataframe to ensure the behaviour we want
|
||||
"""
|
||||
if auto_subset_columns:
|
||||
column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns}
|
||||
column_mappings = {
|
||||
k: v for k, v in column_mappings.items() if k in self.data.columns
|
||||
}
|
||||
|
||||
for key, values in column_mappings.items():
|
||||
if key not in self.data.columns:
|
||||
|
|
@ -608,13 +654,17 @@ class EPCDataProcessor:
|
|||
else:
|
||||
self.data[key] = self.data[key].astype(values)
|
||||
|
||||
def recast_all_data(self, column_mappings: dict, auto_subset_columns: bool = False) -> None:
|
||||
def recast_all_data(
|
||||
self, column_mappings: dict, auto_subset_columns: bool = False
|
||||
) -> None:
|
||||
"""
|
||||
Using a dictionary to recast all columns at once
|
||||
"""
|
||||
|
||||
if auto_subset_columns:
|
||||
column_mappings = {k: v for k, v in column_mappings.items() if k in self.data.columns}
|
||||
column_mappings = {
|
||||
k: v for k, v in column_mappings.items() if k in self.data.columns
|
||||
}
|
||||
|
||||
self.data = self.data.astype(column_mappings)
|
||||
|
||||
|
|
@ -625,14 +675,28 @@ class EPCDataProcessor:
|
|||
|
||||
if self.violation_mode:
|
||||
violation_uprn_missing = pd.isnull(self.data["UPRN"])
|
||||
violation_old_lodgment_date = self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE
|
||||
violation_invalid_transaction_type = self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES
|
||||
violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)
|
||||
violation_old_lodgment_date = (
|
||||
self.data["LODGEMENT_DATE"] < EARLIEST_EPC_DATE
|
||||
)
|
||||
violation_invalid_transaction_type = (
|
||||
self.data["TRANSACTION_TYPE"] == IGNORED_TRANSACTION_TYPES
|
||||
)
|
||||
violation_ignored_floor_level = self.data["FLOOR_LEVEL"].isin(
|
||||
IGNORED_FLOOR_LEVELS
|
||||
)
|
||||
violation_rdsap_score_above_max = self.data[RDSAP_RESPONSE] > MAX_SAP_SCORE
|
||||
violation_missing_windows_description = pd.isnull(self.data["WINDOWS_DESCRIPTION"])
|
||||
violation_missing_hotwater_description = pd.isnull(self.data["HOTWATER_DESCRIPTION"])
|
||||
violation_missing_roof_description = pd.isnull(self.data["ROOF_DESCRIPTION"])
|
||||
violation_invalid_property_type = self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES
|
||||
violation_missing_windows_description = pd.isnull(
|
||||
self.data["WINDOWS_DESCRIPTION"]
|
||||
)
|
||||
violation_missing_hotwater_description = pd.isnull(
|
||||
self.data["HOTWATER_DESCRIPTION"]
|
||||
)
|
||||
violation_missing_roof_description = pd.isnull(
|
||||
self.data["ROOF_DESCRIPTION"]
|
||||
)
|
||||
violation_invalid_property_type = (
|
||||
self.data["PROPERTY_TYPE"] == IGNORED_PROPERTY_TYPES
|
||||
)
|
||||
violation_invalid_tenure = self.data["TENURE"].isin(IGNORED_TENURES)
|
||||
|
||||
violation_df = pd.concat(
|
||||
|
|
@ -647,7 +711,8 @@ class EPCDataProcessor:
|
|||
violation_missing_roof_description,
|
||||
violation_invalid_property_type,
|
||||
violation_invalid_tenure,
|
||||
], axis=1,
|
||||
],
|
||||
axis=1,
|
||||
keys=[
|
||||
"violation_uprn_missing",
|
||||
"violation_old_lodgment_date",
|
||||
|
|
@ -658,8 +723,8 @@ class EPCDataProcessor:
|
|||
"violation_missing_hotwater_description",
|
||||
"violation_missing_roof_description",
|
||||
"violation_invalid_property_type",
|
||||
"violation_invalid_tenure"
|
||||
]
|
||||
"violation_invalid_tenure",
|
||||
],
|
||||
)
|
||||
|
||||
self.data = pd.concat([self.data, violation_df], axis=1)
|
||||
|
|
@ -685,10 +750,10 @@ class EPCDataProcessor:
|
|||
|
||||
self.data = self.data[~pd.isnull(self.data["UPRN"])]
|
||||
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
self.data = self.data[self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES]
|
||||
self.data = self.data[
|
||||
~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)
|
||||
self.data["TRANSACTION_TYPE"] != IGNORED_TRANSACTION_TYPES
|
||||
]
|
||||
self.data = self.data[~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)]
|
||||
self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE]
|
||||
|
||||
# We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them
|
||||
|
|
@ -705,7 +770,10 @@ class EPCDataProcessor:
|
|||
self.data = self.data[~self.data["TENURE"].isin(IGNORED_TENURES)]
|
||||
|
||||
# We remap zero values to None
|
||||
self.data.loc[self.data['FLOOR_HEIGHT'] == 0, 'FLOOR_HEIGHT'] = None
|
||||
self.data.loc[self.data["FLOOR_HEIGHT"] == 0, "FLOOR_HEIGHT"] = None
|
||||
|
||||
# Keep only non zero floor area
|
||||
self.data = self.data[self.data["TOTAL_FLOOR_AREA"] != 0]
|
||||
|
||||
def clean_multi_glaze_proportion(self, ignore_step: bool = False) -> None:
|
||||
"""
|
||||
|
|
@ -734,7 +802,11 @@ class EPCDataProcessor:
|
|||
|
||||
@staticmethod
|
||||
def apply_averages_cleaning(
|
||||
data_to_clean, cleaning_data, cols_to_merge_on, colnames=None, ignore_step: bool = False
|
||||
data_to_clean,
|
||||
cleaning_data,
|
||||
cols_to_merge_on,
|
||||
colnames=None,
|
||||
ignore_step: bool = False,
|
||||
):
|
||||
"""
|
||||
Clean the input DataFrame using averages from a cleaning DataFrame.
|
||||
|
|
@ -752,12 +824,13 @@ class EPCDataProcessor:
|
|||
|
||||
# The desired colnames to clean - which may not be present
|
||||
if colnames is None:
|
||||
colnames = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"]
|
||||
colnames = [
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
]
|
||||
|
||||
cols_to_clean = [
|
||||
c for c in colnames if
|
||||
c in data_to_clean.columns
|
||||
]
|
||||
cols_to_clean = [c for c in colnames if c in data_to_clean.columns]
|
||||
|
||||
# Enforce data types
|
||||
for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]:
|
||||
|
|
@ -768,7 +841,15 @@ class EPCDataProcessor:
|
|||
|
||||
# Calculate averages
|
||||
cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg(
|
||||
dict(zip(cols_to_clean, ["mean", ] * len(cols_to_clean)))
|
||||
dict(
|
||||
zip(
|
||||
cols_to_clean,
|
||||
[
|
||||
"mean",
|
||||
]
|
||||
* len(cols_to_clean),
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Merge with the original data
|
||||
|
|
@ -777,7 +858,7 @@ class EPCDataProcessor:
|
|||
cleaning_averages_to_merge,
|
||||
on=columns_to_merge_on,
|
||||
suffixes=("", "_AVERAGE"),
|
||||
how='left'
|
||||
how="left",
|
||||
)
|
||||
|
||||
global_averages = cleaning_data[cols_to_clean].mean()
|
||||
|
|
@ -806,14 +887,20 @@ class EPCDataProcessor:
|
|||
raise Exception("Suffix should be one of _starting or _ending")
|
||||
|
||||
if suffix == "_STARTING":
|
||||
starting_cols = self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES].copy().add_suffix(suffix)
|
||||
starting_cols = (
|
||||
self.data[STARTING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
|
||||
.copy()
|
||||
.add_suffix(suffix)
|
||||
)
|
||||
fixed_cols = self.data[NO_SUFFIX_COMPONENT_COLS + POTENTIAL_COLUMNS].copy()
|
||||
|
||||
return pd.concat([starting_cols, fixed_cols], axis=1)
|
||||
|
||||
return self.data[
|
||||
ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES
|
||||
].copy().add_suffix(suffix)
|
||||
return (
|
||||
self.data[ENDING_SUFFIX_COMPONENT_COLS + EFFICIENCY_FEATURES]
|
||||
.copy()
|
||||
.add_suffix(suffix)
|
||||
)
|
||||
|
||||
def get_fixed_features(self) -> pd.DataFrame:
|
||||
"""
|
||||
|
|
@ -831,14 +918,17 @@ class EPCDataProcessor:
|
|||
:param cols_to_ignore: If specified, is a list of columns to ignore, e.g. uuids
|
||||
:return: DataFrame with coerced columns.
|
||||
"""
|
||||
object_columns = df.select_dtypes(include=['object']).columns
|
||||
object_columns = df.select_dtypes(include=["object"]).columns
|
||||
if cols_to_ignore:
|
||||
object_columns = [c for c in object_columns if c not in cols_to_ignore]
|
||||
|
||||
for column in object_columns:
|
||||
unique_values = df[column].dropna().unique()
|
||||
# If the unique values in the column are 'True' and 'False', convert the column to boolean
|
||||
if set(unique_values) == {'True', 'False'} or set(unique_values) == {True, False}:
|
||||
if set(unique_values) == {"True", "False"} or set(unique_values) == {
|
||||
True,
|
||||
False,
|
||||
}:
|
||||
df[column] = df[column].astype(bool)
|
||||
|
||||
return df
|
||||
|
|
@ -877,7 +967,6 @@ class EPCDataProcessor:
|
|||
|
||||
@staticmethod
|
||||
def clean_efficiency_variables(df):
|
||||
|
||||
"""
|
||||
There is scope to clean this by the model per corresponding description.
|
||||
E.g. for WALLS_ENG_EFF we could look at the mode efficiency rating by description and
|
||||
|
|
|
|||
|
|
@ -87,6 +87,8 @@ class EPCPipeline:
|
|||
epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_rooms.parquet",
|
||||
epc_compiled_dataset_key="sap_change_model/{}/dataset_rooms.parquet",
|
||||
use_parallel=False,
|
||||
use_recommendations=False,
|
||||
epc_recommendations_file="recommendations.csv",
|
||||
):
|
||||
"""
|
||||
:param directories: List of directories to process
|
||||
|
|
@ -101,6 +103,7 @@ class EPCPipeline:
|
|||
self.compiled_dataset: pd.DataFrame = pd.DataFrame()
|
||||
self.compiled_all_equal_rows: list = []
|
||||
self.compiled_cleaning_averages: list = []
|
||||
self.recommendation_dataset: pd.DataFrame = pd.DataFrame()
|
||||
|
||||
self.directories = directories
|
||||
self.epc_data_processor = epc_data_processor
|
||||
|
|
@ -109,6 +112,9 @@ class EPCPipeline:
|
|||
self.epc_local_file = epc_local_file
|
||||
self.epc_bucket_name = epc_bucket_name
|
||||
|
||||
self.use_recommendations = use_recommendations
|
||||
self.epc_recommendations_file = epc_recommendations_file
|
||||
|
||||
self.use_parallel = use_parallel
|
||||
self.timeprefix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
||||
|
||||
|
|
@ -194,6 +200,9 @@ class EPCPipeline:
|
|||
self.compiled_dataset = pd.concat(
|
||||
[self.compiled_dataset, result["dataset"]]
|
||||
)
|
||||
self.recommendation_dataset = pd.concat(
|
||||
[self.recommendation_dataset, result["recommendation_dataset"]]
|
||||
)
|
||||
self.compiled_cleaning_averages.append(result["cleaning_averages"])
|
||||
self.compiled_all_equal_rows.extend(result["all_equal_rows"])
|
||||
|
||||
|
|
@ -208,6 +217,7 @@ class EPCPipeline:
|
|||
"dataset": self.compiled_dataset,
|
||||
"cleaning_averages": self.epc_data_processor.cleaning_averages,
|
||||
"all_equal_rows": self.compiled_all_equal_rows,
|
||||
"recommendation_dataset": self.recommendation_dataset,
|
||||
}
|
||||
|
||||
return output
|
||||
|
|
@ -224,15 +234,54 @@ class EPCPipeline:
|
|||
|
||||
constituency_data = self.epc_data_processor.data
|
||||
|
||||
if self.use_recommendations:
|
||||
|
||||
# Use only the most recent epc for each uprn
|
||||
constituency_data = constituency_data.sort_values(
|
||||
"lodgement_date", ascending=False
|
||||
).drop_duplicates("uprn")
|
||||
|
||||
recommendations_filepath = directory / self.epc_recommendations_file
|
||||
recommendations_df = pd.read_csv(recommendations_filepath)
|
||||
|
||||
recommendations_df = recommendations_df[
|
||||
recommendations_df["IMPROVEMENT_ID"].notnull()
|
||||
]
|
||||
recommendations_df["IMPROVEMENT_ID"] = recommendations_df[
|
||||
"IMPROVEMENT_ID"
|
||||
].astype(int)
|
||||
recommendations_df.columns = recommendations_df.columns.str.lower()
|
||||
|
||||
# Get all recommendations for all properties in the constituency (after cleaning)
|
||||
recommendations_df = recommendations_df.merge(
|
||||
constituency_data[["lmk_key", "uprn"]], on="lmk_key", how="inner"
|
||||
)
|
||||
|
||||
# Keep all properties that have recommendations
|
||||
constituency_data = constituency_data[
|
||||
constituency_data["lmk_key"].isin(recommendations_df["lmk_key"])
|
||||
]
|
||||
|
||||
# In order to create a difference record, we repeat each row for each uprn
|
||||
constituency_data = pd.concat(
|
||||
[constituency_data, constituency_data]
|
||||
).reset_index(drop=True)
|
||||
constituency_data = constituency_data.sort_values("uprn")
|
||||
|
||||
self.compiled_cleaning_averages.append(
|
||||
self.epc_data_processor.cleaning_averages
|
||||
)
|
||||
|
||||
constituency_difference_records = []
|
||||
|
||||
require_adequate_data_check = False if self.use_recommendations else True
|
||||
|
||||
for uprn, property_data in constituency_data.groupby("uprn", observed=True):
|
||||
difference_records = self.process_uprn(
|
||||
uprn=str(uprn), property_data=property_data, directory=directory
|
||||
uprn=str(uprn),
|
||||
property_data=property_data,
|
||||
directory=directory,
|
||||
require_adequate_data_check=require_adequate_data_check,
|
||||
)
|
||||
if difference_records is not None:
|
||||
constituency_difference_records.extend(difference_records)
|
||||
|
|
@ -245,7 +294,18 @@ class EPCPipeline:
|
|||
[self.compiled_dataset, constituency_dataset.df]
|
||||
)
|
||||
|
||||
def process_uprn(self, uprn: str, property_data: pd.DataFrame, directory: Path):
|
||||
if self.use_recommendations:
|
||||
self.recommendation_dataset = pd.concat(
|
||||
[self.recommendation_dataset, recommendations_df]
|
||||
)
|
||||
|
||||
def process_uprn(
|
||||
self,
|
||||
uprn: str,
|
||||
property_data: pd.DataFrame,
|
||||
directory: Path,
|
||||
require_adequate_data_check: bool = True,
|
||||
):
|
||||
"""
|
||||
Process a single UPRN, which may have multiple different EPCs
|
||||
:param uprn: UPRN
|
||||
|
|
@ -279,13 +339,18 @@ class EPCPipeline:
|
|||
|
||||
# We can use multiple types of comparison datasets - i.e. Compare consecutive records, or compare all permutations of records
|
||||
property_difference_records = self._generate_property_difference_records(
|
||||
epc_records, uprn, directory, fixed_data
|
||||
epc_records, uprn, directory, fixed_data, require_adequate_data_check
|
||||
)
|
||||
|
||||
return property_difference_records
|
||||
|
||||
def _generate_property_difference_records(
|
||||
self, epc_records: List[EPCRecord], uprn: str, directory: Path, fixed_data: dict
|
||||
self,
|
||||
epc_records: List[EPCRecord],
|
||||
uprn: str,
|
||||
directory: Path,
|
||||
fixed_data: dict,
|
||||
require_adequate_data_check: bool = True,
|
||||
):
|
||||
"""
|
||||
We can use multiple types of comparison datasets, for example:
|
||||
|
|
@ -301,7 +366,12 @@ class EPCPipeline:
|
|||
# property_difference_records = self._compare_consecutive_epcs(epc_records, uprn, directory, fixed_data, property_difference_records)
|
||||
|
||||
property_difference_records = self._compare_all_permutation_epcs(
|
||||
epc_records, uprn, directory, fixed_data, property_difference_records
|
||||
epc_records,
|
||||
uprn,
|
||||
directory,
|
||||
fixed_data,
|
||||
property_difference_records,
|
||||
require_adequate_data_check,
|
||||
)
|
||||
|
||||
return property_difference_records
|
||||
|
|
@ -313,6 +383,7 @@ class EPCPipeline:
|
|||
directory: Path,
|
||||
fixed_data: dict,
|
||||
property_difference_records: list,
|
||||
require_adequate_data_check: bool = True,
|
||||
):
|
||||
"""
|
||||
Compare all permutations of EPCs for a given UPRN
|
||||
|
|
@ -337,7 +408,10 @@ class EPCPipeline:
|
|||
|
||||
# TODO: Pull out RDSAP_CHANGE to a variable
|
||||
if difference_record.get("rdsap_change") == 0:
|
||||
if not difference_record.ensure_adequate_data():
|
||||
if (
|
||||
not difference_record.ensure_adequate_data()
|
||||
and require_adequate_data_check
|
||||
):
|
||||
# Rdsap hasn't changed but we have enough data to use this record
|
||||
# i.e. all fields aside from mechanical ventilation are the same
|
||||
# self.check_records.append({"uprn": uprn, "directory_name": directory.name, "difference_record": difference_record, "earliest_record": earliest_record, "latest_record": latest_record})
|
||||
|
|
@ -347,7 +421,7 @@ class EPCPipeline:
|
|||
fields=[x.lower() for x in CORE_COMPONENT_FEATURES]
|
||||
)
|
||||
|
||||
if all_equal:
|
||||
if all_equal and require_adequate_data_check:
|
||||
# Keep track of this for the moment so we can analyse
|
||||
self.compiled_all_equal_rows.append(
|
||||
{"uprn": uprn, "directory_name": directory.name}
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ def main():
|
|||
directories=directories,
|
||||
use_parallel=True,
|
||||
epc_data_processor=EPCDataProcessor(run_mode="training"),
|
||||
use_recommendations=True,
|
||||
)
|
||||
|
||||
epc_pipeline.run()
|
||||
|
|
|
|||
|
|
@ -1,352 +1,223 @@
|
|||
# Pipeline to combine recommendations and certificates data together
|
||||
# Pipeline to load all EPC data similar to EPCPipeline but once data is made into EPCRecord,
|
||||
# we instantiate a Property instance so that we can get both the recommendations and the classification of the
|
||||
# walls, roof and floor (i.e. average, above average etc)
|
||||
|
||||
import os
|
||||
from datetime import datetime
|
||||
import itertools
|
||||
from tqdm import tqdm
|
||||
|
||||
import pandas as pd
|
||||
from etl.epc.Record import EPCRecord
|
||||
from backend.SearchEpc import SearchEpc
|
||||
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from backend.app.config import get_settings
|
||||
from backend.app.db.connection import db_engine
|
||||
from backend.app.db.functions.materials_functions import get_materials
|
||||
|
||||
from backend.app.plan.utils import get_cleaned
|
||||
|
||||
from backend.Property import Property
|
||||
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
|
||||
|
||||
from recommendations.Recommendations import Recommendations
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet, save_dataframe_to_s3_parquet
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
now = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
logger.info("Connecting to db")
|
||||
session = sessionmaker(bind=db_engine)()
|
||||
created_at = datetime.now().isoformat()
|
||||
|
||||
session.begin()
|
||||
logger.info("Getting the inputs")
|
||||
|
||||
cleaning_data = read_dataframe_from_s3_parquet(
|
||||
bucket_name=get_settings().DATA_BUCKET,
|
||||
file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
materials = get_materials(session)
|
||||
cleaned = get_cleaned()
|
||||
|
||||
uprn_filenames = read_dataframe_from_s3_parquet(
|
||||
bucket_name=get_settings().DATA_BUCKET, file_key="spatial/filename_meta.parquet"
|
||||
)
|
||||
photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(
|
||||
bucket=get_settings().DATA_BUCKET
|
||||
)
|
||||
|
||||
scenario_properties_df = pd.read_csv(
|
||||
Path(__file__).parent / "improvement_data_sample.csv"
|
||||
)
|
||||
|
||||
improvement_id_to_check = 1
|
||||
properties_to_check = scenario_properties_df[
|
||||
scenario_properties_df["IMPROVEMENT_ID"] == improvement_id_to_check
|
||||
]
|
||||
|
||||
property_list = []
|
||||
|
||||
for i, row in tdqm(properties_to_check.iterrows()):
|
||||
try:
|
||||
epc_searcher = SearchEpc(
|
||||
address1=row["ADDRESS1"],
|
||||
postcode=row["POSTCODE"],
|
||||
auth_token=get_settings().EPC_AUTH_TOKEN,
|
||||
os_api_key=get_settings().ORDNANCE_SURVEY_API_KEY,
|
||||
)
|
||||
epc_searcher.find_property()
|
||||
|
||||
epc_records = {
|
||||
"original_epc": epc_searcher.newest_epc.copy(),
|
||||
"full_sap_epc": epc_searcher.full_sap_epc.copy(),
|
||||
"old_data": epc_searcher.older_epcs.copy(),
|
||||
}
|
||||
|
||||
prepared_epc = EPCRecord(
|
||||
epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data
|
||||
)
|
||||
|
||||
p = Property(
|
||||
id=prepared_epc.uprn,
|
||||
address=epc_searcher.address_clean,
|
||||
postcode=epc_searcher.postcode_clean,
|
||||
epc_record=prepared_epc,
|
||||
)
|
||||
|
||||
p.get_spatial_data(uprn_filenames)
|
||||
p.get_components(cleaned, photo_supply_lookup, floor_area_decile_thresholds)
|
||||
|
||||
recommender = Recommendations(property_instance=p, materials=materials)
|
||||
property_recommendations = recommender.recommend()
|
||||
|
||||
wall_recommendations = recommender.wall_recomender.recommendations
|
||||
loft_recommendations = recommender.roof_recommender.recommendations
|
||||
solar_recommendations = recommender.solar_recommender.recommendation
|
||||
windows_recommendations = recommender.windows_recommender.recommendation
|
||||
|
||||
p.create_base_difference_epc_record(cleaned_lookup=cleaned)
|
||||
|
||||
property_list.append(p.base_difference_record.df)
|
||||
except:
|
||||
pass
|
||||
|
||||
property_df = pd.concat(property_list)
|
||||
|
||||
property_df["walls_insulation_thickness"]
|
||||
|
||||
scenario_properties = [
|
||||
{
|
||||
"address": "2 South Terrace",
|
||||
"postcode": "NN1 5JY",
|
||||
"lmk-key": "1459796789102016070507274146560098",
|
||||
"measures": [
|
||||
[
|
||||
["internal_wall_insulation"],
|
||||
"11",
|
||||
{"walls_insulation_thickness_ending": "average"},
|
||||
[0],
|
||||
],
|
||||
[
|
||||
["external_wall_insulation"],
|
||||
"10",
|
||||
{"walls_insulation_thickness_ending": "average"},
|
||||
[0],
|
||||
],
|
||||
[["solar", "windows"], "15", {"photo_supply_ending": 50}, [0, 1]],
|
||||
],
|
||||
},
|
||||
{
|
||||
"address": "8 Lindlings",
|
||||
"postcode": "HP1 2HA",
|
||||
"lmk-key": "c14029235739827d5f627dc8aa9bb567d026b267e851e0db0001db24638667b1",
|
||||
"measures": [
|
||||
[
|
||||
["cavity_wall_insulation", "loft_insulation"],
|
||||
"15",
|
||||
{"walls_insulation_thickness_ending": "average"},
|
||||
[0, 1],
|
||||
],
|
||||
],
|
||||
},
|
||||
{
|
||||
"address": "44 Lindlings",
|
||||
"postcode": "HP1 2HE",
|
||||
"lmk-key": "99296a6dda21314fef3a61cda59e441e9a2aacf115eb96f4a0fa85696bf7b117",
|
||||
"measures": [
|
||||
[
|
||||
["cavity_wall_insulation", "loft_insulation"],
|
||||
"15",
|
||||
{"walls_insulation_thickness_ending": "average"},
|
||||
[0, 1],
|
||||
],
|
||||
],
|
||||
},
|
||||
{
|
||||
"address": "46 Chaulden Terrace",
|
||||
"postcode": "HP1 2AN",
|
||||
"lmk-key": "d1e0534be3a44c33003323b21d0e322e3daddc65b5ee71936f89c59ddab96b50",
|
||||
"measures": [
|
||||
[
|
||||
["cavity_wall_insulation", "loft_insulation"],
|
||||
"15",
|
||||
{"walls_insulation_thickness_ending": "average"},
|
||||
[0, 1],
|
||||
],
|
||||
],
|
||||
},
|
||||
{
|
||||
"address": "73 Long Chaulden",
|
||||
"postcode": "HP1 2HX",
|
||||
"lmk-key": "1eae354db522a95188018d9cd0502ed8c609910b6c88f8797d3a25f59b11770a",
|
||||
"measures": [
|
||||
[
|
||||
["cavity_wall_insulation", "loft_insulation"],
|
||||
"15",
|
||||
{"walls_insulation_thickness_ending": "average"},
|
||||
[0, 1],
|
||||
],
|
||||
],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
import multiprocessing as mp
|
||||
import itertools
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import time
|
||||
from etl.epc.DataProcessor import EPCDataProcessor
|
||||
from etl.epc.Pipeline import EPCPipeline
|
||||
|
||||
# Folder, resolved relative to this file, whose immediate sub-directories are
# the inputs handed to the pipeline.
DATA_DIRECTORY = Path(__file__).parent.parent.joinpath(
    "epc", "local_data", "all-domestic-certificates"
)
# Scanned once, when the module is loaded; plain files are ignored.
directories = [child for child in DATA_DIRECTORY.iterdir() if child.is_dir()]
|
||||
# Start with one folder in the local_data directory
|
||||
|
||||
|
||||
class EPCRecommendationsPipeline:
|
||||
def main():
|
||||
"""
|
||||
Orchestration function
|
||||
"""
|
||||
|
||||
SEARCH_POSTCODE_URL = "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
|
||||
BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
|
||||
HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"
|
||||
}
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
|
||||
def __init__(self, directories: list, use_parallel: bool = True):
|
||||
self.directories = directories
|
||||
self.use_parallel = use_parallel
|
||||
# Set up the a new pipeline only up into the EPCRecord stage
|
||||
# So that we can instantiate a Property instance and get the recommendations
|
||||
|
||||
def determine_number_of_improvement_ids(self):
|
||||
with mp.Pool() as pool:
|
||||
results = list(
|
||||
tqdm(
|
||||
pool.imap(self._task_check_number_of_improvement_ids, directories),
|
||||
total=len(directories),
|
||||
),
|
||||
)
|
||||
# directories = directories[0:3]
|
||||
|
||||
results = list(itertools.chain(*results))
|
||||
# epc_pipeline = EPCPipeline(
|
||||
# directories=directories,
|
||||
# use_parallel=True,
|
||||
# epc_data_processor=EPCDataProcessor(run_mode="training"),
|
||||
# )
|
||||
|
||||
self.number_improvement_ids = set(results)
|
||||
# epc_pipeline.run()
|
||||
|
||||
def extract_improvement_description(self):
|
||||
with mp.Pool() as pool:
|
||||
results = list(
|
||||
tqdm(
|
||||
pool.imap(self._task_extract_improvement_description, directories),
|
||||
total=len(directories),
|
||||
),
|
||||
)
|
||||
|
||||
results = pd.concat(results)
|
||||
self.improvement_description_df = results.groupby("IMPROVEMENT_ID").sample(1)
|
||||
|
||||
# improvement_description = self._get_descriptions_of_improvements(
|
||||
# improvement_description_df
|
||||
# )
|
||||
|
||||
# self.improvement_descriptions = improvement_description
|
||||
|
||||
def extract_full_improvement_dataset(self):
|
||||
with mp.Pool() as pool:
|
||||
results = list(
|
||||
tqdm(
|
||||
pool.imap(self._task_extract_full_improvement_dataset, directories),
|
||||
total=len(directories),
|
||||
),
|
||||
)
|
||||
|
||||
results_df = pd.concat(results)
|
||||
|
||||
# Only sample one for each improvement as we just want to hit the find my energy website minimally for now
|
||||
sampled_df = results_df.groupby("IMPROVEMENT_ID").sample(1)
|
||||
|
||||
improvement_description = self._get_descriptions_of_improvements(sampled_df)
|
||||
|
||||
self.improvement_description = improvement_description
|
||||
|
||||
# self.full_improvement_df = sampled_df
|
||||
|
||||
def _task_check_number_of_improvement_ids(self, directory: Path):
|
||||
"""
|
||||
Parallel task for checking the number of improvement ids
|
||||
"""
|
||||
|
||||
recommendations_filepath = directory / "recommendations.csv"
|
||||
recommendations_df = pd.read_csv(recommendations_filepath)
|
||||
|
||||
recommendations_df = recommendations_df[
|
||||
recommendations_df["IMPROVEMENT_ID"].notnull()
|
||||
]
|
||||
recommendations_df["IMPROVEMENT_ID"] = recommendations_df[
|
||||
"IMPROVEMENT_ID"
|
||||
].astype(int)
|
||||
|
||||
output = list(recommendations_df["IMPROVEMENT_ID"].unique())
|
||||
|
||||
return output
|
||||
|
||||
def _task_extract_improvement_description(self, directory: Path) -> pd.DataFrame:
|
||||
"""
|
||||
Parallel task for checking the number of improvement ids
|
||||
"""
|
||||
|
||||
recommendations_filepath = directory / "recommendations.csv"
|
||||
recommendations_df = pd.read_csv(recommendations_filepath)
|
||||
|
||||
recommendations_df = recommendations_df[
|
||||
recommendations_df["IMPROVEMENT_ID"].notnull()
|
||||
]
|
||||
recommendations_df["IMPROVEMENT_ID"] = recommendations_df[
|
||||
"IMPROVEMENT_ID"
|
||||
].astype(int)
|
||||
|
||||
recommendations_df = recommendations_df[
|
||||
~recommendations_df["IMPROVEMENT_SUMMARY_TEXT"].isnull()
|
||||
]
|
||||
|
||||
recommendations_df = (
|
||||
recommendations_df.sort_values("IMPROVEMENT_ID")
|
||||
.groupby("IMPROVEMENT_ID")
|
||||
.head(1)
|
||||
)
|
||||
|
||||
return recommendations_df
|
||||
|
||||
def _task_extract_full_improvement_dataset(self, directory: Path) -> pd.DataFrame:
|
||||
"""
|
||||
Parallel task for checking the number of improvement ids
|
||||
Flow will be get the certificates,
|
||||
Find the latest EPC certificate for the UPRN,
|
||||
Load the recommendations,
|
||||
Merge on the LMK_KEY,
|
||||
"""
|
||||
|
||||
certificates_filepath = directory / "certificates.csv"
|
||||
certificates_df = pd.read_csv(certificates_filepath)
|
||||
|
||||
certificates_df = (
|
||||
certificates_df.sort_values("LODGEMENT_DATE", ascending=False)
|
||||
.groupby("UPRN")
|
||||
.head(1)
|
||||
.reset_index(drop=True)
|
||||
)
|
||||
|
||||
recommendations_filepath = directory / "recommendations.csv"
|
||||
recommendations_df = pd.read_csv(recommendations_filepath)
|
||||
|
||||
recommendations_df = recommendations_df[
|
||||
recommendations_df["IMPROVEMENT_ID"].notnull()
|
||||
]
|
||||
recommendations_df["IMPROVEMENT_ID"] = recommendations_df[
|
||||
"IMPROVEMENT_ID"
|
||||
].astype(int)
|
||||
|
||||
# sampled_df = recommendations_df.groupby("IMPROVEMENT_ID").sample(1)
|
||||
|
||||
output = certificates_df.merge(recommendations_df, on="LMK_KEY", how="inner")
|
||||
|
||||
res = output.groupby("IMPROVEMENT_ID").sample(1)
|
||||
|
||||
return res
|
||||
|
||||
def _get_descriptions_of_improvements(
|
||||
self, improvement_description_df: pd.DataFrame
|
||||
) -> dict[int, str]:
|
||||
"""
|
||||
For each row of the improvement descriptions, get the description of the improvement via web scraping
|
||||
"""
|
||||
|
||||
improvement_description_mapping = {}
|
||||
|
||||
for row in improvement_description_df.itertuples():
|
||||
# time.sleep(1)
|
||||
postcode = row.POSTCODE
|
||||
postcode_input = postcode.replace(" ", "+")
|
||||
postcode_search = self.SEARCH_POSTCODE_URL.format(
|
||||
postcode_input=postcode_input
|
||||
)
|
||||
postcode_response = requests.get(postcode_search, headers=self.HEADERS)
|
||||
|
||||
postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
|
||||
address_links_full = postcode_res.findAll(
|
||||
"a", {"class": "govuk-link", "rel": "nofollow"}
|
||||
)
|
||||
address_links = {
|
||||
element.text.lstrip().rstrip(): self.BASE_ENERGY_URL + element["href"]
|
||||
for element in address_links_full
|
||||
}
|
||||
|
||||
address_links = {k.replace(",", ""): v for k, v in address_links.items()}
|
||||
|
||||
adjusted_address = row.ADDRESS1.replace(",", "")
|
||||
|
||||
address_link = [
|
||||
(k, v) for k, v in address_links.items() if adjusted_address in k
|
||||
]
|
||||
|
||||
if len(address_link) == 0:
|
||||
raise ValueError("Address not found")
|
||||
|
||||
if len(address_link) > 1:
|
||||
split_address_components = adjusted_address.split(" ")
|
||||
for address in address_link:
|
||||
if split_address_components[0] in address[0].split(" "):
|
||||
chosen_epc = address[1]
|
||||
break
|
||||
raise ValueError("Multiple addresses found")
|
||||
else:
|
||||
chosen_epc = address_link[0][1]
|
||||
|
||||
# time.sleep(1)
|
||||
address_response = requests.get(chosen_epc, headers=self.HEADERS)
|
||||
address_res = BeautifulSoup(address_response.text, features="html.parser")
|
||||
|
||||
# epc_certificate = chosen_epc.split('/')[-1]
|
||||
|
||||
# ratings = address_res.find("desc", {"id": "svg-desc"}).text
|
||||
# current_rating = ratings.split(".")[0]
|
||||
# potential_rating = ratings.split(".")[1]
|
||||
|
||||
# new_property_df = pd.DataFrame(
|
||||
# {
|
||||
# "address": [address_link[0][0]],
|
||||
# "epc_certificate": [epc_certificate],
|
||||
# "current_epc_rating": [current_rating.split(" ")[-6]],
|
||||
# "current_epc_efficiency": [current_rating.split(" ")[-1]],
|
||||
# "potential_epc_rating": [potential_rating.split(" ")[-6]],
|
||||
# "potential_epc_efficiency": [potential_rating.split(" ")[-1]],
|
||||
# "LMK_KEY": [row.LMK_KEY],
|
||||
# }
|
||||
# )
|
||||
|
||||
improvements = address_res.find(
|
||||
"div",
|
||||
{"class": "govuk-body printable-area epb-recommended-improvements"},
|
||||
)
|
||||
|
||||
changes = improvements.find_all("h3")
|
||||
changes_impact = improvements.find_all(
|
||||
"dl", {"class": "govuk-summary-list"}
|
||||
)
|
||||
element = list(zip(changes, changes_impact))[row.IMPROVEMENT_ITEM - 1]
|
||||
|
||||
improvement_header = element[0].text
|
||||
|
||||
col_name = improvement_header.split(":")[1].lstrip().rstrip()
|
||||
# cost = element[1].find('dd', {"class": "govuk-summary-list__value"}).text.lstrip().rstrip()
|
||||
|
||||
improvement_description_mapping[row.IMPROVEMENT_ID] = col_name
|
||||
|
||||
return improvement_description_mapping
|
||||
|
||||
|
||||
# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
|
||||
# postcode_input = postcode_input.replace(" ", "+")
|
||||
# postcode_search = SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
|
||||
# postcode_response = requests.get(postcode_search, headers=headers)
|
||||
|
||||
# postcode_res = BeautifulSoup(postcode_response.text)
|
||||
# address_links_full = postcode_res.findAll('a', {'class': 'govuk-link', 'rel': 'nofollow'})
|
||||
# address_links = {element.text.lstrip().rstrip(): BASE_ENERGY_URL + element['href'] for element in address_links_full}
|
||||
# address_input = st.selectbox('Please select an address:', address_links.keys())
|
||||
|
||||
# if address_input is None:
|
||||
# st.stop()
|
||||
|
||||
# chosen_epc = address_links[address_input]
|
||||
|
||||
# st.write("### The EPC Certificate of this property is:")
|
||||
# epc_certificate = chosen_epc.split('/')[-1]
|
||||
# st.write("##### " + epc_certificate)
|
||||
|
||||
# address_response = requests.get(chosen_epc, headers=headers)
|
||||
# address_res = BeautifulSoup(address_response.text)
|
||||
|
||||
# svg = address_res.find("svg", {'class': 'epc-energy-rating-graph'})
|
||||
# render_svg(svg)
|
||||
|
||||
# st.write("## Energy rating - current and potential")
|
||||
# # st.write(address_res.find('desc', {'id': 'svg-desc'}).text)
|
||||
# # st.image(address_res.find_all('svg', {'class': 'epc-energy-rating-graph'})[0])
|
||||
# ratings = address_res.find('desc', {'id': 'svg-desc'}).text
|
||||
|
||||
# st.write('### Current EPC rating')
|
||||
# current_rating = ratings.split(".")[0]
|
||||
# st.write("##### " + current_rating)
|
||||
|
||||
# st.write('### Potential EPC rating')
|
||||
# potential_rating = ratings.split(".")[1]
|
||||
# st.write("##### " + potential_rating)
|
||||
|
||||
# new_property_df = pd.DataFrame(
|
||||
# {'address': [address_input],
|
||||
# 'epc_certificate': [epc_certificate],
|
||||
# 'current_epc_rating': [current_rating.split(' ')[-6]],
|
||||
# 'current_epc_efficiency': [current_rating.split(' ')[-1]],
|
||||
# 'potential_epc_rating': [potential_rating.split(' ')[-6]],
|
||||
# "potential_epc_efficiency": [potential_rating.split(' ')[-1]]}
|
||||
# )
|
||||
|
||||
# st.write('### Changes that can be made:')
|
||||
# improvements = address_res.find('div', {"class": "govuk-body printable-area epb-recommended-improvements"})
|
||||
|
||||
# if improvements is None:
|
||||
# st.write("No changes suggested")
|
||||
# else:
|
||||
# changes = improvements.find_all('h3')
|
||||
# changes_impact = improvements.find_all('dl', {"class": 'govuk-summary-list'})
|
||||
|
||||
# for element in zip(changes, changes_impact):
|
||||
# improvement_header = element[0].text
|
||||
# st.write("#### " + improvement_header)
|
||||
|
||||
# improvement_text = element[1].text
|
||||
# st.write(improvement_text)
|
||||
|
||||
# col_name = improvement_header.split(":")[1]
|
||||
# cost = element[1].find('dd', {"class": "govuk-summary-list__value"}).text.lstrip().rstrip()
|
||||
|
||||
# impact = element[1].find('text', {"class": "govuk-!-font-weight-bold"}).text.split(" ")
|
||||
# impact_num = impact[0]
|
||||
# impact_cat = impact[1]
|
||||
# print(cost)
|
||||
# new_property_df[col_name] = True
|
||||
# # cost_column = col_name + '-cost'
|
||||
# # new_property_df.assign(cost_column=cost)
|
||||
# new_property_df[col_name + '-cost'] = cost
|
||||
# new_property_df[col_name + '-impact_num'] = impact_num
|
||||
# new_property_df[col_name + '-impact_cat'] = impact_cat
|
||||
# st.markdown("---")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: collect the improvement ids, sample the description
    # rows, scrape the descriptions and dump the results as markdown tables.
    e = EPCRecommendationsPipeline(directories=directories, use_parallel=True)
    e.determine_number_of_improvement_ids()
    e.extract_improvement_description()
    e.extract_full_improvement_dataset()

    # Scraped {improvement id: description} mapping. Written to its own file:
    # it used to share the file name of the table below and was overwritten.
    pd.DataFrame.from_dict(
        e.improvement_description, orient="index", columns=["improvement_description"]
    ).to_markdown("improvement_description_scraped.md")

    # Sorted so that the row order of the table is reproducible between runs
    full_id = pd.DataFrame(sorted(e.number_improvement_ids), columns=["IMPROVEMENT_ID"])

    # Right join keeps every known id, including those without a sampled row
    e.improvement_description_df.merge(
        full_id, on="IMPROVEMENT_ID", how="right"
    ).to_markdown("improvement_description.md")
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import numpy as np
|
|||
|
||||
from backend.Property import Property
|
||||
from recommendations.Costs import Costs
|
||||
from recommendation_utils import override_costs
|
||||
from recommendations.recommendation_utils import override_costs
|
||||
|
||||
|
||||
class WindowsRecommendations:
|
||||
|
|
@ -14,7 +14,7 @@ class WindowsRecommendations:
|
|||
# glazed
|
||||
"most": 0.33,
|
||||
# If glazing is partial, we assume 50/50 split between glazed and unglazed
|
||||
"partial": 0.5
|
||||
"partial": 0.5,
|
||||
}
|
||||
|
||||
def __init__(self, property_instance: Property, materials: List):
|
||||
|
|
@ -52,14 +52,20 @@ class WindowsRecommendations:
|
|||
if not number_of_windows:
|
||||
raise ValueError("Number of windows not specified")
|
||||
|
||||
if self.property.windows["has_glazing"] & (self.property.windows["glazing_coverage"] == "full"):
|
||||
if self.property.windows["has_glazing"] & (
|
||||
self.property.windows["glazing_coverage"] == "full"
|
||||
):
|
||||
return
|
||||
|
||||
# We scale the number of windows based on the proportion of existing glazing
|
||||
if self.property.data["multi-glaze-proportion"] != "":
|
||||
n_windows_scalar = 1 - (int(self.property.data["multi-glaze-proportion"]) / 100)
|
||||
n_windows_scalar = 1 - (
|
||||
int(self.property.data["multi-glaze-proportion"]) / 100
|
||||
)
|
||||
else:
|
||||
n_windows_scalar = self.COVERAGE_MAP.get(self.property.windows["glazing_coverage"], 1)
|
||||
n_windows_scalar = self.COVERAGE_MAP.get(
|
||||
self.property.windows["glazing_coverage"], 1
|
||||
)
|
||||
|
||||
number_of_windows *= n_windows_scalar
|
||||
number_of_windows = np.ceil(number_of_windows)
|
||||
|
|
@ -68,7 +74,7 @@ class WindowsRecommendations:
|
|||
cost_result = self.costs.window_glazing(
|
||||
number_of_windows=number_of_windows,
|
||||
material=self.glazing_material,
|
||||
is_secondary_glazing=is_secondary_glazing
|
||||
is_secondary_glazing=is_secondary_glazing,
|
||||
)
|
||||
|
||||
already_installed = "windows_glazing" in self.property.already_installed
|
||||
|
|
@ -76,18 +82,26 @@ class WindowsRecommendations:
|
|||
cost_result = override_costs(cost_result)
|
||||
description = "The property already has double glazing installed. No further action is required."
|
||||
else:
|
||||
glazing_type = "secondary glazing" if is_secondary_glazing else "double glazing"
|
||||
glazing_type = (
|
||||
"secondary glazing" if is_secondary_glazing else "double glazing"
|
||||
)
|
||||
if self.property.windows["glazing_coverage"] in ["partial", "most"]:
|
||||
description = f"Install {glazing_type} to the remaining windows"
|
||||
else:
|
||||
description = f"Install {glazing_type} to all windows"
|
||||
|
||||
if self.property.is_listed:
|
||||
description += ". Secondary glazing recommended due to listed building status"
|
||||
description += (
|
||||
". Secondary glazing recommended due to listed building status"
|
||||
)
|
||||
elif self.property.is_heritage:
|
||||
description += ". Secondary glazing recommended due to herigate building status"
|
||||
description += (
|
||||
". Secondary glazing recommended due to herigate building status"
|
||||
)
|
||||
elif self.property.in_conservation_area:
|
||||
description += ". Secondary glazing recommended due to conservation area status"
|
||||
description += (
|
||||
". Secondary glazing recommended due to conservation area status"
|
||||
)
|
||||
|
||||
self.recommendation = [
|
||||
{
|
||||
|
|
@ -100,6 +114,6 @@ class WindowsRecommendations:
|
|||
"sap_points": None,
|
||||
"already_installed": already_installed,
|
||||
**cost_result,
|
||||
"is_secondary_glazing": is_secondary_glazing
|
||||
"is_secondary_glazing": is_secondary_glazing,
|
||||
}
|
||||
]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue