Debugging SAP model data prep

This commit is contained in:
Khalim Conn-Kowlessar 2023-10-10 12:21:21 +08:00
parent de9810af43
commit 6ddc9fddca
4 changed files with 68 additions and 26 deletions

View file

@ -632,7 +632,6 @@ class Property(Definitions):
'PHOTO_SUPPLY',
'LOW_ENERGY_LIGHTING',
'SOLAR_WATER_HEATING_FLAG',
'BUILT_FORM',
'GLAZED_TYPE',
'CONSTITUENCY',
'NUMBER_HEATED_ROOMS',
@ -642,6 +641,21 @@ class Property(Definitions):
k: self.data[k.lower().replace("_", "-")] for k in epc_raw_columns
}
built_form_cleaning_map = {
"Flat": "Mid-Terrace",
"House": "Semi-Detached",
"Bungalow": "Detached",
"Maisonette": "Mid-Terrace"
}
built_form = self.data["built-form"]
if built_form in self.DATA_ANOMALY_MATCHES:
# TODO: If built form isn't captured, we use the most common value for that property type - we shall
# improve this methodology
built_form = built_form_cleaning_map.get(self.data["property-type"])
if not built_form:
raise NotImplementedError("Not handled this property type when cleaning built form")
property_data = {
**walls,
**roof,
@ -653,15 +667,16 @@ class Property(Definitions):
**windows,
"SECONDHEAT_DESCRIPTION": second_heating,
"DAYS_TO": DataProcessor.calculate_days_to(self.data["lodgement-date"]),
"SAP": self.data["current-energy-efficiency"],
"CARBON": self.data["co2-emissions-current"],
"HEAT_DEMAND": self.data["energy-consumption-current"],
"SAP": float(self.data["current-energy-efficiency"]),
"CARBON": float(self.data["co2-emissions-current"]),
"HEAT_DEMAND": float(self.data["energy-consumption-current"]),
"estimated_perimeter": self.perimeter,
"CONSTRUCTION_AGE_BAND": self.construction_age_band,
"FLOOR_HEIGHT": self.floor_height,
"NUMBER_HABITABLE_ROOMS": self.number_of_rooms,
"TOTAL_FLOOR_AREA": self.floor_area,
**epc_raw_data
**epc_raw_data,
"BUILT_FORM": built_form,
}
return property_data

View file

@ -157,13 +157,19 @@ async def trigger_plan(body: PlanTriggerRequest):
data_processor = DataProcessor(None, newdata=True)
data_processor.insert_data(pd.DataFrame([p.get_model_data()]))
data_processor.pre_process()
data_processor.data = data_processor.clean_missings_after_description_process(
data_processor.data, [
c for c in data_processor.data.columns if
("thermal_transmittance" in c) or ("insulation_thickness" in c)
]
)
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
fixed_data = data_processor.get_fixed_features()
# We update the ending record with the recommended updates and we set lodgement date to today
ending_epc_data["LODGEMENT_DATE_ENDING"] = data_processor.calculate_days_to(created_at)
ending_epc_data["DAYS_TO_ENDING"] = data_processor.calculate_days_to(created_at)
for recommendations_by_type in property_recommendations:
for rec in recommendations_by_type:
@ -175,21 +181,38 @@ async def trigger_plan(body: PlanTriggerRequest):
fixed_data=fixed_data,
)
fer
none_cols = []
for col in scoring_dict.keys():
if col in [
"UPRN", "id", "LOCAL_AUTHORITY",
]:
continue
if col in ["SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "FLOOR_HEIGHT_STARTING"]:
if scoring_dict[col]:
if col in [
"SAP_STARTING", "HEAT_DEMAND_STARTING", "CARBON_STARTING", "FLOOR_HEIGHT_STARTING",
"TOTAL_FLOOR_AREA_STARTING", "DAYS_TO_STARTING", "estimated_perimeter_STARTING",
"SAP_ENDING", "HEAT_DEMAND_ENDING",
"CARBON_ENDING", "FLOOR_HEIGHT_ENDING",
"TOTAL_FLOOR_AREA_ENDING", "DAYS_TO_ENDING", "estimated_perimeter_ENDING"
]:
try:
if scoring_dict[col] is None:
blah1
float(scoring_dict[col])
continue
except:
raise Exception("wtf")
unique_vals = sap_change_dataset[col].unique()
if scoring_dict[col] not in unique_vals:
if scoring_dict[col] is None:
none_cols.append(col)
continue
blah
if none_cols:
blahblah
recommendations_scoring_data.append(scoring_dict)
# cleanup

View file

@ -550,3 +550,22 @@ class DataProcessor:
return (
pd.to_datetime(lodgement_date) - pd.to_datetime(EARLIEST_EPC_DATE)
).dt.days
@staticmethod
def clean_missings_after_description_process(df, ignore_cols=None):
    """Fill the remaining null values in ``df`` with a per-column default.

    For every column that still contains nulls (and is not listed in
    ``ignore_cols``) one fill value is chosen from the values already
    present in that column:

    * ``False``     if the column contains ``True`` or ``False``
    * ``"none"``    if the column contains the string ``"none"``
    * ``"Unknown"`` otherwise

    Args:
        df: DataFrame to clean. Its columns are re-assigned in place.
        ignore_cols: Optional collection of column names whose nulls must
            be left untouched.

    Returns:
        The same ``df`` object, with the selected columns filled.
    """
    null_counts = pd.isnull(df).sum()
    cols_with_nulls = null_counts[null_counts > 0].index
    if ignore_cols:
        cols_with_nulls = cols_with_nulls[~cols_with_nulls.isin(ignore_cols)]

    for col in cols_with_nulls:
        # Work on a plain Python list of the non-null values so that the
        # membership tests below are ordinary ``==`` checks rather than
        # NumPy elementwise comparisons between strings and numeric arrays.
        observed = df[col].dropna().unique().tolist()

        # NOTE: 1/0 compare equal to True/False, so a numeric column holding
        # those values is treated as boolean here, as it was before.
        if True in observed or False in observed:
            fill_value = False
        elif "none" in observed:
            fill_value = "none"
        else:
            fill_value = "Unknown"

        # The branches are mutually exclusive: exactly one fill per column.
        df[col] = df[col].fillna(fill_value)

    return df

View file

@ -363,21 +363,6 @@ def make_uvalues(df):
return df
def clean_missings_after_description_process(df):
    """Fill the remaining null values in ``df`` with a per-column default.

    For every column that still contains nulls one fill value is chosen
    from the values already present in that column:

    * ``False``     if the column contains ``True`` or ``False``
    * ``"none"``    if the column contains the string ``"none"``
    * ``"Unknown"`` otherwise

    Args:
        df: DataFrame to clean. Its columns are re-assigned in place.

    Returns:
        The same ``df`` object, with every null filled.
    """
    null_counts = pd.isnull(df).sum()
    cols_with_nulls = null_counts[null_counts > 0].index

    for col in cols_with_nulls:
        # Plain Python list of the non-null values: membership tests become
        # ordinary ``==`` checks instead of NumPy elementwise comparisons
        # between strings and numeric arrays.
        observed = df[col].dropna().unique().tolist()

        # NOTE: 1/0 compare equal to True/False, so a numeric column holding
        # those values is treated as boolean here, as it was before.
        if True in observed or False in observed:
            fill_value = False
        elif "none" in observed:
            fill_value = "none"
        else:
            fill_value = "Unknown"

        # The branches are mutually exclusive: exactly one fill per column.
        df[col] = df[col].fillna(fill_value)

    return df
def app():
# Get all the files in the directory
@ -544,7 +529,7 @@ def app():
# Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
# need to
data_by_urpn_df = clean_missings_after_description_process(data_by_urpn_df)
data_by_urpn_df = DataProcessor.clean_missings_after_description_process(data_by_urpn_df)
if pd.isnull(data_by_urpn_df).sum().sum():
raise ValueError("Null values found in dataset after process_and_prune_desriptions")