mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
removed for loop and if condition
This commit is contained in:
parent
abe894a94d
commit
e95b6e3369
1 changed file with 48 additions and 49 deletions
|
|
@ -70,7 +70,6 @@ LATEST_FIELD = [
|
||||||
"NUMBER_HABITABLE_ROOMS",
|
"NUMBER_HABITABLE_ROOMS",
|
||||||
"NUMBER_HEATED_ROOMS",
|
"NUMBER_HEATED_ROOMS",
|
||||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||||
"CONSTRUCTION_AGE_BAND",
|
|
||||||
"FLOOR_LEVEL",
|
"FLOOR_LEVEL",
|
||||||
"CONSTRUCTION_AGE_BAND", # This is a field we probably want to use Verisk data for
|
"CONSTRUCTION_AGE_BAND", # This is a field we probably want to use Verisk data for
|
||||||
]
|
]
|
||||||
|
|
@ -232,11 +231,10 @@ class DataProcessor:
|
||||||
|
|
||||||
# Filter 4: We remove records whose floor level is "top floor" or "mid floor" since this is ambiguous
|
# Filter 4: We remove records whose floor level is "top floor" or "mid floor" since this is ambiguous
|
||||||
|
|
||||||
|
self.data = self.data[~pd.isnull(self.data["UPRN"])]
|
||||||
self.data = self.data[~pd.isnull(self.data["UPRN"])] \
|
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||||
[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] \
|
self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
|
||||||
[self.data["TRANSACTION_TYPE"] != "new dwelling"] \
|
self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
||||||
[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
|
|
||||||
|
|
||||||
|
|
||||||
def clean_multi_glaze_proportion(self) -> None:
|
def clean_multi_glaze_proportion(self) -> None:
|
||||||
|
|
@ -270,73 +268,74 @@ def app():
|
||||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||||
|
|
||||||
# Fixed features - these are property attributes that shouldn't change over time
|
# Fixed features - these are property attributes that shouldn't change over time
|
||||||
|
|
||||||
|
|
||||||
ignore_epc = False
|
|
||||||
fixed_data = {}
|
fixed_data = {}
|
||||||
for field in FIXED_FEATURES:
|
|
||||||
vals = property_data[field].dropna().unique()
|
|
||||||
# Remove invalid values
|
|
||||||
vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
|
|
||||||
|
|
||||||
if field == "FLOOR_LEVEL":
|
# Map all anomaly values to None
|
||||||
vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
|
data_anomaly_map = dict(zip(BaseUtility.DATA_ANOMALY_MATCHES, [None]*len(BaseUtility.DATA_ANOMALY_MATCHES)))
|
||||||
|
|
||||||
|
# Use replace() to map any value that appears as a key in the map to its corresponding value, i.e. remove invalid values
|
||||||
|
modified_property_data = property_data.replace(data_anomaly_map)
|
||||||
|
modified_property_data = modified_property_data.replace(np.NAN, None)
|
||||||
|
|
||||||
if field == "BUILT_FORM":
|
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
|
||||||
vals = list({BUILT_FORM_REMAP.get(v, v) for v in vals})
|
if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
|
||||||
|
continue
|
||||||
|
|
||||||
if field in AVERAGE_FIXED_FEATURES:
|
mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
|
||||||
|
|
||||||
|
# Remap certain columns
|
||||||
|
modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
|
||||||
|
modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
|
||||||
|
|
||||||
if len(vals) > 1:
|
latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
|
||||||
# Check whether the values are too far apart
|
|
||||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
|
||||||
# Take the more recent value since it's likely to be more accurate
|
|
||||||
vals = [vals[-1]]
|
|
||||||
|
|
||||||
if vals:
|
# Taking just the last row, which is the percentage change from the latest to previous one only
|
||||||
field_value = np.mean(vals)
|
# modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
|
||||||
else:
|
|
||||||
# Clean using averages
|
|
||||||
|
|
||||||
avgs = iterative_filtering(cleaning_averages, property_data)
|
|
||||||
# TODO: Should probably do a mean/median?
|
|
||||||
field_value = avgs[field].iloc[0]
|
|
||||||
|
|
||||||
if pd.isnull(field_value):
|
|
||||||
# Just use the general averages
|
|
||||||
field_value = general_averages[
|
|
||||||
(general_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
|
|
||||||
(general_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0])
|
|
||||||
][field].iloc[0]
|
|
||||||
|
|
||||||
elif field in LATEST_FIELD:
|
for field in AVERAGE_FIXED_FEATURES:
|
||||||
field_value = vals[-1] if vals else None
|
vals = list(modified_property_data[field].dropna().unique())
|
||||||
|
if len(vals) > 1:
|
||||||
|
# Check whether the values are too far apart
|
||||||
|
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||||
|
# Take the more recent value since it's likely to be more accurate
|
||||||
|
vals = [vals[-1]]
|
||||||
|
|
||||||
|
if vals:
|
||||||
|
field_value = np.mean(vals)
|
||||||
else:
|
else:
|
||||||
if len(vals) > 1:
|
# Clean using averages
|
||||||
if field in MANDATORY_FIXED_FEATURES:
|
|
||||||
ignore_epc = True
|
|
||||||
else:
|
|
||||||
raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
|
|
||||||
|
|
||||||
field_value = vals[0] if vals else None
|
avgs = iterative_filtering(cleaning_averages, modified_property_data)
|
||||||
|
# TODO: Should probably do a mean/median?
|
||||||
|
field_value = avgs[field].iloc[0]
|
||||||
|
|
||||||
|
if pd.isnull(field_value):
|
||||||
|
# Just use the general averages
|
||||||
|
field_value = general_averages[
|
||||||
|
(general_averages["PROPERTY_TYPE"] == modified_property_data["PROPERTY_TYPE"].iloc[0]) &
|
||||||
|
(general_averages["BUILT_FORM"] == modified_property_data["BUILT_FORM"].iloc[0])
|
||||||
|
][field].iloc[0]
|
||||||
|
|
||||||
fixed_data[field] = field_value
|
fixed_data[field] = field_value
|
||||||
|
|
||||||
if ignore_epc:
|
#Combine all fields together
|
||||||
continue
|
fixed_data.update(mandatory_field_data)
|
||||||
|
fixed_data.update(latest_field_data)
|
||||||
|
|
||||||
# We include the lodgement date here as we probably need to factor time into the
|
# We include the lodgement date here as we probably need to factor time into the
|
||||||
# model, since EPC standards and rigour have changed over time
|
# model, since EPC standards and rigour have changed over time
|
||||||
variable_data = property_data[
|
variable_data = modified_property_data[
|
||||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
||||||
]
|
]
|
||||||
|
|
||||||
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
||||||
# e.g. first vs second, second vs third and also first vs third
|
# e.g. first vs second, second vs third and also first vs third
|
||||||
property_model_data = []
|
property_model_data = []
|
||||||
for idx in range(0, property_data.shape[0] - 1):
|
for idx in range(0, modified_property_data.shape[0] - 1):
|
||||||
|
|
||||||
if idx >= property_data.shape[0] - 1:
|
if idx >= modified_property_data.shape[0] - 1:
|
||||||
break
|
break
|
||||||
|
|
||||||
starting_record = variable_data.iloc[idx]
|
starting_record = variable_data.iloc[idx]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue