mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Merge pull request #231 from Hestia-Homes/refactor-data-processor
Refactor data processor
This commit is contained in:
commit
402d71eb77
20 changed files with 592 additions and 5773 deletions
|
|
@ -9,4 +9,5 @@ omit =
|
|||
model_data/plotting/*
|
||||
recommendations/rdsap_tables.py
|
||||
model_data/simulation_system/*
|
||||
model_data/cleaner_app.py
|
||||
model_data/cleaner_app.py
|
||||
backend/app/*
|
||||
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -11,4 +11,4 @@ db_string = connection_string.format(
|
|||
dbname=get_settings().DB_NAME,
|
||||
)
|
||||
|
||||
db_engine = create_engine(db_string)
|
||||
db_engine = create_engine(db_string, pool_size=20, max_overflow=5)
|
||||
|
|
|
|||
|
|
@ -1,32 +0,0 @@
|
|||
columntypes = {
|
||||
'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
|
||||
'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
|
||||
'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64',
|
||||
'CONSTRUCTION_AGE_BAND': 'object', 'TRANSACTION_TYPE_STARTING': 'object',
|
||||
'WALLS_DESCRIPTION_STARTING': 'object',
|
||||
'FLOOR_DESCRIPTION_STARTING': 'object', 'LIGHTING_DESCRIPTION_STARTING': 'object',
|
||||
'ROOF_DESCRIPTION_STARTING': 'object', 'MAINHEAT_DESCRIPTION_STARTING': 'object',
|
||||
'HOTWATER_DESCRIPTION_STARTING': 'object', 'MAIN_FUEL_STARTING': 'object',
|
||||
'MECHANICAL_VENTILATION_STARTING': 'object',
|
||||
'SECONDHEAT_DESCRIPTION_STARTING': 'object', 'ENERGY_TARIFF_STARTING': 'object',
|
||||
'SOLAR_WATER_HEATING_FLAG_STARTING': 'object', 'PHOTO_SUPPLY_STARTING': 'float64',
|
||||
'WINDOWS_DESCRIPTION_STARTING': 'object', 'GLAZED_TYPE_STARTING': 'object',
|
||||
'MULTI_GLAZE_PROPORTION_STARTING': 'float64', 'LOW_ENERGY_LIGHTING_STARTING': 'float64',
|
||||
'NUMBER_OPEN_FIREPLACES_STARTING': 'float64', 'MAINHEATCONT_DESCRIPTION_STARTING': 'object',
|
||||
'EXTENSION_COUNT_STARTING': 'float64', 'LODGEMENT_DATE_STARTING': 'object',
|
||||
'TRANSACTION_TYPE_ENDING': 'object',
|
||||
'WALLS_DESCRIPTION_ENDING': 'object', 'FLOOR_DESCRIPTION_ENDING': 'object',
|
||||
'LIGHTING_DESCRIPTION_ENDING': 'object',
|
||||
'ROOF_DESCRIPTION_ENDING': 'object', 'MAINHEAT_DESCRIPTION_ENDING': 'object',
|
||||
'HOTWATER_DESCRIPTION_ENDING': 'object',
|
||||
'MAIN_FUEL_ENDING': 'object', 'MECHANICAL_VENTILATION_ENDING': 'object',
|
||||
'SECONDHEAT_DESCRIPTION_ENDING': 'object',
|
||||
'ENERGY_TARIFF_ENDING': 'object', 'SOLAR_WATER_HEATING_FLAG_ENDING': 'object',
|
||||
'PHOTO_SUPPLY_ENDING': 'float64',
|
||||
'WINDOWS_DESCRIPTION_ENDING': 'object', 'GLAZED_TYPE_ENDING': 'object',
|
||||
'MULTI_GLAZE_PROPORTION_ENDING': 'float64',
|
||||
'LOW_ENERGY_LIGHTING_ENDING': 'float64', 'NUMBER_OPEN_FIREPLACES_ENDING': 'float64',
|
||||
'MAINHEATCONT_DESCRIPTION_ENDING': 'object', 'EXTENSION_COUNT_ENDING': 'float64',
|
||||
'LODGEMENT_DATE_ENDING': 'object',
|
||||
'id': 'object'
|
||||
}
|
||||
|
|
@ -8,8 +8,10 @@ from backend.app.config import get_settings
|
|||
from backend.Property import Property
|
||||
from epc_api.client import EpcClient
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_from_s3
|
||||
from recommendations.FloorRecommendations import FloorRecommendations
|
||||
from recommendations.WallRecommendations import WallRecommendations
|
||||
from recommendations.config import UPGRADES_MAP
|
||||
from utils.uvalue_estimates import classify_decile_newvalues
|
||||
from backend.app.db.utils import row2dict
|
||||
from starlette.responses import Response
|
||||
|
|
@ -17,6 +19,7 @@ from sqlalchemy.orm import sessionmaker
|
|||
from sqlalchemy.exc import IntegrityError, OperationalError
|
||||
from datetime import datetime
|
||||
import pandas as pd
|
||||
import msgpack
|
||||
|
||||
# model apis
|
||||
from backend.ml_models.sap_change_model.api import SAPChangeModelAPI
|
||||
|
|
@ -31,21 +34,17 @@ from backend.app.db.functions.recommendations_functions import (
|
|||
)
|
||||
from backend.app.db.functions.portfolio_functions import aggregate_portfolio_recommendations
|
||||
from backend.app.db.connection import db_engine
|
||||
from backend.app.plan.columntypes import columntypes
|
||||
|
||||
from model_data.optimiser.GainOptimiser import GainOptimiser
|
||||
from model_data.optimiser.CostOptimiser import CostOptimiser
|
||||
from backend.app.utils import epc_to_sap_lower_bound, save_dataframe_to_s3_parquet, read_parquet_from_s3
|
||||
from backend.app.utils import epc_to_sap_lower_bound, read_parquet_from_s3
|
||||
from model_data.optimiser.optimiser_functions import prepare_input_measures
|
||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
FIXED_FEATURES, COMPONENT_FEATURES, COLUMNS_TO_MERGE_ON
|
||||
)
|
||||
from model_data.simulation_system.core.Settings import COLUMNS_TO_MERGE_ON
|
||||
|
||||
# TODO: This is placeholder until data is stored in DB
|
||||
from backend.app.plan.uvalue_estimates_walls import uvalue_estimates_walls
|
||||
from backend.app.plan.uvalue_estimates_floors import uvalue_estimates_floors
|
||||
from backend.app.plan.temp_cleaned_data import cleaned
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
|
@ -98,12 +97,6 @@ walls_decile_data = {
|
|||
'Decile 9', 'Decile 10'], 'decile_boundaries': [6., 49., 51., 55., 64., 71., 76., 83., 96.,
|
||||
120., 2279.]}
|
||||
|
||||
lighting_averages = [
|
||||
{'lighting-description': 'good lighting efficiency', 'low-energy-lighting': 99.26666666666667},
|
||||
{'lighting-description': 'excellent lighting efficiency', 'low-energy-lighting': 100.0},
|
||||
{'lighting-description': 'below average lighting efficiency', 'low-energy-lighting': 0.0}
|
||||
]
|
||||
|
||||
|
||||
def filter_materials(materials):
|
||||
materials_by_type = defaultdict(list)
|
||||
|
|
@ -138,18 +131,62 @@ def insert_temp_recommendation_id(property_recommendations):
|
|||
return property_recommendations
|
||||
|
||||
|
||||
def score_measures():
|
||||
def get_cleaned():
|
||||
"""
|
||||
This function will retrieve the cleaned dataset from s3 which has the cleaned
|
||||
descriptions for the epc dataset
|
||||
|
||||
This data is stored in MessagePack format and therefore needs to be decoded
|
||||
:return:
|
||||
"""
|
||||
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-{environment}".format(environment=get_settings().ENVIRONMENT)
|
||||
)
|
||||
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def create_recommendation_scoring_data(
|
||||
property: Property,
|
||||
recommendation: dict,
|
||||
starting_epc_data: pd.DataFrame,
|
||||
ending_epc_data: pd.DataFrame,
|
||||
fixed_data: pd.DataFrame,
|
||||
):
|
||||
"""
|
||||
This wrapper function prepares data to be passed to the sap model api
|
||||
:return:
|
||||
"""
|
||||
|
||||
scoring_dict = {
|
||||
"UPRN": property.data["uprn"],
|
||||
"id": "+".join([str(property.id), str(recommendation["recommendation_id"])]),
|
||||
"LOCAL_AUTHORITY": property.data["local-authority"],
|
||||
**starting_epc_data.to_dict("records")[0],
|
||||
**ending_epc_data.to_dict("records")[0],
|
||||
**fixed_data.to_dict("records")[0]
|
||||
}
|
||||
|
||||
# We update the description to indicate it's insulated
|
||||
if recommendation["type"] == "wall_insulation":
|
||||
scoring_dict["WALLS_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.walls["clean_description"]]
|
||||
elif recommendation["type"] == "floor_insulation":
|
||||
scoring_dict["FLOOR_DESCRIPTION_ENDING"] = UPGRADES_MAP[property.floor["clean_description"]]
|
||||
else:
|
||||
raise NotImplementedError("Implement me")
|
||||
|
||||
return scoring_dict
|
||||
|
||||
|
||||
@router.post("/trigger")
|
||||
async def trigger_plan(body: PlanTriggerRequest):
|
||||
logger.info("Connecting to db")
|
||||
Session = sessionmaker(bind=db_engine)
|
||||
session = Session()
|
||||
session = sessionmaker(bind=db_engine)()
|
||||
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
||||
|
||||
try:
|
||||
session.begin()
|
||||
|
|
@ -159,21 +196,17 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
epc_client = EpcClient(auth_token=get_settings().EPC_AUTH_TOKEN)
|
||||
|
||||
plan_input = read_csv_from_s3(bucket_name=bucket_name, filepath=body.trigger_file_path)
|
||||
|
||||
input_properties = []
|
||||
for config in plan_input:
|
||||
# We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
|
||||
# TODO: implment validation
|
||||
|
||||
# Create a record in db
|
||||
property_id, is_new = create_property(
|
||||
session, portfolio_id=body.portfolio_id, address=config['address'], postcode=config['postcode']
|
||||
)
|
||||
|
||||
# if a new record was not created, we don't produduce recommendations
|
||||
if not is_new:
|
||||
continue
|
||||
|
||||
# TODO: Need to add heat demand target
|
||||
create_property_targets(
|
||||
session,
|
||||
|
|
@ -195,37 +228,34 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
if not input_properties:
|
||||
return Response(status_code=204)
|
||||
|
||||
logger.info("Getting EPC data")
|
||||
logger.info("Getting EPC, coordinates and conservation area data")
|
||||
for p in input_properties:
|
||||
p.search_address_epc()
|
||||
p.set_year_built()
|
||||
|
||||
logger.info("Getting coordinates")
|
||||
# This is placeholder, until the full dataset is loaded into the database
|
||||
for p in input_properties:
|
||||
coordinate_data = [x for x in open_uprn_data if x['UPRN'] == int(p.data['uprn'])][0]
|
||||
p.set_coordinates(coordinate_data)
|
||||
|
||||
logger.info("Check if property is in conservation area")
|
||||
for p in input_properties:
|
||||
in_conservation_area = [x for x in in_conservation_area_data if x['uprn'] == int(p.data['uprn'])][0].get(
|
||||
"is_in_conservation_area"
|
||||
)
|
||||
p.set_is_in_conservation_area(in_conservation_area)
|
||||
|
||||
# The materials data could be cached or local so we don't need to make
|
||||
# consistent requrests to the backend for
|
||||
# consistent requests to the backend for
|
||||
# the same data
|
||||
# TODO: It might not be the best choice to store the materials data in a database table since thi
|
||||
# table probably won't be very large and won't be updated that often. It might be better to
|
||||
# store this data in s3 load it into memory when the app starts up. We will test this
|
||||
|
||||
logger.info("Reading in materials and cleaned datasets")
|
||||
materials = get_materials(session)
|
||||
materials_by_type = filter_materials(materials)
|
||||
cleaned = get_cleaned()
|
||||
|
||||
logger.info("Getting components and properties recommendations")
|
||||
|
||||
# TODO: Move this to a class. We probably was a Recommender class which takes the injects the optimisers
|
||||
# TODO: Move this to a class. We probably want a Recommender class which takes the injects the optimisers
|
||||
# in as a dependency and then the optimisers can take the input measures in as part of the setup() method
|
||||
recommendations = {}
|
||||
recommendations_scoring_data = []
|
||||
|
|
@ -310,42 +340,33 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
# TODO: We should use the cleaned data from get_components in the data rather than the raw
|
||||
# values. We should create a method in Property which takes the EPC data and inserts the cleaned
|
||||
# data
|
||||
epc_data = p.data.copy()
|
||||
epc_data = pd.DataFrame([epc_data])
|
||||
epc_data.columns = [col.upper().replace("-", "_") for col in epc_data.columns]
|
||||
starting_epc_data = epc_data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix("_STARTING")
|
||||
ending_epc_data = epc_data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix("_ENDING")
|
||||
fixed_data = epc_data[FIXED_FEATURES]
|
||||
|
||||
data_processor = DataProcessor(None, newdata=True)
|
||||
data_processor.insert_data(pd.DataFrame([p.data.copy()]))
|
||||
data_processor.pre_process()
|
||||
|
||||
starting_epc_data = data_processor.get_component_features(suffix="_STARTING")
|
||||
ending_epc_data = data_processor.get_component_features(suffix="_ENDING")
|
||||
fixed_data = data_processor.get_fixed_features()
|
||||
|
||||
# We update the ending record with the recommended updates and we set lodgement date to today
|
||||
ending_epc_data["LODGEMENT_DATE_ENDING"] = datetime.now().strftime("%Y-%m-%d")
|
||||
ending_epc_data["LODGEMENT_DATE_ENDING"] = created_at
|
||||
|
||||
scoring_map = {
|
||||
'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)',
|
||||
'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)',
|
||||
'Solid, no insulation (assumed)': 'Solid, insulated (assumed)',
|
||||
}
|
||||
for recommendations_by_type in property_recommendations:
|
||||
for rec in recommendations_by_type:
|
||||
scoring_dict = {
|
||||
"UPRN": p.data["uprn"],
|
||||
"id": "+".join([str(p.id), str(rec["recommendation_id"])]),
|
||||
"LOCAL_AUTHORITY": p.data["local-authority"],
|
||||
**starting_epc_data.to_dict("records")[0],
|
||||
**ending_epc_data.to_dict("records")[0],
|
||||
**fixed_data.to_dict("records")[0]
|
||||
}
|
||||
|
||||
# We update the description to indicate it's insulated
|
||||
if rec["type"] == "wall_insulation":
|
||||
scoring_dict["WALLS_DESCRIPTION_ENDING"] = scoring_map[p.walls["clean_description"]]
|
||||
elif rec["type"] == "floor_insulation":
|
||||
scoring_dict["FLOOR_DESCRIPTION_ENDING"] = scoring_map[p.floor["clean_description"]]
|
||||
else:
|
||||
raise NotImplementedError("Implement me")
|
||||
scoring_dict = create_recommendation_scoring_data(
|
||||
property=p,
|
||||
recommendation=rec,
|
||||
starting_epc_data=starting_epc_data,
|
||||
ending_epc_data=ending_epc_data,
|
||||
fixed_data=fixed_data,
|
||||
)
|
||||
|
||||
recommendations_scoring_data.append(scoring_dict)
|
||||
|
||||
# cleanup
|
||||
del data_processor
|
||||
|
||||
logger.info("Preparing data for scoring in sap change api")
|
||||
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
|
||||
|
||||
|
|
@ -354,61 +375,31 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
cleaning_data = read_parquet_from_s3(
|
||||
bucket_name=get_settings().DATA_BUCKET,
|
||||
file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
cleaning_data = cleaning_data.rename(columns={"local-authority": "LOCAL_AUTHORITY"})
|
||||
# Merge the cleaning data onto recommendations_scoring_data
|
||||
).rename(columns={"local-authority": "LOCAL_AUTHORITY"})
|
||||
|
||||
recommendations_scoring_data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = recommendations_scoring_data[
|
||||
["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
|
||||
].replace("", None)
|
||||
# Merge the cleaning data onto recommendations_scoring_data
|
||||
|
||||
# Perform the same cleaning as in the model
|
||||
recommendations_scoring_data = DataProcessor.apply_averages_cleaning(
|
||||
data_to_clean=recommendations_scoring_data,
|
||||
cleaning_data=cleaning_data,
|
||||
cols_to_merge_on=COLUMNS_TO_MERGE_ON + ["LOCAL_AUTHORITY"]
|
||||
).drop(columns=["LOCAL_AUTHORITY"])
|
||||
|
||||
sap_change_model_api = SAPChangeModelAPI(portfolio_id=body.portfolio_id, timestamp=created_at)
|
||||
file_location = sap_change_model_api.upload_scoring_data(
|
||||
df=recommendations_scoring_data, bucket=get_settings().DATA_BUCKET
|
||||
)
|
||||
recommendations_scoring_data = recommendations_scoring_data.drop(columns=["LOCAL_AUTHORITY"])
|
||||
|
||||
# Note: We might need to perform the full pre-processing here
|
||||
data_processor = DataProcessor(filepath=None)
|
||||
data_processor.insert_data(recommendations_scoring_data)
|
||||
data_processor.remap_columns()
|
||||
recommendations_scoring_data = data_processor.data
|
||||
|
||||
# Remap column types
|
||||
recommendations_scoring_data = recommendations_scoring_data.astype(columntypes)
|
||||
# Store parquet file in s3 for scoring
|
||||
created_at = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
|
||||
file_location = "sap_change_predictions/{portfolio_id}/{timestamp}.parquet".format(
|
||||
portfolio_id=body.portfolio_id,
|
||||
timestamp=created_at
|
||||
)
|
||||
|
||||
logger.info("Storing scoring data to s3")
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=recommendations_scoring_data,
|
||||
bucket_name=get_settings().DATA_BUCKET,
|
||||
file_key=file_location
|
||||
)
|
||||
|
||||
logger.info("Making request to sap change api")
|
||||
sap_change_model_api = SAPChangeModelAPI()
|
||||
response = sap_change_model_api.predict(
|
||||
file_location="s3://{DATA_BUCKET}/".format(DATA_BUCKET=get_settings().DATA_BUCKET) + file_location,
|
||||
created_at=created_at,
|
||||
portfolio_id=body.portfolio_id
|
||||
)
|
||||
|
||||
# Retrieve the predictions
|
||||
predictions = pd.DataFrame(read_csv_from_s3(
|
||||
bucket_name=get_settings().PREDICTIONS_BUCKET,
|
||||
filepath=response["storage_filepath"]
|
||||
))
|
||||
predictions = pd.DataFrame(
|
||||
read_csv_from_s3(bucket_name=get_settings().PREDICTIONS_BUCKET, filepath=response["storage_filepath"])
|
||||
)
|
||||
|
||||
# We round the predictions
|
||||
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(0)
|
||||
# Extract property_id and recommendation_id
|
||||
predictions["RDSAP_CHANGE"] = predictions["RDSAP_CHANGE"].astype(float).round(1)
|
||||
predictions[['property_id', 'recommendation_id']] = predictions['id'].str.split('+', expand=True)
|
||||
|
||||
# Insert the predictions into the recommendations and run the optimiser
|
||||
|
|
@ -424,7 +415,7 @@ async def trigger_plan(body: PlanTriggerRequest):
|
|||
rec["recommendation_id"]
|
||||
)]["RDSAP_CHANGE"].values[0]
|
||||
|
||||
if not rec["sap_points"]:
|
||||
if rec["sap_points"] is None:
|
||||
raise ValueError("Sap points missing")
|
||||
|
||||
input_measures = prepare_input_measures(recommendations[property_id], body.goal)
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,32 +1,71 @@
|
|||
import pandas as pd
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import save_dataframe_to_s3_parquet
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
class SAPChangeModelAPI:
|
||||
def __init__(self, base_url="https://api.dev.hestia.homes"):
|
||||
def __init__(
|
||||
self,
|
||||
portfolio_id,
|
||||
timestamp,
|
||||
base_url="https://api.dev.hestia.homes",
|
||||
):
|
||||
"""
|
||||
property_id (int, optional): :
|
||||
:param portfolio_id: The portfolio ID to be passed in the request payload. Defaults to 4.
|
||||
:param timestamp: The creation timestamp to be passed in the request payload. Defaults to None.
|
||||
:param base_url:
|
||||
"""
|
||||
self.base_url = base_url
|
||||
self.portfolio_id = portfolio_id
|
||||
self.timestamp = timestamp
|
||||
|
||||
def predict(self, file_location, property_id="", portfolio_id=4, created_at=None):
|
||||
def upload_scoring_data(self, df: pd.DataFrame, bucket: str) -> str:
|
||||
"""
|
||||
The sap model api needs a scoring data that is sitting in s3 to use as a dataset to score on
|
||||
This method allows the user to upload a table as a parquet file. This method will return the file
|
||||
location, which can be used as the file location in the predict() method
|
||||
|
||||
:param df: Pandas dataframe with scoring data to be uploaded to s3
|
||||
:param bucket: Name of the bucket in s3 to upload to
|
||||
:return:
|
||||
"""
|
||||
|
||||
# Store parquet file in s3 for scoring
|
||||
file_location = "sap_change_predictions/{portfolio_id}/{timestamp}.parquet".format(
|
||||
portfolio_id=self.portfolio_id,
|
||||
timestamp=self.timestamp
|
||||
)
|
||||
|
||||
logger.info("Storing scoring data to s3")
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=df,
|
||||
bucket_name=bucket,
|
||||
file_key=file_location
|
||||
)
|
||||
|
||||
return file_location
|
||||
|
||||
def predict(self, file_location):
|
||||
"""Makes a POST request to the SAP Change Model API with the provided parameters.
|
||||
|
||||
Args:
|
||||
file_location (str): The file location to be passed in the request payload.
|
||||
property_id (int, optional): The property ID to be passed in the request payload. Defaults to 999.
|
||||
portfolio_id (int, optional): The portfolio ID to be passed in the request payload. Defaults to 4.
|
||||
created_at (str, optional): The creation timestamp to be passed in the request payload. Defaults to None.
|
||||
|
||||
Returns:
|
||||
dict: The API response as a dictionary if the request was successful, None otherwise.
|
||||
"""
|
||||
logger.info("Making request to sap change api")
|
||||
url = f"{self.base_url}/sapmodel/predict"
|
||||
payload = {
|
||||
"file_location": f"s3://retrofit-data-dev/{file_location}",
|
||||
"property_id": property_id,
|
||||
"portfolio_id": portfolio_id,
|
||||
"created_at": created_at
|
||||
"property_id": "", # This should get removed
|
||||
"portfolio_id": self.portfolio_id,
|
||||
"created_at": self.timestamp
|
||||
}
|
||||
|
||||
try:
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
msgpack==1.0.5
|
||||
anyio==3.7.1
|
||||
cffi==1.15.1
|
||||
click==8.1.3
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import pytest
|
|||
import pandas as pd
|
||||
from unittest.mock import Mock
|
||||
from epc_api.client import EpcClient
|
||||
from model_data.Property import Property
|
||||
from backend.Property import Property
|
||||
from open_uprn.OpenUprnClient import OpenUprnClient
|
||||
from model_data.EpcClean import EpcClean
|
||||
|
||||
|
|
@ -19,6 +19,18 @@ mock_epc_response = {
|
|||
"hotwater-description": "Hot Water Description",
|
||||
"transaction-type": "rental",
|
||||
"lighting-description": "Good Lighting Efficiency",
|
||||
"energy-consumption-current": "50",
|
||||
"co2-emissions-current": "123",
|
||||
"mechanical-ventilation": "natural",
|
||||
'photo-supply': 0,
|
||||
"solar-water-heating-flag": "N",
|
||||
"wind-turbine-count": 0,
|
||||
"extension-count": 0,
|
||||
"heat-loss-corridor": "no corridor",
|
||||
"unheated-corridor-length": 0,
|
||||
"mains-gas-flag": "Y",
|
||||
"floor-height": 2.5,
|
||||
"total-floor-area": 100
|
||||
},
|
||||
{
|
||||
"inspection-date": "2023-05-01",
|
||||
|
|
@ -30,6 +42,18 @@ mock_epc_response = {
|
|||
"hotwater-description": "Hot Water Description",
|
||||
"transaction-type": "rental",
|
||||
"lighting-description": "Good Lighting Efficiency",
|
||||
"energy-consumption-current": "50",
|
||||
"co2-emissions-current": "123",
|
||||
"mechanical-ventilation": "natural",
|
||||
'photo-supply': 0,
|
||||
"solar-water-heating-flag": "N",
|
||||
"wind-turbine-count": 0,
|
||||
"extension-count": 0,
|
||||
"heat-loss-corridor": "no corridor",
|
||||
"unheated-corridor-length": 0,
|
||||
"mains-gas-flag": "Y",
|
||||
"floor-height": 2.5,
|
||||
"total-floor-area": 100
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -42,6 +66,18 @@ mock_epc_response_dupe = {
|
|||
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
|
||||
"transaction-type": "rental",
|
||||
"lighting-description": "Good Lighting Efficiency",
|
||||
"energy-consumption-current": "50",
|
||||
"co2-emissions-current": "123",
|
||||
"mechanical-ventilation": "natural",
|
||||
'photo-supply': 0,
|
||||
"solar-water-heating-flag": "N",
|
||||
"wind-turbine-count": 0,
|
||||
"extension-count": 0,
|
||||
"heat-loss-corridor": "no corridor",
|
||||
"unheated-corridor-length": 0,
|
||||
"mains-gas-flag": "Y",
|
||||
"floor-height": 2.5,
|
||||
"total-floor-area": 100
|
||||
},
|
||||
{
|
||||
'inspection-date': '2023-05-01', 'some-other-key': 'some-other-value',
|
||||
|
|
@ -50,6 +86,18 @@ mock_epc_response_dupe = {
|
|||
'hotwater-description': 'Hot Water Description',
|
||||
"transaction-type": "rental",
|
||||
"lighting-description": "Good Lighting Efficiency",
|
||||
"energy-consumption-current": "50",
|
||||
"co2-emissions-current": "123",
|
||||
"mechanical-ventilation": "natural",
|
||||
'photo-supply': 0,
|
||||
"solar-water-heating-flag": "N",
|
||||
"wind-turbine-count": 0,
|
||||
"extension-count": 0,
|
||||
"heat-loss-corridor": "no corridor",
|
||||
"unheated-corridor-length": 0,
|
||||
"mains-gas-flag": "Y",
|
||||
"floor-height": 2.5,
|
||||
"total-floor-area": 100
|
||||
},
|
||||
{
|
||||
'inspection-date': '2023-06-01', 'some-other-key': 'duplicate-date',
|
||||
|
|
@ -58,6 +106,18 @@ mock_epc_response_dupe = {
|
|||
'mainheat-description': 'Main Heating Description', 'hotwater-description': 'Hot Water Description',
|
||||
"transaction-type": "rental",
|
||||
"lighting-description": "Good Lighting Efficiency",
|
||||
"energy-consumption-current": "50",
|
||||
"co2-emissions-current": "123",
|
||||
"mechanical-ventilation": "natural",
|
||||
'photo-supply': 0,
|
||||
"solar-water-heating-flag": "N",
|
||||
"wind-turbine-count": 0,
|
||||
"extension-count": 0,
|
||||
"heat-loss-corridor": "no corridor",
|
||||
"unheated-corridor-length": 0,
|
||||
"mains-gas-flag": "Y",
|
||||
"floor-height": 2.5,
|
||||
"total-floor-area": 100
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -66,11 +126,11 @@ mock_epc_response_dupe = {
|
|||
class TestProperty:
|
||||
@pytest.fixture(autouse=True)
|
||||
def property_instance(self, mock_epc_client, mock_open_uprn_client, mock_cleaner):
|
||||
return Property("AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||
return Property(1, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def property_instance_dupe_data(self, mock_epc_client_dupe_data):
|
||||
return Property("AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data)
|
||||
return Property(2, "AB12CD", "Test Address", epc_client=mock_epc_client_dupe_data)
|
||||
|
||||
@pytest.fixture
|
||||
def mock_epc_client(self):
|
||||
|
|
@ -99,15 +159,26 @@ class TestProperty:
|
|||
|
||||
@pytest.fixture
|
||||
def mock_cleaner(self):
|
||||
mock_cleaner = Mock(spec=EpcClean(data=[
|
||||
{"roof-description": "Roof Description"},
|
||||
{"walls-description": "Walls Description"},
|
||||
{"windows-description": "Windows Description"},
|
||||
{"mainheat-description": "Main Heating Description"},
|
||||
{"hotwater-description": "Hot Water Description"},
|
||||
{"lighting-description": "Good Lighting Efficiency"},
|
||||
{"low-energy-lighting": 0}
|
||||
]))
|
||||
lighting_averages = [
|
||||
{'lighting-description': 'good lighting efficiency', 'low-energy-lighting': 99.26666666666667},
|
||||
{'lighting-description': 'excellent lighting efficiency', 'low-energy-lighting': 100.0},
|
||||
{'lighting-description': 'below average lighting efficiency', 'low-energy-lighting': 0.0}
|
||||
]
|
||||
|
||||
cleaner_spec = EpcClean(
|
||||
data=[
|
||||
{"roof-description": "Roof Description"},
|
||||
{"walls-description": "Walls Description"},
|
||||
{"windows-description": "Windows Description"},
|
||||
{"mainheat-description": "Main Heating Description"},
|
||||
{"hotwater-description": "Hot Water Description"},
|
||||
{"lighting-description": "Good Lighting Efficiency"},
|
||||
{"low-energy-lighting": 0}
|
||||
],
|
||||
lighting_averages=lighting_averages
|
||||
)
|
||||
|
||||
mock_cleaner = Mock(spec=cleaner_spec)
|
||||
mock_cleaner.cleaned = {
|
||||
"roof-description": [{"original_description": "Roof Description"}],
|
||||
"walls-description": [{"original_description": "Walls Description"}],
|
||||
|
|
@ -119,14 +190,14 @@ class TestProperty:
|
|||
return mock_cleaner
|
||||
|
||||
def test_init(self, mock_epc_client):
|
||||
inst1 = Property("AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||
inst1 = Property(0, "AB12CD", "Test Address", epc_client=mock_epc_client)
|
||||
# Should be mocked auth token
|
||||
assert inst1.epc_client.auth_token == "mocked_auth_token"
|
||||
|
||||
inst2 = Property("AB12CD", "Test Address")
|
||||
inst2 = Property(3, "AB12CD", "Test Address")
|
||||
assert inst2.epc_client.auth_token
|
||||
|
||||
inst3 = Property("AB12CD", "Test Address", data={"some": "data"})
|
||||
inst3 = Property(4, "AB12CD", "Test Address", data={"some": "data"})
|
||||
assert inst3.data == {"some": "data"}
|
||||
|
||||
data = inst3.search_address_epc()
|
||||
|
|
@ -143,29 +214,9 @@ class TestProperty:
|
|||
with pytest.raises(Exception, match="More than one result found for this address - investigate me"):
|
||||
property_instance_dupe_data.search_address_epc()
|
||||
|
||||
def test_get_coordinates(self, property_instance, mock_open_uprn_client):
|
||||
# Set up the mock OpenUprnClient
|
||||
property_instance.data = {"uprn": 12345}
|
||||
property_instance.get_coordinates(mock_open_uprn_client)
|
||||
|
||||
# Verify that the coordinates are set correctly
|
||||
assert property_instance.coordinates == {
|
||||
"uprn": 12345,
|
||||
"longitude": 1.2345,
|
||||
"latitude": 2.3456
|
||||
}
|
||||
|
||||
def test_get_coordinates_without_open_uprn_data(self, property_instance, mock_open_uprn_client):
|
||||
# Modify the mock OpenUprnClient to not have read any data
|
||||
mock_open_uprn_client.data = None
|
||||
|
||||
# Verify that ValueError is raised when OpenUprnClient data is None
|
||||
with pytest.raises(ValueError, match="OpenUprnClient has not read data"):
|
||||
property_instance.get_coordinates(mock_open_uprn_client)
|
||||
|
||||
def test_get_components(self, property_instance, mock_cleaner, mock_epc_client):
|
||||
property_instance.search_address_epc()
|
||||
property_instance.get_components(mock_cleaner)
|
||||
property_instance.get_components(mock_cleaner.cleaned)
|
||||
|
||||
# Verify that the components are set correctly
|
||||
assert property_instance.roof == {"original_description": "Roof Description"}
|
||||
|
|
@ -180,7 +231,7 @@ class TestProperty:
|
|||
|
||||
# Verify that ValueError is raised when EpcClean doesn't contain cleaned data
|
||||
with pytest.raises(ValueError, match="Cleaner does not contain cleaned data"):
|
||||
property_instance.get_components(mock_cleaner)
|
||||
property_instance.get_components(mock_cleaner.cleaned)
|
||||
|
||||
def test_get_components_no_data(self, property_instance, mock_cleaner):
|
||||
# Modify the mock cleaner to have no attributes for a specific description
|
||||
|
|
@ -190,7 +241,7 @@ class TestProperty:
|
|||
|
||||
# Verify that ValueError is raised when no attributes are found
|
||||
with pytest.raises(ValueError, match="Property does not contain data"):
|
||||
property_instance.get_components(mock_cleaner)
|
||||
property_instance.get_components(mock_cleaner.cleaned)
|
||||
|
||||
def test_get_components_no_attributes(self, property_instance, mock_cleaner):
|
||||
# Modify the mock cleaner to have no attributes for a specific description
|
||||
|
|
@ -201,12 +252,12 @@ class TestProperty:
|
|||
|
||||
# Verify that ValueError is raised when no attributes are found
|
||||
with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"):
|
||||
property_instance.get_components(mock_cleaner)
|
||||
property_instance.get_components(mock_cleaner.cleaned)
|
||||
|
||||
def test_get_components_multiple_attributes(self, property_instance, mock_cleaner):
|
||||
# This shouldn't happen - it would mean a cleaning error
|
||||
property_instance.search_address_epc()
|
||||
mock_cleaner.cleaned = {
|
||||
cleaned = {
|
||||
"roof-description": [
|
||||
{"original_description": "Roof Description"},
|
||||
{"original_description": "Roof Description"}
|
||||
|
|
@ -215,4 +266,4 @@ class TestProperty:
|
|||
|
||||
# Verify that ValueError is raised when multiple attributes are found
|
||||
with pytest.raises(ValueError, match="Either No attributes or multiple found for roof-description"):
|
||||
property_instance.get_components(mock_cleaner)
|
||||
property_instance.get_components(cleaned)
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ from model_data.EpcClean import EpcClean
|
|||
from model_data.analysis.UvalueEstimations import UvalueEstimations
|
||||
from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE
|
||||
from pathlib import Path
|
||||
from model_data.utils import save_data_to_s3
|
||||
from utils.s3 import save_data_to_s3
|
||||
|
||||
LAND_REGISTRY_PATHS = [
|
||||
os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv",
|
||||
|
|
|
|||
|
|
@ -13,14 +13,28 @@ class LightingAttributes:
|
|||
def __init__(self, description, averages):
|
||||
self.description: str = clean_description(description.lower())
|
||||
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
self.nodata = False
|
||||
self.description = translation
|
||||
self.welsh_translation_search()
|
||||
|
||||
self.description = correct_spelling(self.description)
|
||||
self.averages = averages
|
||||
|
||||
def welsh_translation_search(self):
|
||||
"""
|
||||
For welsh text describing the percentage of low energy lighting, we match the regular
|
||||
expression and perform the translation
|
||||
"""
|
||||
lel_match = re.search(r"goleuadau ynni-isel mewn (\d+)%? ogçör mannau gosod", self.description)
|
||||
|
||||
if lel_match:
|
||||
# Perform the actual translation
|
||||
percentage = lel_match.group(1)
|
||||
self.description = f"low energy lighting in {percentage}% of fixed outlets"
|
||||
else:
|
||||
translation = self.WELSH_TEXT.get(self.description)
|
||||
if translation:
|
||||
self.nodata = False
|
||||
self.description = translation
|
||||
|
||||
def process(self):
|
||||
|
||||
description = self.description
|
||||
|
|
|
|||
|
|
@ -12,6 +12,9 @@ from model_data.simulation_system.core.Settings import (
|
|||
FLOOR_LEVEL_MAP,
|
||||
BUILT_FORM_REMAP,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
COMPONENT_FEATURES,
|
||||
FIXED_FEATURES,
|
||||
COLUMNTYPES
|
||||
)
|
||||
from typing import List
|
||||
|
||||
|
|
@ -21,9 +24,16 @@ class DataProcessor:
|
|||
Handle data loading and data preprocessing
|
||||
"""
|
||||
|
||||
def __init__(self, filepath: Path | None) -> None:
|
||||
def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
|
||||
"""
|
||||
:param filepath: If specified, is the physical location of the data
|
||||
:param newdata: Indicates if we are processing new, testing data.
|
||||
In this instance, there are some operations we do not
|
||||
want to perform, such as confine_data()
|
||||
"""
|
||||
self.filepath = filepath
|
||||
self.data = None
|
||||
self.newdata = newdata
|
||||
|
||||
def load_data(self, low_memory=False) -> None:
|
||||
if not self.filepath:
|
||||
|
|
@ -140,14 +150,30 @@ class DataProcessor:
|
|||
break
|
||||
to_index -= 1
|
||||
|
||||
def reformat_columns(self):
|
||||
"""
|
||||
This function applies the re-formattng of columns from lower case to capitalised
|
||||
|
||||
When requesting the epc data from the api, the columns are lower case
|
||||
and separated by a hyphen, whereas in the bulk download, the columns
|
||||
are capitalised and separated by underscores. If rename_columns is True
|
||||
we convert the columns from lower case to capitalised format
|
||||
:return:
|
||||
"""
|
||||
self.data.columns = [col.upper().replace("-", "_") for col in self.data.columns]
|
||||
|
||||
def pre_process(self) -> pd.DataFrame:
|
||||
"""
|
||||
Load data and begin initial cleaning
|
||||
"""
|
||||
if not self.data:
|
||||
if self.data is None:
|
||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
||||
|
||||
self.confine_data()
|
||||
if self.newdata:
|
||||
self.reformat_columns()
|
||||
|
||||
if not self.newdata:
|
||||
self.confine_data()
|
||||
|
||||
# We have some non-standard construction age bands which we'll clean for matching
|
||||
self.standardise_construction_age_band()
|
||||
|
|
@ -159,9 +185,10 @@ class DataProcessor:
|
|||
self.clean_multi_glaze_proportion()
|
||||
self.clean_photo_supply()
|
||||
|
||||
self.retain_multiple_epc_properties(
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
|
||||
)
|
||||
if not self.newdata:
|
||||
self.retain_multiple_epc_properties(
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
|
||||
)
|
||||
self.remap_columns()
|
||||
|
||||
if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1:
|
||||
|
|
@ -169,6 +196,8 @@ class DataProcessor:
|
|||
self.fill_na_fields()
|
||||
|
||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
# Final re-casting after data transformed and prepared
|
||||
self.data = self.data.astype(COLUMNTYPES)
|
||||
|
||||
return self.data
|
||||
|
||||
|
|
@ -178,6 +207,7 @@ class DataProcessor:
|
|||
"""
|
||||
# Each uprn can fille backward from recent and forward fill from oldest
|
||||
# The groupby changes the order and we use the index to make the original data
|
||||
|
||||
filled_data = (
|
||||
self.data.groupby("UPRN", group_keys=True)[columns_to_fill]
|
||||
.apply(lambda group: group.fillna(method="bfill").fillna(method="ffill"))
|
||||
|
|
@ -188,6 +218,11 @@ class DataProcessor:
|
|||
|
||||
self.data[columns_to_fill] = filled_data[columns_to_fill]
|
||||
|
||||
# For floor area, we also replace "" values with None
|
||||
self.data[["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]] = self.data[
|
||||
["FLOOR_HEIGHT", "TOTAL_FLOOR_AREA"]
|
||||
].replace("", None)
|
||||
|
||||
def remap_columns(self):
|
||||
"""
|
||||
Remap all columns, for any non values
|
||||
|
|
@ -430,3 +465,24 @@ class DataProcessor:
|
|||
data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True)
|
||||
|
||||
return data_to_clean
|
||||
|
||||
def get_component_features(self, suffix: str) -> pd.DataFrame:
|
||||
"""
|
||||
This function will return the property components such as the walls, roof, heating etc
|
||||
as well as lodgement date. These are features that we expect might change from one EPC to the
|
||||
next
|
||||
:param suffix: Should be one of "_STARTING" or "_ENDING"
|
||||
:return: Pandas dataframe containing the subset of columns defined in COMPONENT_FEATURES
|
||||
"""
|
||||
|
||||
if suffix not in ["_STARTING", "_ENDING"]:
|
||||
raise Exception("Suffix should be one of _STARTING or _ENFING")
|
||||
|
||||
return self.data[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].copy().add_suffix(suffix)
|
||||
|
||||
def get_fixed_features(self) -> pd.DataFrame:
|
||||
"""
|
||||
Returns the fixed features that we don't believe should vary from one EPC to the next
|
||||
:return: Pandas dataframe containing the columns defined in FIXED_FEATURES
|
||||
"""
|
||||
return self.data[FIXED_FEATURES]
|
||||
|
|
|
|||
|
|
@ -134,6 +134,7 @@ EARLIEST_EPC_DATE = "2014-08-01"
|
|||
|
||||
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
|
||||
|
||||
|
||||
def ordinal(n):
|
||||
|
|
@ -167,3 +168,29 @@ DATA_PROCESSOR_SETTINGS = {
|
|||
"epc_minimum_count": 1,
|
||||
"column_mappings": {"UPRN": [int, str]},
|
||||
}
|
||||
|
||||
# This has a manual mapping of the column types required
|
||||
COLUMNTYPES = {
|
||||
'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
|
||||
'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
|
||||
'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64',
|
||||
'CONSTRUCTION_AGE_BAND': 'object',
|
||||
'TRANSACTION_TYPE': 'object',
|
||||
'WALLS_DESCRIPTION': 'object',
|
||||
'FLOOR_DESCRIPTION': 'object',
|
||||
'LIGHTING_DESCRIPTION': 'object',
|
||||
'ROOF_DESCRIPTION': 'object',
|
||||
'MAINHEAT_DESCRIPTION': 'object',
|
||||
'HOTWATER_DESCRIPTION': 'object', 'MAIN_FUEL': 'object',
|
||||
'MECHANICAL_VENTILATION': 'object',
|
||||
'SECONDHEAT_DESCRIPTION': 'object', 'ENERGY_TARIFF': 'object',
|
||||
'SOLAR_WATER_HEATING_FLAG': 'object', 'PHOTO_SUPPLY': 'float64',
|
||||
'WINDOWS_DESCRIPTION': 'object',
|
||||
'GLAZED_TYPE': 'object',
|
||||
'MULTI_GLAZE_PROPORTION': 'float64',
|
||||
'LOW_ENERGY_LIGHTING': 'float64',
|
||||
'NUMBER_OPEN_FIREPLACES': 'float64',
|
||||
'MAINHEATCONT_DESCRIPTION': 'object',
|
||||
'EXTENSION_COUNT': 'float64',
|
||||
'LODGEMENT_DATE': 'object',
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,20 +1,161 @@
|
|||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
import msgpack
|
||||
|
||||
from pathlib import Path
|
||||
from simulation_system.core.Settings import (
|
||||
from model_data.simulation_system.core.Settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES,
|
||||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
EARLIEST_EPC_DATE
|
||||
EARLIEST_EPC_DATE,
|
||||
CARBON_RESPONSE,
|
||||
)
|
||||
from simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils import save_dataframe_to_s3_parquet
|
||||
from model_data.simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||
|
||||
DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
||||
|
||||
def get_cleaned():
|
||||
"""
|
||||
This function will retrieve the cleaned dataset from s3 which has the cleaned
|
||||
descriptions for the epc dataset
|
||||
|
||||
This data is stored in MessagePack format and therefore needs to be decoded
|
||||
:return:
|
||||
"""
|
||||
|
||||
cleaned = read_from_s3(
|
||||
s3_file_name="cleaned_epc_data/cleaned.bson",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
return cleaned
|
||||
|
||||
|
||||
def process_and_prune_desriptions(df, cleaned_lookup):
|
||||
"""
|
||||
This method will merge on the cleaned lookup table and ensure that the building fabric in the
|
||||
starting and ending EPC is consistent, so ensure that we are performing our modelling on the cleanest
|
||||
possible dataset.
|
||||
:param df:
|
||||
:param cleaned_lookup:
|
||||
:return:
|
||||
"""
|
||||
|
||||
# TODO: In a future iteration, we can test using the binary features and the insulation thickness
|
||||
# estimates, we well as estimated U-values
|
||||
|
||||
cols_to_drop = {
|
||||
"walls": [
|
||||
'original_description', 'thermal_transmittance',
|
||||
'thermal_transmittance_unit', 'is_cavity_wall', 'is_filled_cavity',
|
||||
'is_solid_brick', 'is_system_built', 'is_timber_frame',
|
||||
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_assumed',
|
||||
'is_sandstone_or_limestone', 'insulation_thickness',
|
||||
'external_insulation', 'internal_insulation',
|
||||
'original_description_ENDING',
|
||||
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
|
||||
'is_cavity_wall_ENDING', 'is_filled_cavity_ENDING',
|
||||
'is_solid_brick_ENDING', 'is_system_built_ENDING',
|
||||
'is_timber_frame_ENDING', 'is_granite_or_whinstone_ENDING',
|
||||
'is_as_built_ENDING', 'is_cob_ENDING', 'is_assumed_ENDING',
|
||||
'is_sandstone_or_limestone_ENDING', 'insulation_thickness_ENDING',
|
||||
'external_insulation_ENDING', 'internal_insulation_ENDING',
|
||||
],
|
||||
"floor": [
|
||||
'original_description', 'thermal_transmittance',
|
||||
'thermal_transmittance_unit', 'is_assumed', 'is_to_unheated_space',
|
||||
'is_to_external_air', 'is_suspended', 'is_solid',
|
||||
'another_property_below', 'insulation_thickness', 'no_data',
|
||||
'original_description_ENDING',
|
||||
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
|
||||
'is_assumed_ENDING', 'is_to_unheated_space_ENDING',
|
||||
'is_to_external_air_ENDING', 'is_suspended_ENDING', 'is_solid_ENDING',
|
||||
'another_property_below_ENDING', 'insulation_thickness_ENDING',
|
||||
'no_data_ENDING',
|
||||
],
|
||||
"roof": [
|
||||
'original_description', 'clean_description', 'thermal_transmittance',
|
||||
'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
|
||||
'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
|
||||
'has_dwelling_above', 'is_valid', 'insulation_thickness',
|
||||
'original_description_ENDING', 'clean_description_ENDING',
|
||||
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
|
||||
'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
|
||||
'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
|
||||
'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
|
||||
'insulation_thickness_ENDING',
|
||||
]
|
||||
|
||||
}
|
||||
|
||||
for component in ["walls", "floor", "roof"]:
|
||||
component_upper = component.upper()
|
||||
|
||||
df = df.merge(
|
||||
pd.DataFrame(cleaned_lookup[f"{component}-description"]),
|
||||
how="left",
|
||||
left_on=f"{component_upper}_DESCRIPTION_STARTING",
|
||||
right_on="original_description",
|
||||
).merge(
|
||||
pd.DataFrame(cleaned_lookup[f"{component}-description"]),
|
||||
how="left",
|
||||
left_on=f"{component_upper}_DESCRIPTION_ENDING",
|
||||
right_on="original_description",
|
||||
suffixes=("", "_ENDING")
|
||||
)
|
||||
|
||||
if component == "walls":
|
||||
# We make sure the wall construction hasn't changed
|
||||
df = df[
|
||||
(df["is_cavity_wall"] == df["is_cavity_wall_ENDING"]) &
|
||||
(df["is_solid_brick"] == df["is_solid_brick_ENDING"]) &
|
||||
(df["is_timber_frame"] == df["is_timber_frame_ENDING"]) &
|
||||
(df["is_granite_or_whinstone"] == df["is_granite_or_whinstone_ENDING"]) &
|
||||
(df["is_cob"] == df["is_cob_ENDING"]) &
|
||||
(df["is_sandstone_or_limestone"] == df["is_sandstone_or_limestone_ENDING"])
|
||||
]
|
||||
elif component == "floor":
|
||||
df = df[
|
||||
(df["is_suspended"] == df["is_suspended_ENDING"]) &
|
||||
(df["is_solid"] == df["is_solid_ENDING"]) &
|
||||
(df["another_property_below"] == df["another_property_below_ENDING"]) &
|
||||
(df["is_to_unheated_space"] == df["is_to_unheated_space_ENDING"])
|
||||
]
|
||||
else:
|
||||
df = df[
|
||||
(df["is_pitched"] == df["is_pitched_ENDING"]) &
|
||||
(df["is_roof_room"] == df["is_roof_room_ENDING"]) &
|
||||
(df["is_loft"] == df["is_loft_ENDING"]) &
|
||||
(df["is_flat"] == df["is_flat_ENDING"]) &
|
||||
(df["is_thatched"] == df["is_thatched_ENDING"]) &
|
||||
(df["is_at_rafters"] == df["is_at_rafters_ENDING"]) &
|
||||
(df["has_dwelling_above"] == df["has_dwelling_above_ENDING"])
|
||||
]
|
||||
|
||||
# Drop the binary indicators and replace the original description with the cleaned version
|
||||
|
||||
# Drop original cols
|
||||
original_cols = [
|
||||
f"{component_upper}_DESCRIPTION_STARTING", f"{component_upper}_DESCRIPTION_ENDING"
|
||||
]
|
||||
|
||||
df = df.drop(
|
||||
columns=cols_to_drop[component] + original_cols
|
||||
).rename(
|
||||
columns={
|
||||
"clean_description": f"{component_upper}_DESCRIPTION_STARTING",
|
||||
"clean_description_ENDING": f"{component_upper}_DESCRIPTION_ENDING",
|
||||
}
|
||||
)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def app():
|
||||
|
|
@ -23,6 +164,8 @@ def app():
|
|||
# Data glossary:
|
||||
# https://epc.opendatacommunities.org/docs/guidance#glossary
|
||||
|
||||
cleaned_lookup = get_cleaned()
|
||||
|
||||
# List all subdirectories
|
||||
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
|
||||
|
||||
|
|
@ -38,7 +181,7 @@ def app():
|
|||
# TODO [x] : Have a look at temporal features
|
||||
# TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
|
||||
# TODO [x]: Same as floor area for floor height
|
||||
# TODO []: If fundamental building fabric changes, we should proabably discard the record
|
||||
# TODO [x]: If fundamental building fabric changes, we should proabably discard the record
|
||||
# TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
|
||||
# - leave for now and check performance after temporal features
|
||||
# TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
|
||||
|
|
@ -84,7 +227,7 @@ def app():
|
|||
# We include the lodgement date here as we probably need to factor time into the
|
||||
# model, since EPC standards and rigour have changed over time
|
||||
variable_data = modified_property_data[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
|
||||
]
|
||||
|
||||
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
||||
|
|
@ -104,24 +247,29 @@ def app():
|
|||
if gets_better:
|
||||
starting_sap = earliest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
starting_carbon = earliest_record[CARBON_RESPONSE]
|
||||
|
||||
rdsap_change = latest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
else:
|
||||
starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
|
||||
if gets_better:
|
||||
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
else:
|
||||
starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
starting_carbon = latest_record[CARBON_RESPONSE]
|
||||
|
||||
rdsap_change = earliest_record[RDSAP_RESPONSE] - starting_sap
|
||||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
|
||||
property_model_data.append(
|
||||
|
|
@ -129,8 +277,10 @@ def app():
|
|||
"UPRN": uprn,
|
||||
"RDSAP_CHANGE": rdsap_change,
|
||||
"HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
"STARTING_SAP": starting_sap,
|
||||
"STARTING_HEAT_DEMAND": starting_heat_demand,
|
||||
"CARBON_CHANGE": carbon_change,
|
||||
"SAP_STARTING": starting_sap,
|
||||
"HEAT_DEMAND_STARTING": starting_heat_demand,
|
||||
"CARBON_STARTING": starting_carbon,
|
||||
**fixed_data,
|
||||
**features.to_dict(),
|
||||
}
|
||||
|
|
@ -152,6 +302,14 @@ def app():
|
|||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
|
||||
# We look for key building fabric features that have changed from one EPC to the next.
|
||||
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
|
||||
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
|
||||
# is low
|
||||
# We also replace descriptions with their cleaned variants
|
||||
|
||||
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
|
||||
|
||||
dataset.append(data_by_urpn_df)
|
||||
|
||||
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
|
||||
|
|
|
|||
|
|
@ -1,7 +1,3 @@
|
|||
import boto3
|
||||
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
|
||||
import pandas as pd
|
||||
from io import BytesIO
|
||||
import re
|
||||
from textblob import TextBlob
|
||||
|
||||
|
|
@ -28,65 +24,3 @@ def correct_spelling(text):
|
|||
|
||||
corrected_text = ' '.join(corrected_words)
|
||||
return corrected_text
|
||||
|
||||
|
||||
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
|
||||
"""
|
||||
Save a pandas DataFrame to S3 as a Parquet file.
|
||||
|
||||
:param df: The pandas DataFrame.
|
||||
:param bucket_name: Name of the S3 bucket.
|
||||
:param file_key: Key of the file (including directory path within the bucket).
|
||||
"""
|
||||
|
||||
# Convert the DataFrame to a Parquet format in memory
|
||||
parquet_buffer = BytesIO()
|
||||
df.to_parquet(parquet_buffer)
|
||||
|
||||
# Create the boto3 client
|
||||
client = boto3.client('s3')
|
||||
|
||||
# Upload the Parquet file to S3
|
||||
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
|
||||
|
||||
|
||||
def save_data_to_s3(data, bucket_name, s3_file_name):
|
||||
"""
|
||||
Save an object to an S3 bucket
|
||||
|
||||
:param data: The data to save
|
||||
:param bucket_name: The name of the S3 bucket
|
||||
:param s3_file_name: The file name to use for the saved data in S3
|
||||
"""
|
||||
# Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
|
||||
try:
|
||||
s3 = boto3.client('s3')
|
||||
except NoCredentialsError:
|
||||
print("Credentials not available.")
|
||||
return
|
||||
except PartialCredentialsError:
|
||||
print("Incomplete credentials provided.")
|
||||
return
|
||||
|
||||
try:
|
||||
s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
|
||||
print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
|
||||
except Exception as e:
|
||||
print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
|
||||
|
||||
|
||||
def read_from_s3(bucket_name, s3_file_name):
|
||||
"""
|
||||
Read an object from s3. Decoding of the data is left for outside of this function
|
||||
|
||||
:param bucket_name: The name of the S3 bucket
|
||||
:param s3_file_name: The file name to use for the saved data in S3
|
||||
"""
|
||||
# Initialize a session using Amazon S3
|
||||
s3 = boto3.resource('s3')
|
||||
|
||||
# Get the MessagePack data from S3
|
||||
obj = s3.Object(bucket_name, s3_file_name)
|
||||
data = obj.get()['Body'].read()
|
||||
|
||||
return data
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
[pytest]
|
||||
pythonpath = .
|
||||
addopts = --cov-report term-missing --cov=model_data --cov=recommendations
|
||||
testpaths = model_data/tests recommendations/tests
|
||||
testpaths = model_data/tests recommendations/tests backend/tests
|
||||
|
|
|
|||
8
recommendations/config.py
Normal file
8
recommendations/config.py
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
# This map defines the upgrades that are possible to be recommended by the recommendation engine
|
||||
# For example,
|
||||
# TODO: once we use cleaned descriptions, this should be updated using the cleaned descriptions
|
||||
UPGRADES_MAP = {
|
||||
'Solid brick, as built, no insulation (assumed)': 'Solid brick, as built, insulated (assumed)',
|
||||
'Suspended, no insulation (assumed)': 'Suspended, insulated (assumed)',
|
||||
'Solid, no insulation (assumed)': 'Solid, insulated (assumed)',
|
||||
}
|
||||
|
|
@ -20,7 +20,7 @@ fi
|
|||
|
||||
# Step 2: Build the Docker image
|
||||
echo "Building Docker image..."
|
||||
docker build -t $IMAGE_NAME:$TAG -f backend/docker/lambda.Dockerfile .
|
||||
docker build --platform linux/amd64 -t $IMAGE_NAME:$TAG -f backend/docker/lambda.Dockerfile .
|
||||
|
||||
# Step 3: Run the Docker image with the emulator, .env file, and AWS credentials
|
||||
echo "Starting the Docker container..."
|
||||
|
|
|
|||
65
utils/s3.py
Normal file
65
utils/s3.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
import boto3
|
||||
from io import BytesIO
|
||||
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
|
||||
|
||||
|
||||
def read_from_s3(bucket_name, s3_file_name):
|
||||
"""
|
||||
Read an object from s3. Decoding of the data is left for outside of this function
|
||||
|
||||
:param bucket_name: The name of the S3 bucket
|
||||
:param s3_file_name: The file name to use for the saved data in S3
|
||||
"""
|
||||
# Initialize a session using Amazon S3
|
||||
s3 = boto3.resource('s3')
|
||||
|
||||
# Get the MessagePack data from S3
|
||||
obj = s3.Object(bucket_name, s3_file_name)
|
||||
data = obj.get()['Body'].read()
|
||||
|
||||
return data
|
||||
|
||||
|
||||
def save_data_to_s3(data, bucket_name, s3_file_name):
|
||||
"""
|
||||
Save an object to an S3 bucket
|
||||
|
||||
:param data: The data to save
|
||||
:param bucket_name: The name of the S3 bucket
|
||||
:param s3_file_name: The file name to use for the saved data in S3
|
||||
"""
|
||||
# Ensure you have AWS credentials set up - either via environment variables, AWS CLI, or IAM roles
|
||||
try:
|
||||
s3 = boto3.client('s3')
|
||||
except NoCredentialsError:
|
||||
print("Credentials not available.")
|
||||
return
|
||||
except PartialCredentialsError:
|
||||
print("Incomplete credentials provided.")
|
||||
return
|
||||
|
||||
try:
|
||||
s3.put_object(Bucket=bucket_name, Key=s3_file_name, Body=data)
|
||||
print(f'Successfully uploaded data to {bucket_name}/{s3_file_name}')
|
||||
except Exception as e:
|
||||
print(f'Failed to upload data to {bucket_name}/{s3_file_name}: {str(e)}')
|
||||
|
||||
|
||||
def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
|
||||
"""
|
||||
Save a pandas DataFrame to S3 as a Parquet file.
|
||||
|
||||
:param df: The pandas DataFrame.
|
||||
:param bucket_name: Name of the S3 bucket.
|
||||
:param file_key: Key of the file (including directory path within the bucket).
|
||||
"""
|
||||
|
||||
# Convert the DataFrame to a Parquet format in memory
|
||||
parquet_buffer = BytesIO()
|
||||
df.to_parquet(parquet_buffer)
|
||||
|
||||
# Create the boto3 client
|
||||
client = boto3.client('s3')
|
||||
|
||||
# Upload the Parquet file to S3
|
||||
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
|
||||
Loading…
Add table
Reference in a new issue